gr15_close_pot_CKA / trainer_state.json
binhng's picture
Upload folder using huggingface_hub
080ab20 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 18.21668264621285,
"eval_steps": 500,
"global_step": 19000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.009587727708533078,
"grad_norm": 27.445323944091797,
"learning_rate": 9.473684210526317e-07,
"loss": 2.1709,
"step": 10
},
{
"epoch": 0.019175455417066157,
"grad_norm": 19.005075454711914,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.8704,
"step": 20
},
{
"epoch": 0.028763183125599234,
"grad_norm": 14.785849571228027,
"learning_rate": 3.0526315789473684e-06,
"loss": 1.6318,
"step": 30
},
{
"epoch": 0.038350910834132314,
"grad_norm": 4.634030342102051,
"learning_rate": 4.105263157894737e-06,
"loss": 0.8641,
"step": 40
},
{
"epoch": 0.04793863854266539,
"grad_norm": 2.2945172786712646,
"learning_rate": 5.1578947368421055e-06,
"loss": 0.5394,
"step": 50
},
{
"epoch": 0.05752636625119847,
"grad_norm": 1.7087739706039429,
"learning_rate": 6.2105263157894745e-06,
"loss": 0.4525,
"step": 60
},
{
"epoch": 0.06711409395973154,
"grad_norm": 1.1094379425048828,
"learning_rate": 7.2631578947368426e-06,
"loss": 0.3063,
"step": 70
},
{
"epoch": 0.07670182166826463,
"grad_norm": 1.5301676988601685,
"learning_rate": 8.315789473684212e-06,
"loss": 0.3153,
"step": 80
},
{
"epoch": 0.0862895493767977,
"grad_norm": 1.1719224452972412,
"learning_rate": 9.368421052631579e-06,
"loss": 0.2466,
"step": 90
},
{
"epoch": 0.09587727708533078,
"grad_norm": 1.751291275024414,
"learning_rate": 1.0421052631578948e-05,
"loss": 0.27,
"step": 100
},
{
"epoch": 0.10546500479386385,
"grad_norm": 1.0524818897247314,
"learning_rate": 1.1473684210526315e-05,
"loss": 0.2333,
"step": 110
},
{
"epoch": 0.11505273250239693,
"grad_norm": 1.508988857269287,
"learning_rate": 1.2526315789473686e-05,
"loss": 0.2399,
"step": 120
},
{
"epoch": 0.12464046021093,
"grad_norm": 1.3286081552505493,
"learning_rate": 1.3578947368421053e-05,
"loss": 0.1962,
"step": 130
},
{
"epoch": 0.1342281879194631,
"grad_norm": 1.7412567138671875,
"learning_rate": 1.4631578947368422e-05,
"loss": 0.2004,
"step": 140
},
{
"epoch": 0.14381591562799617,
"grad_norm": 1.8567883968353271,
"learning_rate": 1.568421052631579e-05,
"loss": 0.174,
"step": 150
},
{
"epoch": 0.15340364333652926,
"grad_norm": 1.5139102935791016,
"learning_rate": 1.673684210526316e-05,
"loss": 0.1765,
"step": 160
},
{
"epoch": 0.1629913710450623,
"grad_norm": 1.6859902143478394,
"learning_rate": 1.7789473684210527e-05,
"loss": 0.168,
"step": 170
},
{
"epoch": 0.1725790987535954,
"grad_norm": 1.8252370357513428,
"learning_rate": 1.8842105263157894e-05,
"loss": 0.1645,
"step": 180
},
{
"epoch": 0.18216682646212848,
"grad_norm": 1.2732850313186646,
"learning_rate": 1.9894736842105265e-05,
"loss": 0.1554,
"step": 190
},
{
"epoch": 0.19175455417066156,
"grad_norm": 1.0456390380859375,
"learning_rate": 2.0947368421052632e-05,
"loss": 0.1575,
"step": 200
},
{
"epoch": 0.20134228187919462,
"grad_norm": 0.7651330828666687,
"learning_rate": 2.2000000000000003e-05,
"loss": 0.163,
"step": 210
},
{
"epoch": 0.2109300095877277,
"grad_norm": 0.9984806776046753,
"learning_rate": 2.305263157894737e-05,
"loss": 0.1508,
"step": 220
},
{
"epoch": 0.22051773729626079,
"grad_norm": 1.0750813484191895,
"learning_rate": 2.410526315789474e-05,
"loss": 0.1349,
"step": 230
},
{
"epoch": 0.23010546500479387,
"grad_norm": 1.7777466773986816,
"learning_rate": 2.5157894736842108e-05,
"loss": 0.1448,
"step": 240
},
{
"epoch": 0.23969319271332695,
"grad_norm": 1.3516716957092285,
"learning_rate": 2.6210526315789475e-05,
"loss": 0.1427,
"step": 250
},
{
"epoch": 0.24928092042186,
"grad_norm": 1.1810095310211182,
"learning_rate": 2.7263157894736846e-05,
"loss": 0.1385,
"step": 260
},
{
"epoch": 0.2588686481303931,
"grad_norm": 1.6512832641601562,
"learning_rate": 2.8315789473684213e-05,
"loss": 0.155,
"step": 270
},
{
"epoch": 0.2684563758389262,
"grad_norm": 1.2209525108337402,
"learning_rate": 2.9368421052631577e-05,
"loss": 0.1378,
"step": 280
},
{
"epoch": 0.27804410354745923,
"grad_norm": 1.0236748456954956,
"learning_rate": 3.042105263157895e-05,
"loss": 0.1409,
"step": 290
},
{
"epoch": 0.28763183125599234,
"grad_norm": 1.065836787223816,
"learning_rate": 3.147368421052632e-05,
"loss": 0.1409,
"step": 300
},
{
"epoch": 0.2972195589645254,
"grad_norm": 1.0454283952713013,
"learning_rate": 3.2526315789473686e-05,
"loss": 0.1333,
"step": 310
},
{
"epoch": 0.3068072866730585,
"grad_norm": 0.5515532493591309,
"learning_rate": 3.357894736842105e-05,
"loss": 0.1137,
"step": 320
},
{
"epoch": 0.31639501438159157,
"grad_norm": 1.323104977607727,
"learning_rate": 3.463157894736842e-05,
"loss": 0.1317,
"step": 330
},
{
"epoch": 0.3259827420901246,
"grad_norm": 1.5426658391952515,
"learning_rate": 3.5684210526315794e-05,
"loss": 0.1174,
"step": 340
},
{
"epoch": 0.33557046979865773,
"grad_norm": 0.9131991863250732,
"learning_rate": 3.673684210526316e-05,
"loss": 0.1171,
"step": 350
},
{
"epoch": 0.3451581975071908,
"grad_norm": 1.0024508237838745,
"learning_rate": 3.778947368421053e-05,
"loss": 0.1162,
"step": 360
},
{
"epoch": 0.3547459252157239,
"grad_norm": 1.1091963052749634,
"learning_rate": 3.8842105263157896e-05,
"loss": 0.1272,
"step": 370
},
{
"epoch": 0.36433365292425696,
"grad_norm": 0.9772627949714661,
"learning_rate": 3.989473684210526e-05,
"loss": 0.1059,
"step": 380
},
{
"epoch": 0.37392138063279,
"grad_norm": 0.92393958568573,
"learning_rate": 4.094736842105264e-05,
"loss": 0.113,
"step": 390
},
{
"epoch": 0.3835091083413231,
"grad_norm": 0.9960997700691223,
"learning_rate": 4.2e-05,
"loss": 0.1077,
"step": 400
},
{
"epoch": 0.3930968360498562,
"grad_norm": 1.0618188381195068,
"learning_rate": 4.305263157894737e-05,
"loss": 0.1084,
"step": 410
},
{
"epoch": 0.40268456375838924,
"grad_norm": 0.7491030693054199,
"learning_rate": 4.410526315789474e-05,
"loss": 0.1021,
"step": 420
},
{
"epoch": 0.41227229146692235,
"grad_norm": 0.9327500462532043,
"learning_rate": 4.515789473684211e-05,
"loss": 0.0984,
"step": 430
},
{
"epoch": 0.4218600191754554,
"grad_norm": 0.7720574140548706,
"learning_rate": 4.6210526315789473e-05,
"loss": 0.0971,
"step": 440
},
{
"epoch": 0.4314477468839885,
"grad_norm": 1.2057392597198486,
"learning_rate": 4.726315789473684e-05,
"loss": 0.1088,
"step": 450
},
{
"epoch": 0.44103547459252157,
"grad_norm": 1.1223393678665161,
"learning_rate": 4.8315789473684215e-05,
"loss": 0.0992,
"step": 460
},
{
"epoch": 0.4506232023010546,
"grad_norm": 0.6742480397224426,
"learning_rate": 4.936842105263158e-05,
"loss": 0.0963,
"step": 470
},
{
"epoch": 0.46021093000958774,
"grad_norm": 1.0714161396026611,
"learning_rate": 5.042105263157895e-05,
"loss": 0.0974,
"step": 480
},
{
"epoch": 0.4697986577181208,
"grad_norm": 0.7936097383499146,
"learning_rate": 5.1473684210526317e-05,
"loss": 0.1022,
"step": 490
},
{
"epoch": 0.4793863854266539,
"grad_norm": 1.4822968244552612,
"learning_rate": 5.252631578947369e-05,
"loss": 0.0996,
"step": 500
},
{
"epoch": 0.48897411313518696,
"grad_norm": 1.0476019382476807,
"learning_rate": 5.357894736842105e-05,
"loss": 0.1018,
"step": 510
},
{
"epoch": 0.49856184084372,
"grad_norm": 0.9343310594558716,
"learning_rate": 5.4631578947368425e-05,
"loss": 0.102,
"step": 520
},
{
"epoch": 0.5081495685522531,
"grad_norm": 0.8918314576148987,
"learning_rate": 5.568421052631579e-05,
"loss": 0.0986,
"step": 530
},
{
"epoch": 0.5177372962607862,
"grad_norm": 1.155029296875,
"learning_rate": 5.6736842105263166e-05,
"loss": 0.1031,
"step": 540
},
{
"epoch": 0.5273250239693192,
"grad_norm": 0.625169038772583,
"learning_rate": 5.778947368421053e-05,
"loss": 0.0907,
"step": 550
},
{
"epoch": 0.5369127516778524,
"grad_norm": 1.0989243984222412,
"learning_rate": 5.88421052631579e-05,
"loss": 0.0843,
"step": 560
},
{
"epoch": 0.5465004793863855,
"grad_norm": 0.8834158778190613,
"learning_rate": 5.989473684210527e-05,
"loss": 0.0777,
"step": 570
},
{
"epoch": 0.5560882070949185,
"grad_norm": 0.7638639211654663,
"learning_rate": 6.094736842105263e-05,
"loss": 0.0781,
"step": 580
},
{
"epoch": 0.5656759348034516,
"grad_norm": 1.2423137426376343,
"learning_rate": 6.2e-05,
"loss": 0.0886,
"step": 590
},
{
"epoch": 0.5752636625119847,
"grad_norm": 1.082046627998352,
"learning_rate": 6.305263157894738e-05,
"loss": 0.0921,
"step": 600
},
{
"epoch": 0.5848513902205177,
"grad_norm": 0.8878996968269348,
"learning_rate": 6.410526315789474e-05,
"loss": 0.0926,
"step": 610
},
{
"epoch": 0.5944391179290508,
"grad_norm": 0.80406653881073,
"learning_rate": 6.515789473684211e-05,
"loss": 0.0983,
"step": 620
},
{
"epoch": 0.6040268456375839,
"grad_norm": 0.8726837038993835,
"learning_rate": 6.621052631578947e-05,
"loss": 0.0833,
"step": 630
},
{
"epoch": 0.613614573346117,
"grad_norm": 0.9104009866714478,
"learning_rate": 6.726315789473685e-05,
"loss": 0.0884,
"step": 640
},
{
"epoch": 0.62320230105465,
"grad_norm": 0.6089403629302979,
"learning_rate": 6.83157894736842e-05,
"loss": 0.0835,
"step": 650
},
{
"epoch": 0.6327900287631831,
"grad_norm": 0.8488327860832214,
"learning_rate": 6.936842105263158e-05,
"loss": 0.0812,
"step": 660
},
{
"epoch": 0.6423777564717162,
"grad_norm": 1.121718168258667,
"learning_rate": 7.042105263157895e-05,
"loss": 0.0979,
"step": 670
},
{
"epoch": 0.6519654841802492,
"grad_norm": 0.554762065410614,
"learning_rate": 7.147368421052631e-05,
"loss": 0.0941,
"step": 680
},
{
"epoch": 0.6615532118887824,
"grad_norm": 0.8173949718475342,
"learning_rate": 7.252631578947369e-05,
"loss": 0.09,
"step": 690
},
{
"epoch": 0.6711409395973155,
"grad_norm": 0.9960802793502808,
"learning_rate": 7.357894736842106e-05,
"loss": 0.0969,
"step": 700
},
{
"epoch": 0.6807286673058485,
"grad_norm": 0.9952852725982666,
"learning_rate": 7.463157894736844e-05,
"loss": 0.0927,
"step": 710
},
{
"epoch": 0.6903163950143816,
"grad_norm": 1.1024588346481323,
"learning_rate": 7.56842105263158e-05,
"loss": 0.0874,
"step": 720
},
{
"epoch": 0.6999041227229147,
"grad_norm": 0.7529568672180176,
"learning_rate": 7.673684210526316e-05,
"loss": 0.0853,
"step": 730
},
{
"epoch": 0.7094918504314478,
"grad_norm": 0.8373092412948608,
"learning_rate": 7.778947368421053e-05,
"loss": 0.0783,
"step": 740
},
{
"epoch": 0.7190795781399808,
"grad_norm": 0.6158662438392639,
"learning_rate": 7.884210526315789e-05,
"loss": 0.0872,
"step": 750
},
{
"epoch": 0.7286673058485139,
"grad_norm": 0.7315576076507568,
"learning_rate": 7.989473684210527e-05,
"loss": 0.0841,
"step": 760
},
{
"epoch": 0.738255033557047,
"grad_norm": 0.5791612267494202,
"learning_rate": 8.094736842105264e-05,
"loss": 0.0706,
"step": 770
},
{
"epoch": 0.74784276126558,
"grad_norm": 0.8657413721084595,
"learning_rate": 8.2e-05,
"loss": 0.0689,
"step": 780
},
{
"epoch": 0.7574304889741131,
"grad_norm": 0.9742875695228577,
"learning_rate": 8.305263157894737e-05,
"loss": 0.0869,
"step": 790
},
{
"epoch": 0.7670182166826462,
"grad_norm": 0.7406681776046753,
"learning_rate": 8.410526315789475e-05,
"loss": 0.0869,
"step": 800
},
{
"epoch": 0.7766059443911792,
"grad_norm": 1.168278455734253,
"learning_rate": 8.515789473684211e-05,
"loss": 0.0803,
"step": 810
},
{
"epoch": 0.7861936720997124,
"grad_norm": 1.1049866676330566,
"learning_rate": 8.621052631578947e-05,
"loss": 0.0851,
"step": 820
},
{
"epoch": 0.7957813998082455,
"grad_norm": 0.9790105223655701,
"learning_rate": 8.726315789473684e-05,
"loss": 0.0788,
"step": 830
},
{
"epoch": 0.8053691275167785,
"grad_norm": 0.762137770652771,
"learning_rate": 8.831578947368422e-05,
"loss": 0.0715,
"step": 840
},
{
"epoch": 0.8149568552253116,
"grad_norm": 0.8730412125587463,
"learning_rate": 8.936842105263158e-05,
"loss": 0.0898,
"step": 850
},
{
"epoch": 0.8245445829338447,
"grad_norm": 1.1794781684875488,
"learning_rate": 9.042105263157895e-05,
"loss": 0.0798,
"step": 860
},
{
"epoch": 0.8341323106423778,
"grad_norm": 0.7828540205955505,
"learning_rate": 9.147368421052633e-05,
"loss": 0.0848,
"step": 870
},
{
"epoch": 0.8437200383509108,
"grad_norm": 0.7496788501739502,
"learning_rate": 9.252631578947369e-05,
"loss": 0.0836,
"step": 880
},
{
"epoch": 0.8533077660594439,
"grad_norm": 0.7298113703727722,
"learning_rate": 9.357894736842106e-05,
"loss": 0.0804,
"step": 890
},
{
"epoch": 0.862895493767977,
"grad_norm": 0.7915740609169006,
"learning_rate": 9.463157894736842e-05,
"loss": 0.0978,
"step": 900
},
{
"epoch": 0.87248322147651,
"grad_norm": 0.6587068438529968,
"learning_rate": 9.56842105263158e-05,
"loss": 0.0823,
"step": 910
},
{
"epoch": 0.8820709491850431,
"grad_norm": 0.6733153462409973,
"learning_rate": 9.673684210526316e-05,
"loss": 0.0903,
"step": 920
},
{
"epoch": 0.8916586768935763,
"grad_norm": 0.8253368139266968,
"learning_rate": 9.778947368421053e-05,
"loss": 0.0817,
"step": 930
},
{
"epoch": 0.9012464046021093,
"grad_norm": 0.631831169128418,
"learning_rate": 9.88421052631579e-05,
"loss": 0.0692,
"step": 940
},
{
"epoch": 0.9108341323106424,
"grad_norm": 0.4998478293418884,
"learning_rate": 9.989473684210526e-05,
"loss": 0.08,
"step": 950
},
{
"epoch": 0.9204218600191755,
"grad_norm": 0.5345643162727356,
"learning_rate": 9.999993865625701e-05,
"loss": 0.0707,
"step": 960
},
{
"epoch": 0.9300095877277086,
"grad_norm": 0.496713250875473,
"learning_rate": 9.999972660400536e-05,
"loss": 0.0759,
"step": 970
},
{
"epoch": 0.9395973154362416,
"grad_norm": 0.4693014621734619,
"learning_rate": 9.999936308655709e-05,
"loss": 0.0781,
"step": 980
},
{
"epoch": 0.9491850431447747,
"grad_norm": 0.5775050520896912,
"learning_rate": 9.999884810501344e-05,
"loss": 0.0748,
"step": 990
},
{
"epoch": 0.9587727708533078,
"grad_norm": 0.7837674021720886,
"learning_rate": 9.999818166093444e-05,
"loss": 0.0783,
"step": 1000
},
{
"epoch": 0.9683604985618408,
"grad_norm": 0.6740615367889404,
"learning_rate": 9.999736375633896e-05,
"loss": 0.0799,
"step": 1010
},
{
"epoch": 0.9779482262703739,
"grad_norm": 0.644281268119812,
"learning_rate": 9.999639439370469e-05,
"loss": 0.0875,
"step": 1020
},
{
"epoch": 0.987535953978907,
"grad_norm": 0.6877675652503967,
"learning_rate": 9.999527357596816e-05,
"loss": 0.0702,
"step": 1030
},
{
"epoch": 0.99712368168744,
"grad_norm": 0.8206673860549927,
"learning_rate": 9.999400130652465e-05,
"loss": 0.0705,
"step": 1040
},
{
"epoch": 1.0067114093959733,
"grad_norm": 0.5425058007240295,
"learning_rate": 9.999257758922833e-05,
"loss": 0.0773,
"step": 1050
},
{
"epoch": 1.0162991371045063,
"grad_norm": 0.7658944725990295,
"learning_rate": 9.999100242839203e-05,
"loss": 0.0777,
"step": 1060
},
{
"epoch": 1.0258868648130393,
"grad_norm": 0.73934006690979,
"learning_rate": 9.998927582878747e-05,
"loss": 0.0685,
"step": 1070
},
{
"epoch": 1.0354745925215725,
"grad_norm": 0.38501349091529846,
"learning_rate": 9.998739779564506e-05,
"loss": 0.069,
"step": 1080
},
{
"epoch": 1.0450623202301055,
"grad_norm": 0.45449578762054443,
"learning_rate": 9.998536833465394e-05,
"loss": 0.0559,
"step": 1090
},
{
"epoch": 1.0546500479386385,
"grad_norm": 0.8127736449241638,
"learning_rate": 9.998318745196203e-05,
"loss": 0.068,
"step": 1100
},
{
"epoch": 1.0642377756471717,
"grad_norm": 0.6800121068954468,
"learning_rate": 9.998085515417588e-05,
"loss": 0.0683,
"step": 1110
},
{
"epoch": 1.0738255033557047,
"grad_norm": 0.688755214214325,
"learning_rate": 9.997837144836082e-05,
"loss": 0.0619,
"step": 1120
},
{
"epoch": 1.0834132310642377,
"grad_norm": 0.6529737710952759,
"learning_rate": 9.997573634204074e-05,
"loss": 0.0716,
"step": 1130
},
{
"epoch": 1.093000958772771,
"grad_norm": 0.773915708065033,
"learning_rate": 9.997294984319827e-05,
"loss": 0.0667,
"step": 1140
},
{
"epoch": 1.102588686481304,
"grad_norm": 0.611422061920166,
"learning_rate": 9.997001196027457e-05,
"loss": 0.0695,
"step": 1150
},
{
"epoch": 1.112176414189837,
"grad_norm": 0.6238502264022827,
"learning_rate": 9.996692270216947e-05,
"loss": 0.0632,
"step": 1160
},
{
"epoch": 1.1217641418983701,
"grad_norm": 0.6252961158752441,
"learning_rate": 9.996368207824128e-05,
"loss": 0.0708,
"step": 1170
},
{
"epoch": 1.1313518696069031,
"grad_norm": 0.3486538529396057,
"learning_rate": 9.996029009830689e-05,
"loss": 0.0662,
"step": 1180
},
{
"epoch": 1.1409395973154361,
"grad_norm": 0.40418991446495056,
"learning_rate": 9.995674677264173e-05,
"loss": 0.0591,
"step": 1190
},
{
"epoch": 1.1505273250239694,
"grad_norm": 0.4740557074546814,
"learning_rate": 9.995305211197965e-05,
"loss": 0.0701,
"step": 1200
},
{
"epoch": 1.1601150527325024,
"grad_norm": 0.713366687297821,
"learning_rate": 9.994920612751295e-05,
"loss": 0.073,
"step": 1210
},
{
"epoch": 1.1697027804410354,
"grad_norm": 0.6612546443939209,
"learning_rate": 9.994520883089238e-05,
"loss": 0.0681,
"step": 1220
},
{
"epoch": 1.1792905081495686,
"grad_norm": 0.6933987736701965,
"learning_rate": 9.994106023422699e-05,
"loss": 0.0655,
"step": 1230
},
{
"epoch": 1.1888782358581016,
"grad_norm": 0.4890410602092743,
"learning_rate": 9.993676035008423e-05,
"loss": 0.0633,
"step": 1240
},
{
"epoch": 1.1984659635666346,
"grad_norm": 0.5587823987007141,
"learning_rate": 9.993230919148985e-05,
"loss": 0.0656,
"step": 1250
},
{
"epoch": 1.2080536912751678,
"grad_norm": 0.6635778546333313,
"learning_rate": 9.99277067719278e-05,
"loss": 0.0603,
"step": 1260
},
{
"epoch": 1.2176414189837008,
"grad_norm": 0.6514385342597961,
"learning_rate": 9.99229531053403e-05,
"loss": 0.0652,
"step": 1270
},
{
"epoch": 1.2272291466922338,
"grad_norm": 0.5782362818717957,
"learning_rate": 9.991804820612773e-05,
"loss": 0.0644,
"step": 1280
},
{
"epoch": 1.236816874400767,
"grad_norm": 0.39845097064971924,
"learning_rate": 9.99129920891486e-05,
"loss": 0.0617,
"step": 1290
},
{
"epoch": 1.2464046021093,
"grad_norm": 0.5628125667572021,
"learning_rate": 9.990778476971951e-05,
"loss": 0.0613,
"step": 1300
},
{
"epoch": 1.255992329817833,
"grad_norm": 0.4811013340950012,
"learning_rate": 9.99024262636151e-05,
"loss": 0.0644,
"step": 1310
},
{
"epoch": 1.2655800575263663,
"grad_norm": 0.540348470211029,
"learning_rate": 9.989691658706798e-05,
"loss": 0.063,
"step": 1320
},
{
"epoch": 1.2751677852348993,
"grad_norm": 0.593609631061554,
"learning_rate": 9.989125575676876e-05,
"loss": 0.0537,
"step": 1330
},
{
"epoch": 1.2847555129434325,
"grad_norm": 0.4400087296962738,
"learning_rate": 9.988544378986591e-05,
"loss": 0.0634,
"step": 1340
},
{
"epoch": 1.2943432406519655,
"grad_norm": 0.7038517594337463,
"learning_rate": 9.987948070396571e-05,
"loss": 0.0564,
"step": 1350
},
{
"epoch": 1.3039309683604985,
"grad_norm": 0.4805976450443268,
"learning_rate": 9.987336651713229e-05,
"loss": 0.0604,
"step": 1360
},
{
"epoch": 1.3135186960690317,
"grad_norm": 0.5478856563568115,
"learning_rate": 9.986710124788745e-05,
"loss": 0.0573,
"step": 1370
},
{
"epoch": 1.3231064237775647,
"grad_norm": 0.6592814922332764,
"learning_rate": 9.986068491521072e-05,
"loss": 0.0604,
"step": 1380
},
{
"epoch": 1.332694151486098,
"grad_norm": 0.7848181128501892,
"learning_rate": 9.985411753853921e-05,
"loss": 0.055,
"step": 1390
},
{
"epoch": 1.342281879194631,
"grad_norm": 0.40262654423713684,
"learning_rate": 9.984739913776765e-05,
"loss": 0.0629,
"step": 1400
},
{
"epoch": 1.351869606903164,
"grad_norm": 0.6241422295570374,
"learning_rate": 9.984052973324817e-05,
"loss": 0.0609,
"step": 1410
},
{
"epoch": 1.3614573346116972,
"grad_norm": 0.7500850558280945,
"learning_rate": 9.983350934579046e-05,
"loss": 0.0742,
"step": 1420
},
{
"epoch": 1.3710450623202302,
"grad_norm": 0.6990365386009216,
"learning_rate": 9.982633799666146e-05,
"loss": 0.0605,
"step": 1430
},
{
"epoch": 1.3806327900287632,
"grad_norm": 0.5741100311279297,
"learning_rate": 9.981901570758554e-05,
"loss": 0.0639,
"step": 1440
},
{
"epoch": 1.3902205177372964,
"grad_norm": 0.6131389141082764,
"learning_rate": 9.981154250074422e-05,
"loss": 0.0695,
"step": 1450
},
{
"epoch": 1.3998082454458294,
"grad_norm": 0.6654881834983826,
"learning_rate": 9.980391839877628e-05,
"loss": 0.0755,
"step": 1460
},
{
"epoch": 1.4093959731543624,
"grad_norm": 0.5249256491661072,
"learning_rate": 9.979614342477753e-05,
"loss": 0.0613,
"step": 1470
},
{
"epoch": 1.4189837008628956,
"grad_norm": 0.5373178124427795,
"learning_rate": 9.978821760230086e-05,
"loss": 0.072,
"step": 1480
},
{
"epoch": 1.4285714285714286,
"grad_norm": 0.4792821407318115,
"learning_rate": 9.978014095535615e-05,
"loss": 0.0549,
"step": 1490
},
{
"epoch": 1.4381591562799616,
"grad_norm": 0.5644699931144714,
"learning_rate": 9.977191350841016e-05,
"loss": 0.065,
"step": 1500
},
{
"epoch": 1.4477468839884948,
"grad_norm": 0.374956339597702,
"learning_rate": 9.976353528638642e-05,
"loss": 0.0545,
"step": 1510
},
{
"epoch": 1.4573346116970278,
"grad_norm": 0.4185064733028412,
"learning_rate": 9.975500631466527e-05,
"loss": 0.0619,
"step": 1520
},
{
"epoch": 1.4669223394055608,
"grad_norm": 0.3903638422489166,
"learning_rate": 9.974632661908372e-05,
"loss": 0.0526,
"step": 1530
},
{
"epoch": 1.476510067114094,
"grad_norm": 0.45104435086250305,
"learning_rate": 9.973749622593534e-05,
"loss": 0.061,
"step": 1540
},
{
"epoch": 1.486097794822627,
"grad_norm": 0.4152944087982178,
"learning_rate": 9.972851516197019e-05,
"loss": 0.0635,
"step": 1550
},
{
"epoch": 1.49568552253116,
"grad_norm": 0.5824716091156006,
"learning_rate": 9.971938345439484e-05,
"loss": 0.0598,
"step": 1560
},
{
"epoch": 1.5052732502396933,
"grad_norm": 0.5598675608634949,
"learning_rate": 9.971010113087212e-05,
"loss": 0.0529,
"step": 1570
},
{
"epoch": 1.5148609779482263,
"grad_norm": 0.6759763956069946,
"learning_rate": 9.970066821952118e-05,
"loss": 0.0687,
"step": 1580
},
{
"epoch": 1.5244487056567593,
"grad_norm": 0.4682703912258148,
"learning_rate": 9.969108474891732e-05,
"loss": 0.0557,
"step": 1590
},
{
"epoch": 1.5340364333652925,
"grad_norm": 0.6091550588607788,
"learning_rate": 9.968135074809194e-05,
"loss": 0.0628,
"step": 1600
},
{
"epoch": 1.5436241610738255,
"grad_norm": 0.5167152881622314,
"learning_rate": 9.96714662465325e-05,
"loss": 0.056,
"step": 1610
},
{
"epoch": 1.5532118887823585,
"grad_norm": 0.5612486004829407,
"learning_rate": 9.966143127418225e-05,
"loss": 0.0565,
"step": 1620
},
{
"epoch": 1.5627996164908917,
"grad_norm": 0.3620167672634125,
"learning_rate": 9.965124586144039e-05,
"loss": 0.0533,
"step": 1630
},
{
"epoch": 1.5723873441994247,
"grad_norm": 0.6704486012458801,
"learning_rate": 9.964091003916179e-05,
"loss": 0.0633,
"step": 1640
},
{
"epoch": 1.5819750719079577,
"grad_norm": 0.6531718969345093,
"learning_rate": 9.963042383865694e-05,
"loss": 0.0665,
"step": 1650
},
{
"epoch": 1.591562799616491,
"grad_norm": 0.5249754786491394,
"learning_rate": 9.961978729169192e-05,
"loss": 0.0471,
"step": 1660
},
{
"epoch": 1.601150527325024,
"grad_norm": 0.4377578794956207,
"learning_rate": 9.960900043048826e-05,
"loss": 0.0561,
"step": 1670
},
{
"epoch": 1.610738255033557,
"grad_norm": 0.34821832180023193,
"learning_rate": 9.959806328772279e-05,
"loss": 0.0575,
"step": 1680
},
{
"epoch": 1.6203259827420902,
"grad_norm": 0.41964197158813477,
"learning_rate": 9.958697589652763e-05,
"loss": 0.0552,
"step": 1690
},
{
"epoch": 1.6299137104506232,
"grad_norm": 0.5038737058639526,
"learning_rate": 9.957573829049004e-05,
"loss": 0.0571,
"step": 1700
},
{
"epoch": 1.6395014381591562,
"grad_norm": 0.5568312406539917,
"learning_rate": 9.956435050365233e-05,
"loss": 0.0535,
"step": 1710
},
{
"epoch": 1.6490891658676894,
"grad_norm": 0.3089469075202942,
"learning_rate": 9.955281257051178e-05,
"loss": 0.0567,
"step": 1720
},
{
"epoch": 1.6586768935762224,
"grad_norm": 0.5025231838226318,
"learning_rate": 9.954112452602045e-05,
"loss": 0.0595,
"step": 1730
},
{
"epoch": 1.6682646212847554,
"grad_norm": 0.6473100185394287,
"learning_rate": 9.952928640558519e-05,
"loss": 0.0583,
"step": 1740
},
{
"epoch": 1.6778523489932886,
"grad_norm": 0.38910412788391113,
"learning_rate": 9.951729824506745e-05,
"loss": 0.0606,
"step": 1750
},
{
"epoch": 1.6874400767018218,
"grad_norm": 0.5367538332939148,
"learning_rate": 9.950516008078325e-05,
"loss": 0.0658,
"step": 1760
},
{
"epoch": 1.6970278044103546,
"grad_norm": 0.5526398420333862,
"learning_rate": 9.949287194950293e-05,
"loss": 0.0554,
"step": 1770
},
{
"epoch": 1.7066155321188878,
"grad_norm": 0.5616441369056702,
"learning_rate": 9.948043388845121e-05,
"loss": 0.0579,
"step": 1780
},
{
"epoch": 1.716203259827421,
"grad_norm": 0.41163280606269836,
"learning_rate": 9.946784593530694e-05,
"loss": 0.0612,
"step": 1790
},
{
"epoch": 1.7257909875359538,
"grad_norm": 0.45861759781837463,
"learning_rate": 9.945510812820308e-05,
"loss": 0.0524,
"step": 1800
},
{
"epoch": 1.735378715244487,
"grad_norm": 0.4847518503665924,
"learning_rate": 9.944222050572653e-05,
"loss": 0.0545,
"step": 1810
},
{
"epoch": 1.7449664429530203,
"grad_norm": 0.36065423488616943,
"learning_rate": 9.942918310691803e-05,
"loss": 0.0503,
"step": 1820
},
{
"epoch": 1.754554170661553,
"grad_norm": 0.5361629128456116,
"learning_rate": 9.941599597127202e-05,
"loss": 0.0582,
"step": 1830
},
{
"epoch": 1.7641418983700863,
"grad_norm": 0.290815532207489,
"learning_rate": 9.940265913873657e-05,
"loss": 0.0626,
"step": 1840
},
{
"epoch": 1.7737296260786195,
"grad_norm": 0.3743116855621338,
"learning_rate": 9.938917264971324e-05,
"loss": 0.0577,
"step": 1850
},
{
"epoch": 1.7833173537871523,
"grad_norm": 0.7040207982063293,
"learning_rate": 9.937553654505691e-05,
"loss": 0.0625,
"step": 1860
},
{
"epoch": 1.7929050814956855,
"grad_norm": 0.4356692135334015,
"learning_rate": 9.936175086607572e-05,
"loss": 0.0616,
"step": 1870
},
{
"epoch": 1.8024928092042187,
"grad_norm": 0.3443772494792938,
"learning_rate": 9.934781565453089e-05,
"loss": 0.0573,
"step": 1880
},
{
"epoch": 1.8120805369127517,
"grad_norm": 0.4956841766834259,
"learning_rate": 9.933373095263667e-05,
"loss": 0.0528,
"step": 1890
},
{
"epoch": 1.8216682646212847,
"grad_norm": 0.5193634629249573,
"learning_rate": 9.931949680306012e-05,
"loss": 0.0548,
"step": 1900
},
{
"epoch": 1.831255992329818,
"grad_norm": 0.3799174129962921,
"learning_rate": 9.930511324892104e-05,
"loss": 0.0563,
"step": 1910
},
{
"epoch": 1.840843720038351,
"grad_norm": 0.3923283815383911,
"learning_rate": 9.929058033379181e-05,
"loss": 0.0595,
"step": 1920
},
{
"epoch": 1.850431447746884,
"grad_norm": 0.47552716732025146,
"learning_rate": 9.927589810169733e-05,
"loss": 0.0546,
"step": 1930
},
{
"epoch": 1.8600191754554172,
"grad_norm": 0.4305611848831177,
"learning_rate": 9.926106659711476e-05,
"loss": 0.0523,
"step": 1940
},
{
"epoch": 1.8696069031639502,
"grad_norm": 0.5576485395431519,
"learning_rate": 9.924608586497348e-05,
"loss": 0.0574,
"step": 1950
},
{
"epoch": 1.8791946308724832,
"grad_norm": 0.31708958745002747,
"learning_rate": 9.923095595065494e-05,
"loss": 0.0482,
"step": 1960
},
{
"epoch": 1.8887823585810164,
"grad_norm": 0.41617056727409363,
"learning_rate": 9.921567689999247e-05,
"loss": 0.0584,
"step": 1970
},
{
"epoch": 1.8983700862895494,
"grad_norm": 0.5047758221626282,
"learning_rate": 9.920024875927125e-05,
"loss": 0.0642,
"step": 1980
},
{
"epoch": 1.9079578139980824,
"grad_norm": 0.4173164367675781,
"learning_rate": 9.918467157522805e-05,
"loss": 0.0548,
"step": 1990
},
{
"epoch": 1.9175455417066156,
"grad_norm": 0.4640159010887146,
"learning_rate": 9.916894539505115e-05,
"loss": 0.0499,
"step": 2000
},
{
"epoch": 1.9271332694151486,
"grad_norm": 0.41713109612464905,
"learning_rate": 9.915307026638018e-05,
"loss": 0.0491,
"step": 2010
},
{
"epoch": 1.9367209971236816,
"grad_norm": 0.392994225025177,
"learning_rate": 9.9137046237306e-05,
"loss": 0.0522,
"step": 2020
},
{
"epoch": 1.9463087248322148,
"grad_norm": 0.32308030128479004,
"learning_rate": 9.912087335637054e-05,
"loss": 0.0557,
"step": 2030
},
{
"epoch": 1.9558964525407478,
"grad_norm": 0.406943678855896,
"learning_rate": 9.910455167256663e-05,
"loss": 0.0523,
"step": 2040
},
{
"epoch": 1.9654841802492808,
"grad_norm": 0.3809382915496826,
"learning_rate": 9.908808123533787e-05,
"loss": 0.0567,
"step": 2050
},
{
"epoch": 1.975071907957814,
"grad_norm": 0.3431997299194336,
"learning_rate": 9.907146209457852e-05,
"loss": 0.0456,
"step": 2060
},
{
"epoch": 1.984659635666347,
"grad_norm": 0.37939101457595825,
"learning_rate": 9.905469430063325e-05,
"loss": 0.0479,
"step": 2070
},
{
"epoch": 1.99424736337488,
"grad_norm": 0.492702841758728,
"learning_rate": 9.903777790429714e-05,
"loss": 0.048,
"step": 2080
},
{
"epoch": 2.0038350910834133,
"grad_norm": 0.41130146384239197,
"learning_rate": 9.90207129568153e-05,
"loss": 0.0545,
"step": 2090
},
{
"epoch": 2.0134228187919465,
"grad_norm": 0.5280721187591553,
"learning_rate": 9.900349950988297e-05,
"loss": 0.0516,
"step": 2100
},
{
"epoch": 2.0230105465004793,
"grad_norm": 0.3090174198150635,
"learning_rate": 9.89861376156452e-05,
"loss": 0.043,
"step": 2110
},
{
"epoch": 2.0325982742090125,
"grad_norm": 0.35579144954681396,
"learning_rate": 9.896862732669671e-05,
"loss": 0.0584,
"step": 2120
},
{
"epoch": 2.0421860019175457,
"grad_norm": 0.44842928647994995,
"learning_rate": 9.89509686960818e-05,
"loss": 0.0523,
"step": 2130
},
{
"epoch": 2.0517737296260785,
"grad_norm": 0.4050745666027069,
"learning_rate": 9.893316177729411e-05,
"loss": 0.0529,
"step": 2140
},
{
"epoch": 2.0613614573346117,
"grad_norm": 0.2710857093334198,
"learning_rate": 9.891520662427651e-05,
"loss": 0.0582,
"step": 2150
},
{
"epoch": 2.070949185043145,
"grad_norm": 0.327932745218277,
"learning_rate": 9.88971032914209e-05,
"loss": 0.056,
"step": 2160
},
{
"epoch": 2.0805369127516777,
"grad_norm": 0.41889169812202454,
"learning_rate": 9.887885183356809e-05,
"loss": 0.0449,
"step": 2170
},
{
"epoch": 2.090124640460211,
"grad_norm": 0.37824153900146484,
"learning_rate": 9.886045230600757e-05,
"loss": 0.0478,
"step": 2180
},
{
"epoch": 2.099712368168744,
"grad_norm": 0.4298747479915619,
"learning_rate": 9.884190476447746e-05,
"loss": 0.0479,
"step": 2190
},
{
"epoch": 2.109300095877277,
"grad_norm": 0.5047415494918823,
"learning_rate": 9.882320926516416e-05,
"loss": 0.0509,
"step": 2200
},
{
"epoch": 2.11888782358581,
"grad_norm": 0.3802444338798523,
"learning_rate": 9.880436586470234e-05,
"loss": 0.0469,
"step": 2210
},
{
"epoch": 2.1284755512943434,
"grad_norm": 0.3608779311180115,
"learning_rate": 9.87853746201747e-05,
"loss": 0.0499,
"step": 2220
},
{
"epoch": 2.138063279002876,
"grad_norm": 0.49108660221099854,
"learning_rate": 9.876623558911181e-05,
"loss": 0.0494,
"step": 2230
},
{
"epoch": 2.1476510067114094,
"grad_norm": 0.35984379053115845,
"learning_rate": 9.874694882949194e-05,
"loss": 0.0513,
"step": 2240
},
{
"epoch": 2.1572387344199426,
"grad_norm": 0.6457746624946594,
"learning_rate": 9.872751439974084e-05,
"loss": 0.0497,
"step": 2250
},
{
"epoch": 2.1668264621284754,
"grad_norm": 0.4572752118110657,
"learning_rate": 9.870793235873164e-05,
"loss": 0.0497,
"step": 2260
},
{
"epoch": 2.1764141898370086,
"grad_norm": 0.5329883098602295,
"learning_rate": 9.868820276578463e-05,
"loss": 0.0597,
"step": 2270
},
{
"epoch": 2.186001917545542,
"grad_norm": 0.4147273302078247,
"learning_rate": 9.866832568066706e-05,
"loss": 0.0537,
"step": 2280
},
{
"epoch": 2.1955896452540746,
"grad_norm": 0.3269449770450592,
"learning_rate": 9.864830116359299e-05,
"loss": 0.0541,
"step": 2290
},
{
"epoch": 2.205177372962608,
"grad_norm": 0.38033929467201233,
"learning_rate": 9.862812927522309e-05,
"loss": 0.0493,
"step": 2300
},
{
"epoch": 2.214765100671141,
"grad_norm": 0.39863190054893494,
"learning_rate": 9.86078100766645e-05,
"loss": 0.0582,
"step": 2310
},
{
"epoch": 2.224352828379674,
"grad_norm": 0.3785865604877472,
"learning_rate": 9.858734362947056e-05,
"loss": 0.0451,
"step": 2320
},
{
"epoch": 2.233940556088207,
"grad_norm": 0.3535449802875519,
"learning_rate": 9.856672999564072e-05,
"loss": 0.0569,
"step": 2330
},
{
"epoch": 2.2435282837967403,
"grad_norm": 0.43401646614074707,
"learning_rate": 9.854596923762026e-05,
"loss": 0.0451,
"step": 2340
},
{
"epoch": 2.253116011505273,
"grad_norm": 0.3438590466976166,
"learning_rate": 9.852506141830018e-05,
"loss": 0.0527,
"step": 2350
},
{
"epoch": 2.2627037392138063,
"grad_norm": 0.524154543876648,
"learning_rate": 9.850400660101698e-05,
"loss": 0.0536,
"step": 2360
},
{
"epoch": 2.2722914669223395,
"grad_norm": 0.6278344392776489,
"learning_rate": 9.848280484955243e-05,
"loss": 0.0566,
"step": 2370
},
{
"epoch": 2.2818791946308723,
"grad_norm": 0.45389410853385925,
"learning_rate": 9.846145622813343e-05,
"loss": 0.0538,
"step": 2380
},
{
"epoch": 2.2914669223394055,
"grad_norm": 0.3653407692909241,
"learning_rate": 9.843996080143181e-05,
"loss": 0.0496,
"step": 2390
},
{
"epoch": 2.3010546500479387,
"grad_norm": 0.39420798420906067,
"learning_rate": 9.84183186345641e-05,
"loss": 0.0507,
"step": 2400
},
{
"epoch": 2.310642377756472,
"grad_norm": 0.36511731147766113,
"learning_rate": 9.839652979309135e-05,
"loss": 0.0415,
"step": 2410
},
{
"epoch": 2.3202301054650047,
"grad_norm": 0.6739844679832458,
"learning_rate": 9.837459434301896e-05,
"loss": 0.0497,
"step": 2420
},
{
"epoch": 2.329817833173538,
"grad_norm": 0.3520050346851349,
"learning_rate": 9.835251235079643e-05,
"loss": 0.0476,
"step": 2430
},
{
"epoch": 2.3394055608820707,
"grad_norm": 0.3880830705165863,
"learning_rate": 9.833028388331719e-05,
"loss": 0.0477,
"step": 2440
},
{
"epoch": 2.348993288590604,
"grad_norm": 0.5605785250663757,
"learning_rate": 9.830790900791842e-05,
"loss": 0.0565,
"step": 2450
},
{
"epoch": 2.358581016299137,
"grad_norm": 0.43835964798927307,
"learning_rate": 9.828538779238074e-05,
"loss": 0.0481,
"step": 2460
},
{
"epoch": 2.3681687440076704,
"grad_norm": 0.46309876441955566,
"learning_rate": 9.826272030492817e-05,
"loss": 0.0459,
"step": 2470
},
{
"epoch": 2.377756471716203,
"grad_norm": 0.315773606300354,
"learning_rate": 9.823990661422778e-05,
"loss": 0.0446,
"step": 2480
},
{
"epoch": 2.3873441994247364,
"grad_norm": 0.37291958928108215,
"learning_rate": 9.821694678938953e-05,
"loss": 0.0394,
"step": 2490
},
{
"epoch": 2.396931927133269,
"grad_norm": 0.5233327150344849,
"learning_rate": 9.819384089996613e-05,
"loss": 0.0494,
"step": 2500
},
{
"epoch": 2.4065196548418024,
"grad_norm": 0.33032602071762085,
"learning_rate": 9.817058901595269e-05,
"loss": 0.0586,
"step": 2510
},
{
"epoch": 2.4161073825503356,
"grad_norm": 0.39209842681884766,
"learning_rate": 9.814719120778663e-05,
"loss": 0.0528,
"step": 2520
},
{
"epoch": 2.425695110258869,
"grad_norm": 0.3824262320995331,
"learning_rate": 9.81236475463474e-05,
"loss": 0.0502,
"step": 2530
},
{
"epoch": 2.4352828379674016,
"grad_norm": 0.4724734127521515,
"learning_rate": 9.809995810295633e-05,
"loss": 0.0538,
"step": 2540
},
{
"epoch": 2.444870565675935,
"grad_norm": 0.4816121459007263,
"learning_rate": 9.80761229493763e-05,
"loss": 0.0599,
"step": 2550
},
{
"epoch": 2.4544582933844676,
"grad_norm": 0.4902478754520416,
"learning_rate": 9.805214215781165e-05,
"loss": 0.0579,
"step": 2560
},
{
"epoch": 2.464046021093001,
"grad_norm": 0.4263833463191986,
"learning_rate": 9.802801580090785e-05,
"loss": 0.0496,
"step": 2570
},
{
"epoch": 2.473633748801534,
"grad_norm": 0.4122842848300934,
"learning_rate": 9.800374395175143e-05,
"loss": 0.0601,
"step": 2580
},
{
"epoch": 2.4832214765100673,
"grad_norm": 0.3193143308162689,
"learning_rate": 9.797932668386955e-05,
"loss": 0.0453,
"step": 2590
},
{
"epoch": 2.4928092042186,
"grad_norm": 0.302079439163208,
"learning_rate": 9.795476407122994e-05,
"loss": 0.0526,
"step": 2600
},
{
"epoch": 2.5023969319271333,
"grad_norm": 0.3169849216938019,
"learning_rate": 9.793005618824066e-05,
"loss": 0.0475,
"step": 2610
},
{
"epoch": 2.511984659635666,
"grad_norm": 0.35016322135925293,
"learning_rate": 9.790520310974978e-05,
"loss": 0.0523,
"step": 2620
},
{
"epoch": 2.5215723873441993,
"grad_norm": 0.5532832741737366,
"learning_rate": 9.788020491104524e-05,
"loss": 0.0516,
"step": 2630
},
{
"epoch": 2.5311601150527325,
"grad_norm": 0.48316141963005066,
"learning_rate": 9.785506166785461e-05,
"loss": 0.0455,
"step": 2640
},
{
"epoch": 2.5407478427612658,
"grad_norm": 0.53989177942276,
"learning_rate": 9.78297734563448e-05,
"loss": 0.05,
"step": 2650
},
{
"epoch": 2.5503355704697985,
"grad_norm": 0.44286760687828064,
"learning_rate": 9.780434035312196e-05,
"loss": 0.0552,
"step": 2660
},
{
"epoch": 2.5599232981783318,
"grad_norm": 0.5638286471366882,
"learning_rate": 9.777876243523108e-05,
"loss": 0.062,
"step": 2670
},
{
"epoch": 2.569511025886865,
"grad_norm": 0.45765963196754456,
"learning_rate": 9.775303978015585e-05,
"loss": 0.0535,
"step": 2680
},
{
"epoch": 2.5790987535953978,
"grad_norm": 0.3893742859363556,
"learning_rate": 9.772717246581848e-05,
"loss": 0.055,
"step": 2690
},
{
"epoch": 2.588686481303931,
"grad_norm": 0.4707334637641907,
"learning_rate": 9.770116057057933e-05,
"loss": 0.055,
"step": 2700
},
{
"epoch": 2.598274209012464,
"grad_norm": 0.4900120198726654,
"learning_rate": 9.767500417323676e-05,
"loss": 0.056,
"step": 2710
},
{
"epoch": 2.607861936720997,
"grad_norm": 0.3331255316734314,
"learning_rate": 9.764870335302689e-05,
"loss": 0.0502,
"step": 2720
},
{
"epoch": 2.61744966442953,
"grad_norm": 0.47928670048713684,
"learning_rate": 9.762225818962336e-05,
"loss": 0.0514,
"step": 2730
},
{
"epoch": 2.6270373921380634,
"grad_norm": 0.3848089873790741,
"learning_rate": 9.759566876313701e-05,
"loss": 0.044,
"step": 2740
},
{
"epoch": 2.636625119846596,
"grad_norm": 0.4957471787929535,
"learning_rate": 9.756893515411574e-05,
"loss": 0.0434,
"step": 2750
},
{
"epoch": 2.6462128475551294,
"grad_norm": 0.5820662975311279,
"learning_rate": 9.754205744354423e-05,
"loss": 0.0484,
"step": 2760
},
{
"epoch": 2.6558005752636626,
"grad_norm": 0.3916762173175812,
"learning_rate": 9.751503571284368e-05,
"loss": 0.0488,
"step": 2770
},
{
"epoch": 2.665388302972196,
"grad_norm": 0.30791330337524414,
"learning_rate": 9.748787004387157e-05,
"loss": 0.0513,
"step": 2780
},
{
"epoch": 2.6749760306807286,
"grad_norm": 0.5171549320220947,
"learning_rate": 9.74605605189214e-05,
"loss": 0.0516,
"step": 2790
},
{
"epoch": 2.684563758389262,
"grad_norm": 0.47496703267097473,
"learning_rate": 9.743310722072251e-05,
"loss": 0.0493,
"step": 2800
},
{
"epoch": 2.6941514860977946,
"grad_norm": 0.5075270533561707,
"learning_rate": 9.74055102324397e-05,
"loss": 0.0489,
"step": 2810
},
{
"epoch": 2.703739213806328,
"grad_norm": 0.4490506052970886,
"learning_rate": 9.737776963767313e-05,
"loss": 0.0576,
"step": 2820
},
{
"epoch": 2.713326941514861,
"grad_norm": 0.3923519551753998,
"learning_rate": 9.734988552045792e-05,
"loss": 0.0513,
"step": 2830
},
{
"epoch": 2.7229146692233943,
"grad_norm": 0.2816771864891052,
"learning_rate": 9.7321857965264e-05,
"loss": 0.0578,
"step": 2840
},
{
"epoch": 2.732502396931927,
"grad_norm": 0.6326708793640137,
"learning_rate": 9.729368705699587e-05,
"loss": 0.0452,
"step": 2850
},
{
"epoch": 2.7420901246404603,
"grad_norm": 0.3657870292663574,
"learning_rate": 9.726537288099215e-05,
"loss": 0.0524,
"step": 2860
},
{
"epoch": 2.751677852348993,
"grad_norm": 0.3347817063331604,
"learning_rate": 9.723691552302562e-05,
"loss": 0.0451,
"step": 2870
},
{
"epoch": 2.7612655800575263,
"grad_norm": 0.4541146457195282,
"learning_rate": 9.720831506930274e-05,
"loss": 0.0487,
"step": 2880
},
{
"epoch": 2.7708533077660595,
"grad_norm": 0.4089963734149933,
"learning_rate": 9.71795716064634e-05,
"loss": 0.0479,
"step": 2890
},
{
"epoch": 2.7804410354745928,
"grad_norm": 0.3474633991718292,
"learning_rate": 9.715068522158081e-05,
"loss": 0.0467,
"step": 2900
},
{
"epoch": 2.7900287631831255,
"grad_norm": 0.49998903274536133,
"learning_rate": 9.712165600216107e-05,
"loss": 0.0579,
"step": 2910
},
{
"epoch": 2.7996164908916588,
"grad_norm": 0.41667240858078003,
"learning_rate": 9.709248403614298e-05,
"loss": 0.0456,
"step": 2920
},
{
"epoch": 2.8092042186001915,
"grad_norm": 0.3876051604747772,
"learning_rate": 9.706316941189779e-05,
"loss": 0.0411,
"step": 2930
},
{
"epoch": 2.8187919463087248,
"grad_norm": 0.34348323941230774,
"learning_rate": 9.703371221822888e-05,
"loss": 0.0463,
"step": 2940
},
{
"epoch": 2.828379674017258,
"grad_norm": 0.5338907241821289,
"learning_rate": 9.700411254437154e-05,
"loss": 0.0476,
"step": 2950
},
{
"epoch": 2.837967401725791,
"grad_norm": 0.5973591804504395,
"learning_rate": 9.697437047999266e-05,
"loss": 0.0531,
"step": 2960
},
{
"epoch": 2.847555129434324,
"grad_norm": 0.31144216656684875,
"learning_rate": 9.694448611519049e-05,
"loss": 0.0494,
"step": 2970
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.4310339391231537,
"learning_rate": 9.691445954049434e-05,
"loss": 0.0448,
"step": 2980
},
{
"epoch": 2.86673058485139,
"grad_norm": 0.36877721548080444,
"learning_rate": 9.688429084686435e-05,
"loss": 0.043,
"step": 2990
},
{
"epoch": 2.876318312559923,
"grad_norm": 0.35387906432151794,
"learning_rate": 9.685398012569115e-05,
"loss": 0.055,
"step": 3000
},
{
"epoch": 2.8859060402684564,
"grad_norm": 0.3781449496746063,
"learning_rate": 9.682352746879562e-05,
"loss": 0.0513,
"step": 3010
},
{
"epoch": 2.8954937679769897,
"grad_norm": 0.3556309938430786,
"learning_rate": 9.679293296842863e-05,
"loss": 0.0556,
"step": 3020
},
{
"epoch": 2.9050814956855224,
"grad_norm": 0.4965471923351288,
"learning_rate": 9.676219671727072e-05,
"loss": 0.0502,
"step": 3030
},
{
"epoch": 2.9146692233940557,
"grad_norm": 0.40289080142974854,
"learning_rate": 9.673131880843185e-05,
"loss": 0.0474,
"step": 3040
},
{
"epoch": 2.9242569511025884,
"grad_norm": 0.3517281115055084,
"learning_rate": 9.67002993354511e-05,
"loss": 0.0557,
"step": 3050
},
{
"epoch": 2.9338446788111217,
"grad_norm": 0.5005010366439819,
"learning_rate": 9.66691383922964e-05,
"loss": 0.059,
"step": 3060
},
{
"epoch": 2.943432406519655,
"grad_norm": 0.36781349778175354,
"learning_rate": 9.66378360733642e-05,
"loss": 0.055,
"step": 3070
},
{
"epoch": 2.953020134228188,
"grad_norm": 0.310249388217926,
"learning_rate": 9.660639247347931e-05,
"loss": 0.0523,
"step": 3080
},
{
"epoch": 2.962607861936721,
"grad_norm": 0.27061378955841064,
"learning_rate": 9.657480768789446e-05,
"loss": 0.0505,
"step": 3090
},
{
"epoch": 2.972195589645254,
"grad_norm": 0.34516626596450806,
"learning_rate": 9.654308181229006e-05,
"loss": 0.0489,
"step": 3100
},
{
"epoch": 2.981783317353787,
"grad_norm": 0.3140753209590912,
"learning_rate": 9.651121494277396e-05,
"loss": 0.0531,
"step": 3110
},
{
"epoch": 2.99137104506232,
"grad_norm": 0.4165388345718384,
"learning_rate": 9.647920717588114e-05,
"loss": 0.0571,
"step": 3120
},
{
"epoch": 3.0009587727708533,
"grad_norm": 0.36014652252197266,
"learning_rate": 9.644705860857339e-05,
"loss": 0.0515,
"step": 3130
},
{
"epoch": 3.0105465004793865,
"grad_norm": 0.4353986382484436,
"learning_rate": 9.641476933823899e-05,
"loss": 0.0488,
"step": 3140
},
{
"epoch": 3.0201342281879193,
"grad_norm": 0.4083373546600342,
"learning_rate": 9.638233946269253e-05,
"loss": 0.052,
"step": 3150
},
{
"epoch": 3.0297219558964525,
"grad_norm": 0.3805656135082245,
"learning_rate": 9.634976908017446e-05,
"loss": 0.0461,
"step": 3160
},
{
"epoch": 3.0393096836049858,
"grad_norm": 0.36862942576408386,
"learning_rate": 9.631705828935092e-05,
"loss": 0.0526,
"step": 3170
},
{
"epoch": 3.0488974113135185,
"grad_norm": 0.4625187814235687,
"learning_rate": 9.628420718931338e-05,
"loss": 0.0536,
"step": 3180
},
{
"epoch": 3.0584851390220518,
"grad_norm": 0.2972494959831238,
"learning_rate": 9.625121587957834e-05,
"loss": 0.0468,
"step": 3190
},
{
"epoch": 3.068072866730585,
"grad_norm": 0.5064423084259033,
"learning_rate": 9.621808446008708e-05,
"loss": 0.0516,
"step": 3200
},
{
"epoch": 3.0776605944391178,
"grad_norm": 0.28751927614212036,
"learning_rate": 9.618481303120528e-05,
"loss": 0.0463,
"step": 3210
},
{
"epoch": 3.087248322147651,
"grad_norm": 0.4198159873485565,
"learning_rate": 9.615140169372274e-05,
"loss": 0.0395,
"step": 3220
},
{
"epoch": 3.096836049856184,
"grad_norm": 0.41463902592658997,
"learning_rate": 9.611785054885312e-05,
"loss": 0.0501,
"step": 3230
},
{
"epoch": 3.106423777564717,
"grad_norm": 0.37878739833831787,
"learning_rate": 9.608415969823361e-05,
"loss": 0.0484,
"step": 3240
},
{
"epoch": 3.11601150527325,
"grad_norm": 0.4990726113319397,
"learning_rate": 9.605032924392457e-05,
"loss": 0.049,
"step": 3250
},
{
"epoch": 3.1255992329817834,
"grad_norm": 0.39530688524246216,
"learning_rate": 9.601635928840927e-05,
"loss": 0.0658,
"step": 3260
},
{
"epoch": 3.135186960690316,
"grad_norm": 0.5206883549690247,
"learning_rate": 9.598224993459364e-05,
"loss": 0.0538,
"step": 3270
},
{
"epoch": 3.1447746883988494,
"grad_norm": 0.5972046256065369,
"learning_rate": 9.594800128580582e-05,
"loss": 0.054,
"step": 3280
},
{
"epoch": 3.1543624161073827,
"grad_norm": 0.33001407980918884,
"learning_rate": 9.591361344579595e-05,
"loss": 0.0544,
"step": 3290
},
{
"epoch": 3.1639501438159154,
"grad_norm": 0.38547295331954956,
"learning_rate": 9.58790865187358e-05,
"loss": 0.0422,
"step": 3300
},
{
"epoch": 3.1735378715244487,
"grad_norm": 0.3369503915309906,
"learning_rate": 9.584442060921851e-05,
"loss": 0.0472,
"step": 3310
},
{
"epoch": 3.183125599232982,
"grad_norm": 0.2815903127193451,
"learning_rate": 9.580961582225826e-05,
"loss": 0.0463,
"step": 3320
},
{
"epoch": 3.1927133269415147,
"grad_norm": 0.42745402455329895,
"learning_rate": 9.577467226328987e-05,
"loss": 0.0517,
"step": 3330
},
{
"epoch": 3.202301054650048,
"grad_norm": 0.46006882190704346,
"learning_rate": 9.573959003816856e-05,
"loss": 0.0494,
"step": 3340
},
{
"epoch": 3.211888782358581,
"grad_norm": 0.47103896737098694,
"learning_rate": 9.57043692531697e-05,
"loss": 0.0511,
"step": 3350
},
{
"epoch": 3.221476510067114,
"grad_norm": 0.41211676597595215,
"learning_rate": 9.566901001498826e-05,
"loss": 0.0512,
"step": 3360
},
{
"epoch": 3.231064237775647,
"grad_norm": 0.5582764148712158,
"learning_rate": 9.563351243073878e-05,
"loss": 0.0584,
"step": 3370
},
{
"epoch": 3.2406519654841803,
"grad_norm": 0.3129172921180725,
"learning_rate": 9.559787660795474e-05,
"loss": 0.0596,
"step": 3380
},
{
"epoch": 3.2502396931927136,
"grad_norm": 0.4259207844734192,
"learning_rate": 9.556210265458854e-05,
"loss": 0.0507,
"step": 3390
},
{
"epoch": 3.2598274209012463,
"grad_norm": 0.29509371519088745,
"learning_rate": 9.552619067901089e-05,
"loss": 0.0519,
"step": 3400
},
{
"epoch": 3.2694151486097796,
"grad_norm": 0.33097851276397705,
"learning_rate": 9.549014079001074e-05,
"loss": 0.0503,
"step": 3410
},
{
"epoch": 3.2790028763183123,
"grad_norm": 0.6283732056617737,
"learning_rate": 9.545395309679469e-05,
"loss": 0.052,
"step": 3420
},
{
"epoch": 3.2885906040268456,
"grad_norm": 0.29192429780960083,
"learning_rate": 9.54176277089869e-05,
"loss": 0.0452,
"step": 3430
},
{
"epoch": 3.2981783317353788,
"grad_norm": 0.3860151767730713,
"learning_rate": 9.538116473662861e-05,
"loss": 0.0536,
"step": 3440
},
{
"epoch": 3.307766059443912,
"grad_norm": 0.5127553343772888,
"learning_rate": 9.534456429017784e-05,
"loss": 0.0521,
"step": 3450
},
{
"epoch": 3.3173537871524448,
"grad_norm": 0.4540964961051941,
"learning_rate": 9.530782648050907e-05,
"loss": 0.0552,
"step": 3460
},
{
"epoch": 3.326941514860978,
"grad_norm": 0.34647271037101746,
"learning_rate": 9.52709514189129e-05,
"loss": 0.0457,
"step": 3470
},
{
"epoch": 3.336529242569511,
"grad_norm": 0.4515313506126404,
"learning_rate": 9.523393921709574e-05,
"loss": 0.0467,
"step": 3480
},
{
"epoch": 3.346116970278044,
"grad_norm": 0.3084343373775482,
"learning_rate": 9.519678998717935e-05,
"loss": 0.0462,
"step": 3490
},
{
"epoch": 3.3557046979865772,
"grad_norm": 0.5871327519416809,
"learning_rate": 9.515950384170073e-05,
"loss": 0.0566,
"step": 3500
},
{
"epoch": 3.3652924256951104,
"grad_norm": 0.4407544732093811,
"learning_rate": 9.51220808936115e-05,
"loss": 0.0436,
"step": 3510
},
{
"epoch": 3.3748801534036432,
"grad_norm": 0.3434475362300873,
"learning_rate": 9.508452125627779e-05,
"loss": 0.0483,
"step": 3520
},
{
"epoch": 3.3844678811121764,
"grad_norm": 0.5896394848823547,
"learning_rate": 9.504682504347978e-05,
"loss": 0.0435,
"step": 3530
},
{
"epoch": 3.3940556088207097,
"grad_norm": 0.380214661359787,
"learning_rate": 9.500899236941139e-05,
"loss": 0.053,
"step": 3540
},
{
"epoch": 3.4036433365292424,
"grad_norm": 0.2878900170326233,
"learning_rate": 9.497102334867989e-05,
"loss": 0.0488,
"step": 3550
},
{
"epoch": 3.4132310642377757,
"grad_norm": 0.6185137629508972,
"learning_rate": 9.493291809630562e-05,
"loss": 0.0512,
"step": 3560
},
{
"epoch": 3.422818791946309,
"grad_norm": 0.5001134872436523,
"learning_rate": 9.489467672772162e-05,
"loss": 0.055,
"step": 3570
},
{
"epoch": 3.4324065196548417,
"grad_norm": 0.46808385848999023,
"learning_rate": 9.485629935877323e-05,
"loss": 0.0524,
"step": 3580
},
{
"epoch": 3.441994247363375,
"grad_norm": 0.4512917399406433,
"learning_rate": 9.481778610571782e-05,
"loss": 0.0487,
"step": 3590
},
{
"epoch": 3.451581975071908,
"grad_norm": 0.39726588129997253,
"learning_rate": 9.477913708522435e-05,
"loss": 0.0578,
"step": 3600
},
{
"epoch": 3.461169702780441,
"grad_norm": 0.32351112365722656,
"learning_rate": 9.474035241437312e-05,
"loss": 0.0488,
"step": 3610
},
{
"epoch": 3.470757430488974,
"grad_norm": 0.47034138441085815,
"learning_rate": 9.470143221065531e-05,
"loss": 0.0618,
"step": 3620
},
{
"epoch": 3.4803451581975073,
"grad_norm": 0.23497724533081055,
"learning_rate": 9.46623765919727e-05,
"loss": 0.0499,
"step": 3630
},
{
"epoch": 3.48993288590604,
"grad_norm": 0.25630268454551697,
"learning_rate": 9.462318567663728e-05,
"loss": 0.0508,
"step": 3640
},
{
"epoch": 3.4995206136145733,
"grad_norm": 0.3957800269126892,
"learning_rate": 9.458385958337087e-05,
"loss": 0.0554,
"step": 3650
},
{
"epoch": 3.5091083413231066,
"grad_norm": 0.25262129306793213,
"learning_rate": 9.454439843130483e-05,
"loss": 0.0473,
"step": 3660
},
{
"epoch": 3.5186960690316393,
"grad_norm": 0.3933389186859131,
"learning_rate": 9.450480233997963e-05,
"loss": 0.0471,
"step": 3670
},
{
"epoch": 3.5282837967401726,
"grad_norm": 0.26438847184181213,
"learning_rate": 9.446507142934452e-05,
"loss": 0.0557,
"step": 3680
},
{
"epoch": 3.537871524448706,
"grad_norm": 0.2720869183540344,
"learning_rate": 9.442520581975718e-05,
"loss": 0.0492,
"step": 3690
},
{
"epoch": 3.547459252157239,
"grad_norm": 0.3165934383869171,
"learning_rate": 9.438520563198328e-05,
"loss": 0.0512,
"step": 3700
},
{
"epoch": 3.557046979865772,
"grad_norm": 0.6523368954658508,
"learning_rate": 9.434507098719624e-05,
"loss": 0.0574,
"step": 3710
},
{
"epoch": 3.566634707574305,
"grad_norm": 0.41401076316833496,
"learning_rate": 9.430480200697676e-05,
"loss": 0.0509,
"step": 3720
},
{
"epoch": 3.576222435282838,
"grad_norm": 0.29742154479026794,
"learning_rate": 9.426439881331248e-05,
"loss": 0.0489,
"step": 3730
},
{
"epoch": 3.585810162991371,
"grad_norm": 0.40217605233192444,
"learning_rate": 9.422386152859763e-05,
"loss": 0.0466,
"step": 3740
},
{
"epoch": 3.5953978906999042,
"grad_norm": 0.3434045612812042,
"learning_rate": 9.418319027563263e-05,
"loss": 0.0575,
"step": 3750
},
{
"epoch": 3.6049856184084375,
"grad_norm": 0.6345980763435364,
"learning_rate": 9.414238517762373e-05,
"loss": 0.0453,
"step": 3760
},
{
"epoch": 3.6145733461169702,
"grad_norm": 0.43346667289733887,
"learning_rate": 9.410144635818266e-05,
"loss": 0.055,
"step": 3770
},
{
"epoch": 3.6241610738255035,
"grad_norm": 0.36115562915802,
"learning_rate": 9.406037394132623e-05,
"loss": 0.0535,
"step": 3780
},
{
"epoch": 3.6337488015340362,
"grad_norm": 0.2766103744506836,
"learning_rate": 9.401916805147596e-05,
"loss": 0.0463,
"step": 3790
},
{
"epoch": 3.6433365292425695,
"grad_norm": 0.39829254150390625,
"learning_rate": 9.397782881345767e-05,
"loss": 0.0463,
"step": 3800
},
{
"epoch": 3.6529242569511027,
"grad_norm": 0.3240996301174164,
"learning_rate": 9.39363563525012e-05,
"loss": 0.0516,
"step": 3810
},
{
"epoch": 3.662511984659636,
"grad_norm": 0.416238009929657,
"learning_rate": 9.389475079423988e-05,
"loss": 0.0483,
"step": 3820
},
{
"epoch": 3.6720997123681687,
"grad_norm": 0.24697421491146088,
"learning_rate": 9.385301226471032e-05,
"loss": 0.0451,
"step": 3830
},
{
"epoch": 3.681687440076702,
"grad_norm": 0.3078657388687134,
"learning_rate": 9.381114089035188e-05,
"loss": 0.0454,
"step": 3840
},
{
"epoch": 3.6912751677852347,
"grad_norm": 0.26055672764778137,
"learning_rate": 9.376913679800638e-05,
"loss": 0.0426,
"step": 3850
},
{
"epoch": 3.700862895493768,
"grad_norm": 0.36363962292671204,
"learning_rate": 9.372700011491768e-05,
"loss": 0.0535,
"step": 3860
},
{
"epoch": 3.710450623202301,
"grad_norm": 0.23066310584545135,
"learning_rate": 9.36847309687313e-05,
"loss": 0.0391,
"step": 3870
},
{
"epoch": 3.7200383509108343,
"grad_norm": 0.35935813188552856,
"learning_rate": 9.364232948749402e-05,
"loss": 0.0404,
"step": 3880
},
{
"epoch": 3.729626078619367,
"grad_norm": 0.42284151911735535,
"learning_rate": 9.359979579965352e-05,
"loss": 0.0456,
"step": 3890
},
{
"epoch": 3.7392138063279003,
"grad_norm": 0.29598748683929443,
"learning_rate": 9.355713003405797e-05,
"loss": 0.0486,
"step": 3900
},
{
"epoch": 3.748801534036433,
"grad_norm": 0.30880895256996155,
"learning_rate": 9.351433231995568e-05,
"loss": 0.0524,
"step": 3910
},
{
"epoch": 3.7583892617449663,
"grad_norm": 0.2683268189430237,
"learning_rate": 9.34714027869946e-05,
"loss": 0.0458,
"step": 3920
},
{
"epoch": 3.7679769894534996,
"grad_norm": 0.3789876401424408,
"learning_rate": 9.342834156522204e-05,
"loss": 0.0529,
"step": 3930
},
{
"epoch": 3.777564717162033,
"grad_norm": 0.2747150957584381,
"learning_rate": 9.338514878508428e-05,
"loss": 0.0474,
"step": 3940
},
{
"epoch": 3.7871524448705656,
"grad_norm": 0.3292723000049591,
"learning_rate": 9.334182457742607e-05,
"loss": 0.0544,
"step": 3950
},
{
"epoch": 3.796740172579099,
"grad_norm": 0.28527846932411194,
"learning_rate": 9.329836907349033e-05,
"loss": 0.0419,
"step": 3960
},
{
"epoch": 3.8063279002876316,
"grad_norm": 0.37766164541244507,
"learning_rate": 9.325478240491771e-05,
"loss": 0.0503,
"step": 3970
},
{
"epoch": 3.815915627996165,
"grad_norm": 0.4285350739955902,
"learning_rate": 9.321106470374618e-05,
"loss": 0.0493,
"step": 3980
},
{
"epoch": 3.825503355704698,
"grad_norm": 0.432804137468338,
"learning_rate": 9.316721610241068e-05,
"loss": 0.0452,
"step": 3990
},
{
"epoch": 3.8350910834132312,
"grad_norm": 0.32709524035453796,
"learning_rate": 9.312323673374269e-05,
"loss": 0.049,
"step": 4000
},
{
"epoch": 3.844678811121764,
"grad_norm": 0.2850819230079651,
"learning_rate": 9.30791267309698e-05,
"loss": 0.0379,
"step": 4010
},
{
"epoch": 3.8542665388302972,
"grad_norm": 0.3472555875778198,
"learning_rate": 9.303488622771535e-05,
"loss": 0.0412,
"step": 4020
},
{
"epoch": 3.8638542665388305,
"grad_norm": 0.545179545879364,
"learning_rate": 9.299051535799799e-05,
"loss": 0.0535,
"step": 4030
},
{
"epoch": 3.8734419942473632,
"grad_norm": 0.43045416474342346,
"learning_rate": 9.29460142562313e-05,
"loss": 0.0564,
"step": 4040
},
{
"epoch": 3.8830297219558965,
"grad_norm": 0.30958643555641174,
"learning_rate": 9.290138305722343e-05,
"loss": 0.0423,
"step": 4050
},
{
"epoch": 3.8926174496644297,
"grad_norm": 0.3504599630832672,
"learning_rate": 9.285662189617652e-05,
"loss": 0.0525,
"step": 4060
},
{
"epoch": 3.9022051773729625,
"grad_norm": 0.5074465870857239,
"learning_rate": 9.281173090868651e-05,
"loss": 0.0505,
"step": 4070
},
{
"epoch": 3.9117929050814957,
"grad_norm": 0.30970317125320435,
"learning_rate": 9.27667102307426e-05,
"loss": 0.0404,
"step": 4080
},
{
"epoch": 3.921380632790029,
"grad_norm": 0.35298407077789307,
"learning_rate": 9.27215599987268e-05,
"loss": 0.0461,
"step": 4090
},
{
"epoch": 3.9309683604985617,
"grad_norm": 0.32086381316185,
"learning_rate": 9.267628034941369e-05,
"loss": 0.0476,
"step": 4100
},
{
"epoch": 3.940556088207095,
"grad_norm": 0.33907032012939453,
"learning_rate": 9.26308714199698e-05,
"loss": 0.0446,
"step": 4110
},
{
"epoch": 3.950143815915628,
"grad_norm": 0.23291510343551636,
"learning_rate": 9.258533334795336e-05,
"loss": 0.0542,
"step": 4120
},
{
"epoch": 3.959731543624161,
"grad_norm": 0.3786979913711548,
"learning_rate": 9.253966627131379e-05,
"loss": 0.049,
"step": 4130
},
{
"epoch": 3.969319271332694,
"grad_norm": 0.4073876142501831,
"learning_rate": 9.249387032839125e-05,
"loss": 0.046,
"step": 4140
},
{
"epoch": 3.9789069990412274,
"grad_norm": 0.3822251856327057,
"learning_rate": 9.244794565791639e-05,
"loss": 0.0472,
"step": 4150
},
{
"epoch": 3.98849472674976,
"grad_norm": 0.43598631024360657,
"learning_rate": 9.240189239900972e-05,
"loss": 0.0388,
"step": 4160
},
{
"epoch": 3.9980824544582934,
"grad_norm": 0.2129432111978531,
"learning_rate": 9.235571069118131e-05,
"loss": 0.0492,
"step": 4170
},
{
"epoch": 4.007670182166827,
"grad_norm": 0.3745039999485016,
"learning_rate": 9.23094006743304e-05,
"loss": 0.0447,
"step": 4180
},
{
"epoch": 4.01725790987536,
"grad_norm": 0.3619850277900696,
"learning_rate": 9.226296248874482e-05,
"loss": 0.0523,
"step": 4190
},
{
"epoch": 4.026845637583893,
"grad_norm": 0.3835139274597168,
"learning_rate": 9.221639627510076e-05,
"loss": 0.048,
"step": 4200
},
{
"epoch": 4.036433365292425,
"grad_norm": 0.28674259781837463,
"learning_rate": 9.216970217446219e-05,
"loss": 0.0387,
"step": 4210
},
{
"epoch": 4.046021093000959,
"grad_norm": 0.25763458013534546,
"learning_rate": 9.21228803282805e-05,
"loss": 0.0506,
"step": 4220
},
{
"epoch": 4.055608820709492,
"grad_norm": 0.36224737763404846,
"learning_rate": 9.207593087839406e-05,
"loss": 0.0453,
"step": 4230
},
{
"epoch": 4.065196548418025,
"grad_norm": 0.38200250267982483,
"learning_rate": 9.202885396702782e-05,
"loss": 0.0431,
"step": 4240
},
{
"epoch": 4.074784276126558,
"grad_norm": 0.336946964263916,
"learning_rate": 9.198164973679285e-05,
"loss": 0.0443,
"step": 4250
},
{
"epoch": 4.0843720038350915,
"grad_norm": 0.3541509807109833,
"learning_rate": 9.193431833068586e-05,
"loss": 0.0499,
"step": 4260
},
{
"epoch": 4.093959731543624,
"grad_norm": 0.3337682783603668,
"learning_rate": 9.188685989208886e-05,
"loss": 0.0474,
"step": 4270
},
{
"epoch": 4.103547459252157,
"grad_norm": 0.4774644076824188,
"learning_rate": 9.183927456476864e-05,
"loss": 0.0413,
"step": 4280
},
{
"epoch": 4.11313518696069,
"grad_norm": 0.3974810540676117,
"learning_rate": 9.179156249287646e-05,
"loss": 0.0495,
"step": 4290
},
{
"epoch": 4.1227229146692235,
"grad_norm": 0.35930874943733215,
"learning_rate": 9.174372382094745e-05,
"loss": 0.0481,
"step": 4300
},
{
"epoch": 4.132310642377757,
"grad_norm": 0.39746561646461487,
"learning_rate": 9.169575869390028e-05,
"loss": 0.0401,
"step": 4310
},
{
"epoch": 4.14189837008629,
"grad_norm": 0.3344055414199829,
"learning_rate": 9.164766725703669e-05,
"loss": 0.0471,
"step": 4320
},
{
"epoch": 4.151486097794822,
"grad_norm": 0.23866185545921326,
"learning_rate": 9.159944965604105e-05,
"loss": 0.0424,
"step": 4330
},
{
"epoch": 4.1610738255033555,
"grad_norm": 0.3230268657207489,
"learning_rate": 9.155110603697996e-05,
"loss": 0.0475,
"step": 4340
},
{
"epoch": 4.170661553211889,
"grad_norm": 0.3797110915184021,
"learning_rate": 9.150263654630172e-05,
"loss": 0.0458,
"step": 4350
},
{
"epoch": 4.180249280920422,
"grad_norm": 0.41824665665626526,
"learning_rate": 9.145404133083591e-05,
"loss": 0.0401,
"step": 4360
},
{
"epoch": 4.189837008628955,
"grad_norm": 0.45811742544174194,
"learning_rate": 9.140532053779307e-05,
"loss": 0.0533,
"step": 4370
},
{
"epoch": 4.199424736337488,
"grad_norm": 0.3115192651748657,
"learning_rate": 9.135647431476407e-05,
"loss": 0.0475,
"step": 4380
},
{
"epoch": 4.209012464046021,
"grad_norm": 0.27874428033828735,
"learning_rate": 9.130750280971978e-05,
"loss": 0.0444,
"step": 4390
},
{
"epoch": 4.218600191754554,
"grad_norm": 0.5270777940750122,
"learning_rate": 9.125840617101058e-05,
"loss": 0.0514,
"step": 4400
},
{
"epoch": 4.228187919463087,
"grad_norm": 0.40683162212371826,
"learning_rate": 9.120918454736593e-05,
"loss": 0.0472,
"step": 4410
},
{
"epoch": 4.23777564717162,
"grad_norm": 0.30064043402671814,
"learning_rate": 9.11598380878939e-05,
"loss": 0.0492,
"step": 4420
},
{
"epoch": 4.247363374880154,
"grad_norm": 0.4496791362762451,
"learning_rate": 9.111036694208072e-05,
"loss": 0.0471,
"step": 4430
},
{
"epoch": 4.256951102588687,
"grad_norm": 0.39262011647224426,
"learning_rate": 9.106077125979037e-05,
"loss": 0.0487,
"step": 4440
},
{
"epoch": 4.26653883029722,
"grad_norm": 0.34774985909461975,
"learning_rate": 9.101105119126405e-05,
"loss": 0.0452,
"step": 4450
},
{
"epoch": 4.276126558005752,
"grad_norm": 0.4597591459751129,
"learning_rate": 9.096120688711978e-05,
"loss": 0.0521,
"step": 4460
},
{
"epoch": 4.285714285714286,
"grad_norm": 0.594453752040863,
"learning_rate": 9.091123849835195e-05,
"loss": 0.0555,
"step": 4470
},
{
"epoch": 4.295302013422819,
"grad_norm": 0.45329248905181885,
"learning_rate": 9.086114617633079e-05,
"loss": 0.0408,
"step": 4480
},
{
"epoch": 4.304889741131352,
"grad_norm": 0.34534817934036255,
"learning_rate": 9.081093007280205e-05,
"loss": 0.0554,
"step": 4490
},
{
"epoch": 4.314477468839885,
"grad_norm": 0.36244168877601624,
"learning_rate": 9.076059033988636e-05,
"loss": 0.0487,
"step": 4500
},
{
"epoch": 4.324065196548418,
"grad_norm": 0.32668572664260864,
"learning_rate": 9.071012713007892e-05,
"loss": 0.0483,
"step": 4510
},
{
"epoch": 4.333652924256951,
"grad_norm": 0.31663575768470764,
"learning_rate": 9.065954059624895e-05,
"loss": 0.0484,
"step": 4520
},
{
"epoch": 4.343240651965484,
"grad_norm": 0.2809025049209595,
"learning_rate": 9.06088308916393e-05,
"loss": 0.042,
"step": 4530
},
{
"epoch": 4.352828379674017,
"grad_norm": 0.2432290017604828,
"learning_rate": 9.05579981698659e-05,
"loss": 0.0463,
"step": 4540
},
{
"epoch": 4.3624161073825505,
"grad_norm": 0.2573339343070984,
"learning_rate": 9.050704258491736e-05,
"loss": 0.0462,
"step": 4550
},
{
"epoch": 4.372003835091084,
"grad_norm": 0.42221635580062866,
"learning_rate": 9.045596429115447e-05,
"loss": 0.0472,
"step": 4560
},
{
"epoch": 4.381591562799617,
"grad_norm": 0.35964876413345337,
"learning_rate": 9.040476344330977e-05,
"loss": 0.0448,
"step": 4570
},
{
"epoch": 4.391179290508149,
"grad_norm": 0.27407506108283997,
"learning_rate": 9.035344019648702e-05,
"loss": 0.0431,
"step": 4580
},
{
"epoch": 4.4007670182166825,
"grad_norm": 0.31676268577575684,
"learning_rate": 9.03019947061608e-05,
"loss": 0.0441,
"step": 4590
},
{
"epoch": 4.410354745925216,
"grad_norm": 0.2982436716556549,
"learning_rate": 9.025042712817598e-05,
"loss": 0.043,
"step": 4600
},
{
"epoch": 4.419942473633749,
"grad_norm": 0.3181396424770355,
"learning_rate": 9.019873761874727e-05,
"loss": 0.0484,
"step": 4610
},
{
"epoch": 4.429530201342282,
"grad_norm": 0.3732481002807617,
"learning_rate": 9.014692633445878e-05,
"loss": 0.055,
"step": 4620
},
{
"epoch": 4.439117929050815,
"grad_norm": 0.42074957489967346,
"learning_rate": 9.009499343226348e-05,
"loss": 0.047,
"step": 4630
},
{
"epoch": 4.448705656759348,
"grad_norm": 0.35802584886550903,
"learning_rate": 9.004293906948278e-05,
"loss": 0.0489,
"step": 4640
},
{
"epoch": 4.458293384467881,
"grad_norm": 0.33133867383003235,
"learning_rate": 8.999076340380603e-05,
"loss": 0.049,
"step": 4650
},
{
"epoch": 4.467881112176414,
"grad_norm": 0.28263920545578003,
"learning_rate": 8.993846659329005e-05,
"loss": 0.056,
"step": 4660
},
{
"epoch": 4.477468839884947,
"grad_norm": 0.5171105861663818,
"learning_rate": 8.988604879635862e-05,
"loss": 0.047,
"step": 4670
},
{
"epoch": 4.487056567593481,
"grad_norm": 0.264189749956131,
"learning_rate": 8.983351017180208e-05,
"loss": 0.0432,
"step": 4680
},
{
"epoch": 4.496644295302014,
"grad_norm": 0.2710209786891937,
"learning_rate": 8.978085087877672e-05,
"loss": 0.048,
"step": 4690
},
{
"epoch": 4.506232023010546,
"grad_norm": 0.20794712007045746,
"learning_rate": 8.972807107680445e-05,
"loss": 0.0524,
"step": 4700
},
{
"epoch": 4.515819750719079,
"grad_norm": 0.2759157419204712,
"learning_rate": 8.96751709257722e-05,
"loss": 0.0463,
"step": 4710
},
{
"epoch": 4.525407478427613,
"grad_norm": 0.45379728078842163,
"learning_rate": 8.962215058593146e-05,
"loss": 0.0483,
"step": 4720
},
{
"epoch": 4.534995206136146,
"grad_norm": 0.35511714220046997,
"learning_rate": 8.956901021789785e-05,
"loss": 0.0473,
"step": 4730
},
{
"epoch": 4.544582933844679,
"grad_norm": 0.49189603328704834,
"learning_rate": 8.951574998265058e-05,
"loss": 0.0448,
"step": 4740
},
{
"epoch": 4.554170661553212,
"grad_norm": 0.7247273921966553,
"learning_rate": 8.946237004153197e-05,
"loss": 0.0514,
"step": 4750
},
{
"epoch": 4.563758389261745,
"grad_norm": 0.5640259385108948,
"learning_rate": 8.940887055624696e-05,
"loss": 0.0495,
"step": 4760
},
{
"epoch": 4.573346116970278,
"grad_norm": 0.9589868187904358,
"learning_rate": 8.935525168886262e-05,
"loss": 0.0497,
"step": 4770
},
{
"epoch": 4.582933844678811,
"grad_norm": 0.24826788902282715,
"learning_rate": 8.930151360180773e-05,
"loss": 0.0526,
"step": 4780
},
{
"epoch": 4.592521572387344,
"grad_norm": 0.4066452980041504,
"learning_rate": 8.924765645787216e-05,
"loss": 0.0482,
"step": 4790
},
{
"epoch": 4.6021093000958775,
"grad_norm": 0.41626861691474915,
"learning_rate": 8.919368042020645e-05,
"loss": 0.0469,
"step": 4800
},
{
"epoch": 4.611697027804411,
"grad_norm": 0.35766589641571045,
"learning_rate": 8.913958565232132e-05,
"loss": 0.0489,
"step": 4810
},
{
"epoch": 4.621284755512944,
"grad_norm": 0.24869422614574432,
"learning_rate": 8.908537231808716e-05,
"loss": 0.043,
"step": 4820
},
{
"epoch": 4.630872483221476,
"grad_norm": 0.3498132526874542,
"learning_rate": 8.903104058173354e-05,
"loss": 0.044,
"step": 4830
},
{
"epoch": 4.6404602109300095,
"grad_norm": 0.5257985591888428,
"learning_rate": 8.897659060784869e-05,
"loss": 0.0487,
"step": 4840
},
{
"epoch": 4.650047938638543,
"grad_norm": 0.3492990732192993,
"learning_rate": 8.892202256137905e-05,
"loss": 0.0516,
"step": 4850
},
{
"epoch": 4.659635666347076,
"grad_norm": 0.5162085294723511,
"learning_rate": 8.886733660762871e-05,
"loss": 0.0526,
"step": 4860
},
{
"epoch": 4.669223394055609,
"grad_norm": 0.3405402600765228,
"learning_rate": 8.881253291225895e-05,
"loss": 0.0449,
"step": 4870
},
{
"epoch": 4.6788111217641415,
"grad_norm": 0.4526231586933136,
"learning_rate": 8.875761164128772e-05,
"loss": 0.053,
"step": 4880
},
{
"epoch": 4.688398849472675,
"grad_norm": 0.3826616108417511,
"learning_rate": 8.870257296108918e-05,
"loss": 0.0467,
"step": 4890
},
{
"epoch": 4.697986577181208,
"grad_norm": 0.3477012813091278,
"learning_rate": 8.86474170383931e-05,
"loss": 0.0486,
"step": 4900
},
{
"epoch": 4.707574304889741,
"grad_norm": 0.2914051115512848,
"learning_rate": 8.859214404028447e-05,
"loss": 0.042,
"step": 4910
},
{
"epoch": 4.717162032598274,
"grad_norm": 0.40637078881263733,
"learning_rate": 8.85367541342029e-05,
"loss": 0.0432,
"step": 4920
},
{
"epoch": 4.726749760306808,
"grad_norm": 0.36229225993156433,
"learning_rate": 8.848124748794218e-05,
"loss": 0.0498,
"step": 4930
},
{
"epoch": 4.736337488015341,
"grad_norm": 0.33015790581703186,
"learning_rate": 8.842562426964974e-05,
"loss": 0.0441,
"step": 4940
},
{
"epoch": 4.745925215723873,
"grad_norm": 0.35154151916503906,
"learning_rate": 8.83698846478261e-05,
"loss": 0.0463,
"step": 4950
},
{
"epoch": 4.755512943432406,
"grad_norm": 0.2888050377368927,
"learning_rate": 8.831402879132446e-05,
"loss": 0.0455,
"step": 4960
},
{
"epoch": 4.76510067114094,
"grad_norm": 0.3235926628112793,
"learning_rate": 8.825805686935011e-05,
"loss": 0.0551,
"step": 4970
},
{
"epoch": 4.774688398849473,
"grad_norm": 0.44466277956962585,
"learning_rate": 8.820196905145997e-05,
"loss": 0.0476,
"step": 4980
},
{
"epoch": 4.784276126558006,
"grad_norm": 0.39051833748817444,
"learning_rate": 8.814576550756197e-05,
"loss": 0.04,
"step": 4990
},
{
"epoch": 4.793863854266538,
"grad_norm": 0.3532402813434601,
"learning_rate": 8.808944640791467e-05,
"loss": 0.0489,
"step": 5000
},
{
"epoch": 4.803451581975072,
"grad_norm": 0.34791117906570435,
"learning_rate": 8.803301192312667e-05,
"loss": 0.0466,
"step": 5010
},
{
"epoch": 4.813039309683605,
"grad_norm": 0.31138908863067627,
"learning_rate": 8.797646222415614e-05,
"loss": 0.0407,
"step": 5020
},
{
"epoch": 4.822627037392138,
"grad_norm": 0.2896534502506256,
"learning_rate": 8.79197974823102e-05,
"loss": 0.0479,
"step": 5030
},
{
"epoch": 4.832214765100671,
"grad_norm": 0.26334378123283386,
"learning_rate": 8.786301786924456e-05,
"loss": 0.0469,
"step": 5040
},
{
"epoch": 4.8418024928092045,
"grad_norm": 0.2446843832731247,
"learning_rate": 8.780612355696283e-05,
"loss": 0.0461,
"step": 5050
},
{
"epoch": 4.851390220517738,
"grad_norm": 0.2954402267932892,
"learning_rate": 8.774911471781613e-05,
"loss": 0.0472,
"step": 5060
},
{
"epoch": 4.86097794822627,
"grad_norm": 0.22677741944789886,
"learning_rate": 8.769199152450249e-05,
"loss": 0.04,
"step": 5070
},
{
"epoch": 4.870565675934803,
"grad_norm": 0.32872337102890015,
"learning_rate": 8.76347541500664e-05,
"loss": 0.0479,
"step": 5080
},
{
"epoch": 4.8801534036433365,
"grad_norm": 0.4457066059112549,
"learning_rate": 8.757740276789818e-05,
"loss": 0.0439,
"step": 5090
},
{
"epoch": 4.88974113135187,
"grad_norm": 0.24604512751102448,
"learning_rate": 8.751993755173358e-05,
"loss": 0.0468,
"step": 5100
},
{
"epoch": 4.899328859060403,
"grad_norm": 0.3143763840198517,
"learning_rate": 8.746235867565313e-05,
"loss": 0.0458,
"step": 5110
},
{
"epoch": 4.908916586768935,
"grad_norm": 0.3161276876926422,
"learning_rate": 8.74046663140817e-05,
"loss": 0.0502,
"step": 5120
},
{
"epoch": 4.9185043144774685,
"grad_norm": 0.2833130657672882,
"learning_rate": 8.734686064178797e-05,
"loss": 0.0419,
"step": 5130
},
{
"epoch": 4.928092042186002,
"grad_norm": 0.4420258104801178,
"learning_rate": 8.728894183388381e-05,
"loss": 0.0465,
"step": 5140
},
{
"epoch": 4.937679769894535,
"grad_norm": 0.353081077337265,
"learning_rate": 8.723091006582389e-05,
"loss": 0.0451,
"step": 5150
},
{
"epoch": 4.947267497603068,
"grad_norm": 0.4228033125400543,
"learning_rate": 8.717276551340501e-05,
"loss": 0.0495,
"step": 5160
},
{
"epoch": 4.956855225311601,
"grad_norm": 0.3678063452243805,
"learning_rate": 8.711450835276565e-05,
"loss": 0.0395,
"step": 5170
},
{
"epoch": 4.966442953020135,
"grad_norm": 0.4963276982307434,
"learning_rate": 8.705613876038543e-05,
"loss": 0.042,
"step": 5180
},
{
"epoch": 4.976030680728667,
"grad_norm": 0.3559805452823639,
"learning_rate": 8.699765691308456e-05,
"loss": 0.0448,
"step": 5190
},
{
"epoch": 4.9856184084372,
"grad_norm": 0.253312885761261,
"learning_rate": 8.69390629880233e-05,
"loss": 0.0539,
"step": 5200
},
{
"epoch": 4.995206136145733,
"grad_norm": 0.29010236263275146,
"learning_rate": 8.688035716270141e-05,
"loss": 0.0447,
"step": 5210
},
{
"epoch": 5.004793863854267,
"grad_norm": 0.35962191224098206,
"learning_rate": 8.682153961495767e-05,
"loss": 0.0484,
"step": 5220
},
{
"epoch": 5.0143815915628,
"grad_norm": 0.2923009395599365,
"learning_rate": 8.676261052296928e-05,
"loss": 0.0488,
"step": 5230
},
{
"epoch": 5.023969319271333,
"grad_norm": 0.33261337876319885,
"learning_rate": 8.670357006525131e-05,
"loss": 0.053,
"step": 5240
},
{
"epoch": 5.033557046979865,
"grad_norm": 0.3641784191131592,
"learning_rate": 8.66444184206563e-05,
"loss": 0.0429,
"step": 5250
},
{
"epoch": 5.043144774688399,
"grad_norm": 0.4545520544052124,
"learning_rate": 8.658515576837347e-05,
"loss": 0.0487,
"step": 5260
},
{
"epoch": 5.052732502396932,
"grad_norm": 0.3597351312637329,
"learning_rate": 8.652578228792841e-05,
"loss": 0.0571,
"step": 5270
},
{
"epoch": 5.062320230105465,
"grad_norm": 0.26271480321884155,
"learning_rate": 8.646629815918244e-05,
"loss": 0.046,
"step": 5280
},
{
"epoch": 5.071907957813998,
"grad_norm": 0.2976760268211365,
"learning_rate": 8.640670356233202e-05,
"loss": 0.049,
"step": 5290
},
{
"epoch": 5.0814956855225315,
"grad_norm": 0.3539637327194214,
"learning_rate": 8.634699867790832e-05,
"loss": 0.046,
"step": 5300
},
{
"epoch": 5.091083413231064,
"grad_norm": 0.314113587141037,
"learning_rate": 8.628718368677655e-05,
"loss": 0.0474,
"step": 5310
},
{
"epoch": 5.100671140939597,
"grad_norm": 0.3386295735836029,
"learning_rate": 8.622725877013549e-05,
"loss": 0.0438,
"step": 5320
},
{
"epoch": 5.11025886864813,
"grad_norm": 0.4622576832771301,
"learning_rate": 8.616722410951689e-05,
"loss": 0.0447,
"step": 5330
},
{
"epoch": 5.1198465963566635,
"grad_norm": 0.23671875894069672,
"learning_rate": 8.610707988678503e-05,
"loss": 0.0457,
"step": 5340
},
{
"epoch": 5.129434324065197,
"grad_norm": 0.38376542925834656,
"learning_rate": 8.604682628413601e-05,
"loss": 0.0521,
"step": 5350
},
{
"epoch": 5.13902205177373,
"grad_norm": 0.2503417432308197,
"learning_rate": 8.598646348409729e-05,
"loss": 0.0466,
"step": 5360
},
{
"epoch": 5.148609779482262,
"grad_norm": 0.33504578471183777,
"learning_rate": 8.592599166952718e-05,
"loss": 0.0499,
"step": 5370
},
{
"epoch": 5.1581975071907955,
"grad_norm": 0.2641712725162506,
"learning_rate": 8.586541102361414e-05,
"loss": 0.0471,
"step": 5380
},
{
"epoch": 5.167785234899329,
"grad_norm": 0.363615483045578,
"learning_rate": 8.580472172987638e-05,
"loss": 0.0451,
"step": 5390
},
{
"epoch": 5.177372962607862,
"grad_norm": 0.29901939630508423,
"learning_rate": 8.574392397216123e-05,
"loss": 0.0472,
"step": 5400
},
{
"epoch": 5.186960690316395,
"grad_norm": 0.299882173538208,
"learning_rate": 8.568301793464457e-05,
"loss": 0.0492,
"step": 5410
},
{
"epoch": 5.196548418024928,
"grad_norm": 0.25945836305618286,
"learning_rate": 8.562200380183033e-05,
"loss": 0.0354,
"step": 5420
},
{
"epoch": 5.206136145733462,
"grad_norm": 0.39987847208976746,
"learning_rate": 8.556088175854984e-05,
"loss": 0.0367,
"step": 5430
},
{
"epoch": 5.215723873441994,
"grad_norm": 0.31205254793167114,
"learning_rate": 8.54996519899614e-05,
"loss": 0.0412,
"step": 5440
},
{
"epoch": 5.225311601150527,
"grad_norm": 0.3277497887611389,
"learning_rate": 8.543831468154955e-05,
"loss": 0.0502,
"step": 5450
},
{
"epoch": 5.23489932885906,
"grad_norm": 0.3311022222042084,
"learning_rate": 8.537687001912471e-05,
"loss": 0.0477,
"step": 5460
},
{
"epoch": 5.244487056567594,
"grad_norm": 0.42579907178878784,
"learning_rate": 8.531531818882241e-05,
"loss": 0.0509,
"step": 5470
},
{
"epoch": 5.254074784276127,
"grad_norm": 0.30724838376045227,
"learning_rate": 8.52536593771029e-05,
"loss": 0.0418,
"step": 5480
},
{
"epoch": 5.263662511984659,
"grad_norm": 0.3175548017024994,
"learning_rate": 8.519189377075049e-05,
"loss": 0.0507,
"step": 5490
},
{
"epoch": 5.273250239693192,
"grad_norm": 0.3461003601551056,
"learning_rate": 8.513002155687297e-05,
"loss": 0.0495,
"step": 5500
},
{
"epoch": 5.282837967401726,
"grad_norm": 0.27968931198120117,
"learning_rate": 8.50680429229011e-05,
"loss": 0.0424,
"step": 5510
},
{
"epoch": 5.292425695110259,
"grad_norm": 0.2532777190208435,
"learning_rate": 8.500595805658806e-05,
"loss": 0.0429,
"step": 5520
},
{
"epoch": 5.302013422818792,
"grad_norm": 0.2897396981716156,
"learning_rate": 8.494376714600878e-05,
"loss": 0.0479,
"step": 5530
},
{
"epoch": 5.311601150527325,
"grad_norm": 0.32838040590286255,
"learning_rate": 8.48814703795595e-05,
"loss": 0.0462,
"step": 5540
},
{
"epoch": 5.3211888782358585,
"grad_norm": 0.23218947649002075,
"learning_rate": 8.481906794595702e-05,
"loss": 0.038,
"step": 5550
},
{
"epoch": 5.330776605944391,
"grad_norm": 0.4271414577960968,
"learning_rate": 8.475656003423837e-05,
"loss": 0.0424,
"step": 5560
},
{
"epoch": 5.340364333652924,
"grad_norm": 0.3327130079269409,
"learning_rate": 8.469394683376003e-05,
"loss": 0.0461,
"step": 5570
},
{
"epoch": 5.349952061361457,
"grad_norm": 0.34635308384895325,
"learning_rate": 8.463122853419748e-05,
"loss": 0.0462,
"step": 5580
},
{
"epoch": 5.3595397890699905,
"grad_norm": 0.35077422857284546,
"learning_rate": 8.456840532554448e-05,
"loss": 0.0477,
"step": 5590
},
{
"epoch": 5.369127516778524,
"grad_norm": 0.44980722665786743,
"learning_rate": 8.450547739811275e-05,
"loss": 0.0423,
"step": 5600
},
{
"epoch": 5.378715244487057,
"grad_norm": 0.28166648745536804,
"learning_rate": 8.444244494253106e-05,
"loss": 0.0431,
"step": 5610
},
{
"epoch": 5.388302972195589,
"grad_norm": 0.33736804127693176,
"learning_rate": 8.437930814974499e-05,
"loss": 0.0479,
"step": 5620
},
{
"epoch": 5.3978906999041225,
"grad_norm": 0.25710147619247437,
"learning_rate": 8.43160672110161e-05,
"loss": 0.042,
"step": 5630
},
{
"epoch": 5.407478427612656,
"grad_norm": 0.29803675413131714,
"learning_rate": 8.425272231792148e-05,
"loss": 0.0488,
"step": 5640
},
{
"epoch": 5.417066155321189,
"grad_norm": 0.35298973321914673,
"learning_rate": 8.418927366235305e-05,
"loss": 0.042,
"step": 5650
},
{
"epoch": 5.426653883029722,
"grad_norm": 0.32311904430389404,
"learning_rate": 8.41257214365172e-05,
"loss": 0.0452,
"step": 5660
},
{
"epoch": 5.436241610738255,
"grad_norm": 0.38360047340393066,
"learning_rate": 8.406206583293394e-05,
"loss": 0.0572,
"step": 5670
},
{
"epoch": 5.445829338446788,
"grad_norm": 0.4456116855144501,
"learning_rate": 8.399830704443653e-05,
"loss": 0.0464,
"step": 5680
},
{
"epoch": 5.455417066155321,
"grad_norm": 0.3833318054676056,
"learning_rate": 8.393444526417071e-05,
"loss": 0.0461,
"step": 5690
},
{
"epoch": 5.465004793863854,
"grad_norm": 0.27611926198005676,
"learning_rate": 8.387048068559435e-05,
"loss": 0.0437,
"step": 5700
},
{
"epoch": 5.474592521572387,
"grad_norm": 0.3786008954048157,
"learning_rate": 8.380641350247665e-05,
"loss": 0.0477,
"step": 5710
},
{
"epoch": 5.484180249280921,
"grad_norm": 0.471384197473526,
"learning_rate": 8.37422439088976e-05,
"loss": 0.0449,
"step": 5720
},
{
"epoch": 5.493767976989454,
"grad_norm": 0.2924197018146515,
"learning_rate": 8.36779720992475e-05,
"loss": 0.0476,
"step": 5730
},
{
"epoch": 5.503355704697986,
"grad_norm": 0.24068906903266907,
"learning_rate": 8.361359826822625e-05,
"loss": 0.0477,
"step": 5740
},
{
"epoch": 5.512943432406519,
"grad_norm": 0.24523060023784637,
"learning_rate": 8.354912261084281e-05,
"loss": 0.0489,
"step": 5750
},
{
"epoch": 5.522531160115053,
"grad_norm": 0.3498481810092926,
"learning_rate": 8.348454532241461e-05,
"loss": 0.0387,
"step": 5760
},
{
"epoch": 5.532118887823586,
"grad_norm": 0.3108651340007782,
"learning_rate": 8.341986659856698e-05,
"loss": 0.0377,
"step": 5770
},
{
"epoch": 5.541706615532119,
"grad_norm": 0.3618451654911041,
"learning_rate": 8.335508663523248e-05,
"loss": 0.048,
"step": 5780
},
{
"epoch": 5.551294343240652,
"grad_norm": 0.769836962223053,
"learning_rate": 8.329020562865038e-05,
"loss": 0.0422,
"step": 5790
},
{
"epoch": 5.5608820709491855,
"grad_norm": 0.24395880103111267,
"learning_rate": 8.322522377536604e-05,
"loss": 0.0395,
"step": 5800
},
{
"epoch": 5.570469798657718,
"grad_norm": 0.5865891575813293,
"learning_rate": 8.316014127223033e-05,
"loss": 0.0565,
"step": 5810
},
{
"epoch": 5.580057526366251,
"grad_norm": 0.318808376789093,
"learning_rate": 8.3094958316399e-05,
"loss": 0.0453,
"step": 5820
},
{
"epoch": 5.589645254074784,
"grad_norm": 0.44590169191360474,
"learning_rate": 8.302967510533213e-05,
"loss": 0.0524,
"step": 5830
},
{
"epoch": 5.5992329817833175,
"grad_norm": 0.3664915859699249,
"learning_rate": 8.296429183679349e-05,
"loss": 0.0434,
"step": 5840
},
{
"epoch": 5.608820709491851,
"grad_norm": 0.34023183584213257,
"learning_rate": 8.289880870884995e-05,
"loss": 0.0595,
"step": 5850
},
{
"epoch": 5.618408437200383,
"grad_norm": 0.33271753787994385,
"learning_rate": 8.283322591987086e-05,
"loss": 0.0476,
"step": 5860
},
{
"epoch": 5.627996164908916,
"grad_norm": 0.30905163288116455,
"learning_rate": 8.276754366852754e-05,
"loss": 0.0486,
"step": 5870
},
{
"epoch": 5.6375838926174495,
"grad_norm": 0.3950500786304474,
"learning_rate": 8.27017621537926e-05,
"loss": 0.0524,
"step": 5880
},
{
"epoch": 5.647171620325983,
"grad_norm": 0.3802347481250763,
"learning_rate": 8.26358815749393e-05,
"loss": 0.0453,
"step": 5890
},
{
"epoch": 5.656759348034516,
"grad_norm": 0.27361515164375305,
"learning_rate": 8.256990213154102e-05,
"loss": 0.0426,
"step": 5900
},
{
"epoch": 5.666347075743049,
"grad_norm": 0.28120309114456177,
"learning_rate": 8.250382402347065e-05,
"loss": 0.0406,
"step": 5910
},
{
"epoch": 5.675934803451582,
"grad_norm": 0.44831210374832153,
"learning_rate": 8.243764745089999e-05,
"loss": 0.0433,
"step": 5920
},
{
"epoch": 5.685522531160115,
"grad_norm": 0.2854187488555908,
"learning_rate": 8.237137261429904e-05,
"loss": 0.0438,
"step": 5930
},
{
"epoch": 5.695110258868648,
"grad_norm": 0.3696000874042511,
"learning_rate": 8.230499971443555e-05,
"loss": 0.0399,
"step": 5940
},
{
"epoch": 5.704697986577181,
"grad_norm": 0.794933021068573,
"learning_rate": 8.223852895237427e-05,
"loss": 0.0452,
"step": 5950
},
{
"epoch": 5.714285714285714,
"grad_norm": 0.3321564793586731,
"learning_rate": 8.21719605294765e-05,
"loss": 0.0484,
"step": 5960
},
{
"epoch": 5.723873441994248,
"grad_norm": 0.29202380776405334,
"learning_rate": 8.210529464739928e-05,
"loss": 0.0432,
"step": 5970
},
{
"epoch": 5.73346116970278,
"grad_norm": 0.32877346873283386,
"learning_rate": 8.203853150809494e-05,
"loss": 0.046,
"step": 5980
},
{
"epoch": 5.743048897411313,
"grad_norm": 0.45695215463638306,
"learning_rate": 8.197167131381045e-05,
"loss": 0.0464,
"step": 5990
},
{
"epoch": 5.752636625119846,
"grad_norm": 0.20887207984924316,
"learning_rate": 8.190471426708675e-05,
"loss": 0.0428,
"step": 6000
},
{
"epoch": 5.76222435282838,
"grad_norm": 0.31597304344177246,
"learning_rate": 8.183766057075819e-05,
"loss": 0.0409,
"step": 6010
},
{
"epoch": 5.771812080536913,
"grad_norm": 0.3338216245174408,
"learning_rate": 8.177051042795192e-05,
"loss": 0.0461,
"step": 6020
},
{
"epoch": 5.781399808245446,
"grad_norm": 0.32134512066841125,
"learning_rate": 8.170326404208724e-05,
"loss": 0.0411,
"step": 6030
},
{
"epoch": 5.790987535953979,
"grad_norm": 0.2781100571155548,
"learning_rate": 8.163592161687499e-05,
"loss": 0.0425,
"step": 6040
},
{
"epoch": 5.800575263662512,
"grad_norm": 0.34772852063179016,
"learning_rate": 8.156848335631697e-05,
"loss": 0.0368,
"step": 6050
},
{
"epoch": 5.810162991371045,
"grad_norm": 0.3309897184371948,
"learning_rate": 8.15009494647053e-05,
"loss": 0.04,
"step": 6060
},
{
"epoch": 5.819750719079578,
"grad_norm": 0.252763032913208,
"learning_rate": 8.143332014662176e-05,
"loss": 0.0398,
"step": 6070
},
{
"epoch": 5.829338446788111,
"grad_norm": 0.3265877664089203,
"learning_rate": 8.136559560693722e-05,
"loss": 0.045,
"step": 6080
},
{
"epoch": 5.8389261744966445,
"grad_norm": 0.4045432209968567,
"learning_rate": 8.129777605081105e-05,
"loss": 0.0428,
"step": 6090
},
{
"epoch": 5.848513902205178,
"grad_norm": 0.2679883539676666,
"learning_rate": 8.12298616836904e-05,
"loss": 0.0433,
"step": 6100
},
{
"epoch": 5.85810162991371,
"grad_norm": 0.4409831166267395,
"learning_rate": 8.116185271130965e-05,
"loss": 0.0457,
"step": 6110
},
{
"epoch": 5.867689357622243,
"grad_norm": 0.4434974491596222,
"learning_rate": 8.10937493396898e-05,
"loss": 0.0476,
"step": 6120
},
{
"epoch": 5.8772770853307765,
"grad_norm": 0.363570898771286,
"learning_rate": 8.102555177513776e-05,
"loss": 0.0405,
"step": 6130
},
{
"epoch": 5.88686481303931,
"grad_norm": 0.31658318638801575,
"learning_rate": 8.095726022424583e-05,
"loss": 0.0434,
"step": 6140
},
{
"epoch": 5.896452540747843,
"grad_norm": 0.3343175947666168,
"learning_rate": 8.088887489389099e-05,
"loss": 0.0421,
"step": 6150
},
{
"epoch": 5.906040268456376,
"grad_norm": 0.2580268681049347,
"learning_rate": 8.082039599123434e-05,
"loss": 0.0415,
"step": 6160
},
{
"epoch": 5.9156279961649085,
"grad_norm": 0.36179137229919434,
"learning_rate": 8.07518237237204e-05,
"loss": 0.0425,
"step": 6170
},
{
"epoch": 5.925215723873442,
"grad_norm": 0.3440069556236267,
"learning_rate": 8.068315829907658e-05,
"loss": 0.0404,
"step": 6180
},
{
"epoch": 5.934803451581975,
"grad_norm": 0.39785268902778625,
"learning_rate": 8.061439992531241e-05,
"loss": 0.0425,
"step": 6190
},
{
"epoch": 5.944391179290508,
"grad_norm": 0.29912492632865906,
"learning_rate": 8.054554881071909e-05,
"loss": 0.0465,
"step": 6200
},
{
"epoch": 5.953978906999041,
"grad_norm": 0.3317604660987854,
"learning_rate": 8.047660516386868e-05,
"loss": 0.0432,
"step": 6210
},
{
"epoch": 5.963566634707575,
"grad_norm": 0.3451102077960968,
"learning_rate": 8.040756919361358e-05,
"loss": 0.0452,
"step": 6220
},
{
"epoch": 5.973154362416107,
"grad_norm": 0.3293020725250244,
"learning_rate": 8.03384411090859e-05,
"loss": 0.0367,
"step": 6230
},
{
"epoch": 5.98274209012464,
"grad_norm": 0.30293816328048706,
"learning_rate": 8.026922111969674e-05,
"loss": 0.0442,
"step": 6240
},
{
"epoch": 5.992329817833173,
"grad_norm": 0.2671773433685303,
"learning_rate": 8.019990943513565e-05,
"loss": 0.0482,
"step": 6250
},
{
"epoch": 6.001917545541707,
"grad_norm": 0.30587103962898254,
"learning_rate": 8.013050626536992e-05,
"loss": 0.054,
"step": 6260
},
{
"epoch": 6.01150527325024,
"grad_norm": 0.3319852948188782,
"learning_rate": 8.0061011820644e-05,
"loss": 0.0454,
"step": 6270
},
{
"epoch": 6.021093000958773,
"grad_norm": 0.5606246590614319,
"learning_rate": 7.999142631147884e-05,
"loss": 0.0491,
"step": 6280
},
{
"epoch": 6.030680728667305,
"grad_norm": 0.3884483873844147,
"learning_rate": 7.992174994867123e-05,
"loss": 0.0488,
"step": 6290
},
{
"epoch": 6.040268456375839,
"grad_norm": 0.30733785033226013,
"learning_rate": 7.985198294329324e-05,
"loss": 0.0434,
"step": 6300
},
{
"epoch": 6.049856184084372,
"grad_norm": 0.9947719573974609,
"learning_rate": 7.978212550669144e-05,
"loss": 0.0452,
"step": 6310
},
{
"epoch": 6.059443911792905,
"grad_norm": 0.3336857259273529,
"learning_rate": 7.971217785048644e-05,
"loss": 0.0445,
"step": 6320
},
{
"epoch": 6.069031639501438,
"grad_norm": 0.3001098930835724,
"learning_rate": 7.964214018657208e-05,
"loss": 0.042,
"step": 6330
},
{
"epoch": 6.0786193672099715,
"grad_norm": 0.32423412799835205,
"learning_rate": 7.957201272711492e-05,
"loss": 0.041,
"step": 6340
},
{
"epoch": 6.088207094918504,
"grad_norm": 0.2871480882167816,
"learning_rate": 7.950179568455347e-05,
"loss": 0.0436,
"step": 6350
},
{
"epoch": 6.097794822627037,
"grad_norm": 0.4804290533065796,
"learning_rate": 7.94314892715977e-05,
"loss": 0.0393,
"step": 6360
},
{
"epoch": 6.10738255033557,
"grad_norm": 0.459533154964447,
"learning_rate": 7.936109370122824e-05,
"loss": 0.0468,
"step": 6370
},
{
"epoch": 6.1169702780441035,
"grad_norm": 0.25455859303474426,
"learning_rate": 7.929060918669585e-05,
"loss": 0.0409,
"step": 6380
},
{
"epoch": 6.126558005752637,
"grad_norm": 0.34990832209587097,
"learning_rate": 7.922003594152068e-05,
"loss": 0.0389,
"step": 6390
},
{
"epoch": 6.13614573346117,
"grad_norm": 0.2321031242609024,
"learning_rate": 7.914937417949175e-05,
"loss": 0.0428,
"step": 6400
},
{
"epoch": 6.145733461169703,
"grad_norm": 0.3366633951663971,
"learning_rate": 7.907862411466616e-05,
"loss": 0.0417,
"step": 6410
},
{
"epoch": 6.1553211888782355,
"grad_norm": 0.3831850588321686,
"learning_rate": 7.900778596136855e-05,
"loss": 0.0409,
"step": 6420
},
{
"epoch": 6.164908916586769,
"grad_norm": 0.3772655129432678,
"learning_rate": 7.893685993419036e-05,
"loss": 0.0412,
"step": 6430
},
{
"epoch": 6.174496644295302,
"grad_norm": 0.4264662563800812,
"learning_rate": 7.88658462479893e-05,
"loss": 0.0437,
"step": 6440
},
{
"epoch": 6.184084372003835,
"grad_norm": 0.3162544369697571,
"learning_rate": 7.879474511788854e-05,
"loss": 0.0388,
"step": 6450
},
{
"epoch": 6.193672099712368,
"grad_norm": 0.34539514780044556,
"learning_rate": 7.872355675927623e-05,
"loss": 0.0416,
"step": 6460
},
{
"epoch": 6.203259827420902,
"grad_norm": 0.3206475079059601,
"learning_rate": 7.865228138780469e-05,
"loss": 0.0468,
"step": 6470
},
{
"epoch": 6.212847555129434,
"grad_norm": 0.3619016110897064,
"learning_rate": 7.858091921938988e-05,
"loss": 0.0448,
"step": 6480
},
{
"epoch": 6.222435282837967,
"grad_norm": 0.3190850615501404,
"learning_rate": 7.850947047021069e-05,
"loss": 0.0388,
"step": 6490
},
{
"epoch": 6.2320230105465,
"grad_norm": 0.3191368579864502,
"learning_rate": 7.843793535670827e-05,
"loss": 0.0449,
"step": 6500
},
{
"epoch": 6.241610738255034,
"grad_norm": 0.24938683211803436,
"learning_rate": 7.836631409558538e-05,
"loss": 0.0379,
"step": 6510
},
{
"epoch": 6.251198465963567,
"grad_norm": 0.27279171347618103,
"learning_rate": 7.829460690380584e-05,
"loss": 0.0398,
"step": 6520
},
{
"epoch": 6.2607861936721,
"grad_norm": 0.4261578917503357,
"learning_rate": 7.822281399859365e-05,
"loss": 0.0441,
"step": 6530
},
{
"epoch": 6.270373921380632,
"grad_norm": 0.3505672216415405,
"learning_rate": 7.815093559743256e-05,
"loss": 0.0464,
"step": 6540
},
{
"epoch": 6.279961649089166,
"grad_norm": 0.8695809841156006,
"learning_rate": 7.807897191806527e-05,
"loss": 0.0459,
"step": 6550
},
{
"epoch": 6.289549376797699,
"grad_norm": 0.3453594446182251,
"learning_rate": 7.800692317849285e-05,
"loss": 0.0437,
"step": 6560
},
{
"epoch": 6.299137104506232,
"grad_norm": 0.4360389709472656,
"learning_rate": 7.7934789596974e-05,
"loss": 0.0495,
"step": 6570
},
{
"epoch": 6.308724832214765,
"grad_norm": 0.4259977340698242,
"learning_rate": 7.786257139202447e-05,
"loss": 0.0486,
"step": 6580
},
{
"epoch": 6.3183125599232985,
"grad_norm": 0.4518745541572571,
"learning_rate": 7.779026878241635e-05,
"loss": 0.0455,
"step": 6590
},
{
"epoch": 6.327900287631831,
"grad_norm": 0.38590195775032043,
"learning_rate": 7.771788198717741e-05,
"loss": 0.043,
"step": 6600
},
{
"epoch": 6.337488015340364,
"grad_norm": 0.2825833559036255,
"learning_rate": 7.764541122559046e-05,
"loss": 0.0439,
"step": 6610
},
{
"epoch": 6.347075743048897,
"grad_norm": 0.364486962556839,
"learning_rate": 7.757285671719264e-05,
"loss": 0.0429,
"step": 6620
},
{
"epoch": 6.3566634707574305,
"grad_norm": 0.32037052512168884,
"learning_rate": 7.750021868177485e-05,
"loss": 0.0433,
"step": 6630
},
{
"epoch": 6.366251198465964,
"grad_norm": 0.2986597716808319,
"learning_rate": 7.742749733938094e-05,
"loss": 0.0407,
"step": 6640
},
{
"epoch": 6.375838926174497,
"grad_norm": 0.20917120575904846,
"learning_rate": 7.73546929103072e-05,
"loss": 0.0361,
"step": 6650
},
{
"epoch": 6.385426653883029,
"grad_norm": 0.3319404125213623,
"learning_rate": 7.728180561510155e-05,
"loss": 0.04,
"step": 6660
},
{
"epoch": 6.3950143815915625,
"grad_norm": 0.4171611964702606,
"learning_rate": 7.720883567456298e-05,
"loss": 0.0348,
"step": 6670
},
{
"epoch": 6.404602109300096,
"grad_norm": 0.44948673248291016,
"learning_rate": 7.713578330974081e-05,
"loss": 0.0489,
"step": 6680
},
{
"epoch": 6.414189837008629,
"grad_norm": 0.3433539569377899,
"learning_rate": 7.706264874193409e-05,
"loss": 0.038,
"step": 6690
},
{
"epoch": 6.423777564717162,
"grad_norm": 0.44886866211891174,
"learning_rate": 7.698943219269086e-05,
"loss": 0.0437,
"step": 6700
},
{
"epoch": 6.433365292425695,
"grad_norm": 0.30656543374061584,
"learning_rate": 7.691613388380752e-05,
"loss": 0.0409,
"step": 6710
},
{
"epoch": 6.442953020134228,
"grad_norm": 0.3929513692855835,
"learning_rate": 7.684275403732811e-05,
"loss": 0.0441,
"step": 6720
},
{
"epoch": 6.452540747842761,
"grad_norm": 0.44606807827949524,
"learning_rate": 7.676929287554372e-05,
"loss": 0.0457,
"step": 6730
},
{
"epoch": 6.462128475551294,
"grad_norm": 0.3216160535812378,
"learning_rate": 7.669575062099175e-05,
"loss": 0.0469,
"step": 6740
},
{
"epoch": 6.471716203259827,
"grad_norm": 0.24256640672683716,
"learning_rate": 7.662212749645527e-05,
"loss": 0.0384,
"step": 6750
},
{
"epoch": 6.481303930968361,
"grad_norm": 0.37510934472084045,
"learning_rate": 7.654842372496232e-05,
"loss": 0.0389,
"step": 6760
},
{
"epoch": 6.490891658676894,
"grad_norm": 0.3382836878299713,
"learning_rate": 7.647463952978524e-05,
"loss": 0.0448,
"step": 6770
},
{
"epoch": 6.500479386385427,
"grad_norm": 0.4976375102996826,
"learning_rate": 7.640077513443999e-05,
"loss": 0.0413,
"step": 6780
},
{
"epoch": 6.510067114093959,
"grad_norm": 0.273062527179718,
"learning_rate": 7.632683076268552e-05,
"loss": 0.0432,
"step": 6790
},
{
"epoch": 6.519654841802493,
"grad_norm": 0.34846237301826477,
"learning_rate": 7.625280663852301e-05,
"loss": 0.0501,
"step": 6800
},
{
"epoch": 6.529242569511026,
"grad_norm": 0.26076826453208923,
"learning_rate": 7.617870298619527e-05,
"loss": 0.0428,
"step": 6810
},
{
"epoch": 6.538830297219559,
"grad_norm": 0.8371449708938599,
"learning_rate": 7.610452003018602e-05,
"loss": 0.0437,
"step": 6820
},
{
"epoch": 6.548418024928092,
"grad_norm": 0.28489676117897034,
"learning_rate": 7.603025799521918e-05,
"loss": 0.0446,
"step": 6830
},
{
"epoch": 6.558005752636625,
"grad_norm": 0.3971545994281769,
"learning_rate": 7.595591710625829e-05,
"loss": 0.045,
"step": 6840
},
{
"epoch": 6.567593480345158,
"grad_norm": 0.24828213453292847,
"learning_rate": 7.588149758850572e-05,
"loss": 0.0431,
"step": 6850
},
{
"epoch": 6.577181208053691,
"grad_norm": 0.23631419241428375,
"learning_rate": 7.580699966740201e-05,
"loss": 0.0384,
"step": 6860
},
{
"epoch": 6.586768935762224,
"grad_norm": 0.3739171326160431,
"learning_rate": 7.57324235686253e-05,
"loss": 0.0513,
"step": 6870
},
{
"epoch": 6.5963566634707576,
"grad_norm": 0.29776638746261597,
"learning_rate": 7.565776951809043e-05,
"loss": 0.0437,
"step": 6880
},
{
"epoch": 6.605944391179291,
"grad_norm": 0.24786557257175446,
"learning_rate": 7.558303774194848e-05,
"loss": 0.045,
"step": 6890
},
{
"epoch": 6.615532118887824,
"grad_norm": 0.2621402442455292,
"learning_rate": 7.550822846658592e-05,
"loss": 0.036,
"step": 6900
},
{
"epoch": 6.625119846596356,
"grad_norm": 0.4778667092323303,
"learning_rate": 7.543334191862408e-05,
"loss": 0.0403,
"step": 6910
},
{
"epoch": 6.6347075743048896,
"grad_norm": 0.37852802872657776,
"learning_rate": 7.535837832491826e-05,
"loss": 0.0433,
"step": 6920
},
{
"epoch": 6.644295302013423,
"grad_norm": 0.5725548267364502,
"learning_rate": 7.528333791255723e-05,
"loss": 0.0434,
"step": 6930
},
{
"epoch": 6.653883029721956,
"grad_norm": 0.39372578263282776,
"learning_rate": 7.520822090886245e-05,
"loss": 0.0403,
"step": 6940
},
{
"epoch": 6.663470757430489,
"grad_norm": 0.2831190526485443,
"learning_rate": 7.513302754138741e-05,
"loss": 0.0424,
"step": 6950
},
{
"epoch": 6.673058485139022,
"grad_norm": 0.27865827083587646,
"learning_rate": 7.50577580379169e-05,
"loss": 0.0397,
"step": 6960
},
{
"epoch": 6.682646212847555,
"grad_norm": 0.42975571751594543,
"learning_rate": 7.49824126264664e-05,
"loss": 0.0426,
"step": 6970
},
{
"epoch": 6.692233940556088,
"grad_norm": 0.3423265218734741,
"learning_rate": 7.490699153528124e-05,
"loss": 0.045,
"step": 6980
},
{
"epoch": 6.701821668264621,
"grad_norm": 0.25411704182624817,
"learning_rate": 7.483149499283616e-05,
"loss": 0.0396,
"step": 6990
},
{
"epoch": 6.7114093959731544,
"grad_norm": 0.35409414768218994,
"learning_rate": 7.475592322783434e-05,
"loss": 0.0382,
"step": 7000
},
{
"epoch": 6.720997123681688,
"grad_norm": 0.28262168169021606,
"learning_rate": 7.468027646920687e-05,
"loss": 0.045,
"step": 7010
},
{
"epoch": 6.730584851390221,
"grad_norm": 0.4541366398334503,
"learning_rate": 7.460455494611206e-05,
"loss": 0.0389,
"step": 7020
},
{
"epoch": 6.740172579098753,
"grad_norm": 0.27586543560028076,
"learning_rate": 7.452875888793465e-05,
"loss": 0.0352,
"step": 7030
},
{
"epoch": 6.7497603068072864,
"grad_norm": 0.2681753933429718,
"learning_rate": 7.445288852428518e-05,
"loss": 0.0492,
"step": 7040
},
{
"epoch": 6.75934803451582,
"grad_norm": 0.32088425755500793,
"learning_rate": 7.437694408499933e-05,
"loss": 0.0524,
"step": 7050
},
{
"epoch": 6.768935762224353,
"grad_norm": 0.3608848452568054,
"learning_rate": 7.430092580013712e-05,
"loss": 0.0444,
"step": 7060
},
{
"epoch": 6.778523489932886,
"grad_norm": 0.2983666658401489,
"learning_rate": 7.42248338999823e-05,
"loss": 0.0484,
"step": 7070
},
{
"epoch": 6.788111217641419,
"grad_norm": 0.48037657141685486,
"learning_rate": 7.414866861504164e-05,
"loss": 0.0441,
"step": 7080
},
{
"epoch": 6.797698945349952,
"grad_norm": 0.3220434784889221,
"learning_rate": 7.407243017604418e-05,
"loss": 0.0407,
"step": 7090
},
{
"epoch": 6.807286673058485,
"grad_norm": 0.21454603970050812,
"learning_rate": 7.399611881394061e-05,
"loss": 0.0484,
"step": 7100
},
{
"epoch": 6.816874400767018,
"grad_norm": 0.3658502995967865,
"learning_rate": 7.391973475990247e-05,
"loss": 0.0471,
"step": 7110
},
{
"epoch": 6.826462128475551,
"grad_norm": 0.6076493859291077,
"learning_rate": 7.384327824532158e-05,
"loss": 0.0512,
"step": 7120
},
{
"epoch": 6.836049856184085,
"grad_norm": 0.27629798650741577,
"learning_rate": 7.376674950180918e-05,
"loss": 0.0432,
"step": 7130
},
{
"epoch": 6.845637583892618,
"grad_norm": 0.4255768954753876,
"learning_rate": 7.36901487611954e-05,
"loss": 0.042,
"step": 7140
},
{
"epoch": 6.855225311601151,
"grad_norm": 0.34027740359306335,
"learning_rate": 7.361347625552842e-05,
"loss": 0.0417,
"step": 7150
},
{
"epoch": 6.864813039309683,
"grad_norm": 0.29743191599845886,
"learning_rate": 7.353673221707382e-05,
"loss": 0.0506,
"step": 7160
},
{
"epoch": 6.874400767018217,
"grad_norm": 0.2994328439235687,
"learning_rate": 7.345991687831393e-05,
"loss": 0.042,
"step": 7170
},
{
"epoch": 6.88398849472675,
"grad_norm": 0.2891611158847809,
"learning_rate": 7.338303047194697e-05,
"loss": 0.0396,
"step": 7180
},
{
"epoch": 6.893576222435283,
"grad_norm": 0.2870160937309265,
"learning_rate": 7.330607323088657e-05,
"loss": 0.0477,
"step": 7190
},
{
"epoch": 6.903163950143816,
"grad_norm": 0.4798467457294464,
"learning_rate": 7.322904538826083e-05,
"loss": 0.0409,
"step": 7200
},
{
"epoch": 6.912751677852349,
"grad_norm": 0.30976602435112,
"learning_rate": 7.31519471774118e-05,
"loss": 0.0431,
"step": 7210
},
{
"epoch": 6.922339405560882,
"grad_norm": 0.32751721143722534,
"learning_rate": 7.307477883189463e-05,
"loss": 0.0415,
"step": 7220
},
{
"epoch": 6.931927133269415,
"grad_norm": 0.3902662992477417,
"learning_rate": 7.299754058547704e-05,
"loss": 0.0359,
"step": 7230
},
{
"epoch": 6.941514860977948,
"grad_norm": 0.21194472908973694,
"learning_rate": 7.292023267213835e-05,
"loss": 0.0409,
"step": 7240
},
{
"epoch": 6.9511025886864815,
"grad_norm": 0.28738507628440857,
"learning_rate": 7.284285532606906e-05,
"loss": 0.0433,
"step": 7250
},
{
"epoch": 6.960690316395015,
"grad_norm": 0.27712157368659973,
"learning_rate": 7.276540878166996e-05,
"loss": 0.0445,
"step": 7260
},
{
"epoch": 6.970278044103548,
"grad_norm": 0.36444854736328125,
"learning_rate": 7.268789327355143e-05,
"loss": 0.0424,
"step": 7270
},
{
"epoch": 6.97986577181208,
"grad_norm": 0.26638609170913696,
"learning_rate": 7.261030903653278e-05,
"loss": 0.0415,
"step": 7280
},
{
"epoch": 6.9894534995206135,
"grad_norm": 0.29326483607292175,
"learning_rate": 7.253265630564155e-05,
"loss": 0.0404,
"step": 7290
},
{
"epoch": 6.999041227229147,
"grad_norm": 0.563951849937439,
"learning_rate": 7.245493531611274e-05,
"loss": 0.0462,
"step": 7300
},
{
"epoch": 7.00862895493768,
"grad_norm": 0.2669621407985687,
"learning_rate": 7.237714630338812e-05,
"loss": 0.0489,
"step": 7310
},
{
"epoch": 7.018216682646213,
"grad_norm": 0.29936525225639343,
"learning_rate": 7.229928950311558e-05,
"loss": 0.042,
"step": 7320
},
{
"epoch": 7.027804410354746,
"grad_norm": 0.29611873626708984,
"learning_rate": 7.222136515114828e-05,
"loss": 0.0451,
"step": 7330
},
{
"epoch": 7.037392138063279,
"grad_norm": 0.2841253876686096,
"learning_rate": 7.214337348354408e-05,
"loss": 0.0401,
"step": 7340
},
{
"epoch": 7.046979865771812,
"grad_norm": 0.39095616340637207,
"learning_rate": 7.206531473656473e-05,
"loss": 0.0443,
"step": 7350
},
{
"epoch": 7.056567593480345,
"grad_norm": 0.3568895757198334,
"learning_rate": 7.19871891466752e-05,
"loss": 0.04,
"step": 7360
},
{
"epoch": 7.066155321188878,
"grad_norm": 0.4422648549079895,
"learning_rate": 7.190899695054293e-05,
"loss": 0.0357,
"step": 7370
},
{
"epoch": 7.075743048897412,
"grad_norm": 0.3040291965007782,
"learning_rate": 7.183073838503715e-05,
"loss": 0.0375,
"step": 7380
},
{
"epoch": 7.085330776605945,
"grad_norm": 0.3379688560962677,
"learning_rate": 7.175241368722812e-05,
"loss": 0.0441,
"step": 7390
},
{
"epoch": 7.094918504314477,
"grad_norm": 0.23404334485530853,
"learning_rate": 7.167402309438649e-05,
"loss": 0.0438,
"step": 7400
},
{
"epoch": 7.10450623202301,
"grad_norm": 0.19392350316047668,
"learning_rate": 7.159556684398246e-05,
"loss": 0.0429,
"step": 7410
},
{
"epoch": 7.114093959731544,
"grad_norm": 0.3650771975517273,
"learning_rate": 7.151704517368513e-05,
"loss": 0.0417,
"step": 7420
},
{
"epoch": 7.123681687440077,
"grad_norm": 0.3727266788482666,
"learning_rate": 7.143845832136188e-05,
"loss": 0.0381,
"step": 7430
},
{
"epoch": 7.13326941514861,
"grad_norm": 0.2589777410030365,
"learning_rate": 7.13598065250774e-05,
"loss": 0.046,
"step": 7440
},
{
"epoch": 7.142857142857143,
"grad_norm": 0.3064965009689331,
"learning_rate": 7.128109002309324e-05,
"loss": 0.0419,
"step": 7450
},
{
"epoch": 7.152444870565676,
"grad_norm": 0.3681334853172302,
"learning_rate": 7.120230905386688e-05,
"loss": 0.0456,
"step": 7460
},
{
"epoch": 7.162032598274209,
"grad_norm": 0.23908288776874542,
"learning_rate": 7.112346385605115e-05,
"loss": 0.0395,
"step": 7470
},
{
"epoch": 7.171620325982742,
"grad_norm": 0.26035764813423157,
"learning_rate": 7.104455466849339e-05,
"loss": 0.0411,
"step": 7480
},
{
"epoch": 7.181208053691275,
"grad_norm": 0.25808098912239075,
"learning_rate": 7.096558173023486e-05,
"loss": 0.0405,
"step": 7490
},
{
"epoch": 7.1907957813998085,
"grad_norm": 0.21516771614551544,
"learning_rate": 7.088654528050986e-05,
"loss": 0.0411,
"step": 7500
},
{
"epoch": 7.200383509108342,
"grad_norm": 0.27496856451034546,
"learning_rate": 7.080744555874517e-05,
"loss": 0.0332,
"step": 7510
},
{
"epoch": 7.209971236816874,
"grad_norm": 0.43999767303466797,
"learning_rate": 7.072828280455917e-05,
"loss": 0.0384,
"step": 7520
},
{
"epoch": 7.219558964525407,
"grad_norm": 0.3292781710624695,
"learning_rate": 7.06490572577612e-05,
"loss": 0.042,
"step": 7530
},
{
"epoch": 7.2291466922339405,
"grad_norm": 0.3117612600326538,
"learning_rate": 7.056976915835087e-05,
"loss": 0.0387,
"step": 7540
},
{
"epoch": 7.238734419942474,
"grad_norm": 0.2206171602010727,
"learning_rate": 7.049041874651722e-05,
"loss": 0.0362,
"step": 7550
},
{
"epoch": 7.248322147651007,
"grad_norm": 0.2644396722316742,
"learning_rate": 7.04110062626381e-05,
"loss": 0.0373,
"step": 7560
},
{
"epoch": 7.25790987535954,
"grad_norm": 0.2682825028896332,
"learning_rate": 7.033153194727934e-05,
"loss": 0.039,
"step": 7570
},
{
"epoch": 7.2674976030680725,
"grad_norm": 0.3411322832107544,
"learning_rate": 7.025199604119416e-05,
"loss": 0.0454,
"step": 7580
},
{
"epoch": 7.277085330776606,
"grad_norm": 0.3761787712574005,
"learning_rate": 7.017239878532227e-05,
"loss": 0.0379,
"step": 7590
},
{
"epoch": 7.286673058485139,
"grad_norm": 0.24610835313796997,
"learning_rate": 7.009274042078927e-05,
"loss": 0.0465,
"step": 7600
},
{
"epoch": 7.296260786193672,
"grad_norm": 0.3763638138771057,
"learning_rate": 7.00130211889059e-05,
"loss": 0.0351,
"step": 7610
},
{
"epoch": 7.305848513902205,
"grad_norm": 0.2616029679775238,
"learning_rate": 6.993324133116726e-05,
"loss": 0.039,
"step": 7620
},
{
"epoch": 7.315436241610739,
"grad_norm": 0.40914463996887207,
"learning_rate": 6.985340108925209e-05,
"loss": 0.0417,
"step": 7630
},
{
"epoch": 7.325023969319272,
"grad_norm": 0.3503078520298004,
"learning_rate": 6.977350070502208e-05,
"loss": 0.0456,
"step": 7640
},
{
"epoch": 7.334611697027804,
"grad_norm": 0.40051010251045227,
"learning_rate": 6.96935404205211e-05,
"loss": 0.047,
"step": 7650
},
{
"epoch": 7.344199424736337,
"grad_norm": 0.3985821306705475,
"learning_rate": 6.96135204779745e-05,
"loss": 0.0409,
"step": 7660
},
{
"epoch": 7.353787152444871,
"grad_norm": 0.5366324782371521,
"learning_rate": 6.95334411197883e-05,
"loss": 0.0445,
"step": 7670
},
{
"epoch": 7.363374880153404,
"grad_norm": 0.2314271628856659,
"learning_rate": 6.945330258854854e-05,
"loss": 0.0345,
"step": 7680
},
{
"epoch": 7.372962607861937,
"grad_norm": 0.24734103679656982,
"learning_rate": 6.937310512702056e-05,
"loss": 0.0354,
"step": 7690
},
{
"epoch": 7.382550335570469,
"grad_norm": 0.7746879458427429,
"learning_rate": 6.929284897814812e-05,
"loss": 0.0398,
"step": 7700
},
{
"epoch": 7.392138063279003,
"grad_norm": 0.3436695635318756,
"learning_rate": 6.921253438505285e-05,
"loss": 0.0426,
"step": 7710
},
{
"epoch": 7.401725790987536,
"grad_norm": 0.3027035593986511,
"learning_rate": 6.913216159103339e-05,
"loss": 0.0365,
"step": 7720
},
{
"epoch": 7.411313518696069,
"grad_norm": 0.23207184672355652,
"learning_rate": 6.905173083956468e-05,
"loss": 0.0397,
"step": 7730
},
{
"epoch": 7.420901246404602,
"grad_norm": 0.2601774036884308,
"learning_rate": 6.897124237429726e-05,
"loss": 0.0377,
"step": 7740
},
{
"epoch": 7.4304889741131355,
"grad_norm": 0.37864232063293457,
"learning_rate": 6.889069643905646e-05,
"loss": 0.0426,
"step": 7750
},
{
"epoch": 7.440076701821669,
"grad_norm": 0.29199257493019104,
"learning_rate": 6.881009327784176e-05,
"loss": 0.0414,
"step": 7760
},
{
"epoch": 7.449664429530201,
"grad_norm": 0.39418113231658936,
"learning_rate": 6.872943313482596e-05,
"loss": 0.04,
"step": 7770
},
{
"epoch": 7.459252157238734,
"grad_norm": 0.2868475615978241,
"learning_rate": 6.864871625435448e-05,
"loss": 0.0373,
"step": 7780
},
{
"epoch": 7.4688398849472675,
"grad_norm": 0.27719494700431824,
"learning_rate": 6.856794288094461e-05,
"loss": 0.0401,
"step": 7790
},
{
"epoch": 7.478427612655801,
"grad_norm": 0.33910930156707764,
"learning_rate": 6.848711325928481e-05,
"loss": 0.0375,
"step": 7800
},
{
"epoch": 7.488015340364334,
"grad_norm": 0.4122414290904999,
"learning_rate": 6.840622763423391e-05,
"loss": 0.0437,
"step": 7810
},
{
"epoch": 7.497603068072867,
"grad_norm": 0.2600208818912506,
"learning_rate": 6.832528625082036e-05,
"loss": 0.0418,
"step": 7820
},
{
"epoch": 7.5071907957813995,
"grad_norm": 0.27382367849349976,
"learning_rate": 6.824428935424158e-05,
"loss": 0.0512,
"step": 7830
},
{
"epoch": 7.516778523489933,
"grad_norm": 0.27426889538764954,
"learning_rate": 6.816323718986313e-05,
"loss": 0.0339,
"step": 7840
},
{
"epoch": 7.526366251198466,
"grad_norm": 0.32315194606781006,
"learning_rate": 6.808213000321796e-05,
"loss": 0.0387,
"step": 7850
},
{
"epoch": 7.535953978906999,
"grad_norm": 0.2910844683647156,
"learning_rate": 6.80009680400058e-05,
"loss": 0.0351,
"step": 7860
},
{
"epoch": 7.545541706615532,
"grad_norm": 0.3915770649909973,
"learning_rate": 6.791975154609216e-05,
"loss": 0.0439,
"step": 7870
},
{
"epoch": 7.555129434324066,
"grad_norm": 0.2871047258377075,
"learning_rate": 6.78384807675079e-05,
"loss": 0.039,
"step": 7880
},
{
"epoch": 7.564717162032598,
"grad_norm": 0.3511698544025421,
"learning_rate": 6.775715595044822e-05,
"loss": 0.039,
"step": 7890
},
{
"epoch": 7.574304889741131,
"grad_norm": 0.23974575102329254,
"learning_rate": 6.767577734127209e-05,
"loss": 0.0438,
"step": 7900
},
{
"epoch": 7.583892617449664,
"grad_norm": 0.21983303129673004,
"learning_rate": 6.759434518650133e-05,
"loss": 0.043,
"step": 7910
},
{
"epoch": 7.593480345158198,
"grad_norm": 0.2729918658733368,
"learning_rate": 6.75128597328201e-05,
"loss": 0.0423,
"step": 7920
},
{
"epoch": 7.603068072866731,
"grad_norm": 0.34236469864845276,
"learning_rate": 6.743132122707394e-05,
"loss": 0.0443,
"step": 7930
},
{
"epoch": 7.612655800575264,
"grad_norm": 0.24948126077651978,
"learning_rate": 6.73497299162691e-05,
"loss": 0.037,
"step": 7940
},
{
"epoch": 7.622243528283796,
"grad_norm": 0.3250608444213867,
"learning_rate": 6.726808604757184e-05,
"loss": 0.0476,
"step": 7950
},
{
"epoch": 7.63183125599233,
"grad_norm": 0.2713163495063782,
"learning_rate": 6.718638986830758e-05,
"loss": 0.0391,
"step": 7960
},
{
"epoch": 7.641418983700863,
"grad_norm": 0.3012318015098572,
"learning_rate": 6.710464162596023e-05,
"loss": 0.0445,
"step": 7970
},
{
"epoch": 7.651006711409396,
"grad_norm": 0.4039930999279022,
"learning_rate": 6.702284156817143e-05,
"loss": 0.045,
"step": 7980
},
{
"epoch": 7.660594439117929,
"grad_norm": 0.22321514785289764,
"learning_rate": 6.694098994273977e-05,
"loss": 0.0395,
"step": 7990
},
{
"epoch": 7.6701821668264625,
"grad_norm": 0.3009647727012634,
"learning_rate": 6.685908699762002e-05,
"loss": 0.0425,
"step": 8000
},
{
"epoch": 7.679769894534996,
"grad_norm": 0.23675967752933502,
"learning_rate": 6.677713298092251e-05,
"loss": 0.043,
"step": 8010
},
{
"epoch": 7.689357622243528,
"grad_norm": 0.3453296422958374,
"learning_rate": 6.669512814091219e-05,
"loss": 0.0402,
"step": 8020
},
{
"epoch": 7.698945349952061,
"grad_norm": 0.35849177837371826,
"learning_rate": 6.6613072726008e-05,
"loss": 0.0412,
"step": 8030
},
{
"epoch": 7.7085330776605945,
"grad_norm": 0.2602018117904663,
"learning_rate": 6.65309669847821e-05,
"loss": 0.0456,
"step": 8040
},
{
"epoch": 7.718120805369128,
"grad_norm": 0.296563059091568,
"learning_rate": 6.64488111659591e-05,
"loss": 0.0354,
"step": 8050
},
{
"epoch": 7.727708533077661,
"grad_norm": 0.2529861629009247,
"learning_rate": 6.636660551841527e-05,
"loss": 0.046,
"step": 8060
},
{
"epoch": 7.737296260786193,
"grad_norm": 0.3589211404323578,
"learning_rate": 6.62843502911779e-05,
"loss": 0.0486,
"step": 8070
},
{
"epoch": 7.7468839884947265,
"grad_norm": 0.28562942147254944,
"learning_rate": 6.620204573342444e-05,
"loss": 0.04,
"step": 8080
},
{
"epoch": 7.75647171620326,
"grad_norm": 0.42662665247917175,
"learning_rate": 6.611969209448175e-05,
"loss": 0.0417,
"step": 8090
},
{
"epoch": 7.766059443911793,
"grad_norm": 0.3339911997318268,
"learning_rate": 6.603728962382542e-05,
"loss": 0.0344,
"step": 8100
},
{
"epoch": 7.775647171620326,
"grad_norm": 0.5838896632194519,
"learning_rate": 6.595483857107891e-05,
"loss": 0.0371,
"step": 8110
},
{
"epoch": 7.785234899328859,
"grad_norm": 0.30259743332862854,
"learning_rate": 6.587233918601292e-05,
"loss": 0.0392,
"step": 8120
},
{
"epoch": 7.794822627037393,
"grad_norm": 0.4095616340637207,
"learning_rate": 6.578979171854449e-05,
"loss": 0.034,
"step": 8130
},
{
"epoch": 7.804410354745925,
"grad_norm": 0.4089941084384918,
"learning_rate": 6.570719641873639e-05,
"loss": 0.0432,
"step": 8140
},
{
"epoch": 7.813998082454458,
"grad_norm": 0.22477275133132935,
"learning_rate": 6.562455353679624e-05,
"loss": 0.0482,
"step": 8150
},
{
"epoch": 7.823585810162991,
"grad_norm": 0.24884644150733948,
"learning_rate": 6.554186332307583e-05,
"loss": 0.0357,
"step": 8160
},
{
"epoch": 7.833173537871525,
"grad_norm": 0.40433716773986816,
"learning_rate": 6.545912602807029e-05,
"loss": 0.0393,
"step": 8170
},
{
"epoch": 7.842761265580058,
"grad_norm": 0.1963358074426651,
"learning_rate": 6.537634190241742e-05,
"loss": 0.0369,
"step": 8180
},
{
"epoch": 7.85234899328859,
"grad_norm": 0.30618107318878174,
"learning_rate": 6.529351119689688e-05,
"loss": 0.0365,
"step": 8190
},
{
"epoch": 7.861936720997123,
"grad_norm": 0.9213468432426453,
"learning_rate": 6.52106341624294e-05,
"loss": 0.0415,
"step": 8200
},
{
"epoch": 7.871524448705657,
"grad_norm": 0.41490432620048523,
"learning_rate": 6.512771105007609e-05,
"loss": 0.0432,
"step": 8210
},
{
"epoch": 7.88111217641419,
"grad_norm": 0.3433400094509125,
"learning_rate": 6.504474211103766e-05,
"loss": 0.0383,
"step": 8220
},
{
"epoch": 7.890699904122723,
"grad_norm": 0.2565036714076996,
"learning_rate": 6.496172759665357e-05,
"loss": 0.039,
"step": 8230
},
{
"epoch": 7.900287631831256,
"grad_norm": 0.36820822954177856,
"learning_rate": 6.487866775840141e-05,
"loss": 0.0373,
"step": 8240
},
{
"epoch": 7.9098753595397895,
"grad_norm": 0.26671302318573,
"learning_rate": 6.479556284789608e-05,
"loss": 0.0339,
"step": 8250
},
{
"epoch": 7.919463087248322,
"grad_norm": 0.3026654124259949,
"learning_rate": 6.471241311688894e-05,
"loss": 0.0363,
"step": 8260
},
{
"epoch": 7.929050814956855,
"grad_norm": 0.24896202981472015,
"learning_rate": 6.46292188172672e-05,
"loss": 0.0394,
"step": 8270
},
{
"epoch": 7.938638542665388,
"grad_norm": 0.3126719892024994,
"learning_rate": 6.454598020105306e-05,
"loss": 0.0439,
"step": 8280
},
{
"epoch": 7.9482262703739215,
"grad_norm": 0.33165302872657776,
"learning_rate": 6.446269752040295e-05,
"loss": 0.0393,
"step": 8290
},
{
"epoch": 7.957813998082455,
"grad_norm": 0.6648756265640259,
"learning_rate": 6.437937102760682e-05,
"loss": 0.0356,
"step": 8300
},
{
"epoch": 7.967401725790987,
"grad_norm": 0.24022682011127472,
"learning_rate": 6.429600097508732e-05,
"loss": 0.0406,
"step": 8310
},
{
"epoch": 7.97698945349952,
"grad_norm": 1.2279690504074097,
"learning_rate": 6.421258761539904e-05,
"loss": 0.0434,
"step": 8320
},
{
"epoch": 7.9865771812080535,
"grad_norm": 0.2868311107158661,
"learning_rate": 6.412913120122779e-05,
"loss": 0.0372,
"step": 8330
},
{
"epoch": 7.996164908916587,
"grad_norm": 0.25136950612068176,
"learning_rate": 6.40456319853898e-05,
"loss": 0.0405,
"step": 8340
},
{
"epoch": 8.00575263662512,
"grad_norm": 0.3662584722042084,
"learning_rate": 6.396209022083098e-05,
"loss": 0.041,
"step": 8350
},
{
"epoch": 8.015340364333653,
"grad_norm": 0.3134470283985138,
"learning_rate": 6.387850616062605e-05,
"loss": 0.0357,
"step": 8360
},
{
"epoch": 8.024928092042186,
"grad_norm": 0.3947703540325165,
"learning_rate": 6.379488005797797e-05,
"loss": 0.0384,
"step": 8370
},
{
"epoch": 8.03451581975072,
"grad_norm": 0.3272991478443146,
"learning_rate": 6.371121216621698e-05,
"loss": 0.0392,
"step": 8380
},
{
"epoch": 8.044103547459253,
"grad_norm": 1.1089465618133545,
"learning_rate": 6.362750273879996e-05,
"loss": 0.047,
"step": 8390
},
{
"epoch": 8.053691275167786,
"grad_norm": 0.2133249044418335,
"learning_rate": 6.354375202930958e-05,
"loss": 0.0333,
"step": 8400
},
{
"epoch": 8.063279002876317,
"grad_norm": 0.3814240097999573,
"learning_rate": 6.345996029145356e-05,
"loss": 0.0419,
"step": 8410
},
{
"epoch": 8.07286673058485,
"grad_norm": 0.38257062435150146,
"learning_rate": 6.337612777906398e-05,
"loss": 0.0412,
"step": 8420
},
{
"epoch": 8.082454458293384,
"grad_norm": 0.20826545357704163,
"learning_rate": 6.329225474609633e-05,
"loss": 0.0402,
"step": 8430
},
{
"epoch": 8.092042186001917,
"grad_norm": 0.2289332151412964,
"learning_rate": 6.320834144662897e-05,
"loss": 0.0392,
"step": 8440
},
{
"epoch": 8.10162991371045,
"grad_norm": 0.29565075039863586,
"learning_rate": 6.312438813486211e-05,
"loss": 0.0347,
"step": 8450
},
{
"epoch": 8.111217641418984,
"grad_norm": 0.21872690320014954,
"learning_rate": 6.30403950651173e-05,
"loss": 0.0357,
"step": 8460
},
{
"epoch": 8.120805369127517,
"grad_norm": 0.24760524928569794,
"learning_rate": 6.295636249183643e-05,
"loss": 0.0331,
"step": 8470
},
{
"epoch": 8.13039309683605,
"grad_norm": 0.2806303799152374,
"learning_rate": 6.287229066958113e-05,
"loss": 0.0393,
"step": 8480
},
{
"epoch": 8.139980824544583,
"grad_norm": 0.45841529965400696,
"learning_rate": 6.278817985303184e-05,
"loss": 0.0434,
"step": 8490
},
{
"epoch": 8.149568552253116,
"grad_norm": 0.21284928917884827,
"learning_rate": 6.270403029698722e-05,
"loss": 0.0311,
"step": 8500
},
{
"epoch": 8.15915627996165,
"grad_norm": 0.312191367149353,
"learning_rate": 6.261984225636324e-05,
"loss": 0.0409,
"step": 8510
},
{
"epoch": 8.168744007670183,
"grad_norm": 0.38339605927467346,
"learning_rate": 6.253561598619247e-05,
"loss": 0.0367,
"step": 8520
},
{
"epoch": 8.178331735378714,
"grad_norm": 0.24168361723423004,
"learning_rate": 6.245135174162323e-05,
"loss": 0.0419,
"step": 8530
},
{
"epoch": 8.187919463087248,
"grad_norm": 0.3038835823535919,
"learning_rate": 6.236704977791898e-05,
"loss": 0.0349,
"step": 8540
},
{
"epoch": 8.19750719079578,
"grad_norm": 0.32537156343460083,
"learning_rate": 6.228271035045735e-05,
"loss": 0.0347,
"step": 8550
},
{
"epoch": 8.207094918504314,
"grad_norm": 0.2789401412010193,
"learning_rate": 6.21983337147295e-05,
"loss": 0.0339,
"step": 8560
},
{
"epoch": 8.216682646212847,
"grad_norm": 0.4282236397266388,
"learning_rate": 6.211392012633932e-05,
"loss": 0.0352,
"step": 8570
},
{
"epoch": 8.22627037392138,
"grad_norm": 0.3608817458152771,
"learning_rate": 6.202946984100261e-05,
"loss": 0.0373,
"step": 8580
},
{
"epoch": 8.235858101629914,
"grad_norm": 0.29480835795402527,
"learning_rate": 6.194498311454636e-05,
"loss": 0.0321,
"step": 8590
},
{
"epoch": 8.245445829338447,
"grad_norm": 0.27964943647384644,
"learning_rate": 6.186046020290792e-05,
"loss": 0.0428,
"step": 8600
},
{
"epoch": 8.25503355704698,
"grad_norm": 0.2138575315475464,
"learning_rate": 6.177590136213429e-05,
"loss": 0.0344,
"step": 8610
},
{
"epoch": 8.264621284755513,
"grad_norm": 0.3693723678588867,
"learning_rate": 6.169130684838132e-05,
"loss": 0.0449,
"step": 8620
},
{
"epoch": 8.274209012464047,
"grad_norm": 0.24271826446056366,
"learning_rate": 6.160667691791287e-05,
"loss": 0.0414,
"step": 8630
},
{
"epoch": 8.28379674017258,
"grad_norm": 0.27349698543548584,
"learning_rate": 6.152201182710016e-05,
"loss": 0.0437,
"step": 8640
},
{
"epoch": 8.293384467881111,
"grad_norm": 0.265661358833313,
"learning_rate": 6.143731183242085e-05,
"loss": 0.0402,
"step": 8650
},
{
"epoch": 8.302972195589644,
"grad_norm": 0.3084318935871124,
"learning_rate": 6.13525771904584e-05,
"loss": 0.0424,
"step": 8660
},
{
"epoch": 8.312559923298178,
"grad_norm": 0.42005741596221924,
"learning_rate": 6.126780815790116e-05,
"loss": 0.0386,
"step": 8670
},
{
"epoch": 8.322147651006711,
"grad_norm": 0.349277526140213,
"learning_rate": 6.118300499154174e-05,
"loss": 0.0355,
"step": 8680
},
{
"epoch": 8.331735378715244,
"grad_norm": 0.3930281102657318,
"learning_rate": 6.109816794827607e-05,
"loss": 0.0386,
"step": 8690
},
{
"epoch": 8.341323106423777,
"grad_norm": 0.2631587088108063,
"learning_rate": 6.101329728510278e-05,
"loss": 0.0376,
"step": 8700
},
{
"epoch": 8.35091083413231,
"grad_norm": 0.3070177137851715,
"learning_rate": 6.0928393259122285e-05,
"loss": 0.039,
"step": 8710
},
{
"epoch": 8.360498561840844,
"grad_norm": 0.3494318425655365,
"learning_rate": 6.084345612753611e-05,
"loss": 0.0405,
"step": 8720
},
{
"epoch": 8.370086289549377,
"grad_norm": 0.2996184825897217,
"learning_rate": 6.0758486147646035e-05,
"loss": 0.0386,
"step": 8730
},
{
"epoch": 8.37967401725791,
"grad_norm": 0.39091756939888,
"learning_rate": 6.0673483576853365e-05,
"loss": 0.038,
"step": 8740
},
{
"epoch": 8.389261744966444,
"grad_norm": 0.28855571150779724,
"learning_rate": 6.0588448672658125e-05,
"loss": 0.0403,
"step": 8750
},
{
"epoch": 8.398849472674977,
"grad_norm": 0.25725746154785156,
"learning_rate": 6.05033816926583e-05,
"loss": 0.0338,
"step": 8760
},
{
"epoch": 8.40843720038351,
"grad_norm": 0.2737105190753937,
"learning_rate": 6.041828289454903e-05,
"loss": 0.0417,
"step": 8770
},
{
"epoch": 8.418024928092041,
"grad_norm": 0.3197145462036133,
"learning_rate": 6.033315253612186e-05,
"loss": 0.0428,
"step": 8780
},
{
"epoch": 8.427612655800575,
"grad_norm": 0.35713446140289307,
"learning_rate": 6.0247990875263914e-05,
"loss": 0.0376,
"step": 8790
},
{
"epoch": 8.437200383509108,
"grad_norm": 0.354390949010849,
"learning_rate": 6.016279816995718e-05,
"loss": 0.0384,
"step": 8800
},
{
"epoch": 8.446788111217641,
"grad_norm": 0.31738895177841187,
"learning_rate": 6.0077574678277636e-05,
"loss": 0.048,
"step": 8810
},
{
"epoch": 8.456375838926174,
"grad_norm": 0.28505873680114746,
"learning_rate": 5.999232065839456e-05,
"loss": 0.0353,
"step": 8820
},
{
"epoch": 8.465963566634708,
"grad_norm": 0.3551139831542969,
"learning_rate": 5.990703636856974e-05,
"loss": 0.0422,
"step": 8830
},
{
"epoch": 8.47555129434324,
"grad_norm": 0.23753251135349274,
"learning_rate": 5.982172206715656e-05,
"loss": 0.0356,
"step": 8840
},
{
"epoch": 8.485139022051774,
"grad_norm": 0.3025340735912323,
"learning_rate": 5.973637801259944e-05,
"loss": 0.0416,
"step": 8850
},
{
"epoch": 8.494726749760307,
"grad_norm": 0.3358081579208374,
"learning_rate": 5.9651004463432826e-05,
"loss": 0.0406,
"step": 8860
},
{
"epoch": 8.50431447746884,
"grad_norm": 0.2748364508152008,
"learning_rate": 5.95656016782806e-05,
"loss": 0.0355,
"step": 8870
},
{
"epoch": 8.513902205177374,
"grad_norm": 0.27150842547416687,
"learning_rate": 5.948016991585514e-05,
"loss": 0.0356,
"step": 8880
},
{
"epoch": 8.523489932885907,
"grad_norm": 0.2812124490737915,
"learning_rate": 5.9394709434956664e-05,
"loss": 0.0419,
"step": 8890
},
{
"epoch": 8.53307766059444,
"grad_norm": 0.29283568263053894,
"learning_rate": 5.9309220494472314e-05,
"loss": 0.0408,
"step": 8900
},
{
"epoch": 8.542665388302972,
"grad_norm": 0.4069705605506897,
"learning_rate": 5.9223703353375534e-05,
"loss": 0.0425,
"step": 8910
},
{
"epoch": 8.552253116011505,
"grad_norm": 0.2776540219783783,
"learning_rate": 5.913815827072513e-05,
"loss": 0.0365,
"step": 8920
},
{
"epoch": 8.561840843720038,
"grad_norm": 0.2777857482433319,
"learning_rate": 5.905258550566458e-05,
"loss": 0.0368,
"step": 8930
},
{
"epoch": 8.571428571428571,
"grad_norm": 0.3018902838230133,
"learning_rate": 5.896698531742122e-05,
"loss": 0.0377,
"step": 8940
},
{
"epoch": 8.581016299137104,
"grad_norm": 0.622887134552002,
"learning_rate": 5.888135796530544e-05,
"loss": 0.0448,
"step": 8950
},
{
"epoch": 8.590604026845638,
"grad_norm": 0.28407829999923706,
"learning_rate": 5.879570370870995e-05,
"loss": 0.0373,
"step": 8960
},
{
"epoch": 8.60019175455417,
"grad_norm": 0.2791987955570221,
"learning_rate": 5.871002280710892e-05,
"loss": 0.0402,
"step": 8970
},
{
"epoch": 8.609779482262704,
"grad_norm": 0.27533990144729614,
"learning_rate": 5.862431552005729e-05,
"loss": 0.0434,
"step": 8980
},
{
"epoch": 8.619367209971237,
"grad_norm": 0.27701878547668457,
"learning_rate": 5.85385821071899e-05,
"loss": 0.0383,
"step": 8990
},
{
"epoch": 8.62895493767977,
"grad_norm": 0.269197016954422,
"learning_rate": 5.845282282822071e-05,
"loss": 0.0389,
"step": 9000
},
{
"epoch": 8.638542665388304,
"grad_norm": 0.3775997757911682,
"learning_rate": 5.836703794294208e-05,
"loss": 0.0401,
"step": 9010
},
{
"epoch": 8.648130393096835,
"grad_norm": 0.21519199013710022,
"learning_rate": 5.828122771122392e-05,
"loss": 0.0326,
"step": 9020
},
{
"epoch": 8.657718120805368,
"grad_norm": 0.4001868963241577,
"learning_rate": 5.819539239301291e-05,
"loss": 0.04,
"step": 9030
},
{
"epoch": 8.667305848513902,
"grad_norm": 0.19594238698482513,
"learning_rate": 5.810953224833177e-05,
"loss": 0.0301,
"step": 9040
},
{
"epoch": 8.676893576222435,
"grad_norm": 0.19823068380355835,
"learning_rate": 5.802364753727836e-05,
"loss": 0.0344,
"step": 9050
},
{
"epoch": 8.686481303930968,
"grad_norm": 0.26146700978279114,
"learning_rate": 5.793773852002502e-05,
"loss": 0.0444,
"step": 9060
},
{
"epoch": 8.696069031639501,
"grad_norm": 0.36863768100738525,
"learning_rate": 5.7851805456817677e-05,
"loss": 0.0364,
"step": 9070
},
{
"epoch": 8.705656759348035,
"grad_norm": 0.2518344521522522,
"learning_rate": 5.7765848607975136e-05,
"loss": 0.0394,
"step": 9080
},
{
"epoch": 8.715244487056568,
"grad_norm": 0.2473488301038742,
"learning_rate": 5.767986823388825e-05,
"loss": 0.0326,
"step": 9090
},
{
"epoch": 8.724832214765101,
"grad_norm": 0.20669348537921906,
"learning_rate": 5.7593864595019096e-05,
"loss": 0.0408,
"step": 9100
},
{
"epoch": 8.734419942473634,
"grad_norm": 0.32804393768310547,
"learning_rate": 5.750783795190029e-05,
"loss": 0.0388,
"step": 9110
},
{
"epoch": 8.744007670182167,
"grad_norm": 0.18472160398960114,
"learning_rate": 5.7421788565134074e-05,
"loss": 0.0395,
"step": 9120
},
{
"epoch": 8.7535953978907,
"grad_norm": 0.3553003668785095,
"learning_rate": 5.733571669539167e-05,
"loss": 0.0432,
"step": 9130
},
{
"epoch": 8.763183125599234,
"grad_norm": 0.2398902177810669,
"learning_rate": 5.72496226034123e-05,
"loss": 0.0354,
"step": 9140
},
{
"epoch": 8.772770853307765,
"grad_norm": 0.2900802195072174,
"learning_rate": 5.716350655000261e-05,
"loss": 0.0449,
"step": 9150
},
{
"epoch": 8.782358581016299,
"grad_norm": 0.17919373512268066,
"learning_rate": 5.707736879603568e-05,
"loss": 0.0413,
"step": 9160
},
{
"epoch": 8.791946308724832,
"grad_norm": 0.2598424255847931,
"learning_rate": 5.6991209602450424e-05,
"loss": 0.0432,
"step": 9170
},
{
"epoch": 8.801534036433365,
"grad_norm": 0.4794408082962036,
"learning_rate": 5.69050292302506e-05,
"loss": 0.0392,
"step": 9180
},
{
"epoch": 8.811121764141898,
"grad_norm": 0.3420094847679138,
"learning_rate": 5.6818827940504225e-05,
"loss": 0.0335,
"step": 9190
},
{
"epoch": 8.820709491850431,
"grad_norm": 1.9920908212661743,
"learning_rate": 5.673260599434259e-05,
"loss": 0.0427,
"step": 9200
},
{
"epoch": 8.830297219558965,
"grad_norm": 0.28250133991241455,
"learning_rate": 5.664636365295965e-05,
"loss": 0.0349,
"step": 9210
},
{
"epoch": 8.839884947267498,
"grad_norm": 0.22743001580238342,
"learning_rate": 5.656010117761105e-05,
"loss": 0.0401,
"step": 9220
},
{
"epoch": 8.849472674976031,
"grad_norm": 0.2771368622779846,
"learning_rate": 5.647381882961349e-05,
"loss": 0.0424,
"step": 9230
},
{
"epoch": 8.859060402684564,
"grad_norm": 0.38394448161125183,
"learning_rate": 5.638751687034387e-05,
"loss": 0.0357,
"step": 9240
},
{
"epoch": 8.868648130393098,
"grad_norm": 0.22416839003562927,
"learning_rate": 5.630119556123848e-05,
"loss": 0.0347,
"step": 9250
},
{
"epoch": 8.87823585810163,
"grad_norm": 0.1746525913476944,
"learning_rate": 5.6214855163792224e-05,
"loss": 0.0366,
"step": 9260
},
{
"epoch": 8.887823585810162,
"grad_norm": 0.26215359568595886,
"learning_rate": 5.6128495939557835e-05,
"loss": 0.0411,
"step": 9270
},
{
"epoch": 8.897411313518695,
"grad_norm": 0.3498288691043854,
"learning_rate": 5.604211815014509e-05,
"loss": 0.0404,
"step": 9280
},
{
"epoch": 8.906999041227229,
"grad_norm": 0.19935335218906403,
"learning_rate": 5.595572205721999e-05,
"loss": 0.0356,
"step": 9290
},
{
"epoch": 8.916586768935762,
"grad_norm": 0.3347182869911194,
"learning_rate": 5.5869307922504e-05,
"loss": 0.0393,
"step": 9300
},
{
"epoch": 8.926174496644295,
"grad_norm": 0.3638782203197479,
"learning_rate": 5.578287600777321e-05,
"loss": 0.0324,
"step": 9310
},
{
"epoch": 8.935762224352828,
"grad_norm": 0.2433633953332901,
"learning_rate": 5.569642657485761e-05,
"loss": 0.0351,
"step": 9320
},
{
"epoch": 8.945349952061362,
"grad_norm": 0.2311711609363556,
"learning_rate": 5.560995988564023e-05,
"loss": 0.0386,
"step": 9330
},
{
"epoch": 8.954937679769895,
"grad_norm": 0.2803432047367096,
"learning_rate": 5.552347620205638e-05,
"loss": 0.0461,
"step": 9340
},
{
"epoch": 8.964525407478428,
"grad_norm": 0.25586047768592834,
"learning_rate": 5.5436975786092873e-05,
"loss": 0.0384,
"step": 9350
},
{
"epoch": 8.974113135186961,
"grad_norm": 0.3626959025859833,
"learning_rate": 5.535045889978717e-05,
"loss": 0.0374,
"step": 9360
},
{
"epoch": 8.983700862895494,
"grad_norm": 0.3548148572444916,
"learning_rate": 5.526392580522666e-05,
"loss": 0.0416,
"step": 9370
},
{
"epoch": 8.993288590604028,
"grad_norm": 2.09843111038208,
"learning_rate": 5.5177376764547814e-05,
"loss": 0.0434,
"step": 9380
},
{
"epoch": 9.002876318312559,
"grad_norm": 0.4216479957103729,
"learning_rate": 5.5090812039935426e-05,
"loss": 0.0404,
"step": 9390
},
{
"epoch": 9.012464046021092,
"grad_norm": 0.292222261428833,
"learning_rate": 5.5004231893621774e-05,
"loss": 0.0362,
"step": 9400
},
{
"epoch": 9.022051773729626,
"grad_norm": 0.37306836247444153,
"learning_rate": 5.491763658788589e-05,
"loss": 0.0367,
"step": 9410
},
{
"epoch": 9.031639501438159,
"grad_norm": 0.2755350172519684,
"learning_rate": 5.483102638505269e-05,
"loss": 0.0401,
"step": 9420
},
{
"epoch": 9.041227229146692,
"grad_norm": 0.2616848349571228,
"learning_rate": 5.4744401547492254e-05,
"loss": 0.0337,
"step": 9430
},
{
"epoch": 9.050814956855225,
"grad_norm": 0.28111451864242554,
"learning_rate": 5.465776233761896e-05,
"loss": 0.0384,
"step": 9440
},
{
"epoch": 9.060402684563758,
"grad_norm": 0.23586216568946838,
"learning_rate": 5.4571109017890753e-05,
"loss": 0.0405,
"step": 9450
},
{
"epoch": 9.069990412272292,
"grad_norm": 0.3019304871559143,
"learning_rate": 5.44844418508083e-05,
"loss": 0.0389,
"step": 9460
},
{
"epoch": 9.079578139980825,
"grad_norm": 0.3531333804130554,
"learning_rate": 5.4397761098914254e-05,
"loss": 0.0334,
"step": 9470
},
{
"epoch": 9.089165867689358,
"grad_norm": 0.40830254554748535,
"learning_rate": 5.431106702479235e-05,
"loss": 0.0357,
"step": 9480
},
{
"epoch": 9.098753595397891,
"grad_norm": 0.44957104325294495,
"learning_rate": 5.4224359891066765e-05,
"loss": 0.039,
"step": 9490
},
{
"epoch": 9.108341323106425,
"grad_norm": 0.6519899964332581,
"learning_rate": 5.413763996040117e-05,
"loss": 0.0402,
"step": 9500
},
{
"epoch": 9.117929050814958,
"grad_norm": 0.4034676253795624,
"learning_rate": 5.405090749549804e-05,
"loss": 0.0459,
"step": 9510
},
{
"epoch": 9.12751677852349,
"grad_norm": 0.3996933698654175,
"learning_rate": 5.396416275909779e-05,
"loss": 0.0398,
"step": 9520
},
{
"epoch": 9.137104506232022,
"grad_norm": 0.16408595442771912,
"learning_rate": 5.387740601397806e-05,
"loss": 0.0358,
"step": 9530
},
{
"epoch": 9.146692233940556,
"grad_norm": 0.3471783995628357,
"learning_rate": 5.379063752295282e-05,
"loss": 0.0391,
"step": 9540
},
{
"epoch": 9.156279961649089,
"grad_norm": 0.4107268452644348,
"learning_rate": 5.370385754887164e-05,
"loss": 0.0424,
"step": 9550
},
{
"epoch": 9.165867689357622,
"grad_norm": 0.32927405834198,
"learning_rate": 5.3617066354618874e-05,
"loss": 0.0453,
"step": 9560
},
{
"epoch": 9.175455417066155,
"grad_norm": 0.41520607471466064,
"learning_rate": 5.3530264203112856e-05,
"loss": 0.0392,
"step": 9570
},
{
"epoch": 9.185043144774689,
"grad_norm": 0.3985765278339386,
"learning_rate": 5.344345135730513e-05,
"loss": 0.0364,
"step": 9580
},
{
"epoch": 9.194630872483222,
"grad_norm": 0.344056099653244,
"learning_rate": 5.335662808017964e-05,
"loss": 0.0444,
"step": 9590
},
{
"epoch": 9.204218600191755,
"grad_norm": 0.3382169008255005,
"learning_rate": 5.32697946347519e-05,
"loss": 0.0375,
"step": 9600
},
{
"epoch": 9.213806327900288,
"grad_norm": 0.3668196499347687,
"learning_rate": 5.318295128406825e-05,
"loss": 0.0427,
"step": 9610
},
{
"epoch": 9.223394055608821,
"grad_norm": 0.22777938842773438,
"learning_rate": 5.3096098291205044e-05,
"loss": 0.0362,
"step": 9620
},
{
"epoch": 9.232981783317355,
"grad_norm": 0.2992532551288605,
"learning_rate": 5.300923591926783e-05,
"loss": 0.0344,
"step": 9630
},
{
"epoch": 9.242569511025886,
"grad_norm": 0.2733289301395416,
"learning_rate": 5.292236443139056e-05,
"loss": 0.0318,
"step": 9640
},
{
"epoch": 9.25215723873442,
"grad_norm": 0.2972942292690277,
"learning_rate": 5.283548409073482e-05,
"loss": 0.0357,
"step": 9650
},
{
"epoch": 9.261744966442953,
"grad_norm": 0.3721420466899872,
"learning_rate": 5.274859516048901e-05,
"loss": 0.0356,
"step": 9660
},
{
"epoch": 9.271332694151486,
"grad_norm": 0.13791558146476746,
"learning_rate": 5.266169790386756e-05,
"loss": 0.0345,
"step": 9670
},
{
"epoch": 9.280920421860019,
"grad_norm": 0.2645628750324249,
"learning_rate": 5.257479258411008e-05,
"loss": 0.0426,
"step": 9680
},
{
"epoch": 9.290508149568552,
"grad_norm": 0.3136797845363617,
"learning_rate": 5.248787946448065e-05,
"loss": 0.0354,
"step": 9690
},
{
"epoch": 9.300095877277085,
"grad_norm": 0.25481873750686646,
"learning_rate": 5.240095880826695e-05,
"loss": 0.0401,
"step": 9700
},
{
"epoch": 9.309683604985619,
"grad_norm": 0.24243059754371643,
"learning_rate": 5.231403087877955e-05,
"loss": 0.0422,
"step": 9710
},
{
"epoch": 9.319271332694152,
"grad_norm": 0.22734355926513672,
"learning_rate": 5.2227095939350966e-05,
"loss": 0.0409,
"step": 9720
},
{
"epoch": 9.328859060402685,
"grad_norm": 0.35372641682624817,
"learning_rate": 5.214015425333502e-05,
"loss": 0.0413,
"step": 9730
},
{
"epoch": 9.338446788111218,
"grad_norm": 0.2218106985092163,
"learning_rate": 5.205320608410591e-05,
"loss": 0.0385,
"step": 9740
},
{
"epoch": 9.348034515819752,
"grad_norm": 0.8550918698310852,
"learning_rate": 5.196625169505755e-05,
"loss": 0.0383,
"step": 9750
},
{
"epoch": 9.357622243528283,
"grad_norm": 0.325469434261322,
"learning_rate": 5.18792913496026e-05,
"loss": 0.0377,
"step": 9760
},
{
"epoch": 9.367209971236816,
"grad_norm": 0.2887977063655853,
"learning_rate": 5.1792325311171875e-05,
"loss": 0.039,
"step": 9770
},
{
"epoch": 9.37679769894535,
"grad_norm": 0.267398476600647,
"learning_rate": 5.1705353843213336e-05,
"loss": 0.0351,
"step": 9780
},
{
"epoch": 9.386385426653883,
"grad_norm": 0.3469073176383972,
"learning_rate": 5.1618377209191447e-05,
"loss": 0.0373,
"step": 9790
},
{
"epoch": 9.395973154362416,
"grad_norm": 0.399781733751297,
"learning_rate": 5.1531395672586314e-05,
"loss": 0.0345,
"step": 9800
},
{
"epoch": 9.405560882070949,
"grad_norm": 0.3050326704978943,
"learning_rate": 5.144440949689287e-05,
"loss": 0.0436,
"step": 9810
},
{
"epoch": 9.415148609779482,
"grad_norm": 0.22124247252941132,
"learning_rate": 5.135741894562014e-05,
"loss": 0.0384,
"step": 9820
},
{
"epoch": 9.424736337488016,
"grad_norm": 0.32914167642593384,
"learning_rate": 5.127042428229036e-05,
"loss": 0.0395,
"step": 9830
},
{
"epoch": 9.434324065196549,
"grad_norm": 0.302157998085022,
"learning_rate": 5.118342577043829e-05,
"loss": 0.0446,
"step": 9840
},
{
"epoch": 9.443911792905082,
"grad_norm": 0.29756733775138855,
"learning_rate": 5.1096423673610246e-05,
"loss": 0.035,
"step": 9850
},
{
"epoch": 9.453499520613615,
"grad_norm": 0.21626603603363037,
"learning_rate": 5.100941825536353e-05,
"loss": 0.0487,
"step": 9860
},
{
"epoch": 9.463087248322148,
"grad_norm": 0.31502407789230347,
"learning_rate": 5.092240977926538e-05,
"loss": 0.0384,
"step": 9870
},
{
"epoch": 9.47267497603068,
"grad_norm": 0.3153168261051178,
"learning_rate": 5.083539850889239e-05,
"loss": 0.0377,
"step": 9880
},
{
"epoch": 9.482262703739213,
"grad_norm": 0.3235209584236145,
"learning_rate": 5.074838470782957e-05,
"loss": 0.0402,
"step": 9890
},
{
"epoch": 9.491850431447746,
"grad_norm": 0.4194275438785553,
"learning_rate": 5.066136863966963e-05,
"loss": 0.0349,
"step": 9900
},
{
"epoch": 9.50143815915628,
"grad_norm": 0.26690346002578735,
"learning_rate": 5.0574350568012086e-05,
"loss": 0.037,
"step": 9910
},
{
"epoch": 9.511025886864813,
"grad_norm": 0.3191596567630768,
"learning_rate": 5.0487330756462624e-05,
"loss": 0.0427,
"step": 9920
},
{
"epoch": 9.520613614573346,
"grad_norm": 0.21837887167930603,
"learning_rate": 5.040030946863209e-05,
"loss": 0.031,
"step": 9930
},
{
"epoch": 9.53020134228188,
"grad_norm": 0.28201964497566223,
"learning_rate": 5.0313286968135884e-05,
"loss": 0.0348,
"step": 9940
},
{
"epoch": 9.539789069990412,
"grad_norm": 0.6378640532493591,
"learning_rate": 5.022626351859305e-05,
"loss": 0.0392,
"step": 9950
},
{
"epoch": 9.549376797698946,
"grad_norm": 0.27877506613731384,
"learning_rate": 5.01392393836255e-05,
"loss": 0.0435,
"step": 9960
},
{
"epoch": 9.558964525407479,
"grad_norm": 0.21583925187587738,
"learning_rate": 5.0052214826857225e-05,
"loss": 0.036,
"step": 9970
},
{
"epoch": 9.568552253116012,
"grad_norm": 0.3575581908226013,
"learning_rate": 4.996519011191351e-05,
"loss": 0.0344,
"step": 9980
},
{
"epoch": 9.578139980824545,
"grad_norm": 0.2446652501821518,
"learning_rate": 4.9878165502420104e-05,
"loss": 0.0382,
"step": 9990
},
{
"epoch": 9.587727708533077,
"grad_norm": 0.1690993756055832,
"learning_rate": 4.979114126200244e-05,
"loss": 0.0392,
"step": 10000
},
{
"epoch": 9.59731543624161,
"grad_norm": 0.3892661929130554,
"learning_rate": 4.970411765428484e-05,
"loss": 0.0366,
"step": 10010
},
{
"epoch": 9.606903163950143,
"grad_norm": 0.26752811670303345,
"learning_rate": 4.961709494288966e-05,
"loss": 0.0377,
"step": 10020
},
{
"epoch": 9.616490891658676,
"grad_norm": 0.3104531466960907,
"learning_rate": 4.9530073391436654e-05,
"loss": 0.0371,
"step": 10030
},
{
"epoch": 9.62607861936721,
"grad_norm": 0.3081854283809662,
"learning_rate": 4.944305326354194e-05,
"loss": 0.0377,
"step": 10040
},
{
"epoch": 9.635666347075743,
"grad_norm": 0.32180699706077576,
"learning_rate": 4.935603482281739e-05,
"loss": 0.0364,
"step": 10050
},
{
"epoch": 9.645254074784276,
"grad_norm": 0.30046379566192627,
"learning_rate": 4.926901833286974e-05,
"loss": 0.0341,
"step": 10060
},
{
"epoch": 9.65484180249281,
"grad_norm": 0.24152809381484985,
"learning_rate": 4.918200405729986e-05,
"loss": 0.0453,
"step": 10070
},
{
"epoch": 9.664429530201343,
"grad_norm": 0.8806717395782471,
"learning_rate": 4.909499225970184e-05,
"loss": 0.0352,
"step": 10080
},
{
"epoch": 9.674017257909876,
"grad_norm": 0.3561595380306244,
"learning_rate": 4.9007983203662326e-05,
"loss": 0.0337,
"step": 10090
},
{
"epoch": 9.683604985618409,
"grad_norm": 0.3623135983943939,
"learning_rate": 4.892097715275961e-05,
"loss": 0.0361,
"step": 10100
},
{
"epoch": 9.693192713326942,
"grad_norm": 0.3282937705516815,
"learning_rate": 4.883397437056293e-05,
"loss": 0.0357,
"step": 10110
},
{
"epoch": 9.702780441035475,
"grad_norm": 0.28583481907844543,
"learning_rate": 4.87469751206316e-05,
"loss": 0.032,
"step": 10120
},
{
"epoch": 9.712368168744007,
"grad_norm": 0.20011906325817108,
"learning_rate": 4.865997966651421e-05,
"loss": 0.0335,
"step": 10130
},
{
"epoch": 9.72195589645254,
"grad_norm": 0.23072586953639984,
"learning_rate": 4.857298827174787e-05,
"loss": 0.0326,
"step": 10140
},
{
"epoch": 9.731543624161073,
"grad_norm": 0.21280129253864288,
"learning_rate": 4.8486001199857416e-05,
"loss": 0.0354,
"step": 10150
},
{
"epoch": 9.741131351869607,
"grad_norm": 0.4237668812274933,
"learning_rate": 4.839901871435457e-05,
"loss": 0.0351,
"step": 10160
},
{
"epoch": 9.75071907957814,
"grad_norm": 0.2798875868320465,
"learning_rate": 4.831204107873713e-05,
"loss": 0.0353,
"step": 10170
},
{
"epoch": 9.760306807286673,
"grad_norm": 0.20780718326568604,
"learning_rate": 4.822506855648825e-05,
"loss": 0.0326,
"step": 10180
},
{
"epoch": 9.769894534995206,
"grad_norm": 0.2649904489517212,
"learning_rate": 4.8138101411075574e-05,
"loss": 0.035,
"step": 10190
},
{
"epoch": 9.77948226270374,
"grad_norm": 0.26445141434669495,
"learning_rate": 4.805113990595046e-05,
"loss": 0.0468,
"step": 10200
},
{
"epoch": 9.789069990412273,
"grad_norm": 0.3209472894668579,
"learning_rate": 4.796418430454718e-05,
"loss": 0.0375,
"step": 10210
},
{
"epoch": 9.798657718120806,
"grad_norm": 0.19877949357032776,
"learning_rate": 4.787723487028209e-05,
"loss": 0.0381,
"step": 10220
},
{
"epoch": 9.808245445829339,
"grad_norm": 0.3071509301662445,
"learning_rate": 4.779029186655292e-05,
"loss": 0.0432,
"step": 10230
},
{
"epoch": 9.817833173537872,
"grad_norm": 0.4730135500431061,
"learning_rate": 4.77033555567379e-05,
"loss": 0.0374,
"step": 10240
},
{
"epoch": 9.827420901246404,
"grad_norm": 0.29888778924942017,
"learning_rate": 4.761642620419497e-05,
"loss": 0.0357,
"step": 10250
},
{
"epoch": 9.837008628954937,
"grad_norm": 0.2550467550754547,
"learning_rate": 4.7529504072260974e-05,
"loss": 0.0309,
"step": 10260
},
{
"epoch": 9.84659635666347,
"grad_norm": 0.25972646474838257,
"learning_rate": 4.744258942425094e-05,
"loss": 0.0421,
"step": 10270
},
{
"epoch": 9.856184084372003,
"grad_norm": 0.4071574807167053,
"learning_rate": 4.735568252345718e-05,
"loss": 0.0351,
"step": 10280
},
{
"epoch": 9.865771812080537,
"grad_norm": 0.4687805771827698,
"learning_rate": 4.726878363314855e-05,
"loss": 0.0369,
"step": 10290
},
{
"epoch": 9.87535953978907,
"grad_norm": 0.41865023970603943,
"learning_rate": 4.718189301656962e-05,
"loss": 0.0345,
"step": 10300
},
{
"epoch": 9.884947267497603,
"grad_norm": 0.30435627698898315,
"learning_rate": 4.709501093693997e-05,
"loss": 0.0321,
"step": 10310
},
{
"epoch": 9.894534995206136,
"grad_norm": 0.3561161458492279,
"learning_rate": 4.7008137657453214e-05,
"loss": 0.0409,
"step": 10320
},
{
"epoch": 9.90412272291467,
"grad_norm": 0.36440134048461914,
"learning_rate": 4.692127344127637e-05,
"loss": 0.033,
"step": 10330
},
{
"epoch": 9.913710450623203,
"grad_norm": 0.26994454860687256,
"learning_rate": 4.683441855154899e-05,
"loss": 0.0346,
"step": 10340
},
{
"epoch": 9.923298178331736,
"grad_norm": 0.2506847381591797,
"learning_rate": 4.674757325138239e-05,
"loss": 0.0314,
"step": 10350
},
{
"epoch": 9.93288590604027,
"grad_norm": 0.20864498615264893,
"learning_rate": 4.666073780385879e-05,
"loss": 0.0366,
"step": 10360
},
{
"epoch": 9.9424736337488,
"grad_norm": 0.18419000506401062,
"learning_rate": 4.65739124720306e-05,
"loss": 0.0329,
"step": 10370
},
{
"epoch": 9.952061361457334,
"grad_norm": 0.3387259244918823,
"learning_rate": 4.648709751891957e-05,
"loss": 0.0381,
"step": 10380
},
{
"epoch": 9.961649089165867,
"grad_norm": 0.2119244635105133,
"learning_rate": 4.640029320751606e-05,
"loss": 0.0351,
"step": 10390
},
{
"epoch": 9.9712368168744,
"grad_norm": 0.4716765880584717,
"learning_rate": 4.63134998007781e-05,
"loss": 0.0378,
"step": 10400
},
{
"epoch": 9.980824544582934,
"grad_norm": 0.47296905517578125,
"learning_rate": 4.622671756163075e-05,
"loss": 0.0397,
"step": 10410
},
{
"epoch": 9.990412272291467,
"grad_norm": 0.3720930218696594,
"learning_rate": 4.6139946752965216e-05,
"loss": 0.0387,
"step": 10420
},
{
"epoch": 10.0,
"grad_norm": 0.2873878479003906,
"learning_rate": 4.6053187637638115e-05,
"loss": 0.0336,
"step": 10430
},
{
"epoch": 10.009587727708533,
"grad_norm": 0.27077776193618774,
"learning_rate": 4.596644047847061e-05,
"loss": 0.0335,
"step": 10440
},
{
"epoch": 10.019175455417066,
"grad_norm": 0.29882556200027466,
"learning_rate": 4.587970553824762e-05,
"loss": 0.0329,
"step": 10450
},
{
"epoch": 10.0287631831256,
"grad_norm": 0.23539794981479645,
"learning_rate": 4.579298307971709e-05,
"loss": 0.0319,
"step": 10460
},
{
"epoch": 10.038350910834133,
"grad_norm": 0.47081291675567627,
"learning_rate": 4.570627336558915e-05,
"loss": 0.0448,
"step": 10470
},
{
"epoch": 10.047938638542666,
"grad_norm": 0.21392913162708282,
"learning_rate": 4.561957665853532e-05,
"loss": 0.0406,
"step": 10480
},
{
"epoch": 10.0575263662512,
"grad_norm": 0.31942254304885864,
"learning_rate": 4.553289322118769e-05,
"loss": 0.0347,
"step": 10490
},
{
"epoch": 10.06711409395973,
"grad_norm": 0.22749362885951996,
"learning_rate": 4.544622331613817e-05,
"loss": 0.0414,
"step": 10500
},
{
"epoch": 10.076701821668264,
"grad_norm": 0.24884119629859924,
"learning_rate": 4.5359567205937706e-05,
"loss": 0.0314,
"step": 10510
},
{
"epoch": 10.086289549376797,
"grad_norm": 0.26897284388542175,
"learning_rate": 4.527292515309541e-05,
"loss": 0.0394,
"step": 10520
},
{
"epoch": 10.09587727708533,
"grad_norm": 0.3579690158367157,
"learning_rate": 4.518629742007786e-05,
"loss": 0.0365,
"step": 10530
},
{
"epoch": 10.105465004793864,
"grad_norm": 0.19811834394931793,
"learning_rate": 4.509968426930817e-05,
"loss": 0.0358,
"step": 10540
},
{
"epoch": 10.115052732502397,
"grad_norm": 0.2834417223930359,
"learning_rate": 4.501308596316537e-05,
"loss": 0.0329,
"step": 10550
},
{
"epoch": 10.12464046021093,
"grad_norm": 0.1813543736934662,
"learning_rate": 4.492650276398347e-05,
"loss": 0.0345,
"step": 10560
},
{
"epoch": 10.134228187919463,
"grad_norm": 0.23895332217216492,
"learning_rate": 4.483993493405075e-05,
"loss": 0.0328,
"step": 10570
},
{
"epoch": 10.143815915627997,
"grad_norm": 0.2329237461090088,
"learning_rate": 4.475338273560886e-05,
"loss": 0.0334,
"step": 10580
},
{
"epoch": 10.15340364333653,
"grad_norm": 0.32786402106285095,
"learning_rate": 4.466684643085223e-05,
"loss": 0.0362,
"step": 10590
},
{
"epoch": 10.162991371045063,
"grad_norm": 0.2858993709087372,
"learning_rate": 4.458032628192699e-05,
"loss": 0.0349,
"step": 10600
},
{
"epoch": 10.172579098753596,
"grad_norm": 0.38395509123802185,
"learning_rate": 4.449382255093044e-05,
"loss": 0.0384,
"step": 10610
},
{
"epoch": 10.182166826462128,
"grad_norm": 0.35513293743133545,
"learning_rate": 4.440733549991006e-05,
"loss": 0.0317,
"step": 10620
},
{
"epoch": 10.191754554170661,
"grad_norm": 0.21551890671253204,
"learning_rate": 4.432086539086292e-05,
"loss": 0.0373,
"step": 10630
},
{
"epoch": 10.201342281879194,
"grad_norm": 0.22998203337192535,
"learning_rate": 4.423441248573463e-05,
"loss": 0.0376,
"step": 10640
},
{
"epoch": 10.210930009587727,
"grad_norm": 0.4294188618659973,
"learning_rate": 4.4147977046418776e-05,
"loss": 0.0356,
"step": 10650
},
{
"epoch": 10.22051773729626,
"grad_norm": 0.2688153386116028,
"learning_rate": 4.406155933475599e-05,
"loss": 0.0364,
"step": 10660
},
{
"epoch": 10.230105465004794,
"grad_norm": 0.39193832874298096,
"learning_rate": 4.3975159612533244e-05,
"loss": 0.0337,
"step": 10670
},
{
"epoch": 10.239693192713327,
"grad_norm": 0.4422641694545746,
"learning_rate": 4.388877814148296e-05,
"loss": 0.0328,
"step": 10680
},
{
"epoch": 10.24928092042186,
"grad_norm": 0.25854796171188354,
"learning_rate": 4.380241518328231e-05,
"loss": 0.0338,
"step": 10690
},
{
"epoch": 10.258868648130393,
"grad_norm": 0.282626748085022,
"learning_rate": 4.371607099955236e-05,
"loss": 0.0398,
"step": 10700
},
{
"epoch": 10.268456375838927,
"grad_norm": 0.2568127512931824,
"learning_rate": 4.362974585185734e-05,
"loss": 0.0354,
"step": 10710
},
{
"epoch": 10.27804410354746,
"grad_norm": 0.28798142075538635,
"learning_rate": 4.3543440001703786e-05,
"loss": 0.0354,
"step": 10720
},
{
"epoch": 10.287631831255993,
"grad_norm": 0.28471261262893677,
"learning_rate": 4.345715371053976e-05,
"loss": 0.0365,
"step": 10730
},
{
"epoch": 10.297219558964525,
"grad_norm": 0.27555039525032043,
"learning_rate": 4.3370887239754085e-05,
"loss": 0.0324,
"step": 10740
},
{
"epoch": 10.306807286673058,
"grad_norm": 0.34258362650871277,
"learning_rate": 4.328464085067559e-05,
"loss": 0.0313,
"step": 10750
},
{
"epoch": 10.316395014381591,
"grad_norm": 0.2875727117061615,
"learning_rate": 4.319841480457221e-05,
"loss": 0.034,
"step": 10760
},
{
"epoch": 10.325982742090124,
"grad_norm": 0.37291842699050903,
"learning_rate": 4.311220936265025e-05,
"loss": 0.0358,
"step": 10770
},
{
"epoch": 10.335570469798657,
"grad_norm": 0.28330934047698975,
"learning_rate": 4.302602478605364e-05,
"loss": 0.0371,
"step": 10780
},
{
"epoch": 10.34515819750719,
"grad_norm": 0.2582619786262512,
"learning_rate": 4.29398613358631e-05,
"loss": 0.0373,
"step": 10790
},
{
"epoch": 10.354745925215724,
"grad_norm": 0.4369192123413086,
"learning_rate": 4.2853719273095306e-05,
"loss": 0.035,
"step": 10800
},
{
"epoch": 10.364333652924257,
"grad_norm": 0.7189898490905762,
"learning_rate": 4.276759885870221e-05,
"loss": 0.0306,
"step": 10810
},
{
"epoch": 10.37392138063279,
"grad_norm": 0.25174766778945923,
"learning_rate": 4.26815003535701e-05,
"loss": 0.0409,
"step": 10820
},
{
"epoch": 10.383509108341324,
"grad_norm": 0.251800537109375,
"learning_rate": 4.2595424018518994e-05,
"loss": 0.0338,
"step": 10830
},
{
"epoch": 10.393096836049857,
"grad_norm": 0.2858979105949402,
"learning_rate": 4.250937011430167e-05,
"loss": 0.041,
"step": 10840
},
{
"epoch": 10.40268456375839,
"grad_norm": 0.1836014688014984,
"learning_rate": 4.2423338901602985e-05,
"loss": 0.0356,
"step": 10850
},
{
"epoch": 10.412272291466923,
"grad_norm": 0.279307097196579,
"learning_rate": 4.233733064103906e-05,
"loss": 0.0359,
"step": 10860
},
{
"epoch": 10.421860019175455,
"grad_norm": 0.32045918703079224,
"learning_rate": 4.225134559315647e-05,
"loss": 0.0377,
"step": 10870
},
{
"epoch": 10.431447746883988,
"grad_norm": 0.2521663010120392,
"learning_rate": 4.2165384018431495e-05,
"loss": 0.0301,
"step": 10880
},
{
"epoch": 10.441035474592521,
"grad_norm": 0.7854000329971313,
"learning_rate": 4.207944617726931e-05,
"loss": 0.0337,
"step": 10890
},
{
"epoch": 10.450623202301054,
"grad_norm": 0.2677070200443268,
"learning_rate": 4.1993532330003146e-05,
"loss": 0.0392,
"step": 10900
},
{
"epoch": 10.460210930009588,
"grad_norm": 0.4461430609226227,
"learning_rate": 4.190764273689359e-05,
"loss": 0.0306,
"step": 10910
},
{
"epoch": 10.46979865771812,
"grad_norm": 0.30843472480773926,
"learning_rate": 4.1821777658127765e-05,
"loss": 0.0259,
"step": 10920
},
{
"epoch": 10.479386385426654,
"grad_norm": 0.5075517296791077,
"learning_rate": 4.17359373538185e-05,
"loss": 0.0376,
"step": 10930
},
{
"epoch": 10.488974113135187,
"grad_norm": 0.3522166609764099,
"learning_rate": 4.16501220840036e-05,
"loss": 0.0272,
"step": 10940
},
{
"epoch": 10.49856184084372,
"grad_norm": 0.3115832805633545,
"learning_rate": 4.156433210864499e-05,
"loss": 0.0421,
"step": 10950
},
{
"epoch": 10.508149568552254,
"grad_norm": 0.29928937554359436,
"learning_rate": 4.147856768762804e-05,
"loss": 0.0329,
"step": 10960
},
{
"epoch": 10.517737296260787,
"grad_norm": 0.2621513903141022,
"learning_rate": 4.139282908076064e-05,
"loss": 0.0313,
"step": 10970
},
{
"epoch": 10.527325023969318,
"grad_norm": 0.31416305899620056,
"learning_rate": 4.130711654777254e-05,
"loss": 0.0311,
"step": 10980
},
{
"epoch": 10.536912751677852,
"grad_norm": 0.23825299739837646,
"learning_rate": 4.1221430348314415e-05,
"loss": 0.0386,
"step": 10990
},
{
"epoch": 10.546500479386385,
"grad_norm": 0.2471434473991394,
"learning_rate": 4.11357707419573e-05,
"loss": 0.038,
"step": 11000
},
{
"epoch": 10.556088207094918,
"grad_norm": 0.2707345187664032,
"learning_rate": 4.105013798819155e-05,
"loss": 0.0356,
"step": 11010
},
{
"epoch": 10.565675934803451,
"grad_norm": 0.3994966149330139,
"learning_rate": 4.0964532346426235e-05,
"loss": 0.0326,
"step": 11020
},
{
"epoch": 10.575263662511984,
"grad_norm": 0.5146787762641907,
"learning_rate": 4.087895407598824e-05,
"loss": 0.0361,
"step": 11030
},
{
"epoch": 10.584851390220518,
"grad_norm": 0.2920519709587097,
"learning_rate": 4.079340343612165e-05,
"loss": 0.0326,
"step": 11040
},
{
"epoch": 10.594439117929051,
"grad_norm": 0.27901026606559753,
"learning_rate": 4.070788068598672e-05,
"loss": 0.037,
"step": 11050
},
{
"epoch": 10.604026845637584,
"grad_norm": 0.26402774453163147,
"learning_rate": 4.062238608465927e-05,
"loss": 0.0337,
"step": 11060
},
{
"epoch": 10.613614573346117,
"grad_norm": 0.24872805178165436,
"learning_rate": 4.053691989112986e-05,
"loss": 0.0343,
"step": 11070
},
{
"epoch": 10.62320230105465,
"grad_norm": 0.21889743208885193,
"learning_rate": 4.0451482364303e-05,
"loss": 0.0329,
"step": 11080
},
{
"epoch": 10.632790028763184,
"grad_norm": 0.31977149844169617,
"learning_rate": 4.03660737629963e-05,
"loss": 0.0395,
"step": 11090
},
{
"epoch": 10.642377756471717,
"grad_norm": 0.3449043929576874,
"learning_rate": 4.028069434593982e-05,
"loss": 0.0362,
"step": 11100
},
{
"epoch": 10.651965484180248,
"grad_norm": 0.356534481048584,
"learning_rate": 4.019534437177516e-05,
"loss": 0.0453,
"step": 11110
},
{
"epoch": 10.661553211888782,
"grad_norm": 0.3510785400867462,
"learning_rate": 4.0110024099054756e-05,
"loss": 0.03,
"step": 11120
},
{
"epoch": 10.671140939597315,
"grad_norm": 0.4049818813800812,
"learning_rate": 4.002473378624107e-05,
"loss": 0.0337,
"step": 11130
},
{
"epoch": 10.680728667305848,
"grad_norm": 0.2889692485332489,
"learning_rate": 3.9939473691705765e-05,
"loss": 0.0369,
"step": 11140
},
{
"epoch": 10.690316395014381,
"grad_norm": 0.25454413890838623,
"learning_rate": 3.9854244073728996e-05,
"loss": 0.0373,
"step": 11150
},
{
"epoch": 10.699904122722915,
"grad_norm": 0.28601503372192383,
"learning_rate": 3.976904519049862e-05,
"loss": 0.0384,
"step": 11160
},
{
"epoch": 10.709491850431448,
"grad_norm": 0.22738857567310333,
"learning_rate": 3.968387730010935e-05,
"loss": 0.0352,
"step": 11170
},
{
"epoch": 10.719079578139981,
"grad_norm": 0.2723415493965149,
"learning_rate": 3.9598740660562005e-05,
"loss": 0.0372,
"step": 11180
},
{
"epoch": 10.728667305848514,
"grad_norm": 0.35877975821495056,
"learning_rate": 3.951363552976275e-05,
"loss": 0.0321,
"step": 11190
},
{
"epoch": 10.738255033557047,
"grad_norm": 0.2732999324798584,
"learning_rate": 3.942856216552234e-05,
"loss": 0.0423,
"step": 11200
},
{
"epoch": 10.74784276126558,
"grad_norm": 0.1939064860343933,
"learning_rate": 3.934352082555522e-05,
"loss": 0.0383,
"step": 11210
},
{
"epoch": 10.757430488974114,
"grad_norm": 0.34008413553237915,
"learning_rate": 3.92585117674789e-05,
"loss": 0.0374,
"step": 11220
},
{
"epoch": 10.767018216682647,
"grad_norm": 0.32701992988586426,
"learning_rate": 3.917353524881302e-05,
"loss": 0.0336,
"step": 11230
},
{
"epoch": 10.776605944391179,
"grad_norm": 0.29676583409309387,
"learning_rate": 3.908859152697872e-05,
"loss": 0.0358,
"step": 11240
},
{
"epoch": 10.786193672099712,
"grad_norm": 0.21634122729301453,
"learning_rate": 3.900368085929775e-05,
"loss": 0.0357,
"step": 11250
},
{
"epoch": 10.795781399808245,
"grad_norm": 0.29007887840270996,
"learning_rate": 3.8918803502991744e-05,
"loss": 0.0396,
"step": 11260
},
{
"epoch": 10.805369127516778,
"grad_norm": 0.2906304895877838,
"learning_rate": 3.883395971518138e-05,
"loss": 0.0293,
"step": 11270
},
{
"epoch": 10.814956855225311,
"grad_norm": 0.19408248364925385,
"learning_rate": 3.874914975288575e-05,
"loss": 0.0338,
"step": 11280
},
{
"epoch": 10.824544582933845,
"grad_norm": 0.9713996052742004,
"learning_rate": 3.8664373873021356e-05,
"loss": 0.0367,
"step": 11290
},
{
"epoch": 10.834132310642378,
"grad_norm": 0.43305110931396484,
"learning_rate": 3.857963233240153e-05,
"loss": 0.0409,
"step": 11300
},
{
"epoch": 10.843720038350911,
"grad_norm": 0.4623974859714508,
"learning_rate": 3.849492538773552e-05,
"loss": 0.0322,
"step": 11310
},
{
"epoch": 10.853307766059444,
"grad_norm": 0.13911698758602142,
"learning_rate": 3.841025329562789e-05,
"loss": 0.0371,
"step": 11320
},
{
"epoch": 10.862895493767978,
"grad_norm": 0.40783533453941345,
"learning_rate": 3.832561631257748e-05,
"loss": 0.0334,
"step": 11330
},
{
"epoch": 10.87248322147651,
"grad_norm": 0.2820438742637634,
"learning_rate": 3.824101469497685e-05,
"loss": 0.0357,
"step": 11340
},
{
"epoch": 10.882070949185042,
"grad_norm": 0.2518521547317505,
"learning_rate": 3.8156448699111414e-05,
"loss": 0.0398,
"step": 11350
},
{
"epoch": 10.891658676893575,
"grad_norm": 0.22868366539478302,
"learning_rate": 3.80719185811587e-05,
"loss": 0.0329,
"step": 11360
},
{
"epoch": 10.901246404602109,
"grad_norm": 0.28649628162384033,
"learning_rate": 3.79874245971875e-05,
"loss": 0.0362,
"step": 11370
},
{
"epoch": 10.910834132310642,
"grad_norm": 0.2933325171470642,
"learning_rate": 3.790296700315717e-05,
"loss": 0.0322,
"step": 11380
},
{
"epoch": 10.920421860019175,
"grad_norm": 0.34184950590133667,
"learning_rate": 3.781854605491684e-05,
"loss": 0.034,
"step": 11390
},
{
"epoch": 10.930009587727708,
"grad_norm": 0.26722094416618347,
"learning_rate": 3.773416200820463e-05,
"loss": 0.0369,
"step": 11400
},
{
"epoch": 10.939597315436242,
"grad_norm": 0.22674645483493805,
"learning_rate": 3.764981511864686e-05,
"loss": 0.0349,
"step": 11410
},
{
"epoch": 10.949185043144775,
"grad_norm": 0.6623883843421936,
"learning_rate": 3.756550564175727e-05,
"loss": 0.0331,
"step": 11420
},
{
"epoch": 10.958772770853308,
"grad_norm": 0.3025140166282654,
"learning_rate": 3.748123383293629e-05,
"loss": 0.0364,
"step": 11430
},
{
"epoch": 10.968360498561841,
"grad_norm": 0.2423921674489975,
"learning_rate": 3.739699994747026e-05,
"loss": 0.0305,
"step": 11440
},
{
"epoch": 10.977948226270374,
"grad_norm": 0.2216835469007492,
"learning_rate": 3.731280424053061e-05,
"loss": 0.0338,
"step": 11450
},
{
"epoch": 10.987535953978908,
"grad_norm": 0.4063700735569,
"learning_rate": 3.7228646967173096e-05,
"loss": 0.0437,
"step": 11460
},
{
"epoch": 10.997123681687441,
"grad_norm": 0.21180011332035065,
"learning_rate": 3.7144528382337086e-05,
"loss": 0.0362,
"step": 11470
},
{
"epoch": 11.006711409395972,
"grad_norm": 0.22706526517868042,
"learning_rate": 3.706044874084474e-05,
"loss": 0.0343,
"step": 11480
},
{
"epoch": 11.016299137104506,
"grad_norm": 0.3348940908908844,
"learning_rate": 3.6976408297400257e-05,
"loss": 0.0344,
"step": 11490
},
{
"epoch": 11.025886864813039,
"grad_norm": 0.21291491389274597,
"learning_rate": 3.6892407306589035e-05,
"loss": 0.0329,
"step": 11500
},
{
"epoch": 11.035474592521572,
"grad_norm": 0.3505829870700836,
"learning_rate": 3.6808446022877e-05,
"loss": 0.0339,
"step": 11510
},
{
"epoch": 11.045062320230105,
"grad_norm": 0.36319780349731445,
"learning_rate": 3.672452470060982e-05,
"loss": 0.0338,
"step": 11520
},
{
"epoch": 11.054650047938638,
"grad_norm": 0.3714457154273987,
"learning_rate": 3.6640643594012057e-05,
"loss": 0.0419,
"step": 11530
},
{
"epoch": 11.064237775647172,
"grad_norm": 0.27974534034729004,
"learning_rate": 3.6556802957186486e-05,
"loss": 0.0359,
"step": 11540
},
{
"epoch": 11.073825503355705,
"grad_norm": 0.34719452261924744,
"learning_rate": 3.647300304411323e-05,
"loss": 0.0367,
"step": 11550
},
{
"epoch": 11.083413231064238,
"grad_norm": 0.24294276535511017,
"learning_rate": 3.6389244108649114e-05,
"loss": 0.0316,
"step": 11560
},
{
"epoch": 11.093000958772771,
"grad_norm": 0.3280002474784851,
"learning_rate": 3.6305526404526785e-05,
"loss": 0.0315,
"step": 11570
},
{
"epoch": 11.102588686481305,
"grad_norm": 0.25797387957572937,
"learning_rate": 3.6221850185354014e-05,
"loss": 0.0306,
"step": 11580
},
{
"epoch": 11.112176414189838,
"grad_norm": 0.2705564498901367,
"learning_rate": 3.613821570461284e-05,
"loss": 0.0333,
"step": 11590
},
{
"epoch": 11.12176414189837,
"grad_norm": 0.2857078015804291,
"learning_rate": 3.605462321565899e-05,
"loss": 0.0329,
"step": 11600
},
{
"epoch": 11.131351869606902,
"grad_norm": 0.23920407891273499,
"learning_rate": 3.597107297172084e-05,
"loss": 0.0366,
"step": 11610
},
{
"epoch": 11.140939597315436,
"grad_norm": 0.31336209177970886,
"learning_rate": 3.588756522589888e-05,
"loss": 0.03,
"step": 11620
},
{
"epoch": 11.150527325023969,
"grad_norm": 0.2026471495628357,
"learning_rate": 3.5804100231164824e-05,
"loss": 0.0328,
"step": 11630
},
{
"epoch": 11.160115052732502,
"grad_norm": 0.166408970952034,
"learning_rate": 3.572067824036092e-05,
"loss": 0.0357,
"step": 11640
},
{
"epoch": 11.169702780441035,
"grad_norm": 0.24978677928447723,
"learning_rate": 3.5637299506199075e-05,
"loss": 0.0289,
"step": 11650
},
{
"epoch": 11.179290508149569,
"grad_norm": 0.36853691935539246,
"learning_rate": 3.5553964281260225e-05,
"loss": 0.036,
"step": 11660
},
{
"epoch": 11.188878235858102,
"grad_norm": 0.31218189001083374,
"learning_rate": 3.547067281799345e-05,
"loss": 0.0327,
"step": 11670
},
{
"epoch": 11.198465963566635,
"grad_norm": 0.2616768777370453,
"learning_rate": 3.538742536871531e-05,
"loss": 0.0378,
"step": 11680
},
{
"epoch": 11.208053691275168,
"grad_norm": 0.3586946725845337,
"learning_rate": 3.530422218560903e-05,
"loss": 0.0378,
"step": 11690
},
{
"epoch": 11.217641418983701,
"grad_norm": 0.1958369016647339,
"learning_rate": 3.522106352072366e-05,
"loss": 0.0368,
"step": 11700
},
{
"epoch": 11.227229146692235,
"grad_norm": 0.30349719524383545,
"learning_rate": 3.5137949625973484e-05,
"loss": 0.0396,
"step": 11710
},
{
"epoch": 11.236816874400766,
"grad_norm": 0.22439143061637878,
"learning_rate": 3.505488075313712e-05,
"loss": 0.0275,
"step": 11720
},
{
"epoch": 11.2464046021093,
"grad_norm": 0.3639642596244812,
"learning_rate": 3.4971857153856825e-05,
"loss": 0.03,
"step": 11730
},
{
"epoch": 11.255992329817833,
"grad_norm": 0.19874945282936096,
"learning_rate": 3.488887907963766e-05,
"loss": 0.0341,
"step": 11740
},
{
"epoch": 11.265580057526366,
"grad_norm": 0.6180244088172913,
"learning_rate": 3.480594678184681e-05,
"loss": 0.0346,
"step": 11750
},
{
"epoch": 11.275167785234899,
"grad_norm": 0.27457571029663086,
"learning_rate": 3.472306051171281e-05,
"loss": 0.0359,
"step": 11760
},
{
"epoch": 11.284755512943432,
"grad_norm": 0.18931525945663452,
"learning_rate": 3.464022052032473e-05,
"loss": 0.0311,
"step": 11770
},
{
"epoch": 11.294343240651965,
"grad_norm": 0.2550256848335266,
"learning_rate": 3.455742705863143e-05,
"loss": 0.0346,
"step": 11780
},
{
"epoch": 11.303930968360499,
"grad_norm": 0.21088473498821259,
"learning_rate": 3.447468037744084e-05,
"loss": 0.0295,
"step": 11790
},
{
"epoch": 11.313518696069032,
"grad_norm": 0.25027552247047424,
"learning_rate": 3.439198072741921e-05,
"loss": 0.0375,
"step": 11800
},
{
"epoch": 11.323106423777565,
"grad_norm": 0.5064207315444946,
"learning_rate": 3.4309328359090264e-05,
"loss": 0.0332,
"step": 11810
},
{
"epoch": 11.332694151486098,
"grad_norm": 0.2110755443572998,
"learning_rate": 3.422672352283453e-05,
"loss": 0.0351,
"step": 11820
},
{
"epoch": 11.342281879194632,
"grad_norm": 0.27771392464637756,
"learning_rate": 3.41441664688885e-05,
"loss": 0.0383,
"step": 11830
},
{
"epoch": 11.351869606903165,
"grad_norm": 0.34242868423461914,
"learning_rate": 3.406165744734397e-05,
"loss": 0.0298,
"step": 11840
},
{
"epoch": 11.361457334611696,
"grad_norm": 0.3390040099620819,
"learning_rate": 3.397919670814723e-05,
"loss": 0.0377,
"step": 11850
},
{
"epoch": 11.37104506232023,
"grad_norm": 0.15492115914821625,
"learning_rate": 3.389678450109827e-05,
"loss": 0.0403,
"step": 11860
},
{
"epoch": 11.380632790028763,
"grad_norm": 0.3101263642311096,
"learning_rate": 3.3814421075850035e-05,
"loss": 0.0362,
"step": 11870
},
{
"epoch": 11.390220517737296,
"grad_norm": 0.2800522446632385,
"learning_rate": 3.3732106681907816e-05,
"loss": 0.032,
"step": 11880
},
{
"epoch": 11.39980824544583,
"grad_norm": 0.26244333386421204,
"learning_rate": 3.364984156862825e-05,
"loss": 0.0307,
"step": 11890
},
{
"epoch": 11.409395973154362,
"grad_norm": 0.48606979846954346,
"learning_rate": 3.356762598521874e-05,
"loss": 0.0335,
"step": 11900
},
{
"epoch": 11.418983700862896,
"grad_norm": 0.5852661728858948,
"learning_rate": 3.348546018073662e-05,
"loss": 0.0433,
"step": 11910
},
{
"epoch": 11.428571428571429,
"grad_norm": 0.252837598323822,
"learning_rate": 3.340334440408846e-05,
"loss": 0.0257,
"step": 11920
},
{
"epoch": 11.438159156279962,
"grad_norm": 0.2573808431625366,
"learning_rate": 3.332127890402926e-05,
"loss": 0.0331,
"step": 11930
},
{
"epoch": 11.447746883988495,
"grad_norm": 0.25154879689216614,
"learning_rate": 3.3239263929161734e-05,
"loss": 0.0389,
"step": 11940
},
{
"epoch": 11.457334611697028,
"grad_norm": 0.2564004957675934,
"learning_rate": 3.315729972793553e-05,
"loss": 0.0386,
"step": 11950
},
{
"epoch": 11.466922339405562,
"grad_norm": 0.45886269211769104,
"learning_rate": 3.307538654864645e-05,
"loss": 0.0365,
"step": 11960
},
{
"epoch": 11.476510067114093,
"grad_norm": 0.157767191529274,
"learning_rate": 3.29935246394358e-05,
"loss": 0.0356,
"step": 11970
},
{
"epoch": 11.486097794822626,
"grad_norm": 0.3403734564781189,
"learning_rate": 3.2911714248289525e-05,
"loss": 0.0335,
"step": 11980
},
{
"epoch": 11.49568552253116,
"grad_norm": 0.207637757062912,
"learning_rate": 3.282995562303754e-05,
"loss": 0.0291,
"step": 11990
},
{
"epoch": 11.505273250239693,
"grad_norm": 0.2571353614330292,
"learning_rate": 3.2748249011352864e-05,
"loss": 0.031,
"step": 12000
},
{
"epoch": 11.514860977948226,
"grad_norm": 0.29838424921035767,
"learning_rate": 3.266659466075108e-05,
"loss": 0.0312,
"step": 12010
},
{
"epoch": 11.52444870565676,
"grad_norm": 0.35853999853134155,
"learning_rate": 3.258499281858936e-05,
"loss": 0.0349,
"step": 12020
},
{
"epoch": 11.534036433365292,
"grad_norm": 0.22435715794563293,
"learning_rate": 3.250344373206584e-05,
"loss": 0.0321,
"step": 12030
},
{
"epoch": 11.543624161073826,
"grad_norm": 0.364653617143631,
"learning_rate": 3.242194764821881e-05,
"loss": 0.0291,
"step": 12040
},
{
"epoch": 11.553211888782359,
"grad_norm": 0.20518454909324646,
"learning_rate": 3.2340504813926086e-05,
"loss": 0.0335,
"step": 12050
},
{
"epoch": 11.562799616490892,
"grad_norm": 0.3099921941757202,
"learning_rate": 3.2259115475904064e-05,
"loss": 0.036,
"step": 12060
},
{
"epoch": 11.572387344199425,
"grad_norm": 0.40152508020401,
"learning_rate": 3.217777988070715e-05,
"loss": 0.0377,
"step": 12070
},
{
"epoch": 11.581975071907959,
"grad_norm": 0.2941493093967438,
"learning_rate": 3.2096498274726925e-05,
"loss": 0.0304,
"step": 12080
},
{
"epoch": 11.59156279961649,
"grad_norm": 0.1939501017332077,
"learning_rate": 3.201527090419144e-05,
"loss": 0.0309,
"step": 12090
},
{
"epoch": 11.601150527325023,
"grad_norm": 0.28782132267951965,
"learning_rate": 3.193409801516443e-05,
"loss": 0.0368,
"step": 12100
},
{
"epoch": 11.610738255033556,
"grad_norm": 0.22255367040634155,
"learning_rate": 3.1852979853544575e-05,
"loss": 0.034,
"step": 12110
},
{
"epoch": 11.62032598274209,
"grad_norm": 0.24580125510692596,
"learning_rate": 3.177191666506479e-05,
"loss": 0.0316,
"step": 12120
},
{
"epoch": 11.629913710450623,
"grad_norm": 0.16919176280498505,
"learning_rate": 3.169090869529146e-05,
"loss": 0.032,
"step": 12130
},
{
"epoch": 11.639501438159156,
"grad_norm": 0.16586647927761078,
"learning_rate": 3.1609956189623704e-05,
"loss": 0.0318,
"step": 12140
},
{
"epoch": 11.64908916586769,
"grad_norm": 0.25521326065063477,
"learning_rate": 3.1529059393292573e-05,
"loss": 0.0339,
"step": 12150
},
{
"epoch": 11.658676893576223,
"grad_norm": 0.40948987007141113,
"learning_rate": 3.1448218551360394e-05,
"loss": 0.0417,
"step": 12160
},
{
"epoch": 11.668264621284756,
"grad_norm": 0.2603534460067749,
"learning_rate": 3.136743390872001e-05,
"loss": 0.0332,
"step": 12170
},
{
"epoch": 11.677852348993289,
"grad_norm": 0.24372372031211853,
"learning_rate": 3.128670571009399e-05,
"loss": 0.0325,
"step": 12180
},
{
"epoch": 11.687440076701822,
"grad_norm": 0.18494637310504913,
"learning_rate": 3.1206034200033904e-05,
"loss": 0.0324,
"step": 12190
},
{
"epoch": 11.697027804410356,
"grad_norm": 0.3946174681186676,
"learning_rate": 3.1125419622919614e-05,
"loss": 0.0327,
"step": 12200
},
{
"epoch": 11.706615532118889,
"grad_norm": 0.5735461115837097,
"learning_rate": 3.104486222295853e-05,
"loss": 0.0294,
"step": 12210
},
{
"epoch": 11.71620325982742,
"grad_norm": 0.25579607486724854,
"learning_rate": 3.096436224418482e-05,
"loss": 0.0347,
"step": 12220
},
{
"epoch": 11.725790987535953,
"grad_norm": 0.40547341108322144,
"learning_rate": 3.088391993045873e-05,
"loss": 0.037,
"step": 12230
},
{
"epoch": 11.735378715244487,
"grad_norm": 0.3765973746776581,
"learning_rate": 3.080353552546578e-05,
"loss": 0.0307,
"step": 12240
},
{
"epoch": 11.74496644295302,
"grad_norm": 0.40163904428482056,
"learning_rate": 3.0723209272716124e-05,
"loss": 0.0295,
"step": 12250
},
{
"epoch": 11.754554170661553,
"grad_norm": 0.3667445182800293,
"learning_rate": 3.064294141554372e-05,
"loss": 0.0328,
"step": 12260
},
{
"epoch": 11.764141898370086,
"grad_norm": 0.22410856187343597,
"learning_rate": 3.056273219710565e-05,
"loss": 0.0355,
"step": 12270
},
{
"epoch": 11.77372962607862,
"grad_norm": 0.278154581785202,
"learning_rate": 3.048258186038129e-05,
"loss": 0.038,
"step": 12280
},
{
"epoch": 11.783317353787153,
"grad_norm": 0.4203621745109558,
"learning_rate": 3.040249064817176e-05,
"loss": 0.0338,
"step": 12290
},
{
"epoch": 11.792905081495686,
"grad_norm": 0.29441940784454346,
"learning_rate": 3.0322458803098973e-05,
"loss": 0.027,
"step": 12300
},
{
"epoch": 11.80249280920422,
"grad_norm": 0.2775827646255493,
"learning_rate": 3.0242486567605068e-05,
"loss": 0.031,
"step": 12310
},
{
"epoch": 11.812080536912752,
"grad_norm": 0.38520553708076477,
"learning_rate": 3.016257418395152e-05,
"loss": 0.0333,
"step": 12320
},
{
"epoch": 11.821668264621284,
"grad_norm": 0.26599544286727905,
"learning_rate": 3.008272189421861e-05,
"loss": 0.0301,
"step": 12330
},
{
"epoch": 11.831255992329817,
"grad_norm": 0.22733962535858154,
"learning_rate": 3.0002929940304498e-05,
"loss": 0.0298,
"step": 12340
},
{
"epoch": 11.84084372003835,
"grad_norm": 0.27661770582199097,
"learning_rate": 2.992319856392457e-05,
"loss": 0.0342,
"step": 12350
},
{
"epoch": 11.850431447746884,
"grad_norm": 0.26731380820274353,
"learning_rate": 2.9843528006610733e-05,
"loss": 0.0295,
"step": 12360
},
{
"epoch": 11.860019175455417,
"grad_norm": 0.3973303437232971,
"learning_rate": 2.976391850971065e-05,
"loss": 0.0301,
"step": 12370
},
{
"epoch": 11.86960690316395,
"grad_norm": 0.3120301067829132,
"learning_rate": 2.968437031438698e-05,
"loss": 0.0348,
"step": 12380
},
{
"epoch": 11.879194630872483,
"grad_norm": 0.2932593524456024,
"learning_rate": 2.9604883661616702e-05,
"loss": 0.0308,
"step": 12390
},
{
"epoch": 11.888782358581016,
"grad_norm": 0.2067721039056778,
"learning_rate": 2.9525458792190365e-05,
"loss": 0.0323,
"step": 12400
},
{
"epoch": 11.89837008628955,
"grad_norm": 0.30877119302749634,
"learning_rate": 2.9446095946711367e-05,
"loss": 0.0336,
"step": 12410
},
{
"epoch": 11.907957813998083,
"grad_norm": 0.1372332125902176,
"learning_rate": 2.93667953655952e-05,
"loss": 0.0341,
"step": 12420
},
{
"epoch": 11.917545541706616,
"grad_norm": 0.2722005844116211,
"learning_rate": 2.9287557289068736e-05,
"loss": 0.0347,
"step": 12430
},
{
"epoch": 11.92713326941515,
"grad_norm": 0.35675281286239624,
"learning_rate": 2.9208381957169485e-05,
"loss": 0.0354,
"step": 12440
},
{
"epoch": 11.936720997123683,
"grad_norm": 0.4129658639431,
"learning_rate": 2.9129269609744935e-05,
"loss": 0.0235,
"step": 12450
},
{
"epoch": 11.946308724832214,
"grad_norm": 0.23059901595115662,
"learning_rate": 2.905022048645172e-05,
"loss": 0.0361,
"step": 12460
},
{
"epoch": 11.955896452540747,
"grad_norm": 0.20640157163143158,
"learning_rate": 2.8971234826754983e-05,
"loss": 0.0306,
"step": 12470
},
{
"epoch": 11.96548418024928,
"grad_norm": 0.27325066924095154,
"learning_rate": 2.8892312869927578e-05,
"loss": 0.033,
"step": 12480
},
{
"epoch": 11.975071907957814,
"grad_norm": 0.2237732708454132,
"learning_rate": 2.881345485504945e-05,
"loss": 0.0309,
"step": 12490
},
{
"epoch": 11.984659635666347,
"grad_norm": 0.2271834760904312,
"learning_rate": 2.8734661021006747e-05,
"loss": 0.0267,
"step": 12500
},
{
"epoch": 11.99424736337488,
"grad_norm": 0.27549734711647034,
"learning_rate": 2.8655931606491294e-05,
"loss": 0.0338,
"step": 12510
},
{
"epoch": 12.003835091083413,
"grad_norm": 0.19603657722473145,
"learning_rate": 2.8577266849999672e-05,
"loss": 0.0303,
"step": 12520
},
{
"epoch": 12.013422818791947,
"grad_norm": 0.1858394742012024,
"learning_rate": 2.849866698983267e-05,
"loss": 0.0255,
"step": 12530
},
{
"epoch": 12.02301054650048,
"grad_norm": 0.17287525534629822,
"learning_rate": 2.8420132264094468e-05,
"loss": 0.0297,
"step": 12540
},
{
"epoch": 12.032598274209013,
"grad_norm": 0.32775846123695374,
"learning_rate": 2.83416629106919e-05,
"loss": 0.0345,
"step": 12550
},
{
"epoch": 12.042186001917546,
"grad_norm": 0.17536644637584686,
"learning_rate": 2.8263259167333777e-05,
"loss": 0.0286,
"step": 12560
},
{
"epoch": 12.05177372962608,
"grad_norm": 0.18874387443065643,
"learning_rate": 2.818492127153018e-05,
"loss": 0.0293,
"step": 12570
},
{
"epoch": 12.06136145733461,
"grad_norm": 0.1686885803937912,
"learning_rate": 2.8106649460591716e-05,
"loss": 0.0302,
"step": 12580
},
{
"epoch": 12.070949185043144,
"grad_norm": 0.14021116495132446,
"learning_rate": 2.802844397162877e-05,
"loss": 0.0321,
"step": 12590
},
{
"epoch": 12.080536912751677,
"grad_norm": 0.32412388920783997,
"learning_rate": 2.7950305041550818e-05,
"loss": 0.0337,
"step": 12600
},
{
"epoch": 12.09012464046021,
"grad_norm": 0.2775496244430542,
"learning_rate": 2.7872232907065738e-05,
"loss": 0.0348,
"step": 12610
},
{
"epoch": 12.099712368168744,
"grad_norm": 0.20718041062355042,
"learning_rate": 2.7794227804679063e-05,
"loss": 0.0318,
"step": 12620
},
{
"epoch": 12.109300095877277,
"grad_norm": 0.14198093116283417,
"learning_rate": 2.7716289970693236e-05,
"loss": 0.0285,
"step": 12630
},
{
"epoch": 12.11888782358581,
"grad_norm": 0.23473426699638367,
"learning_rate": 2.7638419641206914e-05,
"loss": 0.0311,
"step": 12640
},
{
"epoch": 12.128475551294343,
"grad_norm": 0.22687584161758423,
"learning_rate": 2.7560617052114297e-05,
"loss": 0.0265,
"step": 12650
},
{
"epoch": 12.138063279002877,
"grad_norm": 0.22875012457370758,
"learning_rate": 2.7482882439104385e-05,
"loss": 0.0324,
"step": 12660
},
{
"epoch": 12.14765100671141,
"grad_norm": 0.2869175970554352,
"learning_rate": 2.740521603766022e-05,
"loss": 0.0343,
"step": 12670
},
{
"epoch": 12.157238734419943,
"grad_norm": 0.24454490840435028,
"learning_rate": 2.7327618083058192e-05,
"loss": 0.0354,
"step": 12680
},
{
"epoch": 12.166826462128476,
"grad_norm": 0.26888319849967957,
"learning_rate": 2.7250088810367404e-05,
"loss": 0.0317,
"step": 12690
},
{
"epoch": 12.176414189837008,
"grad_norm": 0.2190038412809372,
"learning_rate": 2.7172628454448888e-05,
"loss": 0.0394,
"step": 12700
},
{
"epoch": 12.186001917545541,
"grad_norm": 0.1673816591501236,
"learning_rate": 2.7095237249954875e-05,
"loss": 0.0272,
"step": 12710
},
{
"epoch": 12.195589645254074,
"grad_norm": 0.32721394300460815,
"learning_rate": 2.7017915431328078e-05,
"loss": 0.0341,
"step": 12720
},
{
"epoch": 12.205177372962607,
"grad_norm": 0.2936406135559082,
"learning_rate": 2.6940663232801144e-05,
"loss": 0.0294,
"step": 12730
},
{
"epoch": 12.21476510067114,
"grad_norm": 3.8611295223236084,
"learning_rate": 2.6863480888395714e-05,
"loss": 0.0293,
"step": 12740
},
{
"epoch": 12.224352828379674,
"grad_norm": 0.16587217152118683,
"learning_rate": 2.6786368631921836e-05,
"loss": 0.03,
"step": 12750
},
{
"epoch": 12.233940556088207,
"grad_norm": 0.5451092720031738,
"learning_rate": 2.6709326696977215e-05,
"loss": 0.0325,
"step": 12760
},
{
"epoch": 12.24352828379674,
"grad_norm": 0.20002365112304688,
"learning_rate": 2.6632355316946643e-05,
"loss": 0.0255,
"step": 12770
},
{
"epoch": 12.253116011505274,
"grad_norm": 0.8898112773895264,
"learning_rate": 2.655545472500105e-05,
"loss": 0.0348,
"step": 12780
},
{
"epoch": 12.262703739213807,
"grad_norm": 0.3279706835746765,
"learning_rate": 2.647862515409697e-05,
"loss": 0.0259,
"step": 12790
},
{
"epoch": 12.27229146692234,
"grad_norm": 0.2899661958217621,
"learning_rate": 2.6401866836975795e-05,
"loss": 0.0375,
"step": 12800
},
{
"epoch": 12.281879194630873,
"grad_norm": 0.2332329899072647,
"learning_rate": 2.632518000616312e-05,
"loss": 0.0319,
"step": 12810
},
{
"epoch": 12.291466922339406,
"grad_norm": 0.23844292759895325,
"learning_rate": 2.6248564893967886e-05,
"loss": 0.0344,
"step": 12820
},
{
"epoch": 12.301054650047938,
"grad_norm": 0.20757047832012177,
"learning_rate": 2.617202173248181e-05,
"loss": 0.0365,
"step": 12830
},
{
"epoch": 12.310642377756471,
"grad_norm": 0.23326794803142548,
"learning_rate": 2.609555075357869e-05,
"loss": 0.0385,
"step": 12840
},
{
"epoch": 12.320230105465004,
"grad_norm": 0.20900526642799377,
"learning_rate": 2.6019152188913638e-05,
"loss": 0.0333,
"step": 12850
},
{
"epoch": 12.329817833173538,
"grad_norm": 0.2453479766845703,
"learning_rate": 2.5942826269922376e-05,
"loss": 0.0317,
"step": 12860
},
{
"epoch": 12.33940556088207,
"grad_norm": 0.45544683933258057,
"learning_rate": 2.5866573227820557e-05,
"loss": 0.0299,
"step": 12870
},
{
"epoch": 12.348993288590604,
"grad_norm": 0.31227871775627136,
"learning_rate": 2.5790393293603097e-05,
"loss": 0.029,
"step": 12880
},
{
"epoch": 12.358581016299137,
"grad_norm": 0.32639333605766296,
"learning_rate": 2.571428669804346e-05,
"loss": 0.0323,
"step": 12890
},
{
"epoch": 12.36816874400767,
"grad_norm": 0.3351771831512451,
"learning_rate": 2.563825367169289e-05,
"loss": 0.0304,
"step": 12900
},
{
"epoch": 12.377756471716204,
"grad_norm": 0.47458702325820923,
"learning_rate": 2.5562294444879787e-05,
"loss": 0.03,
"step": 12910
},
{
"epoch": 12.387344199424737,
"grad_norm": 0.2465980499982834,
"learning_rate": 2.5486409247708987e-05,
"loss": 0.0378,
"step": 12920
},
{
"epoch": 12.39693192713327,
"grad_norm": 0.42310255765914917,
"learning_rate": 2.5410598310061118e-05,
"loss": 0.0323,
"step": 12930
},
{
"epoch": 12.406519654841803,
"grad_norm": 1.066576361656189,
"learning_rate": 2.5334861861591753e-05,
"loss": 0.0347,
"step": 12940
},
{
"epoch": 12.416107382550335,
"grad_norm": 0.24553652107715607,
"learning_rate": 2.525920013173091e-05,
"loss": 0.0288,
"step": 12950
},
{
"epoch": 12.425695110258868,
"grad_norm": 0.17061471939086914,
"learning_rate": 2.51836133496822e-05,
"loss": 0.0293,
"step": 12960
},
{
"epoch": 12.435282837967401,
"grad_norm": 0.2702957093715668,
"learning_rate": 2.5108101744422197e-05,
"loss": 0.0337,
"step": 12970
},
{
"epoch": 12.444870565675934,
"grad_norm": 0.2967221736907959,
"learning_rate": 2.5032665544699762e-05,
"loss": 0.0388,
"step": 12980
},
{
"epoch": 12.454458293384468,
"grad_norm": 0.18429528176784515,
"learning_rate": 2.495730497903535e-05,
"loss": 0.0339,
"step": 12990
},
{
"epoch": 12.464046021093,
"grad_norm": 0.4446472227573395,
"learning_rate": 2.4882020275720247e-05,
"loss": 0.0297,
"step": 13000
},
{
"epoch": 12.473633748801534,
"grad_norm": 0.2481614649295807,
"learning_rate": 2.480681166281592e-05,
"loss": 0.0332,
"step": 13010
},
{
"epoch": 12.483221476510067,
"grad_norm": 0.4030400216579437,
"learning_rate": 2.4731679368153392e-05,
"loss": 0.0386,
"step": 13020
},
{
"epoch": 12.4928092042186,
"grad_norm": 0.20716169476509094,
"learning_rate": 2.4656623619332476e-05,
"loss": 0.0289,
"step": 13030
},
{
"epoch": 12.502396931927134,
"grad_norm": 0.18714624643325806,
"learning_rate": 2.4581644643721075e-05,
"loss": 0.0257,
"step": 13040
},
{
"epoch": 12.511984659635667,
"grad_norm": 0.2566820979118347,
"learning_rate": 2.4506742668454514e-05,
"loss": 0.0267,
"step": 13050
},
{
"epoch": 12.5215723873442,
"grad_norm": 0.237356036901474,
"learning_rate": 2.44319179204349e-05,
"loss": 0.0317,
"step": 13060
},
{
"epoch": 12.531160115052732,
"grad_norm": 0.29655054211616516,
"learning_rate": 2.4357170626330394e-05,
"loss": 0.0328,
"step": 13070
},
{
"epoch": 12.540747842761265,
"grad_norm": 0.29281550645828247,
"learning_rate": 2.4282501012574495e-05,
"loss": 0.0295,
"step": 13080
},
{
"epoch": 12.550335570469798,
"grad_norm": 0.477317750453949,
"learning_rate": 2.4207909305365363e-05,
"loss": 0.0353,
"step": 13090
},
{
"epoch": 12.559923298178331,
"grad_norm": 0.2606201767921448,
"learning_rate": 2.4133395730665214e-05,
"loss": 0.0288,
"step": 13100
},
{
"epoch": 12.569511025886865,
"grad_norm": 0.18180538713932037,
"learning_rate": 2.405896051419957e-05,
"loss": 0.0349,
"step": 13110
},
{
"epoch": 12.579098753595398,
"grad_norm": 0.3665505349636078,
"learning_rate": 2.398460388145653e-05,
"loss": 0.0321,
"step": 13120
},
{
"epoch": 12.588686481303931,
"grad_norm": 0.28408095240592957,
"learning_rate": 2.3910326057686127e-05,
"loss": 0.0359,
"step": 13130
},
{
"epoch": 12.598274209012464,
"grad_norm": 0.19122740626335144,
"learning_rate": 2.3836127267899778e-05,
"loss": 0.0299,
"step": 13140
},
{
"epoch": 12.607861936720997,
"grad_norm": 0.18212218582630157,
"learning_rate": 2.3762007736869353e-05,
"loss": 0.0328,
"step": 13150
},
{
"epoch": 12.61744966442953,
"grad_norm": 0.33118176460266113,
"learning_rate": 2.3687967689126667e-05,
"loss": 0.0291,
"step": 13160
},
{
"epoch": 12.627037392138064,
"grad_norm": 0.43079885840415955,
"learning_rate": 2.3614007348962724e-05,
"loss": 0.0303,
"step": 13170
},
{
"epoch": 12.636625119846597,
"grad_norm": 0.21110649406909943,
"learning_rate": 2.3540126940427166e-05,
"loss": 0.0334,
"step": 13180
},
{
"epoch": 12.64621284755513,
"grad_norm": 0.18830737471580505,
"learning_rate": 2.3466326687327396e-05,
"loss": 0.0316,
"step": 13190
},
{
"epoch": 12.655800575263662,
"grad_norm": 0.33135518431663513,
"learning_rate": 2.3392606813228008e-05,
"loss": 0.0375,
"step": 13200
},
{
"epoch": 12.665388302972195,
"grad_norm": 0.2647267282009125,
"learning_rate": 2.3318967541450153e-05,
"loss": 0.0294,
"step": 13210
},
{
"epoch": 12.674976030680728,
"grad_norm": 0.2796458303928375,
"learning_rate": 2.3245409095070803e-05,
"loss": 0.0282,
"step": 13220
},
{
"epoch": 12.684563758389261,
"grad_norm": 0.31999823451042175,
"learning_rate": 2.317193169692205e-05,
"loss": 0.0363,
"step": 13230
},
{
"epoch": 12.694151486097795,
"grad_norm": 0.21032322943210602,
"learning_rate": 2.3098535569590458e-05,
"loss": 0.0341,
"step": 13240
},
{
"epoch": 12.703739213806328,
"grad_norm": 0.31383687257766724,
"learning_rate": 2.3025220935416447e-05,
"loss": 0.0301,
"step": 13250
},
{
"epoch": 12.713326941514861,
"grad_norm": 0.4095149040222168,
"learning_rate": 2.2951988016493548e-05,
"loss": 0.036,
"step": 13260
},
{
"epoch": 12.722914669223394,
"grad_norm": 0.21426613628864288,
"learning_rate": 2.2878837034667737e-05,
"loss": 0.0346,
"step": 13270
},
{
"epoch": 12.732502396931928,
"grad_norm": 0.312098890542984,
"learning_rate": 2.2805768211536758e-05,
"loss": 0.0342,
"step": 13280
},
{
"epoch": 12.74209012464046,
"grad_norm": 0.2564839720726013,
"learning_rate": 2.273278176844951e-05,
"loss": 0.0323,
"step": 13290
},
{
"epoch": 12.751677852348994,
"grad_norm": 0.314685583114624,
"learning_rate": 2.2659877926505353e-05,
"loss": 0.0382,
"step": 13300
},
{
"epoch": 12.761265580057525,
"grad_norm": 0.1301986277103424,
"learning_rate": 2.2587056906553348e-05,
"loss": 0.034,
"step": 13310
},
{
"epoch": 12.770853307766059,
"grad_norm": 0.23595231771469116,
"learning_rate": 2.251431892919171e-05,
"loss": 0.0293,
"step": 13320
},
{
"epoch": 12.780441035474592,
"grad_norm": 0.23706960678100586,
"learning_rate": 2.2441664214767085e-05,
"loss": 0.0355,
"step": 13330
},
{
"epoch": 12.790028763183125,
"grad_norm": 0.20160214602947235,
"learning_rate": 2.2369092983373912e-05,
"loss": 0.0315,
"step": 13340
},
{
"epoch": 12.799616490891658,
"grad_norm": 0.1787547618150711,
"learning_rate": 2.2296605454853673e-05,
"loss": 0.0314,
"step": 13350
},
{
"epoch": 12.809204218600192,
"grad_norm": 0.36770564317703247,
"learning_rate": 2.222420184879437e-05,
"loss": 0.0372,
"step": 13360
},
{
"epoch": 12.818791946308725,
"grad_norm": 0.3025970160961151,
"learning_rate": 2.2151882384529683e-05,
"loss": 0.0255,
"step": 13370
},
{
"epoch": 12.828379674017258,
"grad_norm": 0.25169727206230164,
"learning_rate": 2.207964728113848e-05,
"loss": 0.0269,
"step": 13380
},
{
"epoch": 12.837967401725791,
"grad_norm": 0.37031155824661255,
"learning_rate": 2.200749675744402e-05,
"loss": 0.0293,
"step": 13390
},
{
"epoch": 12.847555129434324,
"grad_norm": 0.21579872071743011,
"learning_rate": 2.1935431032013388e-05,
"loss": 0.0302,
"step": 13400
},
{
"epoch": 12.857142857142858,
"grad_norm": 0.20838379859924316,
"learning_rate": 2.1863450323156725e-05,
"loss": 0.034,
"step": 13410
},
{
"epoch": 12.86673058485139,
"grad_norm": 0.2365337610244751,
"learning_rate": 2.179155484892671e-05,
"loss": 0.0321,
"step": 13420
},
{
"epoch": 12.876318312559924,
"grad_norm": 0.24535539746284485,
"learning_rate": 2.1719744827117737e-05,
"loss": 0.0318,
"step": 13430
},
{
"epoch": 12.885906040268456,
"grad_norm": 0.32186776399612427,
"learning_rate": 2.1648020475265418e-05,
"loss": 0.0353,
"step": 13440
},
{
"epoch": 12.895493767976989,
"grad_norm": 0.2927076518535614,
"learning_rate": 2.1576382010645764e-05,
"loss": 0.0318,
"step": 13450
},
{
"epoch": 12.905081495685522,
"grad_norm": 0.2444140613079071,
"learning_rate": 2.1504829650274672e-05,
"loss": 0.034,
"step": 13460
},
{
"epoch": 12.914669223394055,
"grad_norm": 0.17273946106433868,
"learning_rate": 2.1433363610907147e-05,
"loss": 0.0339,
"step": 13470
},
{
"epoch": 12.924256951102588,
"grad_norm": 0.3511595129966736,
"learning_rate": 2.1361984109036765e-05,
"loss": 0.0284,
"step": 13480
},
{
"epoch": 12.933844678811122,
"grad_norm": 0.21930259466171265,
"learning_rate": 2.1290691360894872e-05,
"loss": 0.0337,
"step": 13490
},
{
"epoch": 12.943432406519655,
"grad_norm": 0.13534465432167053,
"learning_rate": 2.121948558245008e-05,
"loss": 0.0325,
"step": 13500
},
{
"epoch": 12.953020134228188,
"grad_norm": 0.25757452845573425,
"learning_rate": 2.1148366989407496e-05,
"loss": 0.0344,
"step": 13510
},
{
"epoch": 12.962607861936721,
"grad_norm": 0.3126337230205536,
"learning_rate": 2.1077335797208153e-05,
"loss": 0.0266,
"step": 13520
},
{
"epoch": 12.972195589645255,
"grad_norm": 0.2144749015569687,
"learning_rate": 2.100639222102827e-05,
"loss": 0.0296,
"step": 13530
},
{
"epoch": 12.981783317353788,
"grad_norm": 0.33655446767807007,
"learning_rate": 2.0935536475778682e-05,
"loss": 0.0319,
"step": 13540
},
{
"epoch": 12.991371045062321,
"grad_norm": 0.16992558538913727,
"learning_rate": 2.0864768776104183e-05,
"loss": 0.0335,
"step": 13550
},
{
"epoch": 13.000958772770852,
"grad_norm": 0.2082756608724594,
"learning_rate": 2.079408933638279e-05,
"loss": 0.0338,
"step": 13560
},
{
"epoch": 13.010546500479386,
"grad_norm": 0.2862843871116638,
"learning_rate": 2.0723498370725162e-05,
"loss": 0.0289,
"step": 13570
},
{
"epoch": 13.020134228187919,
"grad_norm": 0.29127344489097595,
"learning_rate": 2.0652996092973974e-05,
"loss": 0.0379,
"step": 13580
},
{
"epoch": 13.029721955896452,
"grad_norm": 0.1825907677412033,
"learning_rate": 2.0582582716703243e-05,
"loss": 0.0267,
"step": 13590
},
{
"epoch": 13.039309683604985,
"grad_norm": 0.20657765865325928,
"learning_rate": 2.0512258455217636e-05,
"loss": 0.0337,
"step": 13600
},
{
"epoch": 13.048897411313519,
"grad_norm": 0.20046214759349823,
"learning_rate": 2.044202352155185e-05,
"loss": 0.0256,
"step": 13610
},
{
"epoch": 13.058485139022052,
"grad_norm": 0.23749665915966034,
"learning_rate": 2.0371878128470047e-05,
"loss": 0.033,
"step": 13620
},
{
"epoch": 13.068072866730585,
"grad_norm": 0.1981140673160553,
"learning_rate": 2.0301822488465106e-05,
"loss": 0.0323,
"step": 13630
},
{
"epoch": 13.077660594439118,
"grad_norm": 0.3064008951187134,
"learning_rate": 2.0231856813757995e-05,
"loss": 0.029,
"step": 13640
},
{
"epoch": 13.087248322147651,
"grad_norm": 0.3160218596458435,
"learning_rate": 2.016198131629716e-05,
"loss": 0.0317,
"step": 13650
},
{
"epoch": 13.096836049856185,
"grad_norm": 0.1925330013036728,
"learning_rate": 2.0092196207757886e-05,
"loss": 0.0308,
"step": 13660
},
{
"epoch": 13.106423777564718,
"grad_norm": 0.2060590237379074,
"learning_rate": 2.002250169954165e-05,
"loss": 0.0352,
"step": 13670
},
{
"epoch": 13.116011505273251,
"grad_norm": 0.21879933774471283,
"learning_rate": 1.9952898002775444e-05,
"loss": 0.0262,
"step": 13680
},
{
"epoch": 13.125599232981783,
"grad_norm": 0.22108188271522522,
"learning_rate": 1.9883385328311155e-05,
"loss": 0.0333,
"step": 13690
},
{
"epoch": 13.135186960690316,
"grad_norm": 0.26251569390296936,
"learning_rate": 1.981396388672496e-05,
"loss": 0.0314,
"step": 13700
},
{
"epoch": 13.144774688398849,
"grad_norm": 0.29389551281929016,
"learning_rate": 1.9744633888316684e-05,
"loss": 0.0333,
"step": 13710
},
{
"epoch": 13.154362416107382,
"grad_norm": 0.1754542887210846,
"learning_rate": 1.9675395543109087e-05,
"loss": 0.0306,
"step": 13720
},
{
"epoch": 13.163950143815915,
"grad_norm": 0.2529279589653015,
"learning_rate": 1.9606249060847275e-05,
"loss": 0.029,
"step": 13730
},
{
"epoch": 13.173537871524449,
"grad_norm": 0.25833970308303833,
"learning_rate": 1.9537194650998176e-05,
"loss": 0.0257,
"step": 13740
},
{
"epoch": 13.183125599232982,
"grad_norm": 0.2809722423553467,
"learning_rate": 1.9468232522749685e-05,
"loss": 0.03,
"step": 13750
},
{
"epoch": 13.192713326941515,
"grad_norm": 0.2745196521282196,
"learning_rate": 1.9399362885010186e-05,
"loss": 0.0259,
"step": 13760
},
{
"epoch": 13.202301054650048,
"grad_norm": 0.26047447323799133,
"learning_rate": 1.9330585946407896e-05,
"loss": 0.0293,
"step": 13770
},
{
"epoch": 13.211888782358582,
"grad_norm": 0.2309299260377884,
"learning_rate": 1.9261901915290222e-05,
"loss": 0.0263,
"step": 13780
},
{
"epoch": 13.221476510067115,
"grad_norm": 0.19574059545993805,
"learning_rate": 1.9193310999723086e-05,
"loss": 0.0256,
"step": 13790
},
{
"epoch": 13.231064237775648,
"grad_norm": 0.24411630630493164,
"learning_rate": 1.9124813407490345e-05,
"loss": 0.0266,
"step": 13800
},
{
"epoch": 13.24065196548418,
"grad_norm": 0.2317860871553421,
"learning_rate": 1.9056409346093167e-05,
"loss": 0.0362,
"step": 13810
},
{
"epoch": 13.250239693192713,
"grad_norm": 0.34288397431373596,
"learning_rate": 1.89880990227494e-05,
"loss": 0.031,
"step": 13820
},
{
"epoch": 13.259827420901246,
"grad_norm": 0.22115236520767212,
"learning_rate": 1.8919882644392894e-05,
"loss": 0.0303,
"step": 13830
},
{
"epoch": 13.269415148609779,
"grad_norm": 0.1675620973110199,
"learning_rate": 1.8851760417672897e-05,
"loss": 0.0267,
"step": 13840
},
{
"epoch": 13.279002876318312,
"grad_norm": 0.22504985332489014,
"learning_rate": 1.8783732548953487e-05,
"loss": 0.03,
"step": 13850
},
{
"epoch": 13.288590604026846,
"grad_norm": 0.2568277418613434,
"learning_rate": 1.87157992443129e-05,
"loss": 0.0347,
"step": 13860
},
{
"epoch": 13.298178331735379,
"grad_norm": 0.24830462038516998,
"learning_rate": 1.8647960709542866e-05,
"loss": 0.0313,
"step": 13870
},
{
"epoch": 13.307766059443912,
"grad_norm": 0.1982988864183426,
"learning_rate": 1.8580217150148034e-05,
"loss": 0.0286,
"step": 13880
},
{
"epoch": 13.317353787152445,
"grad_norm": 0.17509537935256958,
"learning_rate": 1.851256877134538e-05,
"loss": 0.0283,
"step": 13890
},
{
"epoch": 13.326941514860978,
"grad_norm": 0.27267399430274963,
"learning_rate": 1.8445015778063528e-05,
"loss": 0.0308,
"step": 13900
},
{
"epoch": 13.336529242569512,
"grad_norm": 0.2444014698266983,
"learning_rate": 1.8377558374942143e-05,
"loss": 0.0335,
"step": 13910
},
{
"epoch": 13.346116970278045,
"grad_norm": 0.4355910122394562,
"learning_rate": 1.831019676633129e-05,
"loss": 0.0326,
"step": 13920
},
{
"epoch": 13.355704697986576,
"grad_norm": 0.6526142954826355,
"learning_rate": 1.8242931156290893e-05,
"loss": 0.0299,
"step": 13930
},
{
"epoch": 13.36529242569511,
"grad_norm": 0.20145297050476074,
"learning_rate": 1.8175761748590063e-05,
"loss": 0.0315,
"step": 13940
},
{
"epoch": 13.374880153403643,
"grad_norm": 0.22952324151992798,
"learning_rate": 1.8108688746706427e-05,
"loss": 0.031,
"step": 13950
},
{
"epoch": 13.384467881112176,
"grad_norm": 0.38137954473495483,
"learning_rate": 1.8041712353825635e-05,
"loss": 0.0387,
"step": 13960
},
{
"epoch": 13.39405560882071,
"grad_norm": 0.2673424482345581,
"learning_rate": 1.7974832772840617e-05,
"loss": 0.0272,
"step": 13970
},
{
"epoch": 13.403643336529242,
"grad_norm": 0.2189689427614212,
"learning_rate": 1.790805020635109e-05,
"loss": 0.0317,
"step": 13980
},
{
"epoch": 13.413231064237776,
"grad_norm": 1.2192716598510742,
"learning_rate": 1.7841364856662824e-05,
"loss": 0.0258,
"step": 13990
},
{
"epoch": 13.422818791946309,
"grad_norm": 0.13329686224460602,
"learning_rate": 1.7774776925787136e-05,
"loss": 0.0257,
"step": 14000
},
{
"epoch": 13.432406519654842,
"grad_norm": 0.2741002142429352,
"learning_rate": 1.7708286615440183e-05,
"loss": 0.0271,
"step": 14010
},
{
"epoch": 13.441994247363375,
"grad_norm": 0.7737520337104797,
"learning_rate": 1.764189412704247e-05,
"loss": 0.0283,
"step": 14020
},
{
"epoch": 13.451581975071909,
"grad_norm": 0.24316097795963287,
"learning_rate": 1.7575599661718068e-05,
"loss": 0.0302,
"step": 14030
},
{
"epoch": 13.461169702780442,
"grad_norm": 0.23543784022331238,
"learning_rate": 1.7509403420294208e-05,
"loss": 0.0311,
"step": 14040
},
{
"epoch": 13.470757430488973,
"grad_norm": 0.19010919332504272,
"learning_rate": 1.7443305603300497e-05,
"loss": 0.0276,
"step": 14050
},
{
"epoch": 13.480345158197506,
"grad_norm": 0.1994113028049469,
"learning_rate": 1.7377306410968396e-05,
"loss": 0.0298,
"step": 14060
},
{
"epoch": 13.48993288590604,
"grad_norm": 0.30696478486061096,
"learning_rate": 1.731140604323063e-05,
"loss": 0.0275,
"step": 14070
},
{
"epoch": 13.499520613614573,
"grad_norm": 0.3128091096878052,
"learning_rate": 1.7245604699720535e-05,
"loss": 0.0272,
"step": 14080
},
{
"epoch": 13.509108341323106,
"grad_norm": 2.206577777862549,
"learning_rate": 1.7179902579771474e-05,
"loss": 0.0326,
"step": 14090
},
{
"epoch": 13.51869606903164,
"grad_norm": 0.18835577368736267,
"learning_rate": 1.711429988241619e-05,
"loss": 0.0276,
"step": 14100
},
{
"epoch": 13.528283796740173,
"grad_norm": 0.2255256026983261,
"learning_rate": 1.7048796806386304e-05,
"loss": 0.0301,
"step": 14110
},
{
"epoch": 13.537871524448706,
"grad_norm": 0.3144644796848297,
"learning_rate": 1.6983393550111648e-05,
"loss": 0.0324,
"step": 14120
},
{
"epoch": 13.547459252157239,
"grad_norm": 0.20487931370735168,
"learning_rate": 1.691809031171962e-05,
"loss": 0.0352,
"step": 14130
},
{
"epoch": 13.557046979865772,
"grad_norm": 0.22863590717315674,
"learning_rate": 1.6852887289034632e-05,
"loss": 0.0343,
"step": 14140
},
{
"epoch": 13.566634707574305,
"grad_norm": 0.30829718708992004,
"learning_rate": 1.67877846795776e-05,
"loss": 0.0342,
"step": 14150
},
{
"epoch": 13.576222435282839,
"grad_norm": 0.2026831954717636,
"learning_rate": 1.672278268056516e-05,
"loss": 0.0266,
"step": 14160
},
{
"epoch": 13.585810162991372,
"grad_norm": 0.18998700380325317,
"learning_rate": 1.6657881488909192e-05,
"loss": 0.0316,
"step": 14170
},
{
"epoch": 13.595397890699903,
"grad_norm": 0.2338184267282486,
"learning_rate": 1.659308130121622e-05,
"loss": 0.0315,
"step": 14180
},
{
"epoch": 13.604985618408437,
"grad_norm": 0.421129047870636,
"learning_rate": 1.6528382313786784e-05,
"loss": 0.0322,
"step": 14190
},
{
"epoch": 13.61457334611697,
"grad_norm": 0.28092893958091736,
"learning_rate": 1.6463784722614845e-05,
"loss": 0.0269,
"step": 14200
},
{
"epoch": 13.624161073825503,
"grad_norm": 0.19112944602966309,
"learning_rate": 1.6399288723387195e-05,
"loss": 0.0258,
"step": 14210
},
{
"epoch": 13.633748801534036,
"grad_norm": 0.286045640707016,
"learning_rate": 1.63348945114829e-05,
"loss": 0.0324,
"step": 14220
},
{
"epoch": 13.64333652924257,
"grad_norm": 0.280977338552475,
"learning_rate": 1.6270602281972686e-05,
"loss": 0.0265,
"step": 14230
},
{
"epoch": 13.652924256951103,
"grad_norm": 0.28009748458862305,
"learning_rate": 1.6206412229618307e-05,
"loss": 0.034,
"step": 14240
},
{
"epoch": 13.662511984659636,
"grad_norm": 0.2950078845024109,
"learning_rate": 1.6142324548871978e-05,
"loss": 0.0332,
"step": 14250
},
{
"epoch": 13.67209971236817,
"grad_norm": 0.19593513011932373,
"learning_rate": 1.607833943387585e-05,
"loss": 0.0322,
"step": 14260
},
{
"epoch": 13.681687440076702,
"grad_norm": 0.3256717026233673,
"learning_rate": 1.6014457078461353e-05,
"loss": 0.0311,
"step": 14270
},
{
"epoch": 13.691275167785236,
"grad_norm": 0.48480740189552307,
"learning_rate": 1.59506776761486e-05,
"loss": 0.0265,
"step": 14280
},
{
"epoch": 13.700862895493769,
"grad_norm": 0.17794422805309296,
"learning_rate": 1.588700142014583e-05,
"loss": 0.0302,
"step": 14290
},
{
"epoch": 13.7104506232023,
"grad_norm": 0.21641989052295685,
"learning_rate": 1.5823428503348846e-05,
"loss": 0.0269,
"step": 14300
},
{
"epoch": 13.720038350910833,
"grad_norm": 0.21487939357757568,
"learning_rate": 1.57599591183404e-05,
"loss": 0.0333,
"step": 14310
},
{
"epoch": 13.729626078619367,
"grad_norm": 0.20198583602905273,
"learning_rate": 1.569659345738959e-05,
"loss": 0.0316,
"step": 14320
},
{
"epoch": 13.7392138063279,
"grad_norm": 0.24818021059036255,
"learning_rate": 1.5633331712451287e-05,
"loss": 0.0322,
"step": 14330
},
{
"epoch": 13.748801534036433,
"grad_norm": 0.3211008906364441,
"learning_rate": 1.5570174075165617e-05,
"loss": 0.0286,
"step": 14340
},
{
"epoch": 13.758389261744966,
"grad_norm": 0.27913060784339905,
"learning_rate": 1.5507120736857316e-05,
"loss": 0.0309,
"step": 14350
},
{
"epoch": 13.7679769894535,
"grad_norm": 0.3094828724861145,
"learning_rate": 1.5444171888535127e-05,
"loss": 0.0262,
"step": 14360
},
{
"epoch": 13.777564717162033,
"grad_norm": 0.26376375555992126,
"learning_rate": 1.538132772089131e-05,
"loss": 0.0312,
"step": 14370
},
{
"epoch": 13.787152444870566,
"grad_norm": 0.27103152871131897,
"learning_rate": 1.531858842430096e-05,
"loss": 0.029,
"step": 14380
},
{
"epoch": 13.7967401725791,
"grad_norm": 0.2528936564922333,
"learning_rate": 1.5255954188821554e-05,
"loss": 0.0302,
"step": 14390
},
{
"epoch": 13.806327900287632,
"grad_norm": 0.2022869884967804,
"learning_rate": 1.519342520419223e-05,
"loss": 0.028,
"step": 14400
},
{
"epoch": 13.815915627996166,
"grad_norm": 0.2736548185348511,
"learning_rate": 1.5131001659833349e-05,
"loss": 0.0391,
"step": 14410
},
{
"epoch": 13.825503355704697,
"grad_norm": 0.20340123772621155,
"learning_rate": 1.5068683744845802e-05,
"loss": 0.0259,
"step": 14420
},
{
"epoch": 13.83509108341323,
"grad_norm": 0.30253875255584717,
"learning_rate": 1.5006471648010567e-05,
"loss": 0.0318,
"step": 14430
},
{
"epoch": 13.844678811121764,
"grad_norm": 0.18290819227695465,
"learning_rate": 1.4944365557787982e-05,
"loss": 0.0266,
"step": 14440
},
{
"epoch": 13.854266538830297,
"grad_norm": 0.17378397285938263,
"learning_rate": 1.4882365662317338e-05,
"loss": 0.0307,
"step": 14450
},
{
"epoch": 13.86385426653883,
"grad_norm": 0.17450757324695587,
"learning_rate": 1.4820472149416154e-05,
"loss": 0.0375,
"step": 14460
},
{
"epoch": 13.873441994247363,
"grad_norm": 0.17673359811306,
"learning_rate": 1.4758685206579754e-05,
"loss": 0.0336,
"step": 14470
},
{
"epoch": 13.883029721955896,
"grad_norm": 0.17782671749591827,
"learning_rate": 1.4697005020980547e-05,
"loss": 0.0264,
"step": 14480
},
{
"epoch": 13.89261744966443,
"grad_norm": 0.22997714579105377,
"learning_rate": 1.4635431779467628e-05,
"loss": 0.0364,
"step": 14490
},
{
"epoch": 13.902205177372963,
"grad_norm": 0.23629331588745117,
"learning_rate": 1.4573965668566037e-05,
"loss": 0.0293,
"step": 14500
},
{
"epoch": 13.911792905081496,
"grad_norm": 0.2348259836435318,
"learning_rate": 1.4512606874476348e-05,
"loss": 0.0296,
"step": 14510
},
{
"epoch": 13.92138063279003,
"grad_norm": 0.2225087732076645,
"learning_rate": 1.4451355583074027e-05,
"loss": 0.0286,
"step": 14520
},
{
"epoch": 13.930968360498563,
"grad_norm": 0.23287685215473175,
"learning_rate": 1.4390211979908847e-05,
"loss": 0.0279,
"step": 14530
},
{
"epoch": 13.940556088207096,
"grad_norm": 0.19362808763980865,
"learning_rate": 1.4329176250204369e-05,
"loss": 0.0334,
"step": 14540
},
{
"epoch": 13.950143815915627,
"grad_norm": 0.25659292936325073,
"learning_rate": 1.4268248578857384e-05,
"loss": 0.0286,
"step": 14550
},
{
"epoch": 13.95973154362416,
"grad_norm": 0.19965949654579163,
"learning_rate": 1.4207429150437368e-05,
"loss": 0.0336,
"step": 14560
},
{
"epoch": 13.969319271332694,
"grad_norm": 0.21127323806285858,
"learning_rate": 1.4146718149185833e-05,
"loss": 0.0311,
"step": 14570
},
{
"epoch": 13.978906999041227,
"grad_norm": 0.2175043374300003,
"learning_rate": 1.408611575901585e-05,
"loss": 0.0232,
"step": 14580
},
{
"epoch": 13.98849472674976,
"grad_norm": 0.2855774462223053,
"learning_rate": 1.4025622163511498e-05,
"loss": 0.03,
"step": 14590
},
{
"epoch": 13.998082454458293,
"grad_norm": 0.27606961131095886,
"learning_rate": 1.3965237545927274e-05,
"loss": 0.0285,
"step": 14600
},
{
"epoch": 14.007670182166827,
"grad_norm": 0.20237654447555542,
"learning_rate": 1.3904962089187529e-05,
"loss": 0.0263,
"step": 14610
},
{
"epoch": 14.01725790987536,
"grad_norm": 0.17577792704105377,
"learning_rate": 1.3844795975885921e-05,
"loss": 0.028,
"step": 14620
},
{
"epoch": 14.026845637583893,
"grad_norm": 0.24930806457996368,
"learning_rate": 1.3784739388284911e-05,
"loss": 0.0308,
"step": 14630
},
{
"epoch": 14.036433365292426,
"grad_norm": 0.16480274498462677,
"learning_rate": 1.372479250831516e-05,
"loss": 0.0301,
"step": 14640
},
{
"epoch": 14.04602109300096,
"grad_norm": 0.20912165939807892,
"learning_rate": 1.3664955517574968e-05,
"loss": 0.0278,
"step": 14650
},
{
"epoch": 14.055608820709493,
"grad_norm": 0.3317655622959137,
"learning_rate": 1.3605228597329738e-05,
"loss": 0.0317,
"step": 14660
},
{
"epoch": 14.065196548418024,
"grad_norm": 0.240800142288208,
"learning_rate": 1.3545611928511475e-05,
"loss": 0.0352,
"step": 14670
},
{
"epoch": 14.074784276126557,
"grad_norm": 0.2574955224990845,
"learning_rate": 1.3486105691718187e-05,
"loss": 0.0272,
"step": 14680
},
{
"epoch": 14.08437200383509,
"grad_norm": 0.26954057812690735,
"learning_rate": 1.3426710067213322e-05,
"loss": 0.0309,
"step": 14690
},
{
"epoch": 14.093959731543624,
"grad_norm": 0.23546206951141357,
"learning_rate": 1.336742523492523e-05,
"loss": 0.0332,
"step": 14700
},
{
"epoch": 14.103547459252157,
"grad_norm": 0.2285180389881134,
"learning_rate": 1.3308251374446734e-05,
"loss": 0.0436,
"step": 14710
},
{
"epoch": 14.11313518696069,
"grad_norm": 0.22198130190372467,
"learning_rate": 1.324918866503439e-05,
"loss": 0.0283,
"step": 14720
},
{
"epoch": 14.122722914669223,
"grad_norm": 0.37202128767967224,
"learning_rate": 1.3190237285608076e-05,
"loss": 0.0296,
"step": 14730
},
{
"epoch": 14.132310642377757,
"grad_norm": 0.2728140652179718,
"learning_rate": 1.3131397414750385e-05,
"loss": 0.0313,
"step": 14740
},
{
"epoch": 14.14189837008629,
"grad_norm": 0.19201789796352386,
"learning_rate": 1.3072669230706197e-05,
"loss": 0.0315,
"step": 14750
},
{
"epoch": 14.151486097794823,
"grad_norm": 0.2704322040081024,
"learning_rate": 1.3014052911381974e-05,
"loss": 0.0279,
"step": 14760
},
{
"epoch": 14.161073825503356,
"grad_norm": 0.23162490129470825,
"learning_rate": 1.2955548634345327e-05,
"loss": 0.0288,
"step": 14770
},
{
"epoch": 14.17066155321189,
"grad_norm": 0.1527073085308075,
"learning_rate": 1.289715657682447e-05,
"loss": 0.0287,
"step": 14780
},
{
"epoch": 14.180249280920421,
"grad_norm": 0.48836442828178406,
"learning_rate": 1.2838876915707681e-05,
"loss": 0.0334,
"step": 14790
},
{
"epoch": 14.189837008628954,
"grad_norm": 0.22852776944637299,
"learning_rate": 1.2780709827542708e-05,
"loss": 0.0301,
"step": 14800
},
{
"epoch": 14.199424736337487,
"grad_norm": 1.632561445236206,
"learning_rate": 1.2722655488536294e-05,
"loss": 0.0296,
"step": 14810
},
{
"epoch": 14.20901246404602,
"grad_norm": 0.20910300314426422,
"learning_rate": 1.2664714074553652e-05,
"loss": 0.0277,
"step": 14820
},
{
"epoch": 14.218600191754554,
"grad_norm": 0.284138023853302,
"learning_rate": 1.260688576111791e-05,
"loss": 0.0275,
"step": 14830
},
{
"epoch": 14.228187919463087,
"grad_norm": 0.24799588322639465,
"learning_rate": 1.2549170723409549e-05,
"loss": 0.0291,
"step": 14840
},
{
"epoch": 14.23777564717162,
"grad_norm": 0.18639959394931793,
"learning_rate": 1.2491569136265896e-05,
"loss": 0.0284,
"step": 14850
},
{
"epoch": 14.247363374880154,
"grad_norm": 0.19724729657173157,
"learning_rate": 1.243408117418064e-05,
"loss": 0.0266,
"step": 14860
},
{
"epoch": 14.256951102588687,
"grad_norm": 0.1451575756072998,
"learning_rate": 1.2376707011303257e-05,
"loss": 0.0313,
"step": 14870
},
{
"epoch": 14.26653883029722,
"grad_norm": 0.13136418163776398,
"learning_rate": 1.2319446821438458e-05,
"loss": 0.0257,
"step": 14880
},
{
"epoch": 14.276126558005753,
"grad_norm": 0.212480828166008,
"learning_rate": 1.2262300778045693e-05,
"loss": 0.0309,
"step": 14890
},
{
"epoch": 14.285714285714286,
"grad_norm": 0.179280087351799,
"learning_rate": 1.220526905423866e-05,
"loss": 0.0334,
"step": 14900
},
{
"epoch": 14.29530201342282,
"grad_norm": 0.19260522723197937,
"learning_rate": 1.2148351822784748e-05,
"loss": 0.0321,
"step": 14910
},
{
"epoch": 14.304889741131351,
"grad_norm": 0.2079414278268814,
"learning_rate": 1.2091549256104457e-05,
"loss": 0.0314,
"step": 14920
},
{
"epoch": 14.314477468839884,
"grad_norm": 0.1942739635705948,
"learning_rate": 1.2034861526270996e-05,
"loss": 0.0307,
"step": 14930
},
{
"epoch": 14.324065196548418,
"grad_norm": 0.28928378224372864,
"learning_rate": 1.1978288805009641e-05,
"loss": 0.0267,
"step": 14940
},
{
"epoch": 14.33365292425695,
"grad_norm": 0.3712955415248871,
"learning_rate": 1.192183126369732e-05,
"loss": 0.0329,
"step": 14950
},
{
"epoch": 14.343240651965484,
"grad_norm": 0.22929075360298157,
"learning_rate": 1.1865489073361996e-05,
"loss": 0.0264,
"step": 14960
},
{
"epoch": 14.352828379674017,
"grad_norm": 0.31317007541656494,
"learning_rate": 1.1809262404682247e-05,
"loss": 0.0242,
"step": 14970
},
{
"epoch": 14.36241610738255,
"grad_norm": 0.5237254500389099,
"learning_rate": 1.1753151427986646e-05,
"loss": 0.0292,
"step": 14980
},
{
"epoch": 14.372003835091084,
"grad_norm": 0.21789228916168213,
"learning_rate": 1.169715631325336e-05,
"loss": 0.0314,
"step": 14990
},
{
"epoch": 14.381591562799617,
"grad_norm": 0.29379501938819885,
"learning_rate": 1.1641277230109492e-05,
"loss": 0.0332,
"step": 15000
},
{
"epoch": 14.39117929050815,
"grad_norm": 0.17771072685718536,
"learning_rate": 1.1585514347830738e-05,
"loss": 0.0267,
"step": 15010
},
{
"epoch": 14.400767018216683,
"grad_norm": 0.24794255197048187,
"learning_rate": 1.1529867835340707e-05,
"loss": 0.0267,
"step": 15020
},
{
"epoch": 14.410354745925215,
"grad_norm": 0.21468493342399597,
"learning_rate": 1.1474337861210543e-05,
"loss": 0.0267,
"step": 15030
},
{
"epoch": 14.419942473633748,
"grad_norm": 0.17512547969818115,
"learning_rate": 1.1418924593658314e-05,
"loss": 0.0239,
"step": 15040
},
{
"epoch": 14.429530201342281,
"grad_norm": 0.2626974284648895,
"learning_rate": 1.1363628200548593e-05,
"loss": 0.0328,
"step": 15050
},
{
"epoch": 14.439117929050814,
"grad_norm": 0.21883651614189148,
"learning_rate": 1.1308448849391846e-05,
"loss": 0.0283,
"step": 15060
},
{
"epoch": 14.448705656759348,
"grad_norm": 0.2517321705818176,
"learning_rate": 1.1253386707344044e-05,
"loss": 0.0319,
"step": 15070
},
{
"epoch": 14.458293384467881,
"grad_norm": 0.23790787160396576,
"learning_rate": 1.1198441941206033e-05,
"loss": 0.0254,
"step": 15080
},
{
"epoch": 14.467881112176414,
"grad_norm": 0.2755306363105774,
"learning_rate": 1.1143614717423145e-05,
"loss": 0.0297,
"step": 15090
},
{
"epoch": 14.477468839884947,
"grad_norm": 0.17343682050704956,
"learning_rate": 1.1088905202084604e-05,
"loss": 0.0271,
"step": 15100
},
{
"epoch": 14.48705656759348,
"grad_norm": 0.4037168323993683,
"learning_rate": 1.1034313560923032e-05,
"loss": 0.0318,
"step": 15110
},
{
"epoch": 14.496644295302014,
"grad_norm": 0.25027063488960266,
"learning_rate": 1.097983995931407e-05,
"loss": 0.0344,
"step": 15120
},
{
"epoch": 14.506232023010547,
"grad_norm": 0.2531662583351135,
"learning_rate": 1.0925484562275678e-05,
"loss": 0.0336,
"step": 15130
},
{
"epoch": 14.51581975071908,
"grad_norm": 0.27917400002479553,
"learning_rate": 1.0871247534467788e-05,
"loss": 0.0316,
"step": 15140
},
{
"epoch": 14.525407478427613,
"grad_norm": 0.26147523522377014,
"learning_rate": 1.0817129040191698e-05,
"loss": 0.0278,
"step": 15150
},
{
"epoch": 14.534995206136145,
"grad_norm": 0.24168430268764496,
"learning_rate": 1.076312924338973e-05,
"loss": 0.03,
"step": 15160
},
{
"epoch": 14.544582933844678,
"grad_norm": 0.17934760451316833,
"learning_rate": 1.0709248307644559e-05,
"loss": 0.0275,
"step": 15170
},
{
"epoch": 14.554170661553211,
"grad_norm": 0.38495177030563354,
"learning_rate": 1.0655486396178782e-05,
"loss": 0.0317,
"step": 15180
},
{
"epoch": 14.563758389261745,
"grad_norm": 0.22225984930992126,
"learning_rate": 1.0601843671854477e-05,
"loss": 0.0312,
"step": 15190
},
{
"epoch": 14.573346116970278,
"grad_norm": 0.29296278953552246,
"learning_rate": 1.0548320297172665e-05,
"loss": 0.0315,
"step": 15200
},
{
"epoch": 14.582933844678811,
"grad_norm": 0.3371207118034363,
"learning_rate": 1.0494916434272783e-05,
"loss": 0.0299,
"step": 15210
},
{
"epoch": 14.592521572387344,
"grad_norm": 0.220375657081604,
"learning_rate": 1.0441632244932237e-05,
"loss": 0.0265,
"step": 15220
},
{
"epoch": 14.602109300095877,
"grad_norm": 0.1987174153327942,
"learning_rate": 1.0388467890565928e-05,
"loss": 0.0261,
"step": 15230
},
{
"epoch": 14.61169702780441,
"grad_norm": 0.25363320112228394,
"learning_rate": 1.0335423532225735e-05,
"loss": 0.0301,
"step": 15240
},
{
"epoch": 14.621284755512944,
"grad_norm": 0.22231195867061615,
"learning_rate": 1.028249933060001e-05,
"loss": 0.0353,
"step": 15250
},
{
"epoch": 14.630872483221477,
"grad_norm": 0.20641197264194489,
"learning_rate": 1.022969544601311e-05,
"loss": 0.0254,
"step": 15260
},
{
"epoch": 14.64046021093001,
"grad_norm": 0.25588056445121765,
"learning_rate": 1.0177012038424927e-05,
"loss": 0.0327,
"step": 15270
},
{
"epoch": 14.650047938638544,
"grad_norm": 0.3196217715740204,
"learning_rate": 1.0124449267430414e-05,
"loss": 0.0306,
"step": 15280
},
{
"epoch": 14.659635666347075,
"grad_norm": 0.37711241841316223,
"learning_rate": 1.0072007292259029e-05,
"loss": 0.0314,
"step": 15290
},
{
"epoch": 14.669223394055608,
"grad_norm": 0.299496591091156,
"learning_rate": 1.0019686271774314e-05,
"loss": 0.0273,
"step": 15300
},
{
"epoch": 14.678811121764141,
"grad_norm": 0.20070233941078186,
"learning_rate": 9.967486364473416e-06,
"loss": 0.0348,
"step": 15310
},
{
"epoch": 14.688398849472675,
"grad_norm": 0.1786354035139084,
"learning_rate": 9.915407728486603e-06,
"loss": 0.0315,
"step": 15320
},
{
"epoch": 14.697986577181208,
"grad_norm": 0.19913482666015625,
"learning_rate": 9.863450521576729e-06,
"loss": 0.0332,
"step": 15330
},
{
"epoch": 14.707574304889741,
"grad_norm": 0.26217663288116455,
"learning_rate": 9.81161490113885e-06,
"loss": 0.0299,
"step": 15340
},
{
"epoch": 14.717162032598274,
"grad_norm": 0.17626221477985382,
"learning_rate": 9.759901024199642e-06,
"loss": 0.0258,
"step": 15350
},
{
"epoch": 14.726749760306808,
"grad_norm": 0.5230224132537842,
"learning_rate": 9.708309047417041e-06,
"loss": 0.0286,
"step": 15360
},
{
"epoch": 14.73633748801534,
"grad_norm": 0.19318176805973053,
"learning_rate": 9.656839127079659e-06,
"loss": 0.0254,
"step": 15370
},
{
"epoch": 14.745925215723874,
"grad_norm": 0.30321067571640015,
"learning_rate": 9.6054914191064e-06,
"loss": 0.0304,
"step": 15380
},
{
"epoch": 14.755512943432407,
"grad_norm": 0.2519323229789734,
"learning_rate": 9.554266079045909e-06,
"loss": 0.0325,
"step": 15390
},
{
"epoch": 14.765100671140939,
"grad_norm": 0.24592278897762299,
"learning_rate": 9.503163262076181e-06,
"loss": 0.0336,
"step": 15400
},
{
"epoch": 14.774688398849472,
"grad_norm": 0.19091877341270447,
"learning_rate": 9.452183123004e-06,
"loss": 0.0247,
"step": 15410
},
{
"epoch": 14.784276126558005,
"grad_norm": 0.26081383228302,
"learning_rate": 9.401325816264573e-06,
"loss": 0.0333,
"step": 15420
},
{
"epoch": 14.793863854266538,
"grad_norm": 0.27854666113853455,
"learning_rate": 9.350591495920952e-06,
"loss": 0.024,
"step": 15430
},
{
"epoch": 14.803451581975072,
"grad_norm": 0.36169877648353577,
"learning_rate": 9.299980315663686e-06,
"loss": 0.031,
"step": 15440
},
{
"epoch": 14.813039309683605,
"grad_norm": 0.18000735342502594,
"learning_rate": 9.24949242881023e-06,
"loss": 0.0289,
"step": 15450
},
{
"epoch": 14.822627037392138,
"grad_norm": 0.25608521699905396,
"learning_rate": 9.199127988304607e-06,
"loss": 0.0284,
"step": 15460
},
{
"epoch": 14.832214765100671,
"grad_norm": 0.2771013379096985,
"learning_rate": 9.148887146716812e-06,
"loss": 0.0283,
"step": 15470
},
{
"epoch": 14.841802492809204,
"grad_norm": 0.17078572511672974,
"learning_rate": 9.09877005624249e-06,
"loss": 0.0294,
"step": 15480
},
{
"epoch": 14.851390220517738,
"grad_norm": 0.17408467829227448,
"learning_rate": 9.048776868702347e-06,
"loss": 0.0255,
"step": 15490
},
{
"epoch": 14.860977948226271,
"grad_norm": 0.20527216792106628,
"learning_rate": 8.998907735541789e-06,
"loss": 0.0329,
"step": 15500
},
{
"epoch": 14.870565675934804,
"grad_norm": 0.23558159172534943,
"learning_rate": 8.94916280783038e-06,
"loss": 0.0294,
"step": 15510
},
{
"epoch": 14.880153403643337,
"grad_norm": 0.16163650155067444,
"learning_rate": 8.89954223626146e-06,
"loss": 0.0264,
"step": 15520
},
{
"epoch": 14.889741131351869,
"grad_norm": 0.2564382255077362,
"learning_rate": 8.850046171151666e-06,
"loss": 0.0332,
"step": 15530
},
{
"epoch": 14.899328859060402,
"grad_norm": 0.2050989419221878,
"learning_rate": 8.80067476244042e-06,
"loss": 0.0307,
"step": 15540
},
{
"epoch": 14.908916586768935,
"grad_norm": 0.18448740243911743,
"learning_rate": 8.751428159689528e-06,
"loss": 0.0306,
"step": 15550
},
{
"epoch": 14.918504314477468,
"grad_norm": 0.29133155941963196,
"learning_rate": 8.702306512082753e-06,
"loss": 0.0243,
"step": 15560
},
{
"epoch": 14.928092042186002,
"grad_norm": 0.141392782330513,
"learning_rate": 8.653309968425322e-06,
"loss": 0.0242,
"step": 15570
},
{
"epoch": 14.937679769894535,
"grad_norm": 0.21134333312511444,
"learning_rate": 8.60443867714345e-06,
"loss": 0.0318,
"step": 15580
},
{
"epoch": 14.947267497603068,
"grad_norm": 0.2590806484222412,
"learning_rate": 8.55569278628393e-06,
"loss": 0.0253,
"step": 15590
},
{
"epoch": 14.956855225311601,
"grad_norm": 0.21871857345104218,
"learning_rate": 8.507072443513702e-06,
"loss": 0.0258,
"step": 15600
},
{
"epoch": 14.966442953020135,
"grad_norm": 0.25187286734580994,
"learning_rate": 8.458577796119382e-06,
"loss": 0.03,
"step": 15610
},
{
"epoch": 14.976030680728668,
"grad_norm": 0.17888393998146057,
"learning_rate": 8.410208991006784e-06,
"loss": 0.0274,
"step": 15620
},
{
"epoch": 14.985618408437201,
"grad_norm": 0.1486871838569641,
"learning_rate": 8.361966174700514e-06,
"loss": 0.0269,
"step": 15630
},
{
"epoch": 14.995206136145734,
"grad_norm": 0.6585232019424438,
"learning_rate": 8.31384949334353e-06,
"loss": 0.0294,
"step": 15640
},
{
"epoch": 15.004793863854266,
"grad_norm": 0.36748427152633667,
"learning_rate": 8.265859092696686e-06,
"loss": 0.0318,
"step": 15650
},
{
"epoch": 15.014381591562799,
"grad_norm": 0.22082515060901642,
"learning_rate": 8.217995118138294e-06,
"loss": 0.0294,
"step": 15660
},
{
"epoch": 15.023969319271332,
"grad_norm": 0.1767498254776001,
"learning_rate": 8.170257714663642e-06,
"loss": 0.0275,
"step": 15670
},
{
"epoch": 15.033557046979865,
"grad_norm": 0.24185898900032043,
"learning_rate": 8.12264702688465e-06,
"loss": 0.0279,
"step": 15680
},
{
"epoch": 15.043144774688399,
"grad_norm": 0.22703923285007477,
"learning_rate": 8.075163199029357e-06,
"loss": 0.0268,
"step": 15690
},
{
"epoch": 15.052732502396932,
"grad_norm": 0.2051907479763031,
"learning_rate": 8.027806374941481e-06,
"loss": 0.0272,
"step": 15700
},
{
"epoch": 15.062320230105465,
"grad_norm": 0.24761435389518738,
"learning_rate": 7.980576698080005e-06,
"loss": 0.0301,
"step": 15710
},
{
"epoch": 15.071907957813998,
"grad_norm": 0.17438143491744995,
"learning_rate": 7.933474311518796e-06,
"loss": 0.0351,
"step": 15720
},
{
"epoch": 15.081495685522532,
"grad_norm": 0.20341135561466217,
"learning_rate": 7.88649935794606e-06,
"loss": 0.0264,
"step": 15730
},
{
"epoch": 15.091083413231065,
"grad_norm": 0.24047966301441193,
"learning_rate": 7.83965197966397e-06,
"loss": 0.0268,
"step": 15740
},
{
"epoch": 15.100671140939598,
"grad_norm": 0.19311171770095825,
"learning_rate": 7.792932318588264e-06,
"loss": 0.033,
"step": 15750
},
{
"epoch": 15.110258868648131,
"grad_norm": 0.18407687544822693,
"learning_rate": 7.746340516247779e-06,
"loss": 0.0243,
"step": 15760
},
{
"epoch": 15.119846596356663,
"grad_norm": 0.21947818994522095,
"learning_rate": 7.69987671378401e-06,
"loss": 0.0255,
"step": 15770
},
{
"epoch": 15.129434324065196,
"grad_norm": 0.4175131916999817,
"learning_rate": 7.653541051950692e-06,
"loss": 0.0245,
"step": 15780
},
{
"epoch": 15.139022051773729,
"grad_norm": 0.29046544432640076,
"learning_rate": 7.607333671113409e-06,
"loss": 0.0365,
"step": 15790
},
{
"epoch": 15.148609779482262,
"grad_norm": 0.25391921401023865,
"learning_rate": 7.561254711249127e-06,
"loss": 0.0266,
"step": 15800
},
{
"epoch": 15.158197507190796,
"grad_norm": 0.19595490396022797,
"learning_rate": 7.515304311945787e-06,
"loss": 0.0306,
"step": 15810
},
{
"epoch": 15.167785234899329,
"grad_norm": 0.1492607444524765,
"learning_rate": 7.469482612401857e-06,
"loss": 0.0306,
"step": 15820
},
{
"epoch": 15.177372962607862,
"grad_norm": 0.2468632310628891,
"learning_rate": 7.423789751425958e-06,
"loss": 0.0275,
"step": 15830
},
{
"epoch": 15.186960690316395,
"grad_norm": 0.20901519060134888,
"learning_rate": 7.378225867436428e-06,
"loss": 0.0252,
"step": 15840
},
{
"epoch": 15.196548418024928,
"grad_norm": 0.28785982728004456,
"learning_rate": 7.332791098460867e-06,
"loss": 0.0326,
"step": 15850
},
{
"epoch": 15.206136145733462,
"grad_norm": 0.2834322154521942,
"learning_rate": 7.287485582135728e-06,
"loss": 0.0302,
"step": 15860
},
{
"epoch": 15.215723873441995,
"grad_norm": 0.24561063945293427,
"learning_rate": 7.242309455705959e-06,
"loss": 0.0292,
"step": 15870
},
{
"epoch": 15.225311601150528,
"grad_norm": 0.23040306568145752,
"learning_rate": 7.197262856024539e-06,
"loss": 0.0246,
"step": 15880
},
{
"epoch": 15.234899328859061,
"grad_norm": 0.22045479714870453,
"learning_rate": 7.152345919552045e-06,
"loss": 0.0314,
"step": 15890
},
{
"epoch": 15.244487056567593,
"grad_norm": 0.2748197913169861,
"learning_rate": 7.107558782356255e-06,
"loss": 0.0292,
"step": 15900
},
{
"epoch": 15.254074784276126,
"grad_norm": 0.2709030210971832,
"learning_rate": 7.0629015801117744e-06,
"loss": 0.0299,
"step": 15910
},
{
"epoch": 15.26366251198466,
"grad_norm": 0.2666435241699219,
"learning_rate": 7.018374448099596e-06,
"loss": 0.0324,
"step": 15920
},
{
"epoch": 15.273250239693192,
"grad_norm": 0.32848596572875977,
"learning_rate": 6.973977521206654e-06,
"loss": 0.0344,
"step": 15930
},
{
"epoch": 15.282837967401726,
"grad_norm": 0.23068153858184814,
"learning_rate": 6.929710933925487e-06,
"loss": 0.0262,
"step": 15940
},
{
"epoch": 15.292425695110259,
"grad_norm": 0.24479450285434723,
"learning_rate": 6.885574820353752e-06,
"loss": 0.0269,
"step": 15950
},
{
"epoch": 15.302013422818792,
"grad_norm": 0.21294337511062622,
"learning_rate": 6.841569314193902e-06,
"loss": 0.0265,
"step": 15960
},
{
"epoch": 15.311601150527325,
"grad_norm": 0.28778862953186035,
"learning_rate": 6.797694548752703e-06,
"loss": 0.0273,
"step": 15970
},
{
"epoch": 15.321188878235859,
"grad_norm": 0.189237579703331,
"learning_rate": 6.753950656940905e-06,
"loss": 0.0267,
"step": 15980
},
{
"epoch": 15.330776605944392,
"grad_norm": 0.28015297651290894,
"learning_rate": 6.710337771272745e-06,
"loss": 0.034,
"step": 15990
},
{
"epoch": 15.340364333652925,
"grad_norm": 0.1625533103942871,
"learning_rate": 6.666856023865658e-06,
"loss": 0.0233,
"step": 16000
},
{
"epoch": 15.349952061361458,
"grad_norm": 0.21412205696105957,
"learning_rate": 6.623505546439773e-06,
"loss": 0.0253,
"step": 16010
},
{
"epoch": 15.35953978906999,
"grad_norm": 0.26244086027145386,
"learning_rate": 6.580286470317598e-06,
"loss": 0.0256,
"step": 16020
},
{
"epoch": 15.369127516778523,
"grad_norm": 0.28637972474098206,
"learning_rate": 6.537198926423549e-06,
"loss": 0.0283,
"step": 16030
},
{
"epoch": 15.378715244487056,
"grad_norm": 0.2678770124912262,
"learning_rate": 6.494243045283621e-06,
"loss": 0.0271,
"step": 16040
},
{
"epoch": 15.38830297219559,
"grad_norm": 0.1962299942970276,
"learning_rate": 6.45141895702493e-06,
"loss": 0.0258,
"step": 16050
},
{
"epoch": 15.397890699904123,
"grad_norm": 0.26651138067245483,
"learning_rate": 6.40872679137538e-06,
"loss": 0.0276,
"step": 16060
},
{
"epoch": 15.407478427612656,
"grad_norm": 0.23737022280693054,
"learning_rate": 6.366166677663204e-06,
"loss": 0.0309,
"step": 16070
},
{
"epoch": 15.417066155321189,
"grad_norm": 0.2531161606311798,
"learning_rate": 6.323738744816654e-06,
"loss": 0.0329,
"step": 16080
},
{
"epoch": 15.426653883029722,
"grad_norm": 0.26035356521606445,
"learning_rate": 6.2814431213635065e-06,
"loss": 0.0286,
"step": 16090
},
{
"epoch": 15.436241610738255,
"grad_norm": 0.2163701057434082,
"learning_rate": 6.239279935430786e-06,
"loss": 0.027,
"step": 16100
},
{
"epoch": 15.445829338446789,
"grad_norm": 0.18169005215168,
"learning_rate": 6.197249314744275e-06,
"loss": 0.024,
"step": 16110
},
{
"epoch": 15.455417066155322,
"grad_norm": 0.24503251910209656,
"learning_rate": 6.155351386628205e-06,
"loss": 0.0298,
"step": 16120
},
{
"epoch": 15.465004793863855,
"grad_norm": 0.19895343482494354,
"learning_rate": 6.113586278004835e-06,
"loss": 0.0233,
"step": 16130
},
{
"epoch": 15.474592521572387,
"grad_norm": 0.2949654459953308,
"learning_rate": 6.071954115394063e-06,
"loss": 0.0256,
"step": 16140
},
{
"epoch": 15.48418024928092,
"grad_norm": 0.13835924863815308,
"learning_rate": 6.030455024913029e-06,
"loss": 0.029,
"step": 16150
},
{
"epoch": 15.493767976989453,
"grad_norm": 0.36957499384880066,
"learning_rate": 5.989089132275799e-06,
"loss": 0.0369,
"step": 16160
},
{
"epoch": 15.503355704697986,
"grad_norm": 0.22811642289161682,
"learning_rate": 5.947856562792925e-06,
"loss": 0.0306,
"step": 16170
},
{
"epoch": 15.51294343240652,
"grad_norm": 0.3362506330013275,
"learning_rate": 5.906757441371069e-06,
"loss": 0.0346,
"step": 16180
},
{
"epoch": 15.522531160115053,
"grad_norm": 0.20575332641601562,
"learning_rate": 5.865791892512623e-06,
"loss": 0.0305,
"step": 16190
},
{
"epoch": 15.532118887823586,
"grad_norm": 0.1870652139186859,
"learning_rate": 5.824960040315386e-06,
"loss": 0.0253,
"step": 16200
},
{
"epoch": 15.541706615532119,
"grad_norm": 0.4694177508354187,
"learning_rate": 5.784262008472124e-06,
"loss": 0.0287,
"step": 16210
},
{
"epoch": 15.551294343240652,
"grad_norm": 0.2506779134273529,
"learning_rate": 5.7436979202702194e-06,
"loss": 0.0331,
"step": 16220
},
{
"epoch": 15.560882070949186,
"grad_norm": 0.18632706999778748,
"learning_rate": 5.703267898591275e-06,
"loss": 0.0234,
"step": 16230
},
{
"epoch": 15.570469798657719,
"grad_norm": 0.14531591534614563,
"learning_rate": 5.662972065910799e-06,
"loss": 0.0245,
"step": 16240
},
{
"epoch": 15.580057526366252,
"grad_norm": 0.19370119273662567,
"learning_rate": 5.622810544297796e-06,
"loss": 0.0262,
"step": 16250
},
{
"epoch": 15.589645254074785,
"grad_norm": 0.2350122630596161,
"learning_rate": 5.582783455414375e-06,
"loss": 0.0262,
"step": 16260
},
{
"epoch": 15.599232981783317,
"grad_norm": 0.2912338078022003,
"learning_rate": 5.5428909205154035e-06,
"loss": 0.0284,
"step": 16270
},
{
"epoch": 15.60882070949185,
"grad_norm": 0.28382018208503723,
"learning_rate": 5.503133060448168e-06,
"loss": 0.0257,
"step": 16280
},
{
"epoch": 15.618408437200383,
"grad_norm": 0.1536964774131775,
"learning_rate": 5.463509995651978e-06,
"loss": 0.0274,
"step": 16290
},
{
"epoch": 15.627996164908916,
"grad_norm": 0.5844811201095581,
"learning_rate": 5.4240218461577894e-06,
"loss": 0.0294,
"step": 16300
},
{
"epoch": 15.63758389261745,
"grad_norm": 0.2484215646982193,
"learning_rate": 5.384668731587844e-06,
"loss": 0.0278,
"step": 16310
},
{
"epoch": 15.647171620325983,
"grad_norm": 0.2738986015319824,
"learning_rate": 5.345450771155358e-06,
"loss": 0.0271,
"step": 16320
},
{
"epoch": 15.656759348034516,
"grad_norm": 0.23017966747283936,
"learning_rate": 5.3063680836641095e-06,
"loss": 0.0261,
"step": 16330
},
{
"epoch": 15.66634707574305,
"grad_norm": 0.1773134022951126,
"learning_rate": 5.2674207875080595e-06,
"loss": 0.03,
"step": 16340
},
{
"epoch": 15.675934803451582,
"grad_norm": 0.1907745748758316,
"learning_rate": 5.228609000671081e-06,
"loss": 0.0224,
"step": 16350
},
{
"epoch": 15.685522531160116,
"grad_norm": 0.2307148277759552,
"learning_rate": 5.1899328407264855e-06,
"loss": 0.0294,
"step": 16360
},
{
"epoch": 15.695110258868649,
"grad_norm": 0.3302120566368103,
"learning_rate": 5.151392424836782e-06,
"loss": 0.0292,
"step": 16370
},
{
"epoch": 15.70469798657718,
"grad_norm": 0.2139192521572113,
"learning_rate": 5.112987869753216e-06,
"loss": 0.0296,
"step": 16380
},
{
"epoch": 15.714285714285714,
"grad_norm": 0.16015082597732544,
"learning_rate": 5.074719291815522e-06,
"loss": 0.029,
"step": 16390
},
{
"epoch": 15.723873441994247,
"grad_norm": 0.19606702029705048,
"learning_rate": 5.036586806951465e-06,
"loss": 0.029,
"step": 16400
},
{
"epoch": 15.73346116970278,
"grad_norm": 0.30746451020240784,
"learning_rate": 4.998590530676584e-06,
"loss": 0.0285,
"step": 16410
},
{
"epoch": 15.743048897411313,
"grad_norm": 0.16113652288913727,
"learning_rate": 4.960730578093753e-06,
"loss": 0.028,
"step": 16420
},
{
"epoch": 15.752636625119846,
"grad_norm": 0.23624086380004883,
"learning_rate": 4.923007063892926e-06,
"loss": 0.0251,
"step": 16430
},
{
"epoch": 15.76222435282838,
"grad_norm": 0.19934307038784027,
"learning_rate": 4.885420102350696e-06,
"loss": 0.0238,
"step": 16440
},
{
"epoch": 15.771812080536913,
"grad_norm": 0.2440912276506424,
"learning_rate": 4.847969807330038e-06,
"loss": 0.0231,
"step": 16450
},
{
"epoch": 15.781399808245446,
"grad_norm": 0.2768200933933258,
"learning_rate": 4.810656292279875e-06,
"loss": 0.0268,
"step": 16460
},
{
"epoch": 15.79098753595398,
"grad_norm": 0.29489603638648987,
"learning_rate": 4.773479670234821e-06,
"loss": 0.0358,
"step": 16470
},
{
"epoch": 15.800575263662513,
"grad_norm": 0.26058635115623474,
"learning_rate": 4.7364400538147665e-06,
"loss": 0.0272,
"step": 16480
},
{
"epoch": 15.810162991371046,
"grad_norm": 0.19268332421779633,
"learning_rate": 4.699537555224598e-06,
"loss": 0.028,
"step": 16490
},
{
"epoch": 15.819750719079579,
"grad_norm": 0.27744096517562866,
"learning_rate": 4.6627722862537915e-06,
"loss": 0.0278,
"step": 16500
},
{
"epoch": 15.82933844678811,
"grad_norm": 0.3575479984283447,
"learning_rate": 4.626144358276147e-06,
"loss": 0.0275,
"step": 16510
},
{
"epoch": 15.838926174496644,
"grad_norm": 0.20007503032684326,
"learning_rate": 4.589653882249378e-06,
"loss": 0.0309,
"step": 16520
},
{
"epoch": 15.848513902205177,
"grad_norm": 0.20804741978645325,
"learning_rate": 4.553300968714841e-06,
"loss": 0.0249,
"step": 16530
},
{
"epoch": 15.85810162991371,
"grad_norm": 0.2726737856864929,
"learning_rate": 4.5170857277971765e-06,
"loss": 0.0259,
"step": 16540
},
{
"epoch": 15.867689357622243,
"grad_norm": 0.21122261881828308,
"learning_rate": 4.48100826920394e-06,
"loss": 0.029,
"step": 16550
},
{
"epoch": 15.877277085330777,
"grad_norm": 0.28613051772117615,
"learning_rate": 4.4450687022253135e-06,
"loss": 0.0255,
"step": 16560
},
{
"epoch": 15.88686481303931,
"grad_norm": 0.2184969037771225,
"learning_rate": 4.409267135733764e-06,
"loss": 0.0233,
"step": 16570
},
{
"epoch": 15.896452540747843,
"grad_norm": 0.19320517778396606,
"learning_rate": 4.37360367818373e-06,
"loss": 0.0271,
"step": 16580
},
{
"epoch": 15.906040268456376,
"grad_norm": 0.18892447650432587,
"learning_rate": 4.338078437611237e-06,
"loss": 0.0265,
"step": 16590
},
{
"epoch": 15.91562799616491,
"grad_norm": 0.23824314773082733,
"learning_rate": 4.3026915216336225e-06,
"loss": 0.0269,
"step": 16600
},
{
"epoch": 15.925215723873443,
"grad_norm": 0.1431523561477661,
"learning_rate": 4.267443037449198e-06,
"loss": 0.0269,
"step": 16610
},
{
"epoch": 15.934803451581976,
"grad_norm": 0.22107666730880737,
"learning_rate": 4.232333091836932e-06,
"loss": 0.0293,
"step": 16620
},
{
"epoch": 15.944391179290509,
"grad_norm": 0.27542436122894287,
"learning_rate": 4.197361791156096e-06,
"loss": 0.03,
"step": 16630
},
{
"epoch": 15.95397890699904,
"grad_norm": 0.234486922621727,
"learning_rate": 4.162529241345958e-06,
"loss": 0.0325,
"step": 16640
},
{
"epoch": 15.963566634707574,
"grad_norm": 0.24536362290382385,
"learning_rate": 4.127835547925479e-06,
"loss": 0.0211,
"step": 16650
},
{
"epoch": 15.973154362416107,
"grad_norm": 0.2566201686859131,
"learning_rate": 4.093280815992989e-06,
"loss": 0.0244,
"step": 16660
},
{
"epoch": 15.98274209012464,
"grad_norm": 0.3387947380542755,
"learning_rate": 4.058865150225833e-06,
"loss": 0.0279,
"step": 16670
},
{
"epoch": 15.992329817833173,
"grad_norm": 0.5632581114768982,
"learning_rate": 4.024588654880079e-06,
"loss": 0.0298,
"step": 16680
},
{
"epoch": 16.001917545541705,
"grad_norm": 0.2585551142692566,
"learning_rate": 3.990451433790254e-06,
"loss": 0.0313,
"step": 16690
},
{
"epoch": 16.01150527325024,
"grad_norm": 0.2654295563697815,
"learning_rate": 3.956453590368914e-06,
"loss": 0.0258,
"step": 16700
},
{
"epoch": 16.02109300095877,
"grad_norm": 0.243434339761734,
"learning_rate": 3.922595227606435e-06,
"loss": 0.0263,
"step": 16710
},
{
"epoch": 16.030680728667306,
"grad_norm": 0.23672133684158325,
"learning_rate": 3.8888764480706276e-06,
"loss": 0.029,
"step": 16720
},
{
"epoch": 16.040268456375838,
"grad_norm": 0.28110471367836,
"learning_rate": 3.855297353906512e-06,
"loss": 0.0313,
"step": 16730
},
{
"epoch": 16.049856184084373,
"grad_norm": 0.17387288808822632,
"learning_rate": 3.821858046835913e-06,
"loss": 0.0263,
"step": 16740
},
{
"epoch": 16.059443911792904,
"grad_norm": 0.16623635590076447,
"learning_rate": 3.7885586281572016e-06,
"loss": 0.0234,
"step": 16750
},
{
"epoch": 16.06903163950144,
"grad_norm": 0.20889221131801605,
"learning_rate": 3.7553991987449912e-06,
"loss": 0.0198,
"step": 16760
},
{
"epoch": 16.07861936720997,
"grad_norm": 0.2764891982078552,
"learning_rate": 3.7223798590498403e-06,
"loss": 0.0306,
"step": 16770
},
{
"epoch": 16.088207094918506,
"grad_norm": 0.17139260470867157,
"learning_rate": 3.689500709097893e-06,
"loss": 0.0204,
"step": 16780
},
{
"epoch": 16.097794822627037,
"grad_norm": 0.25818943977355957,
"learning_rate": 3.6567618484906307e-06,
"loss": 0.0243,
"step": 16790
},
{
"epoch": 16.107382550335572,
"grad_norm": 0.33521944284439087,
"learning_rate": 3.6241633764045545e-06,
"loss": 0.0289,
"step": 16800
},
{
"epoch": 16.116970278044104,
"grad_norm": 0.23774349689483643,
"learning_rate": 3.591705391590905e-06,
"loss": 0.0284,
"step": 16810
},
{
"epoch": 16.126558005752635,
"grad_norm": 0.17396867275238037,
"learning_rate": 3.5593879923753015e-06,
"loss": 0.0292,
"step": 16820
},
{
"epoch": 16.13614573346117,
"grad_norm": 0.32836684584617615,
"learning_rate": 3.5272112766574993e-06,
"loss": 0.0261,
"step": 16830
},
{
"epoch": 16.1457334611697,
"grad_norm": 0.2727390229701996,
"learning_rate": 3.4951753419110943e-06,
"loss": 0.0294,
"step": 16840
},
{
"epoch": 16.155321188878236,
"grad_norm": 0.36386972665786743,
"learning_rate": 3.4632802851832013e-06,
"loss": 0.0256,
"step": 16850
},
{
"epoch": 16.164908916586768,
"grad_norm": 0.20322419703006744,
"learning_rate": 3.431526203094171e-06,
"loss": 0.0242,
"step": 16860
},
{
"epoch": 16.174496644295303,
"grad_norm": 0.23579928278923035,
"learning_rate": 3.3999131918372785e-06,
"loss": 0.03,
"step": 16870
},
{
"epoch": 16.184084372003834,
"grad_norm": 0.20980890095233917,
"learning_rate": 3.3684413471784804e-06,
"loss": 0.0281,
"step": 16880
},
{
"epoch": 16.19367209971237,
"grad_norm": 0.17388616502285004,
"learning_rate": 3.3371107644560805e-06,
"loss": 0.0312,
"step": 16890
},
{
"epoch": 16.2032598274209,
"grad_norm": 0.43162086606025696,
"learning_rate": 3.3059215385804585e-06,
"loss": 0.0281,
"step": 16900
},
{
"epoch": 16.212847555129436,
"grad_norm": 0.21873044967651367,
"learning_rate": 3.274873764033759e-06,
"loss": 0.0255,
"step": 16910
},
{
"epoch": 16.222435282837967,
"grad_norm": 0.2102050930261612,
"learning_rate": 3.243967534869652e-06,
"loss": 0.0272,
"step": 16920
},
{
"epoch": 16.232023010546502,
"grad_norm": 0.21298690140247345,
"learning_rate": 3.213202944713023e-06,
"loss": 0.0261,
"step": 16930
},
{
"epoch": 16.241610738255034,
"grad_norm": 0.30388498306274414,
"learning_rate": 3.1825800867596566e-06,
"loss": 0.0338,
"step": 16940
},
{
"epoch": 16.251198465963565,
"grad_norm": 0.2536049485206604,
"learning_rate": 3.152099053776014e-06,
"loss": 0.0292,
"step": 16950
},
{
"epoch": 16.2607861936721,
"grad_norm": 0.2809562385082245,
"learning_rate": 3.121759938098906e-06,
"loss": 0.0262,
"step": 16960
},
{
"epoch": 16.27037392138063,
"grad_norm": 0.2241629660129547,
"learning_rate": 3.091562831635253e-06,
"loss": 0.0288,
"step": 16970
},
{
"epoch": 16.279961649089167,
"grad_norm": 0.1237056627869606,
"learning_rate": 3.061507825861748e-06,
"loss": 0.0209,
"step": 16980
},
{
"epoch": 16.289549376797698,
"grad_norm": 0.13440051674842834,
"learning_rate": 3.031595011824656e-06,
"loss": 0.0273,
"step": 16990
},
{
"epoch": 16.299137104506233,
"grad_norm": 0.28445371985435486,
"learning_rate": 3.0018244801394535e-06,
"loss": 0.034,
"step": 17000
},
{
"epoch": 16.308724832214764,
"grad_norm": 0.3177470862865448,
"learning_rate": 2.9721963209906502e-06,
"loss": 0.0301,
"step": 17010
},
{
"epoch": 16.3183125599233,
"grad_norm": 0.1341092437505722,
"learning_rate": 2.942710624131412e-06,
"loss": 0.0266,
"step": 17020
},
{
"epoch": 16.32790028763183,
"grad_norm": 0.19116052985191345,
"learning_rate": 2.9133674788833833e-06,
"loss": 0.0311,
"step": 17030
},
{
"epoch": 16.337488015340366,
"grad_norm": 0.1874174177646637,
"learning_rate": 2.884166974136343e-06,
"loss": 0.0236,
"step": 17040
},
{
"epoch": 16.347075743048897,
"grad_norm": 0.36720889806747437,
"learning_rate": 2.855109198347983e-06,
"loss": 0.0278,
"step": 17050
},
{
"epoch": 16.35666347075743,
"grad_norm": 0.38599368929862976,
"learning_rate": 2.826194239543617e-06,
"loss": 0.0323,
"step": 17060
},
{
"epoch": 16.366251198465964,
"grad_norm": 0.19532305002212524,
"learning_rate": 2.797422185315929e-06,
"loss": 0.0222,
"step": 17070
},
{
"epoch": 16.375838926174495,
"grad_norm": 0.2218206375837326,
"learning_rate": 2.768793122824681e-06,
"loss": 0.0255,
"step": 17080
},
{
"epoch": 16.38542665388303,
"grad_norm": 0.3124590516090393,
"learning_rate": 2.740307138796483e-06,
"loss": 0.0249,
"step": 17090
},
{
"epoch": 16.39501438159156,
"grad_norm": 0.21726781129837036,
"learning_rate": 2.7119643195245238e-06,
"loss": 0.0218,
"step": 17100
},
{
"epoch": 16.404602109300097,
"grad_norm": 0.5927583575248718,
"learning_rate": 2.683764750868273e-06,
"loss": 0.0263,
"step": 17110
},
{
"epoch": 16.414189837008628,
"grad_norm": 0.28960007429122925,
"learning_rate": 2.6557085182532582e-06,
"loss": 0.0291,
"step": 17120
},
{
"epoch": 16.423777564717163,
"grad_norm": 0.35697048902511597,
"learning_rate": 2.6277957066708047e-06,
"loss": 0.0273,
"step": 17130
},
{
"epoch": 16.433365292425695,
"grad_norm": 0.2136591225862503,
"learning_rate": 2.6000264006777743e-06,
"loss": 0.0325,
"step": 17140
},
{
"epoch": 16.44295302013423,
"grad_norm": 0.3051040768623352,
"learning_rate": 2.5724006843962866e-06,
"loss": 0.0298,
"step": 17150
},
{
"epoch": 16.45254074784276,
"grad_norm": 0.1534937173128128,
"learning_rate": 2.5449186415134885e-06,
"loss": 0.0263,
"step": 17160
},
{
"epoch": 16.462128475551296,
"grad_norm": 0.17988426983356476,
"learning_rate": 2.5175803552812906e-06,
"loss": 0.0278,
"step": 17170
},
{
"epoch": 16.471716203259827,
"grad_norm": 0.48748767375946045,
"learning_rate": 2.490385908516141e-06,
"loss": 0.0308,
"step": 17180
},
{
"epoch": 16.48130393096836,
"grad_norm": 0.191914901137352,
"learning_rate": 2.463335383598725e-06,
"loss": 0.0303,
"step": 17190
},
{
"epoch": 16.490891658676894,
"grad_norm": 0.21671634912490845,
"learning_rate": 2.4364288624737442e-06,
"loss": 0.0276,
"step": 17200
},
{
"epoch": 16.500479386385425,
"grad_norm": 0.13923166692256927,
"learning_rate": 2.4096664266496814e-06,
"loss": 0.0331,
"step": 17210
},
{
"epoch": 16.51006711409396,
"grad_norm": 0.20780488848686218,
"learning_rate": 2.3830481571985365e-06,
"loss": 0.0243,
"step": 17220
},
{
"epoch": 16.51965484180249,
"grad_norm": 0.39643654227256775,
"learning_rate": 2.3565741347555792e-06,
"loss": 0.0289,
"step": 17230
},
{
"epoch": 16.529242569511027,
"grad_norm": 0.18083330988883972,
"learning_rate": 2.3302444395190915e-06,
"loss": 0.0216,
"step": 17240
},
{
"epoch": 16.538830297219558,
"grad_norm": 0.1432444006204605,
"learning_rate": 2.3040591512501765e-06,
"loss": 0.0318,
"step": 17250
},
{
"epoch": 16.548418024928093,
"grad_norm": 0.2874661386013031,
"learning_rate": 2.278018349272465e-06,
"loss": 0.0279,
"step": 17260
},
{
"epoch": 16.558005752636625,
"grad_norm": 0.2093266099691391,
"learning_rate": 2.2521221124718826e-06,
"loss": 0.0226,
"step": 17270
},
{
"epoch": 16.56759348034516,
"grad_norm": 0.3234308063983917,
"learning_rate": 2.2263705192964334e-06,
"loss": 0.0295,
"step": 17280
},
{
"epoch": 16.57718120805369,
"grad_norm": 0.6225463151931763,
"learning_rate": 2.2007636477559436e-06,
"loss": 0.031,
"step": 17290
},
{
"epoch": 16.586768935762223,
"grad_norm": 0.31777986884117126,
"learning_rate": 2.1753015754218453e-06,
"loss": 0.0311,
"step": 17300
},
{
"epoch": 16.596356663470758,
"grad_norm": 0.2332683950662613,
"learning_rate": 2.149984379426906e-06,
"loss": 0.0263,
"step": 17310
},
{
"epoch": 16.60594439117929,
"grad_norm": 0.23592767119407654,
"learning_rate": 2.1248121364650265e-06,
"loss": 0.0229,
"step": 17320
},
{
"epoch": 16.615532118887824,
"grad_norm": 0.4014437198638916,
"learning_rate": 2.0997849227909983e-06,
"loss": 0.026,
"step": 17330
},
{
"epoch": 16.625119846596355,
"grad_norm": 0.18571177124977112,
"learning_rate": 2.0749028142202807e-06,
"loss": 0.0281,
"step": 17340
},
{
"epoch": 16.63470757430489,
"grad_norm": 0.2480279952287674,
"learning_rate": 2.050165886128741e-06,
"loss": 0.0283,
"step": 17350
},
{
"epoch": 16.644295302013422,
"grad_norm": 0.20139874517917633,
"learning_rate": 2.0255742134524804e-06,
"loss": 0.0263,
"step": 17360
},
{
"epoch": 16.653883029721957,
"grad_norm": 0.18241684138774872,
"learning_rate": 2.001127870687541e-06,
"loss": 0.0206,
"step": 17370
},
{
"epoch": 16.66347075743049,
"grad_norm": 0.26072490215301514,
"learning_rate": 1.9768269318897414e-06,
"loss": 0.0251,
"step": 17380
},
{
"epoch": 16.673058485139023,
"grad_norm": 0.33512383699417114,
"learning_rate": 1.9526714706744055e-06,
"loss": 0.0282,
"step": 17390
},
{
"epoch": 16.682646212847555,
"grad_norm": 0.279745876789093,
"learning_rate": 1.928661560216172e-06,
"loss": 0.0233,
"step": 17400
},
{
"epoch": 16.69223394055609,
"grad_norm": 0.2306470274925232,
"learning_rate": 1.904797273248754e-06,
"loss": 0.0272,
"step": 17410
},
{
"epoch": 16.70182166826462,
"grad_norm": 0.14322997629642487,
"learning_rate": 1.8810786820647242e-06,
"loss": 0.0272,
"step": 17420
},
{
"epoch": 16.711409395973153,
"grad_norm": 0.25938233733177185,
"learning_rate": 1.8575058585152905e-06,
"loss": 0.0308,
"step": 17430
},
{
"epoch": 16.720997123681688,
"grad_norm": 0.23380053043365479,
"learning_rate": 1.8340788740101034e-06,
"loss": 0.028,
"step": 17440
},
{
"epoch": 16.73058485139022,
"grad_norm": 0.27241095900535583,
"learning_rate": 1.810797799517e-06,
"loss": 0.0293,
"step": 17450
},
{
"epoch": 16.740172579098754,
"grad_norm": 0.24621997773647308,
"learning_rate": 1.7876627055618155e-06,
"loss": 0.0258,
"step": 17460
},
{
"epoch": 16.749760306807286,
"grad_norm": 0.15812641382217407,
"learning_rate": 1.7646736622281667e-06,
"loss": 0.0259,
"step": 17470
},
{
"epoch": 16.75934803451582,
"grad_norm": 0.18936626613140106,
"learning_rate": 1.7418307391572354e-06,
"loss": 0.026,
"step": 17480
},
{
"epoch": 16.768935762224352,
"grad_norm": 0.16878223419189453,
"learning_rate": 1.7191340055475513e-06,
"loss": 0.0281,
"step": 17490
},
{
"epoch": 16.778523489932887,
"grad_norm": 0.18892349302768707,
"learning_rate": 1.696583530154794e-06,
"loss": 0.0259,
"step": 17500
},
{
"epoch": 16.78811121764142,
"grad_norm": 0.243266299366951,
"learning_rate": 1.6741793812915907e-06,
"loss": 0.0248,
"step": 17510
},
{
"epoch": 16.797698945349953,
"grad_norm": 0.20740211009979248,
"learning_rate": 1.6519216268272796e-06,
"loss": 0.0264,
"step": 17520
},
{
"epoch": 16.807286673058485,
"grad_norm": 0.16220887005329132,
"learning_rate": 1.6298103341877369e-06,
"loss": 0.0226,
"step": 17530
},
{
"epoch": 16.81687440076702,
"grad_norm": 0.3126187026500702,
"learning_rate": 1.6078455703551486e-06,
"loss": 0.0326,
"step": 17540
},
{
"epoch": 16.82646212847555,
"grad_norm": 0.1612725555896759,
"learning_rate": 1.5860274018678345e-06,
"loss": 0.0327,
"step": 17550
},
{
"epoch": 16.836049856184083,
"grad_norm": 0.20316867530345917,
"learning_rate": 1.5643558948200131e-06,
"loss": 0.0252,
"step": 17560
},
{
"epoch": 16.845637583892618,
"grad_norm": 0.20207004249095917,
"learning_rate": 1.5428311148616204e-06,
"loss": 0.0298,
"step": 17570
},
{
"epoch": 16.85522531160115,
"grad_norm": 0.2780834436416626,
"learning_rate": 1.5214531271981192e-06,
"loss": 0.026,
"step": 17580
},
{
"epoch": 16.864813039309684,
"grad_norm": 0.3551330268383026,
"learning_rate": 1.5002219965902896e-06,
"loss": 0.0255,
"step": 17590
},
{
"epoch": 16.874400767018216,
"grad_norm": 0.23651057481765747,
"learning_rate": 1.4791377873540235e-06,
"loss": 0.0274,
"step": 17600
},
{
"epoch": 16.88398849472675,
"grad_norm": 0.19430945813655853,
"learning_rate": 1.4582005633601515e-06,
"loss": 0.0232,
"step": 17610
},
{
"epoch": 16.893576222435282,
"grad_norm": 0.21821914613246918,
"learning_rate": 1.437410388034227e-06,
"loss": 0.0278,
"step": 17620
},
{
"epoch": 16.903163950143817,
"grad_norm": 0.23415020108222961,
"learning_rate": 1.4167673243563717e-06,
"loss": 0.0331,
"step": 17630
},
{
"epoch": 16.91275167785235,
"grad_norm": 0.207551971077919,
"learning_rate": 1.3962714348610295e-06,
"loss": 0.0305,
"step": 17640
},
{
"epoch": 16.922339405560884,
"grad_norm": 0.28280988335609436,
"learning_rate": 1.3759227816368182e-06,
"loss": 0.0297,
"step": 17650
},
{
"epoch": 16.931927133269415,
"grad_norm": 0.24366876482963562,
"learning_rate": 1.3557214263263286e-06,
"loss": 0.0247,
"step": 17660
},
{
"epoch": 16.941514860977946,
"grad_norm": 0.20423495769500732,
"learning_rate": 1.3356674301259532e-06,
"loss": 0.0263,
"step": 17670
},
{
"epoch": 16.95110258868648,
"grad_norm": 0.19706788659095764,
"learning_rate": 1.3157608537856582e-06,
"loss": 0.0297,
"step": 17680
},
{
"epoch": 16.960690316395013,
"grad_norm": 0.2174736112356186,
"learning_rate": 1.2960017576088446e-06,
"loss": 0.0278,
"step": 17690
},
{
"epoch": 16.970278044103548,
"grad_norm": 0.2222086638212204,
"learning_rate": 1.2763902014521656e-06,
"loss": 0.0276,
"step": 17700
},
{
"epoch": 16.97986577181208,
"grad_norm": 0.20257794857025146,
"learning_rate": 1.2569262447252928e-06,
"loss": 0.034,
"step": 17710
},
{
"epoch": 16.989453499520614,
"grad_norm": 0.2699783146381378,
"learning_rate": 1.2376099463907887e-06,
"loss": 0.0226,
"step": 17720
},
{
"epoch": 16.999041227229146,
"grad_norm": 0.19566196203231812,
"learning_rate": 1.2184413649639182e-06,
"loss": 0.028,
"step": 17730
},
{
"epoch": 17.00862895493768,
"grad_norm": 0.23381511867046356,
"learning_rate": 1.1994205585124652e-06,
"loss": 0.029,
"step": 17740
},
{
"epoch": 17.018216682646212,
"grad_norm": 0.19119040668010712,
"learning_rate": 1.180547584656533e-06,
"loss": 0.0239,
"step": 17750
},
{
"epoch": 17.027804410354747,
"grad_norm": 0.23085108399391174,
"learning_rate": 1.1618225005684158e-06,
"loss": 0.0275,
"step": 17760
},
{
"epoch": 17.03739213806328,
"grad_norm": 0.21077860891819,
"learning_rate": 1.1432453629723893e-06,
"loss": 0.0309,
"step": 17770
},
{
"epoch": 17.046979865771814,
"grad_norm": 0.18925194442272186,
"learning_rate": 1.124816228144565e-06,
"loss": 0.0271,
"step": 17780
},
{
"epoch": 17.056567593480345,
"grad_norm": 0.22407986223697662,
"learning_rate": 1.106535151912702e-06,
"loss": 0.0273,
"step": 17790
},
{
"epoch": 17.066155321188877,
"grad_norm": 0.21448639035224915,
"learning_rate": 1.0884021896560237e-06,
"loss": 0.0258,
"step": 17800
},
{
"epoch": 17.07574304889741,
"grad_norm": 0.24161478877067566,
"learning_rate": 1.0704173963050957e-06,
"loss": 0.0289,
"step": 17810
},
{
"epoch": 17.085330776605943,
"grad_norm": 0.1643606573343277,
"learning_rate": 1.0525808263416205e-06,
"loss": 0.0258,
"step": 17820
},
{
"epoch": 17.094918504314478,
"grad_norm": 0.2575829327106476,
"learning_rate": 1.0348925337982817e-06,
"loss": 0.0274,
"step": 17830
},
{
"epoch": 17.10450623202301,
"grad_norm": 0.1602732241153717,
"learning_rate": 1.0173525722585897e-06,
"loss": 0.0358,
"step": 17840
},
{
"epoch": 17.114093959731544,
"grad_norm": 0.23271816968917847,
"learning_rate": 9.999609948567024e-07,
"loss": 0.0373,
"step": 17850
},
{
"epoch": 17.123681687440076,
"grad_norm": 0.18822619318962097,
"learning_rate": 9.82717854277293e-07,
"loss": 0.0278,
"step": 17860
},
{
"epoch": 17.13326941514861,
"grad_norm": 0.37295079231262207,
"learning_rate": 9.656232027553558e-07,
"loss": 0.0245,
"step": 17870
},
{
"epoch": 17.142857142857142,
"grad_norm": 0.207114115357399,
"learning_rate": 9.486770920760668e-07,
"loss": 0.0237,
"step": 17880
},
{
"epoch": 17.152444870565677,
"grad_norm": 0.2382437288761139,
"learning_rate": 9.318795735746233e-07,
"loss": 0.0262,
"step": 17890
},
{
"epoch": 17.16203259827421,
"grad_norm": 0.3437121510505676,
"learning_rate": 9.152306981360992e-07,
"loss": 0.0274,
"step": 17900
},
{
"epoch": 17.171620325982744,
"grad_norm": 0.1845656931400299,
"learning_rate": 8.987305161952731e-07,
"loss": 0.0251,
"step": 17910
},
{
"epoch": 17.181208053691275,
"grad_norm": 0.2611910402774811,
"learning_rate": 8.823790777364837e-07,
"loss": 0.0263,
"step": 17920
},
{
"epoch": 17.190795781399807,
"grad_norm": 0.3325332701206207,
"learning_rate": 8.661764322934695e-07,
"loss": 0.0314,
"step": 17930
},
{
"epoch": 17.20038350910834,
"grad_norm": 0.38311854004859924,
"learning_rate": 8.50122628949257e-07,
"loss": 0.0279,
"step": 17940
},
{
"epoch": 17.209971236816873,
"grad_norm": 0.1343742161989212,
"learning_rate": 8.342177163359389e-07,
"loss": 0.028,
"step": 17950
},
{
"epoch": 17.219558964525408,
"grad_norm": 0.19379399716854095,
"learning_rate": 8.184617426346131e-07,
"loss": 0.0301,
"step": 17960
},
{
"epoch": 17.22914669223394,
"grad_norm": 0.16689153015613556,
"learning_rate": 8.028547555751553e-07,
"loss": 0.029,
"step": 17970
},
{
"epoch": 17.238734419942475,
"grad_norm": 0.45647260546684265,
"learning_rate": 7.873968024361467e-07,
"loss": 0.0307,
"step": 17980
},
{
"epoch": 17.248322147651006,
"grad_norm": 0.19029688835144043,
"learning_rate": 7.720879300446682e-07,
"loss": 0.0269,
"step": 17990
},
{
"epoch": 17.25790987535954,
"grad_norm": 0.26700901985168457,
"learning_rate": 7.569281847762122e-07,
"loss": 0.026,
"step": 18000
},
{
"epoch": 17.267497603068072,
"grad_norm": 0.20858362317085266,
"learning_rate": 7.419176125544991e-07,
"loss": 0.0304,
"step": 18010
},
{
"epoch": 17.277085330776607,
"grad_norm": 0.23115743696689606,
"learning_rate": 7.270562588513663e-07,
"loss": 0.0389,
"step": 18020
},
{
"epoch": 17.28667305848514,
"grad_norm": 0.17492881417274475,
"learning_rate": 7.123441686866183e-07,
"loss": 0.0293,
"step": 18030
},
{
"epoch": 17.29626078619367,
"grad_norm": 0.12759244441986084,
"learning_rate": 6.977813866278826e-07,
"loss": 0.0239,
"step": 18040
},
{
"epoch": 17.305848513902205,
"grad_norm": 0.18989066779613495,
"learning_rate": 6.833679567905038e-07,
"loss": 0.0292,
"step": 18050
},
{
"epoch": 17.315436241610737,
"grad_norm": 0.5339308977127075,
"learning_rate": 6.691039228373774e-07,
"loss": 0.0337,
"step": 18060
},
{
"epoch": 17.325023969319272,
"grad_norm": 0.18861901760101318,
"learning_rate": 6.549893279788277e-07,
"loss": 0.0288,
"step": 18070
},
{
"epoch": 17.334611697027803,
"grad_norm": 0.18615840375423431,
"learning_rate": 6.410242149724966e-07,
"loss": 0.0246,
"step": 18080
},
{
"epoch": 17.34419942473634,
"grad_norm": 0.1773938536643982,
"learning_rate": 6.272086261231769e-07,
"loss": 0.0272,
"step": 18090
},
{
"epoch": 17.35378715244487,
"grad_norm": 0.2144092619419098,
"learning_rate": 6.135426032827185e-07,
"loss": 0.0299,
"step": 18100
},
{
"epoch": 17.363374880153405,
"grad_norm": 0.18490025401115417,
"learning_rate": 6.000261878498947e-07,
"loss": 0.0297,
"step": 18110
},
{
"epoch": 17.372962607861936,
"grad_norm": 0.18837903439998627,
"learning_rate": 5.86659420770247e-07,
"loss": 0.0272,
"step": 18120
},
{
"epoch": 17.38255033557047,
"grad_norm": 0.2982289791107178,
"learning_rate": 5.734423425359958e-07,
"loss": 0.0314,
"step": 18130
},
{
"epoch": 17.392138063279003,
"grad_norm": 0.2356351912021637,
"learning_rate": 5.603749931859137e-07,
"loss": 0.0258,
"step": 18140
},
{
"epoch": 17.401725790987538,
"grad_norm": 0.13853472471237183,
"learning_rate": 5.474574123051912e-07,
"loss": 0.0289,
"step": 18150
},
{
"epoch": 17.41131351869607,
"grad_norm": 0.2044096440076828,
"learning_rate": 5.346896390253153e-07,
"loss": 0.0244,
"step": 18160
},
{
"epoch": 17.4209012464046,
"grad_norm": 0.33529403805732727,
"learning_rate": 5.220717120239693e-07,
"loss": 0.0282,
"step": 18170
},
{
"epoch": 17.430488974113135,
"grad_norm": 0.2302224040031433,
"learning_rate": 5.096036695248885e-07,
"loss": 0.0299,
"step": 18180
},
{
"epoch": 17.440076701821667,
"grad_norm": 0.22276417911052704,
"learning_rate": 4.972855492977823e-07,
"loss": 0.0294,
"step": 18190
},
{
"epoch": 17.449664429530202,
"grad_norm": 0.5279762744903564,
"learning_rate": 4.851173886581794e-07,
"loss": 0.0286,
"step": 18200
},
{
"epoch": 17.459252157238733,
"grad_norm": 0.22499582171440125,
"learning_rate": 4.7309922446732715e-07,
"loss": 0.0239,
"step": 18210
},
{
"epoch": 17.46883988494727,
"grad_norm": 0.2594180703163147,
"learning_rate": 4.61231093132114e-07,
"loss": 0.0275,
"step": 18220
},
{
"epoch": 17.4784276126558,
"grad_norm": 0.1713213175535202,
"learning_rate": 4.495130306049034e-07,
"loss": 0.0243,
"step": 18230
},
{
"epoch": 17.488015340364335,
"grad_norm": 0.3286925256252289,
"learning_rate": 4.3794507238347214e-07,
"loss": 0.0316,
"step": 18240
},
{
"epoch": 17.497603068072866,
"grad_norm": 0.23200523853302002,
"learning_rate": 4.2652725351085556e-07,
"loss": 0.0265,
"step": 18250
},
{
"epoch": 17.5071907957814,
"grad_norm": 0.22095492482185364,
"learning_rate": 4.1525960857530243e-07,
"loss": 0.024,
"step": 18260
},
{
"epoch": 17.516778523489933,
"grad_norm": 0.17762340605258942,
"learning_rate": 4.041421717101146e-07,
"loss": 0.0268,
"step": 18270
},
{
"epoch": 17.526366251198468,
"grad_norm": 0.2298087775707245,
"learning_rate": 3.931749765935744e-07,
"loss": 0.0257,
"step": 18280
},
{
"epoch": 17.535953978907,
"grad_norm": 0.21401867270469666,
"learning_rate": 3.8235805644882273e-07,
"loss": 0.0245,
"step": 18290
},
{
"epoch": 17.54554170661553,
"grad_norm": 0.5458080172538757,
"learning_rate": 3.716914440437813e-07,
"loss": 0.033,
"step": 18300
},
{
"epoch": 17.555129434324066,
"grad_norm": 0.17889949679374695,
"learning_rate": 3.611751716910472e-07,
"loss": 0.0303,
"step": 18310
},
{
"epoch": 17.564717162032597,
"grad_norm": 0.0861106589436531,
"learning_rate": 3.508092712477651e-07,
"loss": 0.025,
"step": 18320
},
{
"epoch": 17.574304889741132,
"grad_norm": 0.396636962890625,
"learning_rate": 3.405937741155829e-07,
"loss": 0.03,
"step": 18330
},
{
"epoch": 17.583892617449663,
"grad_norm": 0.3980105221271515,
"learning_rate": 3.30528711240502e-07,
"loss": 0.0217,
"step": 18340
},
{
"epoch": 17.5934803451582,
"grad_norm": 0.2600933313369751,
"learning_rate": 3.206141131128326e-07,
"loss": 0.0278,
"step": 18350
},
{
"epoch": 17.60306807286673,
"grad_norm": 0.20506466925144196,
"learning_rate": 3.108500097670719e-07,
"loss": 0.0216,
"step": 18360
},
{
"epoch": 17.612655800575265,
"grad_norm": 0.31107306480407715,
"learning_rate": 3.0123643078180943e-07,
"loss": 0.0296,
"step": 18370
},
{
"epoch": 17.622243528283796,
"grad_norm": 0.2587839663028717,
"learning_rate": 2.9177340527966613e-07,
"loss": 0.0265,
"step": 18380
},
{
"epoch": 17.63183125599233,
"grad_norm": 0.293157160282135,
"learning_rate": 2.824609619271723e-07,
"loss": 0.0239,
"step": 18390
},
{
"epoch": 17.641418983700863,
"grad_norm": 0.22268742322921753,
"learning_rate": 2.732991289347064e-07,
"loss": 0.0283,
"step": 18400
},
{
"epoch": 17.651006711409394,
"grad_norm": 0.21071119606494904,
"learning_rate": 2.6428793405640087e-07,
"loss": 0.0241,
"step": 18410
},
{
"epoch": 17.66059443911793,
"grad_norm": 0.25878384709358215,
"learning_rate": 2.554274045900418e-07,
"loss": 0.0224,
"step": 18420
},
{
"epoch": 17.67018216682646,
"grad_norm": 0.2513992488384247,
"learning_rate": 2.46717567377025e-07,
"loss": 0.0271,
"step": 18430
},
{
"epoch": 17.679769894534996,
"grad_norm": 0.1096489429473877,
"learning_rate": 2.381584488022337e-07,
"loss": 0.0233,
"step": 18440
},
{
"epoch": 17.689357622243527,
"grad_norm": 0.24723054468631744,
"learning_rate": 2.2975007479397738e-07,
"loss": 0.0254,
"step": 18450
},
{
"epoch": 17.698945349952062,
"grad_norm": 0.22072063386440277,
"learning_rate": 2.2149247082392522e-07,
"loss": 0.0273,
"step": 18460
},
{
"epoch": 17.708533077660594,
"grad_norm": 0.2557280957698822,
"learning_rate": 2.1338566190699517e-07,
"loss": 0.0322,
"step": 18470
},
{
"epoch": 17.71812080536913,
"grad_norm": 0.3068563938140869,
"learning_rate": 2.0542967260131497e-07,
"loss": 0.0211,
"step": 18480
},
{
"epoch": 17.72770853307766,
"grad_norm": 0.18864025175571442,
"learning_rate": 1.976245270081334e-07,
"loss": 0.028,
"step": 18490
},
{
"epoch": 17.737296260786195,
"grad_norm": 0.20000196993350983,
"learning_rate": 1.899702487717203e-07,
"loss": 0.0239,
"step": 18500
},
{
"epoch": 17.746883988494726,
"grad_norm": 0.5573348999023438,
"learning_rate": 1.8246686107935562e-07,
"loss": 0.03,
"step": 18510
},
{
"epoch": 17.75647171620326,
"grad_norm": 0.09101556986570358,
"learning_rate": 1.7511438666119594e-07,
"loss": 0.0336,
"step": 18520
},
{
"epoch": 17.766059443911793,
"grad_norm": 0.2559066712856293,
"learning_rate": 1.6791284779024696e-07,
"loss": 0.0285,
"step": 18530
},
{
"epoch": 17.775647171620324,
"grad_norm": 0.23298071324825287,
"learning_rate": 1.6086226628226898e-07,
"loss": 0.0319,
"step": 18540
},
{
"epoch": 17.78523489932886,
"grad_norm": 0.1978902518749237,
"learning_rate": 1.5396266349574362e-07,
"loss": 0.0269,
"step": 18550
},
{
"epoch": 17.79482262703739,
"grad_norm": 0.5722432732582092,
"learning_rate": 1.4721406033177954e-07,
"loss": 0.0291,
"step": 18560
},
{
"epoch": 17.804410354745926,
"grad_norm": 0.29033163189888,
"learning_rate": 1.4061647723405125e-07,
"loss": 0.0288,
"step": 18570
},
{
"epoch": 17.813998082454457,
"grad_norm": 0.19131603837013245,
"learning_rate": 1.3416993418874924e-07,
"loss": 0.0247,
"step": 18580
},
{
"epoch": 17.823585810162992,
"grad_norm": 0.25687092542648315,
"learning_rate": 1.2787445072452998e-07,
"loss": 0.0267,
"step": 18590
},
{
"epoch": 17.833173537871524,
"grad_norm": 0.16243956983089447,
"learning_rate": 1.217300459124271e-07,
"loss": 0.0273,
"step": 18600
},
{
"epoch": 17.84276126558006,
"grad_norm": 0.17303957045078278,
"learning_rate": 1.1573673836580701e-07,
"loss": 0.0353,
"step": 18610
},
{
"epoch": 17.85234899328859,
"grad_norm": 0.4954906702041626,
"learning_rate": 1.0989454624032448e-07,
"loss": 0.0239,
"step": 18620
},
{
"epoch": 17.861936720997125,
"grad_norm": 0.500385582447052,
"learning_rate": 1.0420348723385043e-07,
"loss": 0.0279,
"step": 18630
},
{
"epoch": 17.871524448705657,
"grad_norm": 0.28065744042396545,
"learning_rate": 9.866357858642205e-08,
"loss": 0.024,
"step": 18640
},
{
"epoch": 17.88111217641419,
"grad_norm": 0.22515705227851868,
"learning_rate": 9.32748370802039e-08,
"loss": 0.0273,
"step": 18650
},
{
"epoch": 17.890699904122723,
"grad_norm": 0.4083874225616455,
"learning_rate": 8.803727903942127e-08,
"loss": 0.0269,
"step": 18660
},
{
"epoch": 17.900287631831254,
"grad_norm": 0.3455846309661865,
"learning_rate": 8.295092033031027e-08,
"loss": 0.0277,
"step": 18670
},
{
"epoch": 17.90987535953979,
"grad_norm": 0.15052051842212677,
"learning_rate": 7.801577636108448e-08,
"loss": 0.0358,
"step": 18680
},
{
"epoch": 17.91946308724832,
"grad_norm": 0.21173402667045593,
"learning_rate": 7.323186208188504e-08,
"loss": 0.0256,
"step": 18690
},
{
"epoch": 17.929050814956856,
"grad_norm": 0.3735136389732361,
"learning_rate": 6.859919198470288e-08,
"loss": 0.031,
"step": 18700
},
{
"epoch": 17.938638542665387,
"grad_norm": 0.2103312462568283,
"learning_rate": 6.411778010340097e-08,
"loss": 0.0322,
"step": 18710
},
{
"epoch": 17.948226270373922,
"grad_norm": 0.19569391012191772,
"learning_rate": 5.978764001359771e-08,
"loss": 0.0291,
"step": 18720
},
{
"epoch": 17.957813998082454,
"grad_norm": 0.25286465883255005,
"learning_rate": 5.5608784832683616e-08,
"loss": 0.0277,
"step": 18730
},
{
"epoch": 17.96740172579099,
"grad_norm": 0.2856442332267761,
"learning_rate": 5.158122721974357e-08,
"loss": 0.0254,
"step": 18740
},
{
"epoch": 17.97698945349952,
"grad_norm": 0.15211383998394012,
"learning_rate": 4.770497937554574e-08,
"loss": 0.024,
"step": 18750
},
{
"epoch": 17.986577181208055,
"grad_norm": 0.28586897253990173,
"learning_rate": 4.398005304248609e-08,
"loss": 0.0239,
"step": 18760
},
{
"epoch": 17.996164908916587,
"grad_norm": 0.18181052803993225,
"learning_rate": 4.0406459504555016e-08,
"loss": 0.0236,
"step": 18770
},
{
"epoch": 18.005752636625118,
"grad_norm": 0.19704671204090118,
"learning_rate": 3.698420958732074e-08,
"loss": 0.0251,
"step": 18780
},
{
"epoch": 18.015340364333653,
"grad_norm": 0.19747470319271088,
"learning_rate": 3.371331365786823e-08,
"loss": 0.0313,
"step": 18790
},
{
"epoch": 18.024928092042185,
"grad_norm": 0.23974737524986267,
"learning_rate": 3.05937816247992e-08,
"loss": 0.0334,
"step": 18800
},
{
"epoch": 18.03451581975072,
"grad_norm": 0.31815865635871887,
"learning_rate": 2.7625622938165507e-08,
"loss": 0.025,
"step": 18810
},
{
"epoch": 18.04410354745925,
"grad_norm": 0.14651015400886536,
"learning_rate": 2.4808846589474687e-08,
"loss": 0.0252,
"step": 18820
},
{
"epoch": 18.053691275167786,
"grad_norm": 0.31359338760375977,
"learning_rate": 2.214346111164556e-08,
"loss": 0.0255,
"step": 18830
},
{
"epoch": 18.063279002876317,
"grad_norm": 0.3521699607372284,
"learning_rate": 1.9629474578986008e-08,
"loss": 0.0229,
"step": 18840
},
{
"epoch": 18.072866730584852,
"grad_norm": 0.2816530168056488,
"learning_rate": 1.726689460716524e-08,
"loss": 0.0262,
"step": 18850
},
{
"epoch": 18.082454458293384,
"grad_norm": 0.27596089243888855,
"learning_rate": 1.5055728353191578e-08,
"loss": 0.0266,
"step": 18860
},
{
"epoch": 18.09204218600192,
"grad_norm": 0.25768667459487915,
"learning_rate": 1.2995982515406901e-08,
"loss": 0.0273,
"step": 18870
},
{
"epoch": 18.10162991371045,
"grad_norm": 0.13152585923671722,
"learning_rate": 1.1087663333431141e-08,
"loss": 0.0268,
"step": 18880
},
{
"epoch": 18.111217641418985,
"grad_norm": 0.1559949666261673,
"learning_rate": 9.330776588184487e-09,
"loss": 0.0307,
"step": 18890
},
{
"epoch": 18.120805369127517,
"grad_norm": 0.25546255707740784,
"learning_rate": 7.725327601826315e-09,
"loss": 0.0254,
"step": 18900
},
{
"epoch": 18.13039309683605,
"grad_norm": 0.17455005645751953,
"learning_rate": 6.271321237788508e-09,
"loss": 0.0331,
"step": 18910
},
{
"epoch": 18.139980824544583,
"grad_norm": 0.25416553020477295,
"learning_rate": 4.9687619007199316e-09,
"loss": 0.0332,
"step": 18920
},
{
"epoch": 18.149568552253115,
"grad_norm": 0.19471152126789093,
"learning_rate": 3.817653536480892e-09,
"loss": 0.0248,
"step": 18930
},
{
"epoch": 18.15915627996165,
"grad_norm": 0.26644882559776306,
"learning_rate": 2.8179996321597845e-09,
"loss": 0.0248,
"step": 18940
},
{
"epoch": 18.16874400767018,
"grad_norm": 0.18680621683597565,
"learning_rate": 1.9698032160231363e-09,
"loss": 0.0252,
"step": 18950
},
{
"epoch": 18.178331735378716,
"grad_norm": 0.22466066479682922,
"learning_rate": 1.2730668575322569e-09,
"loss": 0.0221,
"step": 18960
},
{
"epoch": 18.187919463087248,
"grad_norm": 0.27246662974357605,
"learning_rate": 7.277926673210367e-10,
"loss": 0.0258,
"step": 18970
},
{
"epoch": 18.197507190795783,
"grad_norm": 0.17329837381839752,
"learning_rate": 3.3398229720149607e-10,
"loss": 0.0266,
"step": 18980
},
{
"epoch": 18.207094918504314,
"grad_norm": 0.3577910363674164,
"learning_rate": 9.163694015268398e-11,
"loss": 0.0296,
"step": 18990
},
{
"epoch": 18.21668264621285,
"grad_norm": 0.24373145401477814,
"learning_rate": 7.57330315126481e-13,
"loss": 0.029,
"step": 19000
},
{
"epoch": 18.21668264621285,
"step": 19000,
"total_flos": 0.0,
"train_loss": 0.04628433942951654,
"train_runtime": 5633.6681,
"train_samples_per_second": 107.923,
"train_steps_per_second": 3.373
}
],
"logging_steps": 10,
"max_steps": 19000,
"num_input_tokens_seen": 0,
"num_train_epochs": 19,
"save_steps": 20000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}