gr00t-test / trainer_state.json
shuohsuan's picture
Upload folder using huggingface_hub
360e6d9 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.8119891008174385,
"eval_steps": 500,
"global_step": 10000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006811989100817439,
"grad_norm": 11.914588928222656,
"learning_rate": 3.6e-06,
"loss": 0.9281,
"step": 10
},
{
"epoch": 0.013623978201634877,
"grad_norm": 2.2081298828125,
"learning_rate": 7.6e-06,
"loss": 0.4782,
"step": 20
},
{
"epoch": 0.020435967302452316,
"grad_norm": 1.648837924003601,
"learning_rate": 1.16e-05,
"loss": 0.3026,
"step": 30
},
{
"epoch": 0.027247956403269755,
"grad_norm": 1.3150748014450073,
"learning_rate": 1.56e-05,
"loss": 0.2283,
"step": 40
},
{
"epoch": 0.0340599455040872,
"grad_norm": 1.7804750204086304,
"learning_rate": 1.9600000000000002e-05,
"loss": 0.2138,
"step": 50
},
{
"epoch": 0.04087193460490463,
"grad_norm": 1.8128317594528198,
"learning_rate": 2.36e-05,
"loss": 0.2035,
"step": 60
},
{
"epoch": 0.047683923705722074,
"grad_norm": 1.4426037073135376,
"learning_rate": 2.7600000000000003e-05,
"loss": 0.1941,
"step": 70
},
{
"epoch": 0.05449591280653951,
"grad_norm": 2.272278070449829,
"learning_rate": 3.16e-05,
"loss": 0.179,
"step": 80
},
{
"epoch": 0.06130790190735695,
"grad_norm": 1.347985863685608,
"learning_rate": 3.56e-05,
"loss": 0.148,
"step": 90
},
{
"epoch": 0.0681198910081744,
"grad_norm": 1.5960944890975952,
"learning_rate": 3.960000000000001e-05,
"loss": 0.1421,
"step": 100
},
{
"epoch": 0.07493188010899182,
"grad_norm": 0.8870837688446045,
"learning_rate": 4.36e-05,
"loss": 0.1422,
"step": 110
},
{
"epoch": 0.08174386920980926,
"grad_norm": 1.381057858467102,
"learning_rate": 4.76e-05,
"loss": 0.1392,
"step": 120
},
{
"epoch": 0.0885558583106267,
"grad_norm": 1.140463948249817,
"learning_rate": 5.16e-05,
"loss": 0.13,
"step": 130
},
{
"epoch": 0.09536784741144415,
"grad_norm": 1.0376285314559937,
"learning_rate": 5.560000000000001e-05,
"loss": 0.1254,
"step": 140
},
{
"epoch": 0.10217983651226158,
"grad_norm": 0.6157049536705017,
"learning_rate": 5.96e-05,
"loss": 0.1216,
"step": 150
},
{
"epoch": 0.10899182561307902,
"grad_norm": 1.6728339195251465,
"learning_rate": 6.36e-05,
"loss": 0.1169,
"step": 160
},
{
"epoch": 0.11580381471389646,
"grad_norm": 1.3726643323898315,
"learning_rate": 6.76e-05,
"loss": 0.1201,
"step": 170
},
{
"epoch": 0.1226158038147139,
"grad_norm": 1.2212395668029785,
"learning_rate": 7.16e-05,
"loss": 0.12,
"step": 180
},
{
"epoch": 0.12942779291553133,
"grad_norm": 1.034472107887268,
"learning_rate": 7.560000000000001e-05,
"loss": 0.098,
"step": 190
},
{
"epoch": 0.1362397820163488,
"grad_norm": 0.7338757514953613,
"learning_rate": 7.960000000000001e-05,
"loss": 0.0965,
"step": 200
},
{
"epoch": 0.14305177111716622,
"grad_norm": 0.62514328956604,
"learning_rate": 8.36e-05,
"loss": 0.1108,
"step": 210
},
{
"epoch": 0.14986376021798364,
"grad_norm": 0.8058353662490845,
"learning_rate": 8.76e-05,
"loss": 0.0948,
"step": 220
},
{
"epoch": 0.1566757493188011,
"grad_norm": 0.837098240852356,
"learning_rate": 9.16e-05,
"loss": 0.0948,
"step": 230
},
{
"epoch": 0.16348773841961853,
"grad_norm": 1.094609260559082,
"learning_rate": 9.56e-05,
"loss": 0.0935,
"step": 240
},
{
"epoch": 0.17029972752043596,
"grad_norm": 1.555716872215271,
"learning_rate": 9.960000000000001e-05,
"loss": 0.0949,
"step": 250
},
{
"epoch": 0.1771117166212534,
"grad_norm": 0.8324354290962219,
"learning_rate": 9.999911419878559e-05,
"loss": 0.0894,
"step": 260
},
{
"epoch": 0.18392370572207084,
"grad_norm": 1.3196247816085815,
"learning_rate": 9.999605221019081e-05,
"loss": 0.1098,
"step": 270
},
{
"epoch": 0.1907356948228883,
"grad_norm": 0.6212723255157471,
"learning_rate": 9.999080323230761e-05,
"loss": 0.0848,
"step": 280
},
{
"epoch": 0.19754768392370572,
"grad_norm": 0.9073509573936462,
"learning_rate": 9.998336749474329e-05,
"loss": 0.099,
"step": 290
},
{
"epoch": 0.20435967302452315,
"grad_norm": 0.8732384443283081,
"learning_rate": 9.997374532276107e-05,
"loss": 0.0838,
"step": 300
},
{
"epoch": 0.2111716621253406,
"grad_norm": 1.0056062936782837,
"learning_rate": 9.996193713726596e-05,
"loss": 0.0818,
"step": 310
},
{
"epoch": 0.21798365122615804,
"grad_norm": 0.7375788688659668,
"learning_rate": 9.994794345478624e-05,
"loss": 0.0768,
"step": 320
},
{
"epoch": 0.22479564032697547,
"grad_norm": 1.0606452226638794,
"learning_rate": 9.99317648874509e-05,
"loss": 0.0845,
"step": 330
},
{
"epoch": 0.23160762942779292,
"grad_norm": 0.699203610420227,
"learning_rate": 9.991340214296292e-05,
"loss": 0.0767,
"step": 340
},
{
"epoch": 0.23841961852861035,
"grad_norm": 1.3575586080551147,
"learning_rate": 9.989285602456819e-05,
"loss": 0.075,
"step": 350
},
{
"epoch": 0.2452316076294278,
"grad_norm": 0.7841135263442993,
"learning_rate": 9.98701274310205e-05,
"loss": 0.0728,
"step": 360
},
{
"epoch": 0.25204359673024523,
"grad_norm": 0.7767183184623718,
"learning_rate": 9.984521735654218e-05,
"loss": 0.0769,
"step": 370
},
{
"epoch": 0.25885558583106266,
"grad_norm": 0.483733594417572,
"learning_rate": 9.981812689078057e-05,
"loss": 0.0669,
"step": 380
},
{
"epoch": 0.2656675749318801,
"grad_norm": 0.4933801591396332,
"learning_rate": 9.978885721876041e-05,
"loss": 0.0696,
"step": 390
},
{
"epoch": 0.2724795640326976,
"grad_norm": 0.6362014412879944,
"learning_rate": 9.975740962083198e-05,
"loss": 0.0678,
"step": 400
},
{
"epoch": 0.279291553133515,
"grad_norm": 0.7683391571044922,
"learning_rate": 9.972378547261504e-05,
"loss": 0.0735,
"step": 410
},
{
"epoch": 0.28610354223433243,
"grad_norm": 0.8926170468330383,
"learning_rate": 9.968798624493885e-05,
"loss": 0.0656,
"step": 420
},
{
"epoch": 0.29291553133514986,
"grad_norm": 0.6009325385093689,
"learning_rate": 9.965001350377753e-05,
"loss": 0.074,
"step": 430
},
{
"epoch": 0.2997275204359673,
"grad_norm": 0.45287570357322693,
"learning_rate": 9.960986891018183e-05,
"loss": 0.0602,
"step": 440
},
{
"epoch": 0.3065395095367847,
"grad_norm": 0.47310397028923035,
"learning_rate": 9.95675542202063e-05,
"loss": 0.0619,
"step": 450
},
{
"epoch": 0.3133514986376022,
"grad_norm": 1.0169494152069092,
"learning_rate": 9.952307128483256e-05,
"loss": 0.0709,
"step": 460
},
{
"epoch": 0.3201634877384196,
"grad_norm": 0.7056031227111816,
"learning_rate": 9.947642204988835e-05,
"loss": 0.0666,
"step": 470
},
{
"epoch": 0.32697547683923706,
"grad_norm": 0.817714512348175,
"learning_rate": 9.942760855596226e-05,
"loss": 0.0734,
"step": 480
},
{
"epoch": 0.3337874659400545,
"grad_norm": 0.5847324728965759,
"learning_rate": 9.937663293831471e-05,
"loss": 0.0643,
"step": 490
},
{
"epoch": 0.3405994550408719,
"grad_norm": 0.8371860384941101,
"learning_rate": 9.932349742678433e-05,
"loss": 0.0617,
"step": 500
},
{
"epoch": 0.3474114441416894,
"grad_norm": 0.47964030504226685,
"learning_rate": 9.926820434569051e-05,
"loss": 0.0649,
"step": 510
},
{
"epoch": 0.3542234332425068,
"grad_norm": 0.6083551645278931,
"learning_rate": 9.921075611373179e-05,
"loss": 0.0634,
"step": 520
},
{
"epoch": 0.36103542234332425,
"grad_norm": 0.6717512011528015,
"learning_rate": 9.915115524387988e-05,
"loss": 0.0637,
"step": 530
},
{
"epoch": 0.3678474114441417,
"grad_norm": 0.7015753984451294,
"learning_rate": 9.908940434326997e-05,
"loss": 0.0633,
"step": 540
},
{
"epoch": 0.3746594005449591,
"grad_norm": 0.6212232708930969,
"learning_rate": 9.902550611308645e-05,
"loss": 0.0593,
"step": 550
},
{
"epoch": 0.3814713896457766,
"grad_norm": 0.6970530152320862,
"learning_rate": 9.895946334844494e-05,
"loss": 0.0579,
"step": 560
},
{
"epoch": 0.388283378746594,
"grad_norm": 0.5176441669464111,
"learning_rate": 9.889127893826989e-05,
"loss": 0.0559,
"step": 570
},
{
"epoch": 0.39509536784741145,
"grad_norm": 0.44634121656417847,
"learning_rate": 9.882095586516831e-05,
"loss": 0.0576,
"step": 580
},
{
"epoch": 0.4019073569482289,
"grad_norm": 0.6069617867469788,
"learning_rate": 9.874849720529921e-05,
"loss": 0.0608,
"step": 590
},
{
"epoch": 0.4087193460490463,
"grad_norm": 0.610893726348877,
"learning_rate": 9.867390612823914e-05,
"loss": 0.0592,
"step": 600
},
{
"epoch": 0.41553133514986373,
"grad_norm": 0.3479655683040619,
"learning_rate": 9.859718589684344e-05,
"loss": 0.0658,
"step": 610
},
{
"epoch": 0.4223433242506812,
"grad_norm": 0.7216345071792603,
"learning_rate": 9.851833986710353e-05,
"loss": 0.056,
"step": 620
},
{
"epoch": 0.42915531335149865,
"grad_norm": 0.5811245441436768,
"learning_rate": 9.843737148800023e-05,
"loss": 0.0489,
"step": 630
},
{
"epoch": 0.4359673024523161,
"grad_norm": 0.6808714866638184,
"learning_rate": 9.835428430135271e-05,
"loss": 0.0489,
"step": 640
},
{
"epoch": 0.4427792915531335,
"grad_norm": 0.53304123878479,
"learning_rate": 9.82690819416637e-05,
"loss": 0.0551,
"step": 650
},
{
"epoch": 0.44959128065395093,
"grad_norm": 0.5133969783782959,
"learning_rate": 9.818176813596041e-05,
"loss": 0.0523,
"step": 660
},
{
"epoch": 0.4564032697547684,
"grad_norm": 0.42300981283187866,
"learning_rate": 9.809234670363159e-05,
"loss": 0.0635,
"step": 670
},
{
"epoch": 0.46321525885558584,
"grad_norm": 0.5513554811477661,
"learning_rate": 9.800082155626034e-05,
"loss": 0.0571,
"step": 680
},
{
"epoch": 0.47002724795640327,
"grad_norm": 0.4101255238056183,
"learning_rate": 9.790719669745312e-05,
"loss": 0.0571,
"step": 690
},
{
"epoch": 0.4768392370572207,
"grad_norm": 0.7928181886672974,
"learning_rate": 9.781147622266455e-05,
"loss": 0.0576,
"step": 700
},
{
"epoch": 0.48365122615803813,
"grad_norm": 0.6665974855422974,
"learning_rate": 9.771366431901831e-05,
"loss": 0.0626,
"step": 710
},
{
"epoch": 0.4904632152588556,
"grad_norm": 0.43793386220932007,
"learning_rate": 9.761376526512394e-05,
"loss": 0.0567,
"step": 720
},
{
"epoch": 0.49727520435967304,
"grad_norm": 0.4338440001010895,
"learning_rate": 9.751178343088963e-05,
"loss": 0.0519,
"step": 730
},
{
"epoch": 0.5040871934604905,
"grad_norm": 0.47942614555358887,
"learning_rate": 9.740772327733123e-05,
"loss": 0.0446,
"step": 740
},
{
"epoch": 0.510899182561308,
"grad_norm": 0.3280750811100006,
"learning_rate": 9.730158935637697e-05,
"loss": 0.052,
"step": 750
},
{
"epoch": 0.5177111716621253,
"grad_norm": 0.6672011017799377,
"learning_rate": 9.719338631066834e-05,
"loss": 0.0417,
"step": 760
},
{
"epoch": 0.5245231607629428,
"grad_norm": 0.386070191860199,
"learning_rate": 9.708311887335713e-05,
"loss": 0.0436,
"step": 770
},
{
"epoch": 0.5313351498637602,
"grad_norm": 0.428362637758255,
"learning_rate": 9.697079186789823e-05,
"loss": 0.0539,
"step": 780
},
{
"epoch": 0.5381471389645777,
"grad_norm": 0.4888722896575928,
"learning_rate": 9.685641020783876e-05,
"loss": 0.0517,
"step": 790
},
{
"epoch": 0.5449591280653951,
"grad_norm": 0.4673832952976227,
"learning_rate": 9.67399788966031e-05,
"loss": 0.0523,
"step": 800
},
{
"epoch": 0.5517711171662125,
"grad_norm": 0.29115697741508484,
"learning_rate": 9.662150302727395e-05,
"loss": 0.0521,
"step": 810
},
{
"epoch": 0.55858310626703,
"grad_norm": 0.827384352684021,
"learning_rate": 9.650098778236968e-05,
"loss": 0.0477,
"step": 820
},
{
"epoch": 0.5653950953678474,
"grad_norm": 0.33872804045677185,
"learning_rate": 9.637843843361749e-05,
"loss": 0.0471,
"step": 830
},
{
"epoch": 0.5722070844686649,
"grad_norm": 0.3877025842666626,
"learning_rate": 9.62538603417229e-05,
"loss": 0.0426,
"step": 840
},
{
"epoch": 0.5790190735694822,
"grad_norm": 0.42489877343177795,
"learning_rate": 9.612725895613526e-05,
"loss": 0.0551,
"step": 850
},
{
"epoch": 0.5858310626702997,
"grad_norm": 0.3988380432128906,
"learning_rate": 9.599863981480926e-05,
"loss": 0.0487,
"step": 860
},
{
"epoch": 0.5926430517711172,
"grad_norm": 0.5466487407684326,
"learning_rate": 9.586800854396283e-05,
"loss": 0.0467,
"step": 870
},
{
"epoch": 0.5994550408719346,
"grad_norm": 0.37913820147514343,
"learning_rate": 9.573537085783095e-05,
"loss": 0.0399,
"step": 880
},
{
"epoch": 0.6062670299727521,
"grad_norm": 0.47171854972839355,
"learning_rate": 9.560073255841571e-05,
"loss": 0.0402,
"step": 890
},
{
"epoch": 0.6130790190735694,
"grad_norm": 0.5175816416740417,
"learning_rate": 9.546409953523247e-05,
"loss": 0.0514,
"step": 900
},
{
"epoch": 0.6198910081743869,
"grad_norm": 0.3573410212993622,
"learning_rate": 9.532547776505229e-05,
"loss": 0.0384,
"step": 910
},
{
"epoch": 0.6267029972752044,
"grad_norm": 0.4385380148887634,
"learning_rate": 9.518487331164048e-05,
"loss": 0.0478,
"step": 920
},
{
"epoch": 0.6335149863760218,
"grad_norm": 0.49251607060432434,
"learning_rate": 9.504229232549134e-05,
"loss": 0.0421,
"step": 930
},
{
"epoch": 0.6403269754768393,
"grad_norm": 0.33070334792137146,
"learning_rate": 9.489774104355909e-05,
"loss": 0.0402,
"step": 940
},
{
"epoch": 0.6471389645776566,
"grad_norm": 0.616314709186554,
"learning_rate": 9.475122578898507e-05,
"loss": 0.0561,
"step": 950
},
{
"epoch": 0.6539509536784741,
"grad_norm": 0.7195887565612793,
"learning_rate": 9.460275297082119e-05,
"loss": 0.0392,
"step": 960
},
{
"epoch": 0.6607629427792916,
"grad_norm": 0.6277886033058167,
"learning_rate": 9.445232908374948e-05,
"loss": 0.0433,
"step": 970
},
{
"epoch": 0.667574931880109,
"grad_norm": 0.7239585518836975,
"learning_rate": 9.429996070779808e-05,
"loss": 0.0513,
"step": 980
},
{
"epoch": 0.6743869209809265,
"grad_norm": 0.8502191305160522,
"learning_rate": 9.414565450805333e-05,
"loss": 0.052,
"step": 990
},
{
"epoch": 0.6811989100817438,
"grad_norm": 0.26333141326904297,
"learning_rate": 9.398941723436831e-05,
"loss": 0.0467,
"step": 1000
},
{
"epoch": 0.6880108991825613,
"grad_norm": 0.6643750071525574,
"learning_rate": 9.383125572106752e-05,
"loss": 0.0416,
"step": 1010
},
{
"epoch": 0.6948228882833788,
"grad_norm": 0.4409726858139038,
"learning_rate": 9.367117688664791e-05,
"loss": 0.0484,
"step": 1020
},
{
"epoch": 0.7016348773841962,
"grad_norm": 0.575430154800415,
"learning_rate": 9.35091877334763e-05,
"loss": 0.0495,
"step": 1030
},
{
"epoch": 0.7084468664850136,
"grad_norm": 0.4284694492816925,
"learning_rate": 9.334529534748297e-05,
"loss": 0.0442,
"step": 1040
},
{
"epoch": 0.715258855585831,
"grad_norm": 0.4911642372608185,
"learning_rate": 9.317950689785188e-05,
"loss": 0.0405,
"step": 1050
},
{
"epoch": 0.7220708446866485,
"grad_norm": 0.7064844965934753,
"learning_rate": 9.301182963670688e-05,
"loss": 0.054,
"step": 1060
},
{
"epoch": 0.728882833787466,
"grad_norm": 0.7730808258056641,
"learning_rate": 9.284227089879456e-05,
"loss": 0.048,
"step": 1070
},
{
"epoch": 0.7356948228882834,
"grad_norm": 0.3709051012992859,
"learning_rate": 9.26708381011634e-05,
"loss": 0.0505,
"step": 1080
},
{
"epoch": 0.7425068119891008,
"grad_norm": 0.5168929100036621,
"learning_rate": 9.249753874283937e-05,
"loss": 0.0367,
"step": 1090
},
{
"epoch": 0.7493188010899182,
"grad_norm": 0.4022100269794464,
"learning_rate": 9.232238040449779e-05,
"loss": 0.0421,
"step": 1100
},
{
"epoch": 0.7561307901907357,
"grad_norm": 0.4457673728466034,
"learning_rate": 9.214537074813181e-05,
"loss": 0.0417,
"step": 1110
},
{
"epoch": 0.7629427792915532,
"grad_norm": 0.5297082662582397,
"learning_rate": 9.196651751671724e-05,
"loss": 0.0429,
"step": 1120
},
{
"epoch": 0.7697547683923706,
"grad_norm": 0.2837148904800415,
"learning_rate": 9.178582853387384e-05,
"loss": 0.0435,
"step": 1130
},
{
"epoch": 0.776566757493188,
"grad_norm": 0.49884119629859924,
"learning_rate": 9.160331170352304e-05,
"loss": 0.0428,
"step": 1140
},
{
"epoch": 0.7833787465940054,
"grad_norm": 0.3587488532066345,
"learning_rate": 9.141897500954229e-05,
"loss": 0.0403,
"step": 1150
},
{
"epoch": 0.7901907356948229,
"grad_norm": 0.4518432319164276,
"learning_rate": 9.123282651541576e-05,
"loss": 0.0376,
"step": 1160
},
{
"epoch": 0.7970027247956403,
"grad_norm": 0.48306331038475037,
"learning_rate": 9.104487436388161e-05,
"loss": 0.0414,
"step": 1170
},
{
"epoch": 0.8038147138964578,
"grad_norm": 0.44562119245529175,
"learning_rate": 9.085512677657582e-05,
"loss": 0.039,
"step": 1180
},
{
"epoch": 0.8106267029972752,
"grad_norm": 0.5168417692184448,
"learning_rate": 9.066359205367258e-05,
"loss": 0.0433,
"step": 1190
},
{
"epoch": 0.8174386920980926,
"grad_norm": 0.3128747344017029,
"learning_rate": 9.047027857352112e-05,
"loss": 0.0459,
"step": 1200
},
{
"epoch": 0.8242506811989101,
"grad_norm": 0.43080934882164,
"learning_rate": 9.027519479227935e-05,
"loss": 0.0418,
"step": 1210
},
{
"epoch": 0.8310626702997275,
"grad_norm": 0.39959844946861267,
"learning_rate": 9.007834924354383e-05,
"loss": 0.0382,
"step": 1220
},
{
"epoch": 0.837874659400545,
"grad_norm": 0.46520474553108215,
"learning_rate": 8.987975053797655e-05,
"loss": 0.0435,
"step": 1230
},
{
"epoch": 0.8446866485013624,
"grad_norm": 0.500769317150116,
"learning_rate": 8.967940736292825e-05,
"loss": 0.0376,
"step": 1240
},
{
"epoch": 0.8514986376021798,
"grad_norm": 0.7235398888587952,
"learning_rate": 8.947732848205846e-05,
"loss": 0.0495,
"step": 1250
},
{
"epoch": 0.8583106267029973,
"grad_norm": 0.3423875570297241,
"learning_rate": 8.927352273495204e-05,
"loss": 0.0382,
"step": 1260
},
{
"epoch": 0.8651226158038147,
"grad_norm": 0.5364619493484497,
"learning_rate": 8.906799903673265e-05,
"loss": 0.0417,
"step": 1270
},
{
"epoch": 0.8719346049046321,
"grad_norm": 0.6217823028564453,
"learning_rate": 8.88607663776726e-05,
"loss": 0.0436,
"step": 1280
},
{
"epoch": 0.8787465940054496,
"grad_norm": 0.3814774751663208,
"learning_rate": 8.865183382279978e-05,
"loss": 0.0456,
"step": 1290
},
{
"epoch": 0.885558583106267,
"grad_norm": 0.5894232988357544,
"learning_rate": 8.844121051150096e-05,
"loss": 0.0413,
"step": 1300
},
{
"epoch": 0.8923705722070845,
"grad_norm": 0.4752817153930664,
"learning_rate": 8.822890565712211e-05,
"loss": 0.0365,
"step": 1310
},
{
"epoch": 0.8991825613079019,
"grad_norm": 0.3989897072315216,
"learning_rate": 8.801492854656536e-05,
"loss": 0.0397,
"step": 1320
},
{
"epoch": 0.9059945504087193,
"grad_norm": 0.3819690942764282,
"learning_rate": 8.779928853988268e-05,
"loss": 0.0351,
"step": 1330
},
{
"epoch": 0.9128065395095368,
"grad_norm": 0.487627774477005,
"learning_rate": 8.758199506986655e-05,
"loss": 0.038,
"step": 1340
},
{
"epoch": 0.9196185286103542,
"grad_norm": 0.40310102701187134,
"learning_rate": 8.73630576416373e-05,
"loss": 0.0365,
"step": 1350
},
{
"epoch": 0.9264305177111717,
"grad_norm": 0.30654123425483704,
"learning_rate": 8.714248583222726e-05,
"loss": 0.04,
"step": 1360
},
{
"epoch": 0.9332425068119891,
"grad_norm": 0.5350182056427002,
"learning_rate": 8.692028929016196e-05,
"loss": 0.0377,
"step": 1370
},
{
"epoch": 0.9400544959128065,
"grad_norm": 0.39081400632858276,
"learning_rate": 8.669647773503797e-05,
"loss": 0.0363,
"step": 1380
},
{
"epoch": 0.946866485013624,
"grad_norm": 0.4992840886116028,
"learning_rate": 8.647106095709773e-05,
"loss": 0.0355,
"step": 1390
},
{
"epoch": 0.9536784741144414,
"grad_norm": 0.3442818224430084,
"learning_rate": 8.624404881680139e-05,
"loss": 0.0452,
"step": 1400
},
{
"epoch": 0.9604904632152589,
"grad_norm": 0.5338506102561951,
"learning_rate": 8.601545124439535e-05,
"loss": 0.0358,
"step": 1410
},
{
"epoch": 0.9673024523160763,
"grad_norm": 0.3899770975112915,
"learning_rate": 8.5785278239478e-05,
"loss": 0.0422,
"step": 1420
},
{
"epoch": 0.9741144414168937,
"grad_norm": 0.5235274434089661,
"learning_rate": 8.555353987056224e-05,
"loss": 0.0411,
"step": 1430
},
{
"epoch": 0.9809264305177112,
"grad_norm": 0.4164868891239166,
"learning_rate": 8.532024627463505e-05,
"loss": 0.0351,
"step": 1440
},
{
"epoch": 0.9877384196185286,
"grad_norm": 0.3429436683654785,
"learning_rate": 8.508540765671407e-05,
"loss": 0.0396,
"step": 1450
},
{
"epoch": 0.9945504087193461,
"grad_norm": 0.45483162999153137,
"learning_rate": 8.484903428940121e-05,
"loss": 0.0388,
"step": 1460
},
{
"epoch": 1.0013623978201636,
"grad_norm": 0.4117540419101715,
"learning_rate": 8.461113651243334e-05,
"loss": 0.0396,
"step": 1470
},
{
"epoch": 1.008174386920981,
"grad_norm": 0.44719594717025757,
"learning_rate": 8.437172473222987e-05,
"loss": 0.0411,
"step": 1480
},
{
"epoch": 1.0149863760217983,
"grad_norm": 0.5068361759185791,
"learning_rate": 8.413080942143767e-05,
"loss": 0.0343,
"step": 1490
},
{
"epoch": 1.021798365122616,
"grad_norm": 0.43941476941108704,
"learning_rate": 8.388840111847288e-05,
"loss": 0.045,
"step": 1500
},
{
"epoch": 1.0286103542234333,
"grad_norm": 0.4756196141242981,
"learning_rate": 8.364451042705998e-05,
"loss": 0.0337,
"step": 1510
},
{
"epoch": 1.0354223433242506,
"grad_norm": 0.3626450002193451,
"learning_rate": 8.33991480157679e-05,
"loss": 0.0379,
"step": 1520
},
{
"epoch": 1.042234332425068,
"grad_norm": 0.5754261016845703,
"learning_rate": 8.315232461754338e-05,
"loss": 0.0374,
"step": 1530
},
{
"epoch": 1.0490463215258856,
"grad_norm": 0.45411282777786255,
"learning_rate": 8.290405102924144e-05,
"loss": 0.0404,
"step": 1540
},
{
"epoch": 1.055858310626703,
"grad_norm": 0.5540292263031006,
"learning_rate": 8.265433811115316e-05,
"loss": 0.0406,
"step": 1550
},
{
"epoch": 1.0626702997275204,
"grad_norm": 0.4548736810684204,
"learning_rate": 8.240319678653049e-05,
"loss": 0.0353,
"step": 1560
},
{
"epoch": 1.069482288828338,
"grad_norm": 0.3220965564250946,
"learning_rate": 8.215063804110857e-05,
"loss": 0.0395,
"step": 1570
},
{
"epoch": 1.0762942779291553,
"grad_norm": 0.33744776248931885,
"learning_rate": 8.189667292262512e-05,
"loss": 0.0327,
"step": 1580
},
{
"epoch": 1.0831062670299727,
"grad_norm": 0.34971827268600464,
"learning_rate": 8.164131254033716e-05,
"loss": 0.0382,
"step": 1590
},
{
"epoch": 1.0899182561307903,
"grad_norm": 0.3128986060619354,
"learning_rate": 8.138456806453503e-05,
"loss": 0.0322,
"step": 1600
},
{
"epoch": 1.0967302452316077,
"grad_norm": 0.2257993221282959,
"learning_rate": 8.112645072605386e-05,
"loss": 0.0271,
"step": 1610
},
{
"epoch": 1.103542234332425,
"grad_norm": 0.30597376823425293,
"learning_rate": 8.086697181578222e-05,
"loss": 0.0278,
"step": 1620
},
{
"epoch": 1.1103542234332424,
"grad_norm": 0.31509286165237427,
"learning_rate": 8.060614268416823e-05,
"loss": 0.0301,
"step": 1630
},
{
"epoch": 1.11716621253406,
"grad_norm": 0.4431317150592804,
"learning_rate": 8.034397474072309e-05,
"loss": 0.0309,
"step": 1640
},
{
"epoch": 1.1239782016348774,
"grad_norm": 0.4654938578605652,
"learning_rate": 8.008047945352193e-05,
"loss": 0.0406,
"step": 1650
},
{
"epoch": 1.1307901907356948,
"grad_norm": 0.42640626430511475,
"learning_rate": 7.981566834870225e-05,
"loss": 0.0299,
"step": 1660
},
{
"epoch": 1.1376021798365124,
"grad_norm": 0.41219788789749146,
"learning_rate": 7.954955300995961e-05,
"loss": 0.0318,
"step": 1670
},
{
"epoch": 1.1444141689373297,
"grad_norm": 0.3845755159854889,
"learning_rate": 7.928214507804104e-05,
"loss": 0.0338,
"step": 1680
},
{
"epoch": 1.151226158038147,
"grad_norm": 0.31636008620262146,
"learning_rate": 7.901345625023576e-05,
"loss": 0.0352,
"step": 1690
},
{
"epoch": 1.1580381471389645,
"grad_norm": 0.34709426760673523,
"learning_rate": 7.874349827986354e-05,
"loss": 0.0331,
"step": 1700
},
{
"epoch": 1.164850136239782,
"grad_norm": 0.4313192665576935,
"learning_rate": 7.847228297576053e-05,
"loss": 0.0326,
"step": 1710
},
{
"epoch": 1.1716621253405994,
"grad_norm": 0.4032236933708191,
"learning_rate": 7.819982220176276e-05,
"loss": 0.0355,
"step": 1720
},
{
"epoch": 1.1784741144414168,
"grad_norm": 0.3324613571166992,
"learning_rate": 7.792612787618714e-05,
"loss": 0.0355,
"step": 1730
},
{
"epoch": 1.1852861035422344,
"grad_norm": 0.44290757179260254,
"learning_rate": 7.765121197131009e-05,
"loss": 0.0327,
"step": 1740
},
{
"epoch": 1.1920980926430518,
"grad_norm": 0.28540492057800293,
"learning_rate": 7.737508651284391e-05,
"loss": 0.0367,
"step": 1750
},
{
"epoch": 1.1989100817438691,
"grad_norm": 0.38834914565086365,
"learning_rate": 7.709776357941069e-05,
"loss": 0.0373,
"step": 1760
},
{
"epoch": 1.2057220708446867,
"grad_norm": 0.34177857637405396,
"learning_rate": 7.681925530201392e-05,
"loss": 0.0368,
"step": 1770
},
{
"epoch": 1.2125340599455041,
"grad_norm": 0.45681893825531006,
"learning_rate": 7.65395738635079e-05,
"loss": 0.0318,
"step": 1780
},
{
"epoch": 1.2193460490463215,
"grad_norm": 0.32232654094696045,
"learning_rate": 7.62587314980648e-05,
"loss": 0.0365,
"step": 1790
},
{
"epoch": 1.226158038147139,
"grad_norm": 0.2634826898574829,
"learning_rate": 7.597674049063947e-05,
"loss": 0.0327,
"step": 1800
},
{
"epoch": 1.2329700272479565,
"grad_norm": 0.4753483235836029,
"learning_rate": 7.569361317643211e-05,
"loss": 0.0337,
"step": 1810
},
{
"epoch": 1.2397820163487738,
"grad_norm": 0.3038065433502197,
"learning_rate": 7.540936194034865e-05,
"loss": 0.0309,
"step": 1820
},
{
"epoch": 1.2465940054495912,
"grad_norm": 0.32555919885635376,
"learning_rate": 7.512399921645901e-05,
"loss": 0.0313,
"step": 1830
},
{
"epoch": 1.2534059945504088,
"grad_norm": 0.3383468985557556,
"learning_rate": 7.483753748745317e-05,
"loss": 0.032,
"step": 1840
},
{
"epoch": 1.2602179836512262,
"grad_norm": 0.26944777369499207,
"learning_rate": 7.454998928409516e-05,
"loss": 0.0308,
"step": 1850
},
{
"epoch": 1.2670299727520435,
"grad_norm": 0.2938184142112732,
"learning_rate": 7.426136718467493e-05,
"loss": 0.0324,
"step": 1860
},
{
"epoch": 1.273841961852861,
"grad_norm": 0.276143878698349,
"learning_rate": 7.397168381445812e-05,
"loss": 0.0325,
"step": 1870
},
{
"epoch": 1.2806539509536785,
"grad_norm": 0.3054909408092499,
"learning_rate": 7.368095184513377e-05,
"loss": 0.03,
"step": 1880
},
{
"epoch": 1.2874659400544959,
"grad_norm": 0.24084536731243134,
"learning_rate": 7.338918399426005e-05,
"loss": 0.0274,
"step": 1890
},
{
"epoch": 1.2942779291553133,
"grad_norm": 0.41324862837791443,
"learning_rate": 7.309639302470801e-05,
"loss": 0.0348,
"step": 1900
},
{
"epoch": 1.3010899182561309,
"grad_norm": 0.29731935262680054,
"learning_rate": 7.280259174410312e-05,
"loss": 0.0312,
"step": 1910
},
{
"epoch": 1.3079019073569482,
"grad_norm": 0.22514300048351288,
"learning_rate": 7.250779300426517e-05,
"loss": 0.0312,
"step": 1920
},
{
"epoch": 1.3147138964577656,
"grad_norm": 0.5704501271247864,
"learning_rate": 7.22120097006461e-05,
"loss": 0.0325,
"step": 1930
},
{
"epoch": 1.3215258855585832,
"grad_norm": 0.27702492475509644,
"learning_rate": 7.191525477176577e-05,
"loss": 0.0321,
"step": 1940
},
{
"epoch": 1.3283378746594006,
"grad_norm": 0.34598076343536377,
"learning_rate": 7.161754119864616e-05,
"loss": 0.0298,
"step": 1950
},
{
"epoch": 1.335149863760218,
"grad_norm": 0.24778622388839722,
"learning_rate": 7.131888200424339e-05,
"loss": 0.0277,
"step": 1960
},
{
"epoch": 1.3419618528610355,
"grad_norm": 0.2454395443201065,
"learning_rate": 7.101929025287816e-05,
"loss": 0.0357,
"step": 1970
},
{
"epoch": 1.348773841961853,
"grad_norm": 0.47679805755615234,
"learning_rate": 7.071877904966423e-05,
"loss": 0.0378,
"step": 1980
},
{
"epoch": 1.3555858310626703,
"grad_norm": 0.2696547210216522,
"learning_rate": 7.04173615399351e-05,
"loss": 0.0299,
"step": 1990
},
{
"epoch": 1.3623978201634879,
"grad_norm": 0.3305070698261261,
"learning_rate": 7.011505090866913e-05,
"loss": 0.0298,
"step": 2000
},
{
"epoch": 1.3692098092643052,
"grad_norm": 0.35810503363609314,
"learning_rate": 6.981186037991271e-05,
"loss": 0.0304,
"step": 2010
},
{
"epoch": 1.3760217983651226,
"grad_norm": 0.314117968082428,
"learning_rate": 6.950780321620174e-05,
"loss": 0.0352,
"step": 2020
},
{
"epoch": 1.38283378746594,
"grad_norm": 0.33775216341018677,
"learning_rate": 6.920289271798157e-05,
"loss": 0.0378,
"step": 2030
},
{
"epoch": 1.3896457765667574,
"grad_norm": 0.33370664715766907,
"learning_rate": 6.889714222302517e-05,
"loss": 0.0336,
"step": 2040
},
{
"epoch": 1.396457765667575,
"grad_norm": 0.48640260100364685,
"learning_rate": 6.85905651058497e-05,
"loss": 0.0323,
"step": 2050
},
{
"epoch": 1.4032697547683923,
"grad_norm": 0.3220215141773224,
"learning_rate": 6.82831747771314e-05,
"loss": 0.0276,
"step": 2060
},
{
"epoch": 1.4100817438692097,
"grad_norm": 0.32791373133659363,
"learning_rate": 6.797498468311907e-05,
"loss": 0.0287,
"step": 2070
},
{
"epoch": 1.4168937329700273,
"grad_norm": 0.36337828636169434,
"learning_rate": 6.766600830504585e-05,
"loss": 0.0291,
"step": 2080
},
{
"epoch": 1.4237057220708447,
"grad_norm": 0.3391413390636444,
"learning_rate": 6.735625915853942e-05,
"loss": 0.0284,
"step": 2090
},
{
"epoch": 1.430517711171662,
"grad_norm": 0.35755249857902527,
"learning_rate": 6.70457507930309e-05,
"loss": 0.0274,
"step": 2100
},
{
"epoch": 1.4373297002724796,
"grad_norm": 0.2682415843009949,
"learning_rate": 6.673449679116215e-05,
"loss": 0.0274,
"step": 2110
},
{
"epoch": 1.444141689373297,
"grad_norm": 0.475309818983078,
"learning_rate": 6.642251076819148e-05,
"loss": 0.0262,
"step": 2120
},
{
"epoch": 1.4509536784741144,
"grad_norm": 0.3676445186138153,
"learning_rate": 6.610980637139827e-05,
"loss": 0.0318,
"step": 2130
},
{
"epoch": 1.457765667574932,
"grad_norm": 0.45259350538253784,
"learning_rate": 6.579639727948583e-05,
"loss": 0.0296,
"step": 2140
},
{
"epoch": 1.4645776566757494,
"grad_norm": 0.38819339871406555,
"learning_rate": 6.548229720198315e-05,
"loss": 0.0334,
"step": 2150
},
{
"epoch": 1.4713896457765667,
"grad_norm": 0.4020323157310486,
"learning_rate": 6.516751987864517e-05,
"loss": 0.0273,
"step": 2160
},
{
"epoch": 1.4782016348773843,
"grad_norm": 0.1928047388792038,
"learning_rate": 6.485207907885175e-05,
"loss": 0.0266,
"step": 2170
},
{
"epoch": 1.4850136239782017,
"grad_norm": 0.442618727684021,
"learning_rate": 6.453598860100536e-05,
"loss": 0.0299,
"step": 2180
},
{
"epoch": 1.491825613079019,
"grad_norm": 0.36381062865257263,
"learning_rate": 6.421926227192749e-05,
"loss": 0.0252,
"step": 2190
},
{
"epoch": 1.4986376021798364,
"grad_norm": 0.4495033621788025,
"learning_rate": 6.390191394625381e-05,
"loss": 0.0265,
"step": 2200
},
{
"epoch": 1.5054495912806538,
"grad_norm": 0.3564695715904236,
"learning_rate": 6.358395750582817e-05,
"loss": 0.026,
"step": 2210
},
{
"epoch": 1.5122615803814714,
"grad_norm": 0.28276216983795166,
"learning_rate": 6.326540685909532e-05,
"loss": 0.0245,
"step": 2220
},
{
"epoch": 1.5190735694822888,
"grad_norm": 0.44450217485427856,
"learning_rate": 6.294627594049249e-05,
"loss": 0.0253,
"step": 2230
},
{
"epoch": 1.5258855585831061,
"grad_norm": 0.2726491391658783,
"learning_rate": 6.262657870983989e-05,
"loss": 0.0258,
"step": 2240
},
{
"epoch": 1.5326975476839237,
"grad_norm": 0.35235723853111267,
"learning_rate": 6.230632915173009e-05,
"loss": 0.0303,
"step": 2250
},
{
"epoch": 1.5395095367847411,
"grad_norm": 0.2119748741388321,
"learning_rate": 6.198554127491622e-05,
"loss": 0.029,
"step": 2260
},
{
"epoch": 1.5463215258855585,
"grad_norm": 0.34444141387939453,
"learning_rate": 6.166422911169923e-05,
"loss": 0.0269,
"step": 2270
},
{
"epoch": 1.553133514986376,
"grad_norm": 0.2883770763874054,
"learning_rate": 6.1342406717314e-05,
"loss": 0.0303,
"step": 2280
},
{
"epoch": 1.5599455040871935,
"grad_norm": 0.2837648093700409,
"learning_rate": 6.102008816931466e-05,
"loss": 0.0272,
"step": 2290
},
{
"epoch": 1.5667574931880108,
"grad_norm": 0.2236020863056183,
"learning_rate": 6.069728756695866e-05,
"loss": 0.0234,
"step": 2300
},
{
"epoch": 1.5735694822888284,
"grad_norm": 0.4470672607421875,
"learning_rate": 6.037401903059008e-05,
"loss": 0.032,
"step": 2310
},
{
"epoch": 1.5803814713896458,
"grad_norm": 0.3020336627960205,
"learning_rate": 6.005029670102195e-05,
"loss": 0.0227,
"step": 2320
},
{
"epoch": 1.5871934604904632,
"grad_norm": 0.27960023283958435,
"learning_rate": 5.972613473891766e-05,
"loss": 0.0335,
"step": 2330
},
{
"epoch": 1.5940054495912808,
"grad_norm": 0.308479368686676,
"learning_rate": 5.940154732417158e-05,
"loss": 0.0297,
"step": 2340
},
{
"epoch": 1.6008174386920981,
"grad_norm": 0.3311978876590729,
"learning_rate": 5.907654865528876e-05,
"loss": 0.0312,
"step": 2350
},
{
"epoch": 1.6076294277929155,
"grad_norm": 0.26757732033729553,
"learning_rate": 5.875115294876381e-05,
"loss": 0.0234,
"step": 2360
},
{
"epoch": 1.614441416893733,
"grad_norm": 0.40103888511657715,
"learning_rate": 5.842537443845908e-05,
"loss": 0.0274,
"step": 2370
},
{
"epoch": 1.6212534059945503,
"grad_norm": 0.17837531864643097,
"learning_rate": 5.809922737498198e-05,
"loss": 0.0225,
"step": 2380
},
{
"epoch": 1.6280653950953679,
"grad_norm": 0.42968425154685974,
"learning_rate": 5.777272602506165e-05,
"loss": 0.027,
"step": 2390
},
{
"epoch": 1.6348773841961854,
"grad_norm": 0.24213114380836487,
"learning_rate": 5.744588467092483e-05,
"loss": 0.0265,
"step": 2400
},
{
"epoch": 1.6416893732970026,
"grad_norm": 0.3060871660709381,
"learning_rate": 5.7118717609671194e-05,
"loss": 0.0235,
"step": 2410
},
{
"epoch": 1.6485013623978202,
"grad_norm": 0.20384085178375244,
"learning_rate": 5.679123915264786e-05,
"loss": 0.0261,
"step": 2420
},
{
"epoch": 1.6553133514986376,
"grad_norm": 0.3139786720275879,
"learning_rate": 5.646346362482342e-05,
"loss": 0.0225,
"step": 2430
},
{
"epoch": 1.662125340599455,
"grad_norm": 0.2353772073984146,
"learning_rate": 5.613540536416132e-05,
"loss": 0.0273,
"step": 2440
},
{
"epoch": 1.6689373297002725,
"grad_norm": 0.3663155436515808,
"learning_rate": 5.5807078720992645e-05,
"loss": 0.0237,
"step": 2450
},
{
"epoch": 1.67574931880109,
"grad_norm": 0.4667767882347107,
"learning_rate": 5.547849805738836e-05,
"loss": 0.0308,
"step": 2460
},
{
"epoch": 1.6825613079019073,
"grad_norm": 0.2913496792316437,
"learning_rate": 5.514967774653118e-05,
"loss": 0.0222,
"step": 2470
},
{
"epoch": 1.6893732970027249,
"grad_norm": 0.22617073357105255,
"learning_rate": 5.482063217208674e-05,
"loss": 0.0251,
"step": 2480
},
{
"epoch": 1.6961852861035422,
"grad_norm": 0.3499128222465515,
"learning_rate": 5.449137572757439e-05,
"loss": 0.0216,
"step": 2490
},
{
"epoch": 1.7029972752043596,
"grad_norm": 0.24365057051181793,
"learning_rate": 5.4161922815737696e-05,
"loss": 0.0268,
"step": 2500
},
{
"epoch": 1.7098092643051772,
"grad_norm": 0.21294479072093964,
"learning_rate": 5.3832287847914276e-05,
"loss": 0.0273,
"step": 2510
},
{
"epoch": 1.7166212534059946,
"grad_norm": 0.31520646810531616,
"learning_rate": 5.35024852434055e-05,
"loss": 0.0258,
"step": 2520
},
{
"epoch": 1.723433242506812,
"grad_norm": 0.4261656403541565,
"learning_rate": 5.317252942884567e-05,
"loss": 0.0231,
"step": 2530
},
{
"epoch": 1.7302452316076296,
"grad_norm": 0.29408591985702515,
"learning_rate": 5.284243483757109e-05,
"loss": 0.0304,
"step": 2540
},
{
"epoch": 1.7370572207084467,
"grad_norm": 0.333383172750473,
"learning_rate": 5.2512215908988484e-05,
"loss": 0.0295,
"step": 2550
},
{
"epoch": 1.7438692098092643,
"grad_norm": 0.2510589361190796,
"learning_rate": 5.218188708794357e-05,
"loss": 0.0254,
"step": 2560
},
{
"epoch": 1.750681198910082,
"grad_norm": 0.3071255385875702,
"learning_rate": 5.18514628240891e-05,
"loss": 0.0233,
"step": 2570
},
{
"epoch": 1.757493188010899,
"grad_norm": 0.3328297436237335,
"learning_rate": 5.1520957571252795e-05,
"loss": 0.0237,
"step": 2580
},
{
"epoch": 1.7643051771117166,
"grad_norm": 0.2048969864845276,
"learning_rate": 5.1190385786805106e-05,
"loss": 0.0278,
"step": 2590
},
{
"epoch": 1.771117166212534,
"grad_norm": 0.4445406496524811,
"learning_rate": 5.085976193102677e-05,
"loss": 0.0247,
"step": 2600
},
{
"epoch": 1.7779291553133514,
"grad_norm": 0.2530488967895508,
"learning_rate": 5.052910046647634e-05,
"loss": 0.0218,
"step": 2610
},
{
"epoch": 1.784741144414169,
"grad_norm": 0.31554245948791504,
"learning_rate": 5.0198415857357464e-05,
"loss": 0.0237,
"step": 2620
},
{
"epoch": 1.7915531335149864,
"grad_norm": 0.2431655079126358,
"learning_rate": 4.9867722568886223e-05,
"loss": 0.0214,
"step": 2630
},
{
"epoch": 1.7983651226158037,
"grad_norm": 0.28798162937164307,
"learning_rate": 4.9537035066658314e-05,
"loss": 0.0213,
"step": 2640
},
{
"epoch": 1.8051771117166213,
"grad_norm": 0.25857627391815186,
"learning_rate": 4.920636781601638e-05,
"loss": 0.0272,
"step": 2650
},
{
"epoch": 1.8119891008174387,
"grad_norm": 0.2804415225982666,
"learning_rate": 4.88757352814172e-05,
"loss": 0.0288,
"step": 2660
},
{
"epoch": 1.818801089918256,
"grad_norm": 0.23555926978588104,
"learning_rate": 4.8545151925798924e-05,
"loss": 0.0247,
"step": 2670
},
{
"epoch": 1.8256130790190737,
"grad_norm": 0.3501521050930023,
"learning_rate": 4.821463220994848e-05,
"loss": 0.026,
"step": 2680
},
{
"epoch": 1.832425068119891,
"grad_norm": 0.3100302517414093,
"learning_rate": 4.788419059186895e-05,
"loss": 0.021,
"step": 2690
},
{
"epoch": 1.8392370572207084,
"grad_norm": 0.28045013546943665,
"learning_rate": 4.7553841526147205e-05,
"loss": 0.0257,
"step": 2700
},
{
"epoch": 1.846049046321526,
"grad_norm": 0.17547450959682465,
"learning_rate": 4.722359946332156e-05,
"loss": 0.023,
"step": 2710
},
{
"epoch": 1.8528610354223434,
"grad_norm": 0.2572614550590515,
"learning_rate": 4.6893478849249654e-05,
"loss": 0.0226,
"step": 2720
},
{
"epoch": 1.8596730245231607,
"grad_norm": 0.42476364970207214,
"learning_rate": 4.656349412447664e-05,
"loss": 0.023,
"step": 2730
},
{
"epoch": 1.8664850136239783,
"grad_norm": 0.37075158953666687,
"learning_rate": 4.623365972360337e-05,
"loss": 0.0239,
"step": 2740
},
{
"epoch": 1.8732970027247955,
"grad_norm": 0.27569836378097534,
"learning_rate": 4.590399007465503e-05,
"loss": 0.0216,
"step": 2750
},
{
"epoch": 1.880108991825613,
"grad_norm": 0.25869858264923096,
"learning_rate": 4.557449959845005e-05,
"loss": 0.024,
"step": 2760
},
{
"epoch": 1.8869209809264307,
"grad_norm": 0.2198791801929474,
"learning_rate": 4.524520270796927e-05,
"loss": 0.0213,
"step": 2770
},
{
"epoch": 1.8937329700272478,
"grad_norm": 0.3058468997478485,
"learning_rate": 4.491611380772545e-05,
"loss": 0.0218,
"step": 2780
},
{
"epoch": 1.9005449591280654,
"grad_norm": 0.2228512316942215,
"learning_rate": 4.458724729313318e-05,
"loss": 0.0218,
"step": 2790
},
{
"epoch": 1.9073569482288828,
"grad_norm": 0.2506347894668579,
"learning_rate": 4.42586175498792e-05,
"loss": 0.023,
"step": 2800
},
{
"epoch": 1.9141689373297002,
"grad_norm": 0.28511497378349304,
"learning_rate": 4.3930238953293094e-05,
"loss": 0.0211,
"step": 2810
},
{
"epoch": 1.9209809264305178,
"grad_norm": 0.2836903929710388,
"learning_rate": 4.360212586771847e-05,
"loss": 0.0174,
"step": 2820
},
{
"epoch": 1.9277929155313351,
"grad_norm": 0.2694113254547119,
"learning_rate": 4.327429264588463e-05,
"loss": 0.024,
"step": 2830
},
{
"epoch": 1.9346049046321525,
"grad_norm": 0.25238320231437683,
"learning_rate": 4.2946753628278725e-05,
"loss": 0.022,
"step": 2840
},
{
"epoch": 1.94141689373297,
"grad_norm": 0.22233974933624268,
"learning_rate": 4.2619523142518474e-05,
"loss": 0.0218,
"step": 2850
},
{
"epoch": 1.9482288828337875,
"grad_norm": 0.22567766904830933,
"learning_rate": 4.229261550272539e-05,
"loss": 0.0211,
"step": 2860
},
{
"epoch": 1.9550408719346049,
"grad_norm": 0.21269120275974274,
"learning_rate": 4.196604500889868e-05,
"loss": 0.0207,
"step": 2870
},
{
"epoch": 1.9618528610354224,
"grad_norm": 0.25701943039894104,
"learning_rate": 4.163982594628969e-05,
"loss": 0.0218,
"step": 2880
},
{
"epoch": 1.9686648501362398,
"grad_norm": 0.2941311299800873,
"learning_rate": 4.131397258477702e-05,
"loss": 0.0222,
"step": 2890
},
{
"epoch": 1.9754768392370572,
"grad_norm": 0.20397907495498657,
"learning_rate": 4.0988499178242315e-05,
"loss": 0.0205,
"step": 2900
},
{
"epoch": 1.9822888283378748,
"grad_norm": 0.21562394499778748,
"learning_rate": 4.066341996394678e-05,
"loss": 0.0288,
"step": 2910
},
{
"epoch": 1.989100817438692,
"grad_norm": 0.25813037157058716,
"learning_rate": 4.033874916190833e-05,
"loss": 0.0215,
"step": 2920
},
{
"epoch": 1.9959128065395095,
"grad_norm": 0.1991417109966278,
"learning_rate": 4.001450097427966e-05,
"loss": 0.019,
"step": 2930
},
{
"epoch": 2.002724795640327,
"grad_norm": 0.21835818886756897,
"learning_rate": 3.9690689584726894e-05,
"loss": 0.0249,
"step": 2940
},
{
"epoch": 2.0095367847411443,
"grad_norm": 0.24195794761180878,
"learning_rate": 3.936732915780923e-05,
"loss": 0.0177,
"step": 2950
},
{
"epoch": 2.016348773841962,
"grad_norm": 0.3374285101890564,
"learning_rate": 3.904443383835929e-05,
"loss": 0.0247,
"step": 2960
},
{
"epoch": 2.0231607629427795,
"grad_norm": 0.2824082374572754,
"learning_rate": 3.872201775086437e-05,
"loss": 0.0216,
"step": 2970
},
{
"epoch": 2.0299727520435966,
"grad_norm": 0.29006993770599365,
"learning_rate": 3.8400094998848616e-05,
"loss": 0.0206,
"step": 2980
},
{
"epoch": 2.036784741144414,
"grad_norm": 0.3308681547641754,
"learning_rate": 3.807867966425611e-05,
"loss": 0.0178,
"step": 2990
},
{
"epoch": 2.043596730245232,
"grad_norm": 0.24560880661010742,
"learning_rate": 3.775778580683481e-05,
"loss": 0.0226,
"step": 3000
},
{
"epoch": 2.050408719346049,
"grad_norm": 0.2389586716890335,
"learning_rate": 3.743742746352156e-05,
"loss": 0.021,
"step": 3010
},
{
"epoch": 2.0572207084468666,
"grad_norm": 0.35238826274871826,
"learning_rate": 3.711761864782817e-05,
"loss": 0.0251,
"step": 3020
},
{
"epoch": 2.0640326975476837,
"grad_norm": 0.2502613365650177,
"learning_rate": 3.679837334922825e-05,
"loss": 0.0201,
"step": 3030
},
{
"epoch": 2.0708446866485013,
"grad_norm": 0.2527748942375183,
"learning_rate": 3.647970553254538e-05,
"loss": 0.0211,
"step": 3040
},
{
"epoch": 2.077656675749319,
"grad_norm": 0.3349742293357849,
"learning_rate": 3.61616291373422e-05,
"loss": 0.0243,
"step": 3050
},
{
"epoch": 2.084468664850136,
"grad_norm": 0.2768033444881439,
"learning_rate": 3.584415807731065e-05,
"loss": 0.0229,
"step": 3060
},
{
"epoch": 2.0912806539509536,
"grad_norm": 0.21673381328582764,
"learning_rate": 3.552730623966337e-05,
"loss": 0.0223,
"step": 3070
},
{
"epoch": 2.0980926430517712,
"grad_norm": 0.20745591819286346,
"learning_rate": 3.521108748452617e-05,
"loss": 0.0196,
"step": 3080
},
{
"epoch": 2.1049046321525884,
"grad_norm": 0.27668702602386475,
"learning_rate": 3.489551564433186e-05,
"loss": 0.024,
"step": 3090
},
{
"epoch": 2.111716621253406,
"grad_norm": 0.2564879357814789,
"learning_rate": 3.4580604523215006e-05,
"loss": 0.0194,
"step": 3100
},
{
"epoch": 2.1185286103542236,
"grad_norm": 0.21311357617378235,
"learning_rate": 3.4266367896408216e-05,
"loss": 0.0291,
"step": 3110
},
{
"epoch": 2.1253405994550407,
"grad_norm": 0.21265241503715515,
"learning_rate": 3.3952819509639534e-05,
"loss": 0.019,
"step": 3120
},
{
"epoch": 2.1321525885558583,
"grad_norm": 0.25450852513313293,
"learning_rate": 3.3639973078531165e-05,
"loss": 0.0207,
"step": 3130
},
{
"epoch": 2.138964577656676,
"grad_norm": 0.24124109745025635,
"learning_rate": 3.332784228799947e-05,
"loss": 0.0195,
"step": 3140
},
{
"epoch": 2.145776566757493,
"grad_norm": 0.3012523055076599,
"learning_rate": 3.301644079165638e-05,
"loss": 0.0206,
"step": 3150
},
{
"epoch": 2.1525885558583107,
"grad_norm": 0.2553965151309967,
"learning_rate": 3.27057822112122e-05,
"loss": 0.0169,
"step": 3160
},
{
"epoch": 2.1594005449591283,
"grad_norm": 0.28278952836990356,
"learning_rate": 3.239588013587958e-05,
"loss": 0.0222,
"step": 3170
},
{
"epoch": 2.1662125340599454,
"grad_norm": 0.2095153033733368,
"learning_rate": 3.208674812177926e-05,
"loss": 0.0189,
"step": 3180
},
{
"epoch": 2.173024523160763,
"grad_norm": 0.30485105514526367,
"learning_rate": 3.177839969134698e-05,
"loss": 0.0219,
"step": 3190
},
{
"epoch": 2.1798365122615806,
"grad_norm": 0.35161760449409485,
"learning_rate": 3.1470848332742e-05,
"loss": 0.0217,
"step": 3200
},
{
"epoch": 2.1866485013623977,
"grad_norm": 0.24349473416805267,
"learning_rate": 3.116410749925708e-05,
"loss": 0.0222,
"step": 3210
},
{
"epoch": 2.1934604904632153,
"grad_norm": 0.15715332329273224,
"learning_rate": 3.085819060872995e-05,
"loss": 0.0179,
"step": 3220
},
{
"epoch": 2.2002724795640325,
"grad_norm": 0.22666095197200775,
"learning_rate": 3.055311104295648e-05,
"loss": 0.0198,
"step": 3230
},
{
"epoch": 2.20708446866485,
"grad_norm": 0.22959241271018982,
"learning_rate": 3.024888214710517e-05,
"loss": 0.0162,
"step": 3240
},
{
"epoch": 2.2138964577656677,
"grad_norm": 0.22255851328372955,
"learning_rate": 2.994551722913349e-05,
"loss": 0.0159,
"step": 3250
},
{
"epoch": 2.220708446866485,
"grad_norm": 0.2214617133140564,
"learning_rate": 2.9643029559205727e-05,
"loss": 0.0225,
"step": 3260
},
{
"epoch": 2.2275204359673024,
"grad_norm": 0.1882133036851883,
"learning_rate": 2.934143236911248e-05,
"loss": 0.0179,
"step": 3270
},
{
"epoch": 2.23433242506812,
"grad_norm": 0.4131694436073303,
"learning_rate": 2.90407388516919e-05,
"loss": 0.0194,
"step": 3280
},
{
"epoch": 2.241144414168937,
"grad_norm": 0.3278559148311615,
"learning_rate": 2.8740962160252495e-05,
"loss": 0.02,
"step": 3290
},
{
"epoch": 2.2479564032697548,
"grad_norm": 0.21860350668430328,
"learning_rate": 2.844211540799797e-05,
"loss": 0.0177,
"step": 3300
},
{
"epoch": 2.2547683923705724,
"grad_norm": 0.2650901675224304,
"learning_rate": 2.8144211667453368e-05,
"loss": 0.0183,
"step": 3310
},
{
"epoch": 2.2615803814713895,
"grad_norm": 0.2598157823085785,
"learning_rate": 2.7847263969893344e-05,
"loss": 0.016,
"step": 3320
},
{
"epoch": 2.268392370572207,
"grad_norm": 0.21535956859588623,
"learning_rate": 2.7551285304772206e-05,
"loss": 0.0173,
"step": 3330
},
{
"epoch": 2.2752043596730247,
"grad_norm": 0.19479890167713165,
"learning_rate": 2.7256288619155567e-05,
"loss": 0.0181,
"step": 3340
},
{
"epoch": 2.282016348773842,
"grad_norm": 0.21761104464530945,
"learning_rate": 2.6962286817154158e-05,
"loss": 0.0208,
"step": 3350
},
{
"epoch": 2.2888283378746594,
"grad_norm": 0.18495774269104004,
"learning_rate": 2.6669292759359166e-05,
"loss": 0.0173,
"step": 3360
},
{
"epoch": 2.2956403269754766,
"grad_norm": 0.2476925402879715,
"learning_rate": 2.637731926227993e-05,
"loss": 0.0231,
"step": 3370
},
{
"epoch": 2.302452316076294,
"grad_norm": 0.3167796730995178,
"learning_rate": 2.6086379097783033e-05,
"loss": 0.0219,
"step": 3380
},
{
"epoch": 2.309264305177112,
"grad_norm": 0.3013063371181488,
"learning_rate": 2.579648499253377e-05,
"loss": 0.0183,
"step": 3390
},
{
"epoch": 2.316076294277929,
"grad_norm": 0.2609173357486725,
"learning_rate": 2.5507649627439466e-05,
"loss": 0.0214,
"step": 3400
},
{
"epoch": 2.3228882833787465,
"grad_norm": 0.1826580911874771,
"learning_rate": 2.5219885637094653e-05,
"loss": 0.0191,
"step": 3410
},
{
"epoch": 2.329700272479564,
"grad_norm": 0.21605326235294342,
"learning_rate": 2.4933205609228533e-05,
"loss": 0.0209,
"step": 3420
},
{
"epoch": 2.3365122615803813,
"grad_norm": 0.23476341366767883,
"learning_rate": 2.464762208415419e-05,
"loss": 0.018,
"step": 3430
},
{
"epoch": 2.343324250681199,
"grad_norm": 0.1948312371969223,
"learning_rate": 2.4363147554220213e-05,
"loss": 0.0145,
"step": 3440
},
{
"epoch": 2.3501362397820165,
"grad_norm": 0.20815841853618622,
"learning_rate": 2.407979446326411e-05,
"loss": 0.0196,
"step": 3450
},
{
"epoch": 2.3569482288828336,
"grad_norm": 0.23515887558460236,
"learning_rate": 2.379757520606799e-05,
"loss": 0.0203,
"step": 3460
},
{
"epoch": 2.363760217983651,
"grad_norm": 0.2154649794101715,
"learning_rate": 2.3516502127816455e-05,
"loss": 0.0175,
"step": 3470
},
{
"epoch": 2.370572207084469,
"grad_norm": 0.23456346988677979,
"learning_rate": 2.323658752355647e-05,
"loss": 0.0173,
"step": 3480
},
{
"epoch": 2.377384196185286,
"grad_norm": 0.21330733597278595,
"learning_rate": 2.2957843637659654e-05,
"loss": 0.0178,
"step": 3490
},
{
"epoch": 2.3841961852861036,
"grad_norm": 0.19244815409183502,
"learning_rate": 2.2680282663286552e-05,
"loss": 0.0229,
"step": 3500
},
{
"epoch": 2.391008174386921,
"grad_norm": 0.20745113492012024,
"learning_rate": 2.2403916741853364e-05,
"loss": 0.0173,
"step": 3510
},
{
"epoch": 2.3978201634877383,
"grad_norm": 0.19936102628707886,
"learning_rate": 2.2128757962500817e-05,
"loss": 0.0172,
"step": 3520
},
{
"epoch": 2.404632152588556,
"grad_norm": 0.2921135127544403,
"learning_rate": 2.1854818361565275e-05,
"loss": 0.0171,
"step": 3530
},
{
"epoch": 2.4114441416893735,
"grad_norm": 0.2126695066690445,
"learning_rate": 2.1582109922052364e-05,
"loss": 0.0199,
"step": 3540
},
{
"epoch": 2.4182561307901906,
"grad_norm": 0.161210298538208,
"learning_rate": 2.1310644573112635e-05,
"loss": 0.0202,
"step": 3550
},
{
"epoch": 2.4250681198910082,
"grad_norm": 0.1921418011188507,
"learning_rate": 2.1040434189519924e-05,
"loss": 0.0168,
"step": 3560
},
{
"epoch": 2.431880108991826,
"grad_norm": 0.17595872282981873,
"learning_rate": 2.0771490591151733e-05,
"loss": 0.0208,
"step": 3570
},
{
"epoch": 2.438692098092643,
"grad_norm": 0.18638396263122559,
"learning_rate": 2.0503825542472317e-05,
"loss": 0.0214,
"step": 3580
},
{
"epoch": 2.4455040871934606,
"grad_norm": 0.24000069499015808,
"learning_rate": 2.023745075201805e-05,
"loss": 0.0155,
"step": 3590
},
{
"epoch": 2.452316076294278,
"grad_norm": 0.13929104804992676,
"learning_rate": 1.9972377871885157e-05,
"loss": 0.0201,
"step": 3600
},
{
"epoch": 2.4591280653950953,
"grad_norm": 0.23332083225250244,
"learning_rate": 1.970861849722017e-05,
"loss": 0.0159,
"step": 3610
},
{
"epoch": 2.465940054495913,
"grad_norm": 0.2451397329568863,
"learning_rate": 1.9446184165712587e-05,
"loss": 0.0172,
"step": 3620
},
{
"epoch": 2.47275204359673,
"grad_norm": 0.1490626186132431,
"learning_rate": 1.9185086357090214e-05,
"loss": 0.018,
"step": 3630
},
{
"epoch": 2.4795640326975477,
"grad_norm": 0.16023452579975128,
"learning_rate": 1.8925336492617057e-05,
"loss": 0.0167,
"step": 3640
},
{
"epoch": 2.4863760217983653,
"grad_norm": 0.2159489542245865,
"learning_rate": 1.8666945934593666e-05,
"loss": 0.0185,
"step": 3650
},
{
"epoch": 2.4931880108991824,
"grad_norm": 0.18671192228794098,
"learning_rate": 1.8409925985860126e-05,
"loss": 0.0129,
"step": 3660
},
{
"epoch": 2.5,
"grad_norm": 0.20349836349487305,
"learning_rate": 1.8154287889301603e-05,
"loss": 0.0177,
"step": 3670
},
{
"epoch": 2.5068119891008176,
"grad_norm": 0.18601705133914948,
"learning_rate": 1.7900042827356612e-05,
"loss": 0.0205,
"step": 3680
},
{
"epoch": 2.5136239782016347,
"grad_norm": 0.22594991326332092,
"learning_rate": 1.76472019215278e-05,
"loss": 0.0205,
"step": 3690
},
{
"epoch": 2.5204359673024523,
"grad_norm": 0.18238820135593414,
"learning_rate": 1.739577623189545e-05,
"loss": 0.0142,
"step": 3700
},
{
"epoch": 2.5272479564032695,
"grad_norm": 0.1694435328245163,
"learning_rate": 1.7145776756633768e-05,
"loss": 0.022,
"step": 3710
},
{
"epoch": 2.534059945504087,
"grad_norm": 0.2308904379606247,
"learning_rate": 1.6897214431529646e-05,
"loss": 0.0166,
"step": 3720
},
{
"epoch": 2.5408719346049047,
"grad_norm": 0.18409192562103271,
"learning_rate": 1.6650100129504475e-05,
"loss": 0.0132,
"step": 3730
},
{
"epoch": 2.547683923705722,
"grad_norm": 0.17650723457336426,
"learning_rate": 1.6404444660138335e-05,
"loss": 0.0197,
"step": 3740
},
{
"epoch": 2.5544959128065394,
"grad_norm": 0.24465468525886536,
"learning_rate": 1.616025876919725e-05,
"loss": 0.0163,
"step": 3750
},
{
"epoch": 2.561307901907357,
"grad_norm": 0.19395938515663147,
"learning_rate": 1.5917553138163172e-05,
"loss": 0.0176,
"step": 3760
},
{
"epoch": 2.568119891008174,
"grad_norm": 0.19339482486248016,
"learning_rate": 1.5676338383766632e-05,
"loss": 0.0196,
"step": 3770
},
{
"epoch": 2.5749318801089918,
"grad_norm": 0.18326933681964874,
"learning_rate": 1.5436625057522447e-05,
"loss": 0.0154,
"step": 3780
},
{
"epoch": 2.5817438692098094,
"grad_norm": 0.17008966207504272,
"learning_rate": 1.519842364526804e-05,
"loss": 0.0137,
"step": 3790
},
{
"epoch": 2.5885558583106265,
"grad_norm": 0.1793888807296753,
"learning_rate": 1.4961744566704855e-05,
"loss": 0.0165,
"step": 3800
},
{
"epoch": 2.595367847411444,
"grad_norm": 0.1575794667005539,
"learning_rate": 1.4726598174942551e-05,
"loss": 0.0147,
"step": 3810
},
{
"epoch": 2.6021798365122617,
"grad_norm": 0.24643422663211823,
"learning_rate": 1.4492994756046035e-05,
"loss": 0.0207,
"step": 3820
},
{
"epoch": 2.608991825613079,
"grad_norm": 0.1690363883972168,
"learning_rate": 1.4260944528585645e-05,
"loss": 0.0179,
"step": 3830
},
{
"epoch": 2.6158038147138964,
"grad_norm": 0.229860320687294,
"learning_rate": 1.4030457643190048e-05,
"loss": 0.0138,
"step": 3840
},
{
"epoch": 2.622615803814714,
"grad_norm": 0.1885327398777008,
"learning_rate": 1.3801544182102311e-05,
"loss": 0.016,
"step": 3850
},
{
"epoch": 2.629427792915531,
"grad_norm": 0.1853918582201004,
"learning_rate": 1.3574214158738763e-05,
"loss": 0.0178,
"step": 3860
},
{
"epoch": 2.636239782016349,
"grad_norm": 0.17312754690647125,
"learning_rate": 1.3348477517251101e-05,
"loss": 0.0159,
"step": 3870
},
{
"epoch": 2.6430517711171664,
"grad_norm": 0.14870062470436096,
"learning_rate": 1.312434413209131e-05,
"loss": 0.0179,
"step": 3880
},
{
"epoch": 2.6498637602179835,
"grad_norm": 0.35962745547294617,
"learning_rate": 1.2901823807579727e-05,
"loss": 0.0148,
"step": 3890
},
{
"epoch": 2.656675749318801,
"grad_norm": 0.14894793927669525,
"learning_rate": 1.2680926277476245e-05,
"loss": 0.017,
"step": 3900
},
{
"epoch": 2.6634877384196187,
"grad_norm": 0.20324669778347015,
"learning_rate": 1.2461661204554397e-05,
"loss": 0.0166,
"step": 3910
},
{
"epoch": 2.670299727520436,
"grad_norm": 0.2097160369157791,
"learning_rate": 1.2244038180178835e-05,
"loss": 0.0161,
"step": 3920
},
{
"epoch": 2.6771117166212535,
"grad_norm": 0.17441681027412415,
"learning_rate": 1.2028066723885612e-05,
"loss": 0.0163,
"step": 3930
},
{
"epoch": 2.683923705722071,
"grad_norm": 0.18608888983726501,
"learning_rate": 1.1813756282965888e-05,
"loss": 0.0176,
"step": 3940
},
{
"epoch": 2.690735694822888,
"grad_norm": 0.18648923933506012,
"learning_rate": 1.1601116232052638e-05,
"loss": 0.0168,
"step": 3950
},
{
"epoch": 2.697547683923706,
"grad_norm": 0.15261727571487427,
"learning_rate": 1.1390155872710517e-05,
"loss": 0.0149,
"step": 3960
},
{
"epoch": 2.7043596730245234,
"grad_norm": 0.2162063866853714,
"learning_rate": 1.1180884433029087e-05,
"loss": 0.0168,
"step": 3970
},
{
"epoch": 2.7111716621253406,
"grad_norm": 0.24533921480178833,
"learning_rate": 1.097331106721904e-05,
"loss": 0.0147,
"step": 3980
},
{
"epoch": 2.717983651226158,
"grad_norm": 0.20895080268383026,
"learning_rate": 1.0767444855211862e-05,
"loss": 0.015,
"step": 3990
},
{
"epoch": 2.7247956403269757,
"grad_norm": 0.2006479650735855,
"learning_rate": 1.0563294802262558e-05,
"loss": 0.0173,
"step": 4000
},
{
"epoch": 2.731607629427793,
"grad_norm": 0.16398422420024872,
"learning_rate": 1.0360869838555809e-05,
"loss": 0.0169,
"step": 4010
},
{
"epoch": 2.7384196185286105,
"grad_norm": 0.22024202346801758,
"learning_rate": 1.0160178818815313e-05,
"loss": 0.015,
"step": 4020
},
{
"epoch": 2.7452316076294276,
"grad_norm": 0.1872708946466446,
"learning_rate": 9.961230521916387e-06,
"loss": 0.0168,
"step": 4030
},
{
"epoch": 2.7520435967302452,
"grad_norm": 0.2346954643726349,
"learning_rate": 9.764033650502074e-06,
"loss": 0.0176,
"step": 4040
},
{
"epoch": 2.758855585831063,
"grad_norm": 0.15068836510181427,
"learning_rate": 9.568596830602344e-06,
"loss": 0.0137,
"step": 4050
},
{
"epoch": 2.76566757493188,
"grad_norm": 0.20182640850543976,
"learning_rate": 9.37492861125681e-06,
"loss": 0.0181,
"step": 4060
},
{
"epoch": 2.7724795640326976,
"grad_norm": 0.1375190019607544,
"learning_rate": 9.183037464140804e-06,
"loss": 0.0158,
"step": 4070
},
{
"epoch": 2.7792915531335147,
"grad_norm": 0.25182825326919556,
"learning_rate": 8.992931783194735e-06,
"loss": 0.0134,
"step": 4080
},
{
"epoch": 2.7861035422343323,
"grad_norm": 0.18647728860378265,
"learning_rate": 8.80461988425696e-06,
"loss": 0.0136,
"step": 4090
},
{
"epoch": 2.79291553133515,
"grad_norm": 0.16191458702087402,
"learning_rate": 8.618110004699974e-06,
"loss": 0.0164,
"step": 4100
},
{
"epoch": 2.799727520435967,
"grad_norm": 0.18361864984035492,
"learning_rate": 8.4334103030701e-06,
"loss": 0.0155,
"step": 4110
},
{
"epoch": 2.8065395095367847,
"grad_norm": 0.21431824564933777,
"learning_rate": 8.25052885873066e-06,
"loss": 0.0154,
"step": 4120
},
{
"epoch": 2.8133514986376023,
"grad_norm": 0.18994954228401184,
"learning_rate": 8.06947367150846e-06,
"loss": 0.016,
"step": 4130
},
{
"epoch": 2.8201634877384194,
"grad_norm": 0.21481618285179138,
"learning_rate": 7.890252661343938e-06,
"loss": 0.0166,
"step": 4140
},
{
"epoch": 2.826975476839237,
"grad_norm": 0.11670587211847305,
"learning_rate": 7.712873667944681e-06,
"loss": 0.0142,
"step": 4150
},
{
"epoch": 2.8337874659400546,
"grad_norm": 0.19601042568683624,
"learning_rate": 7.537344450442469e-06,
"loss": 0.0104,
"step": 4160
},
{
"epoch": 2.8405994550408717,
"grad_norm": 0.15036450326442719,
"learning_rate": 7.36367268705393e-06,
"loss": 0.0174,
"step": 4170
},
{
"epoch": 2.8474114441416893,
"grad_norm": 0.23941321671009064,
"learning_rate": 7.1918659747446e-06,
"loss": 0.0191,
"step": 4180
},
{
"epoch": 2.854223433242507,
"grad_norm": 0.1950898915529251,
"learning_rate": 7.021931828896666e-06,
"loss": 0.018,
"step": 4190
},
{
"epoch": 2.861035422343324,
"grad_norm": 0.23307918012142181,
"learning_rate": 6.8538776829801584e-06,
"loss": 0.0127,
"step": 4200
},
{
"epoch": 2.8678474114441417,
"grad_norm": 0.23717965185642242,
"learning_rate": 6.687710888227849e-06,
"loss": 0.0125,
"step": 4210
},
{
"epoch": 2.8746594005449593,
"grad_norm": 0.18568864464759827,
"learning_rate": 6.5234387133136565e-06,
"loss": 0.0132,
"step": 4220
},
{
"epoch": 2.8814713896457764,
"grad_norm": 0.18601331114768982,
"learning_rate": 6.361068344034665e-06,
"loss": 0.0156,
"step": 4230
},
{
"epoch": 2.888283378746594,
"grad_norm": 0.1895252913236618,
"learning_rate": 6.200606882996846e-06,
"loss": 0.0144,
"step": 4240
},
{
"epoch": 2.8950953678474116,
"grad_norm": 0.13856425881385803,
"learning_rate": 6.042061349304312e-06,
"loss": 0.0164,
"step": 4250
},
{
"epoch": 2.9019073569482288,
"grad_norm": 0.14244164526462555,
"learning_rate": 5.885438678252342e-06,
"loss": 0.0178,
"step": 4260
},
{
"epoch": 2.9087193460490464,
"grad_norm": 0.10831771790981293,
"learning_rate": 5.730745721023939e-06,
"loss": 0.0135,
"step": 4270
},
{
"epoch": 2.915531335149864,
"grad_norm": 0.2154112458229065,
"learning_rate": 5.577989244390192e-06,
"loss": 0.014,
"step": 4280
},
{
"epoch": 2.922343324250681,
"grad_norm": 0.1787579506635666,
"learning_rate": 5.4271759304142635e-06,
"loss": 0.0122,
"step": 4290
},
{
"epoch": 2.9291553133514987,
"grad_norm": 0.20089909434318542,
"learning_rate": 5.278312376159051e-06,
"loss": 0.0147,
"step": 4300
},
{
"epoch": 2.9359673024523163,
"grad_norm": 0.20588675141334534,
"learning_rate": 5.1314050933986944e-06,
"loss": 0.0134,
"step": 4310
},
{
"epoch": 2.9427792915531334,
"grad_norm": 0.15475359559059143,
"learning_rate": 4.986460508333634e-06,
"loss": 0.0171,
"step": 4320
},
{
"epoch": 2.949591280653951,
"grad_norm": 0.12290208041667938,
"learning_rate": 4.843484961309597e-06,
"loss": 0.0108,
"step": 4330
},
{
"epoch": 2.9564032697547686,
"grad_norm": 0.23685646057128906,
"learning_rate": 4.702484706540161e-06,
"loss": 0.015,
"step": 4340
},
{
"epoch": 2.963215258855586,
"grad_norm": 0.17012353241443634,
"learning_rate": 4.563465911833259e-06,
"loss": 0.0144,
"step": 4350
},
{
"epoch": 2.9700272479564034,
"grad_norm": 0.15839093923568726,
"learning_rate": 4.426434658321344e-06,
"loss": 0.0118,
"step": 4360
},
{
"epoch": 2.976839237057221,
"grad_norm": 0.14717762172222137,
"learning_rate": 4.2913969401953466e-06,
"loss": 0.0135,
"step": 4370
},
{
"epoch": 2.983651226158038,
"grad_norm": 0.16831554472446442,
"learning_rate": 4.15835866444253e-06,
"loss": 0.013,
"step": 4380
},
{
"epoch": 2.9904632152588557,
"grad_norm": 0.13316653668880463,
"learning_rate": 4.027325650588043e-06,
"loss": 0.0167,
"step": 4390
},
{
"epoch": 2.997275204359673,
"grad_norm": 0.2631996273994446,
"learning_rate": 3.898303630440419e-06,
"loss": 0.0178,
"step": 4400
},
{
"epoch": 3.0040871934604905,
"grad_norm": 0.16159358620643616,
"learning_rate": 3.7712982478407877e-06,
"loss": 0.0169,
"step": 4410
},
{
"epoch": 3.010899182561308,
"grad_norm": 0.16235774755477905,
"learning_rate": 3.6463150584160053e-06,
"loss": 0.0171,
"step": 4420
},
{
"epoch": 3.017711171662125,
"grad_norm": 0.11211927980184555,
"learning_rate": 3.5233595293356957e-06,
"loss": 0.0117,
"step": 4430
},
{
"epoch": 3.024523160762943,
"grad_norm": 0.18224704265594482,
"learning_rate": 3.4024370390730033e-06,
"loss": 0.017,
"step": 4440
},
{
"epoch": 3.0313351498637604,
"grad_norm": 0.18648995459079742,
"learning_rate": 3.2835528771693992e-06,
"loss": 0.0144,
"step": 4450
},
{
"epoch": 3.0381471389645776,
"grad_norm": 0.1381874680519104,
"learning_rate": 3.1667122440032505e-06,
"loss": 0.0131,
"step": 4460
},
{
"epoch": 3.044959128065395,
"grad_norm": 0.13673441112041473,
"learning_rate": 3.051920250562351e-06,
"loss": 0.0126,
"step": 4470
},
{
"epoch": 3.0517711171662127,
"grad_norm": 0.17434169352054596,
"learning_rate": 2.939181918220385e-06,
"loss": 0.0136,
"step": 4480
},
{
"epoch": 3.05858310626703,
"grad_norm": 0.17766402661800385,
"learning_rate": 2.8285021785172226e-06,
"loss": 0.0137,
"step": 4490
},
{
"epoch": 3.0653950953678475,
"grad_norm": 0.22053247690200806,
"learning_rate": 2.7198858729432288e-06,
"loss": 0.0145,
"step": 4500
},
{
"epoch": 3.0722070844686646,
"grad_norm": 0.19397376477718353,
"learning_rate": 2.6133377527274905e-06,
"loss": 0.0149,
"step": 4510
},
{
"epoch": 3.0790190735694822,
"grad_norm": 0.10889869183301926,
"learning_rate": 2.5088624786299366e-06,
"loss": 0.0123,
"step": 4520
},
{
"epoch": 3.0858310626703,
"grad_norm": 0.1636773943901062,
"learning_rate": 2.406464620737531e-06,
"loss": 0.0127,
"step": 4530
},
{
"epoch": 3.092643051771117,
"grad_norm": 0.13466012477874756,
"learning_rate": 2.3061486582642734e-06,
"loss": 0.0135,
"step": 4540
},
{
"epoch": 3.0994550408719346,
"grad_norm": 0.13705144822597504,
"learning_rate": 2.2079189793553667e-06,
"loss": 0.0126,
"step": 4550
},
{
"epoch": 3.106267029972752,
"grad_norm": 0.204204723238945,
"learning_rate": 2.111779880895165e-06,
"loss": 0.011,
"step": 4560
},
{
"epoch": 3.1130790190735693,
"grad_norm": 0.17932602763175964,
"learning_rate": 2.01773556831929e-06,
"loss": 0.0118,
"step": 4570
},
{
"epoch": 3.119891008174387,
"grad_norm": 0.18473263084888458,
"learning_rate": 1.9257901554306513e-06,
"loss": 0.0118,
"step": 4580
},
{
"epoch": 3.1267029972752045,
"grad_norm": 0.1656373143196106,
"learning_rate": 1.835947664219445e-06,
"loss": 0.0135,
"step": 4590
},
{
"epoch": 3.1335149863760217,
"grad_norm": 0.18078264594078064,
"learning_rate": 1.748212024687307e-06,
"loss": 0.0118,
"step": 4600
},
{
"epoch": 3.1403269754768393,
"grad_norm": 0.23627698421478271,
"learning_rate": 1.6625870746753147e-06,
"loss": 0.0151,
"step": 4610
},
{
"epoch": 3.147138964577657,
"grad_norm": 0.15048933029174805,
"learning_rate": 1.5790765596961853e-06,
"loss": 0.015,
"step": 4620
},
{
"epoch": 3.153950953678474,
"grad_norm": 0.2178574502468109,
"learning_rate": 1.4976841327703717e-06,
"loss": 0.0135,
"step": 4630
},
{
"epoch": 3.1607629427792916,
"grad_norm": 0.1818486899137497,
"learning_rate": 1.4184133542663014e-06,
"loss": 0.0122,
"step": 4640
},
{
"epoch": 3.167574931880109,
"grad_norm": 0.1654607504606247,
"learning_rate": 1.341267691744641e-06,
"loss": 0.0128,
"step": 4650
},
{
"epoch": 3.1743869209809263,
"grad_norm": 0.207754448056221,
"learning_rate": 1.2662505198065666e-06,
"loss": 0.0224,
"step": 4660
},
{
"epoch": 3.181198910081744,
"grad_norm": 0.16341248154640198,
"learning_rate": 1.193365119946216e-06,
"loss": 0.0153,
"step": 4670
},
{
"epoch": 3.1880108991825615,
"grad_norm": 0.1576090306043625,
"learning_rate": 1.1226146804070859e-06,
"loss": 0.0102,
"step": 4680
},
{
"epoch": 3.1948228882833787,
"grad_norm": 0.1799905151128769,
"learning_rate": 1.0540022960426111e-06,
"loss": 0.0134,
"step": 4690
},
{
"epoch": 3.2016348773841963,
"grad_norm": 0.23539473116397858,
"learning_rate": 9.875309681807443e-07,
"loss": 0.0171,
"step": 4700
},
{
"epoch": 3.2084468664850134,
"grad_norm": 0.1891935020685196,
"learning_rate": 9.232036044927061e-07,
"loss": 0.0136,
"step": 4710
},
{
"epoch": 3.215258855585831,
"grad_norm": 0.1502537727355957,
"learning_rate": 8.610230188657919e-07,
"loss": 0.0135,
"step": 4720
},
{
"epoch": 3.2220708446866486,
"grad_norm": 0.14308865368366241,
"learning_rate": 8.009919312802372e-07,
"loss": 0.0125,
"step": 4730
},
{
"epoch": 3.2288828337874658,
"grad_norm": 0.17500245571136475,
"learning_rate": 7.431129676902904e-07,
"loss": 0.01,
"step": 4740
},
{
"epoch": 3.2356948228882834,
"grad_norm": 0.12005341053009033,
"learning_rate": 6.873886599093215e-07,
"loss": 0.013,
"step": 4750
},
{
"epoch": 3.242506811989101,
"grad_norm": 0.22890767455101013,
"learning_rate": 6.338214454990776e-07,
"loss": 0.0165,
"step": 4760
},
{
"epoch": 3.249318801089918,
"grad_norm": 0.12232371419668198,
"learning_rate": 5.82413667663051e-07,
"loss": 0.0153,
"step": 4770
},
{
"epoch": 3.2561307901907357,
"grad_norm": 0.16289682686328888,
"learning_rate": 5.331675751439725e-07,
"loss": 0.0144,
"step": 4780
},
{
"epoch": 3.2629427792915533,
"grad_norm": 0.19280773401260376,
"learning_rate": 4.86085322125479e-07,
"loss": 0.012,
"step": 4790
},
{
"epoch": 3.2697547683923704,
"grad_norm": 0.12008260190486908,
"learning_rate": 4.411689681378284e-07,
"loss": 0.0148,
"step": 4800
},
{
"epoch": 3.276566757493188,
"grad_norm": 0.16363725066184998,
"learning_rate": 3.9842047796786466e-07,
"loss": 0.0125,
"step": 4810
},
{
"epoch": 3.2833787465940056,
"grad_norm": 0.16861975193023682,
"learning_rate": 3.578417215730323e-07,
"loss": 0.0114,
"step": 4820
},
{
"epoch": 3.290190735694823,
"grad_norm": 0.17795579135417938,
"learning_rate": 3.1943447399958027e-07,
"loss": 0.0136,
"step": 4830
},
{
"epoch": 3.2970027247956404,
"grad_norm": 0.07885803282260895,
"learning_rate": 2.8320041530495724e-07,
"loss": 0.0203,
"step": 4840
},
{
"epoch": 3.3038147138964575,
"grad_norm": 0.12531672418117523,
"learning_rate": 2.491411304842539e-07,
"loss": 0.0129,
"step": 4850
},
{
"epoch": 3.310626702997275,
"grad_norm": 0.17444051802158356,
"learning_rate": 2.1725810940094183e-07,
"loss": 0.012,
"step": 4860
},
{
"epoch": 3.3174386920980927,
"grad_norm": 0.14167378842830658,
"learning_rate": 1.8755274672164202e-07,
"loss": 0.0129,
"step": 4870
},
{
"epoch": 3.32425068119891,
"grad_norm": 0.12788553535938263,
"learning_rate": 1.600263418551573e-07,
"loss": 0.0148,
"step": 4880
},
{
"epoch": 3.3310626702997275,
"grad_norm": 0.2057434469461441,
"learning_rate": 1.346800988955954e-07,
"loss": 0.0154,
"step": 4890
},
{
"epoch": 3.337874659400545,
"grad_norm": 0.17330636084079742,
"learning_rate": 1.1151512656975005e-07,
"loss": 0.0116,
"step": 4900
},
{
"epoch": 3.344686648501362,
"grad_norm": 0.09420597553253174,
"learning_rate": 9.053243818853973e-08,
"loss": 0.0124,
"step": 4910
},
{
"epoch": 3.35149863760218,
"grad_norm": 0.15236696600914001,
"learning_rate": 7.173295160273763e-08,
"loss": 0.0124,
"step": 4920
},
{
"epoch": 3.3583106267029974,
"grad_norm": 0.15374703705310822,
"learning_rate": 5.511748916279258e-08,
"loss": 0.0132,
"step": 4930
},
{
"epoch": 3.3651226158038146,
"grad_norm": 0.273964524269104,
"learning_rate": 4.068677768285234e-08,
"loss": 0.0115,
"step": 4940
},
{
"epoch": 3.371934604904632,
"grad_norm": 0.11771193891763687,
"learning_rate": 2.844144840898344e-08,
"loss": 0.015,
"step": 4950
},
{
"epoch": 3.3787465940054497,
"grad_norm": 0.12935270369052887,
"learning_rate": 1.8382036991559936e-08,
"loss": 0.0112,
"step": 4960
},
{
"epoch": 3.385558583106267,
"grad_norm": 0.1538880318403244,
"learning_rate": 1.0508983461832156e-08,
"loss": 0.0166,
"step": 4970
},
{
"epoch": 3.3923705722070845,
"grad_norm": 0.10488380491733551,
"learning_rate": 4.822632212653222e-09,
"loss": 0.018,
"step": 4980
},
{
"epoch": 3.399182561307902,
"grad_norm": 0.18381240963935852,
"learning_rate": 1.3232319834632912e-09,
"loss": 0.0135,
"step": 4990
},
{
"epoch": 3.4059945504087192,
"grad_norm": 0.14481449127197266,
"learning_rate": 1.0935849353854721e-11,
"loss": 0.0137,
"step": 5000
},
{
"epoch": 3.412806539509537,
"grad_norm": 0.44966641068458557,
"learning_rate": 5.398064519110622e-05,
"loss": 0.0203,
"step": 5010
},
{
"epoch": 3.4196185286103544,
"grad_norm": 0.3312857747077942,
"learning_rate": 5.3815801579167394e-05,
"loss": 0.0175,
"step": 5020
},
{
"epoch": 3.4264305177111716,
"grad_norm": 0.5842679738998413,
"learning_rate": 5.365091623823382e-05,
"loss": 0.0213,
"step": 5030
},
{
"epoch": 3.433242506811989,
"grad_norm": 0.5707949995994568,
"learning_rate": 5.348599097146521e-05,
"loss": 0.0228,
"step": 5040
},
{
"epoch": 3.4400544959128068,
"grad_norm": 0.2389402836561203,
"learning_rate": 5.3321027582457836e-05,
"loss": 0.0237,
"step": 5050
},
{
"epoch": 3.446866485013624,
"grad_norm": 0.3142755329608917,
"learning_rate": 5.315602787522491e-05,
"loss": 0.0248,
"step": 5060
},
{
"epoch": 3.4536784741144415,
"grad_norm": 0.35478901863098145,
"learning_rate": 5.299099365417678e-05,
"loss": 0.0201,
"step": 5070
},
{
"epoch": 3.460490463215259,
"grad_norm": 0.4287269115447998,
"learning_rate": 5.2825926724101236e-05,
"loss": 0.026,
"step": 5080
},
{
"epoch": 3.4673024523160763,
"grad_norm": 0.5050956606864929,
"learning_rate": 5.26608288901438e-05,
"loss": 0.0295,
"step": 5090
},
{
"epoch": 3.474114441416894,
"grad_norm": 0.36942875385284424,
"learning_rate": 5.24957019577879e-05,
"loss": 0.0259,
"step": 5100
},
{
"epoch": 3.480926430517711,
"grad_norm": 0.40414538979530334,
"learning_rate": 5.2330547732835266e-05,
"loss": 0.0242,
"step": 5110
},
{
"epoch": 3.4877384196185286,
"grad_norm": 0.35221511125564575,
"learning_rate": 5.2165368021385996e-05,
"loss": 0.0304,
"step": 5120
},
{
"epoch": 3.494550408719346,
"grad_norm": 0.4094237685203552,
"learning_rate": 5.200016462981897e-05,
"loss": 0.0249,
"step": 5130
},
{
"epoch": 3.5013623978201633,
"grad_norm": 0.24707941710948944,
"learning_rate": 5.1834939364772015e-05,
"loss": 0.0219,
"step": 5140
},
{
"epoch": 3.508174386920981,
"grad_norm": 0.38713163137435913,
"learning_rate": 5.166969403312214e-05,
"loss": 0.0288,
"step": 5150
},
{
"epoch": 3.5149863760217985,
"grad_norm": 0.3290533721446991,
"learning_rate": 5.1504430441965844e-05,
"loss": 0.0262,
"step": 5160
},
{
"epoch": 3.5217983651226157,
"grad_norm": 0.3959462642669678,
"learning_rate": 5.133915039859923e-05,
"loss": 0.02,
"step": 5170
},
{
"epoch": 3.5286103542234333,
"grad_norm": 0.3446705937385559,
"learning_rate": 5.1173855710498444e-05,
"loss": 0.023,
"step": 5180
},
{
"epoch": 3.5354223433242504,
"grad_norm": 0.27180591225624084,
"learning_rate": 5.100854818529967e-05,
"loss": 0.0283,
"step": 5190
},
{
"epoch": 3.542234332425068,
"grad_norm": 0.39243829250335693,
"learning_rate": 5.084322963077951e-05,
"loss": 0.029,
"step": 5200
},
{
"epoch": 3.5490463215258856,
"grad_norm": 0.2588927149772644,
"learning_rate": 5.067790185483522e-05,
"loss": 0.0282,
"step": 5210
},
{
"epoch": 3.5558583106267028,
"grad_norm": 0.18376407027244568,
"learning_rate": 5.0512566665464844e-05,
"loss": 0.0272,
"step": 5220
},
{
"epoch": 3.5626702997275204,
"grad_norm": 0.29992175102233887,
"learning_rate": 5.034722587074755e-05,
"loss": 0.0257,
"step": 5230
},
{
"epoch": 3.569482288828338,
"grad_norm": 0.243015319108963,
"learning_rate": 5.018188127882375e-05,
"loss": 0.0229,
"step": 5240
},
{
"epoch": 3.576294277929155,
"grad_norm": 0.32886067032814026,
"learning_rate": 5.0016534697875417e-05,
"loss": 0.0211,
"step": 5250
},
{
"epoch": 3.5831062670299727,
"grad_norm": 0.4220637083053589,
"learning_rate": 4.9851187936106294e-05,
"loss": 0.0246,
"step": 5260
},
{
"epoch": 3.5899182561307903,
"grad_norm": 0.2974489629268646,
"learning_rate": 4.968584280172206e-05,
"loss": 0.0233,
"step": 5270
},
{
"epoch": 3.5967302452316074,
"grad_norm": 0.502668023109436,
"learning_rate": 4.95205011029106e-05,
"loss": 0.0285,
"step": 5280
},
{
"epoch": 3.603542234332425,
"grad_norm": 0.3639957904815674,
"learning_rate": 4.935516464782227e-05,
"loss": 0.0268,
"step": 5290
},
{
"epoch": 3.6103542234332426,
"grad_norm": 0.36707913875579834,
"learning_rate": 4.918983524455003e-05,
"loss": 0.0246,
"step": 5300
},
{
"epoch": 3.61716621253406,
"grad_norm": 0.22181017696857452,
"learning_rate": 4.9024514701109766e-05,
"loss": 0.0252,
"step": 5310
},
{
"epoch": 3.6239782016348774,
"grad_norm": 0.42766740918159485,
"learning_rate": 4.885920482542043e-05,
"loss": 0.0225,
"step": 5320
},
{
"epoch": 3.630790190735695,
"grad_norm": 0.26574602723121643,
"learning_rate": 4.869390742528438e-05,
"loss": 0.0208,
"step": 5330
},
{
"epoch": 3.637602179836512,
"grad_norm": 0.18494778871536255,
"learning_rate": 4.852862430836744e-05,
"loss": 0.0248,
"step": 5340
},
{
"epoch": 3.6444141689373297,
"grad_norm": 0.3686949610710144,
"learning_rate": 4.836335728217933e-05,
"loss": 0.0226,
"step": 5350
},
{
"epoch": 3.6512261580381473,
"grad_norm": 0.29411113262176514,
"learning_rate": 4.819810815405379e-05,
"loss": 0.0255,
"step": 5360
},
{
"epoch": 3.6580381471389645,
"grad_norm": 0.2379477620124817,
"learning_rate": 4.803287873112877e-05,
"loss": 0.0229,
"step": 5370
},
{
"epoch": 3.664850136239782,
"grad_norm": 0.3780541718006134,
"learning_rate": 4.786767082032681e-05,
"loss": 0.0234,
"step": 5380
},
{
"epoch": 3.6716621253405997,
"grad_norm": 0.24052190780639648,
"learning_rate": 4.77024862283351e-05,
"loss": 0.0229,
"step": 5390
},
{
"epoch": 3.678474114441417,
"grad_norm": 0.2713554799556732,
"learning_rate": 4.753732676158593e-05,
"loss": 0.0242,
"step": 5400
},
{
"epoch": 3.6852861035422344,
"grad_norm": 0.3661803603172302,
"learning_rate": 4.737219422623672e-05,
"loss": 0.0239,
"step": 5410
},
{
"epoch": 3.692098092643052,
"grad_norm": 0.4185531735420227,
"learning_rate": 4.720709042815044e-05,
"loss": 0.0204,
"step": 5420
},
{
"epoch": 3.698910081743869,
"grad_norm": 0.2620242238044739,
"learning_rate": 4.704201717287578e-05,
"loss": 0.0211,
"step": 5430
},
{
"epoch": 3.7057220708446867,
"grad_norm": 0.26090627908706665,
"learning_rate": 4.6876976265627404e-05,
"loss": 0.0224,
"step": 5440
},
{
"epoch": 3.7125340599455043,
"grad_norm": 0.2731458842754364,
"learning_rate": 4.671196951126626e-05,
"loss": 0.0269,
"step": 5450
},
{
"epoch": 3.7193460490463215,
"grad_norm": 0.31026485562324524,
"learning_rate": 4.654699871427971e-05,
"loss": 0.0218,
"step": 5460
},
{
"epoch": 3.726158038147139,
"grad_norm": 0.233415424823761,
"learning_rate": 4.6382065678762034e-05,
"loss": 0.0204,
"step": 5470
},
{
"epoch": 3.7329700272479567,
"grad_norm": 0.3344708979129791,
"learning_rate": 4.6217172208394424e-05,
"loss": 0.0197,
"step": 5480
},
{
"epoch": 3.739782016348774,
"grad_norm": 0.25975632667541504,
"learning_rate": 4.605232010642549e-05,
"loss": 0.0194,
"step": 5490
},
{
"epoch": 3.7465940054495914,
"grad_norm": 0.2950715720653534,
"learning_rate": 4.588751117565142e-05,
"loss": 0.0193,
"step": 5500
},
{
"epoch": 3.7534059945504086,
"grad_norm": 0.2784842252731323,
"learning_rate": 4.5722747218396214e-05,
"loss": 0.0251,
"step": 5510
},
{
"epoch": 3.760217983651226,
"grad_norm": 0.2176719456911087,
"learning_rate": 4.5558030036492194e-05,
"loss": 0.0204,
"step": 5520
},
{
"epoch": 3.7670299727520433,
"grad_norm": 0.28440573811531067,
"learning_rate": 4.539336143125999e-05,
"loss": 0.0265,
"step": 5530
},
{
"epoch": 3.773841961852861,
"grad_norm": 0.25604936480522156,
"learning_rate": 4.522874320348916e-05,
"loss": 0.0225,
"step": 5540
},
{
"epoch": 3.7806539509536785,
"grad_norm": 0.2565711438655853,
"learning_rate": 4.506417715341821e-05,
"loss": 0.019,
"step": 5550
},
{
"epoch": 3.7874659400544957,
"grad_norm": 0.2216968685388565,
"learning_rate": 4.489966508071511e-05,
"loss": 0.022,
"step": 5560
},
{
"epoch": 3.7942779291553133,
"grad_norm": 0.22490093111991882,
"learning_rate": 4.4735208784457575e-05,
"loss": 0.0197,
"step": 5570
},
{
"epoch": 3.801089918256131,
"grad_norm": 0.3565233647823334,
"learning_rate": 4.457081006311325e-05,
"loss": 0.0242,
"step": 5580
},
{
"epoch": 3.807901907356948,
"grad_norm": 0.270898699760437,
"learning_rate": 4.440647071452027e-05,
"loss": 0.0226,
"step": 5590
},
{
"epoch": 3.8147138964577656,
"grad_norm": 0.26380616426467896,
"learning_rate": 4.424219253586737e-05,
"loss": 0.0221,
"step": 5600
},
{
"epoch": 3.821525885558583,
"grad_norm": 0.3055083751678467,
"learning_rate": 4.407797732367443e-05,
"loss": 0.0251,
"step": 5610
},
{
"epoch": 3.8283378746594003,
"grad_norm": 0.2543126046657562,
"learning_rate": 4.391382687377268e-05,
"loss": 0.0248,
"step": 5620
},
{
"epoch": 3.835149863760218,
"grad_norm": 0.43203112483024597,
"learning_rate": 4.374974298128512e-05,
"loss": 0.0202,
"step": 5630
},
{
"epoch": 3.8419618528610355,
"grad_norm": 0.20501923561096191,
"learning_rate": 4.358572744060699e-05,
"loss": 0.0243,
"step": 5640
},
{
"epoch": 3.8487738419618527,
"grad_norm": 0.2543809413909912,
"learning_rate": 4.342178204538588e-05,
"loss": 0.0202,
"step": 5650
},
{
"epoch": 3.8555858310626703,
"grad_norm": 0.37627357244491577,
"learning_rate": 4.325790858850241e-05,
"loss": 0.0208,
"step": 5660
},
{
"epoch": 3.862397820163488,
"grad_norm": 0.19202812016010284,
"learning_rate": 4.309410886205043e-05,
"loss": 0.0228,
"step": 5670
},
{
"epoch": 3.869209809264305,
"grad_norm": 0.19026115536689758,
"learning_rate": 4.293038465731752e-05,
"loss": 0.0222,
"step": 5680
},
{
"epoch": 3.8760217983651226,
"grad_norm": 0.27928781509399414,
"learning_rate": 4.276673776476533e-05,
"loss": 0.0222,
"step": 5690
},
{
"epoch": 3.88283378746594,
"grad_norm": 0.30648189783096313,
"learning_rate": 4.260316997401007e-05,
"loss": 0.0186,
"step": 5700
},
{
"epoch": 3.8896457765667574,
"grad_norm": 0.2663455903530121,
"learning_rate": 4.243968307380293e-05,
"loss": 0.0237,
"step": 5710
},
{
"epoch": 3.896457765667575,
"grad_norm": 0.22592630982398987,
"learning_rate": 4.22762788520104e-05,
"loss": 0.0234,
"step": 5720
},
{
"epoch": 3.9032697547683926,
"grad_norm": 0.21950973570346832,
"learning_rate": 4.211295909559491e-05,
"loss": 0.0265,
"step": 5730
},
{
"epoch": 3.9100817438692097,
"grad_norm": 0.21050743758678436,
"learning_rate": 4.194972559059511e-05,
"loss": 0.0197,
"step": 5740
},
{
"epoch": 3.9168937329700273,
"grad_norm": 0.22975432872772217,
"learning_rate": 4.178658012210651e-05,
"loss": 0.0228,
"step": 5750
},
{
"epoch": 3.923705722070845,
"grad_norm": 0.349044531583786,
"learning_rate": 4.162352447426177e-05,
"loss": 0.0207,
"step": 5760
},
{
"epoch": 3.930517711171662,
"grad_norm": 0.22395232319831848,
"learning_rate": 4.146056043021135e-05,
"loss": 0.0203,
"step": 5770
},
{
"epoch": 3.9373297002724796,
"grad_norm": 0.24076318740844727,
"learning_rate": 4.1297689772103944e-05,
"loss": 0.0218,
"step": 5780
},
{
"epoch": 3.9441416893732972,
"grad_norm": 0.311708003282547,
"learning_rate": 4.113491428106694e-05,
"loss": 0.0247,
"step": 5790
},
{
"epoch": 3.9509536784741144,
"grad_norm": 0.265595406293869,
"learning_rate": 4.0972235737187055e-05,
"loss": 0.0181,
"step": 5800
},
{
"epoch": 3.957765667574932,
"grad_norm": 0.3528865873813629,
"learning_rate": 4.080965591949076e-05,
"loss": 0.0194,
"step": 5810
},
{
"epoch": 3.9645776566757496,
"grad_norm": 0.3113243877887726,
"learning_rate": 4.0647176605924924e-05,
"loss": 0.0225,
"step": 5820
},
{
"epoch": 3.9713896457765667,
"grad_norm": 0.3198659420013428,
"learning_rate": 4.0484799573337255e-05,
"loss": 0.0256,
"step": 5830
},
{
"epoch": 3.9782016348773843,
"grad_norm": 0.22167012095451355,
"learning_rate": 4.032252659745699e-05,
"loss": 0.0226,
"step": 5840
},
{
"epoch": 3.9850136239782015,
"grad_norm": 0.22256286442279816,
"learning_rate": 4.016035945287539e-05,
"loss": 0.0278,
"step": 5850
},
{
"epoch": 3.991825613079019,
"grad_norm": 0.2504684329032898,
"learning_rate": 3.999829991302635e-05,
"loss": 0.0245,
"step": 5860
},
{
"epoch": 3.9986376021798367,
"grad_norm": 0.2460675686597824,
"learning_rate": 3.983634975016707e-05,
"loss": 0.0214,
"step": 5870
},
{
"epoch": 4.005449591280654,
"grad_norm": 0.28262001276016235,
"learning_rate": 3.967451073535854e-05,
"loss": 0.0256,
"step": 5880
},
{
"epoch": 4.012261580381471,
"grad_norm": 0.3468887507915497,
"learning_rate": 3.951278463844633e-05,
"loss": 0.0251,
"step": 5890
},
{
"epoch": 4.0190735694822886,
"grad_norm": 0.3931543231010437,
"learning_rate": 3.935117322804111e-05,
"loss": 0.0234,
"step": 5900
},
{
"epoch": 4.025885558583107,
"grad_norm": 0.35787367820739746,
"learning_rate": 3.918967827149938e-05,
"loss": 0.0175,
"step": 5910
},
{
"epoch": 4.032697547683924,
"grad_norm": 0.24113652110099792,
"learning_rate": 3.9028301534904094e-05,
"loss": 0.0222,
"step": 5920
},
{
"epoch": 4.039509536784741,
"grad_norm": 0.265298068523407,
"learning_rate": 3.88670447830454e-05,
"loss": 0.0218,
"step": 5930
},
{
"epoch": 4.046321525885559,
"grad_norm": 0.3670673072338104,
"learning_rate": 3.870590977940132e-05,
"loss": 0.0195,
"step": 5940
},
{
"epoch": 4.053133514986376,
"grad_norm": 0.20872969925403595,
"learning_rate": 3.8544898286118404e-05,
"loss": 0.0181,
"step": 5950
},
{
"epoch": 4.059945504087193,
"grad_norm": 0.2174217849969864,
"learning_rate": 3.838401206399257e-05,
"loss": 0.0189,
"step": 5960
},
{
"epoch": 4.066757493188011,
"grad_norm": 0.25039082765579224,
"learning_rate": 3.822325287244975e-05,
"loss": 0.0203,
"step": 5970
},
{
"epoch": 4.073569482288828,
"grad_norm": 0.2662447690963745,
"learning_rate": 3.8062622469526725e-05,
"loss": 0.0195,
"step": 5980
},
{
"epoch": 4.080381471389646,
"grad_norm": 0.2717086374759674,
"learning_rate": 3.790212261185183e-05,
"loss": 0.0186,
"step": 5990
},
{
"epoch": 4.087193460490464,
"grad_norm": 0.2525738477706909,
"learning_rate": 3.7741755054625794e-05,
"loss": 0.0229,
"step": 6000
},
{
"epoch": 4.094005449591281,
"grad_norm": 0.20453284680843353,
"learning_rate": 3.758152155160255e-05,
"loss": 0.0178,
"step": 6010
},
{
"epoch": 4.100817438692098,
"grad_norm": 0.28254011273384094,
"learning_rate": 3.742142385506999e-05,
"loss": 0.0171,
"step": 6020
},
{
"epoch": 4.107629427792916,
"grad_norm": 0.4284875690937042,
"learning_rate": 3.72614637158309e-05,
"loss": 0.0196,
"step": 6030
},
{
"epoch": 4.114441416893733,
"grad_norm": 0.2086813747882843,
"learning_rate": 3.710164288318371e-05,
"loss": 0.0194,
"step": 6040
},
{
"epoch": 4.12125340599455,
"grad_norm": 0.20289340615272522,
"learning_rate": 3.694196310490345e-05,
"loss": 0.0152,
"step": 6050
},
{
"epoch": 4.128065395095367,
"grad_norm": 0.34958550333976746,
"learning_rate": 3.678242612722259e-05,
"loss": 0.0209,
"step": 6060
},
{
"epoch": 4.1348773841961854,
"grad_norm": 0.2462022453546524,
"learning_rate": 3.6623033694811953e-05,
"loss": 0.0186,
"step": 6070
},
{
"epoch": 4.141689373297003,
"grad_norm": 0.15042909979820251,
"learning_rate": 3.6463787550761665e-05,
"loss": 0.0198,
"step": 6080
},
{
"epoch": 4.14850136239782,
"grad_norm": 0.2676561176776886,
"learning_rate": 3.630468943656202e-05,
"loss": 0.0181,
"step": 6090
},
{
"epoch": 4.155313351498638,
"grad_norm": 0.2557305097579956,
"learning_rate": 3.6145741092084523e-05,
"loss": 0.0168,
"step": 6100
},
{
"epoch": 4.162125340599455,
"grad_norm": 0.28285613656044006,
"learning_rate": 3.598694425556278e-05,
"loss": 0.0174,
"step": 6110
},
{
"epoch": 4.168937329700272,
"grad_norm": 0.19794082641601562,
"learning_rate": 3.58283006635736e-05,
"loss": 0.0187,
"step": 6120
},
{
"epoch": 4.17574931880109,
"grad_norm": 0.3199867308139801,
"learning_rate": 3.566981205101781e-05,
"loss": 0.0188,
"step": 6130
},
{
"epoch": 4.182561307901907,
"grad_norm": 0.21557827293872833,
"learning_rate": 3.5511480151101556e-05,
"loss": 0.0179,
"step": 6140
},
{
"epoch": 4.189373297002724,
"grad_norm": 0.22134508192539215,
"learning_rate": 3.5353306695317104e-05,
"loss": 0.0148,
"step": 6150
},
{
"epoch": 4.1961852861035425,
"grad_norm": 0.2104470133781433,
"learning_rate": 3.519529341342402e-05,
"loss": 0.0193,
"step": 6160
},
{
"epoch": 4.20299727520436,
"grad_norm": 0.2785221040248871,
"learning_rate": 3.503744203343026e-05,
"loss": 0.0182,
"step": 6170
},
{
"epoch": 4.209809264305177,
"grad_norm": 0.27562573552131653,
"learning_rate": 3.487975428157318e-05,
"loss": 0.0197,
"step": 6180
},
{
"epoch": 4.216621253405995,
"grad_norm": 0.35668033361434937,
"learning_rate": 3.472223188230083e-05,
"loss": 0.0196,
"step": 6190
},
{
"epoch": 4.223433242506812,
"grad_norm": 0.2609441578388214,
"learning_rate": 3.4564876558252866e-05,
"loss": 0.0218,
"step": 6200
},
{
"epoch": 4.230245231607629,
"grad_norm": 0.28609734773635864,
"learning_rate": 3.440769003024195e-05,
"loss": 0.0169,
"step": 6210
},
{
"epoch": 4.237057220708447,
"grad_norm": 0.18339040875434875,
"learning_rate": 3.425067401723477e-05,
"loss": 0.0186,
"step": 6220
},
{
"epoch": 4.243869209809264,
"grad_norm": 0.138119637966156,
"learning_rate": 3.409383023633325e-05,
"loss": 0.0177,
"step": 6230
},
{
"epoch": 4.2506811989100814,
"grad_norm": 0.22962254285812378,
"learning_rate": 3.3937160402755894e-05,
"loss": 0.0172,
"step": 6240
},
{
"epoch": 4.2574931880108995,
"grad_norm": 0.2682797908782959,
"learning_rate": 3.378066622981885e-05,
"loss": 0.0189,
"step": 6250
},
{
"epoch": 4.264305177111717,
"grad_norm": 0.20227015018463135,
"learning_rate": 3.362434942891738e-05,
"loss": 0.022,
"step": 6260
},
{
"epoch": 4.271117166212534,
"grad_norm": 0.2518126368522644,
"learning_rate": 3.346821170950693e-05,
"loss": 0.0195,
"step": 6270
},
{
"epoch": 4.277929155313352,
"grad_norm": 0.2634272575378418,
"learning_rate": 3.3312254779084585e-05,
"loss": 0.0188,
"step": 6280
},
{
"epoch": 4.284741144414169,
"grad_norm": 0.25564995408058167,
"learning_rate": 3.315648034317039e-05,
"loss": 0.0215,
"step": 6290
},
{
"epoch": 4.291553133514986,
"grad_norm": 0.31350597739219666,
"learning_rate": 3.3000890105288564e-05,
"loss": 0.0181,
"step": 6300
},
{
"epoch": 4.298365122615804,
"grad_norm": 0.23671625554561615,
"learning_rate": 3.284548576694908e-05,
"loss": 0.0176,
"step": 6310
},
{
"epoch": 4.305177111716621,
"grad_norm": 0.2342391163110733,
"learning_rate": 3.2690269027628815e-05,
"loss": 0.0156,
"step": 6320
},
{
"epoch": 4.3119891008174385,
"grad_norm": 0.24075733125209808,
"learning_rate": 3.253524158475324e-05,
"loss": 0.0187,
"step": 6330
},
{
"epoch": 4.3188010899182565,
"grad_norm": 0.16117766499519348,
"learning_rate": 3.238040513367757e-05,
"loss": 0.019,
"step": 6340
},
{
"epoch": 4.325613079019074,
"grad_norm": 0.16016744077205658,
"learning_rate": 3.222576136766843e-05,
"loss": 0.0146,
"step": 6350
},
{
"epoch": 4.332425068119891,
"grad_norm": 0.236736461520195,
"learning_rate": 3.2071311977885324e-05,
"loss": 0.018,
"step": 6360
},
{
"epoch": 4.339237057220709,
"grad_norm": 0.182217076420784,
"learning_rate": 3.191705865336197e-05,
"loss": 0.0171,
"step": 6370
},
{
"epoch": 4.346049046321526,
"grad_norm": 0.19513262808322906,
"learning_rate": 3.1763003080988075e-05,
"loss": 0.0155,
"step": 6380
},
{
"epoch": 4.352861035422343,
"grad_norm": 0.19296902418136597,
"learning_rate": 3.160914694549063e-05,
"loss": 0.0191,
"step": 6390
},
{
"epoch": 4.359673024523161,
"grad_norm": 0.2901662588119507,
"learning_rate": 3.145549192941573e-05,
"loss": 0.0174,
"step": 6400
},
{
"epoch": 4.366485013623978,
"grad_norm": 0.21007436513900757,
"learning_rate": 3.130203971310999e-05,
"loss": 0.0194,
"step": 6410
},
{
"epoch": 4.3732970027247955,
"grad_norm": 0.19525951147079468,
"learning_rate": 3.114879197470225e-05,
"loss": 0.0181,
"step": 6420
},
{
"epoch": 4.3801089918256135,
"grad_norm": 0.22157281637191772,
"learning_rate": 3.0995750390085285e-05,
"loss": 0.0165,
"step": 6430
},
{
"epoch": 4.386920980926431,
"grad_norm": 0.2652972340583801,
"learning_rate": 3.084291663289728e-05,
"loss": 0.0171,
"step": 6440
},
{
"epoch": 4.393732970027248,
"grad_norm": 0.24139529466629028,
"learning_rate": 3.069029237450375e-05,
"loss": 0.0143,
"step": 6450
},
{
"epoch": 4.400544959128065,
"grad_norm": 0.17755670845508575,
"learning_rate": 3.053787928397911e-05,
"loss": 0.0145,
"step": 6460
},
{
"epoch": 4.407356948228883,
"grad_norm": 0.33576318621635437,
"learning_rate": 3.0385679028088526e-05,
"loss": 0.0179,
"step": 6470
},
{
"epoch": 4.4141689373297,
"grad_norm": 0.12500669062137604,
"learning_rate": 3.023369327126959e-05,
"loss": 0.0147,
"step": 6480
},
{
"epoch": 4.420980926430517,
"grad_norm": 0.17751501500606537,
"learning_rate": 3.0081923675614198e-05,
"loss": 0.016,
"step": 6490
},
{
"epoch": 4.427792915531335,
"grad_norm": 0.22551394999027252,
"learning_rate": 2.993037190085034e-05,
"loss": 0.0157,
"step": 6500
},
{
"epoch": 4.4346049046321525,
"grad_norm": 0.22585496306419373,
"learning_rate": 2.977903960432392e-05,
"loss": 0.0168,
"step": 6510
},
{
"epoch": 4.44141689373297,
"grad_norm": 0.21578261256217957,
"learning_rate": 2.9627928440980722e-05,
"loss": 0.0187,
"step": 6520
},
{
"epoch": 4.448228882833788,
"grad_norm": 0.22021692991256714,
"learning_rate": 2.9477040063348183e-05,
"loss": 0.0188,
"step": 6530
},
{
"epoch": 4.455040871934605,
"grad_norm": 0.28897762298583984,
"learning_rate": 2.9326376121517456e-05,
"loss": 0.0165,
"step": 6540
},
{
"epoch": 4.461852861035422,
"grad_norm": 0.25159355998039246,
"learning_rate": 2.9175938263125236e-05,
"loss": 0.0152,
"step": 6550
},
{
"epoch": 4.46866485013624,
"grad_norm": 0.23536550998687744,
"learning_rate": 2.9025728133335873e-05,
"loss": 0.0203,
"step": 6560
},
{
"epoch": 4.475476839237057,
"grad_norm": 0.2687535583972931,
"learning_rate": 2.8875747374823288e-05,
"loss": 0.0217,
"step": 6570
},
{
"epoch": 4.482288828337874,
"grad_norm": 0.28790390491485596,
"learning_rate": 2.872599762775298e-05,
"loss": 0.0228,
"step": 6580
},
{
"epoch": 4.489100817438692,
"grad_norm": 0.20599423348903656,
"learning_rate": 2.857648052976425e-05,
"loss": 0.0154,
"step": 6590
},
{
"epoch": 4.4959128065395095,
"grad_norm": 0.25862014293670654,
"learning_rate": 2.8427197715952047e-05,
"loss": 0.0169,
"step": 6600
},
{
"epoch": 4.502724795640327,
"grad_norm": 0.29005661606788635,
"learning_rate": 2.8278150818849393e-05,
"loss": 0.0176,
"step": 6610
},
{
"epoch": 4.509536784741145,
"grad_norm": 0.20444929599761963,
"learning_rate": 2.812934146840922e-05,
"loss": 0.0168,
"step": 6620
},
{
"epoch": 4.516348773841962,
"grad_norm": 0.16426856815814972,
"learning_rate": 2.7980771291986764e-05,
"loss": 0.0183,
"step": 6630
},
{
"epoch": 4.523160762942779,
"grad_norm": 0.2749025225639343,
"learning_rate": 2.783244191432167e-05,
"loss": 0.0175,
"step": 6640
},
{
"epoch": 4.529972752043597,
"grad_norm": 0.28311431407928467,
"learning_rate": 2.768435495752022e-05,
"loss": 0.0151,
"step": 6650
},
{
"epoch": 4.536784741144414,
"grad_norm": 0.24218498170375824,
"learning_rate": 2.753651204103771e-05,
"loss": 0.0175,
"step": 6660
},
{
"epoch": 4.543596730245231,
"grad_norm": 0.214820995926857,
"learning_rate": 2.7388914781660523e-05,
"loss": 0.0138,
"step": 6670
},
{
"epoch": 4.550408719346049,
"grad_norm": 0.2261001467704773,
"learning_rate": 2.7241564793488693e-05,
"loss": 0.0183,
"step": 6680
},
{
"epoch": 4.5572207084468666,
"grad_norm": 0.21669824421405792,
"learning_rate": 2.7094463687918037e-05,
"loss": 0.0161,
"step": 6690
},
{
"epoch": 4.564032697547684,
"grad_norm": 0.25731489062309265,
"learning_rate": 2.694761307362268e-05,
"loss": 0.0149,
"step": 6700
},
{
"epoch": 4.570844686648502,
"grad_norm": 0.29376113414764404,
"learning_rate": 2.6801014556537467e-05,
"loss": 0.0179,
"step": 6710
},
{
"epoch": 4.577656675749319,
"grad_norm": 0.2186402678489685,
"learning_rate": 2.6654669739840243e-05,
"loss": 0.0191,
"step": 6720
},
{
"epoch": 4.584468664850136,
"grad_norm": 0.21597842872142792,
"learning_rate": 2.650858022393451e-05,
"loss": 0.019,
"step": 6730
},
{
"epoch": 4.591280653950953,
"grad_norm": 0.20672723650932312,
"learning_rate": 2.6362747606431747e-05,
"loss": 0.0173,
"step": 6740
},
{
"epoch": 4.598092643051771,
"grad_norm": 0.16333183646202087,
"learning_rate": 2.6217173482134172e-05,
"loss": 0.0149,
"step": 6750
},
{
"epoch": 4.604904632152588,
"grad_norm": 0.3173683285713196,
"learning_rate": 2.6071859443017044e-05,
"loss": 0.0136,
"step": 6760
},
{
"epoch": 4.6117166212534055,
"grad_norm": 0.331967294216156,
"learning_rate": 2.5926807078211414e-05,
"loss": 0.0147,
"step": 6770
},
{
"epoch": 4.618528610354224,
"grad_norm": 0.26017463207244873,
"learning_rate": 2.5782017973986728e-05,
"loss": 0.0151,
"step": 6780
},
{
"epoch": 4.625340599455041,
"grad_norm": 0.17480212450027466,
"learning_rate": 2.5637493713733374e-05,
"loss": 0.0171,
"step": 6790
},
{
"epoch": 4.632152588555858,
"grad_norm": 0.20509187877178192,
"learning_rate": 2.549323587794559e-05,
"loss": 0.0203,
"step": 6800
},
{
"epoch": 4.638964577656676,
"grad_norm": 0.203098326921463,
"learning_rate": 2.5349246044203895e-05,
"loss": 0.0144,
"step": 6810
},
{
"epoch": 4.645776566757493,
"grad_norm": 0.25146251916885376,
"learning_rate": 2.520552578715808e-05,
"loss": 0.0159,
"step": 6820
},
{
"epoch": 4.65258855585831,
"grad_norm": 0.2880435585975647,
"learning_rate": 2.506207667850981e-05,
"loss": 0.0154,
"step": 6830
},
{
"epoch": 4.659400544959128,
"grad_norm": 0.1960860937833786,
"learning_rate": 2.4918900286995555e-05,
"loss": 0.0155,
"step": 6840
},
{
"epoch": 4.666212534059945,
"grad_norm": 0.1823454052209854,
"learning_rate": 2.4775998178369458e-05,
"loss": 0.0145,
"step": 6850
},
{
"epoch": 4.6730245231607626,
"grad_norm": 0.2692583203315735,
"learning_rate": 2.4633371915386017e-05,
"loss": 0.0161,
"step": 6860
},
{
"epoch": 4.679836512261581,
"grad_norm": 0.30107152462005615,
"learning_rate": 2.4491023057783235e-05,
"loss": 0.0184,
"step": 6870
},
{
"epoch": 4.686648501362398,
"grad_norm": 0.19429023563861847,
"learning_rate": 2.4348953162265375e-05,
"loss": 0.0179,
"step": 6880
},
{
"epoch": 4.693460490463215,
"grad_norm": 0.18987010419368744,
"learning_rate": 2.420716378248607e-05,
"loss": 0.0191,
"step": 6890
},
{
"epoch": 4.700272479564033,
"grad_norm": 0.21912752091884613,
"learning_rate": 2.4065656469031266e-05,
"loss": 0.0136,
"step": 6900
},
{
"epoch": 4.70708446866485,
"grad_norm": 0.17700830101966858,
"learning_rate": 2.3924432769402268e-05,
"loss": 0.0167,
"step": 6910
},
{
"epoch": 4.713896457765667,
"grad_norm": 0.14365394413471222,
"learning_rate": 2.3783494227998844e-05,
"loss": 0.0203,
"step": 6920
},
{
"epoch": 4.720708446866485,
"grad_norm": 0.2490224689245224,
"learning_rate": 2.3642842386102264e-05,
"loss": 0.0163,
"step": 6930
},
{
"epoch": 4.727520435967302,
"grad_norm": 0.3222252428531647,
"learning_rate": 2.3502478781858567e-05,
"loss": 0.0133,
"step": 6940
},
{
"epoch": 4.73433242506812,
"grad_norm": 0.2206520438194275,
"learning_rate": 2.3362404950261628e-05,
"loss": 0.0164,
"step": 6950
},
{
"epoch": 4.741144414168938,
"grad_norm": 0.21772713959217072,
"learning_rate": 2.3222622423136458e-05,
"loss": 0.0148,
"step": 6960
},
{
"epoch": 4.747956403269755,
"grad_norm": 0.18722061812877655,
"learning_rate": 2.3083132729122332e-05,
"loss": 0.014,
"step": 6970
},
{
"epoch": 4.754768392370572,
"grad_norm": 0.3535923659801483,
"learning_rate": 2.294393739365621e-05,
"loss": 0.0211,
"step": 6980
},
{
"epoch": 4.76158038147139,
"grad_norm": 0.1893048882484436,
"learning_rate": 2.2805037938956e-05,
"loss": 0.0167,
"step": 6990
},
{
"epoch": 4.768392370572207,
"grad_norm": 0.23466837406158447,
"learning_rate": 2.266643588400386e-05,
"loss": 0.0169,
"step": 7000
},
{
"epoch": 4.775204359673024,
"grad_norm": 0.1818532645702362,
"learning_rate": 2.252813274452969e-05,
"loss": 0.0174,
"step": 7010
},
{
"epoch": 4.782016348773842,
"grad_norm": 0.24044625461101532,
"learning_rate": 2.2390130032994427e-05,
"loss": 0.0146,
"step": 7020
},
{
"epoch": 4.7888283378746594,
"grad_norm": 0.19146227836608887,
"learning_rate": 2.2252429258573633e-05,
"loss": 0.0163,
"step": 7030
},
{
"epoch": 4.795640326975477,
"grad_norm": 0.2928459644317627,
"learning_rate": 2.2115031927140904e-05,
"loss": 0.0159,
"step": 7040
},
{
"epoch": 4.802452316076295,
"grad_norm": 0.26016002893447876,
"learning_rate": 2.1977939541251463e-05,
"loss": 0.0182,
"step": 7050
},
{
"epoch": 4.809264305177112,
"grad_norm": 0.2691255509853363,
"learning_rate": 2.1841153600125684e-05,
"loss": 0.0158,
"step": 7060
},
{
"epoch": 4.816076294277929,
"grad_norm": 0.21671241521835327,
"learning_rate": 2.170467559963267e-05,
"loss": 0.0167,
"step": 7070
},
{
"epoch": 4.822888283378747,
"grad_norm": 0.2578423023223877,
"learning_rate": 2.1568507032273982e-05,
"loss": 0.0131,
"step": 7080
},
{
"epoch": 4.829700272479564,
"grad_norm": 0.22187665104866028,
"learning_rate": 2.1432649387167264e-05,
"loss": 0.0147,
"step": 7090
},
{
"epoch": 4.836512261580381,
"grad_norm": 0.16120664775371552,
"learning_rate": 2.1297104150029973e-05,
"loss": 0.0146,
"step": 7100
},
{
"epoch": 4.843324250681199,
"grad_norm": 0.20281171798706055,
"learning_rate": 2.116187280316307e-05,
"loss": 0.0163,
"step": 7110
},
{
"epoch": 4.8501362397820165,
"grad_norm": 0.30870872735977173,
"learning_rate": 2.1026956825434908e-05,
"loss": 0.0137,
"step": 7120
},
{
"epoch": 4.856948228882834,
"grad_norm": 0.16448527574539185,
"learning_rate": 2.0892357692265017e-05,
"loss": 0.0135,
"step": 7130
},
{
"epoch": 4.863760217983652,
"grad_norm": 0.229940727353096,
"learning_rate": 2.0758076875607947e-05,
"loss": 0.0159,
"step": 7140
},
{
"epoch": 4.870572207084469,
"grad_norm": 0.1661119908094406,
"learning_rate": 2.0624115843937207e-05,
"loss": 0.0171,
"step": 7150
},
{
"epoch": 4.877384196185286,
"grad_norm": 0.17345386743545532,
"learning_rate": 2.0490476062229157e-05,
"loss": 0.0156,
"step": 7160
},
{
"epoch": 4.884196185286104,
"grad_norm": 0.30998191237449646,
"learning_rate": 2.035715899194704e-05,
"loss": 0.0151,
"step": 7170
},
{
"epoch": 4.891008174386921,
"grad_norm": 0.16312265396118164,
"learning_rate": 2.022416609102499e-05,
"loss": 0.0146,
"step": 7180
},
{
"epoch": 4.897820163487738,
"grad_norm": 0.19796396791934967,
"learning_rate": 2.009149881385205e-05,
"loss": 0.0197,
"step": 7190
},
{
"epoch": 4.904632152588556,
"grad_norm": 0.27385029196739197,
"learning_rate": 1.995915861125634e-05,
"loss": 0.0143,
"step": 7200
},
{
"epoch": 4.9114441416893735,
"grad_norm": 0.13566231727600098,
"learning_rate": 1.9827146930489065e-05,
"loss": 0.0131,
"step": 7210
},
{
"epoch": 4.918256130790191,
"grad_norm": 0.26954782009124756,
"learning_rate": 1.9695465215208848e-05,
"loss": 0.018,
"step": 7220
},
{
"epoch": 4.925068119891008,
"grad_norm": 0.20488935708999634,
"learning_rate": 1.9564114905465813e-05,
"loss": 0.0139,
"step": 7230
},
{
"epoch": 4.931880108991826,
"grad_norm": 0.25250253081321716,
"learning_rate": 1.9433097437685936e-05,
"loss": 0.014,
"step": 7240
},
{
"epoch": 4.938692098092643,
"grad_norm": 0.22722044587135315,
"learning_rate": 1.930241424465521e-05,
"loss": 0.0129,
"step": 7250
},
{
"epoch": 4.94550408719346,
"grad_norm": 0.12395540624856949,
"learning_rate": 1.9172066755504115e-05,
"loss": 0.0117,
"step": 7260
},
{
"epoch": 4.952316076294278,
"grad_norm": 0.1848660111427307,
"learning_rate": 1.9042056395691914e-05,
"loss": 0.0153,
"step": 7270
},
{
"epoch": 4.959128065395095,
"grad_norm": 0.1646895408630371,
"learning_rate": 1.8912384586991066e-05,
"loss": 0.0127,
"step": 7280
},
{
"epoch": 4.9659400544959125,
"grad_norm": 0.2536143958568573,
"learning_rate": 1.8783052747471717e-05,
"loss": 0.0145,
"step": 7290
},
{
"epoch": 4.9727520435967305,
"grad_norm": 0.17167410254478455,
"learning_rate": 1.865406229148611e-05,
"loss": 0.0138,
"step": 7300
},
{
"epoch": 4.979564032697548,
"grad_norm": 0.24148517847061157,
"learning_rate": 1.8525414629653233e-05,
"loss": 0.016,
"step": 7310
},
{
"epoch": 4.986376021798365,
"grad_norm": 0.2849847674369812,
"learning_rate": 1.8397111168843255e-05,
"loss": 0.0142,
"step": 7320
},
{
"epoch": 4.993188010899183,
"grad_norm": 0.19562356173992157,
"learning_rate": 1.8269153312162323e-05,
"loss": 0.0153,
"step": 7330
},
{
"epoch": 5.0,
"grad_norm": 0.25182366371154785,
"learning_rate": 1.8141542458937054e-05,
"loss": 0.0128,
"step": 7340
},
{
"epoch": 5.006811989100817,
"grad_norm": 0.22833839058876038,
"learning_rate": 1.8014280004699268e-05,
"loss": 0.0127,
"step": 7350
},
{
"epoch": 5.013623978201635,
"grad_norm": 0.17050805687904358,
"learning_rate": 1.788736734117078e-05,
"loss": 0.0113,
"step": 7360
},
{
"epoch": 5.020435967302452,
"grad_norm": 0.2042902410030365,
"learning_rate": 1.7760805856248152e-05,
"loss": 0.0131,
"step": 7370
},
{
"epoch": 5.0272479564032695,
"grad_norm": 0.24889463186264038,
"learning_rate": 1.7634596933987518e-05,
"loss": 0.0164,
"step": 7380
},
{
"epoch": 5.0340599455040875,
"grad_norm": 0.2117907702922821,
"learning_rate": 1.7508741954589404e-05,
"loss": 0.0177,
"step": 7390
},
{
"epoch": 5.040871934604905,
"grad_norm": 0.17846384644508362,
"learning_rate": 1.7383242294383717e-05,
"loss": 0.0135,
"step": 7400
},
{
"epoch": 5.047683923705722,
"grad_norm": 0.17487211525440216,
"learning_rate": 1.7258099325814632e-05,
"loss": 0.0115,
"step": 7410
},
{
"epoch": 5.05449591280654,
"grad_norm": 0.18704567849636078,
"learning_rate": 1.7133314417425594e-05,
"loss": 0.0146,
"step": 7420
},
{
"epoch": 5.061307901907357,
"grad_norm": 0.24722889065742493,
"learning_rate": 1.7008888933844408e-05,
"loss": 0.0148,
"step": 7430
},
{
"epoch": 5.068119891008174,
"grad_norm": 0.19086501002311707,
"learning_rate": 1.6884824235768172e-05,
"loss": 0.0148,
"step": 7440
},
{
"epoch": 5.074931880108992,
"grad_norm": 0.18787351250648499,
"learning_rate": 1.6761121679948592e-05,
"loss": 0.0125,
"step": 7450
},
{
"epoch": 5.081743869209809,
"grad_norm": 0.28582966327667236,
"learning_rate": 1.663778261917695e-05,
"loss": 0.0148,
"step": 7460
},
{
"epoch": 5.0885558583106265,
"grad_norm": 0.1483089029788971,
"learning_rate": 1.651480840226952e-05,
"loss": 0.0123,
"step": 7470
},
{
"epoch": 5.0953678474114446,
"grad_norm": 0.24858340620994568,
"learning_rate": 1.639220037405258e-05,
"loss": 0.0148,
"step": 7480
},
{
"epoch": 5.102179836512262,
"grad_norm": 0.1595468819141388,
"learning_rate": 1.6269959875347906e-05,
"loss": 0.0137,
"step": 7490
},
{
"epoch": 5.108991825613079,
"grad_norm": 0.27670302987098694,
"learning_rate": 1.614808824295802e-05,
"loss": 0.0143,
"step": 7500
},
{
"epoch": 5.115803814713897,
"grad_norm": 0.13641585409641266,
"learning_rate": 1.602658680965152e-05,
"loss": 0.0103,
"step": 7510
},
{
"epoch": 5.122615803814714,
"grad_norm": 0.23924027383327484,
"learning_rate": 1.5905456904148686e-05,
"loss": 0.0146,
"step": 7520
},
{
"epoch": 5.129427792915531,
"grad_norm": 0.21490581333637238,
"learning_rate": 1.57846998511067e-05,
"loss": 0.0146,
"step": 7530
},
{
"epoch": 5.136239782016348,
"grad_norm": 0.28059524297714233,
"learning_rate": 1.566431697110538e-05,
"loss": 0.0117,
"step": 7540
},
{
"epoch": 5.143051771117166,
"grad_norm": 0.20346660912036896,
"learning_rate": 1.554430958063259e-05,
"loss": 0.0113,
"step": 7550
},
{
"epoch": 5.1498637602179835,
"grad_norm": 0.17303743958473206,
"learning_rate": 1.5424678992069912e-05,
"loss": 0.0123,
"step": 7560
},
{
"epoch": 5.156675749318801,
"grad_norm": 0.21222537755966187,
"learning_rate": 1.5305426513678362e-05,
"loss": 0.012,
"step": 7570
},
{
"epoch": 5.163487738419619,
"grad_norm": 0.22923482954502106,
"learning_rate": 1.518655344958388e-05,
"loss": 0.0136,
"step": 7580
},
{
"epoch": 5.170299727520436,
"grad_norm": 0.28508874773979187,
"learning_rate": 1.5068061099763275e-05,
"loss": 0.0124,
"step": 7590
},
{
"epoch": 5.177111716621253,
"grad_norm": 0.31998851895332336,
"learning_rate": 1.494995076002988e-05,
"loss": 0.0155,
"step": 7600
},
{
"epoch": 5.183923705722071,
"grad_norm": 0.20339614152908325,
"learning_rate": 1.4832223722019456e-05,
"loss": 0.0125,
"step": 7610
},
{
"epoch": 5.190735694822888,
"grad_norm": 0.19148500263690948,
"learning_rate": 1.4714881273176035e-05,
"loss": 0.0139,
"step": 7620
},
{
"epoch": 5.197547683923705,
"grad_norm": 0.17169404029846191,
"learning_rate": 1.4597924696737835e-05,
"loss": 0.0123,
"step": 7630
},
{
"epoch": 5.204359673024523,
"grad_norm": 0.17913302779197693,
"learning_rate": 1.4481355271723252e-05,
"loss": 0.0123,
"step": 7640
},
{
"epoch": 5.2111716621253406,
"grad_norm": 0.34465935826301575,
"learning_rate": 1.4365174272916809e-05,
"loss": 0.0154,
"step": 7650
},
{
"epoch": 5.217983651226158,
"grad_norm": 0.17733906209468842,
"learning_rate": 1.4249382970855319e-05,
"loss": 0.012,
"step": 7660
},
{
"epoch": 5.224795640326976,
"grad_norm": 0.12495987117290497,
"learning_rate": 1.4133982631813903e-05,
"loss": 0.014,
"step": 7670
},
{
"epoch": 5.231607629427793,
"grad_norm": 0.14085407555103302,
"learning_rate": 1.4018974517792194e-05,
"loss": 0.0108,
"step": 7680
},
{
"epoch": 5.23841961852861,
"grad_norm": 0.1950143575668335,
"learning_rate": 1.390435988650048e-05,
"loss": 0.0115,
"step": 7690
},
{
"epoch": 5.245231607629428,
"grad_norm": 0.2449447363615036,
"learning_rate": 1.3790139991346006e-05,
"loss": 0.0126,
"step": 7700
},
{
"epoch": 5.252043596730245,
"grad_norm": 0.148986354470253,
"learning_rate": 1.367631608141926e-05,
"loss": 0.0117,
"step": 7710
},
{
"epoch": 5.258855585831062,
"grad_norm": 0.2584574520587921,
"learning_rate": 1.3562889401480278e-05,
"loss": 0.0122,
"step": 7720
},
{
"epoch": 5.26566757493188,
"grad_norm": 0.19506706297397614,
"learning_rate": 1.3449861191945074e-05,
"loss": 0.0145,
"step": 7730
},
{
"epoch": 5.272479564032698,
"grad_norm": 0.15596213936805725,
"learning_rate": 1.3337232688872009e-05,
"loss": 0.0131,
"step": 7740
},
{
"epoch": 5.279291553133515,
"grad_norm": 0.227974072098732,
"learning_rate": 1.3225005123948364e-05,
"loss": 0.0128,
"step": 7750
},
{
"epoch": 5.286103542234333,
"grad_norm": 0.15332451462745667,
"learning_rate": 1.311317972447681e-05,
"loss": 0.0103,
"step": 7760
},
{
"epoch": 5.29291553133515,
"grad_norm": 0.14478209614753723,
"learning_rate": 1.3001757713361996e-05,
"loss": 0.0123,
"step": 7770
},
{
"epoch": 5.299727520435967,
"grad_norm": 0.2231355756521225,
"learning_rate": 1.2890740309097204e-05,
"loss": 0.0122,
"step": 7780
},
{
"epoch": 5.306539509536785,
"grad_norm": 0.15700560808181763,
"learning_rate": 1.2780128725750944e-05,
"loss": 0.0122,
"step": 7790
},
{
"epoch": 5.313351498637602,
"grad_norm": 0.1040923222899437,
"learning_rate": 1.266992417295379e-05,
"loss": 0.0122,
"step": 7800
},
{
"epoch": 5.320163487738419,
"grad_norm": 0.23007836937904358,
"learning_rate": 1.2560127855885073e-05,
"loss": 0.0125,
"step": 7810
},
{
"epoch": 5.3269754768392374,
"grad_norm": 0.19076195359230042,
"learning_rate": 1.2450740975259745e-05,
"loss": 0.0146,
"step": 7820
},
{
"epoch": 5.333787465940055,
"grad_norm": 0.1625741422176361,
"learning_rate": 1.234176472731517e-05,
"loss": 0.0206,
"step": 7830
},
{
"epoch": 5.340599455040872,
"grad_norm": 0.19957181811332703,
"learning_rate": 1.2233200303798158e-05,
"loss": 0.0111,
"step": 7840
},
{
"epoch": 5.34741144414169,
"grad_norm": 0.26022225618362427,
"learning_rate": 1.2125048891951846e-05,
"loss": 0.015,
"step": 7850
},
{
"epoch": 5.354223433242507,
"grad_norm": 0.19087891280651093,
"learning_rate": 1.2017311674502745e-05,
"loss": 0.0099,
"step": 7860
},
{
"epoch": 5.361035422343324,
"grad_norm": 0.20264586806297302,
"learning_rate": 1.1909989829647822e-05,
"loss": 0.0111,
"step": 7870
},
{
"epoch": 5.367847411444142,
"grad_norm": 0.15192349255084991,
"learning_rate": 1.1803084531041553e-05,
"loss": 0.0133,
"step": 7880
},
{
"epoch": 5.374659400544959,
"grad_norm": 0.1685389131307602,
"learning_rate": 1.1696596947783162e-05,
"loss": 0.0115,
"step": 7890
},
{
"epoch": 5.381471389645776,
"grad_norm": 0.16295170783996582,
"learning_rate": 1.1590528244403803e-05,
"loss": 0.012,
"step": 7900
},
{
"epoch": 5.3882833787465945,
"grad_norm": 0.21759124100208282,
"learning_rate": 1.148487958085382e-05,
"loss": 0.013,
"step": 7910
},
{
"epoch": 5.395095367847412,
"grad_norm": 0.14942030608654022,
"learning_rate": 1.1379652112490086e-05,
"loss": 0.0139,
"step": 7920
},
{
"epoch": 5.401907356948229,
"grad_norm": 0.22778572142124176,
"learning_rate": 1.1274846990063315e-05,
"loss": 0.0121,
"step": 7930
},
{
"epoch": 5.408719346049046,
"grad_norm": 0.18669773638248444,
"learning_rate": 1.117046535970554e-05,
"loss": 0.0147,
"step": 7940
},
{
"epoch": 5.415531335149864,
"grad_norm": 0.15506009757518768,
"learning_rate": 1.106650836291755e-05,
"loss": 0.0177,
"step": 7950
},
{
"epoch": 5.422343324250681,
"grad_norm": 0.2541573643684387,
"learning_rate": 1.0962977136556418e-05,
"loss": 0.0153,
"step": 7960
},
{
"epoch": 5.429155313351498,
"grad_norm": 0.1981164813041687,
"learning_rate": 1.0859872812823024e-05,
"loss": 0.0132,
"step": 7970
},
{
"epoch": 5.435967302452316,
"grad_norm": 0.2570594549179077,
"learning_rate": 1.0757196519249747e-05,
"loss": 0.016,
"step": 7980
},
{
"epoch": 5.4427792915531334,
"grad_norm": 0.11997192353010178,
"learning_rate": 1.0654949378688077e-05,
"loss": 0.014,
"step": 7990
},
{
"epoch": 5.449591280653951,
"grad_norm": 0.194411501288414,
"learning_rate": 1.0553132509296376e-05,
"loss": 0.013,
"step": 8000
},
{
"epoch": 5.456403269754769,
"grad_norm": 0.16398310661315918,
"learning_rate": 1.0451747024527613e-05,
"loss": 0.013,
"step": 8010
},
{
"epoch": 5.463215258855586,
"grad_norm": 0.2351941168308258,
"learning_rate": 1.0350794033117189e-05,
"loss": 0.0141,
"step": 8020
},
{
"epoch": 5.470027247956403,
"grad_norm": 0.254794180393219,
"learning_rate": 1.0250274639070856e-05,
"loss": 0.0115,
"step": 8030
},
{
"epoch": 5.476839237057221,
"grad_norm": 0.12862807512283325,
"learning_rate": 1.0150189941652599e-05,
"loss": 0.0104,
"step": 8040
},
{
"epoch": 5.483651226158038,
"grad_norm": 0.1112130880355835,
"learning_rate": 1.0050541035372635e-05,
"loss": 0.0105,
"step": 8050
},
{
"epoch": 5.490463215258855,
"grad_norm": 0.15486888587474823,
"learning_rate": 9.951329009975458e-06,
"loss": 0.0108,
"step": 8060
},
{
"epoch": 5.497275204359673,
"grad_norm": 0.13335685431957245,
"learning_rate": 9.852554950427845e-06,
"loss": 0.0128,
"step": 8070
},
{
"epoch": 5.5040871934604905,
"grad_norm": 0.16484335064888,
"learning_rate": 9.754219936907105e-06,
"loss": 0.0123,
"step": 8080
},
{
"epoch": 5.510899182561308,
"grad_norm": 0.13687945902347565,
"learning_rate": 9.656325044789194e-06,
"loss": 0.0096,
"step": 8090
},
{
"epoch": 5.517711171662126,
"grad_norm": 0.1303662657737732,
"learning_rate": 9.55887134463697e-06,
"loss": 0.0086,
"step": 8100
},
{
"epoch": 5.524523160762943,
"grad_norm": 0.1451333612203598,
"learning_rate": 9.461859902188475e-06,
"loss": 0.0119,
"step": 8110
},
{
"epoch": 5.53133514986376,
"grad_norm": 0.2170574814081192,
"learning_rate": 9.365291778345303e-06,
"loss": 0.0121,
"step": 8120
},
{
"epoch": 5.538147138964578,
"grad_norm": 0.16463404893875122,
"learning_rate": 9.269168029160991e-06,
"loss": 0.0089,
"step": 8130
},
{
"epoch": 5.544959128065395,
"grad_norm": 0.2275201976299286,
"learning_rate": 9.173489705829447e-06,
"loss": 0.0129,
"step": 8140
},
{
"epoch": 5.551771117166212,
"grad_norm": 0.11590515077114105,
"learning_rate": 9.078257854673516e-06,
"loss": 0.0124,
"step": 8150
},
{
"epoch": 5.55858310626703,
"grad_norm": 0.1156085953116417,
"learning_rate": 8.983473517133429e-06,
"loss": 0.0135,
"step": 8160
},
{
"epoch": 5.5653950953678475,
"grad_norm": 0.12850528955459595,
"learning_rate": 8.889137729755537e-06,
"loss": 0.0105,
"step": 8170
},
{
"epoch": 5.572207084468665,
"grad_norm": 0.2325507402420044,
"learning_rate": 8.79525152418087e-06,
"loss": 0.0136,
"step": 8180
},
{
"epoch": 5.579019073569482,
"grad_norm": 0.17301425337791443,
"learning_rate": 8.701815927133961e-06,
"loss": 0.0124,
"step": 8190
},
{
"epoch": 5.5858310626703,
"grad_norm": 0.13355191051959991,
"learning_rate": 8.608831960411534e-06,
"loss": 0.0114,
"step": 8200
},
{
"epoch": 5.592643051771117,
"grad_norm": 0.2157035917043686,
"learning_rate": 8.516300640871321e-06,
"loss": 0.0126,
"step": 8210
},
{
"epoch": 5.599455040871934,
"grad_norm": 0.16316112875938416,
"learning_rate": 8.424222980421038e-06,
"loss": 0.0133,
"step": 8220
},
{
"epoch": 5.606267029972752,
"grad_norm": 0.10164311528205872,
"learning_rate": 8.332599986007184e-06,
"loss": 0.0111,
"step": 8230
},
{
"epoch": 5.613079019073569,
"grad_norm": 0.13754205405712128,
"learning_rate": 8.241432659604203e-06,
"loss": 0.0098,
"step": 8240
},
{
"epoch": 5.6198910081743865,
"grad_norm": 0.17243002355098724,
"learning_rate": 8.150721998203331e-06,
"loss": 0.0092,
"step": 8250
},
{
"epoch": 5.6267029972752045,
"grad_norm": 0.14749637246131897,
"learning_rate": 8.06046899380184e-06,
"loss": 0.0098,
"step": 8260
},
{
"epoch": 5.633514986376022,
"grad_norm": 0.12213444709777832,
"learning_rate": 7.970674633392133e-06,
"loss": 0.0139,
"step": 8270
},
{
"epoch": 5.640326975476839,
"grad_norm": 0.1787102073431015,
"learning_rate": 7.881339898950924e-06,
"loss": 0.0142,
"step": 8280
},
{
"epoch": 5.647138964577657,
"grad_norm": 0.10263296216726303,
"learning_rate": 7.792465767428597e-06,
"loss": 0.0116,
"step": 8290
},
{
"epoch": 5.653950953678474,
"grad_norm": 0.11837161332368851,
"learning_rate": 7.704053210738376e-06,
"loss": 0.0095,
"step": 8300
},
{
"epoch": 5.660762942779291,
"grad_norm": 0.13488389551639557,
"learning_rate": 7.6161031957458494e-06,
"loss": 0.0138,
"step": 8310
},
{
"epoch": 5.667574931880109,
"grad_norm": 0.19569364190101624,
"learning_rate": 7.5286166842582605e-06,
"loss": 0.0121,
"step": 8320
},
{
"epoch": 5.674386920980926,
"grad_norm": 0.19341343641281128,
"learning_rate": 7.4415946330140814e-06,
"loss": 0.0115,
"step": 8330
},
{
"epoch": 5.6811989100817435,
"grad_norm": 0.15243728458881378,
"learning_rate": 7.3550379936725644e-06,
"loss": 0.0114,
"step": 8340
},
{
"epoch": 5.6880108991825615,
"grad_norm": 0.13914422690868378,
"learning_rate": 7.2689477128032035e-06,
"loss": 0.0125,
"step": 8350
},
{
"epoch": 5.694822888283379,
"grad_norm": 0.15893633663654327,
"learning_rate": 7.183324731875551e-06,
"loss": 0.0098,
"step": 8360
},
{
"epoch": 5.701634877384196,
"grad_norm": 0.16882383823394775,
"learning_rate": 7.098169987248782e-06,
"loss": 0.0089,
"step": 8370
},
{
"epoch": 5.708446866485014,
"grad_norm": 0.11707707494497299,
"learning_rate": 7.013484410161553e-06,
"loss": 0.0111,
"step": 8380
},
{
"epoch": 5.715258855585831,
"grad_norm": 0.15138401091098785,
"learning_rate": 6.92926892672176e-06,
"loss": 0.011,
"step": 8390
},
{
"epoch": 5.722070844686648,
"grad_norm": 0.15782202780246735,
"learning_rate": 6.845524457896446e-06,
"loss": 0.0087,
"step": 8400
},
{
"epoch": 5.728882833787466,
"grad_norm": 0.14364789426326752,
"learning_rate": 6.7622519195017165e-06,
"loss": 0.0099,
"step": 8410
},
{
"epoch": 5.735694822888283,
"grad_norm": 0.1990385502576828,
"learning_rate": 6.679452222192684e-06,
"loss": 0.0099,
"step": 8420
},
{
"epoch": 5.7425068119891005,
"grad_norm": 0.11444421857595444,
"learning_rate": 6.597126271453579e-06,
"loss": 0.0088,
"step": 8430
},
{
"epoch": 5.7493188010899186,
"grad_norm": 0.09519212692975998,
"learning_rate": 6.51527496758782e-06,
"loss": 0.0089,
"step": 8440
},
{
"epoch": 5.756130790190736,
"grad_norm": 0.1508159637451172,
"learning_rate": 6.433899205708155e-06,
"loss": 0.0097,
"step": 8450
},
{
"epoch": 5.762942779291553,
"grad_norm": 0.12732820212841034,
"learning_rate": 6.352999875726856e-06,
"loss": 0.0091,
"step": 8460
},
{
"epoch": 5.769754768392371,
"grad_norm": 0.09891568869352341,
"learning_rate": 6.272577862346052e-06,
"loss": 0.0113,
"step": 8470
},
{
"epoch": 5.776566757493188,
"grad_norm": 0.2046702355146408,
"learning_rate": 6.192634045047996e-06,
"loss": 0.0112,
"step": 8480
},
{
"epoch": 5.783378746594005,
"grad_norm": 0.2202032059431076,
"learning_rate": 6.113169298085458e-06,
"loss": 0.0166,
"step": 8490
},
{
"epoch": 5.790190735694823,
"grad_norm": 0.2339613288640976,
"learning_rate": 6.034184490472195e-06,
"loss": 0.0079,
"step": 8500
},
{
"epoch": 5.79700272479564,
"grad_norm": 0.20225585997104645,
"learning_rate": 5.955680485973386e-06,
"loss": 0.0131,
"step": 8510
},
{
"epoch": 5.8038147138964575,
"grad_norm": 0.2018497884273529,
"learning_rate": 5.877658143096265e-06,
"loss": 0.011,
"step": 8520
},
{
"epoch": 5.810626702997276,
"grad_norm": 0.13856525719165802,
"learning_rate": 5.800118315080661e-06,
"loss": 0.01,
"step": 8530
},
{
"epoch": 5.817438692098093,
"grad_norm": 0.1401432752609253,
"learning_rate": 5.723061849889716e-06,
"loss": 0.0084,
"step": 8540
},
{
"epoch": 5.82425068119891,
"grad_norm": 0.1731623411178589,
"learning_rate": 5.646489590200604e-06,
"loss": 0.0126,
"step": 8550
},
{
"epoch": 5.831062670299728,
"grad_norm": 0.12786374986171722,
"learning_rate": 5.570402373395256e-06,
"loss": 0.01,
"step": 8560
},
{
"epoch": 5.837874659400545,
"grad_norm": 0.2641719877719879,
"learning_rate": 5.494801031551305e-06,
"loss": 0.0111,
"step": 8570
},
{
"epoch": 5.844686648501362,
"grad_norm": 0.16163118183612823,
"learning_rate": 5.41968639143291e-06,
"loss": 0.0106,
"step": 8580
},
{
"epoch": 5.85149863760218,
"grad_norm": 0.1381234973669052,
"learning_rate": 5.345059274481751e-06,
"loss": 0.0093,
"step": 8590
},
{
"epoch": 5.858310626702997,
"grad_norm": 0.1420307159423828,
"learning_rate": 5.270920496808002e-06,
"loss": 0.0134,
"step": 8600
},
{
"epoch": 5.8651226158038146,
"grad_norm": 0.1673470139503479,
"learning_rate": 5.1972708691814695e-06,
"loss": 0.0109,
"step": 8610
},
{
"epoch": 5.871934604904633,
"grad_norm": 0.2173473834991455,
"learning_rate": 5.124111197022674e-06,
"loss": 0.0119,
"step": 8620
},
{
"epoch": 5.87874659400545,
"grad_norm": 0.11630476266145706,
"learning_rate": 5.051442280394081e-06,
"loss": 0.009,
"step": 8630
},
{
"epoch": 5.885558583106267,
"grad_norm": 0.0949091911315918,
"learning_rate": 4.979264913991322e-06,
"loss": 0.0119,
"step": 8640
},
{
"epoch": 5.892370572207085,
"grad_norm": 0.16577839851379395,
"learning_rate": 4.907579887134489e-06,
"loss": 0.0126,
"step": 8650
},
{
"epoch": 5.899182561307902,
"grad_norm": 0.17283402383327484,
"learning_rate": 4.836387983759572e-06,
"loss": 0.011,
"step": 8660
},
{
"epoch": 5.905994550408719,
"grad_norm": 0.1463468372821808,
"learning_rate": 4.765689982409816e-06,
"loss": 0.0102,
"step": 8670
},
{
"epoch": 5.912806539509537,
"grad_norm": 0.21168796718120575,
"learning_rate": 4.695486656227233e-06,
"loss": 0.0132,
"step": 8680
},
{
"epoch": 5.919618528610354,
"grad_norm": 0.24207310378551483,
"learning_rate": 4.625778772944156e-06,
"loss": 0.0119,
"step": 8690
},
{
"epoch": 5.926430517711172,
"grad_norm": 0.20093917846679688,
"learning_rate": 4.556567094874825e-06,
"loss": 0.0123,
"step": 8700
},
{
"epoch": 5.933242506811989,
"grad_norm": 0.11250998079776764,
"learning_rate": 4.487852378907059e-06,
"loss": 0.0076,
"step": 8710
},
{
"epoch": 5.940054495912807,
"grad_norm": 0.11169631779193878,
"learning_rate": 4.419635376493986e-06,
"loss": 0.0113,
"step": 8720
},
{
"epoch": 5.946866485013624,
"grad_norm": 0.17939099669456482,
"learning_rate": 4.351916833645825e-06,
"loss": 0.0105,
"step": 8730
},
{
"epoch": 5.953678474114441,
"grad_norm": 0.19434191286563873,
"learning_rate": 4.284697490921691e-06,
"loss": 0.0106,
"step": 8740
},
{
"epoch": 5.960490463215259,
"grad_norm": 0.16198799014091492,
"learning_rate": 4.2179780834215585e-06,
"loss": 0.0127,
"step": 8750
},
{
"epoch": 5.967302452316076,
"grad_norm": 0.22619812190532684,
"learning_rate": 4.151759340778178e-06,
"loss": 0.0117,
"step": 8760
},
{
"epoch": 5.974114441416893,
"grad_norm": 0.2598056495189667,
"learning_rate": 4.086041987149109e-06,
"loss": 0.01,
"step": 8770
},
{
"epoch": 5.9809264305177114,
"grad_norm": 0.18251881003379822,
"learning_rate": 4.020826741208811e-06,
"loss": 0.0102,
"step": 8780
},
{
"epoch": 5.987738419618529,
"grad_norm": 0.18505583703517914,
"learning_rate": 3.956114316140746e-06,
"loss": 0.0121,
"step": 8790
},
{
"epoch": 5.994550408719346,
"grad_norm": 0.14361293613910675,
"learning_rate": 3.891905419629643e-06,
"loss": 0.0099,
"step": 8800
},
{
"epoch": 6.001362397820164,
"grad_norm": 0.10514985024929047,
"learning_rate": 3.8282007538536946e-06,
"loss": 0.0127,
"step": 8810
},
{
"epoch": 6.008174386920981,
"grad_norm": 0.16004830598831177,
"learning_rate": 3.7650010154769265e-06,
"loss": 0.0089,
"step": 8820
},
{
"epoch": 6.014986376021798,
"grad_norm": 0.18699565529823303,
"learning_rate": 3.7023068956415608e-06,
"loss": 0.0123,
"step": 8830
},
{
"epoch": 6.021798365122616,
"grad_norm": 0.17017434537410736,
"learning_rate": 3.6401190799604303e-06,
"loss": 0.0084,
"step": 8840
},
{
"epoch": 6.028610354223433,
"grad_norm": 0.18797238171100616,
"learning_rate": 3.578438248509536e-06,
"loss": 0.012,
"step": 8850
},
{
"epoch": 6.03542234332425,
"grad_norm": 0.16716784238815308,
"learning_rate": 3.5172650758205583e-06,
"loss": 0.012,
"step": 8860
},
{
"epoch": 6.0422343324250685,
"grad_norm": 0.10475629568099976,
"learning_rate": 3.45660023087353e-06,
"loss": 0.008,
"step": 8870
},
{
"epoch": 6.049046321525886,
"grad_norm": 0.12020768970251083,
"learning_rate": 3.3964443770894528e-06,
"loss": 0.0087,
"step": 8880
},
{
"epoch": 6.055858310626703,
"grad_norm": 0.10397229343652725,
"learning_rate": 3.3367981723231245e-06,
"loss": 0.0091,
"step": 8890
},
{
"epoch": 6.062670299727521,
"grad_norm": 0.20012831687927246,
"learning_rate": 3.2776622688558746e-06,
"loss": 0.0099,
"step": 8900
},
{
"epoch": 6.069482288828338,
"grad_norm": 0.19983907043933868,
"learning_rate": 3.2190373133884677e-06,
"loss": 0.0102,
"step": 8910
},
{
"epoch": 6.076294277929155,
"grad_norm": 0.17271621525287628,
"learning_rate": 3.1609239470340446e-06,
"loss": 0.0104,
"step": 8920
},
{
"epoch": 6.083106267029973,
"grad_norm": 0.16302776336669922,
"learning_rate": 3.1033228053110373e-06,
"loss": 0.0078,
"step": 8930
},
{
"epoch": 6.08991825613079,
"grad_norm": 0.12263508886098862,
"learning_rate": 3.0462345181363314e-06,
"loss": 0.009,
"step": 8940
},
{
"epoch": 6.0967302452316074,
"grad_norm": 0.11456681787967682,
"learning_rate": 2.9896597098182654e-06,
"loss": 0.0109,
"step": 8950
},
{
"epoch": 6.1035422343324255,
"grad_norm": 0.08905057609081268,
"learning_rate": 2.933598999049891e-06,
"loss": 0.0112,
"step": 8960
},
{
"epoch": 6.110354223433243,
"grad_norm": 0.15491244196891785,
"learning_rate": 2.8780529989021697e-06,
"loss": 0.0095,
"step": 8970
},
{
"epoch": 6.11716621253406,
"grad_norm": 0.15372540056705475,
"learning_rate": 2.823022316817242e-06,
"loss": 0.0124,
"step": 8980
},
{
"epoch": 6.123978201634877,
"grad_norm": 0.20342043042182922,
"learning_rate": 2.7685075546018456e-06,
"loss": 0.0123,
"step": 8990
},
{
"epoch": 6.130790190735695,
"grad_norm": 0.12789203226566315,
"learning_rate": 2.7145093084206598e-06,
"loss": 0.0108,
"step": 9000
},
{
"epoch": 6.137602179836512,
"grad_norm": 0.19718892872333527,
"learning_rate": 2.661028168789892e-06,
"loss": 0.0094,
"step": 9010
},
{
"epoch": 6.144414168937329,
"grad_norm": 0.2571142911911011,
"learning_rate": 2.6080647205706855e-06,
"loss": 0.0091,
"step": 9020
},
{
"epoch": 6.151226158038147,
"grad_norm": 0.08045794069766998,
"learning_rate": 2.555619542962834e-06,
"loss": 0.0101,
"step": 9030
},
{
"epoch": 6.1580381471389645,
"grad_norm": 0.10921412706375122,
"learning_rate": 2.503693209498409e-06,
"loss": 0.0064,
"step": 9040
},
{
"epoch": 6.164850136239782,
"grad_norm": 0.14346344769001007,
"learning_rate": 2.452286288035449e-06,
"loss": 0.0091,
"step": 9050
},
{
"epoch": 6.1716621253406,
"grad_norm": 0.12146768718957901,
"learning_rate": 2.4013993407518363e-06,
"loss": 0.0127,
"step": 9060
},
{
"epoch": 6.178474114441417,
"grad_norm": 0.11175204068422318,
"learning_rate": 2.351032924139063e-06,
"loss": 0.0076,
"step": 9070
},
{
"epoch": 6.185286103542234,
"grad_norm": 0.1668560802936554,
"learning_rate": 2.30118758899619e-06,
"loss": 0.0112,
"step": 9080
},
{
"epoch": 6.192098092643052,
"grad_norm": 0.15498773753643036,
"learning_rate": 2.2518638804238157e-06,
"loss": 0.0084,
"step": 9090
},
{
"epoch": 6.198910081743869,
"grad_norm": 0.2582722306251526,
"learning_rate": 2.203062337818118e-06,
"loss": 0.0107,
"step": 9100
},
{
"epoch": 6.205722070844686,
"grad_norm": 0.14667384326457977,
"learning_rate": 2.1547834948649483e-06,
"loss": 0.0106,
"step": 9110
},
{
"epoch": 6.212534059945504,
"grad_norm": 0.08730646222829819,
"learning_rate": 2.1070278795340017e-06,
"loss": 0.011,
"step": 9120
},
{
"epoch": 6.2193460490463215,
"grad_norm": 0.19518472254276276,
"learning_rate": 2.059796014073029e-06,
"loss": 0.0078,
"step": 9130
},
{
"epoch": 6.226158038147139,
"grad_norm": 0.09343539923429489,
"learning_rate": 2.01308841500214e-06,
"loss": 0.0079,
"step": 9140
},
{
"epoch": 6.232970027247957,
"grad_norm": 0.2299136221408844,
"learning_rate": 1.9669055931081704e-06,
"loss": 0.0122,
"step": 9150
},
{
"epoch": 6.239782016348774,
"grad_norm": 0.16625770926475525,
"learning_rate": 1.9212480534390507e-06,
"loss": 0.0097,
"step": 9160
},
{
"epoch": 6.246594005449591,
"grad_norm": 0.18141430616378784,
"learning_rate": 1.8761162952983246e-06,
"loss": 0.011,
"step": 9170
},
{
"epoch": 6.253405994550409,
"grad_norm": 0.22686415910720825,
"learning_rate": 1.8315108122396618e-06,
"loss": 0.0114,
"step": 9180
},
{
"epoch": 6.260217983651226,
"grad_norm": 0.14493921399116516,
"learning_rate": 1.787432092061475e-06,
"loss": 0.0085,
"step": 9190
},
{
"epoch": 6.267029972752043,
"grad_norm": 0.12274694442749023,
"learning_rate": 1.743880616801602e-06,
"loss": 0.0113,
"step": 9200
},
{
"epoch": 6.273841961852861,
"grad_norm": 0.10201839357614517,
"learning_rate": 1.7008568627319865e-06,
"loss": 0.0096,
"step": 9210
},
{
"epoch": 6.2806539509536785,
"grad_norm": 0.20207750797271729,
"learning_rate": 1.6583613003535226e-06,
"loss": 0.0121,
"step": 9220
},
{
"epoch": 6.287465940054496,
"grad_norm": 0.13486947119235992,
"learning_rate": 1.6163943943908522e-06,
"loss": 0.0123,
"step": 9230
},
{
"epoch": 6.294277929155314,
"grad_norm": 0.14914485812187195,
"learning_rate": 1.5749566037873476e-06,
"loss": 0.01,
"step": 9240
},
{
"epoch": 6.301089918256131,
"grad_norm": 0.1396232694387436,
"learning_rate": 1.5340483817000428e-06,
"loss": 0.0113,
"step": 9250
},
{
"epoch": 6.307901907356948,
"grad_norm": 0.11976684629917145,
"learning_rate": 1.4936701754947101e-06,
"loss": 0.0096,
"step": 9260
},
{
"epoch": 6.314713896457766,
"grad_norm": 0.14177760481834412,
"learning_rate": 1.4538224267409361e-06,
"loss": 0.0116,
"step": 9270
},
{
"epoch": 6.321525885558583,
"grad_norm": 0.15875473618507385,
"learning_rate": 1.414505571207314e-06,
"loss": 0.0076,
"step": 9280
},
{
"epoch": 6.3283378746594,
"grad_norm": 0.10427635163068771,
"learning_rate": 1.3757200388566816e-06,
"loss": 0.0077,
"step": 9290
},
{
"epoch": 6.335149863760218,
"grad_norm": 0.11724657565355301,
"learning_rate": 1.3374662538414074e-06,
"loss": 0.0123,
"step": 9300
},
{
"epoch": 6.3419618528610355,
"grad_norm": 0.08624394983053207,
"learning_rate": 1.2997446344987617e-06,
"loss": 0.0099,
"step": 9310
},
{
"epoch": 6.348773841961853,
"grad_norm": 0.11943169683218002,
"learning_rate": 1.262555593346315e-06,
"loss": 0.0099,
"step": 9320
},
{
"epoch": 6.355585831062671,
"grad_norm": 0.22859704494476318,
"learning_rate": 1.2258995370774685e-06,
"loss": 0.0116,
"step": 9330
},
{
"epoch": 6.362397820163488,
"grad_norm": 0.20983096957206726,
"learning_rate": 1.1897768665569798e-06,
"loss": 0.0117,
"step": 9340
},
{
"epoch": 6.369209809264305,
"grad_norm": 0.13772162795066833,
"learning_rate": 1.1541879768165954e-06,
"loss": 0.0092,
"step": 9350
},
{
"epoch": 6.376021798365123,
"grad_norm": 0.15202628076076508,
"learning_rate": 1.1191332570507085e-06,
"loss": 0.0098,
"step": 9360
},
{
"epoch": 6.38283378746594,
"grad_norm": 0.18177203834056854,
"learning_rate": 1.0846130906121132e-06,
"loss": 0.0164,
"step": 9370
},
{
"epoch": 6.389645776566757,
"grad_norm": 0.17858490347862244,
"learning_rate": 1.0506278550078131e-06,
"loss": 0.0103,
"step": 9380
},
{
"epoch": 6.396457765667575,
"grad_norm": 0.18811877071857452,
"learning_rate": 1.0171779218949185e-06,
"loss": 0.0125,
"step": 9390
},
{
"epoch": 6.4032697547683926,
"grad_norm": 0.1804962009191513,
"learning_rate": 9.842636570765174e-07,
"loss": 0.0097,
"step": 9400
},
{
"epoch": 6.41008174386921,
"grad_norm": 0.20443765819072723,
"learning_rate": 9.518854204977612e-07,
"loss": 0.01,
"step": 9410
},
{
"epoch": 6.416893732970027,
"grad_norm": 0.11135527491569519,
"learning_rate": 9.200435662418349e-07,
"loss": 0.0101,
"step": 9420
},
{
"epoch": 6.423705722070845,
"grad_norm": 0.10986144840717316,
"learning_rate": 8.887384425261658e-07,
"loss": 0.008,
"step": 9430
},
{
"epoch": 6.430517711171662,
"grad_norm": 0.15490956604480743,
"learning_rate": 8.579703916985648e-07,
"loss": 0.0094,
"step": 9440
},
{
"epoch": 6.437329700272479,
"grad_norm": 0.12304934859275818,
"learning_rate": 8.277397502335194e-07,
"loss": 0.0134,
"step": 9450
},
{
"epoch": 6.444141689373297,
"grad_norm": 0.15748490393161774,
"learning_rate": 7.980468487284675e-07,
"loss": 0.0104,
"step": 9460
},
{
"epoch": 6.450953678474114,
"grad_norm": 0.15610432624816895,
"learning_rate": 7.688920119002297e-07,
"loss": 0.0089,
"step": 9470
},
{
"epoch": 6.4577656675749315,
"grad_norm": 0.1030815839767456,
"learning_rate": 7.402755585814269e-07,
"loss": 0.0099,
"step": 9480
},
{
"epoch": 6.46457765667575,
"grad_norm": 0.20818915963172913,
"learning_rate": 7.121978017170073e-07,
"loss": 0.0115,
"step": 9490
},
{
"epoch": 6.471389645776567,
"grad_norm": 0.1520918905735016,
"learning_rate": 6.846590483608306e-07,
"loss": 0.0084,
"step": 9500
},
{
"epoch": 6.478201634877384,
"grad_norm": 0.13606111705303192,
"learning_rate": 6.576595996722834e-07,
"loss": 0.0159,
"step": 9510
},
{
"epoch": 6.485013623978202,
"grad_norm": 0.1213141530752182,
"learning_rate": 6.311997509130141e-07,
"loss": 0.0093,
"step": 9520
},
{
"epoch": 6.491825613079019,
"grad_norm": 0.18930743634700775,
"learning_rate": 6.052797914436803e-07,
"loss": 0.0114,
"step": 9530
},
{
"epoch": 6.498637602179836,
"grad_norm": 0.2151637226343155,
"learning_rate": 5.799000047208181e-07,
"loss": 0.0133,
"step": 9540
},
{
"epoch": 6.505449591280654,
"grad_norm": 0.16114141047000885,
"learning_rate": 5.550606682937054e-07,
"loss": 0.0115,
"step": 9550
},
{
"epoch": 6.512261580381471,
"grad_norm": 0.1699608713388443,
"learning_rate": 5.307620538013481e-07,
"loss": 0.0114,
"step": 9560
},
{
"epoch": 6.5190735694822886,
"grad_norm": 0.15840139985084534,
"learning_rate": 5.070044269694874e-07,
"loss": 0.0101,
"step": 9570
},
{
"epoch": 6.525885558583107,
"grad_norm": 0.22831596434116364,
"learning_rate": 4.837880476077417e-07,
"loss": 0.0106,
"step": 9580
},
{
"epoch": 6.532697547683924,
"grad_norm": 0.118828684091568,
"learning_rate": 4.6111316960670835e-07,
"loss": 0.0098,
"step": 9590
},
{
"epoch": 6.539509536784741,
"grad_norm": 0.1655462384223938,
"learning_rate": 4.389800409352218e-07,
"loss": 0.0082,
"step": 9600
},
{
"epoch": 6.546321525885559,
"grad_norm": 0.1253342479467392,
"learning_rate": 4.173889036376277e-07,
"loss": 0.0111,
"step": 9610
},
{
"epoch": 6.553133514986376,
"grad_norm": 0.15380145609378815,
"learning_rate": 3.963399938311463e-07,
"loss": 0.0115,
"step": 9620
},
{
"epoch": 6.559945504087193,
"grad_norm": 0.13774822652339935,
"learning_rate": 3.7583354170328545e-07,
"loss": 0.012,
"step": 9630
},
{
"epoch": 6.566757493188011,
"grad_norm": 0.08887213468551636,
"learning_rate": 3.558697715093207e-07,
"loss": 0.0084,
"step": 9640
},
{
"epoch": 6.573569482288828,
"grad_norm": 0.2804868817329407,
"learning_rate": 3.3644890156983576e-07,
"loss": 0.0109,
"step": 9650
},
{
"epoch": 6.580381471389646,
"grad_norm": 0.12525686621665955,
"learning_rate": 3.175711442683638e-07,
"loss": 0.0084,
"step": 9660
},
{
"epoch": 6.587193460490463,
"grad_norm": 0.12775982916355133,
"learning_rate": 2.9923670604902197e-07,
"loss": 0.0097,
"step": 9670
},
{
"epoch": 6.594005449591281,
"grad_norm": 0.22419363260269165,
"learning_rate": 2.814457874143028e-07,
"loss": 0.0122,
"step": 9680
},
{
"epoch": 6.600817438692098,
"grad_norm": 0.16230632364749908,
"learning_rate": 2.641985829228366e-07,
"loss": 0.0102,
"step": 9690
},
{
"epoch": 6.607629427792915,
"grad_norm": 0.15815846621990204,
"learning_rate": 2.474952811872877e-07,
"loss": 0.0092,
"step": 9700
},
{
"epoch": 6.614441416893733,
"grad_norm": 0.13755181431770325,
"learning_rate": 2.3133606487228397e-07,
"loss": 0.0116,
"step": 9710
},
{
"epoch": 6.62125340599455,
"grad_norm": 0.09371072053909302,
"learning_rate": 2.157211106924295e-07,
"loss": 0.0087,
"step": 9720
},
{
"epoch": 6.628065395095367,
"grad_norm": 0.1671672761440277,
"learning_rate": 2.006505894103672e-07,
"loss": 0.0107,
"step": 9730
},
{
"epoch": 6.6348773841961854,
"grad_norm": 0.1295129358768463,
"learning_rate": 1.8612466583489696e-07,
"loss": 0.0098,
"step": 9740
},
{
"epoch": 6.641689373297003,
"grad_norm": 0.2207920253276825,
"learning_rate": 1.7214349881918834e-07,
"loss": 0.0097,
"step": 9750
},
{
"epoch": 6.64850136239782,
"grad_norm": 0.130056232213974,
"learning_rate": 1.5870724125904845e-07,
"loss": 0.0081,
"step": 9760
},
{
"epoch": 6.655313351498638,
"grad_norm": 0.12633217871189117,
"learning_rate": 1.4581604009124006e-07,
"loss": 0.0096,
"step": 9770
},
{
"epoch": 6.662125340599455,
"grad_norm": 0.16835469007492065,
"learning_rate": 1.334700362918717e-07,
"loss": 0.0091,
"step": 9780
},
{
"epoch": 6.668937329700272,
"grad_norm": 0.26601773500442505,
"learning_rate": 1.2166936487486015e-07,
"loss": 0.0104,
"step": 9790
},
{
"epoch": 6.67574931880109,
"grad_norm": 0.15718552470207214,
"learning_rate": 1.1041415489045914e-07,
"loss": 0.0089,
"step": 9800
},
{
"epoch": 6.682561307901907,
"grad_norm": 0.14041031897068024,
"learning_rate": 9.970452942384412e-08,
"loss": 0.0104,
"step": 9810
},
{
"epoch": 6.689373297002724,
"grad_norm": 0.10807531327009201,
"learning_rate": 8.954060559375754e-08,
"loss": 0.0087,
"step": 9820
},
{
"epoch": 6.6961852861035425,
"grad_norm": 0.13568098843097687,
"learning_rate": 7.99224945512489e-08,
"loss": 0.0159,
"step": 9830
},
{
"epoch": 6.70299727520436,
"grad_norm": 0.306471049785614,
"learning_rate": 7.085030147843675e-08,
"loss": 0.0124,
"step": 9840
},
{
"epoch": 6.709809264305177,
"grad_norm": 0.14044924080371857,
"learning_rate": 6.232412558736523e-08,
"loss": 0.0117,
"step": 9850
},
{
"epoch": 6.716621253405995,
"grad_norm": 0.14973674714565277,
"learning_rate": 5.434406011893822e-08,
"loss": 0.0139,
"step": 9860
},
{
"epoch": 6.723433242506812,
"grad_norm": 0.10210314393043518,
"learning_rate": 4.6910192341864664e-08,
"loss": 0.0078,
"step": 9870
},
{
"epoch": 6.730245231607629,
"grad_norm": 0.15292491018772125,
"learning_rate": 4.0022603551737035e-08,
"loss": 0.0099,
"step": 9880
},
{
"epoch": 6.737057220708447,
"grad_norm": 0.17868728935718536,
"learning_rate": 3.3681369070120985e-08,
"loss": 0.012,
"step": 9890
},
{
"epoch": 6.743869209809264,
"grad_norm": 0.17693090438842773,
"learning_rate": 2.7886558243744866e-08,
"loss": 0.0112,
"step": 9900
},
{
"epoch": 6.7506811989100814,
"grad_norm": 0.1320875883102417,
"learning_rate": 2.2638234443722596e-08,
"loss": 0.0096,
"step": 9910
},
{
"epoch": 6.7574931880108995,
"grad_norm": 0.1211492195725441,
"learning_rate": 1.7936455064887504e-08,
"loss": 0.013,
"step": 9920
},
{
"epoch": 6.764305177111717,
"grad_norm": 0.1284903734922409,
"learning_rate": 1.378127152514841e-08,
"loss": 0.0066,
"step": 9930
},
{
"epoch": 6.771117166212534,
"grad_norm": 0.12337515503168106,
"learning_rate": 1.0172729264917857e-08,
"loss": 0.0118,
"step": 9940
},
{
"epoch": 6.777929155313352,
"grad_norm": 0.15872040390968323,
"learning_rate": 7.1108677466458215e-09,
"loss": 0.0107,
"step": 9950
},
{
"epoch": 6.784741144414169,
"grad_norm": 0.13814593851566315,
"learning_rate": 4.595720454353414e-09,
"loss": 0.0119,
"step": 9960
},
{
"epoch": 6.791553133514986,
"grad_norm": 0.16548724472522736,
"learning_rate": 2.627314893294264e-09,
"loss": 0.008,
"step": 9970
},
{
"epoch": 6.798365122615804,
"grad_norm": 0.16446246206760406,
"learning_rate": 1.2056725896270048e-09,
"loss": 0.0097,
"step": 9980
},
{
"epoch": 6.805177111716621,
"grad_norm": 0.1332317590713501,
"learning_rate": 3.308090902098826e-10,
"loss": 0.0099,
"step": 9990
},
{
"epoch": 6.8119891008174385,
"grad_norm": 0.15192656219005585,
"learning_rate": 2.7339624120159555e-12,
"loss": 0.01,
"step": 10000
},
{
"epoch": 6.8119891008174385,
"step": 10000,
"total_flos": 0.0,
"train_loss": 0.0075618208244442936,
"train_runtime": 3810.2859,
"train_samples_per_second": 83.983,
"train_steps_per_second": 2.624
}
],
"logging_steps": 10,
"max_steps": 10000,
"num_input_tokens_seen": 0,
"num_train_epochs": 7,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}