phy_3D_unmovable_15000 / trainer_state.json
qjuu's picture
Upload folder using huggingface_hub
9cfdad9 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.538796861377507,
"eval_steps": 500,
"global_step": 15000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004359197907585004,
"grad_norm": 11.853053092956543,
"learning_rate": 1.2000000000000002e-06,
"loss": 2.8048,
"step": 10
},
{
"epoch": 0.008718395815170008,
"grad_norm": 7.921761512756348,
"learning_rate": 2.5333333333333334e-06,
"loss": 2.7551,
"step": 20
},
{
"epoch": 0.013077593722755012,
"grad_norm": 4.87008810043335,
"learning_rate": 3.866666666666667e-06,
"loss": 2.4567,
"step": 30
},
{
"epoch": 0.017436791630340016,
"grad_norm": 4.188699722290039,
"learning_rate": 5.2e-06,
"loss": 2.1709,
"step": 40
},
{
"epoch": 0.021795989537925022,
"grad_norm": 3.5124988555908203,
"learning_rate": 6.533333333333333e-06,
"loss": 1.9095,
"step": 50
},
{
"epoch": 0.026155187445510025,
"grad_norm": 3.8245785236358643,
"learning_rate": 7.866666666666667e-06,
"loss": 1.7831,
"step": 60
},
{
"epoch": 0.03051438535309503,
"grad_norm": 2.5836427211761475,
"learning_rate": 9.2e-06,
"loss": 1.6111,
"step": 70
},
{
"epoch": 0.03487358326068003,
"grad_norm": 5.420677185058594,
"learning_rate": 1.0533333333333335e-05,
"loss": 1.4375,
"step": 80
},
{
"epoch": 0.03923278116826504,
"grad_norm": 4.477448463439941,
"learning_rate": 1.1866666666666668e-05,
"loss": 1.4068,
"step": 90
},
{
"epoch": 0.043591979075850044,
"grad_norm": 1.6446658372879028,
"learning_rate": 1.32e-05,
"loss": 1.2661,
"step": 100
},
{
"epoch": 0.04795117698343505,
"grad_norm": 2.5110535621643066,
"learning_rate": 1.4533333333333335e-05,
"loss": 1.2488,
"step": 110
},
{
"epoch": 0.05231037489102005,
"grad_norm": 1.5065827369689941,
"learning_rate": 1.586666666666667e-05,
"loss": 1.1814,
"step": 120
},
{
"epoch": 0.05666957279860506,
"grad_norm": 3.6299448013305664,
"learning_rate": 1.7199999999999998e-05,
"loss": 1.187,
"step": 130
},
{
"epoch": 0.06102877070619006,
"grad_norm": 1.9311683177947998,
"learning_rate": 1.8533333333333334e-05,
"loss": 1.1004,
"step": 140
},
{
"epoch": 0.06538796861377506,
"grad_norm": 0.9780636429786682,
"learning_rate": 1.9866666666666667e-05,
"loss": 1.094,
"step": 150
},
{
"epoch": 0.06974716652136007,
"grad_norm": 1.6050626039505005,
"learning_rate": 2.12e-05,
"loss": 1.1096,
"step": 160
},
{
"epoch": 0.07410636442894507,
"grad_norm": 3.1172068119049072,
"learning_rate": 2.2533333333333333e-05,
"loss": 1.1344,
"step": 170
},
{
"epoch": 0.07846556233653008,
"grad_norm": 3.489858865737915,
"learning_rate": 2.3866666666666666e-05,
"loss": 1.0982,
"step": 180
},
{
"epoch": 0.08282476024411509,
"grad_norm": 2.0369985103607178,
"learning_rate": 2.5200000000000003e-05,
"loss": 1.1281,
"step": 190
},
{
"epoch": 0.08718395815170009,
"grad_norm": 1.0101550817489624,
"learning_rate": 2.6533333333333332e-05,
"loss": 1.0518,
"step": 200
},
{
"epoch": 0.09154315605928509,
"grad_norm": 1.3004437685012817,
"learning_rate": 2.786666666666667e-05,
"loss": 1.0985,
"step": 210
},
{
"epoch": 0.0959023539668701,
"grad_norm": 6.565486431121826,
"learning_rate": 2.9199999999999998e-05,
"loss": 1.1108,
"step": 220
},
{
"epoch": 0.1002615518744551,
"grad_norm": 0.6993623375892639,
"learning_rate": 3.0533333333333335e-05,
"loss": 1.1143,
"step": 230
},
{
"epoch": 0.1046207497820401,
"grad_norm": 5.090579986572266,
"learning_rate": 3.1866666666666664e-05,
"loss": 1.067,
"step": 240
},
{
"epoch": 0.10897994768962511,
"grad_norm": 1.5457755327224731,
"learning_rate": 3.32e-05,
"loss": 1.0876,
"step": 250
},
{
"epoch": 0.11333914559721012,
"grad_norm": 1.9382998943328857,
"learning_rate": 3.453333333333334e-05,
"loss": 1.0713,
"step": 260
},
{
"epoch": 0.11769834350479512,
"grad_norm": 3.967763900756836,
"learning_rate": 3.586666666666667e-05,
"loss": 1.0995,
"step": 270
},
{
"epoch": 0.12205754141238012,
"grad_norm": 1.1996185779571533,
"learning_rate": 3.72e-05,
"loss": 1.1083,
"step": 280
},
{
"epoch": 0.12641673931996514,
"grad_norm": 1.581020474433899,
"learning_rate": 3.853333333333334e-05,
"loss": 1.0617,
"step": 290
},
{
"epoch": 0.13077593722755013,
"grad_norm": 1.5426340103149414,
"learning_rate": 3.986666666666667e-05,
"loss": 1.0893,
"step": 300
},
{
"epoch": 0.13513513513513514,
"grad_norm": 1.2590006589889526,
"learning_rate": 4.12e-05,
"loss": 1.0718,
"step": 310
},
{
"epoch": 0.13949433304272013,
"grad_norm": 0.9600476026535034,
"learning_rate": 4.2533333333333335e-05,
"loss": 1.0769,
"step": 320
},
{
"epoch": 0.14385353095030515,
"grad_norm": 1.3560572862625122,
"learning_rate": 4.3866666666666665e-05,
"loss": 1.0867,
"step": 330
},
{
"epoch": 0.14821272885789014,
"grad_norm": 3.1593310832977295,
"learning_rate": 4.52e-05,
"loss": 1.0845,
"step": 340
},
{
"epoch": 0.15257192676547515,
"grad_norm": 1.2943660020828247,
"learning_rate": 4.653333333333334e-05,
"loss": 1.0656,
"step": 350
},
{
"epoch": 0.15693112467306017,
"grad_norm": 0.8209326267242432,
"learning_rate": 4.7866666666666674e-05,
"loss": 1.0717,
"step": 360
},
{
"epoch": 0.16129032258064516,
"grad_norm": 1.0528558492660522,
"learning_rate": 4.92e-05,
"loss": 1.0956,
"step": 370
},
{
"epoch": 0.16564952048823017,
"grad_norm": 0.7630209922790527,
"learning_rate": 5.053333333333333e-05,
"loss": 1.1068,
"step": 380
},
{
"epoch": 0.17000871839581516,
"grad_norm": 1.6207125186920166,
"learning_rate": 5.1866666666666676e-05,
"loss": 1.0622,
"step": 390
},
{
"epoch": 0.17436791630340018,
"grad_norm": 0.951837956905365,
"learning_rate": 5.3200000000000006e-05,
"loss": 1.0681,
"step": 400
},
{
"epoch": 0.17872711421098517,
"grad_norm": 1.2214558124542236,
"learning_rate": 5.4533333333333335e-05,
"loss": 1.0688,
"step": 410
},
{
"epoch": 0.18308631211857018,
"grad_norm": 2.2449309825897217,
"learning_rate": 5.5866666666666665e-05,
"loss": 1.084,
"step": 420
},
{
"epoch": 0.1874455100261552,
"grad_norm": 1.1216390132904053,
"learning_rate": 5.72e-05,
"loss": 1.0733,
"step": 430
},
{
"epoch": 0.1918047079337402,
"grad_norm": 0.7358635067939758,
"learning_rate": 5.853333333333334e-05,
"loss": 1.0753,
"step": 440
},
{
"epoch": 0.1961639058413252,
"grad_norm": 0.8561913371086121,
"learning_rate": 5.9866666666666674e-05,
"loss": 1.0714,
"step": 450
},
{
"epoch": 0.2005231037489102,
"grad_norm": 1.0774980783462524,
"learning_rate": 6.12e-05,
"loss": 1.0618,
"step": 460
},
{
"epoch": 0.2048823016564952,
"grad_norm": 1.3355389833450317,
"learning_rate": 6.253333333333333e-05,
"loss": 1.0681,
"step": 470
},
{
"epoch": 0.2092414995640802,
"grad_norm": 0.6401084065437317,
"learning_rate": 6.386666666666667e-05,
"loss": 1.0583,
"step": 480
},
{
"epoch": 0.2136006974716652,
"grad_norm": 1.3419454097747803,
"learning_rate": 6.52e-05,
"loss": 1.074,
"step": 490
},
{
"epoch": 0.21795989537925023,
"grad_norm": 0.993840217590332,
"learning_rate": 6.653333333333334e-05,
"loss": 1.0666,
"step": 500
},
{
"epoch": 0.22231909328683522,
"grad_norm": 1.9545044898986816,
"learning_rate": 6.786666666666667e-05,
"loss": 1.0842,
"step": 510
},
{
"epoch": 0.22667829119442023,
"grad_norm": 0.7455446124076843,
"learning_rate": 6.92e-05,
"loss": 1.057,
"step": 520
},
{
"epoch": 0.23103748910200522,
"grad_norm": 1.0625560283660889,
"learning_rate": 7.053333333333334e-05,
"loss": 1.0793,
"step": 530
},
{
"epoch": 0.23539668700959024,
"grad_norm": 1.8029131889343262,
"learning_rate": 7.186666666666667e-05,
"loss": 1.0806,
"step": 540
},
{
"epoch": 0.23975588491717523,
"grad_norm": 0.7211843729019165,
"learning_rate": 7.32e-05,
"loss": 1.0851,
"step": 550
},
{
"epoch": 0.24411508282476024,
"grad_norm": 1.2813831567764282,
"learning_rate": 7.453333333333333e-05,
"loss": 1.0613,
"step": 560
},
{
"epoch": 0.24847428073234526,
"grad_norm": 1.113389015197754,
"learning_rate": 7.586666666666668e-05,
"loss": 1.0536,
"step": 570
},
{
"epoch": 0.2528334786399303,
"grad_norm": 1.0559017658233643,
"learning_rate": 7.72e-05,
"loss": 1.0626,
"step": 580
},
{
"epoch": 0.25719267654751526,
"grad_norm": 2.7463810443878174,
"learning_rate": 7.853333333333334e-05,
"loss": 1.0652,
"step": 590
},
{
"epoch": 0.26155187445510025,
"grad_norm": 1.103918433189392,
"learning_rate": 7.986666666666667e-05,
"loss": 1.0634,
"step": 600
},
{
"epoch": 0.26591107236268524,
"grad_norm": 1.6153513193130493,
"learning_rate": 8.120000000000001e-05,
"loss": 1.0869,
"step": 610
},
{
"epoch": 0.2702702702702703,
"grad_norm": 1.8530434370040894,
"learning_rate": 8.253333333333334e-05,
"loss": 1.0693,
"step": 620
},
{
"epoch": 0.2746294681778553,
"grad_norm": 0.752463161945343,
"learning_rate": 8.386666666666667e-05,
"loss": 1.0919,
"step": 630
},
{
"epoch": 0.27898866608544026,
"grad_norm": 0.6594322323799133,
"learning_rate": 8.52e-05,
"loss": 1.0756,
"step": 640
},
{
"epoch": 0.2833478639930253,
"grad_norm": 0.9615456461906433,
"learning_rate": 8.653333333333333e-05,
"loss": 1.0839,
"step": 650
},
{
"epoch": 0.2877070619006103,
"grad_norm": 0.9804999828338623,
"learning_rate": 8.786666666666667e-05,
"loss": 1.0633,
"step": 660
},
{
"epoch": 0.2920662598081953,
"grad_norm": 1.1694570779800415,
"learning_rate": 8.92e-05,
"loss": 1.0849,
"step": 670
},
{
"epoch": 0.29642545771578027,
"grad_norm": 1.0795670747756958,
"learning_rate": 9.053333333333334e-05,
"loss": 1.0552,
"step": 680
},
{
"epoch": 0.3007846556233653,
"grad_norm": 1.1118414402008057,
"learning_rate": 9.186666666666667e-05,
"loss": 1.0753,
"step": 690
},
{
"epoch": 0.3051438535309503,
"grad_norm": 1.0514802932739258,
"learning_rate": 9.320000000000002e-05,
"loss": 1.0641,
"step": 700
},
{
"epoch": 0.3095030514385353,
"grad_norm": 0.9771641492843628,
"learning_rate": 9.453333333333335e-05,
"loss": 1.0756,
"step": 710
},
{
"epoch": 0.31386224934612034,
"grad_norm": 2.385375499725342,
"learning_rate": 9.586666666666667e-05,
"loss": 1.0475,
"step": 720
},
{
"epoch": 0.3182214472537053,
"grad_norm": 1.542068600654602,
"learning_rate": 9.72e-05,
"loss": 1.0494,
"step": 730
},
{
"epoch": 0.3225806451612903,
"grad_norm": 2.3379392623901367,
"learning_rate": 9.853333333333333e-05,
"loss": 1.0553,
"step": 740
},
{
"epoch": 0.3269398430688753,
"grad_norm": 0.5032880902290344,
"learning_rate": 9.986666666666668e-05,
"loss": 1.0873,
"step": 750
},
{
"epoch": 0.33129904097646035,
"grad_norm": 1.1064118146896362,
"learning_rate": 9.999990157738453e-05,
"loss": 1.0807,
"step": 760
},
{
"epoch": 0.33565823888404533,
"grad_norm": 0.9185916781425476,
"learning_rate": 9.999956135155687e-05,
"loss": 1.0868,
"step": 770
},
{
"epoch": 0.3400174367916303,
"grad_norm": 1.1694447994232178,
"learning_rate": 9.99989781090763e-05,
"loss": 1.098,
"step": 780
},
{
"epoch": 0.34437663469921537,
"grad_norm": 0.6519765257835388,
"learning_rate": 9.999815185277755e-05,
"loss": 1.0567,
"step": 790
},
{
"epoch": 0.34873583260680036,
"grad_norm": 1.022257924079895,
"learning_rate": 9.999708258667652e-05,
"loss": 1.0396,
"step": 800
},
{
"epoch": 0.35309503051438534,
"grad_norm": 0.7446913719177246,
"learning_rate": 9.999577031597029e-05,
"loss": 1.0574,
"step": 810
},
{
"epoch": 0.35745422842197033,
"grad_norm": 0.6779941916465759,
"learning_rate": 9.999421504703696e-05,
"loss": 1.0582,
"step": 820
},
{
"epoch": 0.3618134263295554,
"grad_norm": 0.8097116351127625,
"learning_rate": 9.999241678743574e-05,
"loss": 1.0507,
"step": 830
},
{
"epoch": 0.36617262423714037,
"grad_norm": 1.2719435691833496,
"learning_rate": 9.999037554590683e-05,
"loss": 1.0814,
"step": 840
},
{
"epoch": 0.37053182214472535,
"grad_norm": 1.1539487838745117,
"learning_rate": 9.998809133237143e-05,
"loss": 1.0376,
"step": 850
},
{
"epoch": 0.3748910200523104,
"grad_norm": 3.295795440673828,
"learning_rate": 9.998556415793169e-05,
"loss": 1.0528,
"step": 860
},
{
"epoch": 0.3792502179598954,
"grad_norm": 0.8902769088745117,
"learning_rate": 9.998279403487062e-05,
"loss": 1.0357,
"step": 870
},
{
"epoch": 0.3836094158674804,
"grad_norm": 0.5429982542991638,
"learning_rate": 9.997978097665205e-05,
"loss": 1.0108,
"step": 880
},
{
"epoch": 0.38796861377506536,
"grad_norm": 0.8729974627494812,
"learning_rate": 9.99765249979206e-05,
"loss": 1.0723,
"step": 890
},
{
"epoch": 0.3923278116826504,
"grad_norm": 1.0210973024368286,
"learning_rate": 9.997302611450154e-05,
"loss": 1.0844,
"step": 900
},
{
"epoch": 0.3966870095902354,
"grad_norm": 0.8282007575035095,
"learning_rate": 9.996928434340073e-05,
"loss": 1.0487,
"step": 910
},
{
"epoch": 0.4010462074978204,
"grad_norm": 0.6480956077575684,
"learning_rate": 9.996529970280462e-05,
"loss": 1.0471,
"step": 920
},
{
"epoch": 0.40540540540540543,
"grad_norm": 0.5509655475616455,
"learning_rate": 9.996107221208004e-05,
"loss": 1.0526,
"step": 930
},
{
"epoch": 0.4097646033129904,
"grad_norm": 1.1537059545516968,
"learning_rate": 9.995660189177419e-05,
"loss": 1.0561,
"step": 940
},
{
"epoch": 0.4141238012205754,
"grad_norm": 2.260296583175659,
"learning_rate": 9.995188876361451e-05,
"loss": 1.0208,
"step": 950
},
{
"epoch": 0.4184829991281604,
"grad_norm": 0.829742431640625,
"learning_rate": 9.994693285050857e-05,
"loss": 1.0498,
"step": 960
},
{
"epoch": 0.42284219703574544,
"grad_norm": 0.6473196148872375,
"learning_rate": 9.994173417654395e-05,
"loss": 1.0367,
"step": 970
},
{
"epoch": 0.4272013949433304,
"grad_norm": 0.6572287678718567,
"learning_rate": 9.993629276698821e-05,
"loss": 1.0416,
"step": 980
},
{
"epoch": 0.4315605928509154,
"grad_norm": 0.47851496934890747,
"learning_rate": 9.993060864828858e-05,
"loss": 1.0464,
"step": 990
},
{
"epoch": 0.43591979075850046,
"grad_norm": 0.43127620220184326,
"learning_rate": 9.992468184807206e-05,
"loss": 1.0678,
"step": 1000
},
{
"epoch": 0.44027898866608545,
"grad_norm": 0.5899858474731445,
"learning_rate": 9.991851239514511e-05,
"loss": 1.0726,
"step": 1010
},
{
"epoch": 0.44463818657367044,
"grad_norm": 0.6047391295433044,
"learning_rate": 9.991210031949359e-05,
"loss": 1.0497,
"step": 1020
},
{
"epoch": 0.4489973844812554,
"grad_norm": 0.5648536682128906,
"learning_rate": 9.990544565228259e-05,
"loss": 1.0402,
"step": 1030
},
{
"epoch": 0.45335658238884047,
"grad_norm": 0.5038257241249084,
"learning_rate": 9.989854842585631e-05,
"loss": 1.0459,
"step": 1040
},
{
"epoch": 0.45771578029642546,
"grad_norm": 0.5015464425086975,
"learning_rate": 9.989140867373783e-05,
"loss": 1.0453,
"step": 1050
},
{
"epoch": 0.46207497820401044,
"grad_norm": 0.6012107729911804,
"learning_rate": 9.988402643062907e-05,
"loss": 1.0527,
"step": 1060
},
{
"epoch": 0.4664341761115955,
"grad_norm": 0.8822391033172607,
"learning_rate": 9.987640173241046e-05,
"loss": 1.0598,
"step": 1070
},
{
"epoch": 0.4707933740191805,
"grad_norm": 0.8211294412612915,
"learning_rate": 9.986853461614093e-05,
"loss": 1.0355,
"step": 1080
},
{
"epoch": 0.47515257192676547,
"grad_norm": 0.5536686182022095,
"learning_rate": 9.986042512005763e-05,
"loss": 1.0622,
"step": 1090
},
{
"epoch": 0.47951176983435045,
"grad_norm": 0.7554659247398376,
"learning_rate": 9.985207328357573e-05,
"loss": 1.038,
"step": 1100
},
{
"epoch": 0.4838709677419355,
"grad_norm": 0.6661390066146851,
"learning_rate": 9.984347914728829e-05,
"loss": 1.0291,
"step": 1110
},
{
"epoch": 0.4882301656495205,
"grad_norm": 0.5693694353103638,
"learning_rate": 9.983464275296605e-05,
"loss": 1.0499,
"step": 1120
},
{
"epoch": 0.4925893635571055,
"grad_norm": 0.3903997838497162,
"learning_rate": 9.982556414355724e-05,
"loss": 1.0945,
"step": 1130
},
{
"epoch": 0.4969485614646905,
"grad_norm": 0.534357488155365,
"learning_rate": 9.981624336318726e-05,
"loss": 1.0483,
"step": 1140
},
{
"epoch": 0.5013077593722755,
"grad_norm": 0.7887455224990845,
"learning_rate": 9.980668045715864e-05,
"loss": 1.0626,
"step": 1150
},
{
"epoch": 0.5056669572798606,
"grad_norm": 1.018442988395691,
"learning_rate": 9.979687547195066e-05,
"loss": 1.0215,
"step": 1160
},
{
"epoch": 0.5100261551874455,
"grad_norm": 0.8873037099838257,
"learning_rate": 9.978682845521927e-05,
"loss": 1.082,
"step": 1170
},
{
"epoch": 0.5143853530950305,
"grad_norm": 0.523079514503479,
"learning_rate": 9.977653945579673e-05,
"loss": 1.0835,
"step": 1180
},
{
"epoch": 0.5187445510026155,
"grad_norm": 0.5072969198226929,
"learning_rate": 9.976600852369144e-05,
"loss": 1.0621,
"step": 1190
},
{
"epoch": 0.5231037489102005,
"grad_norm": 0.9282866716384888,
"learning_rate": 9.975523571008769e-05,
"loss": 1.0583,
"step": 1200
},
{
"epoch": 0.5274629468177855,
"grad_norm": 1.1153844594955444,
"learning_rate": 9.97442210673454e-05,
"loss": 1.0592,
"step": 1210
},
{
"epoch": 0.5318221447253705,
"grad_norm": 0.5975160598754883,
"learning_rate": 9.973296464899988e-05,
"loss": 1.0872,
"step": 1220
},
{
"epoch": 0.5361813426329556,
"grad_norm": 0.6239467859268188,
"learning_rate": 9.972146650976154e-05,
"loss": 1.0563,
"step": 1230
},
{
"epoch": 0.5405405405405406,
"grad_norm": 0.4678373336791992,
"learning_rate": 9.970972670551566e-05,
"loss": 1.0855,
"step": 1240
},
{
"epoch": 0.5448997384481256,
"grad_norm": 0.6621468663215637,
"learning_rate": 9.969774529332212e-05,
"loss": 1.0673,
"step": 1250
},
{
"epoch": 0.5492589363557105,
"grad_norm": 0.6079438328742981,
"learning_rate": 9.968552233141504e-05,
"loss": 1.0705,
"step": 1260
},
{
"epoch": 0.5536181342632955,
"grad_norm": 0.57035231590271,
"learning_rate": 9.967305787920264e-05,
"loss": 1.0736,
"step": 1270
},
{
"epoch": 0.5579773321708805,
"grad_norm": 0.7185502052307129,
"learning_rate": 9.966035199726684e-05,
"loss": 1.0673,
"step": 1280
},
{
"epoch": 0.5623365300784655,
"grad_norm": 0.6128619909286499,
"learning_rate": 9.9647404747363e-05,
"loss": 1.0699,
"step": 1290
},
{
"epoch": 0.5666957279860506,
"grad_norm": 0.5556525588035583,
"learning_rate": 9.96342161924196e-05,
"loss": 1.0554,
"step": 1300
},
{
"epoch": 0.5710549258936356,
"grad_norm": 0.6182255148887634,
"learning_rate": 9.962078639653797e-05,
"loss": 1.0799,
"step": 1310
},
{
"epoch": 0.5754141238012206,
"grad_norm": 0.7383370399475098,
"learning_rate": 9.960711542499202e-05,
"loss": 1.0533,
"step": 1320
},
{
"epoch": 0.5797733217088056,
"grad_norm": 0.4465934932231903,
"learning_rate": 9.959320334422772e-05,
"loss": 1.059,
"step": 1330
},
{
"epoch": 0.5841325196163906,
"grad_norm": 1.1247705221176147,
"learning_rate": 9.957905022186309e-05,
"loss": 1.0402,
"step": 1340
},
{
"epoch": 0.5884917175239756,
"grad_norm": 0.7212786674499512,
"learning_rate": 9.956465612668757e-05,
"loss": 1.0318,
"step": 1350
},
{
"epoch": 0.5928509154315605,
"grad_norm": 0.6343246698379517,
"learning_rate": 9.95500211286619e-05,
"loss": 1.0641,
"step": 1360
},
{
"epoch": 0.5972101133391456,
"grad_norm": 0.8640972971916199,
"learning_rate": 9.953514529891763e-05,
"loss": 1.0435,
"step": 1370
},
{
"epoch": 0.6015693112467306,
"grad_norm": 1.0832066535949707,
"learning_rate": 9.952002870975693e-05,
"loss": 1.0402,
"step": 1380
},
{
"epoch": 0.6059285091543156,
"grad_norm": 0.8250817060470581,
"learning_rate": 9.950467143465207e-05,
"loss": 1.0501,
"step": 1390
},
{
"epoch": 0.6102877070619006,
"grad_norm": 0.8554821014404297,
"learning_rate": 9.94890735482452e-05,
"loss": 1.0527,
"step": 1400
},
{
"epoch": 0.6146469049694856,
"grad_norm": 0.6301167011260986,
"learning_rate": 9.947323512634788e-05,
"loss": 1.069,
"step": 1410
},
{
"epoch": 0.6190061028770706,
"grad_norm": 0.5204041600227356,
"learning_rate": 9.945715624594081e-05,
"loss": 1.0728,
"step": 1420
},
{
"epoch": 0.6233653007846556,
"grad_norm": 0.8133730888366699,
"learning_rate": 9.944083698517339e-05,
"loss": 1.0364,
"step": 1430
},
{
"epoch": 0.6277244986922407,
"grad_norm": 1.3038055896759033,
"learning_rate": 9.942427742336334e-05,
"loss": 1.0204,
"step": 1440
},
{
"epoch": 0.6320836965998257,
"grad_norm": 0.687967836856842,
"learning_rate": 9.940747764099638e-05,
"loss": 1.032,
"step": 1450
},
{
"epoch": 0.6364428945074107,
"grad_norm": 0.5078476667404175,
"learning_rate": 9.939043771972574e-05,
"loss": 1.0334,
"step": 1460
},
{
"epoch": 0.6408020924149956,
"grad_norm": 0.5568360090255737,
"learning_rate": 9.937315774237186e-05,
"loss": 1.0348,
"step": 1470
},
{
"epoch": 0.6451612903225806,
"grad_norm": 0.6646256446838379,
"learning_rate": 9.93556377929219e-05,
"loss": 1.0356,
"step": 1480
},
{
"epoch": 0.6495204882301656,
"grad_norm": 0.6583265662193298,
"learning_rate": 9.933787795652942e-05,
"loss": 1.0468,
"step": 1490
},
{
"epoch": 0.6538796861377506,
"grad_norm": 0.9695225358009338,
"learning_rate": 9.931987831951386e-05,
"loss": 1.0574,
"step": 1500
},
{
"epoch": 0.6582388840453357,
"grad_norm": 0.5819401741027832,
"learning_rate": 9.930163896936027e-05,
"loss": 1.0484,
"step": 1510
},
{
"epoch": 0.6625980819529207,
"grad_norm": 0.7117506265640259,
"learning_rate": 9.92831599947187e-05,
"loss": 1.0333,
"step": 1520
},
{
"epoch": 0.6669572798605057,
"grad_norm": 1.2941336631774902,
"learning_rate": 9.926444148540393e-05,
"loss": 1.0536,
"step": 1530
},
{
"epoch": 0.6713164777680907,
"grad_norm": 0.5326636433601379,
"learning_rate": 9.924548353239495e-05,
"loss": 1.0346,
"step": 1540
},
{
"epoch": 0.6756756756756757,
"grad_norm": 0.4321506917476654,
"learning_rate": 9.922628622783451e-05,
"loss": 1.0311,
"step": 1550
},
{
"epoch": 0.6800348735832606,
"grad_norm": 0.7744588851928711,
"learning_rate": 9.920684966502878e-05,
"loss": 1.0428,
"step": 1560
},
{
"epoch": 0.6843940714908456,
"grad_norm": 0.456521600484848,
"learning_rate": 9.918717393844669e-05,
"loss": 1.0725,
"step": 1570
},
{
"epoch": 0.6887532693984307,
"grad_norm": 1.024055004119873,
"learning_rate": 9.916725914371969e-05,
"loss": 1.0228,
"step": 1580
},
{
"epoch": 0.6931124673060157,
"grad_norm": 0.7041059732437134,
"learning_rate": 9.914710537764117e-05,
"loss": 1.056,
"step": 1590
},
{
"epoch": 0.6974716652136007,
"grad_norm": 0.5206664800643921,
"learning_rate": 9.912671273816601e-05,
"loss": 1.0437,
"step": 1600
},
{
"epoch": 0.7018308631211857,
"grad_norm": 0.5767226815223694,
"learning_rate": 9.910608132441008e-05,
"loss": 1.0398,
"step": 1610
},
{
"epoch": 0.7061900610287707,
"grad_norm": 1.048291802406311,
"learning_rate": 9.908521123664981e-05,
"loss": 1.0063,
"step": 1620
},
{
"epoch": 0.7105492589363557,
"grad_norm": 0.4653307795524597,
"learning_rate": 9.906410257632168e-05,
"loss": 1.0541,
"step": 1630
},
{
"epoch": 0.7149084568439407,
"grad_norm": 0.6922876834869385,
"learning_rate": 9.904275544602169e-05,
"loss": 1.0439,
"step": 1640
},
{
"epoch": 0.7192676547515258,
"grad_norm": 0.4627837538719177,
"learning_rate": 9.902116994950493e-05,
"loss": 1.0504,
"step": 1650
},
{
"epoch": 0.7236268526591108,
"grad_norm": 1.1777377128601074,
"learning_rate": 9.899934619168501e-05,
"loss": 1.0402,
"step": 1660
},
{
"epoch": 0.7279860505666957,
"grad_norm": 0.6930601000785828,
"learning_rate": 9.89772842786336e-05,
"loss": 1.0721,
"step": 1670
},
{
"epoch": 0.7323452484742807,
"grad_norm": 1.1171027421951294,
"learning_rate": 9.895498431757989e-05,
"loss": 1.062,
"step": 1680
},
{
"epoch": 0.7367044463818657,
"grad_norm": 0.7950462102890015,
"learning_rate": 9.893244641691006e-05,
"loss": 1.0624,
"step": 1690
},
{
"epoch": 0.7410636442894507,
"grad_norm": 1.396531581878662,
"learning_rate": 9.890967068616677e-05,
"loss": 1.0489,
"step": 1700
},
{
"epoch": 0.7454228421970357,
"grad_norm": 0.8184282183647156,
"learning_rate": 9.888665723604864e-05,
"loss": 1.05,
"step": 1710
},
{
"epoch": 0.7497820401046208,
"grad_norm": 1.0491915941238403,
"learning_rate": 9.886340617840968e-05,
"loss": 1.0579,
"step": 1720
},
{
"epoch": 0.7541412380122058,
"grad_norm": 0.5465502142906189,
"learning_rate": 9.883991762625876e-05,
"loss": 1.0527,
"step": 1730
},
{
"epoch": 0.7585004359197908,
"grad_norm": 0.7717880010604858,
"learning_rate": 9.881619169375908e-05,
"loss": 1.0409,
"step": 1740
},
{
"epoch": 0.7628596338273758,
"grad_norm": 0.7524123191833496,
"learning_rate": 9.879222849622758e-05,
"loss": 1.0472,
"step": 1750
},
{
"epoch": 0.7672188317349607,
"grad_norm": 0.7540025115013123,
"learning_rate": 9.876802815013439e-05,
"loss": 1.0535,
"step": 1760
},
{
"epoch": 0.7715780296425457,
"grad_norm": 0.5446664094924927,
"learning_rate": 9.87435907731023e-05,
"loss": 1.0344,
"step": 1770
},
{
"epoch": 0.7759372275501307,
"grad_norm": 0.5577282309532166,
"learning_rate": 9.871891648390614e-05,
"loss": 1.0484,
"step": 1780
},
{
"epoch": 0.7802964254577158,
"grad_norm": 0.6824474930763245,
"learning_rate": 9.869400540247223e-05,
"loss": 1.0257,
"step": 1790
},
{
"epoch": 0.7846556233653008,
"grad_norm": 1.3244407176971436,
"learning_rate": 9.866885764987776e-05,
"loss": 1.0293,
"step": 1800
},
{
"epoch": 0.7890148212728858,
"grad_norm": 0.7491902709007263,
"learning_rate": 9.86434733483503e-05,
"loss": 1.0338,
"step": 1810
},
{
"epoch": 0.7933740191804708,
"grad_norm": 0.7118448615074158,
"learning_rate": 9.861785262126705e-05,
"loss": 1.0245,
"step": 1820
},
{
"epoch": 0.7977332170880558,
"grad_norm": 0.7211303114891052,
"learning_rate": 9.85919955931544e-05,
"loss": 1.0225,
"step": 1830
},
{
"epoch": 0.8020924149956408,
"grad_norm": 0.5354200601577759,
"learning_rate": 9.856590238968721e-05,
"loss": 1.0291,
"step": 1840
},
{
"epoch": 0.8064516129032258,
"grad_norm": 0.42781445384025574,
"learning_rate": 9.853957313768824e-05,
"loss": 1.0356,
"step": 1850
},
{
"epoch": 0.8108108108108109,
"grad_norm": 0.6619517207145691,
"learning_rate": 9.851300796512755e-05,
"loss": 1.0363,
"step": 1860
},
{
"epoch": 0.8151700087183958,
"grad_norm": 0.9382469058036804,
"learning_rate": 9.848620700112188e-05,
"loss": 1.01,
"step": 1870
},
{
"epoch": 0.8195292066259808,
"grad_norm": 0.9875907301902771,
"learning_rate": 9.845917037593396e-05,
"loss": 1.048,
"step": 1880
},
{
"epoch": 0.8238884045335658,
"grad_norm": 0.6974698305130005,
"learning_rate": 9.843189822097196e-05,
"loss": 1.0514,
"step": 1890
},
{
"epoch": 0.8282476024411508,
"grad_norm": 0.5646869540214539,
"learning_rate": 9.84043906687888e-05,
"loss": 1.0436,
"step": 1900
},
{
"epoch": 0.8326068003487358,
"grad_norm": 0.5306071639060974,
"learning_rate": 9.837664785308149e-05,
"loss": 1.0578,
"step": 1910
},
{
"epoch": 0.8369659982563208,
"grad_norm": 0.4362487494945526,
"learning_rate": 9.834866990869059e-05,
"loss": 1.0183,
"step": 1920
},
{
"epoch": 0.8413251961639059,
"grad_norm": 0.5740931630134583,
"learning_rate": 9.832045697159938e-05,
"loss": 1.0421,
"step": 1930
},
{
"epoch": 0.8456843940714909,
"grad_norm": 0.37821829319000244,
"learning_rate": 9.829200917893334e-05,
"loss": 1.0275,
"step": 1940
},
{
"epoch": 0.8500435919790759,
"grad_norm": 0.567612886428833,
"learning_rate": 9.826332666895944e-05,
"loss": 1.0372,
"step": 1950
},
{
"epoch": 0.8544027898866609,
"grad_norm": 0.512275755405426,
"learning_rate": 9.823440958108545e-05,
"loss": 1.0482,
"step": 1960
},
{
"epoch": 0.8587619877942458,
"grad_norm": 0.7865485548973083,
"learning_rate": 9.820525805585927e-05,
"loss": 1.0437,
"step": 1970
},
{
"epoch": 0.8631211857018308,
"grad_norm": 0.6144497394561768,
"learning_rate": 9.81758722349683e-05,
"loss": 1.0411,
"step": 1980
},
{
"epoch": 0.8674803836094158,
"grad_norm": 0.7085159420967102,
"learning_rate": 9.814625226123862e-05,
"loss": 1.0431,
"step": 1990
},
{
"epoch": 0.8718395815170009,
"grad_norm": 0.5472831130027771,
"learning_rate": 9.811639827863449e-05,
"loss": 1.047,
"step": 2000
},
{
"epoch": 0.8761987794245859,
"grad_norm": 0.6783573627471924,
"learning_rate": 9.808631043225741e-05,
"loss": 1.0448,
"step": 2010
},
{
"epoch": 0.8805579773321709,
"grad_norm": 0.7649420499801636,
"learning_rate": 9.805598886834567e-05,
"loss": 1.0241,
"step": 2020
},
{
"epoch": 0.8849171752397559,
"grad_norm": 0.5474071502685547,
"learning_rate": 9.802543373427344e-05,
"loss": 1.035,
"step": 2030
},
{
"epoch": 0.8892763731473409,
"grad_norm": 0.40244612097740173,
"learning_rate": 9.799464517855018e-05,
"loss": 1.0289,
"step": 2040
},
{
"epoch": 0.8936355710549259,
"grad_norm": 0.5829120874404907,
"learning_rate": 9.79636233508198e-05,
"loss": 1.0308,
"step": 2050
},
{
"epoch": 0.8979947689625108,
"grad_norm": 0.6150997877120972,
"learning_rate": 9.793236840186005e-05,
"loss": 1.0344,
"step": 2060
},
{
"epoch": 0.902353966870096,
"grad_norm": 0.6614237427711487,
"learning_rate": 9.790088048358175e-05,
"loss": 1.0162,
"step": 2070
},
{
"epoch": 0.9067131647776809,
"grad_norm": 0.5127846598625183,
"learning_rate": 9.786915974902798e-05,
"loss": 1.0383,
"step": 2080
},
{
"epoch": 0.9110723626852659,
"grad_norm": 0.6490142941474915,
"learning_rate": 9.783720635237343e-05,
"loss": 1.0134,
"step": 2090
},
{
"epoch": 0.9154315605928509,
"grad_norm": 0.6402510404586792,
"learning_rate": 9.780502044892362e-05,
"loss": 1.0332,
"step": 2100
},
{
"epoch": 0.9197907585004359,
"grad_norm": 0.4572957754135132,
"learning_rate": 9.777260219511415e-05,
"loss": 1.0204,
"step": 2110
},
{
"epoch": 0.9241499564080209,
"grad_norm": 1.4024547338485718,
"learning_rate": 9.773995174850989e-05,
"loss": 1.0246,
"step": 2120
},
{
"epoch": 0.9285091543156059,
"grad_norm": 0.765164852142334,
"learning_rate": 9.770706926780428e-05,
"loss": 1.0217,
"step": 2130
},
{
"epoch": 0.932868352223191,
"grad_norm": 0.8015472292900085,
"learning_rate": 9.767395491281855e-05,
"loss": 1.0586,
"step": 2140
},
{
"epoch": 0.937227550130776,
"grad_norm": 0.7269527912139893,
"learning_rate": 9.764060884450086e-05,
"loss": 1.0121,
"step": 2150
},
{
"epoch": 0.941586748038361,
"grad_norm": 0.7793417572975159,
"learning_rate": 9.76070312249257e-05,
"loss": 1.0036,
"step": 2160
},
{
"epoch": 0.9459459459459459,
"grad_norm": 0.6566328406333923,
"learning_rate": 9.757322221729283e-05,
"loss": 1.024,
"step": 2170
},
{
"epoch": 0.9503051438535309,
"grad_norm": 0.7390000224113464,
"learning_rate": 9.753918198592682e-05,
"loss": 1.0305,
"step": 2180
},
{
"epoch": 0.9546643417611159,
"grad_norm": 0.6735124588012695,
"learning_rate": 9.750491069627593e-05,
"loss": 1.0197,
"step": 2190
},
{
"epoch": 0.9590235396687009,
"grad_norm": 1.4354088306427002,
"learning_rate": 9.747040851491149e-05,
"loss": 1.0231,
"step": 2200
},
{
"epoch": 0.963382737576286,
"grad_norm": 0.5722385048866272,
"learning_rate": 9.743567560952711e-05,
"loss": 1.008,
"step": 2210
},
{
"epoch": 0.967741935483871,
"grad_norm": 1.234014868736267,
"learning_rate": 9.740071214893773e-05,
"loss": 1.0208,
"step": 2220
},
{
"epoch": 0.972101133391456,
"grad_norm": 0.6652539372444153,
"learning_rate": 9.736551830307892e-05,
"loss": 1.0306,
"step": 2230
},
{
"epoch": 0.976460331299041,
"grad_norm": 0.5687094330787659,
"learning_rate": 9.733009424300597e-05,
"loss": 1.0456,
"step": 2240
},
{
"epoch": 0.980819529206626,
"grad_norm": 1.096685528755188,
"learning_rate": 9.729444014089314e-05,
"loss": 1.0357,
"step": 2250
},
{
"epoch": 0.985178727114211,
"grad_norm": 0.6169744729995728,
"learning_rate": 9.725855617003275e-05,
"loss": 1.0044,
"step": 2260
},
{
"epoch": 0.9895379250217959,
"grad_norm": 0.7805534601211548,
"learning_rate": 9.72224425048344e-05,
"loss": 1.0375,
"step": 2270
},
{
"epoch": 0.993897122929381,
"grad_norm": 1.3868515491485596,
"learning_rate": 9.718609932082405e-05,
"loss": 1.0149,
"step": 2280
},
{
"epoch": 0.998256320836966,
"grad_norm": 0.625151515007019,
"learning_rate": 9.714952679464323e-05,
"loss": 0.9914,
"step": 2290
},
{
"epoch": 1.002615518744551,
"grad_norm": 0.7442333102226257,
"learning_rate": 9.711272510404816e-05,
"loss": 1.0047,
"step": 2300
},
{
"epoch": 1.0069747166521361,
"grad_norm": 0.561114490032196,
"learning_rate": 9.70756944279089e-05,
"loss": 1.0157,
"step": 2310
},
{
"epoch": 1.011333914559721,
"grad_norm": 0.6227409839630127,
"learning_rate": 9.70384349462084e-05,
"loss": 1.0276,
"step": 2320
},
{
"epoch": 1.015693112467306,
"grad_norm": 1.2291237115859985,
"learning_rate": 9.700094684004182e-05,
"loss": 0.9942,
"step": 2330
},
{
"epoch": 1.020052310374891,
"grad_norm": 1.3991217613220215,
"learning_rate": 9.696323029161535e-05,
"loss": 1.0272,
"step": 2340
},
{
"epoch": 1.024411508282476,
"grad_norm": 1.0560461282730103,
"learning_rate": 9.692528548424567e-05,
"loss": 1.0041,
"step": 2350
},
{
"epoch": 1.028770706190061,
"grad_norm": 1.0611501932144165,
"learning_rate": 9.688711260235872e-05,
"loss": 0.9916,
"step": 2360
},
{
"epoch": 1.033129904097646,
"grad_norm": 0.9067592620849609,
"learning_rate": 9.684871183148912e-05,
"loss": 1.0055,
"step": 2370
},
{
"epoch": 1.037489102005231,
"grad_norm": 1.063376784324646,
"learning_rate": 9.681008335827898e-05,
"loss": 0.9674,
"step": 2380
},
{
"epoch": 1.041848299912816,
"grad_norm": 2.4049572944641113,
"learning_rate": 9.677122737047724e-05,
"loss": 0.9767,
"step": 2390
},
{
"epoch": 1.046207497820401,
"grad_norm": 1.297662377357483,
"learning_rate": 9.673214405693857e-05,
"loss": 0.919,
"step": 2400
},
{
"epoch": 1.050566695727986,
"grad_norm": 0.7325993180274963,
"learning_rate": 9.669283360762258e-05,
"loss": 0.9641,
"step": 2410
},
{
"epoch": 1.054925893635571,
"grad_norm": 1.0571956634521484,
"learning_rate": 9.66532962135928e-05,
"loss": 0.9962,
"step": 2420
},
{
"epoch": 1.059285091543156,
"grad_norm": 1.3534421920776367,
"learning_rate": 9.661353206701582e-05,
"loss": 0.9791,
"step": 2430
},
{
"epoch": 1.063644289450741,
"grad_norm": 1.4913572072982788,
"learning_rate": 9.657354136116035e-05,
"loss": 0.9379,
"step": 2440
},
{
"epoch": 1.0680034873583262,
"grad_norm": 2.4377195835113525,
"learning_rate": 9.653332429039625e-05,
"loss": 0.9346,
"step": 2450
},
{
"epoch": 1.0723626852659112,
"grad_norm": 1.0011940002441406,
"learning_rate": 9.649288105019356e-05,
"loss": 0.9223,
"step": 2460
},
{
"epoch": 1.0767218831734962,
"grad_norm": 1.409762978553772,
"learning_rate": 9.645221183712165e-05,
"loss": 0.9075,
"step": 2470
},
{
"epoch": 1.0810810810810811,
"grad_norm": 1.3468458652496338,
"learning_rate": 9.641131684884817e-05,
"loss": 0.9658,
"step": 2480
},
{
"epoch": 1.0854402789886661,
"grad_norm": 1.6132701635360718,
"learning_rate": 9.637019628413813e-05,
"loss": 0.9184,
"step": 2490
},
{
"epoch": 1.0897994768962511,
"grad_norm": 2.994033098220825,
"learning_rate": 9.632885034285291e-05,
"loss": 0.934,
"step": 2500
},
{
"epoch": 1.094158674803836,
"grad_norm": 1.964921236038208,
"learning_rate": 9.628727922594931e-05,
"loss": 0.8986,
"step": 2510
},
{
"epoch": 1.098517872711421,
"grad_norm": 1.2736964225769043,
"learning_rate": 9.624548313547862e-05,
"loss": 0.8723,
"step": 2520
},
{
"epoch": 1.102877070619006,
"grad_norm": 1.9553364515304565,
"learning_rate": 9.620346227458547e-05,
"loss": 0.8539,
"step": 2530
},
{
"epoch": 1.107236268526591,
"grad_norm": 1.969037652015686,
"learning_rate": 9.616121684750712e-05,
"loss": 0.8377,
"step": 2540
},
{
"epoch": 1.111595466434176,
"grad_norm": 2.0418758392333984,
"learning_rate": 9.611874705957215e-05,
"loss": 0.8172,
"step": 2550
},
{
"epoch": 1.115954664341761,
"grad_norm": 2.196484327316284,
"learning_rate": 9.607605311719972e-05,
"loss": 0.7764,
"step": 2560
},
{
"epoch": 1.120313862249346,
"grad_norm": 2.5240135192871094,
"learning_rate": 9.603313522789841e-05,
"loss": 0.7373,
"step": 2570
},
{
"epoch": 1.124673060156931,
"grad_norm": 3.0054984092712402,
"learning_rate": 9.598999360026529e-05,
"loss": 0.6584,
"step": 2580
},
{
"epoch": 1.129032258064516,
"grad_norm": 3.442847490310669,
"learning_rate": 9.59466284439849e-05,
"loss": 0.669,
"step": 2590
},
{
"epoch": 1.1333914559721012,
"grad_norm": 2.902653217315674,
"learning_rate": 9.590303996982815e-05,
"loss": 0.6999,
"step": 2600
},
{
"epoch": 1.1377506538796862,
"grad_norm": 2.5193164348602295,
"learning_rate": 9.585922838965145e-05,
"loss": 0.6424,
"step": 2610
},
{
"epoch": 1.1421098517872712,
"grad_norm": 3.2627856731414795,
"learning_rate": 9.581519391639549e-05,
"loss": 0.5839,
"step": 2620
},
{
"epoch": 1.1464690496948562,
"grad_norm": 4.419332504272461,
"learning_rate": 9.577093676408439e-05,
"loss": 0.5886,
"step": 2630
},
{
"epoch": 1.1508282476024412,
"grad_norm": 3.463974952697754,
"learning_rate": 9.572645714782453e-05,
"loss": 0.4981,
"step": 2640
},
{
"epoch": 1.1551874455100262,
"grad_norm": 3.361687183380127,
"learning_rate": 9.568175528380354e-05,
"loss": 0.5007,
"step": 2650
},
{
"epoch": 1.1595466434176112,
"grad_norm": 2.817034959793091,
"learning_rate": 9.56368313892893e-05,
"loss": 0.5828,
"step": 2660
},
{
"epoch": 1.1639058413251961,
"grad_norm": 5.572615623474121,
"learning_rate": 9.55916856826288e-05,
"loss": 0.5255,
"step": 2670
},
{
"epoch": 1.1682650392327811,
"grad_norm": 2.425114154815674,
"learning_rate": 9.554631838324713e-05,
"loss": 0.549,
"step": 2680
},
{
"epoch": 1.1726242371403661,
"grad_norm": 3.455268621444702,
"learning_rate": 9.55007297116464e-05,
"loss": 0.4834,
"step": 2690
},
{
"epoch": 1.176983435047951,
"grad_norm": 2.3823935985565186,
"learning_rate": 9.545491988940472e-05,
"loss": 0.4525,
"step": 2700
},
{
"epoch": 1.181342632955536,
"grad_norm": 3.978440284729004,
"learning_rate": 9.540888913917501e-05,
"loss": 0.4156,
"step": 2710
},
{
"epoch": 1.1857018308631213,
"grad_norm": 3.4890248775482178,
"learning_rate": 9.536263768468401e-05,
"loss": 0.4701,
"step": 2720
},
{
"epoch": 1.1900610287707063,
"grad_norm": 4.7818379402160645,
"learning_rate": 9.531616575073117e-05,
"loss": 0.4442,
"step": 2730
},
{
"epoch": 1.1944202266782913,
"grad_norm": 3.9928464889526367,
"learning_rate": 9.526947356318754e-05,
"loss": 0.447,
"step": 2740
},
{
"epoch": 1.1987794245858763,
"grad_norm": 3.704075336456299,
"learning_rate": 9.52225613489947e-05,
"loss": 0.4156,
"step": 2750
},
{
"epoch": 1.2031386224934613,
"grad_norm": 2.7910592555999756,
"learning_rate": 9.517542933616365e-05,
"loss": 0.3874,
"step": 2760
},
{
"epoch": 1.2074978204010463,
"grad_norm": 6.242345809936523,
"learning_rate": 9.512807775377366e-05,
"loss": 0.3684,
"step": 2770
},
{
"epoch": 1.2118570183086312,
"grad_norm": 4.7204766273498535,
"learning_rate": 9.508050683197121e-05,
"loss": 0.3744,
"step": 2780
},
{
"epoch": 1.2162162162162162,
"grad_norm": 6.195404052734375,
"learning_rate": 9.503271680196888e-05,
"loss": 0.3408,
"step": 2790
},
{
"epoch": 1.2205754141238012,
"grad_norm": 8.634724617004395,
"learning_rate": 9.498470789604413e-05,
"loss": 0.3721,
"step": 2800
},
{
"epoch": 1.2249346120313862,
"grad_norm": 4.196957111358643,
"learning_rate": 9.49364803475383e-05,
"loss": 0.4259,
"step": 2810
},
{
"epoch": 1.2292938099389712,
"grad_norm": 3.526580810546875,
"learning_rate": 9.48880343908554e-05,
"loss": 0.352,
"step": 2820
},
{
"epoch": 1.2336530078465562,
"grad_norm": 4.0493316650390625,
"learning_rate": 9.4839370261461e-05,
"loss": 0.3469,
"step": 2830
},
{
"epoch": 1.2380122057541412,
"grad_norm": 6.060046672821045,
"learning_rate": 9.479048819588098e-05,
"loss": 0.3126,
"step": 2840
},
{
"epoch": 1.2423714036617262,
"grad_norm": 3.5340590476989746,
"learning_rate": 9.474138843170063e-05,
"loss": 0.331,
"step": 2850
},
{
"epoch": 1.2467306015693111,
"grad_norm": 2.743643045425415,
"learning_rate": 9.46920712075632e-05,
"loss": 0.2869,
"step": 2860
},
{
"epoch": 1.2510897994768961,
"grad_norm": 7.408202171325684,
"learning_rate": 9.464253676316893e-05,
"loss": 0.3174,
"step": 2870
},
{
"epoch": 1.2554489973844811,
"grad_norm": 8.131254196166992,
"learning_rate": 9.459278533927384e-05,
"loss": 0.2857,
"step": 2880
},
{
"epoch": 1.2598081952920663,
"grad_norm": 8.597091674804688,
"learning_rate": 9.454281717768854e-05,
"loss": 0.2979,
"step": 2890
},
{
"epoch": 1.2641673931996513,
"grad_norm": 5.740150451660156,
"learning_rate": 9.449263252127708e-05,
"loss": 0.2927,
"step": 2900
},
{
"epoch": 1.2685265911072363,
"grad_norm": 5.028838634490967,
"learning_rate": 9.444223161395573e-05,
"loss": 0.2927,
"step": 2910
},
{
"epoch": 1.2728857890148213,
"grad_norm": 6.9507904052734375,
"learning_rate": 9.439161470069184e-05,
"loss": 0.3225,
"step": 2920
},
{
"epoch": 1.2772449869224063,
"grad_norm": 3.765681266784668,
"learning_rate": 9.43407820275026e-05,
"loss": 0.309,
"step": 2930
},
{
"epoch": 1.2816041848299913,
"grad_norm": 4.51765251159668,
"learning_rate": 9.428973384145396e-05,
"loss": 0.334,
"step": 2940
},
{
"epoch": 1.2859633827375763,
"grad_norm": 6.146585464477539,
"learning_rate": 9.423847039065922e-05,
"loss": 0.2429,
"step": 2950
},
{
"epoch": 1.2903225806451613,
"grad_norm": 4.963317394256592,
"learning_rate": 9.418699192427805e-05,
"loss": 0.269,
"step": 2960
},
{
"epoch": 1.2946817785527462,
"grad_norm": 6.3673996925354,
"learning_rate": 9.41352986925151e-05,
"loss": 0.2482,
"step": 2970
},
{
"epoch": 1.2990409764603312,
"grad_norm": 6.359657287597656,
"learning_rate": 9.408339094661895e-05,
"loss": 0.2639,
"step": 2980
},
{
"epoch": 1.3034001743679164,
"grad_norm": 2.6293511390686035,
"learning_rate": 9.40312689388807e-05,
"loss": 0.2487,
"step": 2990
},
{
"epoch": 1.3077593722755014,
"grad_norm": 4.45819091796875,
"learning_rate": 9.397893292263292e-05,
"loss": 0.2576,
"step": 3000
},
{
"epoch": 1.3121185701830864,
"grad_norm": 7.133167266845703,
"learning_rate": 9.392638315224829e-05,
"loss": 0.2167,
"step": 3010
},
{
"epoch": 1.3164777680906714,
"grad_norm": 7.530590057373047,
"learning_rate": 9.387361988313846e-05,
"loss": 0.2565,
"step": 3020
},
{
"epoch": 1.3208369659982564,
"grad_norm": 5.552865982055664,
"learning_rate": 9.38206433717527e-05,
"loss": 0.2203,
"step": 3030
},
{
"epoch": 1.3251961639058414,
"grad_norm": 3.770548105239868,
"learning_rate": 9.376745387557681e-05,
"loss": 0.2232,
"step": 3040
},
{
"epoch": 1.3295553618134264,
"grad_norm": 5.5245561599731445,
"learning_rate": 9.371405165313169e-05,
"loss": 0.2363,
"step": 3050
},
{
"epoch": 1.3339145597210114,
"grad_norm": 4.505107402801514,
"learning_rate": 9.366043696397222e-05,
"loss": 0.2091,
"step": 3060
},
{
"epoch": 1.3382737576285963,
"grad_norm": 4.21371603012085,
"learning_rate": 9.360661006868592e-05,
"loss": 0.2356,
"step": 3070
},
{
"epoch": 1.3426329555361813,
"grad_norm": 4.076999187469482,
"learning_rate": 9.355257122889173e-05,
"loss": 0.2194,
"step": 3080
},
{
"epoch": 1.3469921534437663,
"grad_norm": 2.4215261936187744,
"learning_rate": 9.349832070723871e-05,
"loss": 0.2031,
"step": 3090
},
{
"epoch": 1.3513513513513513,
"grad_norm": 10.00662612915039,
"learning_rate": 9.34438587674048e-05,
"loss": 0.2013,
"step": 3100
},
{
"epoch": 1.3557105492589363,
"grad_norm": 3.5849783420562744,
"learning_rate": 9.338918567409545e-05,
"loss": 0.1867,
"step": 3110
},
{
"epoch": 1.3600697471665213,
"grad_norm": 3.882270336151123,
"learning_rate": 9.333430169304247e-05,
"loss": 0.1831,
"step": 3120
},
{
"epoch": 1.3644289450741063,
"grad_norm": 1.877372145652771,
"learning_rate": 9.327920709100259e-05,
"loss": 0.1926,
"step": 3130
},
{
"epoch": 1.3687881429816913,
"grad_norm": 3.195490598678589,
"learning_rate": 9.322390213575631e-05,
"loss": 0.2008,
"step": 3140
},
{
"epoch": 1.3731473408892763,
"grad_norm": 3.353106737136841,
"learning_rate": 9.316838709610648e-05,
"loss": 0.2177,
"step": 3150
},
{
"epoch": 1.3775065387968612,
"grad_norm": 4.3637237548828125,
"learning_rate": 9.311266224187706e-05,
"loss": 0.2103,
"step": 3160
},
{
"epoch": 1.3818657367044465,
"grad_norm": 3.7159504890441895,
"learning_rate": 9.305672784391175e-05,
"loss": 0.2003,
"step": 3170
},
{
"epoch": 1.3862249346120314,
"grad_norm": 4.276161193847656,
"learning_rate": 9.300058417407276e-05,
"loss": 0.2439,
"step": 3180
},
{
"epoch": 1.3905841325196164,
"grad_norm": 4.765425682067871,
"learning_rate": 9.29442315052394e-05,
"loss": 0.2352,
"step": 3190
},
{
"epoch": 1.3949433304272014,
"grad_norm": 4.818525791168213,
"learning_rate": 9.288767011130684e-05,
"loss": 0.2355,
"step": 3200
},
{
"epoch": 1.3993025283347864,
"grad_norm": 3.578951597213745,
"learning_rate": 9.283090026718466e-05,
"loss": 0.2045,
"step": 3210
},
{
"epoch": 1.4036617262423714,
"grad_norm": 15.848716735839844,
"learning_rate": 9.277392224879568e-05,
"loss": 0.2175,
"step": 3220
},
{
"epoch": 1.4080209241499564,
"grad_norm": 9.305033683776855,
"learning_rate": 9.271673633307445e-05,
"loss": 0.237,
"step": 3230
},
{
"epoch": 1.4123801220575414,
"grad_norm": 5.726974010467529,
"learning_rate": 9.265934279796602e-05,
"loss": 0.243,
"step": 3240
},
{
"epoch": 1.4167393199651264,
"grad_norm": 5.116995334625244,
"learning_rate": 9.260174192242453e-05,
"loss": 0.2599,
"step": 3250
},
{
"epoch": 1.4210985178727114,
"grad_norm": 2.4488906860351562,
"learning_rate": 9.254393398641185e-05,
"loss": 0.23,
"step": 3260
},
{
"epoch": 1.4254577157802966,
"grad_norm": 5.382657051086426,
"learning_rate": 9.248591927089628e-05,
"loss": 0.2062,
"step": 3270
},
{
"epoch": 1.4298169136878816,
"grad_norm": 5.034473419189453,
"learning_rate": 9.242769805785115e-05,
"loss": 0.2262,
"step": 3280
},
{
"epoch": 1.4341761115954665,
"grad_norm": 3.196340322494507,
"learning_rate": 9.236927063025342e-05,
"loss": 0.179,
"step": 3290
},
{
"epoch": 1.4385353095030515,
"grad_norm": 6.5333452224731445,
"learning_rate": 9.231063727208234e-05,
"loss": 0.2302,
"step": 3300
},
{
"epoch": 1.4428945074106365,
"grad_norm": 2.912020206451416,
"learning_rate": 9.225179826831807e-05,
"loss": 0.1975,
"step": 3310
},
{
"epoch": 1.4472537053182215,
"grad_norm": 3.3908817768096924,
"learning_rate": 9.219275390494024e-05,
"loss": 0.1751,
"step": 3320
},
{
"epoch": 1.4516129032258065,
"grad_norm": 2.084850788116455,
"learning_rate": 9.213350446892668e-05,
"loss": 0.1586,
"step": 3330
},
{
"epoch": 1.4559721011333915,
"grad_norm": 2.0085716247558594,
"learning_rate": 9.207405024825186e-05,
"loss": 0.1521,
"step": 3340
},
{
"epoch": 1.4603312990409765,
"grad_norm": 4.547303676605225,
"learning_rate": 9.201439153188569e-05,
"loss": 0.1683,
"step": 3350
},
{
"epoch": 1.4646904969485615,
"grad_norm": 12.060445785522461,
"learning_rate": 9.19545286097919e-05,
"loss": 0.1724,
"step": 3360
},
{
"epoch": 1.4690496948561464,
"grad_norm": 2.533823013305664,
"learning_rate": 9.189446177292679e-05,
"loss": 0.1988,
"step": 3370
},
{
"epoch": 1.4734088927637314,
"grad_norm": 8.067084312438965,
"learning_rate": 9.183419131323778e-05,
"loss": 0.2145,
"step": 3380
},
{
"epoch": 1.4777680906713164,
"grad_norm": 5.309566974639893,
"learning_rate": 9.177371752366191e-05,
"loss": 0.1976,
"step": 3390
},
{
"epoch": 1.4821272885789014,
"grad_norm": 5.301851272583008,
"learning_rate": 9.171304069812454e-05,
"loss": 0.1951,
"step": 3400
},
{
"epoch": 1.4864864864864864,
"grad_norm": 3.730698347091675,
"learning_rate": 9.165216113153782e-05,
"loss": 0.2042,
"step": 3410
},
{
"epoch": 1.4908456843940714,
"grad_norm": 6.294808864593506,
"learning_rate": 9.159107911979936e-05,
"loss": 0.2248,
"step": 3420
},
{
"epoch": 1.4952048823016564,
"grad_norm": 4.979791164398193,
"learning_rate": 9.152979495979063e-05,
"loss": 0.1873,
"step": 3430
},
{
"epoch": 1.4995640802092414,
"grad_norm": 3.240687370300293,
"learning_rate": 9.146830894937571e-05,
"loss": 0.1832,
"step": 3440
},
{
"epoch": 1.5039232781168264,
"grad_norm": 1.4726358652114868,
"learning_rate": 9.140662138739969e-05,
"loss": 0.1798,
"step": 3450
},
{
"epoch": 1.5082824760244113,
"grad_norm": 3.543588161468506,
"learning_rate": 9.134473257368732e-05,
"loss": 0.1774,
"step": 3460
},
{
"epoch": 1.5126416739319966,
"grad_norm": 7.755619525909424,
"learning_rate": 9.128264280904145e-05,
"loss": 0.1684,
"step": 3470
},
{
"epoch": 1.5170008718395815,
"grad_norm": 2.366468667984009,
"learning_rate": 9.122035239524169e-05,
"loss": 0.1566,
"step": 3480
},
{
"epoch": 1.5213600697471665,
"grad_norm": 2.500969171524048,
"learning_rate": 9.115786163504285e-05,
"loss": 0.1442,
"step": 3490
},
{
"epoch": 1.5257192676547515,
"grad_norm": 3.558485984802246,
"learning_rate": 9.10951708321735e-05,
"loss": 0.1739,
"step": 3500
},
{
"epoch": 1.5300784655623365,
"grad_norm": 5.444303512573242,
"learning_rate": 9.10322802913345e-05,
"loss": 0.2408,
"step": 3510
},
{
"epoch": 1.5344376634699215,
"grad_norm": 5.714164733886719,
"learning_rate": 9.096919031819751e-05,
"loss": 0.1765,
"step": 3520
},
{
"epoch": 1.5387968613775065,
"grad_norm": 4.49392557144165,
"learning_rate": 9.090590121940348e-05,
"loss": 0.2224,
"step": 3530
},
{
"epoch": 1.5431560592850917,
"grad_norm": 3.7500405311584473,
"learning_rate": 9.084241330256121e-05,
"loss": 0.1803,
"step": 3540
},
{
"epoch": 1.5475152571926767,
"grad_norm": 5.087535858154297,
"learning_rate": 9.077872687624586e-05,
"loss": 0.209,
"step": 3550
},
{
"epoch": 1.5518744551002617,
"grad_norm": 2.011319875717163,
"learning_rate": 9.071484224999735e-05,
"loss": 0.1479,
"step": 3560
},
{
"epoch": 1.5562336530078467,
"grad_norm": 2.3110170364379883,
"learning_rate": 9.0650759734319e-05,
"loss": 0.1551,
"step": 3570
},
{
"epoch": 1.5605928509154317,
"grad_norm": 7.891526222229004,
"learning_rate": 9.05864796406759e-05,
"loss": 0.1787,
"step": 3580
},
{
"epoch": 1.5649520488230166,
"grad_norm": 1.9730972051620483,
"learning_rate": 9.052200228149343e-05,
"loss": 0.1774,
"step": 3590
},
{
"epoch": 1.5693112467306016,
"grad_norm": 2.8834328651428223,
"learning_rate": 9.04573279701558e-05,
"loss": 0.1431,
"step": 3600
},
{
"epoch": 1.5736704446381866,
"grad_norm": 1.477083444595337,
"learning_rate": 9.039245702100448e-05,
"loss": 0.1366,
"step": 3610
},
{
"epoch": 1.5780296425457716,
"grad_norm": 3.89342999458313,
"learning_rate": 9.032738974933664e-05,
"loss": 0.1327,
"step": 3620
},
{
"epoch": 1.5823888404533566,
"grad_norm": 7.806596279144287,
"learning_rate": 9.026212647140365e-05,
"loss": 0.1546,
"step": 3630
},
{
"epoch": 1.5867480383609416,
"grad_norm": 9.280295372009277,
"learning_rate": 9.019666750440956e-05,
"loss": 0.1438,
"step": 3640
},
{
"epoch": 1.5911072362685266,
"grad_norm": 4.350841999053955,
"learning_rate": 9.013101316650956e-05,
"loss": 0.1765,
"step": 3650
},
{
"epoch": 1.5954664341761116,
"grad_norm": 1.526946783065796,
"learning_rate": 9.00651637768084e-05,
"loss": 0.1389,
"step": 3660
},
{
"epoch": 1.5998256320836965,
"grad_norm": 1.9134260416030884,
"learning_rate": 8.999911965535885e-05,
"loss": 0.1341,
"step": 3670
},
{
"epoch": 1.6041848299912815,
"grad_norm": 7.601817607879639,
"learning_rate": 8.993288112316014e-05,
"loss": 0.1553,
"step": 3680
},
{
"epoch": 1.6085440278988665,
"grad_norm": 1.6179394721984863,
"learning_rate": 8.986644850215644e-05,
"loss": 0.1662,
"step": 3690
},
{
"epoch": 1.6129032258064515,
"grad_norm": 3.03326678276062,
"learning_rate": 8.979982211523523e-05,
"loss": 0.1536,
"step": 3700
},
{
"epoch": 1.6172624237140365,
"grad_norm": 1.7008064985275269,
"learning_rate": 8.97330022862258e-05,
"loss": 0.1675,
"step": 3710
},
{
"epoch": 1.6216216216216215,
"grad_norm": 5.134827136993408,
"learning_rate": 8.96659893398976e-05,
"loss": 0.1658,
"step": 3720
},
{
"epoch": 1.6259808195292065,
"grad_norm": 3.6326844692230225,
"learning_rate": 8.959878360195876e-05,
"loss": 0.1819,
"step": 3730
},
{
"epoch": 1.6303400174367915,
"grad_norm": 2.703415632247925,
"learning_rate": 8.953138539905438e-05,
"loss": 0.1482,
"step": 3740
},
{
"epoch": 1.6346992153443767,
"grad_norm": 3.979405164718628,
"learning_rate": 8.946379505876506e-05,
"loss": 0.1582,
"step": 3750
},
{
"epoch": 1.6390584132519617,
"grad_norm": 3.959192991256714,
"learning_rate": 8.939601290960527e-05,
"loss": 0.1273,
"step": 3760
},
{
"epoch": 1.6434176111595467,
"grad_norm": 3.0278549194335938,
"learning_rate": 8.932803928102167e-05,
"loss": 0.1596,
"step": 3770
},
{
"epoch": 1.6477768090671316,
"grad_norm": 2.5706779956817627,
"learning_rate": 8.925987450339168e-05,
"loss": 0.1411,
"step": 3780
},
{
"epoch": 1.6521360069747166,
"grad_norm": 3.269700765609741,
"learning_rate": 8.919151890802172e-05,
"loss": 0.1273,
"step": 3790
},
{
"epoch": 1.6564952048823016,
"grad_norm": 2.8354709148406982,
"learning_rate": 8.912297282714564e-05,
"loss": 0.1608,
"step": 3800
},
{
"epoch": 1.6608544027898866,
"grad_norm": 4.5689215660095215,
"learning_rate": 8.905423659392316e-05,
"loss": 0.1328,
"step": 3810
},
{
"epoch": 1.6652136006974718,
"grad_norm": 0.7547925710678101,
"learning_rate": 8.898531054243822e-05,
"loss": 0.1371,
"step": 3820
},
{
"epoch": 1.6695727986050568,
"grad_norm": 4.1064534187316895,
"learning_rate": 8.891619500769729e-05,
"loss": 0.1616,
"step": 3830
},
{
"epoch": 1.6739319965126418,
"grad_norm": 2.2212579250335693,
"learning_rate": 8.884689032562785e-05,
"loss": 0.1283,
"step": 3840
},
{
"epoch": 1.6782911944202268,
"grad_norm": 0.8942421674728394,
"learning_rate": 8.87773968330767e-05,
"loss": 0.1357,
"step": 3850
},
{
"epoch": 1.6826503923278118,
"grad_norm": 4.484212875366211,
"learning_rate": 8.870771486780832e-05,
"loss": 0.14,
"step": 3860
},
{
"epoch": 1.6870095902353968,
"grad_norm": 4.993788719177246,
"learning_rate": 8.863784476850322e-05,
"loss": 0.1519,
"step": 3870
},
{
"epoch": 1.6913687881429817,
"grad_norm": 4.12445592880249,
"learning_rate": 8.856778687475635e-05,
"loss": 0.129,
"step": 3880
},
{
"epoch": 1.6957279860505667,
"grad_norm": 1.0877776145935059,
"learning_rate": 8.849754152707541e-05,
"loss": 0.1312,
"step": 3890
},
{
"epoch": 1.7000871839581517,
"grad_norm": 1.0505248308181763,
"learning_rate": 8.842710906687916e-05,
"loss": 0.1313,
"step": 3900
},
{
"epoch": 1.7044463818657367,
"grad_norm": 1.6745731830596924,
"learning_rate": 8.83564898364958e-05,
"loss": 0.1265,
"step": 3910
},
{
"epoch": 1.7088055797733217,
"grad_norm": 1.7037270069122314,
"learning_rate": 8.828568417916136e-05,
"loss": 0.1428,
"step": 3920
},
{
"epoch": 1.7131647776809067,
"grad_norm": 1.1031413078308105,
"learning_rate": 8.821469243901794e-05,
"loss": 0.1301,
"step": 3930
},
{
"epoch": 1.7175239755884917,
"grad_norm": 5.505949974060059,
"learning_rate": 8.814351496111201e-05,
"loss": 0.1263,
"step": 3940
},
{
"epoch": 1.7218831734960767,
"grad_norm": 11.303099632263184,
"learning_rate": 8.807215209139293e-05,
"loss": 0.1418,
"step": 3950
},
{
"epoch": 1.7262423714036617,
"grad_norm": 1.550581932067871,
"learning_rate": 8.8000604176711e-05,
"loss": 0.1367,
"step": 3960
},
{
"epoch": 1.7306015693112466,
"grad_norm": 1.0917490720748901,
"learning_rate": 8.792887156481598e-05,
"loss": 0.134,
"step": 3970
},
{
"epoch": 1.7349607672188316,
"grad_norm": 4.534971237182617,
"learning_rate": 8.785695460435534e-05,
"loss": 0.1088,
"step": 3980
},
{
"epoch": 1.7393199651264166,
"grad_norm": 3.131039619445801,
"learning_rate": 8.778485364487248e-05,
"loss": 0.1267,
"step": 3990
},
{
"epoch": 1.7436791630340016,
"grad_norm": 2.5974009037017822,
"learning_rate": 8.771256903680519e-05,
"loss": 0.1223,
"step": 4000
},
{
"epoch": 1.7480383609415866,
"grad_norm": 2.29083251953125,
"learning_rate": 8.764010113148382e-05,
"loss": 0.175,
"step": 4010
},
{
"epoch": 1.7523975588491716,
"grad_norm": 5.5496344566345215,
"learning_rate": 8.756745028112959e-05,
"loss": 0.1417,
"step": 4020
},
{
"epoch": 1.7567567567567568,
"grad_norm": 1.8061878681182861,
"learning_rate": 8.749461683885296e-05,
"loss": 0.1137,
"step": 4030
},
{
"epoch": 1.7611159546643418,
"grad_norm": 2.0431365966796875,
"learning_rate": 8.742160115865179e-05,
"loss": 0.1357,
"step": 4040
},
{
"epoch": 1.7654751525719268,
"grad_norm": 17.856874465942383,
"learning_rate": 8.734840359540974e-05,
"loss": 0.1235,
"step": 4050
},
{
"epoch": 1.7698343504795118,
"grad_norm": 1.4601013660430908,
"learning_rate": 8.727502450489446e-05,
"loss": 0.1515,
"step": 4060
},
{
"epoch": 1.7741935483870968,
"grad_norm": 1.304296612739563,
"learning_rate": 8.720146424375591e-05,
"loss": 0.1416,
"step": 4070
},
{
"epoch": 1.7785527462946817,
"grad_norm": 3.461920738220215,
"learning_rate": 8.712772316952458e-05,
"loss": 0.1428,
"step": 4080
},
{
"epoch": 1.7829119442022667,
"grad_norm": 1.8705286979675293,
"learning_rate": 8.705380164060982e-05,
"loss": 0.1494,
"step": 4090
},
{
"epoch": 1.787271142109852,
"grad_norm": 1.7368310689926147,
"learning_rate": 8.697970001629799e-05,
"loss": 0.1351,
"step": 4100
},
{
"epoch": 1.791630340017437,
"grad_norm": 10.622164726257324,
"learning_rate": 8.690541865675084e-05,
"loss": 0.1249,
"step": 4110
},
{
"epoch": 1.795989537925022,
"grad_norm": 1.60009765625,
"learning_rate": 8.68309579230037e-05,
"loss": 0.1539,
"step": 4120
},
{
"epoch": 1.800348735832607,
"grad_norm": 5.96570348739624,
"learning_rate": 8.675631817696372e-05,
"loss": 0.1325,
"step": 4130
},
{
"epoch": 1.804707933740192,
"grad_norm": 1.100277066230774,
"learning_rate": 8.668149978140808e-05,
"loss": 0.1531,
"step": 4140
},
{
"epoch": 1.8090671316477769,
"grad_norm": 3.805157423019409,
"learning_rate": 8.66065030999823e-05,
"loss": 0.1369,
"step": 4150
},
{
"epoch": 1.8134263295553619,
"grad_norm": 1.2350760698318481,
"learning_rate": 8.653132849719845e-05,
"loss": 0.1633,
"step": 4160
},
{
"epoch": 1.8177855274629469,
"grad_norm": 1.554290771484375,
"learning_rate": 8.64559763384333e-05,
"loss": 0.1456,
"step": 4170
},
{
"epoch": 1.8221447253705318,
"grad_norm": 4.020148754119873,
"learning_rate": 8.638044698992669e-05,
"loss": 0.1351,
"step": 4180
},
{
"epoch": 1.8265039232781168,
"grad_norm": 2.593574047088623,
"learning_rate": 8.630474081877959e-05,
"loss": 0.123,
"step": 4190
},
{
"epoch": 1.8308631211857018,
"grad_norm": 2.704629421234131,
"learning_rate": 8.62288581929525e-05,
"loss": 0.1513,
"step": 4200
},
{
"epoch": 1.8352223190932868,
"grad_norm": 8.686463356018066,
"learning_rate": 8.615279948126343e-05,
"loss": 0.1441,
"step": 4210
},
{
"epoch": 1.8395815170008718,
"grad_norm": 0.7825227975845337,
"learning_rate": 8.60765650533863e-05,
"loss": 0.1232,
"step": 4220
},
{
"epoch": 1.8439407149084568,
"grad_norm": 4.397798538208008,
"learning_rate": 8.60001552798491e-05,
"loss": 0.1438,
"step": 4230
},
{
"epoch": 1.8482999128160418,
"grad_norm": 1.311542272567749,
"learning_rate": 8.592357053203202e-05,
"loss": 0.1373,
"step": 4240
},
{
"epoch": 1.8526591107236268,
"grad_norm": 5.1354498863220215,
"learning_rate": 8.58468111821657e-05,
"loss": 0.1423,
"step": 4250
},
{
"epoch": 1.8570183086312118,
"grad_norm": 1.4782944917678833,
"learning_rate": 8.576987760332943e-05,
"loss": 0.1389,
"step": 4260
},
{
"epoch": 1.8613775065387967,
"grad_norm": 6.529745578765869,
"learning_rate": 8.56927701694493e-05,
"loss": 0.1546,
"step": 4270
},
{
"epoch": 1.8657367044463817,
"grad_norm": 1.609787940979004,
"learning_rate": 8.561548925529643e-05,
"loss": 0.137,
"step": 4280
},
{
"epoch": 1.8700959023539667,
"grad_norm": 0.907353401184082,
"learning_rate": 8.553803523648506e-05,
"loss": 0.1189,
"step": 4290
},
{
"epoch": 1.8744551002615517,
"grad_norm": 1.036514163017273,
"learning_rate": 8.546040848947086e-05,
"loss": 0.1202,
"step": 4300
},
{
"epoch": 1.878814298169137,
"grad_norm": 1.3986374139785767,
"learning_rate": 8.538260939154894e-05,
"loss": 0.1271,
"step": 4310
},
{
"epoch": 1.883173496076722,
"grad_norm": 1.7852044105529785,
"learning_rate": 8.530463832085218e-05,
"loss": 0.1398,
"step": 4320
},
{
"epoch": 1.887532693984307,
"grad_norm": 0.8431325554847717,
"learning_rate": 8.522649565634927e-05,
"loss": 0.1065,
"step": 4330
},
{
"epoch": 1.8918918918918919,
"grad_norm": 2.7174603939056396,
"learning_rate": 8.51481817778429e-05,
"loss": 0.1086,
"step": 4340
},
{
"epoch": 1.8962510897994769,
"grad_norm": 4.746669769287109,
"learning_rate": 8.506969706596797e-05,
"loss": 0.1113,
"step": 4350
},
{
"epoch": 1.9006102877070619,
"grad_norm": 4.1237030029296875,
"learning_rate": 8.499104190218964e-05,
"loss": 0.1104,
"step": 4360
},
{
"epoch": 1.9049694856146469,
"grad_norm": 1.4972171783447266,
"learning_rate": 8.49122166688016e-05,
"loss": 0.1305,
"step": 4370
},
{
"epoch": 1.909328683522232,
"grad_norm": 2.446460247039795,
"learning_rate": 8.483322174892404e-05,
"loss": 0.1177,
"step": 4380
},
{
"epoch": 1.913687881429817,
"grad_norm": 7.465681552886963,
"learning_rate": 8.475405752650199e-05,
"loss": 0.1059,
"step": 4390
},
{
"epoch": 1.918047079337402,
"grad_norm": 1.2010679244995117,
"learning_rate": 8.467472438630328e-05,
"loss": 0.1323,
"step": 4400
},
{
"epoch": 1.922406277244987,
"grad_norm": 3.103692054748535,
"learning_rate": 8.459522271391682e-05,
"loss": 0.1379,
"step": 4410
},
{
"epoch": 1.926765475152572,
"grad_norm": 5.2912211418151855,
"learning_rate": 8.451555289575057e-05,
"loss": 0.1461,
"step": 4420
},
{
"epoch": 1.931124673060157,
"grad_norm": 4.1434407234191895,
"learning_rate": 8.443571531902981e-05,
"loss": 0.1373,
"step": 4430
},
{
"epoch": 1.935483870967742,
"grad_norm": 4.254314422607422,
"learning_rate": 8.435571037179512e-05,
"loss": 0.1266,
"step": 4440
},
{
"epoch": 1.939843068875327,
"grad_norm": 5.463832378387451,
"learning_rate": 8.427553844290062e-05,
"loss": 0.1534,
"step": 4450
},
{
"epoch": 1.944202266782912,
"grad_norm": 6.808103084564209,
"learning_rate": 8.419519992201201e-05,
"loss": 0.1324,
"step": 4460
},
{
"epoch": 1.948561464690497,
"grad_norm": 2.2606234550476074,
"learning_rate": 8.411469519960469e-05,
"loss": 0.1246,
"step": 4470
},
{
"epoch": 1.952920662598082,
"grad_norm": 7.445507526397705,
"learning_rate": 8.403402466696182e-05,
"loss": 0.125,
"step": 4480
},
{
"epoch": 1.957279860505667,
"grad_norm": 1.0063568353652954,
"learning_rate": 8.395318871617255e-05,
"loss": 0.1373,
"step": 4490
},
{
"epoch": 1.961639058413252,
"grad_norm": 1.5997445583343506,
"learning_rate": 8.387218774012992e-05,
"loss": 0.1191,
"step": 4500
},
{
"epoch": 1.965998256320837,
"grad_norm": 1.0268845558166504,
"learning_rate": 8.379102213252915e-05,
"loss": 0.1221,
"step": 4510
},
{
"epoch": 1.970357454228422,
"grad_norm": 1.748948574066162,
"learning_rate": 8.370969228786556e-05,
"loss": 0.1186,
"step": 4520
},
{
"epoch": 1.974716652136007,
"grad_norm": 6.44364595413208,
"learning_rate": 8.362819860143275e-05,
"loss": 0.1157,
"step": 4530
},
{
"epoch": 1.9790758500435919,
"grad_norm": 3.785693883895874,
"learning_rate": 8.354654146932066e-05,
"loss": 0.1319,
"step": 4540
},
{
"epoch": 1.9834350479511769,
"grad_norm": 2.268069267272949,
"learning_rate": 8.346472128841364e-05,
"loss": 0.1048,
"step": 4550
},
{
"epoch": 1.9877942458587619,
"grad_norm": 1.350669264793396,
"learning_rate": 8.338273845638848e-05,
"loss": 0.0982,
"step": 4560
},
{
"epoch": 1.9921534437663468,
"grad_norm": 1.2077120542526245,
"learning_rate": 8.330059337171258e-05,
"loss": 0.116,
"step": 4570
},
{
"epoch": 1.9965126416739318,
"grad_norm": 0.8750612735748291,
"learning_rate": 8.32182864336419e-05,
"loss": 0.1206,
"step": 4580
},
{
"epoch": 2.000871839581517,
"grad_norm": 2.0030882358551025,
"learning_rate": 8.313581804221908e-05,
"loss": 0.1223,
"step": 4590
},
{
"epoch": 2.005231037489102,
"grad_norm": 2.0037591457366943,
"learning_rate": 8.305318859827147e-05,
"loss": 0.1079,
"step": 4600
},
{
"epoch": 2.009590235396687,
"grad_norm": 1.143376111984253,
"learning_rate": 8.297039850340923e-05,
"loss": 0.125,
"step": 4610
},
{
"epoch": 2.0139494333042722,
"grad_norm": 1.0574803352355957,
"learning_rate": 8.288744816002331e-05,
"loss": 0.0963,
"step": 4620
},
{
"epoch": 2.018308631211857,
"grad_norm": 0.8436086177825928,
"learning_rate": 8.280433797128357e-05,
"loss": 0.1012,
"step": 4630
},
{
"epoch": 2.022667829119442,
"grad_norm": 7.240283012390137,
"learning_rate": 8.272106834113674e-05,
"loss": 0.1186,
"step": 4640
},
{
"epoch": 2.027027027027027,
"grad_norm": 0.9012176394462585,
"learning_rate": 8.26376396743045e-05,
"loss": 0.1106,
"step": 4650
},
{
"epoch": 2.031386224934612,
"grad_norm": 1.180629849433899,
"learning_rate": 8.25540523762815e-05,
"loss": 0.1064,
"step": 4660
},
{
"epoch": 2.035745422842197,
"grad_norm": 1.1704062223434448,
"learning_rate": 8.247030685333346e-05,
"loss": 0.1247,
"step": 4670
},
{
"epoch": 2.040104620749782,
"grad_norm": 2.9050512313842773,
"learning_rate": 8.238640351249503e-05,
"loss": 0.1024,
"step": 4680
},
{
"epoch": 2.044463818657367,
"grad_norm": 1.1793980598449707,
"learning_rate": 8.2302342761568e-05,
"loss": 0.1375,
"step": 4690
},
{
"epoch": 2.048823016564952,
"grad_norm": 0.8722379803657532,
"learning_rate": 8.221812500911919e-05,
"loss": 0.0941,
"step": 4700
},
{
"epoch": 2.053182214472537,
"grad_norm": 2.5320537090301514,
"learning_rate": 8.213375066447853e-05,
"loss": 0.1049,
"step": 4710
},
{
"epoch": 2.057541412380122,
"grad_norm": 1.0855395793914795,
"learning_rate": 8.204922013773702e-05,
"loss": 0.1126,
"step": 4720
},
{
"epoch": 2.061900610287707,
"grad_norm": 7.323856830596924,
"learning_rate": 8.196453383974478e-05,
"loss": 0.1281,
"step": 4730
},
{
"epoch": 2.066259808195292,
"grad_norm": 5.2091474533081055,
"learning_rate": 8.187969218210904e-05,
"loss": 0.1096,
"step": 4740
},
{
"epoch": 2.070619006102877,
"grad_norm": 1.0623677968978882,
"learning_rate": 8.179469557719213e-05,
"loss": 0.0995,
"step": 4750
},
{
"epoch": 2.074978204010462,
"grad_norm": 4.782459259033203,
"learning_rate": 8.170954443810948e-05,
"loss": 0.1205,
"step": 4760
},
{
"epoch": 2.079337401918047,
"grad_norm": 0.9393866658210754,
"learning_rate": 8.162423917872764e-05,
"loss": 0.1196,
"step": 4770
},
{
"epoch": 2.083696599825632,
"grad_norm": 6.70692253112793,
"learning_rate": 8.153878021366217e-05,
"loss": 0.1242,
"step": 4780
},
{
"epoch": 2.088055797733217,
"grad_norm": 9.28081226348877,
"learning_rate": 8.14531679582758e-05,
"loss": 0.1081,
"step": 4790
},
{
"epoch": 2.092414995640802,
"grad_norm": 0.7705070972442627,
"learning_rate": 8.136740282867621e-05,
"loss": 0.1239,
"step": 4800
},
{
"epoch": 2.096774193548387,
"grad_norm": 11.719225883483887,
"learning_rate": 8.128148524171418e-05,
"loss": 0.1137,
"step": 4810
},
{
"epoch": 2.101133391455972,
"grad_norm": 0.9089305996894836,
"learning_rate": 8.119541561498146e-05,
"loss": 0.1466,
"step": 4820
},
{
"epoch": 2.105492589363557,
"grad_norm": 0.7215758562088013,
"learning_rate": 8.110919436680877e-05,
"loss": 0.0967,
"step": 4830
},
{
"epoch": 2.109851787271142,
"grad_norm": 1.2035927772521973,
"learning_rate": 8.102282191626378e-05,
"loss": 0.1128,
"step": 4840
},
{
"epoch": 2.114210985178727,
"grad_norm": 2.1401278972625732,
"learning_rate": 8.0936298683149e-05,
"loss": 0.097,
"step": 4850
},
{
"epoch": 2.118570183086312,
"grad_norm": 0.7536699771881104,
"learning_rate": 8.084962508799991e-05,
"loss": 0.1036,
"step": 4860
},
{
"epoch": 2.122929380993897,
"grad_norm": 0.9746949672698975,
"learning_rate": 8.076280155208273e-05,
"loss": 0.1012,
"step": 4870
},
{
"epoch": 2.127288578901482,
"grad_norm": 2.9108328819274902,
"learning_rate": 8.067582849739245e-05,
"loss": 0.0931,
"step": 4880
},
{
"epoch": 2.131647776809067,
"grad_norm": 2.736168622970581,
"learning_rate": 8.058870634665079e-05,
"loss": 0.136,
"step": 4890
},
{
"epoch": 2.1360069747166524,
"grad_norm": 9.78114128112793,
"learning_rate": 8.050143552330414e-05,
"loss": 0.0862,
"step": 4900
},
{
"epoch": 2.1403661726242373,
"grad_norm": 0.6983315944671631,
"learning_rate": 8.041401645152151e-05,
"loss": 0.0964,
"step": 4910
},
{
"epoch": 2.1447253705318223,
"grad_norm": 1.677988886833191,
"learning_rate": 8.032644955619239e-05,
"loss": 0.1192,
"step": 4920
},
{
"epoch": 2.1490845684394073,
"grad_norm": 5.114157199859619,
"learning_rate": 8.023873526292483e-05,
"loss": 0.1027,
"step": 4930
},
{
"epoch": 2.1534437663469923,
"grad_norm": 1.319881796836853,
"learning_rate": 8.015087399804322e-05,
"loss": 0.082,
"step": 4940
},
{
"epoch": 2.1578029642545773,
"grad_norm": 1.808841347694397,
"learning_rate": 8.006286618858635e-05,
"loss": 0.1118,
"step": 4950
},
{
"epoch": 2.1621621621621623,
"grad_norm": 3.633739709854126,
"learning_rate": 7.99747122623052e-05,
"loss": 0.1027,
"step": 4960
},
{
"epoch": 2.1665213600697473,
"grad_norm": 0.9861094951629639,
"learning_rate": 7.988641264766097e-05,
"loss": 0.1185,
"step": 4970
},
{
"epoch": 2.1708805579773323,
"grad_norm": 1.4471122026443481,
"learning_rate": 7.9797967773823e-05,
"loss": 0.0963,
"step": 4980
},
{
"epoch": 2.1752397558849172,
"grad_norm": 1.0878154039382935,
"learning_rate": 7.970937807066659e-05,
"loss": 0.1053,
"step": 4990
},
{
"epoch": 2.1795989537925022,
"grad_norm": 1.0093618631362915,
"learning_rate": 7.962064396877098e-05,
"loss": 0.1105,
"step": 5000
},
{
"epoch": 2.1839581517000872,
"grad_norm": 0.8233932852745056,
"learning_rate": 7.953176589941722e-05,
"loss": 0.104,
"step": 5010
},
{
"epoch": 2.188317349607672,
"grad_norm": 0.7308264970779419,
"learning_rate": 7.944274429458614e-05,
"loss": 0.1007,
"step": 5020
},
{
"epoch": 2.192676547515257,
"grad_norm": 0.8960587382316589,
"learning_rate": 7.93535795869562e-05,
"loss": 0.1017,
"step": 5030
},
{
"epoch": 2.197035745422842,
"grad_norm": 1.306472659111023,
"learning_rate": 7.926427220990134e-05,
"loss": 0.0971,
"step": 5040
},
{
"epoch": 2.201394943330427,
"grad_norm": 1.0485479831695557,
"learning_rate": 7.9174822597489e-05,
"loss": 0.1033,
"step": 5050
},
{
"epoch": 2.205754141238012,
"grad_norm": 4.241369724273682,
"learning_rate": 7.908523118447789e-05,
"loss": 0.0802,
"step": 5060
},
{
"epoch": 2.210113339145597,
"grad_norm": 1.2369358539581299,
"learning_rate": 7.89954984063159e-05,
"loss": 0.1125,
"step": 5070
},
{
"epoch": 2.214472537053182,
"grad_norm": 0.5996437072753906,
"learning_rate": 7.890562469913811e-05,
"loss": 0.1047,
"step": 5080
},
{
"epoch": 2.218831734960767,
"grad_norm": 0.778715968132019,
"learning_rate": 7.881561049976447e-05,
"loss": 0.0911,
"step": 5090
},
{
"epoch": 2.223190932868352,
"grad_norm": 18.25193977355957,
"learning_rate": 7.872545624569779e-05,
"loss": 0.1072,
"step": 5100
},
{
"epoch": 2.227550130775937,
"grad_norm": 0.5723896622657776,
"learning_rate": 7.863516237512164e-05,
"loss": 0.1119,
"step": 5110
},
{
"epoch": 2.231909328683522,
"grad_norm": 1.7326332330703735,
"learning_rate": 7.854472932689815e-05,
"loss": 0.1062,
"step": 5120
},
{
"epoch": 2.236268526591107,
"grad_norm": 7.85261869430542,
"learning_rate": 7.845415754056591e-05,
"loss": 0.1157,
"step": 5130
},
{
"epoch": 2.240627724498692,
"grad_norm": 0.9551669359207153,
"learning_rate": 7.836344745633783e-05,
"loss": 0.0999,
"step": 5140
},
{
"epoch": 2.244986922406277,
"grad_norm": 0.7430893778800964,
"learning_rate": 7.8272599515099e-05,
"loss": 0.096,
"step": 5150
},
{
"epoch": 2.249346120313862,
"grad_norm": 6.560830116271973,
"learning_rate": 7.818161415840453e-05,
"loss": 0.1066,
"step": 5160
},
{
"epoch": 2.2537053182214475,
"grad_norm": 5.854716777801514,
"learning_rate": 7.809049182847745e-05,
"loss": 0.0974,
"step": 5170
},
{
"epoch": 2.258064516129032,
"grad_norm": 4.683152675628662,
"learning_rate": 7.799923296820653e-05,
"loss": 0.0949,
"step": 5180
},
{
"epoch": 2.2624237140366175,
"grad_norm": 0.6916239261627197,
"learning_rate": 7.790783802114408e-05,
"loss": 0.1007,
"step": 5190
},
{
"epoch": 2.2667829119442024,
"grad_norm": 0.8912094235420227,
"learning_rate": 7.781630743150392e-05,
"loss": 0.0906,
"step": 5200
},
{
"epoch": 2.2711421098517874,
"grad_norm": 4.473104953765869,
"learning_rate": 7.772464164415907e-05,
"loss": 0.0984,
"step": 5210
},
{
"epoch": 2.2755013077593724,
"grad_norm": 0.9665694832801819,
"learning_rate": 7.763284110463973e-05,
"loss": 0.1049,
"step": 5220
},
{
"epoch": 2.2798605056669574,
"grad_norm": 9.33495807647705,
"learning_rate": 7.754090625913099e-05,
"loss": 0.0989,
"step": 5230
},
{
"epoch": 2.2842197035745424,
"grad_norm": 7.102493762969971,
"learning_rate": 7.744883755447075e-05,
"loss": 0.0991,
"step": 5240
},
{
"epoch": 2.2885789014821274,
"grad_norm": 3.3411788940429688,
"learning_rate": 7.735663543814749e-05,
"loss": 0.1208,
"step": 5250
},
{
"epoch": 2.2929380993897124,
"grad_norm": 0.6373879909515381,
"learning_rate": 7.726430035829813e-05,
"loss": 0.0938,
"step": 5260
},
{
"epoch": 2.2972972972972974,
"grad_norm": 2.2355239391326904,
"learning_rate": 7.717183276370586e-05,
"loss": 0.0954,
"step": 5270
},
{
"epoch": 2.3016564952048824,
"grad_norm": 0.8228651285171509,
"learning_rate": 7.707923310379794e-05,
"loss": 0.0911,
"step": 5280
},
{
"epoch": 2.3060156931124673,
"grad_norm": 1.4001113176345825,
"learning_rate": 7.698650182864351e-05,
"loss": 0.101,
"step": 5290
},
{
"epoch": 2.3103748910200523,
"grad_norm": 8.409546852111816,
"learning_rate": 7.689363938895138e-05,
"loss": 0.1008,
"step": 5300
},
{
"epoch": 2.3147340889276373,
"grad_norm": 3.2548961639404297,
"learning_rate": 7.680064623606791e-05,
"loss": 0.0809,
"step": 5310
},
{
"epoch": 2.3190932868352223,
"grad_norm": 0.7357600331306458,
"learning_rate": 7.670752282197476e-05,
"loss": 0.0834,
"step": 5320
},
{
"epoch": 2.3234524847428073,
"grad_norm": 0.5668644905090332,
"learning_rate": 7.66142695992867e-05,
"loss": 0.0998,
"step": 5330
},
{
"epoch": 2.3278116826503923,
"grad_norm": 0.7869893908500671,
"learning_rate": 7.652088702124944e-05,
"loss": 0.0934,
"step": 5340
},
{
"epoch": 2.3321708805579773,
"grad_norm": 1.659505009651184,
"learning_rate": 7.64273755417374e-05,
"loss": 0.0888,
"step": 5350
},
{
"epoch": 2.3365300784655623,
"grad_norm": 0.6730926036834717,
"learning_rate": 7.633373561525148e-05,
"loss": 0.0892,
"step": 5360
},
{
"epoch": 2.3408892763731473,
"grad_norm": 5.145294666290283,
"learning_rate": 7.623996769691691e-05,
"loss": 0.103,
"step": 5370
},
{
"epoch": 2.3452484742807322,
"grad_norm": 7.745975017547607,
"learning_rate": 7.614607224248103e-05,
"loss": 0.0946,
"step": 5380
},
{
"epoch": 2.3496076721883172,
"grad_norm": 1.3489940166473389,
"learning_rate": 7.605204970831096e-05,
"loss": 0.0831,
"step": 5390
},
{
"epoch": 2.353966870095902,
"grad_norm": 1.2738442420959473,
"learning_rate": 7.595790055139163e-05,
"loss": 0.1029,
"step": 5400
},
{
"epoch": 2.358326068003487,
"grad_norm": 0.987291693687439,
"learning_rate": 7.586362522932323e-05,
"loss": 0.109,
"step": 5410
},
{
"epoch": 2.362685265911072,
"grad_norm": 6.236382961273193,
"learning_rate": 7.576922420031929e-05,
"loss": 0.1027,
"step": 5420
},
{
"epoch": 2.367044463818657,
"grad_norm": 0.9077024459838867,
"learning_rate": 7.567469792320428e-05,
"loss": 0.1084,
"step": 5430
},
{
"epoch": 2.3714036617262426,
"grad_norm": 0.6678237915039062,
"learning_rate": 7.558004685741137e-05,
"loss": 0.0982,
"step": 5440
},
{
"epoch": 2.375762859633827,
"grad_norm": 0.6273680925369263,
"learning_rate": 7.548527146298036e-05,
"loss": 0.0933,
"step": 5450
},
{
"epoch": 2.3801220575414126,
"grad_norm": 0.6411100625991821,
"learning_rate": 7.539037220055527e-05,
"loss": 0.0914,
"step": 5460
},
{
"epoch": 2.384481255448997,
"grad_norm": 0.9402559995651245,
"learning_rate": 7.529534953138213e-05,
"loss": 0.0778,
"step": 5470
},
{
"epoch": 2.3888404533565826,
"grad_norm": 0.9908243417739868,
"learning_rate": 7.520020391730684e-05,
"loss": 0.0866,
"step": 5480
},
{
"epoch": 2.3931996512641676,
"grad_norm": 1.700788140296936,
"learning_rate": 7.510493582077281e-05,
"loss": 0.098,
"step": 5490
},
{
"epoch": 2.3975588491717525,
"grad_norm": 6.728827476501465,
"learning_rate": 7.500954570481882e-05,
"loss": 0.1064,
"step": 5500
},
{
"epoch": 2.4019180470793375,
"grad_norm": 4.217410564422607,
"learning_rate": 7.491403403307662e-05,
"loss": 0.1054,
"step": 5510
},
{
"epoch": 2.4062772449869225,
"grad_norm": 7.545099258422852,
"learning_rate": 7.481840126976885e-05,
"loss": 0.0976,
"step": 5520
},
{
"epoch": 2.4106364428945075,
"grad_norm": 6.637195110321045,
"learning_rate": 7.472264787970666e-05,
"loss": 0.1006,
"step": 5530
},
{
"epoch": 2.4149956408020925,
"grad_norm": 0.878823459148407,
"learning_rate": 7.462677432828751e-05,
"loss": 0.084,
"step": 5540
},
{
"epoch": 2.4193548387096775,
"grad_norm": 1.0781915187835693,
"learning_rate": 7.453078108149287e-05,
"loss": 0.0995,
"step": 5550
},
{
"epoch": 2.4237140366172625,
"grad_norm": 0.7587170004844666,
"learning_rate": 7.443466860588599e-05,
"loss": 0.0971,
"step": 5560
},
{
"epoch": 2.4280732345248475,
"grad_norm": 1.204644799232483,
"learning_rate": 7.43384373686096e-05,
"loss": 0.0864,
"step": 5570
},
{
"epoch": 2.4324324324324325,
"grad_norm": 1.538623571395874,
"learning_rate": 7.424208783738367e-05,
"loss": 0.104,
"step": 5580
},
{
"epoch": 2.4367916303400174,
"grad_norm": 1.163440465927124,
"learning_rate": 7.414562048050315e-05,
"loss": 0.0782,
"step": 5590
},
{
"epoch": 2.4411508282476024,
"grad_norm": 1.4269083738327026,
"learning_rate": 7.404903576683559e-05,
"loss": 0.0973,
"step": 5600
},
{
"epoch": 2.4455100261551874,
"grad_norm": 2.2883760929107666,
"learning_rate": 7.3952334165819e-05,
"loss": 0.0805,
"step": 5610
},
{
"epoch": 2.4498692240627724,
"grad_norm": 0.9393578767776489,
"learning_rate": 7.385551614745952e-05,
"loss": 0.0712,
"step": 5620
},
{
"epoch": 2.4542284219703574,
"grad_norm": 7.377894401550293,
"learning_rate": 7.375858218232905e-05,
"loss": 0.0733,
"step": 5630
},
{
"epoch": 2.4585876198779424,
"grad_norm": 0.8878276944160461,
"learning_rate": 7.366153274156312e-05,
"loss": 0.0836,
"step": 5640
},
{
"epoch": 2.4629468177855274,
"grad_norm": 0.679876983165741,
"learning_rate": 7.356436829685844e-05,
"loss": 0.0877,
"step": 5650
},
{
"epoch": 2.4673060156931124,
"grad_norm": 0.6781834363937378,
"learning_rate": 7.346708932047074e-05,
"loss": 0.0897,
"step": 5660
},
{
"epoch": 2.4716652136006974,
"grad_norm": 6.125814437866211,
"learning_rate": 7.336969628521237e-05,
"loss": 0.09,
"step": 5670
},
{
"epoch": 2.4760244115082823,
"grad_norm": 0.6775864958763123,
"learning_rate": 7.32721896644501e-05,
"loss": 0.0975,
"step": 5680
},
{
"epoch": 2.4803836094158673,
"grad_norm": 0.590700089931488,
"learning_rate": 7.317456993210272e-05,
"loss": 0.098,
"step": 5690
},
{
"epoch": 2.4847428073234523,
"grad_norm": 2.4274611473083496,
"learning_rate": 7.307683756263881e-05,
"loss": 0.0749,
"step": 5700
},
{
"epoch": 2.4891020052310373,
"grad_norm": 0.6488708257675171,
"learning_rate": 7.297899303107441e-05,
"loss": 0.0906,
"step": 5710
},
{
"epoch": 2.4934612031386223,
"grad_norm": 0.7550959587097168,
"learning_rate": 7.288103681297068e-05,
"loss": 0.0891,
"step": 5720
},
{
"epoch": 2.4978204010462077,
"grad_norm": 12.706351280212402,
"learning_rate": 7.278296938443166e-05,
"loss": 0.0779,
"step": 5730
},
{
"epoch": 2.5021795989537923,
"grad_norm": 0.9015982151031494,
"learning_rate": 7.26847912221019e-05,
"loss": 0.0884,
"step": 5740
},
{
"epoch": 2.5065387968613777,
"grad_norm": 3.624945640563965,
"learning_rate": 7.258650280316415e-05,
"loss": 0.0887,
"step": 5750
},
{
"epoch": 2.5108979947689622,
"grad_norm": 0.4754033088684082,
"learning_rate": 7.248810460533706e-05,
"loss": 0.0806,
"step": 5760
},
{
"epoch": 2.5152571926765477,
"grad_norm": 1.9334558248519897,
"learning_rate": 7.238959710687282e-05,
"loss": 0.0778,
"step": 5770
},
{
"epoch": 2.5196163905841327,
"grad_norm": 1.0764563083648682,
"learning_rate": 7.229098078655489e-05,
"loss": 0.0887,
"step": 5780
},
{
"epoch": 2.5239755884917177,
"grad_norm": 0.44298532605171204,
"learning_rate": 7.219225612369565e-05,
"loss": 0.0691,
"step": 5790
},
{
"epoch": 2.5283347863993026,
"grad_norm": 0.8877589106559753,
"learning_rate": 7.209342359813404e-05,
"loss": 0.0959,
"step": 5800
},
{
"epoch": 2.5326939843068876,
"grad_norm": 0.5124678015708923,
"learning_rate": 7.199448369023327e-05,
"loss": 0.0816,
"step": 5810
},
{
"epoch": 2.5370531822144726,
"grad_norm": 1.4232996702194214,
"learning_rate": 7.189543688087845e-05,
"loss": 0.0827,
"step": 5820
},
{
"epoch": 2.5414123801220576,
"grad_norm": 0.7989560961723328,
"learning_rate": 7.17962836514743e-05,
"loss": 0.0919,
"step": 5830
},
{
"epoch": 2.5457715780296426,
"grad_norm": 0.958767831325531,
"learning_rate": 7.169702448394279e-05,
"loss": 0.0777,
"step": 5840
},
{
"epoch": 2.5501307759372276,
"grad_norm": 0.5822910070419312,
"learning_rate": 7.159765986072071e-05,
"loss": 0.0823,
"step": 5850
},
{
"epoch": 2.5544899738448126,
"grad_norm": 3.608449935913086,
"learning_rate": 7.149819026475751e-05,
"loss": 0.0895,
"step": 5860
},
{
"epoch": 2.5588491717523976,
"grad_norm": 1.6009862422943115,
"learning_rate": 7.139861617951275e-05,
"loss": 0.0869,
"step": 5870
},
{
"epoch": 2.5632083696599826,
"grad_norm": 0.9188637733459473,
"learning_rate": 7.129893808895395e-05,
"loss": 0.1151,
"step": 5880
},
{
"epoch": 2.5675675675675675,
"grad_norm": 2.6896746158599854,
"learning_rate": 7.119915647755404e-05,
"loss": 0.0987,
"step": 5890
},
{
"epoch": 2.5719267654751525,
"grad_norm": 11.660469055175781,
"learning_rate": 7.109927183028914e-05,
"loss": 0.1049,
"step": 5900
},
{
"epoch": 2.5762859633827375,
"grad_norm": 1.580359935760498,
"learning_rate": 7.099928463263619e-05,
"loss": 0.0894,
"step": 5910
},
{
"epoch": 2.5806451612903225,
"grad_norm": 0.4905918538570404,
"learning_rate": 7.08991953705705e-05,
"loss": 0.0752,
"step": 5920
},
{
"epoch": 2.5850043591979075,
"grad_norm": 0.6062779426574707,
"learning_rate": 7.07990045305635e-05,
"loss": 0.0762,
"step": 5930
},
{
"epoch": 2.5893635571054925,
"grad_norm": 0.9602181911468506,
"learning_rate": 7.069871259958034e-05,
"loss": 0.0864,
"step": 5940
},
{
"epoch": 2.5937227550130775,
"grad_norm": 3.8249127864837646,
"learning_rate": 7.059832006507745e-05,
"loss": 0.0913,
"step": 5950
},
{
"epoch": 2.5980819529206625,
"grad_norm": 1.6133575439453125,
"learning_rate": 7.049782741500028e-05,
"loss": 0.0744,
"step": 5960
},
{
"epoch": 2.6024411508282475,
"grad_norm": 1.4505705833435059,
"learning_rate": 7.039723513778087e-05,
"loss": 0.0862,
"step": 5970
},
{
"epoch": 2.606800348735833,
"grad_norm": 0.8165828585624695,
"learning_rate": 7.029654372233544e-05,
"loss": 0.0833,
"step": 5980
},
{
"epoch": 2.6111595466434174,
"grad_norm": 1.2085750102996826,
"learning_rate": 7.019575365806215e-05,
"loss": 0.0985,
"step": 5990
},
{
"epoch": 2.615518744551003,
"grad_norm": 0.4820369482040405,
"learning_rate": 7.009486543483858e-05,
"loss": 0.1111,
"step": 6000
},
{
"epoch": 2.6198779424585874,
"grad_norm": 1.5817570686340332,
"learning_rate": 6.999387954301934e-05,
"loss": 0.107,
"step": 6010
},
{
"epoch": 2.624237140366173,
"grad_norm": 1.02738356590271,
"learning_rate": 6.989279647343388e-05,
"loss": 0.0881,
"step": 6020
},
{
"epoch": 2.6285963382737574,
"grad_norm": 3.4080002307891846,
"learning_rate": 6.979161671738382e-05,
"loss": 0.0849,
"step": 6030
},
{
"epoch": 2.632955536181343,
"grad_norm": 0.4838300347328186,
"learning_rate": 6.969034076664085e-05,
"loss": 0.0913,
"step": 6040
},
{
"epoch": 2.637314734088928,
"grad_norm": 0.7174921631813049,
"learning_rate": 6.958896911344411e-05,
"loss": 0.0868,
"step": 6050
},
{
"epoch": 2.641673931996513,
"grad_norm": 0.812843382358551,
"learning_rate": 6.948750225049791e-05,
"loss": 0.0831,
"step": 6060
},
{
"epoch": 2.646033129904098,
"grad_norm": 1.0127894878387451,
"learning_rate": 6.938594067096936e-05,
"loss": 0.0861,
"step": 6070
},
{
"epoch": 2.6503923278116828,
"grad_norm": 8.35067367553711,
"learning_rate": 6.928428486848587e-05,
"loss": 0.0849,
"step": 6080
},
{
"epoch": 2.6547515257192678,
"grad_norm": 0.7862615585327148,
"learning_rate": 6.918253533713282e-05,
"loss": 0.0807,
"step": 6090
},
{
"epoch": 2.6591107236268527,
"grad_norm": 0.5965198278427124,
"learning_rate": 6.908069257145118e-05,
"loss": 0.1029,
"step": 6100
},
{
"epoch": 2.6634699215344377,
"grad_norm": 1.679243803024292,
"learning_rate": 6.897875706643506e-05,
"loss": 0.0702,
"step": 6110
},
{
"epoch": 2.6678291194420227,
"grad_norm": 0.9817731976509094,
"learning_rate": 6.887672931752927e-05,
"loss": 0.0817,
"step": 6120
},
{
"epoch": 2.6721883173496077,
"grad_norm": 0.6078197360038757,
"learning_rate": 6.877460982062706e-05,
"loss": 0.0705,
"step": 6130
},
{
"epoch": 2.6765475152571927,
"grad_norm": 2.130326271057129,
"learning_rate": 6.86723990720675e-05,
"loss": 0.0907,
"step": 6140
},
{
"epoch": 2.6809067131647777,
"grad_norm": 0.932861328125,
"learning_rate": 6.857009756863326e-05,
"loss": 0.0974,
"step": 6150
},
{
"epoch": 2.6852659110723627,
"grad_norm": 0.7639493346214294,
"learning_rate": 6.846770580754807e-05,
"loss": 0.1037,
"step": 6160
},
{
"epoch": 2.6896251089799477,
"grad_norm": 0.9204385280609131,
"learning_rate": 6.836522428647438e-05,
"loss": 0.0859,
"step": 6170
},
{
"epoch": 2.6939843068875327,
"grad_norm": 0.6915794014930725,
"learning_rate": 6.826265350351083e-05,
"loss": 0.0664,
"step": 6180
},
{
"epoch": 2.6983435047951176,
"grad_norm": 0.9047311544418335,
"learning_rate": 6.815999395719e-05,
"loss": 0.105,
"step": 6190
},
{
"epoch": 2.7027027027027026,
"grad_norm": 0.9636686444282532,
"learning_rate": 6.805724614647586e-05,
"loss": 0.072,
"step": 6200
},
{
"epoch": 2.7070619006102876,
"grad_norm": 0.9881836175918579,
"learning_rate": 6.795441057076136e-05,
"loss": 0.0813,
"step": 6210
},
{
"epoch": 2.7114210985178726,
"grad_norm": 0.8798909187316895,
"learning_rate": 6.785148772986603e-05,
"loss": 0.0811,
"step": 6220
},
{
"epoch": 2.7157802964254576,
"grad_norm": 2.17606520652771,
"learning_rate": 6.774847812403355e-05,
"loss": 0.0894,
"step": 6230
},
{
"epoch": 2.7201394943330426,
"grad_norm": 1.370004415512085,
"learning_rate": 6.76453822539293e-05,
"loss": 0.0858,
"step": 6240
},
{
"epoch": 2.7244986922406276,
"grad_norm": 0.4295946955680847,
"learning_rate": 6.754220062063793e-05,
"loss": 0.0846,
"step": 6250
},
{
"epoch": 2.7288578901482126,
"grad_norm": 0.9039126634597778,
"learning_rate": 6.743893372566099e-05,
"loss": 0.0762,
"step": 6260
},
{
"epoch": 2.733217088055798,
"grad_norm": 0.831985592842102,
"learning_rate": 6.733558207091434e-05,
"loss": 0.0763,
"step": 6270
},
{
"epoch": 2.7375762859633825,
"grad_norm": 0.4425466060638428,
"learning_rate": 6.723214615872585e-05,
"loss": 0.0798,
"step": 6280
},
{
"epoch": 2.741935483870968,
"grad_norm": 0.5437725782394409,
"learning_rate": 6.712862649183295e-05,
"loss": 0.0709,
"step": 6290
},
{
"epoch": 2.7462946817785525,
"grad_norm": 0.663580060005188,
"learning_rate": 6.70250235733801e-05,
"loss": 0.0847,
"step": 6300
},
{
"epoch": 2.750653879686138,
"grad_norm": 1.1464177370071411,
"learning_rate": 6.692133790691639e-05,
"loss": 0.0665,
"step": 6310
},
{
"epoch": 2.7550130775937225,
"grad_norm": 0.781203031539917,
"learning_rate": 6.681756999639311e-05,
"loss": 0.074,
"step": 6320
},
{
"epoch": 2.759372275501308,
"grad_norm": 0.7381405830383301,
"learning_rate": 6.671372034616132e-05,
"loss": 0.0841,
"step": 6330
},
{
"epoch": 2.763731473408893,
"grad_norm": 0.7967920899391174,
"learning_rate": 6.660978946096933e-05,
"loss": 0.0954,
"step": 6340
},
{
"epoch": 2.768090671316478,
"grad_norm": 1.6040399074554443,
"learning_rate": 6.650577784596026e-05,
"loss": 0.0774,
"step": 6350
},
{
"epoch": 2.772449869224063,
"grad_norm": 0.9123194217681885,
"learning_rate": 6.640168600666967e-05,
"loss": 0.0835,
"step": 6360
},
{
"epoch": 2.776809067131648,
"grad_norm": 1.2217650413513184,
"learning_rate": 6.629751444902299e-05,
"loss": 0.0717,
"step": 6370
},
{
"epoch": 2.781168265039233,
"grad_norm": 0.8930469751358032,
"learning_rate": 6.619326367933312e-05,
"loss": 0.0708,
"step": 6380
},
{
"epoch": 2.785527462946818,
"grad_norm": 0.5414270162582397,
"learning_rate": 6.608893420429798e-05,
"loss": 0.0749,
"step": 6390
},
{
"epoch": 2.789886660854403,
"grad_norm": 0.9353501200675964,
"learning_rate": 6.598452653099803e-05,
"loss": 0.0801,
"step": 6400
},
{
"epoch": 2.794245858761988,
"grad_norm": 0.48062050342559814,
"learning_rate": 6.588004116689375e-05,
"loss": 0.0631,
"step": 6410
},
{
"epoch": 2.798605056669573,
"grad_norm": 0.7349405288696289,
"learning_rate": 6.57754786198233e-05,
"loss": 0.0794,
"step": 6420
},
{
"epoch": 2.802964254577158,
"grad_norm": 0.48554134368896484,
"learning_rate": 6.567083939799992e-05,
"loss": 0.07,
"step": 6430
},
{
"epoch": 2.807323452484743,
"grad_norm": 0.5531144738197327,
"learning_rate": 6.556612401000954e-05,
"loss": 0.0673,
"step": 6440
},
{
"epoch": 2.811682650392328,
"grad_norm": 0.40997496247291565,
"learning_rate": 6.54613329648083e-05,
"loss": 0.0732,
"step": 6450
},
{
"epoch": 2.8160418482999128,
"grad_norm": 0.8054825663566589,
"learning_rate": 6.535646677172005e-05,
"loss": 0.0656,
"step": 6460
},
{
"epoch": 2.8204010462074978,
"grad_norm": 4.4836554527282715,
"learning_rate": 6.52515259404339e-05,
"loss": 0.0751,
"step": 6470
},
{
"epoch": 2.8247602441150828,
"grad_norm": 0.6638883352279663,
"learning_rate": 6.514651098100167e-05,
"loss": 0.0623,
"step": 6480
},
{
"epoch": 2.8291194420226677,
"grad_norm": 0.8358332514762878,
"learning_rate": 6.504142240383555e-05,
"loss": 0.0911,
"step": 6490
},
{
"epoch": 2.8334786399302527,
"grad_norm": 0.8474389910697937,
"learning_rate": 6.493626071970549e-05,
"loss": 0.0609,
"step": 6500
},
{
"epoch": 2.8378378378378377,
"grad_norm": 1.1841853857040405,
"learning_rate": 6.483102643973682e-05,
"loss": 0.066,
"step": 6510
},
{
"epoch": 2.8421970357454227,
"grad_norm": 0.5661811828613281,
"learning_rate": 6.472572007540764e-05,
"loss": 0.07,
"step": 6520
},
{
"epoch": 2.8465562336530077,
"grad_norm": 0.6846306920051575,
"learning_rate": 6.462034213854645e-05,
"loss": 0.0701,
"step": 6530
},
{
"epoch": 2.850915431560593,
"grad_norm": 0.5802059769630432,
"learning_rate": 6.451489314132962e-05,
"loss": 0.0755,
"step": 6540
},
{
"epoch": 2.8552746294681777,
"grad_norm": 0.7946178913116455,
"learning_rate": 6.440937359627893e-05,
"loss": 0.0674,
"step": 6550
},
{
"epoch": 2.859633827375763,
"grad_norm": 0.34522873163223267,
"learning_rate": 6.430378401625894e-05,
"loss": 0.0735,
"step": 6560
},
{
"epoch": 2.8639930252833476,
"grad_norm": 0.5881823897361755,
"learning_rate": 6.419812491447472e-05,
"loss": 0.0771,
"step": 6570
},
{
"epoch": 2.868352223190933,
"grad_norm": 2.1091787815093994,
"learning_rate": 6.409239680446919e-05,
"loss": 0.0712,
"step": 6580
},
{
"epoch": 2.8727114210985176,
"grad_norm": 0.9964777827262878,
"learning_rate": 6.398660020012072e-05,
"loss": 0.0804,
"step": 6590
},
{
"epoch": 2.877070619006103,
"grad_norm": 0.7100098729133606,
"learning_rate": 6.38807356156405e-05,
"loss": 0.0785,
"step": 6600
},
{
"epoch": 2.881429816913688,
"grad_norm": 0.7806901931762695,
"learning_rate": 6.377480356557022e-05,
"loss": 0.0603,
"step": 6610
},
{
"epoch": 2.885789014821273,
"grad_norm": 0.6067240834236145,
"learning_rate": 6.366880456477942e-05,
"loss": 0.0683,
"step": 6620
},
{
"epoch": 2.890148212728858,
"grad_norm": 0.9816045761108398,
"learning_rate": 6.356273912846312e-05,
"loss": 0.0696,
"step": 6630
},
{
"epoch": 2.894507410636443,
"grad_norm": 0.5616809129714966,
"learning_rate": 6.34566077721391e-05,
"loss": 0.0605,
"step": 6640
},
{
"epoch": 2.898866608544028,
"grad_norm": 0.5172761082649231,
"learning_rate": 6.335041101164569e-05,
"loss": 0.0747,
"step": 6650
},
{
"epoch": 2.903225806451613,
"grad_norm": 0.6442545056343079,
"learning_rate": 6.324414936313904e-05,
"loss": 0.0581,
"step": 6660
},
{
"epoch": 2.907585004359198,
"grad_norm": 0.48619189858436584,
"learning_rate": 6.313782334309066e-05,
"loss": 0.0597,
"step": 6670
},
{
"epoch": 2.911944202266783,
"grad_norm": 0.8751986026763916,
"learning_rate": 6.303143346828499e-05,
"loss": 0.0697,
"step": 6680
},
{
"epoch": 2.916303400174368,
"grad_norm": 0.6420536041259766,
"learning_rate": 6.292498025581674e-05,
"loss": 0.0595,
"step": 6690
},
{
"epoch": 2.920662598081953,
"grad_norm": 0.6701872944831848,
"learning_rate": 6.281846422308857e-05,
"loss": 0.0676,
"step": 6700
},
{
"epoch": 2.925021795989538,
"grad_norm": 0.7736986875534058,
"learning_rate": 6.271188588780839e-05,
"loss": 0.0762,
"step": 6710
},
{
"epoch": 2.929380993897123,
"grad_norm": 1.2286581993103027,
"learning_rate": 6.260524576798694e-05,
"loss": 0.08,
"step": 6720
},
{
"epoch": 2.933740191804708,
"grad_norm": 0.6742132902145386,
"learning_rate": 6.249854438193528e-05,
"loss": 0.0707,
"step": 6730
},
{
"epoch": 2.938099389712293,
"grad_norm": 0.7686195373535156,
"learning_rate": 6.239178224826224e-05,
"loss": 0.0852,
"step": 6740
},
{
"epoch": 2.942458587619878,
"grad_norm": 0.44351834058761597,
"learning_rate": 6.228495988587188e-05,
"loss": 0.0678,
"step": 6750
},
{
"epoch": 2.946817785527463,
"grad_norm": 0.33258873224258423,
"learning_rate": 6.217807781396106e-05,
"loss": 0.0656,
"step": 6760
},
{
"epoch": 2.951176983435048,
"grad_norm": 0.9385144114494324,
"learning_rate": 6.207113655201676e-05,
"loss": 0.0641,
"step": 6770
},
{
"epoch": 2.955536181342633,
"grad_norm": 0.7347838878631592,
"learning_rate": 6.196413661981368e-05,
"loss": 0.073,
"step": 6780
},
{
"epoch": 2.959895379250218,
"grad_norm": 1.1833434104919434,
"learning_rate": 6.185707853741175e-05,
"loss": 0.0691,
"step": 6790
},
{
"epoch": 2.964254577157803,
"grad_norm": 0.773758590221405,
"learning_rate": 6.174996282515344e-05,
"loss": 0.0668,
"step": 6800
},
{
"epoch": 2.968613775065388,
"grad_norm": 0.6628344655036926,
"learning_rate": 6.164279000366131e-05,
"loss": 0.0596,
"step": 6810
},
{
"epoch": 2.972972972972973,
"grad_norm": 0.7836283445358276,
"learning_rate": 6.153556059383561e-05,
"loss": 0.0774,
"step": 6820
},
{
"epoch": 2.9773321708805582,
"grad_norm": 0.6452964544296265,
"learning_rate": 6.142827511685152e-05,
"loss": 0.0654,
"step": 6830
},
{
"epoch": 2.981691368788143,
"grad_norm": 1.0415467023849487,
"learning_rate": 6.132093409415678e-05,
"loss": 0.0754,
"step": 6840
},
{
"epoch": 2.986050566695728,
"grad_norm": 2.455578088760376,
"learning_rate": 6.121353804746907e-05,
"loss": 0.0672,
"step": 6850
},
{
"epoch": 2.9904097646033128,
"grad_norm": 0.8278747797012329,
"learning_rate": 6.110608749877352e-05,
"loss": 0.0627,
"step": 6860
},
{
"epoch": 2.994768962510898,
"grad_norm": 1.0058568716049194,
"learning_rate": 6.0998582970320205e-05,
"loss": 0.089,
"step": 6870
},
{
"epoch": 2.9991281604184827,
"grad_norm": 0.38594409823417664,
"learning_rate": 6.0891024984621506e-05,
"loss": 0.0785,
"step": 6880
},
{
"epoch": 3.003487358326068,
"grad_norm": 0.6081041097640991,
"learning_rate": 6.078341406444961e-05,
"loss": 0.068,
"step": 6890
},
{
"epoch": 3.007846556233653,
"grad_norm": 0.5051882863044739,
"learning_rate": 6.067575073283405e-05,
"loss": 0.077,
"step": 6900
},
{
"epoch": 3.012205754141238,
"grad_norm": 0.813621461391449,
"learning_rate": 6.0568035513059073e-05,
"loss": 0.0561,
"step": 6910
},
{
"epoch": 3.016564952048823,
"grad_norm": 1.0698422193527222,
"learning_rate": 6.046026892866109e-05,
"loss": 0.0809,
"step": 6920
},
{
"epoch": 3.020924149956408,
"grad_norm": 0.8250692486763,
"learning_rate": 6.0352451503426214e-05,
"loss": 0.0737,
"step": 6930
},
{
"epoch": 3.025283347863993,
"grad_norm": 0.6636641621589661,
"learning_rate": 6.024458376138762e-05,
"loss": 0.0526,
"step": 6940
},
{
"epoch": 3.029642545771578,
"grad_norm": 2.4719927310943604,
"learning_rate": 6.013666622682306e-05,
"loss": 0.0587,
"step": 6950
},
{
"epoch": 3.034001743679163,
"grad_norm": 0.7209786176681519,
"learning_rate": 6.002869942425231e-05,
"loss": 0.0616,
"step": 6960
},
{
"epoch": 3.038360941586748,
"grad_norm": 0.8328136801719666,
"learning_rate": 5.992068387843459e-05,
"loss": 0.0669,
"step": 6970
},
{
"epoch": 3.042720139494333,
"grad_norm": 1.0429414510726929,
"learning_rate": 5.981262011436603e-05,
"loss": 0.0752,
"step": 6980
},
{
"epoch": 3.047079337401918,
"grad_norm": 0.6350201368331909,
"learning_rate": 5.970450865727712e-05,
"loss": 0.0588,
"step": 6990
},
{
"epoch": 3.051438535309503,
"grad_norm": 0.8136351704597473,
"learning_rate": 5.9596350032630156e-05,
"loss": 0.0625,
"step": 7000
},
{
"epoch": 3.055797733217088,
"grad_norm": 0.6018245816230774,
"learning_rate": 5.9488144766116714e-05,
"loss": 0.0629,
"step": 7010
},
{
"epoch": 3.060156931124673,
"grad_norm": 1.1054131984710693,
"learning_rate": 5.9379893383655006e-05,
"loss": 0.0613,
"step": 7020
},
{
"epoch": 3.064516129032258,
"grad_norm": 0.5404367446899414,
"learning_rate": 5.927159641138744e-05,
"loss": 0.0836,
"step": 7030
},
{
"epoch": 3.068875326939843,
"grad_norm": 0.3001430332660675,
"learning_rate": 5.916325437567799e-05,
"loss": 0.0697,
"step": 7040
},
{
"epoch": 3.073234524847428,
"grad_norm": 0.9972290396690369,
"learning_rate": 5.905486780310966e-05,
"loss": 0.0591,
"step": 7050
},
{
"epoch": 3.077593722755013,
"grad_norm": 1.5061616897583008,
"learning_rate": 5.8946437220481887e-05,
"loss": 0.0655,
"step": 7060
},
{
"epoch": 3.081952920662598,
"grad_norm": 1.0841275453567505,
"learning_rate": 5.883796315480805e-05,
"loss": 0.0667,
"step": 7070
},
{
"epoch": 3.086312118570183,
"grad_norm": 0.5677183270454407,
"learning_rate": 5.872944613331288e-05,
"loss": 0.0833,
"step": 7080
},
{
"epoch": 3.090671316477768,
"grad_norm": 0.5351662039756775,
"learning_rate": 5.862088668342986e-05,
"loss": 0.0667,
"step": 7090
},
{
"epoch": 3.095030514385353,
"grad_norm": 0.8425320982933044,
"learning_rate": 5.8512285332798714e-05,
"loss": 0.0604,
"step": 7100
},
{
"epoch": 3.099389712292938,
"grad_norm": 1.6601287126541138,
"learning_rate": 5.840364260926277e-05,
"loss": 0.0831,
"step": 7110
},
{
"epoch": 3.103748910200523,
"grad_norm": 0.8863855004310608,
"learning_rate": 5.8294959040866505e-05,
"loss": 0.0737,
"step": 7120
},
{
"epoch": 3.108108108108108,
"grad_norm": 0.8012006282806396,
"learning_rate": 5.818623515585292e-05,
"loss": 0.0662,
"step": 7130
},
{
"epoch": 3.1124673060156933,
"grad_norm": 5.165378570556641,
"learning_rate": 5.8077471482660896e-05,
"loss": 0.0806,
"step": 7140
},
{
"epoch": 3.1168265039232783,
"grad_norm": 1.3551973104476929,
"learning_rate": 5.796866854992276e-05,
"loss": 0.0844,
"step": 7150
},
{
"epoch": 3.1211857018308633,
"grad_norm": 0.6418676972389221,
"learning_rate": 5.7859826886461676e-05,
"loss": 0.0722,
"step": 7160
},
{
"epoch": 3.1255448997384483,
"grad_norm": 7.927807807922363,
"learning_rate": 5.775094702128899e-05,
"loss": 0.0773,
"step": 7170
},
{
"epoch": 3.1299040976460333,
"grad_norm": 0.8966444134712219,
"learning_rate": 5.7642029483601746e-05,
"loss": 0.0819,
"step": 7180
},
{
"epoch": 3.1342632955536183,
"grad_norm": 0.7477391362190247,
"learning_rate": 5.753307480278012e-05,
"loss": 0.0795,
"step": 7190
},
{
"epoch": 3.1386224934612033,
"grad_norm": 1.0312063694000244,
"learning_rate": 5.742408350838478e-05,
"loss": 0.0789,
"step": 7200
},
{
"epoch": 3.1429816913687882,
"grad_norm": 1.2121704816818237,
"learning_rate": 5.7315056130154374e-05,
"loss": 0.0768,
"step": 7210
},
{
"epoch": 3.1473408892763732,
"grad_norm": 0.6697542667388916,
"learning_rate": 5.720599319800292e-05,
"loss": 0.0704,
"step": 7220
},
{
"epoch": 3.151700087183958,
"grad_norm": 0.5981730818748474,
"learning_rate": 5.709689524201722e-05,
"loss": 0.069,
"step": 7230
},
{
"epoch": 3.156059285091543,
"grad_norm": 0.5300177931785583,
"learning_rate": 5.698776279245437e-05,
"loss": 0.0717,
"step": 7240
},
{
"epoch": 3.160418482999128,
"grad_norm": 0.4018495976924896,
"learning_rate": 5.6878596379739036e-05,
"loss": 0.0665,
"step": 7250
},
{
"epoch": 3.164777680906713,
"grad_norm": 2.6015779972076416,
"learning_rate": 5.676939653446103e-05,
"loss": 0.0604,
"step": 7260
},
{
"epoch": 3.169136878814298,
"grad_norm": 2.2606658935546875,
"learning_rate": 5.666016378737261e-05,
"loss": 0.0546,
"step": 7270
},
{
"epoch": 3.173496076721883,
"grad_norm": 0.53534996509552,
"learning_rate": 5.655089866938596e-05,
"loss": 0.0659,
"step": 7280
},
{
"epoch": 3.177855274629468,
"grad_norm": 0.6438631415367126,
"learning_rate": 5.6441601711570615e-05,
"loss": 0.0742,
"step": 7290
},
{
"epoch": 3.182214472537053,
"grad_norm": 0.7055009007453918,
"learning_rate": 5.633227344515085e-05,
"loss": 0.0702,
"step": 7300
},
{
"epoch": 3.186573670444638,
"grad_norm": 0.48063281178474426,
"learning_rate": 5.6222914401503116e-05,
"loss": 0.0529,
"step": 7310
},
{
"epoch": 3.190932868352223,
"grad_norm": 0.5260463356971741,
"learning_rate": 5.611352511215343e-05,
"loss": 0.0467,
"step": 7320
},
{
"epoch": 3.195292066259808,
"grad_norm": 1.3753858804702759,
"learning_rate": 5.600410610877488e-05,
"loss": 0.0662,
"step": 7330
},
{
"epoch": 3.199651264167393,
"grad_norm": 0.5484173893928528,
"learning_rate": 5.58946579231849e-05,
"loss": 0.0586,
"step": 7340
},
{
"epoch": 3.204010462074978,
"grad_norm": 0.7811186909675598,
"learning_rate": 5.578518108734279e-05,
"loss": 0.0744,
"step": 7350
},
{
"epoch": 3.208369659982563,
"grad_norm": 0.7245134711265564,
"learning_rate": 5.5675676133347096e-05,
"loss": 0.0878,
"step": 7360
},
{
"epoch": 3.212728857890148,
"grad_norm": 0.5464441776275635,
"learning_rate": 5.556614359343307e-05,
"loss": 0.0681,
"step": 7370
},
{
"epoch": 3.217088055797733,
"grad_norm": 1.7045855522155762,
"learning_rate": 5.545658399996999e-05,
"loss": 0.0709,
"step": 7380
},
{
"epoch": 3.221447253705318,
"grad_norm": 0.6500688791275024,
"learning_rate": 5.534699788545862e-05,
"loss": 0.0728,
"step": 7390
},
{
"epoch": 3.225806451612903,
"grad_norm": 16.512657165527344,
"learning_rate": 5.523738578252867e-05,
"loss": 0.0713,
"step": 7400
},
{
"epoch": 3.2301656495204885,
"grad_norm": 0.5367026329040527,
"learning_rate": 5.512774822393614e-05,
"loss": 0.0689,
"step": 7410
},
{
"epoch": 3.234524847428073,
"grad_norm": 5.1868181228637695,
"learning_rate": 5.5018085742560744e-05,
"loss": 0.0924,
"step": 7420
},
{
"epoch": 3.2388840453356584,
"grad_norm": 0.6594675779342651,
"learning_rate": 5.4908398871403365e-05,
"loss": 0.0745,
"step": 7430
},
{
"epoch": 3.2432432432432434,
"grad_norm": 2.2696409225463867,
"learning_rate": 5.4798688143583375e-05,
"loss": 0.0768,
"step": 7440
},
{
"epoch": 3.2476024411508284,
"grad_norm": 0.6064624190330505,
"learning_rate": 5.468895409233615e-05,
"loss": 0.0701,
"step": 7450
},
{
"epoch": 3.2519616390584134,
"grad_norm": 0.9498799443244934,
"learning_rate": 5.4579197251010414e-05,
"loss": 0.0661,
"step": 7460
},
{
"epoch": 3.2563208369659984,
"grad_norm": 0.8516326546669006,
"learning_rate": 5.446941815306563e-05,
"loss": 0.0715,
"step": 7470
},
{
"epoch": 3.2606800348735834,
"grad_norm": 0.7903364300727844,
"learning_rate": 5.435961733206947e-05,
"loss": 0.0675,
"step": 7480
},
{
"epoch": 3.2650392327811684,
"grad_norm": 0.40599560737609863,
"learning_rate": 5.424979532169516e-05,
"loss": 0.0683,
"step": 7490
},
{
"epoch": 3.2693984306887534,
"grad_norm": 0.4230225086212158,
"learning_rate": 5.413995265571895e-05,
"loss": 0.0585,
"step": 7500
},
{
"epoch": 3.2737576285963383,
"grad_norm": 11.551130294799805,
"learning_rate": 5.403008986801746e-05,
"loss": 0.0713,
"step": 7510
},
{
"epoch": 3.2781168265039233,
"grad_norm": 0.5420308113098145,
"learning_rate": 5.3920207492565114e-05,
"loss": 0.0618,
"step": 7520
},
{
"epoch": 3.2824760244115083,
"grad_norm": 3.584078073501587,
"learning_rate": 5.381030606343154e-05,
"loss": 0.0558,
"step": 7530
},
{
"epoch": 3.2868352223190933,
"grad_norm": 1.0643659830093384,
"learning_rate": 5.370038611477894e-05,
"loss": 0.0715,
"step": 7540
},
{
"epoch": 3.2911944202266783,
"grad_norm": 0.557769775390625,
"learning_rate": 5.359044818085963e-05,
"loss": 0.0676,
"step": 7550
},
{
"epoch": 3.2955536181342633,
"grad_norm": 2.791323184967041,
"learning_rate": 5.3480492796013214e-05,
"loss": 0.0694,
"step": 7560
},
{
"epoch": 3.2999128160418483,
"grad_norm": 0.6536844372749329,
"learning_rate": 5.33705204946642e-05,
"loss": 0.0604,
"step": 7570
},
{
"epoch": 3.3042720139494333,
"grad_norm": 0.44391271471977234,
"learning_rate": 5.326053181131927e-05,
"loss": 0.0781,
"step": 7580
},
{
"epoch": 3.3086312118570183,
"grad_norm": 2.1630494594573975,
"learning_rate": 5.3150527280564776e-05,
"loss": 0.0801,
"step": 7590
},
{
"epoch": 3.3129904097646032,
"grad_norm": 3.6389074325561523,
"learning_rate": 5.3040507437064034e-05,
"loss": 0.0702,
"step": 7600
},
{
"epoch": 3.3173496076721882,
"grad_norm": 3.0503857135772705,
"learning_rate": 5.293047281555482e-05,
"loss": 0.0642,
"step": 7610
},
{
"epoch": 3.321708805579773,
"grad_norm": 0.6594390869140625,
"learning_rate": 5.2820423950846765e-05,
"loss": 0.0749,
"step": 7620
},
{
"epoch": 3.326068003487358,
"grad_norm": 2.7960643768310547,
"learning_rate": 5.2710361377818696e-05,
"loss": 0.082,
"step": 7630
},
{
"epoch": 3.330427201394943,
"grad_norm": 5.0908894538879395,
"learning_rate": 5.2600285631416026e-05,
"loss": 0.0732,
"step": 7640
},
{
"epoch": 3.334786399302528,
"grad_norm": 1.8586153984069824,
"learning_rate": 5.249019724664826e-05,
"loss": 0.0621,
"step": 7650
},
{
"epoch": 3.339145597210113,
"grad_norm": 2.7996575832366943,
"learning_rate": 5.2380096758586315e-05,
"loss": 0.0737,
"step": 7660
},
{
"epoch": 3.343504795117698,
"grad_norm": 9.720712661743164,
"learning_rate": 5.226998470235993e-05,
"loss": 0.0774,
"step": 7670
},
{
"epoch": 3.3478639930252836,
"grad_norm": 0.7286834120750427,
"learning_rate": 5.215986161315507e-05,
"loss": 0.0827,
"step": 7680
},
{
"epoch": 3.352223190932868,
"grad_norm": 0.6988757848739624,
"learning_rate": 5.20497280262113e-05,
"loss": 0.0679,
"step": 7690
},
{
"epoch": 3.3565823888404536,
"grad_norm": 0.6899250149726868,
"learning_rate": 5.193958447681924e-05,
"loss": 0.0618,
"step": 7700
},
{
"epoch": 3.360941586748038,
"grad_norm": 0.8897220492362976,
"learning_rate": 5.182943150031793e-05,
"loss": 0.0766,
"step": 7710
},
{
"epoch": 3.3653007846556235,
"grad_norm": 0.7581049799919128,
"learning_rate": 5.1719269632092204e-05,
"loss": 0.0819,
"step": 7720
},
{
"epoch": 3.3696599825632085,
"grad_norm": 0.6817976832389832,
"learning_rate": 5.160909940757015e-05,
"loss": 0.0769,
"step": 7730
},
{
"epoch": 3.3740191804707935,
"grad_norm": 0.8774738311767578,
"learning_rate": 5.149892136222043e-05,
"loss": 0.0631,
"step": 7740
},
{
"epoch": 3.3783783783783785,
"grad_norm": 0.7809702157974243,
"learning_rate": 5.1388736031549744e-05,
"loss": 0.0583,
"step": 7750
},
{
"epoch": 3.3827375762859635,
"grad_norm": 1.1148380041122437,
"learning_rate": 5.127854395110021e-05,
"loss": 0.0675,
"step": 7760
},
{
"epoch": 3.3870967741935485,
"grad_norm": 0.7259299159049988,
"learning_rate": 5.116834565644671e-05,
"loss": 0.0636,
"step": 7770
},
{
"epoch": 3.3914559721011335,
"grad_norm": 0.5108294486999512,
"learning_rate": 5.10581416831944e-05,
"loss": 0.0588,
"step": 7780
},
{
"epoch": 3.3958151700087185,
"grad_norm": 0.6647517681121826,
"learning_rate": 5.094793256697593e-05,
"loss": 0.0606,
"step": 7790
},
{
"epoch": 3.4001743679163035,
"grad_norm": 0.4952324330806732,
"learning_rate": 5.0837718843449075e-05,
"loss": 0.062,
"step": 7800
},
{
"epoch": 3.4045335658238884,
"grad_norm": 3.1278605461120605,
"learning_rate": 5.07275010482939e-05,
"loss": 0.0645,
"step": 7810
},
{
"epoch": 3.4088927637314734,
"grad_norm": 1.1586763858795166,
"learning_rate": 5.061727971721032e-05,
"loss": 0.0551,
"step": 7820
},
{
"epoch": 3.4132519616390584,
"grad_norm": 0.4465099275112152,
"learning_rate": 5.050705538591538e-05,
"loss": 0.0567,
"step": 7830
},
{
"epoch": 3.4176111595466434,
"grad_norm": 0.5040956735610962,
"learning_rate": 5.0396828590140785e-05,
"loss": 0.0518,
"step": 7840
},
{
"epoch": 3.4219703574542284,
"grad_norm": 0.5282771587371826,
"learning_rate": 5.0286599865630157e-05,
"loss": 0.0653,
"step": 7850
},
{
"epoch": 3.4263295553618134,
"grad_norm": 0.5432827472686768,
"learning_rate": 5.017636974813649e-05,
"loss": 0.0698,
"step": 7860
},
{
"epoch": 3.4306887532693984,
"grad_norm": 0.6134181618690491,
"learning_rate": 5.006613877341959e-05,
"loss": 0.0699,
"step": 7870
},
{
"epoch": 3.4350479511769834,
"grad_norm": 0.490448534488678,
"learning_rate": 4.99559074772434e-05,
"loss": 0.0621,
"step": 7880
},
{
"epoch": 3.4394071490845683,
"grad_norm": 0.5438141226768494,
"learning_rate": 4.9845676395373455e-05,
"loss": 0.0551,
"step": 7890
},
{
"epoch": 3.4437663469921533,
"grad_norm": 0.5387142896652222,
"learning_rate": 4.9735446063574184e-05,
"loss": 0.0668,
"step": 7900
},
{
"epoch": 3.4481255448997383,
"grad_norm": 0.5340880751609802,
"learning_rate": 4.962521701760645e-05,
"loss": 0.0617,
"step": 7910
},
{
"epoch": 3.4524847428073233,
"grad_norm": 0.6971394419670105,
"learning_rate": 4.951498979322482e-05,
"loss": 0.0806,
"step": 7920
},
{
"epoch": 3.4568439407149083,
"grad_norm": 0.5765938758850098,
"learning_rate": 4.9404764926174996e-05,
"loss": 0.0576,
"step": 7930
},
{
"epoch": 3.4612031386224933,
"grad_norm": 0.4569430351257324,
"learning_rate": 4.929454295219127e-05,
"loss": 0.0558,
"step": 7940
},
{
"epoch": 3.4655623365300783,
"grad_norm": 1.1662955284118652,
"learning_rate": 4.9184324406993844e-05,
"loss": 0.0517,
"step": 7950
},
{
"epoch": 3.4699215344376633,
"grad_norm": 0.9679137468338013,
"learning_rate": 4.907410982628623e-05,
"loss": 0.0578,
"step": 7960
},
{
"epoch": 3.4742807323452487,
"grad_norm": 0.47717949748039246,
"learning_rate": 4.896389974575273e-05,
"loss": 0.0629,
"step": 7970
},
{
"epoch": 3.4786399302528332,
"grad_norm": 1.1076433658599854,
"learning_rate": 4.885369470105571e-05,
"loss": 0.0611,
"step": 7980
},
{
"epoch": 3.4829991281604187,
"grad_norm": 0.634964644908905,
"learning_rate": 4.874349522783313e-05,
"loss": 0.06,
"step": 7990
},
{
"epoch": 3.4873583260680037,
"grad_norm": 0.566712498664856,
"learning_rate": 4.863330186169581e-05,
"loss": 0.058,
"step": 8000
},
{
"epoch": 3.4917175239755887,
"grad_norm": 0.4812932312488556,
"learning_rate": 4.8523115138224885e-05,
"loss": 0.0512,
"step": 8010
},
{
"epoch": 3.4960767218831736,
"grad_norm": 0.5662280917167664,
"learning_rate": 4.841293559296928e-05,
"loss": 0.058,
"step": 8020
},
{
"epoch": 3.5004359197907586,
"grad_norm": 0.5453780889511108,
"learning_rate": 4.830276376144295e-05,
"loss": 0.056,
"step": 8030
},
{
"epoch": 3.5047951176983436,
"grad_norm": 0.6273401975631714,
"learning_rate": 4.819260017912237e-05,
"loss": 0.0438,
"step": 8040
},
{
"epoch": 3.5091543156059286,
"grad_norm": 0.5913659930229187,
"learning_rate": 4.808244538144396e-05,
"loss": 0.0583,
"step": 8050
},
{
"epoch": 3.5135135135135136,
"grad_norm": 0.46796709299087524,
"learning_rate": 4.797229990380142e-05,
"loss": 0.0626,
"step": 8060
},
{
"epoch": 3.5178727114210986,
"grad_norm": 0.633021891117096,
"learning_rate": 4.786216428154317e-05,
"loss": 0.0468,
"step": 8070
},
{
"epoch": 3.5222319093286836,
"grad_norm": 0.8615811467170715,
"learning_rate": 4.7752039049969685e-05,
"loss": 0.0716,
"step": 8080
},
{
"epoch": 3.5265911072362686,
"grad_norm": 0.5270361304283142,
"learning_rate": 4.7641924744330956e-05,
"loss": 0.0555,
"step": 8090
},
{
"epoch": 3.5309503051438536,
"grad_norm": 0.6436976790428162,
"learning_rate": 4.7531821899823925e-05,
"loss": 0.0589,
"step": 8100
},
{
"epoch": 3.5353095030514385,
"grad_norm": 0.5558163523674011,
"learning_rate": 4.742173105158973e-05,
"loss": 0.0704,
"step": 8110
},
{
"epoch": 3.5396687009590235,
"grad_norm": 0.4449492394924164,
"learning_rate": 4.731165273471129e-05,
"loss": 0.0515,
"step": 8120
},
{
"epoch": 3.5440278988666085,
"grad_norm": 1.0276836156845093,
"learning_rate": 4.720158748421057e-05,
"loss": 0.074,
"step": 8130
},
{
"epoch": 3.5483870967741935,
"grad_norm": 0.6685786247253418,
"learning_rate": 4.709153583504602e-05,
"loss": 0.0642,
"step": 8140
},
{
"epoch": 3.5527462946817785,
"grad_norm": 0.7969890832901001,
"learning_rate": 4.6981498322110027e-05,
"loss": 0.0643,
"step": 8150
},
{
"epoch": 3.5571054925893635,
"grad_norm": 0.8791037797927856,
"learning_rate": 4.6871475480226256e-05,
"loss": 0.0726,
"step": 8160
},
{
"epoch": 3.5614646904969485,
"grad_norm": 0.6621007919311523,
"learning_rate": 4.6761467844147004e-05,
"loss": 0.0629,
"step": 8170
},
{
"epoch": 3.5658238884045335,
"grad_norm": 1.726035237312317,
"learning_rate": 4.665147594855076e-05,
"loss": 0.0568,
"step": 8180
},
{
"epoch": 3.5701830863121184,
"grad_norm": 0.7457987666130066,
"learning_rate": 4.654150032803943e-05,
"loss": 0.0599,
"step": 8190
},
{
"epoch": 3.5745422842197034,
"grad_norm": 0.4260644018650055,
"learning_rate": 4.643154151713588e-05,
"loss": 0.0534,
"step": 8200
},
{
"epoch": 3.5789014821272884,
"grad_norm": 0.5488954186439514,
"learning_rate": 4.6321600050281225e-05,
"loss": 0.0448,
"step": 8210
},
{
"epoch": 3.583260680034874,
"grad_norm": 0.7686980962753296,
"learning_rate": 4.6211676461832264e-05,
"loss": 0.055,
"step": 8220
},
{
"epoch": 3.5876198779424584,
"grad_norm": 2.9167089462280273,
"learning_rate": 4.610177128605899e-05,
"loss": 0.0673,
"step": 8230
},
{
"epoch": 3.591979075850044,
"grad_norm": 0.48421207070350647,
"learning_rate": 4.599188505714184e-05,
"loss": 0.0573,
"step": 8240
},
{
"epoch": 3.5963382737576284,
"grad_norm": 0.742896318435669,
"learning_rate": 4.588201830916912e-05,
"loss": 0.0541,
"step": 8250
},
{
"epoch": 3.600697471665214,
"grad_norm": 0.7824766635894775,
"learning_rate": 4.577217157613456e-05,
"loss": 0.0538,
"step": 8260
},
{
"epoch": 3.6050566695727984,
"grad_norm": 2.995638608932495,
"learning_rate": 4.566234539193452e-05,
"loss": 0.0522,
"step": 8270
},
{
"epoch": 3.609415867480384,
"grad_norm": 0.5708383917808533,
"learning_rate": 4.555254029036555e-05,
"loss": 0.0701,
"step": 8280
},
{
"epoch": 3.6137750653879688,
"grad_norm": 15.95019245147705,
"learning_rate": 4.544275680512165e-05,
"loss": 0.0678,
"step": 8290
},
{
"epoch": 3.6181342632955538,
"grad_norm": 0.5620335936546326,
"learning_rate": 4.5332995469791836e-05,
"loss": 0.0822,
"step": 8300
},
{
"epoch": 3.6224934612031388,
"grad_norm": 8.422737121582031,
"learning_rate": 4.522325681785744e-05,
"loss": 0.055,
"step": 8310
},
{
"epoch": 3.6268526591107237,
"grad_norm": 0.734219491481781,
"learning_rate": 4.511354138268952e-05,
"loss": 0.0472,
"step": 8320
},
{
"epoch": 3.6312118570183087,
"grad_norm": 2.058011293411255,
"learning_rate": 4.50038496975463e-05,
"loss": 0.055,
"step": 8330
},
{
"epoch": 3.6355710549258937,
"grad_norm": 0.9331876039505005,
"learning_rate": 4.489418229557063e-05,
"loss": 0.0454,
"step": 8340
},
{
"epoch": 3.6399302528334787,
"grad_norm": 0.7415900826454163,
"learning_rate": 4.478453970978722e-05,
"loss": 0.0605,
"step": 8350
},
{
"epoch": 3.6442894507410637,
"grad_norm": 0.7572534084320068,
"learning_rate": 4.4674922473100286e-05,
"loss": 0.0568,
"step": 8360
},
{
"epoch": 3.6486486486486487,
"grad_norm": 0.7530124187469482,
"learning_rate": 4.4565331118290756e-05,
"loss": 0.0499,
"step": 8370
},
{
"epoch": 3.6530078465562337,
"grad_norm": 1.116607666015625,
"learning_rate": 4.4455766178013775e-05,
"loss": 0.0596,
"step": 8380
},
{
"epoch": 3.6573670444638187,
"grad_norm": 0.5457643270492554,
"learning_rate": 4.434622818479615e-05,
"loss": 0.0648,
"step": 8390
},
{
"epoch": 3.6617262423714037,
"grad_norm": 0.6090409755706787,
"learning_rate": 4.4236717671033646e-05,
"loss": 0.0584,
"step": 8400
},
{
"epoch": 3.6660854402789886,
"grad_norm": 0.4845934212207794,
"learning_rate": 4.412723516898853e-05,
"loss": 0.0548,
"step": 8410
},
{
"epoch": 3.6704446381865736,
"grad_norm": 0.7643135786056519,
"learning_rate": 4.40177812107869e-05,
"loss": 0.0488,
"step": 8420
},
{
"epoch": 3.6748038360941586,
"grad_norm": 0.6320275068283081,
"learning_rate": 4.390835632841606e-05,
"loss": 0.0501,
"step": 8430
},
{
"epoch": 3.6791630340017436,
"grad_norm": 1.2610498666763306,
"learning_rate": 4.3798961053722115e-05,
"loss": 0.0558,
"step": 8440
},
{
"epoch": 3.6835222319093286,
"grad_norm": 0.3895055651664734,
"learning_rate": 4.368959591840718e-05,
"loss": 0.0478,
"step": 8450
},
{
"epoch": 3.6878814298169136,
"grad_norm": 0.45858561992645264,
"learning_rate": 4.3580261454026865e-05,
"loss": 0.0459,
"step": 8460
},
{
"epoch": 3.6922406277244986,
"grad_norm": 1.6598986387252808,
"learning_rate": 4.3470958191987786e-05,
"loss": 0.051,
"step": 8470
},
{
"epoch": 3.6965998256320836,
"grad_norm": 0.7669274806976318,
"learning_rate": 4.336168666354484e-05,
"loss": 0.0455,
"step": 8480
},
{
"epoch": 3.7009590235396685,
"grad_norm": 1.024935007095337,
"learning_rate": 4.325244739979873e-05,
"loss": 0.0618,
"step": 8490
},
{
"epoch": 3.7053182214472535,
"grad_norm": 0.9452399611473083,
"learning_rate": 4.314324093169332e-05,
"loss": 0.0701,
"step": 8500
},
{
"epoch": 3.709677419354839,
"grad_norm": 0.4857301414012909,
"learning_rate": 4.303406779001302e-05,
"loss": 0.054,
"step": 8510
},
{
"epoch": 3.7140366172624235,
"grad_norm": 0.5200791954994202,
"learning_rate": 4.292492850538038e-05,
"loss": 0.0461,
"step": 8520
},
{
"epoch": 3.718395815170009,
"grad_norm": 8.943376541137695,
"learning_rate": 4.28158236082533e-05,
"loss": 0.0594,
"step": 8530
},
{
"epoch": 3.7227550130775935,
"grad_norm": 0.6628287434577942,
"learning_rate": 4.270675362892256e-05,
"loss": 0.0525,
"step": 8540
},
{
"epoch": 3.727114210985179,
"grad_norm": 0.5522739291191101,
"learning_rate": 4.2597719097509246e-05,
"loss": 0.0456,
"step": 8550
},
{
"epoch": 3.7314734088927635,
"grad_norm": 0.5528436899185181,
"learning_rate": 4.2488720543962146e-05,
"loss": 0.0628,
"step": 8560
},
{
"epoch": 3.735832606800349,
"grad_norm": 0.5776464939117432,
"learning_rate": 4.23797584980552e-05,
"loss": 0.0832,
"step": 8570
},
{
"epoch": 3.740191804707934,
"grad_norm": 0.8870580196380615,
"learning_rate": 4.227083348938486e-05,
"loss": 0.0701,
"step": 8580
},
{
"epoch": 3.744551002615519,
"grad_norm": 0.5437199473381042,
"learning_rate": 4.2161946047367586e-05,
"loss": 0.055,
"step": 8590
},
{
"epoch": 3.748910200523104,
"grad_norm": 0.4483864903450012,
"learning_rate": 4.2053096701237294e-05,
"loss": 0.0482,
"step": 8600
},
{
"epoch": 3.753269398430689,
"grad_norm": 0.6559789776802063,
"learning_rate": 4.1944285980042656e-05,
"loss": 0.047,
"step": 8610
},
{
"epoch": 3.757628596338274,
"grad_norm": 0.5018060803413391,
"learning_rate": 4.183551441264469e-05,
"loss": 0.0577,
"step": 8620
},
{
"epoch": 3.761987794245859,
"grad_norm": 0.6484914422035217,
"learning_rate": 4.172678252771408e-05,
"loss": 0.0527,
"step": 8630
},
{
"epoch": 3.766346992153444,
"grad_norm": 0.6902757883071899,
"learning_rate": 4.16180908537286e-05,
"loss": 0.0597,
"step": 8640
},
{
"epoch": 3.770706190061029,
"grad_norm": 0.7869307398796082,
"learning_rate": 4.150943991897065e-05,
"loss": 0.0563,
"step": 8650
},
{
"epoch": 3.775065387968614,
"grad_norm": 0.6430687308311462,
"learning_rate": 4.1400830251524605e-05,
"loss": 0.0558,
"step": 8660
},
{
"epoch": 3.779424585876199,
"grad_norm": 0.5719813704490662,
"learning_rate": 4.1292262379274215e-05,
"loss": 0.0464,
"step": 8670
},
{
"epoch": 3.7837837837837838,
"grad_norm": 0.4877850413322449,
"learning_rate": 4.118373682990016e-05,
"loss": 0.0411,
"step": 8680
},
{
"epoch": 3.7881429816913688,
"grad_norm": 0.48677220940589905,
"learning_rate": 4.107525413087737e-05,
"loss": 0.0499,
"step": 8690
},
{
"epoch": 3.7925021795989537,
"grad_norm": 0.2930895984172821,
"learning_rate": 4.096681480947252e-05,
"loss": 0.0491,
"step": 8700
},
{
"epoch": 3.7968613775065387,
"grad_norm": 0.690412700176239,
"learning_rate": 4.085841939274146e-05,
"loss": 0.0538,
"step": 8710
},
{
"epoch": 3.8012205754141237,
"grad_norm": 0.8729444146156311,
"learning_rate": 4.075006840752662e-05,
"loss": 0.0756,
"step": 8720
},
{
"epoch": 3.8055797733217087,
"grad_norm": 0.40643206238746643,
"learning_rate": 4.0641762380454515e-05,
"loss": 0.0443,
"step": 8730
},
{
"epoch": 3.8099389712292937,
"grad_norm": 0.5322911143302917,
"learning_rate": 4.0533501837933134e-05,
"loss": 0.0514,
"step": 8740
},
{
"epoch": 3.8142981691368787,
"grad_norm": 0.47203385829925537,
"learning_rate": 4.042528730614936e-05,
"loss": 0.0549,
"step": 8750
},
{
"epoch": 3.8186573670444637,
"grad_norm": 0.5504468679428101,
"learning_rate": 4.0317119311066486e-05,
"loss": 0.0564,
"step": 8760
},
{
"epoch": 3.8230165649520487,
"grad_norm": 0.5727183818817139,
"learning_rate": 4.02089983784216e-05,
"loss": 0.0543,
"step": 8770
},
{
"epoch": 3.827375762859634,
"grad_norm": 0.6335930824279785,
"learning_rate": 4.010092503372309e-05,
"loss": 0.0659,
"step": 8780
},
{
"epoch": 3.8317349607672186,
"grad_norm": 0.5915783047676086,
"learning_rate": 3.999289980224797e-05,
"loss": 0.0499,
"step": 8790
},
{
"epoch": 3.836094158674804,
"grad_norm": 0.4834582209587097,
"learning_rate": 3.9884923209039455e-05,
"loss": 0.0393,
"step": 8800
},
{
"epoch": 3.8404533565823886,
"grad_norm": 1.361459732055664,
"learning_rate": 3.977699577890439e-05,
"loss": 0.0428,
"step": 8810
},
{
"epoch": 3.844812554489974,
"grad_norm": 1.9913321733474731,
"learning_rate": 3.96691180364106e-05,
"loss": 0.0473,
"step": 8820
},
{
"epoch": 3.8491717523975586,
"grad_norm": 0.4347272217273712,
"learning_rate": 3.956129050588446e-05,
"loss": 0.0465,
"step": 8830
},
{
"epoch": 3.853530950305144,
"grad_norm": 0.38418328762054443,
"learning_rate": 3.9453513711408275e-05,
"loss": 0.0448,
"step": 8840
},
{
"epoch": 3.857890148212729,
"grad_norm": 0.6460165977478027,
"learning_rate": 3.934578817681774e-05,
"loss": 0.0431,
"step": 8850
},
{
"epoch": 3.862249346120314,
"grad_norm": 0.5161862969398499,
"learning_rate": 3.9238114425699465e-05,
"loss": 0.0451,
"step": 8860
},
{
"epoch": 3.866608544027899,
"grad_norm": 0.6148392558097839,
"learning_rate": 3.91304929813883e-05,
"loss": 0.0453,
"step": 8870
},
{
"epoch": 3.870967741935484,
"grad_norm": 0.794267475605011,
"learning_rate": 3.902292436696489e-05,
"loss": 0.0582,
"step": 8880
},
{
"epoch": 3.875326939843069,
"grad_norm": 3.6313283443450928,
"learning_rate": 3.891540910525316e-05,
"loss": 0.0528,
"step": 8890
},
{
"epoch": 3.879686137750654,
"grad_norm": 7.387906074523926,
"learning_rate": 3.8807947718817624e-05,
"loss": 0.0534,
"step": 8900
},
{
"epoch": 3.884045335658239,
"grad_norm": 0.6750633716583252,
"learning_rate": 3.870054072996103e-05,
"loss": 0.0433,
"step": 8910
},
{
"epoch": 3.888404533565824,
"grad_norm": 0.7969902753829956,
"learning_rate": 3.859318866072168e-05,
"loss": 0.0602,
"step": 8920
},
{
"epoch": 3.892763731473409,
"grad_norm": 0.44962191581726074,
"learning_rate": 3.8485892032870965e-05,
"loss": 0.0485,
"step": 8930
},
{
"epoch": 3.897122929380994,
"grad_norm": 0.45624884963035583,
"learning_rate": 3.83786513679108e-05,
"loss": 0.0482,
"step": 8940
},
{
"epoch": 3.901482127288579,
"grad_norm": 4.285119533538818,
"learning_rate": 3.8271467187071134e-05,
"loss": 0.047,
"step": 8950
},
{
"epoch": 3.905841325196164,
"grad_norm": 0.7174338698387146,
"learning_rate": 3.816434001130732e-05,
"loss": 0.0465,
"step": 8960
},
{
"epoch": 3.910200523103749,
"grad_norm": 2.061223268508911,
"learning_rate": 3.8057270361297706e-05,
"loss": 0.044,
"step": 8970
},
{
"epoch": 3.914559721011334,
"grad_norm": 0.5447198152542114,
"learning_rate": 3.7950258757440985e-05,
"loss": 0.0481,
"step": 8980
},
{
"epoch": 3.918918918918919,
"grad_norm": 0.40096163749694824,
"learning_rate": 3.78433057198538e-05,
"loss": 0.0627,
"step": 8990
},
{
"epoch": 3.923278116826504,
"grad_norm": 0.5488551259040833,
"learning_rate": 3.773641176836807e-05,
"loss": 0.0513,
"step": 9000
},
{
"epoch": 3.927637314734089,
"grad_norm": 0.5273767709732056,
"learning_rate": 3.7629577422528555e-05,
"loss": 0.0628,
"step": 9010
},
{
"epoch": 3.931996512641674,
"grad_norm": 1.203155755996704,
"learning_rate": 3.7522803201590325e-05,
"loss": 0.043,
"step": 9020
},
{
"epoch": 3.936355710549259,
"grad_norm": 0.4476049244403839,
"learning_rate": 3.741608962451621e-05,
"loss": 0.0557,
"step": 9030
},
{
"epoch": 3.940714908456844,
"grad_norm": 0.6361905336380005,
"learning_rate": 3.730943720997427e-05,
"loss": 0.0533,
"step": 9040
},
{
"epoch": 3.945074106364429,
"grad_norm": 0.361251562833786,
"learning_rate": 3.720284647633532e-05,
"loss": 0.0415,
"step": 9050
},
{
"epoch": 3.949433304272014,
"grad_norm": 1.1388883590698242,
"learning_rate": 3.7096317941670365e-05,
"loss": 0.0549,
"step": 9060
},
{
"epoch": 3.953792502179599,
"grad_norm": 0.37069207429885864,
"learning_rate": 3.698985212374814e-05,
"loss": 0.0386,
"step": 9070
},
{
"epoch": 3.9581517000871838,
"grad_norm": 0.4507943391799927,
"learning_rate": 3.6883449540032477e-05,
"loss": 0.0491,
"step": 9080
},
{
"epoch": 3.962510897994769,
"grad_norm": 0.32320085167884827,
"learning_rate": 3.6777110707679905e-05,
"loss": 0.0499,
"step": 9090
},
{
"epoch": 3.9668700959023537,
"grad_norm": 1.666203260421753,
"learning_rate": 3.667083614353715e-05,
"loss": 0.0551,
"step": 9100
},
{
"epoch": 3.971229293809939,
"grad_norm": 0.42759859561920166,
"learning_rate": 3.6564626364138465e-05,
"loss": 0.0533,
"step": 9110
},
{
"epoch": 3.9755884917175237,
"grad_norm": 0.539943516254425,
"learning_rate": 3.645848188570331e-05,
"loss": 0.0514,
"step": 9120
},
{
"epoch": 3.979947689625109,
"grad_norm": 0.7529284954071045,
"learning_rate": 3.635240322413374e-05,
"loss": 0.052,
"step": 9130
},
{
"epoch": 3.984306887532694,
"grad_norm": 0.6702316999435425,
"learning_rate": 3.624639089501187e-05,
"loss": 0.0387,
"step": 9140
},
{
"epoch": 3.988666085440279,
"grad_norm": 0.5268199443817139,
"learning_rate": 3.614044541359749e-05,
"loss": 0.0581,
"step": 9150
},
{
"epoch": 3.993025283347864,
"grad_norm": 1.170999526977539,
"learning_rate": 3.603456729482541e-05,
"loss": 0.0461,
"step": 9160
},
{
"epoch": 3.997384481255449,
"grad_norm": 0.3432202935218811,
"learning_rate": 3.5928757053303055e-05,
"loss": 0.0492,
"step": 9170
},
{
"epoch": 4.001743679163034,
"grad_norm": 0.6855699419975281,
"learning_rate": 3.5823015203308e-05,
"loss": 0.0454,
"step": 9180
},
{
"epoch": 4.006102877070619,
"grad_norm": 0.5655994415283203,
"learning_rate": 3.57173422587853e-05,
"loss": 0.0381,
"step": 9190
},
{
"epoch": 4.010462074978204,
"grad_norm": 0.46553653478622437,
"learning_rate": 3.561173873334522e-05,
"loss": 0.056,
"step": 9200
},
{
"epoch": 4.014821272885789,
"grad_norm": 0.43640783429145813,
"learning_rate": 3.550620514026056e-05,
"loss": 0.0501,
"step": 9210
},
{
"epoch": 4.019180470793374,
"grad_norm": 0.3680025041103363,
"learning_rate": 3.54007419924642e-05,
"loss": 0.0459,
"step": 9220
},
{
"epoch": 4.023539668700959,
"grad_norm": 1.250110149383545,
"learning_rate": 3.52953498025467e-05,
"loss": 0.0448,
"step": 9230
},
{
"epoch": 4.0278988666085445,
"grad_norm": 0.4605693221092224,
"learning_rate": 3.519002908275368e-05,
"loss": 0.0514,
"step": 9240
},
{
"epoch": 4.032258064516129,
"grad_norm": 0.7519360780715942,
"learning_rate": 3.508478034498339e-05,
"loss": 0.0384,
"step": 9250
},
{
"epoch": 4.036617262423714,
"grad_norm": 0.43192410469055176,
"learning_rate": 3.497960410078427e-05,
"loss": 0.044,
"step": 9260
},
{
"epoch": 4.040976460331299,
"grad_norm": 1.0785924196243286,
"learning_rate": 3.487450086135236e-05,
"loss": 0.0537,
"step": 9270
},
{
"epoch": 4.045335658238884,
"grad_norm": 0.6490569114685059,
"learning_rate": 3.476947113752891e-05,
"loss": 0.049,
"step": 9280
},
{
"epoch": 4.049694856146469,
"grad_norm": 0.8670549988746643,
"learning_rate": 3.4664515439797823e-05,
"loss": 0.0504,
"step": 9290
},
{
"epoch": 4.054054054054054,
"grad_norm": 0.39166024327278137,
"learning_rate": 3.45596342782832e-05,
"loss": 0.054,
"step": 9300
},
{
"epoch": 4.058413251961639,
"grad_norm": 0.5605502128601074,
"learning_rate": 3.4454828162746936e-05,
"loss": 0.047,
"step": 9310
},
{
"epoch": 4.062772449869224,
"grad_norm": 0.33376985788345337,
"learning_rate": 3.435009760258608e-05,
"loss": 0.064,
"step": 9320
},
{
"epoch": 4.067131647776809,
"grad_norm": 1.2488508224487305,
"learning_rate": 3.424544310683057e-05,
"loss": 0.0435,
"step": 9330
},
{
"epoch": 4.071490845684394,
"grad_norm": 0.9323531985282898,
"learning_rate": 3.41408651841405e-05,
"loss": 0.0489,
"step": 9340
},
{
"epoch": 4.075850043591979,
"grad_norm": 0.3177700340747833,
"learning_rate": 3.403636434280388e-05,
"loss": 0.0466,
"step": 9350
},
{
"epoch": 4.080209241499564,
"grad_norm": 0.5773137807846069,
"learning_rate": 3.393194109073411e-05,
"loss": 0.0456,
"step": 9360
},
{
"epoch": 4.084568439407149,
"grad_norm": 1.1703490018844604,
"learning_rate": 3.3827595935467376e-05,
"loss": 0.0647,
"step": 9370
},
{
"epoch": 4.088927637314734,
"grad_norm": 0.34152188897132874,
"learning_rate": 3.3723329384160344e-05,
"loss": 0.0459,
"step": 9380
},
{
"epoch": 4.093286835222319,
"grad_norm": 0.45289939641952515,
"learning_rate": 3.3619141943587646e-05,
"loss": 0.0457,
"step": 9390
},
{
"epoch": 4.097646033129904,
"grad_norm": 0.6022029519081116,
"learning_rate": 3.351503412013935e-05,
"loss": 0.0448,
"step": 9400
},
{
"epoch": 4.102005231037489,
"grad_norm": 0.4732722342014313,
"learning_rate": 3.341100641981863e-05,
"loss": 0.0612,
"step": 9410
},
{
"epoch": 4.106364428945074,
"grad_norm": 0.4304181933403015,
"learning_rate": 3.330705934823919e-05,
"loss": 0.0373,
"step": 9420
},
{
"epoch": 4.110723626852659,
"grad_norm": 0.37233036756515503,
"learning_rate": 3.3203193410622804e-05,
"loss": 0.0435,
"step": 9430
},
{
"epoch": 4.115082824760244,
"grad_norm": 0.3829936683177948,
"learning_rate": 3.309940911179701e-05,
"loss": 0.0474,
"step": 9440
},
{
"epoch": 4.119442022667829,
"grad_norm": 0.5989209413528442,
"learning_rate": 3.2995706956192465e-05,
"loss": 0.0397,
"step": 9450
},
{
"epoch": 4.123801220575414,
"grad_norm": 0.47757670283317566,
"learning_rate": 3.289208744784059e-05,
"loss": 0.0401,
"step": 9460
},
{
"epoch": 4.128160418482999,
"grad_norm": 0.8434343338012695,
"learning_rate": 3.2788551090371164e-05,
"loss": 0.0575,
"step": 9470
},
{
"epoch": 4.132519616390584,
"grad_norm": 0.693182110786438,
"learning_rate": 3.268509838700974e-05,
"loss": 0.0429,
"step": 9480
},
{
"epoch": 4.136878814298169,
"grad_norm": 0.5965169668197632,
"learning_rate": 3.258172984057535e-05,
"loss": 0.0432,
"step": 9490
},
{
"epoch": 4.141238012205754,
"grad_norm": 0.7130163311958313,
"learning_rate": 3.247844595347798e-05,
"loss": 0.0462,
"step": 9500
},
{
"epoch": 4.145597210113339,
"grad_norm": 0.4185994565486908,
"learning_rate": 3.2375247227716077e-05,
"loss": 0.0473,
"step": 9510
},
{
"epoch": 4.149956408020924,
"grad_norm": 0.3187268376350403,
"learning_rate": 3.2272134164874264e-05,
"loss": 0.0418,
"step": 9520
},
{
"epoch": 4.15431560592851,
"grad_norm": 0.3435496985912323,
"learning_rate": 3.216910726612073e-05,
"loss": 0.0472,
"step": 9530
},
{
"epoch": 4.158674803836094,
"grad_norm": 0.7399494051933289,
"learning_rate": 3.2066167032204956e-05,
"loss": 0.0512,
"step": 9540
},
{
"epoch": 4.1630340017436795,
"grad_norm": 0.3519555926322937,
"learning_rate": 3.196331396345512e-05,
"loss": 0.0412,
"step": 9550
},
{
"epoch": 4.167393199651264,
"grad_norm": 0.33296439051628113,
"learning_rate": 3.186054855977577e-05,
"loss": 0.0419,
"step": 9560
},
{
"epoch": 4.1717523975588495,
"grad_norm": 0.8820390701293945,
"learning_rate": 3.175787132064542e-05,
"loss": 0.0541,
"step": 9570
},
{
"epoch": 4.176111595466434,
"grad_norm": 0.4657217860221863,
"learning_rate": 3.165528274511397e-05,
"loss": 0.0496,
"step": 9580
},
{
"epoch": 4.1804707933740195,
"grad_norm": 0.8743098974227905,
"learning_rate": 3.155278333180047e-05,
"loss": 0.0475,
"step": 9590
},
{
"epoch": 4.184829991281604,
"grad_norm": 0.4864312410354614,
"learning_rate": 3.14503735788906e-05,
"loss": 0.0442,
"step": 9600
},
{
"epoch": 4.1891891891891895,
"grad_norm": 0.3236112892627716,
"learning_rate": 3.134805398413419e-05,
"loss": 0.0395,
"step": 9610
},
{
"epoch": 4.193548387096774,
"grad_norm": 0.4888274669647217,
"learning_rate": 3.1245825044842954e-05,
"loss": 0.0617,
"step": 9620
},
{
"epoch": 4.1979075850043595,
"grad_norm": 0.4416561722755432,
"learning_rate": 3.114368725788791e-05,
"loss": 0.0381,
"step": 9630
},
{
"epoch": 4.202266782911944,
"grad_norm": 0.4820091724395752,
"learning_rate": 3.1041641119697075e-05,
"loss": 0.0398,
"step": 9640
},
{
"epoch": 4.206625980819529,
"grad_norm": 0.6689565181732178,
"learning_rate": 3.093968712625306e-05,
"loss": 0.0466,
"step": 9650
},
{
"epoch": 4.210985178727114,
"grad_norm": 0.627013623714447,
"learning_rate": 3.0837825773090535e-05,
"loss": 0.044,
"step": 9660
},
{
"epoch": 4.215344376634699,
"grad_norm": 0.7483475804328918,
"learning_rate": 3.073605755529395e-05,
"loss": 0.0453,
"step": 9670
},
{
"epoch": 4.219703574542284,
"grad_norm": 0.7025408148765564,
"learning_rate": 3.063438296749511e-05,
"loss": 0.0443,
"step": 9680
},
{
"epoch": 4.224062772449869,
"grad_norm": 0.30035483837127686,
"learning_rate": 3.053280250387067e-05,
"loss": 0.0405,
"step": 9690
},
{
"epoch": 4.228421970357454,
"grad_norm": 0.7182040810585022,
"learning_rate": 3.043131665813988e-05,
"loss": 0.0364,
"step": 9700
},
{
"epoch": 4.232781168265039,
"grad_norm": 0.5624287724494934,
"learning_rate": 3.0329925923562073e-05,
"loss": 0.0459,
"step": 9710
},
{
"epoch": 4.237140366172624,
"grad_norm": 0.7536492347717285,
"learning_rate": 3.0228630792934277e-05,
"loss": 0.0462,
"step": 9720
},
{
"epoch": 4.241499564080209,
"grad_norm": 0.5312374234199524,
"learning_rate": 3.0127431758588918e-05,
"loss": 0.059,
"step": 9730
},
{
"epoch": 4.245858761987794,
"grad_norm": 0.2901996672153473,
"learning_rate": 3.002632931239133e-05,
"loss": 0.0353,
"step": 9740
},
{
"epoch": 4.250217959895379,
"grad_norm": 0.7586061954498291,
"learning_rate": 2.992532394573735e-05,
"loss": 0.0384,
"step": 9750
},
{
"epoch": 4.254577157802964,
"grad_norm": 0.9470781087875366,
"learning_rate": 2.982441614955105e-05,
"loss": 0.0458,
"step": 9760
},
{
"epoch": 4.258936355710549,
"grad_norm": 0.4688419997692108,
"learning_rate": 2.972360641428218e-05,
"loss": 0.0422,
"step": 9770
},
{
"epoch": 4.263295553618134,
"grad_norm": 1.2093769311904907,
"learning_rate": 2.9622895229903973e-05,
"loss": 0.043,
"step": 9780
},
{
"epoch": 4.267654751525719,
"grad_norm": 0.4565201699733734,
"learning_rate": 2.9522283085910612e-05,
"loss": 0.0442,
"step": 9790
},
{
"epoch": 4.272013949433305,
"grad_norm": 0.7303440570831299,
"learning_rate": 2.942177047131489e-05,
"loss": 0.0406,
"step": 9800
},
{
"epoch": 4.276373147340889,
"grad_norm": 0.8189641237258911,
"learning_rate": 2.9321357874645905e-05,
"loss": 0.06,
"step": 9810
},
{
"epoch": 4.280732345248475,
"grad_norm": 0.3686642348766327,
"learning_rate": 2.9221045783946577e-05,
"loss": 0.0512,
"step": 9820
},
{
"epoch": 4.285091543156059,
"grad_norm": 1.4388134479522705,
"learning_rate": 2.9120834686771394e-05,
"loss": 0.0465,
"step": 9830
},
{
"epoch": 4.289450741063645,
"grad_norm": 0.3605181574821472,
"learning_rate": 2.902072507018392e-05,
"loss": 0.0615,
"step": 9840
},
{
"epoch": 4.293809938971229,
"grad_norm": 0.8981125354766846,
"learning_rate": 2.892071742075446e-05,
"loss": 0.0469,
"step": 9850
},
{
"epoch": 4.298169136878815,
"grad_norm": 0.336178183555603,
"learning_rate": 2.8820812224557812e-05,
"loss": 0.0364,
"step": 9860
},
{
"epoch": 4.302528334786399,
"grad_norm": 0.26270031929016113,
"learning_rate": 2.8721009967170764e-05,
"loss": 0.0456,
"step": 9870
},
{
"epoch": 4.306887532693985,
"grad_norm": 0.5637022256851196,
"learning_rate": 2.8621311133669748e-05,
"loss": 0.057,
"step": 9880
},
{
"epoch": 4.311246730601569,
"grad_norm": 0.2882184088230133,
"learning_rate": 2.8521716208628595e-05,
"loss": 0.0381,
"step": 9890
},
{
"epoch": 4.315605928509155,
"grad_norm": 0.5276246070861816,
"learning_rate": 2.8422225676116015e-05,
"loss": 0.0402,
"step": 9900
},
{
"epoch": 4.319965126416739,
"grad_norm": 0.37984946370124817,
"learning_rate": 2.832284001969342e-05,
"loss": 0.0442,
"step": 9910
},
{
"epoch": 4.324324324324325,
"grad_norm": 0.600214421749115,
"learning_rate": 2.8223559722412408e-05,
"loss": 0.0531,
"step": 9920
},
{
"epoch": 4.328683522231909,
"grad_norm": 0.2663778066635132,
"learning_rate": 2.8124385266812516e-05,
"loss": 0.037,
"step": 9930
},
{
"epoch": 4.3330427201394945,
"grad_norm": 0.3363720178604126,
"learning_rate": 2.802531713491886e-05,
"loss": 0.0467,
"step": 9940
},
{
"epoch": 4.337401918047079,
"grad_norm": 1.2489317655563354,
"learning_rate": 2.7926355808239822e-05,
"loss": 0.0372,
"step": 9950
},
{
"epoch": 4.3417611159546645,
"grad_norm": 0.7166866660118103,
"learning_rate": 2.782750176776458e-05,
"loss": 0.0416,
"step": 9960
},
{
"epoch": 4.346120313862249,
"grad_norm": 0.468319296836853,
"learning_rate": 2.7728755493960946e-05,
"loss": 0.0416,
"step": 9970
},
{
"epoch": 4.3504795117698345,
"grad_norm": 0.6125366687774658,
"learning_rate": 2.7630117466772876e-05,
"loss": 0.0438,
"step": 9980
},
{
"epoch": 4.354838709677419,
"grad_norm": 0.600077748298645,
"learning_rate": 2.7531588165618278e-05,
"loss": 0.0431,
"step": 9990
},
{
"epoch": 4.3591979075850045,
"grad_norm": 0.8532693386077881,
"learning_rate": 2.7433168069386533e-05,
"loss": 0.0435,
"step": 10000
},
{
"epoch": 4.363557105492589,
"grad_norm": 0.5452614426612854,
"learning_rate": 2.7334857656436308e-05,
"loss": 0.0438,
"step": 10010
},
{
"epoch": 4.3679163034001744,
"grad_norm": 0.5705096125602722,
"learning_rate": 2.7236657404593157e-05,
"loss": 0.0465,
"step": 10020
},
{
"epoch": 4.372275501307759,
"grad_norm": 0.39495906233787537,
"learning_rate": 2.713856779114716e-05,
"loss": 0.0378,
"step": 10030
},
{
"epoch": 4.376634699215344,
"grad_norm": 0.5436052083969116,
"learning_rate": 2.704058929285074e-05,
"loss": 0.0444,
"step": 10040
},
{
"epoch": 4.380993897122929,
"grad_norm": 0.5021957755088806,
"learning_rate": 2.6942722385916175e-05,
"loss": 0.0427,
"step": 10050
},
{
"epoch": 4.385353095030514,
"grad_norm": 0.48665133118629456,
"learning_rate": 2.6844967546013394e-05,
"loss": 0.0448,
"step": 10060
},
{
"epoch": 4.3897122929381,
"grad_norm": 0.43248504400253296,
"learning_rate": 2.6747325248267673e-05,
"loss": 0.0313,
"step": 10070
},
{
"epoch": 4.394071490845684,
"grad_norm": 0.5743213295936584,
"learning_rate": 2.664979596725724e-05,
"loss": 0.0406,
"step": 10080
},
{
"epoch": 4.39843068875327,
"grad_norm": 0.6996235251426697,
"learning_rate": 2.655238017701105e-05,
"loss": 0.0445,
"step": 10090
},
{
"epoch": 4.402789886660854,
"grad_norm": 0.2858131229877472,
"learning_rate": 2.6455078351006455e-05,
"loss": 0.0446,
"step": 10100
},
{
"epoch": 4.40714908456844,
"grad_norm": 0.40710970759391785,
"learning_rate": 2.6357890962166866e-05,
"loss": 0.0337,
"step": 10110
},
{
"epoch": 4.411508282476024,
"grad_norm": 0.37843987345695496,
"learning_rate": 2.6260818482859534e-05,
"loss": 0.0404,
"step": 10120
},
{
"epoch": 4.41586748038361,
"grad_norm": 0.6559879183769226,
"learning_rate": 2.6163861384893156e-05,
"loss": 0.0457,
"step": 10130
},
{
"epoch": 4.420226678291194,
"grad_norm": 0.5004203915596008,
"learning_rate": 2.606702013951564e-05,
"loss": 0.0401,
"step": 10140
},
{
"epoch": 4.42458587619878,
"grad_norm": 0.6248252391815186,
"learning_rate": 2.5970295217411844e-05,
"loss": 0.0483,
"step": 10150
},
{
"epoch": 4.428945074106364,
"grad_norm": 0.5556444525718689,
"learning_rate": 2.5873687088701236e-05,
"loss": 0.0532,
"step": 10160
},
{
"epoch": 4.43330427201395,
"grad_norm": 0.2590063214302063,
"learning_rate": 2.5777196222935596e-05,
"loss": 0.0404,
"step": 10170
},
{
"epoch": 4.437663469921534,
"grad_norm": 1.113957166671753,
"learning_rate": 2.5680823089096807e-05,
"loss": 0.0517,
"step": 10180
},
{
"epoch": 4.44202266782912,
"grad_norm": 0.5693178772926331,
"learning_rate": 2.558456815559448e-05,
"loss": 0.0607,
"step": 10190
},
{
"epoch": 4.446381865736704,
"grad_norm": 0.5899903774261475,
"learning_rate": 2.548843189026378e-05,
"loss": 0.0371,
"step": 10200
},
{
"epoch": 4.45074106364429,
"grad_norm": 0.5805737972259521,
"learning_rate": 2.5392414760363048e-05,
"loss": 0.0452,
"step": 10210
},
{
"epoch": 4.455100261551874,
"grad_norm": 0.5417248606681824,
"learning_rate": 2.529651723257162e-05,
"loss": 0.0408,
"step": 10220
},
{
"epoch": 4.45945945945946,
"grad_norm": 0.5996112823486328,
"learning_rate": 2.5200739772987537e-05,
"loss": 0.0363,
"step": 10230
},
{
"epoch": 4.463818657367044,
"grad_norm": 1.2667959928512573,
"learning_rate": 2.5105082847125184e-05,
"loss": 0.0493,
"step": 10240
},
{
"epoch": 4.46817785527463,
"grad_norm": 0.7686100006103516,
"learning_rate": 2.5009546919913218e-05,
"loss": 0.0465,
"step": 10250
},
{
"epoch": 4.472537053182214,
"grad_norm": 0.509890079498291,
"learning_rate": 2.4914132455692098e-05,
"loss": 0.0418,
"step": 10260
},
{
"epoch": 4.4768962510898,
"grad_norm": 0.4904661476612091,
"learning_rate": 2.4818839918211962e-05,
"loss": 0.0433,
"step": 10270
},
{
"epoch": 4.481255448997384,
"grad_norm": 0.4387027621269226,
"learning_rate": 2.4723669770630376e-05,
"loss": 0.0459,
"step": 10280
},
{
"epoch": 4.48561464690497,
"grad_norm": 0.7030172944068909,
"learning_rate": 2.4628622475509972e-05,
"loss": 0.0476,
"step": 10290
},
{
"epoch": 4.489973844812554,
"grad_norm": 0.9251930117607117,
"learning_rate": 2.4533698494816342e-05,
"loss": 0.0427,
"step": 10300
},
{
"epoch": 4.49433304272014,
"grad_norm": 0.49212366342544556,
"learning_rate": 2.44388982899157e-05,
"loss": 0.0353,
"step": 10310
},
{
"epoch": 4.498692240627724,
"grad_norm": 0.4423346519470215,
"learning_rate": 2.4344222321572636e-05,
"loss": 0.0425,
"step": 10320
},
{
"epoch": 4.5030514385353095,
"grad_norm": 0.4786710739135742,
"learning_rate": 2.4249671049947954e-05,
"loss": 0.0409,
"step": 10330
},
{
"epoch": 4.507410636442895,
"grad_norm": 0.6801364421844482,
"learning_rate": 2.4155244934596333e-05,
"loss": 0.0483,
"step": 10340
},
{
"epoch": 4.5117698343504795,
"grad_norm": 0.39631038904190063,
"learning_rate": 2.406094443446416e-05,
"loss": 0.0497,
"step": 10350
},
{
"epoch": 4.516129032258064,
"grad_norm": 1.449474811553955,
"learning_rate": 2.3966770007887317e-05,
"loss": 0.0377,
"step": 10360
},
{
"epoch": 4.5204882301656495,
"grad_norm": 0.757426381111145,
"learning_rate": 2.3872722112588903e-05,
"loss": 0.0356,
"step": 10370
},
{
"epoch": 4.524847428073235,
"grad_norm": 0.35006773471832275,
"learning_rate": 2.3778801205676997e-05,
"loss": 0.0426,
"step": 10380
},
{
"epoch": 4.5292066259808195,
"grad_norm": 0.3051522374153137,
"learning_rate": 2.3685007743642524e-05,
"loss": 0.0389,
"step": 10390
},
{
"epoch": 4.533565823888405,
"grad_norm": 0.4886200726032257,
"learning_rate": 2.3591342182356914e-05,
"loss": 0.0479,
"step": 10400
},
{
"epoch": 4.5379250217959894,
"grad_norm": 1.0046757459640503,
"learning_rate": 2.3497804977070016e-05,
"loss": 0.0406,
"step": 10410
},
{
"epoch": 4.542284219703575,
"grad_norm": 0.5701280236244202,
"learning_rate": 2.3404396582407777e-05,
"loss": 0.0407,
"step": 10420
},
{
"epoch": 4.546643417611159,
"grad_norm": 0.48940399289131165,
"learning_rate": 2.331111745237007e-05,
"loss": 0.031,
"step": 10430
},
{
"epoch": 4.551002615518745,
"grad_norm": 0.43257051706314087,
"learning_rate": 2.3217968040328526e-05,
"loss": 0.0361,
"step": 10440
},
{
"epoch": 4.555361813426329,
"grad_norm": 0.5097408294677734,
"learning_rate": 2.3124948799024286e-05,
"loss": 0.0457,
"step": 10450
},
{
"epoch": 4.559721011333915,
"grad_norm": 0.7661223411560059,
"learning_rate": 2.3032060180565828e-05,
"loss": 0.0419,
"step": 10460
},
{
"epoch": 4.564080209241499,
"grad_norm": 0.6379186511039734,
"learning_rate": 2.2939302636426724e-05,
"loss": 0.0355,
"step": 10470
},
{
"epoch": 4.568439407149085,
"grad_norm": 0.4069131910800934,
"learning_rate": 2.2846676617443458e-05,
"loss": 0.0355,
"step": 10480
},
{
"epoch": 4.572798605056669,
"grad_norm": 0.6078165173530579,
"learning_rate": 2.275418257381332e-05,
"loss": 0.0332,
"step": 10490
},
{
"epoch": 4.577157802964255,
"grad_norm": 0.3824521601200104,
"learning_rate": 2.2661820955092083e-05,
"loss": 0.0327,
"step": 10500
},
{
"epoch": 4.581517000871839,
"grad_norm": 0.5751760601997375,
"learning_rate": 2.256959221019193e-05,
"loss": 0.0447,
"step": 10510
},
{
"epoch": 4.585876198779425,
"grad_norm": 0.6389051079750061,
"learning_rate": 2.2477496787379227e-05,
"loss": 0.0282,
"step": 10520
},
{
"epoch": 4.590235396687009,
"grad_norm": 0.3855462372303009,
"learning_rate": 2.238553513427229e-05,
"loss": 0.043,
"step": 10530
},
{
"epoch": 4.594594594594595,
"grad_norm": 0.4871591031551361,
"learning_rate": 2.2293707697839344e-05,
"loss": 0.0347,
"step": 10540
},
{
"epoch": 4.598953792502179,
"grad_norm": 0.4838610291481018,
"learning_rate": 2.2202014924396214e-05,
"loss": 0.0282,
"step": 10550
},
{
"epoch": 4.603312990409765,
"grad_norm": 0.477239727973938,
"learning_rate": 2.21104572596042e-05,
"loss": 0.0409,
"step": 10560
},
{
"epoch": 4.607672188317349,
"grad_norm": 0.5358338356018066,
"learning_rate": 2.2019035148468e-05,
"loss": 0.0315,
"step": 10570
},
{
"epoch": 4.612031386224935,
"grad_norm": 2.4292683601379395,
"learning_rate": 2.1927749035333374e-05,
"loss": 0.0449,
"step": 10580
},
{
"epoch": 4.616390584132519,
"grad_norm": 0.7061522603034973,
"learning_rate": 2.1836599363885152e-05,
"loss": 0.0497,
"step": 10590
},
{
"epoch": 4.620749782040105,
"grad_norm": 0.3709982633590698,
"learning_rate": 2.1745586577144993e-05,
"loss": 0.0511,
"step": 10600
},
{
"epoch": 4.62510897994769,
"grad_norm": 1.73948073387146,
"learning_rate": 2.1654711117469207e-05,
"loss": 0.0544,
"step": 10610
},
{
"epoch": 4.629468177855275,
"grad_norm": 0.4604334235191345,
"learning_rate": 2.1563973426546702e-05,
"loss": 0.0536,
"step": 10620
},
{
"epoch": 4.633827375762859,
"grad_norm": 0.3564910888671875,
"learning_rate": 2.1473373945396728e-05,
"loss": 0.0338,
"step": 10630
},
{
"epoch": 4.638186573670445,
"grad_norm": 0.3538748621940613,
"learning_rate": 2.138291311436679e-05,
"loss": 0.0441,
"step": 10640
},
{
"epoch": 4.64254577157803,
"grad_norm": 0.520347535610199,
"learning_rate": 2.1292591373130518e-05,
"loss": 0.0448,
"step": 10650
},
{
"epoch": 4.646904969485615,
"grad_norm": 0.4080093502998352,
"learning_rate": 2.1202409160685528e-05,
"loss": 0.0491,
"step": 10660
},
{
"epoch": 4.6512641673932,
"grad_norm": 0.9783157706260681,
"learning_rate": 2.1112366915351228e-05,
"loss": 0.0511,
"step": 10670
},
{
"epoch": 4.655623365300785,
"grad_norm": 0.809532880783081,
"learning_rate": 2.102246507476679e-05,
"loss": 0.0323,
"step": 10680
},
{
"epoch": 4.65998256320837,
"grad_norm": 0.3239237666130066,
"learning_rate": 2.09327040758889e-05,
"loss": 0.0359,
"step": 10690
},
{
"epoch": 4.6643417611159546,
"grad_norm": 0.6846075057983398,
"learning_rate": 2.0843084354989767e-05,
"loss": 0.0662,
"step": 10700
},
{
"epoch": 4.66870095902354,
"grad_norm": 0.6443043351173401,
"learning_rate": 2.0753606347654892e-05,
"loss": 0.0402,
"step": 10710
},
{
"epoch": 4.6730601569311245,
"grad_norm": 0.6246269941329956,
"learning_rate": 2.0664270488780985e-05,
"loss": 0.0366,
"step": 10720
},
{
"epoch": 4.67741935483871,
"grad_norm": 0.4732634127140045,
"learning_rate": 2.0575077212573905e-05,
"loss": 0.0357,
"step": 10730
},
{
"epoch": 4.6817785527462945,
"grad_norm": 0.579984188079834,
"learning_rate": 2.0486026952546484e-05,
"loss": 0.031,
"step": 10740
},
{
"epoch": 4.68613775065388,
"grad_norm": 0.9379808902740479,
"learning_rate": 2.0397120141516457e-05,
"loss": 0.0402,
"step": 10750
},
{
"epoch": 4.6904969485614645,
"grad_norm": 0.3776817321777344,
"learning_rate": 2.0308357211604313e-05,
"loss": 0.0386,
"step": 10760
},
{
"epoch": 4.69485614646905,
"grad_norm": 1.3237546682357788,
"learning_rate": 2.0219738594231224e-05,
"loss": 0.0427,
"step": 10770
},
{
"epoch": 4.6992153443766345,
"grad_norm": 0.5787206888198853,
"learning_rate": 2.0131264720116993e-05,
"loss": 0.0481,
"step": 10780
},
{
"epoch": 4.70357454228422,
"grad_norm": 0.7264513373374939,
"learning_rate": 2.0042936019277853e-05,
"loss": 0.0367,
"step": 10790
},
{
"epoch": 4.707933740191804,
"grad_norm": 0.655706524848938,
"learning_rate": 1.99547529210245e-05,
"loss": 0.0495,
"step": 10800
},
{
"epoch": 4.71229293809939,
"grad_norm": 0.4518389403820038,
"learning_rate": 1.9866715853959934e-05,
"loss": 0.0332,
"step": 10810
},
{
"epoch": 4.716652136006974,
"grad_norm": 0.4472216069698334,
"learning_rate": 1.977882524597734e-05,
"loss": 0.0413,
"step": 10820
},
{
"epoch": 4.72101133391456,
"grad_norm": 0.4006964862346649,
"learning_rate": 1.969108152425813e-05,
"loss": 0.0359,
"step": 10830
},
{
"epoch": 4.725370531822144,
"grad_norm": 0.30864083766937256,
"learning_rate": 1.9603485115269744e-05,
"loss": 0.048,
"step": 10840
},
{
"epoch": 4.72972972972973,
"grad_norm": 0.9182401895523071,
"learning_rate": 1.9516036444763613e-05,
"loss": 0.0425,
"step": 10850
},
{
"epoch": 4.734088927637314,
"grad_norm": 0.5361258387565613,
"learning_rate": 1.9428735937773173e-05,
"loss": 0.0297,
"step": 10860
},
{
"epoch": 4.7384481255449,
"grad_norm": 0.39075401425361633,
"learning_rate": 1.9341584018611646e-05,
"loss": 0.0348,
"step": 10870
},
{
"epoch": 4.742807323452485,
"grad_norm": 0.28990718722343445,
"learning_rate": 1.9254581110870123e-05,
"loss": 0.046,
"step": 10880
},
{
"epoch": 4.74716652136007,
"grad_norm": 0.3305884003639221,
"learning_rate": 1.916772763741544e-05,
"loss": 0.04,
"step": 10890
},
{
"epoch": 4.751525719267654,
"grad_norm": 0.5273476839065552,
"learning_rate": 1.908102402038807e-05,
"loss": 0.0477,
"step": 10900
},
{
"epoch": 4.75588491717524,
"grad_norm": 0.6094452738761902,
"learning_rate": 1.8994470681200204e-05,
"loss": 0.0311,
"step": 10910
},
{
"epoch": 4.760244115082825,
"grad_norm": 0.4191102385520935,
"learning_rate": 1.8908068040533578e-05,
"loss": 0.0368,
"step": 10920
},
{
"epoch": 4.76460331299041,
"grad_norm": 1.5498143434524536,
"learning_rate": 1.8821816518337455e-05,
"loss": 0.0454,
"step": 10930
},
{
"epoch": 4.768962510897994,
"grad_norm": 0.4996424615383148,
"learning_rate": 1.8735716533826663e-05,
"loss": 0.0414,
"step": 10940
},
{
"epoch": 4.77332170880558,
"grad_norm": 0.5708060264587402,
"learning_rate": 1.8649768505479476e-05,
"loss": 0.0416,
"step": 10950
},
{
"epoch": 4.777680906713165,
"grad_norm": 0.5143120884895325,
"learning_rate": 1.8563972851035616e-05,
"loss": 0.0317,
"step": 10960
},
{
"epoch": 4.78204010462075,
"grad_norm": 0.345091849565506,
"learning_rate": 1.847832998749418e-05,
"loss": 0.0326,
"step": 10970
},
{
"epoch": 4.786399302528335,
"grad_norm": 0.2509484887123108,
"learning_rate": 1.8392840331111644e-05,
"loss": 0.0326,
"step": 10980
},
{
"epoch": 4.79075850043592,
"grad_norm": 0.5557523369789124,
"learning_rate": 1.830750429739989e-05,
"loss": 0.0354,
"step": 10990
},
{
"epoch": 4.795117698343505,
"grad_norm": 0.4494255483150482,
"learning_rate": 1.822232230112409e-05,
"loss": 0.0312,
"step": 11000
},
{
"epoch": 4.79947689625109,
"grad_norm": 0.706129789352417,
"learning_rate": 1.813729475630071e-05,
"loss": 0.0421,
"step": 11010
},
{
"epoch": 4.803836094158675,
"grad_norm": 0.2876376211643219,
"learning_rate": 1.8052422076195635e-05,
"loss": 0.0346,
"step": 11020
},
{
"epoch": 4.80819529206626,
"grad_norm": 0.5017814040184021,
"learning_rate": 1.7967704673321918e-05,
"loss": 0.0362,
"step": 11030
},
{
"epoch": 4.812554489973845,
"grad_norm": 0.5770791172981262,
"learning_rate": 1.7883142959438004e-05,
"loss": 0.0349,
"step": 11040
},
{
"epoch": 4.81691368788143,
"grad_norm": 0.3633856773376465,
"learning_rate": 1.779873734554558e-05,
"loss": 0.0327,
"step": 11050
},
{
"epoch": 4.821272885789015,
"grad_norm": 0.5029147863388062,
"learning_rate": 1.771448824188761e-05,
"loss": 0.0362,
"step": 11060
},
{
"epoch": 4.8256320836966,
"grad_norm": 0.668860912322998,
"learning_rate": 1.763039605794644e-05,
"loss": 0.0379,
"step": 11070
},
{
"epoch": 4.829991281604185,
"grad_norm": 0.34870511293411255,
"learning_rate": 1.754646120244164e-05,
"loss": 0.0462,
"step": 11080
},
{
"epoch": 4.8343504795117695,
"grad_norm": 0.29230761528015137,
"learning_rate": 1.7462684083328144e-05,
"loss": 0.0334,
"step": 11090
},
{
"epoch": 4.838709677419355,
"grad_norm": 0.3179951012134552,
"learning_rate": 1.7379065107794262e-05,
"loss": 0.0352,
"step": 11100
},
{
"epoch": 4.8430688753269395,
"grad_norm": 0.4508034884929657,
"learning_rate": 1.7295604682259586e-05,
"loss": 0.0387,
"step": 11110
},
{
"epoch": 4.847428073234525,
"grad_norm": 0.49519672989845276,
"learning_rate": 1.7212303212373175e-05,
"loss": 0.0341,
"step": 11120
},
{
"epoch": 4.8517872711421095,
"grad_norm": 0.448363721370697,
"learning_rate": 1.712916110301146e-05,
"loss": 0.0397,
"step": 11130
},
{
"epoch": 4.856146469049695,
"grad_norm": 0.2779761552810669,
"learning_rate": 1.7046178758276298e-05,
"loss": 0.0302,
"step": 11140
},
{
"epoch": 4.8605056669572795,
"grad_norm": 1.0302684307098389,
"learning_rate": 1.696335658149309e-05,
"loss": 0.0253,
"step": 11150
},
{
"epoch": 4.864864864864865,
"grad_norm": 0.31043338775634766,
"learning_rate": 1.6880694975208727e-05,
"loss": 0.0339,
"step": 11160
},
{
"epoch": 4.8692240627724495,
"grad_norm": 0.8034403324127197,
"learning_rate": 1.6798194341189687e-05,
"loss": 0.0328,
"step": 11170
},
{
"epoch": 4.873583260680035,
"grad_norm": 0.7642386555671692,
"learning_rate": 1.671585508042003e-05,
"loss": 0.0342,
"step": 11180
},
{
"epoch": 4.87794245858762,
"grad_norm": 0.23145624995231628,
"learning_rate": 1.6633677593099483e-05,
"loss": 0.0373,
"step": 11190
},
{
"epoch": 4.882301656495205,
"grad_norm": 0.38365113735198975,
"learning_rate": 1.655166227864154e-05,
"loss": 0.0372,
"step": 11200
},
{
"epoch": 4.886660854402789,
"grad_norm": 0.6353790163993835,
"learning_rate": 1.6469809535671426e-05,
"loss": 0.0388,
"step": 11210
},
{
"epoch": 4.891020052310375,
"grad_norm": 0.4335973560810089,
"learning_rate": 1.638811976202421e-05,
"loss": 0.0359,
"step": 11220
},
{
"epoch": 4.89537925021796,
"grad_norm": 0.9411593675613403,
"learning_rate": 1.6306593354742895e-05,
"loss": 0.0461,
"step": 11230
},
{
"epoch": 4.899738448125545,
"grad_norm": 0.45522540807724,
"learning_rate": 1.6225230710076455e-05,
"loss": 0.0335,
"step": 11240
},
{
"epoch": 4.90409764603313,
"grad_norm": 0.5510297417640686,
"learning_rate": 1.6144032223477924e-05,
"loss": 0.0277,
"step": 11250
},
{
"epoch": 4.908456843940715,
"grad_norm": 0.730690062046051,
"learning_rate": 1.606299828960243e-05,
"loss": 0.034,
"step": 11260
},
{
"epoch": 4.9128160418483,
"grad_norm": 0.47577011585235596,
"learning_rate": 1.5982129302305337e-05,
"loss": 0.032,
"step": 11270
},
{
"epoch": 4.917175239755885,
"grad_norm": 0.4926137328147888,
"learning_rate": 1.590142565464032e-05,
"loss": 0.0396,
"step": 11280
},
{
"epoch": 4.92153443766347,
"grad_norm": 0.3040984570980072,
"learning_rate": 1.5820887738857408e-05,
"loss": 0.0396,
"step": 11290
},
{
"epoch": 4.925893635571055,
"grad_norm": 0.4636688530445099,
"learning_rate": 1.5740515946401134e-05,
"loss": 0.0325,
"step": 11300
},
{
"epoch": 4.93025283347864,
"grad_norm": 0.5039488673210144,
"learning_rate": 1.5660310667908634e-05,
"loss": 0.0361,
"step": 11310
},
{
"epoch": 4.934612031386225,
"grad_norm": 0.5043140053749084,
"learning_rate": 1.5580272293207655e-05,
"loss": 0.0395,
"step": 11320
},
{
"epoch": 4.93897122929381,
"grad_norm": 0.4170094430446625,
"learning_rate": 1.5500401211314796e-05,
"loss": 0.0323,
"step": 11330
},
{
"epoch": 4.943330427201395,
"grad_norm": 0.3870650827884674,
"learning_rate": 1.542069781043351e-05,
"loss": 0.0256,
"step": 11340
},
{
"epoch": 4.94768962510898,
"grad_norm": 0.567021369934082,
"learning_rate": 1.534116247795226e-05,
"loss": 0.0493,
"step": 11350
},
{
"epoch": 4.952048823016565,
"grad_norm": 0.7827485203742981,
"learning_rate": 1.526179560044267e-05,
"loss": 0.0308,
"step": 11360
},
{
"epoch": 4.95640802092415,
"grad_norm": 1.4778623580932617,
"learning_rate": 1.5182597563657552e-05,
"loss": 0.0246,
"step": 11370
},
{
"epoch": 4.960767218831735,
"grad_norm": 0.6621624827384949,
"learning_rate": 1.5103568752529135e-05,
"loss": 0.0359,
"step": 11380
},
{
"epoch": 4.96512641673932,
"grad_norm": 0.8446053862571716,
"learning_rate": 1.5024709551167142e-05,
"loss": 0.0351,
"step": 11390
},
{
"epoch": 4.969485614646905,
"grad_norm": 0.9632458090782166,
"learning_rate": 1.4946020342856898e-05,
"loss": 0.0451,
"step": 11400
},
{
"epoch": 4.97384481255449,
"grad_norm": 0.44982174038887024,
"learning_rate": 1.4867501510057546e-05,
"loss": 0.0281,
"step": 11410
},
{
"epoch": 4.978204010462075,
"grad_norm": 0.2952253818511963,
"learning_rate": 1.4789153434400094e-05,
"loss": 0.0435,
"step": 11420
},
{
"epoch": 4.98256320836966,
"grad_norm": 1.1763001680374146,
"learning_rate": 1.4710976496685614e-05,
"loss": 0.0324,
"step": 11430
},
{
"epoch": 4.986922406277245,
"grad_norm": 0.6411994099617004,
"learning_rate": 1.4632971076883406e-05,
"loss": 0.0316,
"step": 11440
},
{
"epoch": 4.99128160418483,
"grad_norm": 0.6712291240692139,
"learning_rate": 1.4555137554129117e-05,
"loss": 0.0378,
"step": 11450
},
{
"epoch": 4.9956408020924155,
"grad_norm": 0.3442004919052124,
"learning_rate": 1.4477476306722925e-05,
"loss": 0.043,
"step": 11460
},
{
"epoch": 5.0,
"grad_norm": 1.1500236988067627,
"learning_rate": 1.439998771212766e-05,
"loss": 0.0319,
"step": 11470
},
{
"epoch": 5.004359197907585,
"grad_norm": 0.3608720600605011,
"learning_rate": 1.4322672146966982e-05,
"loss": 0.0297,
"step": 11480
},
{
"epoch": 5.00871839581517,
"grad_norm": 0.41150960326194763,
"learning_rate": 1.4245529987023621e-05,
"loss": 0.043,
"step": 11490
},
{
"epoch": 5.013077593722755,
"grad_norm": 1.1648448705673218,
"learning_rate": 1.4168561607237436e-05,
"loss": 0.0323,
"step": 11500
},
{
"epoch": 5.01743679163034,
"grad_norm": 0.17882536351680756,
"learning_rate": 1.4091767381703657e-05,
"loss": 0.0245,
"step": 11510
},
{
"epoch": 5.021795989537925,
"grad_norm": 0.556475818157196,
"learning_rate": 1.4015147683671087e-05,
"loss": 0.0321,
"step": 11520
},
{
"epoch": 5.02615518744551,
"grad_norm": 0.26074153184890747,
"learning_rate": 1.3938702885540239e-05,
"loss": 0.0257,
"step": 11530
},
{
"epoch": 5.030514385353095,
"grad_norm": 0.5722861886024475,
"learning_rate": 1.3862433358861576e-05,
"loss": 0.038,
"step": 11540
},
{
"epoch": 5.03487358326068,
"grad_norm": 0.5401778221130371,
"learning_rate": 1.3786339474333636e-05,
"loss": 0.0378,
"step": 11550
},
{
"epoch": 5.039232781168265,
"grad_norm": 0.5791760683059692,
"learning_rate": 1.3710421601801265e-05,
"loss": 0.0338,
"step": 11560
},
{
"epoch": 5.04359197907585,
"grad_norm": 0.31818994879722595,
"learning_rate": 1.3634680110253883e-05,
"loss": 0.0296,
"step": 11570
},
{
"epoch": 5.047951176983435,
"grad_norm": 0.6561838984489441,
"learning_rate": 1.3559115367823556e-05,
"loss": 0.0365,
"step": 11580
},
{
"epoch": 5.05231037489102,
"grad_norm": 0.8950375318527222,
"learning_rate": 1.3483727741783342e-05,
"loss": 0.0383,
"step": 11590
},
{
"epoch": 5.056669572798605,
"grad_norm": 0.3717140257358551,
"learning_rate": 1.3408517598545444e-05,
"loss": 0.0292,
"step": 11600
},
{
"epoch": 5.06102877070619,
"grad_norm": 0.31723707914352417,
"learning_rate": 1.3333485303659381e-05,
"loss": 0.0413,
"step": 11610
},
{
"epoch": 5.065387968613775,
"grad_norm": 0.5031527876853943,
"learning_rate": 1.3258631221810331e-05,
"loss": 0.0457,
"step": 11620
},
{
"epoch": 5.06974716652136,
"grad_norm": 0.41329917311668396,
"learning_rate": 1.3183955716817232e-05,
"loss": 0.0521,
"step": 11630
},
{
"epoch": 5.074106364428945,
"grad_norm": 1.1271079778671265,
"learning_rate": 1.3109459151631076e-05,
"loss": 0.0304,
"step": 11640
},
{
"epoch": 5.07846556233653,
"grad_norm": 0.7091724276542664,
"learning_rate": 1.3035141888333202e-05,
"loss": 0.0406,
"step": 11650
},
{
"epoch": 5.082824760244115,
"grad_norm": 0.43470293283462524,
"learning_rate": 1.2961004288133388e-05,
"loss": 0.0356,
"step": 11660
},
{
"epoch": 5.0871839581517,
"grad_norm": 0.7189029455184937,
"learning_rate": 1.2887046711368245e-05,
"loss": 0.0336,
"step": 11670
},
{
"epoch": 5.091543156059285,
"grad_norm": 0.4963429868221283,
"learning_rate": 1.2813269517499399e-05,
"loss": 0.0405,
"step": 11680
},
{
"epoch": 5.09590235396687,
"grad_norm": 0.7285826206207275,
"learning_rate": 1.273967306511169e-05,
"loss": 0.0334,
"step": 11690
},
{
"epoch": 5.100261551874455,
"grad_norm": 0.5211066007614136,
"learning_rate": 1.2666257711911566e-05,
"loss": 0.0348,
"step": 11700
},
{
"epoch": 5.10462074978204,
"grad_norm": 0.2556770145893097,
"learning_rate": 1.2593023814725214e-05,
"loss": 0.0293,
"step": 11710
},
{
"epoch": 5.108979947689625,
"grad_norm": 1.1604270935058594,
"learning_rate": 1.251997172949686e-05,
"loss": 0.0379,
"step": 11720
},
{
"epoch": 5.11333914559721,
"grad_norm": 0.6733997464179993,
"learning_rate": 1.2447101811287109e-05,
"loss": 0.0282,
"step": 11730
},
{
"epoch": 5.117698343504795,
"grad_norm": 0.3374277949333191,
"learning_rate": 1.237441441427114e-05,
"loss": 0.0355,
"step": 11740
},
{
"epoch": 5.12205754141238,
"grad_norm": 0.565790593624115,
"learning_rate": 1.2301909891737018e-05,
"loss": 0.0401,
"step": 11750
},
{
"epoch": 5.126416739319965,
"grad_norm": 0.21954980492591858,
"learning_rate": 1.2229588596083957e-05,
"loss": 0.0337,
"step": 11760
},
{
"epoch": 5.1307759372275505,
"grad_norm": 0.48581477999687195,
"learning_rate": 1.2157450878820608e-05,
"loss": 0.0364,
"step": 11770
},
{
"epoch": 5.135135135135135,
"grad_norm": 0.9423235654830933,
"learning_rate": 1.2085497090563407e-05,
"loss": 0.0289,
"step": 11780
},
{
"epoch": 5.1394943330427205,
"grad_norm": 1.247639536857605,
"learning_rate": 1.2013727581034783e-05,
"loss": 0.0304,
"step": 11790
},
{
"epoch": 5.143853530950305,
"grad_norm": 0.683613121509552,
"learning_rate": 1.1942142699061498e-05,
"loss": 0.0416,
"step": 11800
},
{
"epoch": 5.1482127288578905,
"grad_norm": 0.4748249053955078,
"learning_rate": 1.1870742792572992e-05,
"loss": 0.0417,
"step": 11810
},
{
"epoch": 5.152571926765475,
"grad_norm": 0.8540623784065247,
"learning_rate": 1.1799528208599637e-05,
"loss": 0.0354,
"step": 11820
},
{
"epoch": 5.1569311246730605,
"grad_norm": 0.8364782929420471,
"learning_rate": 1.1728499293271079e-05,
"loss": 0.0485,
"step": 11830
},
{
"epoch": 5.161290322580645,
"grad_norm": 0.3155808448791504,
"learning_rate": 1.1657656391814509e-05,
"loss": 0.0285,
"step": 11840
},
{
"epoch": 5.1656495204882305,
"grad_norm": 0.7690255045890808,
"learning_rate": 1.1586999848553043e-05,
"loss": 0.0305,
"step": 11850
},
{
"epoch": 5.170008718395815,
"grad_norm": 0.48023709654808044,
"learning_rate": 1.1516530006904053e-05,
"loss": 0.0385,
"step": 11860
},
{
"epoch": 5.1743679163034,
"grad_norm": 0.5498557686805725,
"learning_rate": 1.1446247209377403e-05,
"loss": 0.0317,
"step": 11870
},
{
"epoch": 5.178727114210985,
"grad_norm": 0.3345494568347931,
"learning_rate": 1.1376151797573925e-05,
"loss": 0.0376,
"step": 11880
},
{
"epoch": 5.18308631211857,
"grad_norm": 0.9440948963165283,
"learning_rate": 1.1306244112183662e-05,
"loss": 0.0325,
"step": 11890
},
{
"epoch": 5.187445510026155,
"grad_norm": 0.6693992018699646,
"learning_rate": 1.1236524492984203e-05,
"loss": 0.0351,
"step": 11900
},
{
"epoch": 5.19180470793374,
"grad_norm": 0.614152193069458,
"learning_rate": 1.116699327883911e-05,
"loss": 0.0351,
"step": 11910
},
{
"epoch": 5.196163905841325,
"grad_norm": 0.8623160719871521,
"learning_rate": 1.1097650807696209e-05,
"loss": 0.0332,
"step": 11920
},
{
"epoch": 5.20052310374891,
"grad_norm": 0.40504971146583557,
"learning_rate": 1.1028497416585931e-05,
"loss": 0.0338,
"step": 11930
},
{
"epoch": 5.204882301656495,
"grad_norm": 0.39141932129859924,
"learning_rate": 1.0959533441619762e-05,
"loss": 0.0309,
"step": 11940
},
{
"epoch": 5.20924149956408,
"grad_norm": 1.2070059776306152,
"learning_rate": 1.0890759217988527e-05,
"loss": 0.0261,
"step": 11950
},
{
"epoch": 5.213600697471665,
"grad_norm": 0.6789277195930481,
"learning_rate": 1.0822175079960806e-05,
"loss": 0.0258,
"step": 11960
},
{
"epoch": 5.21795989537925,
"grad_norm": 0.546126127243042,
"learning_rate": 1.0753781360881265e-05,
"loss": 0.0337,
"step": 11970
},
{
"epoch": 5.222319093286835,
"grad_norm": 0.6033177375793457,
"learning_rate": 1.0685578393169055e-05,
"loss": 0.0384,
"step": 11980
},
{
"epoch": 5.22667829119442,
"grad_norm": 0.48450666666030884,
"learning_rate": 1.061756650831625e-05,
"loss": 0.0343,
"step": 11990
},
{
"epoch": 5.231037489102005,
"grad_norm": 0.42560434341430664,
"learning_rate": 1.054974603688616e-05,
"loss": 0.0387,
"step": 12000
},
{
"epoch": 5.23539668700959,
"grad_norm": 0.333484947681427,
"learning_rate": 1.048211730851173e-05,
"loss": 0.0352,
"step": 12010
},
{
"epoch": 5.239755884917175,
"grad_norm": 0.6437160968780518,
"learning_rate": 1.0414680651894004e-05,
"loss": 0.0338,
"step": 12020
},
{
"epoch": 5.24411508282476,
"grad_norm": 0.3939863443374634,
"learning_rate": 1.034743639480047e-05,
"loss": 0.0289,
"step": 12030
},
{
"epoch": 5.248474280732346,
"grad_norm": 0.48622220754623413,
"learning_rate": 1.0280384864063497e-05,
"loss": 0.0316,
"step": 12040
},
{
"epoch": 5.25283347863993,
"grad_norm": 1.1890780925750732,
"learning_rate": 1.0213526385578704e-05,
"loss": 0.0396,
"step": 12050
},
{
"epoch": 5.257192676547516,
"grad_norm": 0.6726014614105225,
"learning_rate": 1.0146861284303394e-05,
"loss": 0.0326,
"step": 12060
},
{
"epoch": 5.2615518744551,
"grad_norm": 0.46193766593933105,
"learning_rate": 1.0080389884255037e-05,
"loss": 0.0467,
"step": 12070
},
{
"epoch": 5.265911072362686,
"grad_norm": 0.9077117443084717,
"learning_rate": 1.0014112508509588e-05,
"loss": 0.041,
"step": 12080
},
{
"epoch": 5.27027027027027,
"grad_norm": 0.4619888365268707,
"learning_rate": 9.948029479199994e-06,
"loss": 0.0318,
"step": 12090
},
{
"epoch": 5.274629468177856,
"grad_norm": 0.4092944860458374,
"learning_rate": 9.882141117514632e-06,
"loss": 0.0429,
"step": 12100
},
{
"epoch": 5.27898866608544,
"grad_norm": 0.3199276030063629,
"learning_rate": 9.816447743695656e-06,
"loss": 0.0315,
"step": 12110
},
{
"epoch": 5.283347863993026,
"grad_norm": 0.3841368556022644,
"learning_rate": 9.75094967703758e-06,
"loss": 0.0245,
"step": 12120
},
{
"epoch": 5.28770706190061,
"grad_norm": 0.7042669057846069,
"learning_rate": 9.685647235885597e-06,
"loss": 0.0424,
"step": 12130
},
{
"epoch": 5.292066259808196,
"grad_norm": 0.379041850566864,
"learning_rate": 9.620540737634087e-06,
"loss": 0.0303,
"step": 12140
},
{
"epoch": 5.29642545771578,
"grad_norm": 0.23851239681243896,
"learning_rate": 9.555630498725133e-06,
"loss": 0.026,
"step": 12150
},
{
"epoch": 5.3007846556233655,
"grad_norm": 0.503830075263977,
"learning_rate": 9.49091683464684e-06,
"loss": 0.0251,
"step": 12160
},
{
"epoch": 5.30514385353095,
"grad_norm": 0.8782485127449036,
"learning_rate": 9.426400059931955e-06,
"loss": 0.0361,
"step": 12170
},
{
"epoch": 5.3095030514385355,
"grad_norm": 1.1160036325454712,
"learning_rate": 9.362080488156245e-06,
"loss": 0.0438,
"step": 12180
},
{
"epoch": 5.31386224934612,
"grad_norm": 0.3587688207626343,
"learning_rate": 9.29795843193697e-06,
"loss": 0.0276,
"step": 12190
},
{
"epoch": 5.3182214472537055,
"grad_norm": 0.7131944298744202,
"learning_rate": 9.234034202931447e-06,
"loss": 0.0407,
"step": 12200
},
{
"epoch": 5.32258064516129,
"grad_norm": 0.38495564460754395,
"learning_rate": 9.170308111835418e-06,
"loss": 0.0416,
"step": 12210
},
{
"epoch": 5.3269398430688755,
"grad_norm": 0.32060274481773376,
"learning_rate": 9.106780468381631e-06,
"loss": 0.0316,
"step": 12220
},
{
"epoch": 5.33129904097646,
"grad_norm": 1.0149906873703003,
"learning_rate": 9.043451581338302e-06,
"loss": 0.0384,
"step": 12230
},
{
"epoch": 5.3356582388840454,
"grad_norm": 0.6786044836044312,
"learning_rate": 8.980321758507615e-06,
"loss": 0.0346,
"step": 12240
},
{
"epoch": 5.34001743679163,
"grad_norm": 0.4378306567668915,
"learning_rate": 8.91739130672425e-06,
"loss": 0.0442,
"step": 12250
},
{
"epoch": 5.344376634699215,
"grad_norm": 0.402309775352478,
"learning_rate": 8.85466053185382e-06,
"loss": 0.0371,
"step": 12260
},
{
"epoch": 5.3487358326068,
"grad_norm": 0.21001383662223816,
"learning_rate": 8.792129738791455e-06,
"loss": 0.0299,
"step": 12270
},
{
"epoch": 5.353095030514385,
"grad_norm": 0.3518972396850586,
"learning_rate": 8.729799231460318e-06,
"loss": 0.0319,
"step": 12280
},
{
"epoch": 5.35745422842197,
"grad_norm": 0.4094241261482239,
"learning_rate": 8.66766931281009e-06,
"loss": 0.0329,
"step": 12290
},
{
"epoch": 5.361813426329555,
"grad_norm": 0.3522825837135315,
"learning_rate": 8.6057402848155e-06,
"loss": 0.0247,
"step": 12300
},
{
"epoch": 5.366172624237141,
"grad_norm": 0.4518503248691559,
"learning_rate": 8.544012448474904e-06,
"loss": 0.0345,
"step": 12310
},
{
"epoch": 5.370531822144725,
"grad_norm": 0.5210450291633606,
"learning_rate": 8.482486103808779e-06,
"loss": 0.0418,
"step": 12320
},
{
"epoch": 5.374891020052311,
"grad_norm": 0.30797097086906433,
"learning_rate": 8.42116154985828e-06,
"loss": 0.0305,
"step": 12330
},
{
"epoch": 5.379250217959895,
"grad_norm": 0.4493660628795624,
"learning_rate": 8.360039084683779e-06,
"loss": 0.0319,
"step": 12340
},
{
"epoch": 5.383609415867481,
"grad_norm": 0.4356549382209778,
"learning_rate": 8.299119005363404e-06,
"loss": 0.03,
"step": 12350
},
{
"epoch": 5.387968613775065,
"grad_norm": 1.2415368556976318,
"learning_rate": 8.238401607991647e-06,
"loss": 0.042,
"step": 12360
},
{
"epoch": 5.392327811682651,
"grad_norm": 0.3500710129737854,
"learning_rate": 8.177887187677847e-06,
"loss": 0.0279,
"step": 12370
},
{
"epoch": 5.396687009590235,
"grad_norm": 0.3411845862865448,
"learning_rate": 8.117576038544838e-06,
"loss": 0.0328,
"step": 12380
},
{
"epoch": 5.401046207497821,
"grad_norm": 0.4249480366706848,
"learning_rate": 8.057468453727479e-06,
"loss": 0.0434,
"step": 12390
},
{
"epoch": 5.405405405405405,
"grad_norm": 0.3020738959312439,
"learning_rate": 7.997564725371182e-06,
"loss": 0.037,
"step": 12400
},
{
"epoch": 5.409764603312991,
"grad_norm": 0.4569971561431885,
"learning_rate": 7.937865144630601e-06,
"loss": 0.0428,
"step": 12410
},
{
"epoch": 5.414123801220575,
"grad_norm": 0.22348935902118683,
"learning_rate": 7.878370001668116e-06,
"loss": 0.0271,
"step": 12420
},
{
"epoch": 5.418482999128161,
"grad_norm": 0.5622937679290771,
"learning_rate": 7.819079585652461e-06,
"loss": 0.0438,
"step": 12430
},
{
"epoch": 5.422842197035745,
"grad_norm": 0.46097010374069214,
"learning_rate": 7.759994184757358e-06,
"loss": 0.04,
"step": 12440
},
{
"epoch": 5.427201394943331,
"grad_norm": 0.6537830829620361,
"learning_rate": 7.701114086160027e-06,
"loss": 0.0323,
"step": 12450
},
{
"epoch": 5.431560592850915,
"grad_norm": 3.8158767223358154,
"learning_rate": 7.642439576039884e-06,
"loss": 0.0369,
"step": 12460
},
{
"epoch": 5.435919790758501,
"grad_norm": 0.2568453550338745,
"learning_rate": 7.583970939577101e-06,
"loss": 0.0288,
"step": 12470
},
{
"epoch": 5.440278988666085,
"grad_norm": 0.2281847447156906,
"learning_rate": 7.525708460951197e-06,
"loss": 0.0245,
"step": 12480
},
{
"epoch": 5.444638186573671,
"grad_norm": 0.7292711138725281,
"learning_rate": 7.467652423339733e-06,
"loss": 0.0312,
"step": 12490
},
{
"epoch": 5.448997384481255,
"grad_norm": 0.2882106602191925,
"learning_rate": 7.409803108916841e-06,
"loss": 0.0375,
"step": 12500
},
{
"epoch": 5.453356582388841,
"grad_norm": 0.5214361548423767,
"learning_rate": 7.35216079885192e-06,
"loss": 0.0422,
"step": 12510
},
{
"epoch": 5.457715780296425,
"grad_norm": 0.42829445004463196,
"learning_rate": 7.29472577330827e-06,
"loss": 0.0288,
"step": 12520
},
{
"epoch": 5.4620749782040106,
"grad_norm": 0.5174663662910461,
"learning_rate": 7.237498311441676e-06,
"loss": 0.026,
"step": 12530
},
{
"epoch": 5.466434176111595,
"grad_norm": 0.4603443741798401,
"learning_rate": 7.180478691399134e-06,
"loss": 0.0334,
"step": 12540
},
{
"epoch": 5.4707933740191805,
"grad_norm": 0.8484715223312378,
"learning_rate": 7.123667190317396e-06,
"loss": 0.0392,
"step": 12550
},
{
"epoch": 5.475152571926765,
"grad_norm": 0.5899950265884399,
"learning_rate": 7.06706408432169e-06,
"loss": 0.0327,
"step": 12560
},
{
"epoch": 5.4795117698343505,
"grad_norm": 0.9594012498855591,
"learning_rate": 7.010669648524404e-06,
"loss": 0.0308,
"step": 12570
},
{
"epoch": 5.483870967741936,
"grad_norm": 0.795964777469635,
"learning_rate": 6.954484157023661e-06,
"loss": 0.0279,
"step": 12580
},
{
"epoch": 5.4882301656495205,
"grad_norm": 0.46093374490737915,
"learning_rate": 6.898507882902078e-06,
"loss": 0.039,
"step": 12590
},
{
"epoch": 5.492589363557105,
"grad_norm": 0.5869463086128235,
"learning_rate": 6.842741098225358e-06,
"loss": 0.0359,
"step": 12600
},
{
"epoch": 5.4969485614646905,
"grad_norm": 0.4688364565372467,
"learning_rate": 6.787184074041031e-06,
"loss": 0.0231,
"step": 12610
},
{
"epoch": 5.501307759372276,
"grad_norm": 0.3394297957420349,
"learning_rate": 6.731837080377129e-06,
"loss": 0.0267,
"step": 12620
},
{
"epoch": 5.50566695727986,
"grad_norm": 0.46087536215782166,
"learning_rate": 6.676700386240814e-06,
"loss": 0.0236,
"step": 12630
},
{
"epoch": 5.510026155187446,
"grad_norm": 0.48075783252716064,
"learning_rate": 6.621774259617125e-06,
"loss": 0.0451,
"step": 12640
},
{
"epoch": 5.51438535309503,
"grad_norm": 0.23513250052928925,
"learning_rate": 6.567058967467704e-06,
"loss": 0.028,
"step": 12650
},
{
"epoch": 5.518744551002616,
"grad_norm": 0.5110394358634949,
"learning_rate": 6.51255477572939e-06,
"loss": 0.0384,
"step": 12660
},
{
"epoch": 5.5231037489102,
"grad_norm": 0.47604143619537354,
"learning_rate": 6.45826194931306e-06,
"loss": 0.0326,
"step": 12670
},
{
"epoch": 5.527462946817786,
"grad_norm": 1.0370049476623535,
"learning_rate": 6.4041807521022454e-06,
"loss": 0.0338,
"step": 12680
},
{
"epoch": 5.53182214472537,
"grad_norm": 0.7761918902397156,
"learning_rate": 6.350311446951868e-06,
"loss": 0.0323,
"step": 12690
},
{
"epoch": 5.536181342632956,
"grad_norm": 0.5699298977851868,
"learning_rate": 6.29665429568701e-06,
"loss": 0.0307,
"step": 12700
},
{
"epoch": 5.54054054054054,
"grad_norm": 0.47337207198143005,
"learning_rate": 6.2432095591015705e-06,
"loss": 0.0235,
"step": 12710
},
{
"epoch": 5.544899738448126,
"grad_norm": 0.694935142993927,
"learning_rate": 6.1899774969570444e-06,
"loss": 0.035,
"step": 12720
},
{
"epoch": 5.54925893635571,
"grad_norm": 0.2715081572532654,
"learning_rate": 6.136958367981272e-06,
"loss": 0.0288,
"step": 12730
},
{
"epoch": 5.553618134263296,
"grad_norm": 0.588737964630127,
"learning_rate": 6.084152429867113e-06,
"loss": 0.0341,
"step": 12740
},
{
"epoch": 5.55797733217088,
"grad_norm": 0.3322398066520691,
"learning_rate": 6.0315599392712865e-06,
"loss": 0.0248,
"step": 12750
},
{
"epoch": 5.562336530078466,
"grad_norm": 0.6606812477111816,
"learning_rate": 5.979181151813057e-06,
"loss": 0.0438,
"step": 12760
},
{
"epoch": 5.56669572798605,
"grad_norm": 0.5031450390815735,
"learning_rate": 5.927016322072992e-06,
"loss": 0.0434,
"step": 12770
},
{
"epoch": 5.571054925893636,
"grad_norm": 0.44513940811157227,
"learning_rate": 5.875065703591787e-06,
"loss": 0.0377,
"step": 12780
},
{
"epoch": 5.57541412380122,
"grad_norm": 0.3935040533542633,
"learning_rate": 5.823329548868939e-06,
"loss": 0.0295,
"step": 12790
},
{
"epoch": 5.579773321708806,
"grad_norm": 0.6372385621070862,
"learning_rate": 5.77180810936162e-06,
"loss": 0.0295,
"step": 12800
},
{
"epoch": 5.58413251961639,
"grad_norm": 0.35186004638671875,
"learning_rate": 5.720501635483366e-06,
"loss": 0.0349,
"step": 12810
},
{
"epoch": 5.588491717523976,
"grad_norm": 0.8061618208885193,
"learning_rate": 5.669410376602918e-06,
"loss": 0.0345,
"step": 12820
},
{
"epoch": 5.59285091543156,
"grad_norm": 0.6604674458503723,
"learning_rate": 5.618534581043011e-06,
"loss": 0.0353,
"step": 12830
},
{
"epoch": 5.597210113339146,
"grad_norm": 0.7145407199859619,
"learning_rate": 5.5678744960791005e-06,
"loss": 0.0277,
"step": 12840
},
{
"epoch": 5.601569311246731,
"grad_norm": 0.3610726296901703,
"learning_rate": 5.517430367938237e-06,
"loss": 0.0308,
"step": 12850
},
{
"epoch": 5.605928509154316,
"grad_norm": 0.48084914684295654,
"learning_rate": 5.467202441797842e-06,
"loss": 0.0322,
"step": 12860
},
{
"epoch": 5.6102877070619,
"grad_norm": 0.6275615096092224,
"learning_rate": 5.417190961784497e-06,
"loss": 0.0315,
"step": 12870
},
{
"epoch": 5.614646904969486,
"grad_norm": 0.44279929995536804,
"learning_rate": 5.3673961709727885e-06,
"loss": 0.0272,
"step": 12880
},
{
"epoch": 5.619006102877071,
"grad_norm": 0.5052009224891663,
"learning_rate": 5.317818311384115e-06,
"loss": 0.032,
"step": 12890
},
{
"epoch": 5.623365300784656,
"grad_norm": 0.48991960287094116,
"learning_rate": 5.2684576239854895e-06,
"loss": 0.0356,
"step": 12900
},
{
"epoch": 5.627724498692241,
"grad_norm": 0.6088931560516357,
"learning_rate": 5.219314348688414e-06,
"loss": 0.0298,
"step": 12910
},
{
"epoch": 5.6320836965998256,
"grad_norm": 0.35137468576431274,
"learning_rate": 5.170388724347658e-06,
"loss": 0.0266,
"step": 12920
},
{
"epoch": 5.636442894507411,
"grad_norm": 0.8328220248222351,
"learning_rate": 5.1216809887601245e-06,
"loss": 0.0292,
"step": 12930
},
{
"epoch": 5.6408020924149955,
"grad_norm": 0.33078819513320923,
"learning_rate": 5.073191378663733e-06,
"loss": 0.0291,
"step": 12940
},
{
"epoch": 5.645161290322581,
"grad_norm": 0.2188701182603836,
"learning_rate": 5.024920129736188e-06,
"loss": 0.0233,
"step": 12950
},
{
"epoch": 5.6495204882301655,
"grad_norm": 0.602378785610199,
"learning_rate": 4.976867476593894e-06,
"loss": 0.0244,
"step": 12960
},
{
"epoch": 5.653879686137751,
"grad_norm": 0.6373008489608765,
"learning_rate": 4.929033652790821e-06,
"loss": 0.0245,
"step": 12970
},
{
"epoch": 5.6582388840453355,
"grad_norm": 0.5099015831947327,
"learning_rate": 4.881418890817296e-06,
"loss": 0.0261,
"step": 12980
},
{
"epoch": 5.662598081952921,
"grad_norm": 1.1864821910858154,
"learning_rate": 4.834023422098971e-06,
"loss": 0.0251,
"step": 12990
},
{
"epoch": 5.6669572798605055,
"grad_norm": 0.4129869043827057,
"learning_rate": 4.7868474769956266e-06,
"loss": 0.0294,
"step": 13000
},
{
"epoch": 5.671316477768091,
"grad_norm": 0.45849207043647766,
"learning_rate": 4.7398912848000636e-06,
"loss": 0.0399,
"step": 13010
},
{
"epoch": 5.675675675675675,
"grad_norm": 0.9468610882759094,
"learning_rate": 4.6931550737370264e-06,
"loss": 0.0347,
"step": 13020
},
{
"epoch": 5.680034873583261,
"grad_norm": 1.0548272132873535,
"learning_rate": 4.646639070962067e-06,
"loss": 0.0387,
"step": 13030
},
{
"epoch": 5.684394071490845,
"grad_norm": 0.408389151096344,
"learning_rate": 4.600343502560439e-06,
"loss": 0.0272,
"step": 13040
},
{
"epoch": 5.688753269398431,
"grad_norm": 0.36637166142463684,
"learning_rate": 4.55426859354599e-06,
"loss": 0.021,
"step": 13050
},
{
"epoch": 5.693112467306015,
"grad_norm": 0.21423012018203735,
"learning_rate": 4.5084145678600805e-06,
"loss": 0.027,
"step": 13060
},
{
"epoch": 5.697471665213601,
"grad_norm": 0.29542121291160583,
"learning_rate": 4.462781648370518e-06,
"loss": 0.0399,
"step": 13070
},
{
"epoch": 5.701830863121185,
"grad_norm": 0.4850723147392273,
"learning_rate": 4.417370056870418e-06,
"loss": 0.0304,
"step": 13080
},
{
"epoch": 5.706190061028771,
"grad_norm": 0.9915159344673157,
"learning_rate": 4.372180014077193e-06,
"loss": 0.0303,
"step": 13090
},
{
"epoch": 5.710549258936355,
"grad_norm": 0.1932496577501297,
"learning_rate": 4.327211739631415e-06,
"loss": 0.027,
"step": 13100
},
{
"epoch": 5.714908456843941,
"grad_norm": 0.26505881547927856,
"learning_rate": 4.282465452095802e-06,
"loss": 0.0208,
"step": 13110
},
{
"epoch": 5.719267654751526,
"grad_norm": 0.22322647273540497,
"learning_rate": 4.237941368954124e-06,
"loss": 0.0239,
"step": 13120
},
{
"epoch": 5.723626852659111,
"grad_norm": 0.40861910581588745,
"learning_rate": 4.193639706610147e-06,
"loss": 0.0339,
"step": 13130
},
{
"epoch": 5.727986050566695,
"grad_norm": 0.7970739603042603,
"learning_rate": 4.149560680386588e-06,
"loss": 0.0282,
"step": 13140
},
{
"epoch": 5.732345248474281,
"grad_norm": 0.8208338022232056,
"learning_rate": 4.105704504524094e-06,
"loss": 0.0314,
"step": 13150
},
{
"epoch": 5.736704446381866,
"grad_norm": 0.4076552987098694,
"learning_rate": 4.0620713921801334e-06,
"loss": 0.02,
"step": 13160
},
{
"epoch": 5.741063644289451,
"grad_norm": 0.3223102390766144,
"learning_rate": 4.0186615554280385e-06,
"loss": 0.0232,
"step": 13170
},
{
"epoch": 5.745422842197035,
"grad_norm": 0.4325304925441742,
"learning_rate": 3.975475205255929e-06,
"loss": 0.0306,
"step": 13180
},
{
"epoch": 5.749782040104621,
"grad_norm": 0.40523847937583923,
"learning_rate": 3.932512551565676e-06,
"loss": 0.0249,
"step": 13190
},
{
"epoch": 5.754141238012206,
"grad_norm": 0.4874592423439026,
"learning_rate": 3.889773803171936e-06,
"loss": 0.0345,
"step": 13200
},
{
"epoch": 5.758500435919791,
"grad_norm": 0.5539955496788025,
"learning_rate": 3.847259167801076e-06,
"loss": 0.0184,
"step": 13210
},
{
"epoch": 5.762859633827376,
"grad_norm": 0.22394351661205292,
"learning_rate": 3.804968852090185e-06,
"loss": 0.0255,
"step": 13220
},
{
"epoch": 5.767218831734961,
"grad_norm": 0.28927505016326904,
"learning_rate": 3.762903061586104e-06,
"loss": 0.0325,
"step": 13230
},
{
"epoch": 5.771578029642546,
"grad_norm": 0.401094913482666,
"learning_rate": 3.721062000744363e-06,
"loss": 0.0371,
"step": 13240
},
{
"epoch": 5.775937227550131,
"grad_norm": 0.35075923800468445,
"learning_rate": 3.679445872928244e-06,
"loss": 0.0284,
"step": 13250
},
{
"epoch": 5.780296425457716,
"grad_norm": 0.6507555246353149,
"learning_rate": 3.6380548804077707e-06,
"loss": 0.0272,
"step": 13260
},
{
"epoch": 5.784655623365301,
"grad_norm": 0.270612895488739,
"learning_rate": 3.5968892243587016e-06,
"loss": 0.0268,
"step": 13270
},
{
"epoch": 5.789014821272886,
"grad_norm": 0.5847600102424622,
"learning_rate": 3.555949104861611e-06,
"loss": 0.0259,
"step": 13280
},
{
"epoch": 5.793374019180471,
"grad_norm": 1.932289958000183,
"learning_rate": 3.5152347209008394e-06,
"loss": 0.0279,
"step": 13290
},
{
"epoch": 5.797733217088056,
"grad_norm": 0.4612036943435669,
"learning_rate": 3.4747462703636104e-06,
"loss": 0.0282,
"step": 13300
},
{
"epoch": 5.8020924149956405,
"grad_norm": 0.5467131733894348,
"learning_rate": 3.434483950038986e-06,
"loss": 0.0336,
"step": 13310
},
{
"epoch": 5.806451612903226,
"grad_norm": 0.9813832640647888,
"learning_rate": 3.3944479556169806e-06,
"loss": 0.0346,
"step": 13320
},
{
"epoch": 5.8108108108108105,
"grad_norm": 0.4749749004840851,
"learning_rate": 3.3546384816875665e-06,
"loss": 0.0308,
"step": 13330
},
{
"epoch": 5.815170008718396,
"grad_norm": 0.4085749089717865,
"learning_rate": 3.315055721739746e-06,
"loss": 0.0279,
"step": 13340
},
{
"epoch": 5.8195292066259805,
"grad_norm": 0.5082237124443054,
"learning_rate": 3.275699868160592e-06,
"loss": 0.0269,
"step": 13350
},
{
"epoch": 5.823888404533566,
"grad_norm": 0.3107988238334656,
"learning_rate": 3.23657111223436e-06,
"loss": 0.0298,
"step": 13360
},
{
"epoch": 5.8282476024411505,
"grad_norm": 0.7513600587844849,
"learning_rate": 3.1976696441414764e-06,
"loss": 0.0407,
"step": 13370
},
{
"epoch": 5.832606800348736,
"grad_norm": 0.5760849118232727,
"learning_rate": 3.158995652957719e-06,
"loss": 0.0304,
"step": 13380
},
{
"epoch": 5.8369659982563205,
"grad_norm": 0.3615356981754303,
"learning_rate": 3.1205493266531937e-06,
"loss": 0.0263,
"step": 13390
},
{
"epoch": 5.841325196163906,
"grad_norm": 0.48539432883262634,
"learning_rate": 3.082330852091497e-06,
"loss": 0.0344,
"step": 13400
},
{
"epoch": 5.84568439407149,
"grad_norm": 0.6521255373954773,
"learning_rate": 3.0443404150287847e-06,
"loss": 0.0307,
"step": 13410
},
{
"epoch": 5.850043591979076,
"grad_norm": 0.2879190146923065,
"learning_rate": 3.0065782001128475e-06,
"loss": 0.0283,
"step": 13420
},
{
"epoch": 5.854402789886661,
"grad_norm": 0.30713582038879395,
"learning_rate": 2.9690443908822252e-06,
"loss": 0.0254,
"step": 13430
},
{
"epoch": 5.858761987794246,
"grad_norm": 0.3112742304801941,
"learning_rate": 2.9317391697653518e-06,
"loss": 0.0265,
"step": 13440
},
{
"epoch": 5.86312118570183,
"grad_norm": 0.5539763569831848,
"learning_rate": 2.8946627180795936e-06,
"loss": 0.0381,
"step": 13450
},
{
"epoch": 5.867480383609416,
"grad_norm": 0.7893513441085815,
"learning_rate": 2.8578152160304573e-06,
"loss": 0.0382,
"step": 13460
},
{
"epoch": 5.871839581517001,
"grad_norm": 0.4452410042285919,
"learning_rate": 2.821196842710638e-06,
"loss": 0.0451,
"step": 13470
},
{
"epoch": 5.876198779424586,
"grad_norm": 0.38596704602241516,
"learning_rate": 2.7848077760991853e-06,
"loss": 0.0331,
"step": 13480
},
{
"epoch": 5.880557977332171,
"grad_norm": 0.4066772162914276,
"learning_rate": 2.7486481930606434e-06,
"loss": 0.0348,
"step": 13490
},
{
"epoch": 5.884917175239756,
"grad_norm": 0.3776564598083496,
"learning_rate": 2.712718269344161e-06,
"loss": 0.0225,
"step": 13500
},
{
"epoch": 5.889276373147341,
"grad_norm": 0.2993749678134918,
"learning_rate": 2.677018179582669e-06,
"loss": 0.0306,
"step": 13510
},
{
"epoch": 5.893635571054926,
"grad_norm": 0.3112500011920929,
"learning_rate": 2.641548097292024e-06,
"loss": 0.0376,
"step": 13520
},
{
"epoch": 5.897994768962511,
"grad_norm": 0.49900758266448975,
"learning_rate": 2.606308194870133e-06,
"loss": 0.0337,
"step": 13530
},
{
"epoch": 5.902353966870096,
"grad_norm": 0.49350616335868835,
"learning_rate": 2.5712986435961707e-06,
"loss": 0.03,
"step": 13540
},
{
"epoch": 5.906713164777681,
"grad_norm": 0.36458122730255127,
"learning_rate": 2.536519613629723e-06,
"loss": 0.0249,
"step": 13550
},
{
"epoch": 5.911072362685266,
"grad_norm": 0.35984691977500916,
"learning_rate": 2.501971274009923e-06,
"loss": 0.0378,
"step": 13560
},
{
"epoch": 5.915431560592851,
"grad_norm": 0.41873860359191895,
"learning_rate": 2.467653792654695e-06,
"loss": 0.0236,
"step": 13570
},
{
"epoch": 5.919790758500436,
"grad_norm": 0.9378716945648193,
"learning_rate": 2.4335673363598822e-06,
"loss": 0.0271,
"step": 13580
},
{
"epoch": 5.924149956408021,
"grad_norm": 0.380706787109375,
"learning_rate": 2.399712070798471e-06,
"loss": 0.0351,
"step": 13590
},
{
"epoch": 5.928509154315606,
"grad_norm": 0.42532041668891907,
"learning_rate": 2.3660881605197694e-06,
"loss": 0.0495,
"step": 13600
},
{
"epoch": 5.932868352223191,
"grad_norm": 0.22512225806713104,
"learning_rate": 2.332695768948617e-06,
"loss": 0.0243,
"step": 13610
},
{
"epoch": 5.937227550130776,
"grad_norm": 0.4298464059829712,
"learning_rate": 2.299535058384583e-06,
"loss": 0.0392,
"step": 13620
},
{
"epoch": 5.941586748038361,
"grad_norm": 0.3365100026130676,
"learning_rate": 2.266606190001186e-06,
"loss": 0.022,
"step": 13630
},
{
"epoch": 5.945945945945946,
"grad_norm": 0.38887470960617065,
"learning_rate": 2.2339093238450737e-06,
"loss": 0.0251,
"step": 13640
},
{
"epoch": 5.950305143853531,
"grad_norm": 0.8472762703895569,
"learning_rate": 2.20144461883533e-06,
"loss": 0.0332,
"step": 13650
},
{
"epoch": 5.954664341761116,
"grad_norm": 0.5774504542350769,
"learning_rate": 2.1692122327625908e-06,
"loss": 0.0352,
"step": 13660
},
{
"epoch": 5.959023539668701,
"grad_norm": 0.41319480538368225,
"learning_rate": 2.137212322288379e-06,
"loss": 0.0276,
"step": 13670
},
{
"epoch": 5.963382737576286,
"grad_norm": 0.18480074405670166,
"learning_rate": 2.105445042944282e-06,
"loss": 0.0252,
"step": 13680
},
{
"epoch": 5.967741935483871,
"grad_norm": 0.3829851448535919,
"learning_rate": 2.0739105491312027e-06,
"loss": 0.0313,
"step": 13690
},
{
"epoch": 5.972101133391456,
"grad_norm": 0.3687523603439331,
"learning_rate": 2.0426089941186443e-06,
"loss": 0.0264,
"step": 13700
},
{
"epoch": 5.976460331299041,
"grad_norm": 1.2374975681304932,
"learning_rate": 2.0115405300439093e-06,
"loss": 0.0267,
"step": 13710
},
{
"epoch": 5.9808195292066255,
"grad_norm": 0.49486619234085083,
"learning_rate": 1.9807053079114013e-06,
"loss": 0.0261,
"step": 13720
},
{
"epoch": 5.985178727114211,
"grad_norm": 0.4957039952278137,
"learning_rate": 1.9501034775919024e-06,
"loss": 0.0227,
"step": 13730
},
{
"epoch": 5.989537925021796,
"grad_norm": 0.4244577884674072,
"learning_rate": 1.9197351878217917e-06,
"loss": 0.0241,
"step": 13740
},
{
"epoch": 5.993897122929381,
"grad_norm": 0.19650620222091675,
"learning_rate": 1.8896005862023669e-06,
"loss": 0.027,
"step": 13750
},
{
"epoch": 5.998256320836966,
"grad_norm": 0.5099511742591858,
"learning_rate": 1.8596998191991288e-06,
"loss": 0.0466,
"step": 13760
},
{
"epoch": 6.002615518744551,
"grad_norm": 1.2363959550857544,
"learning_rate": 1.8300330321410208e-06,
"loss": 0.0273,
"step": 13770
},
{
"epoch": 6.006974716652136,
"grad_norm": 0.2866174280643463,
"learning_rate": 1.8006003692197794e-06,
"loss": 0.0265,
"step": 13780
},
{
"epoch": 6.011333914559721,
"grad_norm": 0.44128701090812683,
"learning_rate": 1.7714019734892062e-06,
"loss": 0.0314,
"step": 13790
},
{
"epoch": 6.015693112467306,
"grad_norm": 0.396158367395401,
"learning_rate": 1.7424379868644759e-06,
"loss": 0.0268,
"step": 13800
},
{
"epoch": 6.020052310374891,
"grad_norm": 0.38308459520339966,
"learning_rate": 1.71370855012144e-06,
"loss": 0.0317,
"step": 13810
},
{
"epoch": 6.024411508282476,
"grad_norm": 0.5613490343093872,
"learning_rate": 1.6852138028959574e-06,
"loss": 0.0299,
"step": 13820
},
{
"epoch": 6.028770706190061,
"grad_norm": 0.3245357573032379,
"learning_rate": 1.6569538836832044e-06,
"loss": 0.0282,
"step": 13830
},
{
"epoch": 6.033129904097646,
"grad_norm": 0.5527129173278809,
"learning_rate": 1.6289289298370147e-06,
"loss": 0.0309,
"step": 13840
},
{
"epoch": 6.037489102005231,
"grad_norm": 0.3975665867328644,
"learning_rate": 1.6011390775691748e-06,
"loss": 0.0221,
"step": 13850
},
{
"epoch": 6.041848299912816,
"grad_norm": 0.44122743606567383,
"learning_rate": 1.5735844619488238e-06,
"loss": 0.028,
"step": 13860
},
{
"epoch": 6.046207497820401,
"grad_norm": 0.2691154181957245,
"learning_rate": 1.5462652169017322e-06,
"loss": 0.0336,
"step": 13870
},
{
"epoch": 6.050566695727986,
"grad_norm": 1.3896781206130981,
"learning_rate": 1.5191814752097023e-06,
"loss": 0.0197,
"step": 13880
},
{
"epoch": 6.054925893635571,
"grad_norm": 0.4001099765300751,
"learning_rate": 1.492333368509896e-06,
"loss": 0.0355,
"step": 13890
},
{
"epoch": 6.059285091543156,
"grad_norm": 0.27171969413757324,
"learning_rate": 1.4657210272941923e-06,
"loss": 0.0202,
"step": 13900
},
{
"epoch": 6.063644289450741,
"grad_norm": 0.39832237362861633,
"learning_rate": 1.4393445809085748e-06,
"loss": 0.028,
"step": 13910
},
{
"epoch": 6.068003487358326,
"grad_norm": 0.7771658897399902,
"learning_rate": 1.4132041575524834e-06,
"loss": 0.0337,
"step": 13920
},
{
"epoch": 6.072362685265911,
"grad_norm": 1.9989441633224487,
"learning_rate": 1.387299884278187e-06,
"loss": 0.0294,
"step": 13930
},
{
"epoch": 6.076721883173496,
"grad_norm": 0.39735016226768494,
"learning_rate": 1.3616318869901945e-06,
"loss": 0.0291,
"step": 13940
},
{
"epoch": 6.081081081081081,
"grad_norm": 0.5451767444610596,
"learning_rate": 1.336200290444606e-06,
"loss": 0.0266,
"step": 13950
},
{
"epoch": 6.085440278988666,
"grad_norm": 0.18423081934452057,
"learning_rate": 1.3110052182485454e-06,
"loss": 0.0333,
"step": 13960
},
{
"epoch": 6.089799476896251,
"grad_norm": 0.427682489156723,
"learning_rate": 1.2860467928595298e-06,
"loss": 0.0489,
"step": 13970
},
{
"epoch": 6.094158674803836,
"grad_norm": 0.534563422203064,
"learning_rate": 1.2613251355848732e-06,
"loss": 0.0337,
"step": 13980
},
{
"epoch": 6.098517872711421,
"grad_norm": 0.5575711727142334,
"learning_rate": 1.2368403665811324e-06,
"loss": 0.0337,
"step": 13990
},
{
"epoch": 6.102877070619006,
"grad_norm": 0.2626562714576721,
"learning_rate": 1.2125926048534686e-06,
"loss": 0.0362,
"step": 14000
},
{
"epoch": 6.1072362685265915,
"grad_norm": 0.6961496472358704,
"learning_rate": 1.1885819682551259e-06,
"loss": 0.0253,
"step": 14010
},
{
"epoch": 6.111595466434176,
"grad_norm": 0.687218964099884,
"learning_rate": 1.164808573486814e-06,
"loss": 0.0343,
"step": 14020
},
{
"epoch": 6.1159546643417615,
"grad_norm": 0.5468307733535767,
"learning_rate": 1.1412725360961608e-06,
"loss": 0.0368,
"step": 14030
},
{
"epoch": 6.120313862249346,
"grad_norm": 0.6597806811332703,
"learning_rate": 1.1179739704771486e-06,
"loss": 0.021,
"step": 14040
},
{
"epoch": 6.1246730601569315,
"grad_norm": 0.30327489972114563,
"learning_rate": 1.0949129898695675e-06,
"loss": 0.0283,
"step": 14050
},
{
"epoch": 6.129032258064516,
"grad_norm": 0.4793466627597809,
"learning_rate": 1.0720897063584423e-06,
"loss": 0.0285,
"step": 14060
},
{
"epoch": 6.1333914559721014,
"grad_norm": 0.5061814785003662,
"learning_rate": 1.0495042308735103e-06,
"loss": 0.0324,
"step": 14070
},
{
"epoch": 6.137750653879686,
"grad_norm": 0.6382007598876953,
"learning_rate": 1.0271566731886617e-06,
"loss": 0.023,
"step": 14080
},
{
"epoch": 6.142109851787271,
"grad_norm": 0.7890705466270447,
"learning_rate": 1.005047141921428e-06,
"loss": 0.0271,
"step": 14090
},
{
"epoch": 6.146469049694856,
"grad_norm": 0.9838564991950989,
"learning_rate": 9.831757445324274e-07,
"loss": 0.0394,
"step": 14100
},
{
"epoch": 6.150828247602441,
"grad_norm": 0.3258971571922302,
"learning_rate": 9.615425873248761e-07,
"loss": 0.0271,
"step": 14110
},
{
"epoch": 6.155187445510026,
"grad_norm": 0.5593294501304626,
"learning_rate": 9.401477754440502e-07,
"loss": 0.0283,
"step": 14120
},
{
"epoch": 6.159546643417611,
"grad_norm": 0.4928247332572937,
"learning_rate": 9.189914128767684e-07,
"loss": 0.0366,
"step": 14130
},
{
"epoch": 6.163905841325196,
"grad_norm": 0.2662743031978607,
"learning_rate": 8.980736024508996e-07,
"loss": 0.0348,
"step": 14140
},
{
"epoch": 6.168265039232781,
"grad_norm": 0.6286730170249939,
"learning_rate": 8.77394445834867e-07,
"loss": 0.0237,
"step": 14150
},
{
"epoch": 6.172624237140366,
"grad_norm": 0.6010546088218689,
"learning_rate": 8.569540435371281e-07,
"loss": 0.0362,
"step": 14160
},
{
"epoch": 6.176983435047951,
"grad_norm": 0.32207077741622925,
"learning_rate": 8.367524949057348e-07,
"loss": 0.0282,
"step": 14170
},
{
"epoch": 6.181342632955536,
"grad_norm": 0.18507017195224762,
"learning_rate": 8.167898981277844e-07,
"loss": 0.0313,
"step": 14180
},
{
"epoch": 6.185701830863121,
"grad_norm": 0.5099960565567017,
"learning_rate": 7.970663502290143e-07,
"loss": 0.0289,
"step": 14190
},
{
"epoch": 6.190061028770706,
"grad_norm": 0.4452384114265442,
"learning_rate": 7.775819470732692e-07,
"loss": 0.0268,
"step": 14200
},
{
"epoch": 6.194420226678291,
"grad_norm": 0.2529633939266205,
"learning_rate": 7.583367833620681e-07,
"loss": 0.0317,
"step": 14210
},
{
"epoch": 6.198779424585876,
"grad_norm": 0.2038535624742508,
"learning_rate": 7.39330952634143e-07,
"loss": 0.032,
"step": 14220
},
{
"epoch": 6.203138622493461,
"grad_norm": 0.4185069799423218,
"learning_rate": 7.205645472649681e-07,
"loss": 0.0301,
"step": 14230
},
{
"epoch": 6.207497820401046,
"grad_norm": 0.357864111661911,
"learning_rate": 7.020376584663202e-07,
"loss": 0.0255,
"step": 14240
},
{
"epoch": 6.211857018308631,
"grad_norm": 0.4020203649997711,
"learning_rate": 6.83750376285841e-07,
"loss": 0.0389,
"step": 14250
},
{
"epoch": 6.216216216216216,
"grad_norm": 1.3740479946136475,
"learning_rate": 6.657027896065982e-07,
"loss": 0.0309,
"step": 14260
},
{
"epoch": 6.220575414123801,
"grad_norm": 0.3854996860027313,
"learning_rate": 6.478949861466355e-07,
"loss": 0.0392,
"step": 14270
},
{
"epoch": 6.224934612031387,
"grad_norm": 0.6102597117424011,
"learning_rate": 6.303270524585736e-07,
"loss": 0.0313,
"step": 14280
},
{
"epoch": 6.229293809938971,
"grad_norm": 0.25067058205604553,
"learning_rate": 6.129990739291713e-07,
"loss": 0.0242,
"step": 14290
},
{
"epoch": 6.233653007846557,
"grad_norm": 0.572243332862854,
"learning_rate": 5.959111347789093e-07,
"loss": 0.0353,
"step": 14300
},
{
"epoch": 6.238012205754141,
"grad_norm": 0.49448612332344055,
"learning_rate": 5.790633180615956e-07,
"loss": 0.0245,
"step": 14310
},
{
"epoch": 6.242371403661727,
"grad_norm": 0.22109507024288177,
"learning_rate": 5.624557056639446e-07,
"loss": 0.0411,
"step": 14320
},
{
"epoch": 6.246730601569311,
"grad_norm": 0.3474348485469818,
"learning_rate": 5.460883783051984e-07,
"loss": 0.0233,
"step": 14330
},
{
"epoch": 6.251089799476897,
"grad_norm": 0.18911020457744598,
"learning_rate": 5.299614155367171e-07,
"loss": 0.0285,
"step": 14340
},
{
"epoch": 6.255448997384481,
"grad_norm": 0.5895888209342957,
"learning_rate": 5.140748957415897e-07,
"loss": 0.0292,
"step": 14350
},
{
"epoch": 6.259808195292067,
"grad_norm": 0.19754233956336975,
"learning_rate": 4.984288961342787e-07,
"loss": 0.0242,
"step": 14360
},
{
"epoch": 6.264167393199651,
"grad_norm": 0.3680201470851898,
"learning_rate": 4.830234927602206e-07,
"loss": 0.0283,
"step": 14370
},
{
"epoch": 6.2685265911072365,
"grad_norm": 0.4519497752189636,
"learning_rate": 4.6785876049545986e-07,
"loss": 0.0299,
"step": 14380
},
{
"epoch": 6.272885789014821,
"grad_norm": 0.3829212486743927,
"learning_rate": 4.5293477304629297e-07,
"loss": 0.0373,
"step": 14390
},
{
"epoch": 6.2772449869224065,
"grad_norm": 0.27076077461242676,
"learning_rate": 4.382516029489081e-07,
"loss": 0.0338,
"step": 14400
},
{
"epoch": 6.281604184829991,
"grad_norm": 0.32891130447387695,
"learning_rate": 4.2380932156902975e-07,
"loss": 0.0263,
"step": 14410
},
{
"epoch": 6.2859633827375765,
"grad_norm": 0.380852609872818,
"learning_rate": 4.0960799910156335e-07,
"loss": 0.0314,
"step": 14420
},
{
"epoch": 6.290322580645161,
"grad_norm": 0.39561882615089417,
"learning_rate": 3.956477045702844e-07,
"loss": 0.0313,
"step": 14430
},
{
"epoch": 6.2946817785527465,
"grad_norm": 0.40406253933906555,
"learning_rate": 3.819285058274613e-07,
"loss": 0.0281,
"step": 14440
},
{
"epoch": 6.299040976460331,
"grad_norm": 0.4073195159435272,
"learning_rate": 3.684504695535496e-07,
"loss": 0.0237,
"step": 14450
},
{
"epoch": 6.303400174367916,
"grad_norm": 1.0025192499160767,
"learning_rate": 3.552136612568813e-07,
"loss": 0.03,
"step": 14460
},
{
"epoch": 6.307759372275501,
"grad_norm": 0.7018080353736877,
"learning_rate": 3.422181452733042e-07,
"loss": 0.0352,
"step": 14470
},
{
"epoch": 6.312118570183086,
"grad_norm": 0.6284013390541077,
"learning_rate": 3.294639847659209e-07,
"loss": 0.0246,
"step": 14480
},
{
"epoch": 6.316477768090671,
"grad_norm": 0.3403724431991577,
"learning_rate": 3.169512417247389e-07,
"loss": 0.0259,
"step": 14490
},
{
"epoch": 6.320836965998256,
"grad_norm": 0.7513018250465393,
"learning_rate": 3.046799769663822e-07,
"loss": 0.0327,
"step": 14500
},
{
"epoch": 6.325196163905841,
"grad_norm": 0.42642244696617126,
"learning_rate": 2.926502501338191e-07,
"loss": 0.0301,
"step": 14510
},
{
"epoch": 6.329555361813426,
"grad_norm": 0.184515580534935,
"learning_rate": 2.808621196960404e-07,
"loss": 0.0234,
"step": 14520
},
{
"epoch": 6.333914559721011,
"grad_norm": 0.4309402108192444,
"learning_rate": 2.6931564294778164e-07,
"loss": 0.0478,
"step": 14530
},
{
"epoch": 6.338273757628596,
"grad_norm": 0.1840837150812149,
"learning_rate": 2.58010876009257e-07,
"loss": 0.0282,
"step": 14540
},
{
"epoch": 6.342632955536182,
"grad_norm": 0.797314465045929,
"learning_rate": 2.4694787382589237e-07,
"loss": 0.0311,
"step": 14550
},
{
"epoch": 6.346992153443766,
"grad_norm": 0.5150383114814758,
"learning_rate": 2.3612669016802592e-07,
"loss": 0.0269,
"step": 14560
},
{
"epoch": 6.351351351351352,
"grad_norm": 0.3189009428024292,
"learning_rate": 2.2554737763068045e-07,
"loss": 0.022,
"step": 14570
},
{
"epoch": 6.355710549258936,
"grad_norm": 0.34953877329826355,
"learning_rate": 2.152099876332858e-07,
"loss": 0.0328,
"step": 14580
},
{
"epoch": 6.360069747166522,
"grad_norm": 0.204021617770195,
"learning_rate": 2.051145704194457e-07,
"loss": 0.032,
"step": 14590
},
{
"epoch": 6.364428945074106,
"grad_norm": 1.0681053400039673,
"learning_rate": 1.9526117505667129e-07,
"loss": 0.0345,
"step": 14600
},
{
"epoch": 6.368788142981692,
"grad_norm": 0.1900322437286377,
"learning_rate": 1.856498494361758e-07,
"loss": 0.0279,
"step": 14610
},
{
"epoch": 6.373147340889276,
"grad_norm": 0.967194676399231,
"learning_rate": 1.7628064027260803e-07,
"loss": 0.0269,
"step": 14620
},
{
"epoch": 6.377506538796862,
"grad_norm": 0.4180316925048828,
"learning_rate": 1.671535931038415e-07,
"loss": 0.0247,
"step": 14630
},
{
"epoch": 6.381865736704446,
"grad_norm": 0.31126174330711365,
"learning_rate": 1.5826875229076333e-07,
"loss": 0.0256,
"step": 14640
},
{
"epoch": 6.386224934612032,
"grad_norm": 0.33545228838920593,
"learning_rate": 1.496261610170302e-07,
"loss": 0.0283,
"step": 14650
},
{
"epoch": 6.390584132519616,
"grad_norm": 0.6360080242156982,
"learning_rate": 1.4122586128888503e-07,
"loss": 0.0274,
"step": 14660
},
{
"epoch": 6.394943330427202,
"grad_norm": 0.288099467754364,
"learning_rate": 1.3306789393494612e-07,
"loss": 0.0368,
"step": 14670
},
{
"epoch": 6.399302528334786,
"grad_norm": 1.516507863998413,
"learning_rate": 1.2515229860599054e-07,
"loss": 0.028,
"step": 14680
},
{
"epoch": 6.403661726242372,
"grad_norm": 0.37025946378707886,
"learning_rate": 1.1747911377478771e-07,
"loss": 0.0281,
"step": 14690
},
{
"epoch": 6.408020924149956,
"grad_norm": 0.33924514055252075,
"learning_rate": 1.1004837673589952e-07,
"loss": 0.0283,
"step": 14700
},
{
"epoch": 6.412380122057542,
"grad_norm": 0.3753373622894287,
"learning_rate": 1.0286012360550267e-07,
"loss": 0.0343,
"step": 14710
},
{
"epoch": 6.416739319965126,
"grad_norm": 0.509145975112915,
"learning_rate": 9.591438932121111e-08,
"loss": 0.0344,
"step": 14720
},
{
"epoch": 6.421098517872712,
"grad_norm": 1.0932096242904663,
"learning_rate": 8.921120764189272e-08,
"loss": 0.0352,
"step": 14730
},
{
"epoch": 6.425457715780296,
"grad_norm": 1.1470388174057007,
"learning_rate": 8.275061114753068e-08,
"loss": 0.0353,
"step": 14740
},
{
"epoch": 6.4298169136878816,
"grad_norm": 0.9916203022003174,
"learning_rate": 7.65326312390624e-08,
"loss": 0.0385,
"step": 14750
},
{
"epoch": 6.434176111595466,
"grad_norm": 0.6590924263000488,
"learning_rate": 7.055729813819079e-08,
"loss": 0.0306,
"step": 14760
},
{
"epoch": 6.4385353095030515,
"grad_norm": 0.45894575119018555,
"learning_rate": 6.48246408872899e-08,
"loss": 0.0266,
"step": 14770
},
{
"epoch": 6.442894507410636,
"grad_norm": 0.3868933618068695,
"learning_rate": 5.9334687349227314e-08,
"loss": 0.0286,
"step": 14780
},
{
"epoch": 6.4472537053182215,
"grad_norm": 0.4164086878299713,
"learning_rate": 5.4087464207236426e-08,
"loss": 0.0203,
"step": 14790
},
{
"epoch": 6.451612903225806,
"grad_norm": 0.7725594639778137,
"learning_rate": 4.9082996964794345e-08,
"loss": 0.0282,
"step": 14800
},
{
"epoch": 6.4559721011333915,
"grad_norm": 0.35307568311691284,
"learning_rate": 4.432130994548866e-08,
"loss": 0.0228,
"step": 14810
},
{
"epoch": 6.460331299040977,
"grad_norm": 0.4140183925628662,
"learning_rate": 3.980242629291198e-08,
"loss": 0.0265,
"step": 14820
},
{
"epoch": 6.4646904969485615,
"grad_norm": 0.37078022956848145,
"learning_rate": 3.5526367970539765e-08,
"loss": 0.0329,
"step": 14830
},
{
"epoch": 6.469049694856146,
"grad_norm": 0.6152135729789734,
"learning_rate": 3.1493155761613826e-08,
"loss": 0.04,
"step": 14840
},
{
"epoch": 6.473408892763731,
"grad_norm": 0.7362766861915588,
"learning_rate": 2.7702809269058992e-08,
"loss": 0.0417,
"step": 14850
},
{
"epoch": 6.477768090671317,
"grad_norm": 1.0350167751312256,
"learning_rate": 2.4155346915394337e-08,
"loss": 0.0281,
"step": 14860
},
{
"epoch": 6.482127288578901,
"grad_norm": 0.35365110635757446,
"learning_rate": 2.085078594261103e-08,
"loss": 0.0276,
"step": 14870
},
{
"epoch": 6.486486486486487,
"grad_norm": 0.7158024311065674,
"learning_rate": 1.7789142412122372e-08,
"loss": 0.0341,
"step": 14880
},
{
"epoch": 6.490845684394071,
"grad_norm": 1.3772259950637817,
"learning_rate": 1.4970431204663905e-08,
"loss": 0.0292,
"step": 14890
},
{
"epoch": 6.495204882301657,
"grad_norm": 0.4312567710876465,
"learning_rate": 1.2394666020226764e-08,
"loss": 0.0237,
"step": 14900
},
{
"epoch": 6.499564080209241,
"grad_norm": 0.9634395837783813,
"learning_rate": 1.0061859378007743e-08,
"loss": 0.0325,
"step": 14910
},
{
"epoch": 6.503923278116827,
"grad_norm": 0.502382755279541,
"learning_rate": 7.97202261630936e-09,
"loss": 0.0192,
"step": 14920
},
{
"epoch": 6.508282476024411,
"grad_norm": 0.6565430164337158,
"learning_rate": 6.125165892539863e-09,
"loss": 0.0319,
"step": 14930
},
{
"epoch": 6.512641673931997,
"grad_norm": 0.23635677993297577,
"learning_rate": 4.5212981831022076e-09,
"loss": 0.0391,
"step": 14940
},
{
"epoch": 6.517000871839581,
"grad_norm": 0.5301797389984131,
"learning_rate": 3.1604272834051542e-09,
"loss": 0.0402,
"step": 14950
},
{
"epoch": 6.521360069747167,
"grad_norm": 0.11095762252807617,
"learning_rate": 2.04255980778556e-09,
"loss": 0.0284,
"step": 14960
},
{
"epoch": 6.525719267654751,
"grad_norm": 0.40208858251571655,
"learning_rate": 1.1677011895028234e-09,
"loss": 0.0279,
"step": 14970
},
{
"epoch": 6.530078465562337,
"grad_norm": 0.3951982259750366,
"learning_rate": 5.358556807000259e-10,
"loss": 0.0272,
"step": 14980
},
{
"epoch": 6.534437663469921,
"grad_norm": 0.6511605381965637,
"learning_rate": 1.4702635238728058e-10,
"loss": 0.0299,
"step": 14990
},
{
"epoch": 6.538796861377507,
"grad_norm": 0.603900134563446,
"learning_rate": 1.2150944139754927e-12,
"loss": 0.0342,
"step": 15000
}
],
"logging_steps": 10,
"max_steps": 15000,
"num_input_tokens_seen": 0,
"num_train_epochs": 7,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}