{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9996550874599656,
"eval_steps": 500,
"global_step": 2536,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0003941857600394186,
"grad_norm": 105.2694138675085,
"learning_rate": 7.874015748031497e-08,
"loss": 9.1364,
"step": 1
},
{
"epoch": 0.0007883715200788372,
"grad_norm": 96.39150101397924,
"learning_rate": 1.5748031496062994e-07,
"loss": 9.154,
"step": 2
},
{
"epoch": 0.0011825572801182557,
"grad_norm": 101.32411852803344,
"learning_rate": 2.362204724409449e-07,
"loss": 9.0663,
"step": 3
},
{
"epoch": 0.0015767430401576743,
"grad_norm": 82.40406383837171,
"learning_rate": 3.149606299212599e-07,
"loss": 9.042,
"step": 4
},
{
"epoch": 0.001970928800197093,
"grad_norm": 67.15234107674974,
"learning_rate": 3.937007874015748e-07,
"loss": 8.8627,
"step": 5
},
{
"epoch": 0.0023651145602365115,
"grad_norm": 50.03571730325678,
"learning_rate": 4.724409448818898e-07,
"loss": 8.7217,
"step": 6
},
{
"epoch": 0.00275930032027593,
"grad_norm": 44.44168073728412,
"learning_rate": 5.511811023622048e-07,
"loss": 8.7231,
"step": 7
},
{
"epoch": 0.0031534860803153486,
"grad_norm": 58.27436136079177,
"learning_rate": 6.299212598425198e-07,
"loss": 8.6106,
"step": 8
},
{
"epoch": 0.003547671840354767,
"grad_norm": 53.84109799579168,
"learning_rate": 7.086614173228346e-07,
"loss": 8.5611,
"step": 9
},
{
"epoch": 0.003941857600394186,
"grad_norm": 57.647941933879174,
"learning_rate": 7.874015748031496e-07,
"loss": 8.4481,
"step": 10
},
{
"epoch": 0.004336043360433604,
"grad_norm": 53.27276526815267,
"learning_rate": 8.661417322834646e-07,
"loss": 8.2106,
"step": 11
},
{
"epoch": 0.004730229120473023,
"grad_norm": 41.91985046682287,
"learning_rate": 9.448818897637796e-07,
"loss": 8.178,
"step": 12
},
{
"epoch": 0.0051244148805124415,
"grad_norm": 53.755309589361765,
"learning_rate": 1.0236220472440946e-06,
"loss": 7.9319,
"step": 13
},
{
"epoch": 0.00551860064055186,
"grad_norm": 46.32010256680663,
"learning_rate": 1.1023622047244096e-06,
"loss": 7.8888,
"step": 14
},
{
"epoch": 0.005912786400591279,
"grad_norm": 49.6775736894897,
"learning_rate": 1.1811023622047246e-06,
"loss": 7.5264,
"step": 15
},
{
"epoch": 0.006306972160630697,
"grad_norm": 42.62706429768497,
"learning_rate": 1.2598425196850396e-06,
"loss": 7.3966,
"step": 16
},
{
"epoch": 0.006701157920670116,
"grad_norm": 49.62426641360553,
"learning_rate": 1.3385826771653545e-06,
"loss": 7.2773,
"step": 17
},
{
"epoch": 0.007095343680709534,
"grad_norm": 37.16995292351442,
"learning_rate": 1.4173228346456693e-06,
"loss": 7.1822,
"step": 18
},
{
"epoch": 0.007489529440748953,
"grad_norm": 47.04696207168547,
"learning_rate": 1.4960629921259845e-06,
"loss": 7.0047,
"step": 19
},
{
"epoch": 0.007883715200788372,
"grad_norm": 37.94846722638193,
"learning_rate": 1.5748031496062992e-06,
"loss": 6.7177,
"step": 20
},
{
"epoch": 0.00827790096082779,
"grad_norm": 38.60623637532149,
"learning_rate": 1.6535433070866144e-06,
"loss": 6.6605,
"step": 21
},
{
"epoch": 0.008672086720867209,
"grad_norm": 32.058378304509375,
"learning_rate": 1.7322834645669292e-06,
"loss": 6.4935,
"step": 22
},
{
"epoch": 0.009066272480906627,
"grad_norm": 29.645376469665575,
"learning_rate": 1.8110236220472444e-06,
"loss": 6.1995,
"step": 23
},
{
"epoch": 0.009460458240946046,
"grad_norm": 27.31573397346269,
"learning_rate": 1.8897637795275591e-06,
"loss": 6.0939,
"step": 24
},
{
"epoch": 0.009854644000985464,
"grad_norm": 28.841277709048004,
"learning_rate": 1.968503937007874e-06,
"loss": 5.8683,
"step": 25
},
{
"epoch": 0.010248829761024883,
"grad_norm": 28.590239784856998,
"learning_rate": 2.0472440944881893e-06,
"loss": 5.7835,
"step": 26
},
{
"epoch": 0.010643015521064302,
"grad_norm": 24.204577484360353,
"learning_rate": 2.125984251968504e-06,
"loss": 5.7053,
"step": 27
},
{
"epoch": 0.01103720128110372,
"grad_norm": 26.31176344346586,
"learning_rate": 2.2047244094488192e-06,
"loss": 5.4684,
"step": 28
},
{
"epoch": 0.011431387041143139,
"grad_norm": 32.01648516861882,
"learning_rate": 2.283464566929134e-06,
"loss": 5.3315,
"step": 29
},
{
"epoch": 0.011825572801182557,
"grad_norm": 19.524646014081327,
"learning_rate": 2.362204724409449e-06,
"loss": 5.3126,
"step": 30
},
{
"epoch": 0.012219758561221976,
"grad_norm": 17.63176728075391,
"learning_rate": 2.440944881889764e-06,
"loss": 5.0745,
"step": 31
},
{
"epoch": 0.012613944321261394,
"grad_norm": 15.40058495681761,
"learning_rate": 2.519685039370079e-06,
"loss": 5.0836,
"step": 32
},
{
"epoch": 0.013008130081300813,
"grad_norm": 15.479272180480063,
"learning_rate": 2.598425196850394e-06,
"loss": 4.7458,
"step": 33
},
{
"epoch": 0.013402315841340232,
"grad_norm": 14.695908677216751,
"learning_rate": 2.677165354330709e-06,
"loss": 4.7244,
"step": 34
},
{
"epoch": 0.01379650160137965,
"grad_norm": 13.524764594889588,
"learning_rate": 2.755905511811024e-06,
"loss": 4.3899,
"step": 35
},
{
"epoch": 0.014190687361419069,
"grad_norm": 13.246003272441062,
"learning_rate": 2.8346456692913386e-06,
"loss": 4.5221,
"step": 36
},
{
"epoch": 0.014584873121458487,
"grad_norm": 12.203731349756566,
"learning_rate": 2.9133858267716538e-06,
"loss": 4.3881,
"step": 37
},
{
"epoch": 0.014979058881497906,
"grad_norm": 14.042134575473153,
"learning_rate": 2.992125984251969e-06,
"loss": 4.2049,
"step": 38
},
{
"epoch": 0.015373244641537324,
"grad_norm": 11.055066543698642,
"learning_rate": 3.0708661417322837e-06,
"loss": 4.3104,
"step": 39
},
{
"epoch": 0.015767430401576743,
"grad_norm": 10.153496145045878,
"learning_rate": 3.1496062992125985e-06,
"loss": 4.3375,
"step": 40
},
{
"epoch": 0.01616161616161616,
"grad_norm": 10.799528438464218,
"learning_rate": 3.2283464566929136e-06,
"loss": 4.4063,
"step": 41
},
{
"epoch": 0.01655580192165558,
"grad_norm": 8.368548564564762,
"learning_rate": 3.307086614173229e-06,
"loss": 3.7956,
"step": 42
},
{
"epoch": 0.016949987681695,
"grad_norm": 10.759799829642327,
"learning_rate": 3.3858267716535436e-06,
"loss": 3.9338,
"step": 43
},
{
"epoch": 0.017344173441734417,
"grad_norm": 9.554117583184022,
"learning_rate": 3.4645669291338583e-06,
"loss": 3.8938,
"step": 44
},
{
"epoch": 0.017738359201773836,
"grad_norm": 23.01084152913365,
"learning_rate": 3.5433070866141735e-06,
"loss": 3.8921,
"step": 45
},
{
"epoch": 0.018132544961813254,
"grad_norm": 9.532765765693696,
"learning_rate": 3.6220472440944887e-06,
"loss": 3.9267,
"step": 46
},
{
"epoch": 0.018526730721852673,
"grad_norm": 7.676169667219361,
"learning_rate": 3.7007874015748035e-06,
"loss": 3.5909,
"step": 47
},
{
"epoch": 0.01892091648189209,
"grad_norm": 12.040351419125447,
"learning_rate": 3.7795275590551182e-06,
"loss": 3.9373,
"step": 48
},
{
"epoch": 0.01931510224193151,
"grad_norm": 8.25216993424453,
"learning_rate": 3.858267716535433e-06,
"loss": 3.5314,
"step": 49
},
{
"epoch": 0.01970928800197093,
"grad_norm": 7.474412198918091,
"learning_rate": 3.937007874015748e-06,
"loss": 3.506,
"step": 50
},
{
"epoch": 0.020103473762010347,
"grad_norm": 7.847621110877795,
"learning_rate": 4.015748031496064e-06,
"loss": 3.5028,
"step": 51
},
{
"epoch": 0.020497659522049766,
"grad_norm": 6.570956902449958,
"learning_rate": 4.0944881889763785e-06,
"loss": 3.4612,
"step": 52
},
{
"epoch": 0.020891845282089185,
"grad_norm": 5.5766242231172924,
"learning_rate": 4.173228346456693e-06,
"loss": 3.2965,
"step": 53
},
{
"epoch": 0.021286031042128603,
"grad_norm": 6.108165687578511,
"learning_rate": 4.251968503937008e-06,
"loss": 3.4297,
"step": 54
},
{
"epoch": 0.02168021680216802,
"grad_norm": 5.219670006640724,
"learning_rate": 4.330708661417324e-06,
"loss": 2.9365,
"step": 55
},
{
"epoch": 0.02207440256220744,
"grad_norm": 11.909762655268862,
"learning_rate": 4.4094488188976384e-06,
"loss": 3.3342,
"step": 56
},
{
"epoch": 0.02246858832224686,
"grad_norm": 6.039053713195223,
"learning_rate": 4.488188976377953e-06,
"loss": 3.1308,
"step": 57
},
{
"epoch": 0.022862774082286277,
"grad_norm": 6.330821449415944,
"learning_rate": 4.566929133858268e-06,
"loss": 3.1559,
"step": 58
},
{
"epoch": 0.023256959842325696,
"grad_norm": 5.850842944173947,
"learning_rate": 4.645669291338583e-06,
"loss": 3.1376,
"step": 59
},
{
"epoch": 0.023651145602365115,
"grad_norm": 6.618904157271684,
"learning_rate": 4.724409448818898e-06,
"loss": 3.1044,
"step": 60
},
{
"epoch": 0.024045331362404533,
"grad_norm": 12.768772667010369,
"learning_rate": 4.803149606299213e-06,
"loss": 2.8825,
"step": 61
},
{
"epoch": 0.02443951712244395,
"grad_norm": 7.679745085489206,
"learning_rate": 4.881889763779528e-06,
"loss": 3.0757,
"step": 62
},
{
"epoch": 0.02483370288248337,
"grad_norm": 4.427650604634613,
"learning_rate": 4.960629921259843e-06,
"loss": 2.8175,
"step": 63
},
{
"epoch": 0.02522788864252279,
"grad_norm": 6.028182477121757,
"learning_rate": 5.039370078740158e-06,
"loss": 2.998,
"step": 64
},
{
"epoch": 0.025622074402562207,
"grad_norm": 5.50324148915112,
"learning_rate": 5.118110236220473e-06,
"loss": 2.9141,
"step": 65
},
{
"epoch": 0.026016260162601626,
"grad_norm": 4.48735111430469,
"learning_rate": 5.196850393700788e-06,
"loss": 2.7909,
"step": 66
},
{
"epoch": 0.026410445922641045,
"grad_norm": 5.701752085492088,
"learning_rate": 5.2755905511811025e-06,
"loss": 2.8697,
"step": 67
},
{
"epoch": 0.026804631682680463,
"grad_norm": 9.227957681435909,
"learning_rate": 5.354330708661418e-06,
"loss": 2.6822,
"step": 68
},
{
"epoch": 0.02719881744271988,
"grad_norm": 5.786678373864676,
"learning_rate": 5.433070866141733e-06,
"loss": 2.7271,
"step": 69
},
{
"epoch": 0.0275930032027593,
"grad_norm": 4.652746279810885,
"learning_rate": 5.511811023622048e-06,
"loss": 2.7177,
"step": 70
},
{
"epoch": 0.02798718896279872,
"grad_norm": 6.252735777715452,
"learning_rate": 5.590551181102362e-06,
"loss": 2.8251,
"step": 71
},
{
"epoch": 0.028381374722838137,
"grad_norm": 5.151704866859134,
"learning_rate": 5.669291338582677e-06,
"loss": 2.6813,
"step": 72
},
{
"epoch": 0.028775560482877556,
"grad_norm": 4.337181405580127,
"learning_rate": 5.748031496062993e-06,
"loss": 2.4957,
"step": 73
},
{
"epoch": 0.029169746242916975,
"grad_norm": 5.91427046899434,
"learning_rate": 5.8267716535433075e-06,
"loss": 2.6815,
"step": 74
},
{
"epoch": 0.029563932002956393,
"grad_norm": 7.660058774479181,
"learning_rate": 5.905511811023622e-06,
"loss": 2.7335,
"step": 75
},
{
"epoch": 0.029958117762995812,
"grad_norm": 4.115441568706006,
"learning_rate": 5.984251968503938e-06,
"loss": 2.5424,
"step": 76
},
{
"epoch": 0.03035230352303523,
"grad_norm": 5.097053848951776,
"learning_rate": 6.062992125984253e-06,
"loss": 2.5098,
"step": 77
},
{
"epoch": 0.03074648928307465,
"grad_norm": 3.609880169600323,
"learning_rate": 6.141732283464567e-06,
"loss": 2.4653,
"step": 78
},
{
"epoch": 0.031140675043114067,
"grad_norm": 4.8790844537526326,
"learning_rate": 6.220472440944882e-06,
"loss": 2.5257,
"step": 79
},
{
"epoch": 0.031534860803153486,
"grad_norm": 5.766910080666288,
"learning_rate": 6.299212598425197e-06,
"loss": 2.5395,
"step": 80
},
{
"epoch": 0.031929046563192905,
"grad_norm": 5.536361935443466,
"learning_rate": 6.3779527559055125e-06,
"loss": 2.5367,
"step": 81
},
{
"epoch": 0.03232323232323232,
"grad_norm": 4.770127422423979,
"learning_rate": 6.456692913385827e-06,
"loss": 2.4774,
"step": 82
},
{
"epoch": 0.03271741808327174,
"grad_norm": 4.416647274076856,
"learning_rate": 6.535433070866142e-06,
"loss": 2.4903,
"step": 83
},
{
"epoch": 0.03311160384331116,
"grad_norm": 4.431530080181854,
"learning_rate": 6.614173228346458e-06,
"loss": 2.3936,
"step": 84
},
{
"epoch": 0.03350578960335058,
"grad_norm": 5.6472652822872895,
"learning_rate": 6.692913385826772e-06,
"loss": 2.4404,
"step": 85
},
{
"epoch": 0.03389997536339,
"grad_norm": 5.200598323481072,
"learning_rate": 6.771653543307087e-06,
"loss": 2.4376,
"step": 86
},
{
"epoch": 0.034294161123429416,
"grad_norm": 4.387657662515284,
"learning_rate": 6.850393700787402e-06,
"loss": 2.3363,
"step": 87
},
{
"epoch": 0.034688346883468835,
"grad_norm": 3.2185171323039192,
"learning_rate": 6.929133858267717e-06,
"loss": 2.2646,
"step": 88
},
{
"epoch": 0.03508253264350825,
"grad_norm": 8.73223179057534,
"learning_rate": 7.0078740157480315e-06,
"loss": 2.3927,
"step": 89
},
{
"epoch": 0.03547671840354767,
"grad_norm": 6.784545315493452,
"learning_rate": 7.086614173228347e-06,
"loss": 2.3697,
"step": 90
},
{
"epoch": 0.03587090416358709,
"grad_norm": 4.333450921434643,
"learning_rate": 7.165354330708662e-06,
"loss": 2.304,
"step": 91
},
{
"epoch": 0.03626508992362651,
"grad_norm": 5.218824764842207,
"learning_rate": 7.2440944881889774e-06,
"loss": 2.3646,
"step": 92
},
{
"epoch": 0.03665927568366593,
"grad_norm": 4.149232430620695,
"learning_rate": 7.322834645669292e-06,
"loss": 2.2622,
"step": 93
},
{
"epoch": 0.037053461443705346,
"grad_norm": 4.193773298248102,
"learning_rate": 7.401574803149607e-06,
"loss": 2.2887,
"step": 94
},
{
"epoch": 0.037447647203744765,
"grad_norm": 4.456311860549035,
"learning_rate": 7.480314960629922e-06,
"loss": 2.3007,
"step": 95
},
{
"epoch": 0.03784183296378418,
"grad_norm": 4.576460153117237,
"learning_rate": 7.5590551181102365e-06,
"loss": 2.3021,
"step": 96
},
{
"epoch": 0.0382360187238236,
"grad_norm": 8.479196171237232,
"learning_rate": 7.637795275590551e-06,
"loss": 2.4404,
"step": 97
},
{
"epoch": 0.03863020448386302,
"grad_norm": 7.433380505053241,
"learning_rate": 7.716535433070867e-06,
"loss": 2.2858,
"step": 98
},
{
"epoch": 0.03902439024390244,
"grad_norm": 9.169489148787575,
"learning_rate": 7.79527559055118e-06,
"loss": 2.2905,
"step": 99
},
{
"epoch": 0.03941857600394186,
"grad_norm": 4.505614703608414,
"learning_rate": 7.874015748031496e-06,
"loss": 2.2229,
"step": 100
},
{
"epoch": 0.039812761763981276,
"grad_norm": 3.251111002629772,
"learning_rate": 7.952755905511812e-06,
"loss": 2.1951,
"step": 101
},
{
"epoch": 0.040206947524020695,
"grad_norm": 4.118590361507865,
"learning_rate": 8.031496062992128e-06,
"loss": 2.271,
"step": 102
},
{
"epoch": 0.04060113328406011,
"grad_norm": 6.9488591196561815,
"learning_rate": 8.110236220472441e-06,
"loss": 2.3629,
"step": 103
},
{
"epoch": 0.04099531904409953,
"grad_norm": 3.5799197580937454,
"learning_rate": 8.188976377952757e-06,
"loss": 2.1602,
"step": 104
},
{
"epoch": 0.04138950480413895,
"grad_norm": 3.698515235577877,
"learning_rate": 8.267716535433071e-06,
"loss": 2.1759,
"step": 105
},
{
"epoch": 0.04178369056417837,
"grad_norm": 3.2516137577135646,
"learning_rate": 8.346456692913387e-06,
"loss": 2.2093,
"step": 106
},
{
"epoch": 0.04217787632421779,
"grad_norm": 3.910051851712546,
"learning_rate": 8.4251968503937e-06,
"loss": 2.2229,
"step": 107
},
{
"epoch": 0.042572062084257206,
"grad_norm": 3.7166583065715137,
"learning_rate": 8.503937007874016e-06,
"loss": 2.0932,
"step": 108
},
{
"epoch": 0.042966247844296625,
"grad_norm": 2.6575124301921873,
"learning_rate": 8.582677165354332e-06,
"loss": 2.12,
"step": 109
},
{
"epoch": 0.04336043360433604,
"grad_norm": 3.482590385246152,
"learning_rate": 8.661417322834647e-06,
"loss": 2.0901,
"step": 110
},
{
"epoch": 0.04375461936437546,
"grad_norm": 4.66548163032443,
"learning_rate": 8.740157480314961e-06,
"loss": 2.0983,
"step": 111
},
{
"epoch": 0.04414880512441488,
"grad_norm": 2.813248162118009,
"learning_rate": 8.818897637795277e-06,
"loss": 2.0084,
"step": 112
},
{
"epoch": 0.0445429908844543,
"grad_norm": 2.667639210004557,
"learning_rate": 8.89763779527559e-06,
"loss": 1.9983,
"step": 113
},
{
"epoch": 0.04493717664449372,
"grad_norm": 3.0839886525609463,
"learning_rate": 8.976377952755906e-06,
"loss": 2.0084,
"step": 114
},
{
"epoch": 0.045331362404533136,
"grad_norm": 3.000412565293289,
"learning_rate": 9.05511811023622e-06,
"loss": 1.9718,
"step": 115
},
{
"epoch": 0.045725548164572555,
"grad_norm": 4.642416950929853,
"learning_rate": 9.133858267716536e-06,
"loss": 1.9841,
"step": 116
},
{
"epoch": 0.04611973392461197,
"grad_norm": 2.3154794311302886,
"learning_rate": 9.212598425196852e-06,
"loss": 1.9743,
"step": 117
},
{
"epoch": 0.04651391968465139,
"grad_norm": 2.545829361546042,
"learning_rate": 9.291338582677165e-06,
"loss": 1.9539,
"step": 118
},
{
"epoch": 0.04690810544469081,
"grad_norm": 2.974703874097749,
"learning_rate": 9.370078740157481e-06,
"loss": 1.91,
"step": 119
},
{
"epoch": 0.04730229120473023,
"grad_norm": 2.797427125263561,
"learning_rate": 9.448818897637797e-06,
"loss": 1.9065,
"step": 120
},
{
"epoch": 0.04769647696476965,
"grad_norm": 4.324127605691098,
"learning_rate": 9.52755905511811e-06,
"loss": 1.9863,
"step": 121
},
{
"epoch": 0.048090662724809066,
"grad_norm": 3.2983025416162945,
"learning_rate": 9.606299212598426e-06,
"loss": 1.9546,
"step": 122
},
{
"epoch": 0.048484848484848485,
"grad_norm": 2.2657892364343017,
"learning_rate": 9.68503937007874e-06,
"loss": 1.848,
"step": 123
},
{
"epoch": 0.0488790342448879,
"grad_norm": 3.2601787777289437,
"learning_rate": 9.763779527559056e-06,
"loss": 1.9285,
"step": 124
},
{
"epoch": 0.04927322000492732,
"grad_norm": 3.5243072214231583,
"learning_rate": 9.842519685039371e-06,
"loss": 1.8762,
"step": 125
},
{
"epoch": 0.04966740576496674,
"grad_norm": 3.3017593501688394,
"learning_rate": 9.921259842519685e-06,
"loss": 1.8601,
"step": 126
},
{
"epoch": 0.05006159152500616,
"grad_norm": 3.2653646060771444,
"learning_rate": 1e-05,
"loss": 1.8686,
"step": 127
},
{
"epoch": 0.05045577728504558,
"grad_norm": 2.681802464673681,
"learning_rate": 1.0078740157480316e-05,
"loss": 1.8302,
"step": 128
},
{
"epoch": 0.050849963045084996,
"grad_norm": 3.133929350491433,
"learning_rate": 1.015748031496063e-05,
"loss": 1.8372,
"step": 129
},
{
"epoch": 0.051244148805124415,
"grad_norm": 2.534354682692382,
"learning_rate": 1.0236220472440946e-05,
"loss": 1.8715,
"step": 130
},
{
"epoch": 0.05163833456516383,
"grad_norm": 3.0493154042368023,
"learning_rate": 1.031496062992126e-05,
"loss": 1.8485,
"step": 131
},
{
"epoch": 0.05203252032520325,
"grad_norm": 2.0799972512373834,
"learning_rate": 1.0393700787401575e-05,
"loss": 1.7866,
"step": 132
},
{
"epoch": 0.05242670608524267,
"grad_norm": 1.598403007988912,
"learning_rate": 1.047244094488189e-05,
"loss": 1.8013,
"step": 133
},
{
"epoch": 0.05282089184528209,
"grad_norm": 1.91178664275519,
"learning_rate": 1.0551181102362205e-05,
"loss": 1.8741,
"step": 134
},
{
"epoch": 0.05321507760532151,
"grad_norm": 2.1365165713401906,
"learning_rate": 1.0629921259842522e-05,
"loss": 1.7989,
"step": 135
},
{
"epoch": 0.053609263365360926,
"grad_norm": 2.6948885430012655,
"learning_rate": 1.0708661417322836e-05,
"loss": 1.7984,
"step": 136
},
{
"epoch": 0.054003449125400345,
"grad_norm": 1.8504724810176718,
"learning_rate": 1.0787401574803152e-05,
"loss": 1.7789,
"step": 137
},
{
"epoch": 0.05439763488543976,
"grad_norm": 1.992151255132755,
"learning_rate": 1.0866141732283466e-05,
"loss": 1.803,
"step": 138
},
{
"epoch": 0.05479182064547918,
"grad_norm": 3.10045850302244,
"learning_rate": 1.0944881889763781e-05,
"loss": 1.823,
"step": 139
},
{
"epoch": 0.0551860064055186,
"grad_norm": 2.2624346551381085,
"learning_rate": 1.1023622047244095e-05,
"loss": 1.7608,
"step": 140
},
{
"epoch": 0.05558019216555802,
"grad_norm": 1.9683772470424854,
"learning_rate": 1.1102362204724411e-05,
"loss": 1.8037,
"step": 141
},
{
"epoch": 0.05597437792559744,
"grad_norm": 3.26220140428376,
"learning_rate": 1.1181102362204725e-05,
"loss": 1.7765,
"step": 142
},
{
"epoch": 0.056368563685636856,
"grad_norm": 4.4068981319414595,
"learning_rate": 1.125984251968504e-05,
"loss": 1.8472,
"step": 143
},
{
"epoch": 0.056762749445676275,
"grad_norm": 1.6987954071831348,
"learning_rate": 1.1338582677165354e-05,
"loss": 1.7572,
"step": 144
},
{
"epoch": 0.057156935205715693,
"grad_norm": 1.847159040073359,
"learning_rate": 1.141732283464567e-05,
"loss": 1.6803,
"step": 145
},
{
"epoch": 0.05755112096575511,
"grad_norm": 2.6708041585740596,
"learning_rate": 1.1496062992125985e-05,
"loss": 1.8088,
"step": 146
},
{
"epoch": 0.05794530672579453,
"grad_norm": 1.9604986339037445,
"learning_rate": 1.15748031496063e-05,
"loss": 1.7155,
"step": 147
},
{
"epoch": 0.05833949248583395,
"grad_norm": 1.6691911028581192,
"learning_rate": 1.1653543307086615e-05,
"loss": 1.7748,
"step": 148
},
{
"epoch": 0.05873367824587337,
"grad_norm": 7.3318925396826895,
"learning_rate": 1.1732283464566929e-05,
"loss": 1.7572,
"step": 149
},
{
"epoch": 0.059127864005912786,
"grad_norm": 2.283850168056605,
"learning_rate": 1.1811023622047245e-05,
"loss": 1.7774,
"step": 150
},
{
"epoch": 0.059522049765952205,
"grad_norm": 1.8019088514589012,
"learning_rate": 1.1889763779527562e-05,
"loss": 1.7786,
"step": 151
},
{
"epoch": 0.059916235525991624,
"grad_norm": 1.3816061587980675,
"learning_rate": 1.1968503937007876e-05,
"loss": 1.7504,
"step": 152
},
{
"epoch": 0.06031042128603104,
"grad_norm": 5.720763322290118,
"learning_rate": 1.2047244094488191e-05,
"loss": 1.8016,
"step": 153
},
{
"epoch": 0.06070460704607046,
"grad_norm": 3.3964912544422994,
"learning_rate": 1.2125984251968505e-05,
"loss": 1.6964,
"step": 154
},
{
"epoch": 0.06109879280610988,
"grad_norm": 1.7844098526259298,
"learning_rate": 1.2204724409448821e-05,
"loss": 1.7561,
"step": 155
},
{
"epoch": 0.0614929785661493,
"grad_norm": 1.6826530766646766,
"learning_rate": 1.2283464566929135e-05,
"loss": 1.7069,
"step": 156
},
{
"epoch": 0.061887164326188716,
"grad_norm": 3.4647919464333152,
"learning_rate": 1.236220472440945e-05,
"loss": 1.7096,
"step": 157
},
{
"epoch": 0.062281350086228135,
"grad_norm": 2.0613781006838243,
"learning_rate": 1.2440944881889764e-05,
"loss": 1.732,
"step": 158
},
{
"epoch": 0.06267553584626756,
"grad_norm": 1.9503601214626853,
"learning_rate": 1.251968503937008e-05,
"loss": 1.7402,
"step": 159
},
{
"epoch": 0.06306972160630697,
"grad_norm": 1.8504549835287638,
"learning_rate": 1.2598425196850394e-05,
"loss": 1.7003,
"step": 160
},
{
"epoch": 0.0634639073663464,
"grad_norm": 2.07948846446986,
"learning_rate": 1.267716535433071e-05,
"loss": 1.7004,
"step": 161
},
{
"epoch": 0.06385809312638581,
"grad_norm": 1.7485726412453775,
"learning_rate": 1.2755905511811025e-05,
"loss": 1.725,
"step": 162
},
{
"epoch": 0.06425227888642523,
"grad_norm": 1.7868478014046527,
"learning_rate": 1.2834645669291339e-05,
"loss": 1.6828,
"step": 163
},
{
"epoch": 0.06464646464646465,
"grad_norm": 1.4524583527842783,
"learning_rate": 1.2913385826771655e-05,
"loss": 1.726,
"step": 164
},
{
"epoch": 0.06504065040650407,
"grad_norm": 1.5085438907961388,
"learning_rate": 1.2992125984251968e-05,
"loss": 1.6417,
"step": 165
},
{
"epoch": 0.06543483616654348,
"grad_norm": 1.5307066166089378,
"learning_rate": 1.3070866141732284e-05,
"loss": 1.6291,
"step": 166
},
{
"epoch": 0.06582902192658291,
"grad_norm": 1.5549360763645417,
"learning_rate": 1.3149606299212601e-05,
"loss": 1.6966,
"step": 167
},
{
"epoch": 0.06622320768662232,
"grad_norm": 2.1633140111873272,
"learning_rate": 1.3228346456692915e-05,
"loss": 1.5821,
"step": 168
},
{
"epoch": 0.06661739344666175,
"grad_norm": 1.4726739949688163,
"learning_rate": 1.3307086614173231e-05,
"loss": 1.6008,
"step": 169
},
{
"epoch": 0.06701157920670116,
"grad_norm": 1.933336638607143,
"learning_rate": 1.3385826771653545e-05,
"loss": 1.6237,
"step": 170
},
{
"epoch": 0.06740576496674058,
"grad_norm": 1.53709942550425,
"learning_rate": 1.346456692913386e-05,
"loss": 1.6603,
"step": 171
},
{
"epoch": 0.06779995072678,
"grad_norm": 5.838182266578105,
"learning_rate": 1.3543307086614174e-05,
"loss": 1.7374,
"step": 172
},
{
"epoch": 0.06819413648681942,
"grad_norm": 2.1077670495936105,
"learning_rate": 1.362204724409449e-05,
"loss": 1.6751,
"step": 173
},
{
"epoch": 0.06858832224685883,
"grad_norm": 1.79478201657228,
"learning_rate": 1.3700787401574804e-05,
"loss": 1.6147,
"step": 174
},
{
"epoch": 0.06898250800689826,
"grad_norm": 1.3332167033318783,
"learning_rate": 1.377952755905512e-05,
"loss": 1.6174,
"step": 175
},
{
"epoch": 0.06937669376693767,
"grad_norm": 1.3613261661051188,
"learning_rate": 1.3858267716535433e-05,
"loss": 1.6226,
"step": 176
},
{
"epoch": 0.0697708795269771,
"grad_norm": 1.4747645759596355,
"learning_rate": 1.3937007874015749e-05,
"loss": 1.6831,
"step": 177
},
{
"epoch": 0.0701650652870165,
"grad_norm": 1.2750429533681837,
"learning_rate": 1.4015748031496063e-05,
"loss": 1.7002,
"step": 178
},
{
"epoch": 0.07055925104705593,
"grad_norm": 1.5316341355433367,
"learning_rate": 1.4094488188976379e-05,
"loss": 1.6778,
"step": 179
},
{
"epoch": 0.07095343680709534,
"grad_norm": 1.5302517303234198,
"learning_rate": 1.4173228346456694e-05,
"loss": 1.661,
"step": 180
},
{
"epoch": 0.07134762256713477,
"grad_norm": 1.4890855169186785,
"learning_rate": 1.4251968503937008e-05,
"loss": 1.6873,
"step": 181
},
{
"epoch": 0.07174180832717418,
"grad_norm": 1.4685898866854017,
"learning_rate": 1.4330708661417324e-05,
"loss": 1.6183,
"step": 182
},
{
"epoch": 0.0721359940872136,
"grad_norm": 1.1931151423557926,
"learning_rate": 1.440944881889764e-05,
"loss": 1.6106,
"step": 183
},
{
"epoch": 0.07253017984725302,
"grad_norm": 1.2548801700230896,
"learning_rate": 1.4488188976377955e-05,
"loss": 1.6201,
"step": 184
},
{
"epoch": 0.07292436560729244,
"grad_norm": 1.316626084569457,
"learning_rate": 1.456692913385827e-05,
"loss": 1.6652,
"step": 185
},
{
"epoch": 0.07331855136733186,
"grad_norm": 5.515174587786105,
"learning_rate": 1.4645669291338584e-05,
"loss": 1.6672,
"step": 186
},
{
"epoch": 0.07371273712737128,
"grad_norm": 1.2435134387010485,
"learning_rate": 1.47244094488189e-05,
"loss": 1.5948,
"step": 187
},
{
"epoch": 0.07410692288741069,
"grad_norm": 1.27329799921956,
"learning_rate": 1.4803149606299214e-05,
"loss": 1.6548,
"step": 188
},
{
"epoch": 0.07450110864745012,
"grad_norm": 1.2399973778980402,
"learning_rate": 1.488188976377953e-05,
"loss": 1.604,
"step": 189
},
{
"epoch": 0.07489529440748953,
"grad_norm": 2.394011363721175,
"learning_rate": 1.4960629921259843e-05,
"loss": 1.6027,
"step": 190
},
{
"epoch": 0.07528948016752895,
"grad_norm": 1.3778750181373447,
"learning_rate": 1.5039370078740159e-05,
"loss": 1.6389,
"step": 191
},
{
"epoch": 0.07568366592756837,
"grad_norm": 1.5441433369147584,
"learning_rate": 1.5118110236220473e-05,
"loss": 1.6183,
"step": 192
},
{
"epoch": 0.07607785168760779,
"grad_norm": 4.415312664776792,
"learning_rate": 1.5196850393700789e-05,
"loss": 1.5881,
"step": 193
},
{
"epoch": 0.0764720374476472,
"grad_norm": 1.6220189908817373,
"learning_rate": 1.5275590551181102e-05,
"loss": 1.689,
"step": 194
},
{
"epoch": 0.07686622320768663,
"grad_norm": 1.2264711147522527,
"learning_rate": 1.5354330708661416e-05,
"loss": 1.5776,
"step": 195
},
{
"epoch": 0.07726040896772604,
"grad_norm": 1.2490481285394455,
"learning_rate": 1.5433070866141734e-05,
"loss": 1.6122,
"step": 196
},
{
"epoch": 0.07765459472776547,
"grad_norm": 1.2303899509527259,
"learning_rate": 1.5511811023622048e-05,
"loss": 1.5495,
"step": 197
},
{
"epoch": 0.07804878048780488,
"grad_norm": 3.4482635126365997,
"learning_rate": 1.559055118110236e-05,
"loss": 1.6351,
"step": 198
},
{
"epoch": 0.0784429662478443,
"grad_norm": 1.4430016707011335,
"learning_rate": 1.566929133858268e-05,
"loss": 1.5224,
"step": 199
},
{
"epoch": 0.07883715200788372,
"grad_norm": 1.258723675384828,
"learning_rate": 1.5748031496062993e-05,
"loss": 1.5626,
"step": 200
},
{
"epoch": 0.07923133776792314,
"grad_norm": 1.5678661529755662,
"learning_rate": 1.582677165354331e-05,
"loss": 1.5783,
"step": 201
},
{
"epoch": 0.07962552352796255,
"grad_norm": 2.1867650050329535,
"learning_rate": 1.5905511811023624e-05,
"loss": 1.5969,
"step": 202
},
{
"epoch": 0.08001970928800198,
"grad_norm": 1.2889311434591015,
"learning_rate": 1.5984251968503938e-05,
"loss": 1.564,
"step": 203
},
{
"epoch": 0.08041389504804139,
"grad_norm": 1.1654066224514485,
"learning_rate": 1.6062992125984255e-05,
"loss": 1.5517,
"step": 204
},
{
"epoch": 0.08080808080808081,
"grad_norm": 1.2834026840027142,
"learning_rate": 1.614173228346457e-05,
"loss": 1.5784,
"step": 205
},
{
"epoch": 0.08120226656812023,
"grad_norm": 1.097147109752616,
"learning_rate": 1.6220472440944883e-05,
"loss": 1.593,
"step": 206
},
{
"epoch": 0.08159645232815965,
"grad_norm": 1.0826077251947002,
"learning_rate": 1.6299212598425197e-05,
"loss": 1.6672,
"step": 207
},
{
"epoch": 0.08199063808819906,
"grad_norm": 1.1105586301185173,
"learning_rate": 1.6377952755905514e-05,
"loss": 1.6279,
"step": 208
},
{
"epoch": 0.08238482384823849,
"grad_norm": 1.0509746948712066,
"learning_rate": 1.6456692913385828e-05,
"loss": 1.5676,
"step": 209
},
{
"epoch": 0.0827790096082779,
"grad_norm": 1.0983909936032894,
"learning_rate": 1.6535433070866142e-05,
"loss": 1.5829,
"step": 210
},
{
"epoch": 0.08317319536831733,
"grad_norm": 5.99007589257119,
"learning_rate": 1.6614173228346456e-05,
"loss": 1.7761,
"step": 211
},
{
"epoch": 0.08356738112835674,
"grad_norm": 1.2452212459257412,
"learning_rate": 1.6692913385826773e-05,
"loss": 1.6174,
"step": 212
},
{
"epoch": 0.08396156688839616,
"grad_norm": 1.2716752881032753,
"learning_rate": 1.6771653543307087e-05,
"loss": 1.5855,
"step": 213
},
{
"epoch": 0.08435575264843558,
"grad_norm": 1.1250735671327408,
"learning_rate": 1.68503937007874e-05,
"loss": 1.6358,
"step": 214
},
{
"epoch": 0.084749938408475,
"grad_norm": 1.2260081131211942,
"learning_rate": 1.692913385826772e-05,
"loss": 1.5142,
"step": 215
},
{
"epoch": 0.08514412416851441,
"grad_norm": 1.1674035474423037,
"learning_rate": 1.7007874015748032e-05,
"loss": 1.57,
"step": 216
},
{
"epoch": 0.08553830992855384,
"grad_norm": 1.2049471298049268,
"learning_rate": 1.708661417322835e-05,
"loss": 1.535,
"step": 217
},
{
"epoch": 0.08593249568859325,
"grad_norm": 1.0593135540735228,
"learning_rate": 1.7165354330708663e-05,
"loss": 1.5262,
"step": 218
},
{
"epoch": 0.08632668144863268,
"grad_norm": 1.2230277479432223,
"learning_rate": 1.7244094488188977e-05,
"loss": 1.4963,
"step": 219
},
{
"epoch": 0.08672086720867209,
"grad_norm": 1.0841400801567742,
"learning_rate": 1.7322834645669295e-05,
"loss": 1.464,
"step": 220
},
{
"epoch": 0.08711505296871151,
"grad_norm": 1.0657721135903946,
"learning_rate": 1.740157480314961e-05,
"loss": 1.5183,
"step": 221
},
{
"epoch": 0.08750923872875092,
"grad_norm": 1.0176332279317757,
"learning_rate": 1.7480314960629923e-05,
"loss": 1.5272,
"step": 222
},
{
"epoch": 0.08790342448879035,
"grad_norm": 1.0202676847155607,
"learning_rate": 1.7559055118110236e-05,
"loss": 1.5327,
"step": 223
},
{
"epoch": 0.08829761024882976,
"grad_norm": 6.425041690617794,
"learning_rate": 1.7637795275590554e-05,
"loss": 1.5531,
"step": 224
},
{
"epoch": 0.08869179600886919,
"grad_norm": 1.1786231403068714,
"learning_rate": 1.7716535433070868e-05,
"loss": 1.5453,
"step": 225
},
{
"epoch": 0.0890859817689086,
"grad_norm": 1.2325207985267532,
"learning_rate": 1.779527559055118e-05,
"loss": 1.6243,
"step": 226
},
{
"epoch": 0.08948016752894802,
"grad_norm": 2.8120821758652292,
"learning_rate": 1.7874015748031495e-05,
"loss": 1.5169,
"step": 227
},
{
"epoch": 0.08987435328898744,
"grad_norm": 1.1463382537995392,
"learning_rate": 1.7952755905511813e-05,
"loss": 1.5332,
"step": 228
},
{
"epoch": 0.09026853904902686,
"grad_norm": 1.0849881708965645,
"learning_rate": 1.8031496062992127e-05,
"loss": 1.5723,
"step": 229
},
{
"epoch": 0.09066272480906627,
"grad_norm": 1.1666290000579271,
"learning_rate": 1.811023622047244e-05,
"loss": 1.5618,
"step": 230
},
{
"epoch": 0.0910569105691057,
"grad_norm": 1.2015436620694524,
"learning_rate": 1.8188976377952758e-05,
"loss": 1.4479,
"step": 231
},
{
"epoch": 0.09145109632914511,
"grad_norm": 1.1770257502445032,
"learning_rate": 1.8267716535433072e-05,
"loss": 1.4907,
"step": 232
},
{
"epoch": 0.09184528208918454,
"grad_norm": 1.1626480865358226,
"learning_rate": 1.834645669291339e-05,
"loss": 1.5504,
"step": 233
},
{
"epoch": 0.09223946784922395,
"grad_norm": 1.06078382485064,
"learning_rate": 1.8425196850393703e-05,
"loss": 1.4953,
"step": 234
},
{
"epoch": 0.09263365360926337,
"grad_norm": 1.0930777847490591,
"learning_rate": 1.8503937007874017e-05,
"loss": 1.5751,
"step": 235
},
{
"epoch": 0.09302783936930278,
"grad_norm": 1.0032128686122703,
"learning_rate": 1.858267716535433e-05,
"loss": 1.5573,
"step": 236
},
{
"epoch": 0.09342202512934221,
"grad_norm": 1.316223586320374,
"learning_rate": 1.8661417322834648e-05,
"loss": 1.5121,
"step": 237
},
{
"epoch": 0.09381621088938162,
"grad_norm": 1.2482520651605957,
"learning_rate": 1.8740157480314962e-05,
"loss": 1.5444,
"step": 238
},
{
"epoch": 0.09421039664942105,
"grad_norm": 1.0596918045491734,
"learning_rate": 1.8818897637795276e-05,
"loss": 1.5212,
"step": 239
},
{
"epoch": 0.09460458240946046,
"grad_norm": 10.230035305602996,
"learning_rate": 1.8897637795275593e-05,
"loss": 1.5136,
"step": 240
},
{
"epoch": 0.09499876816949988,
"grad_norm": 1.7311033327684602,
"learning_rate": 1.8976377952755907e-05,
"loss": 1.5087,
"step": 241
},
{
"epoch": 0.0953929539295393,
"grad_norm": 1.3327399439783965,
"learning_rate": 1.905511811023622e-05,
"loss": 1.5182,
"step": 242
},
{
"epoch": 0.09578713968957872,
"grad_norm": 1.0615025753084397,
"learning_rate": 1.9133858267716535e-05,
"loss": 1.5321,
"step": 243
},
{
"epoch": 0.09618132544961813,
"grad_norm": 1.174065978180721,
"learning_rate": 1.9212598425196852e-05,
"loss": 1.4981,
"step": 244
},
{
"epoch": 0.09657551120965756,
"grad_norm": 1.0837767684996553,
"learning_rate": 1.9291338582677166e-05,
"loss": 1.4733,
"step": 245
},
{
"epoch": 0.09696969696969697,
"grad_norm": 1.0744329648400928,
"learning_rate": 1.937007874015748e-05,
"loss": 1.5172,
"step": 246
},
{
"epoch": 0.0973638827297364,
"grad_norm": 1.0479477955815488,
"learning_rate": 1.9448818897637797e-05,
"loss": 1.4767,
"step": 247
},
{
"epoch": 0.0977580684897758,
"grad_norm": 0.9622167177031952,
"learning_rate": 1.952755905511811e-05,
"loss": 1.5212,
"step": 248
},
{
"epoch": 0.09815225424981523,
"grad_norm": 2.2109867243739867,
"learning_rate": 1.960629921259843e-05,
"loss": 1.534,
"step": 249
},
{
"epoch": 0.09854644000985464,
"grad_norm": 1.0330950105773389,
"learning_rate": 1.9685039370078743e-05,
"loss": 1.4988,
"step": 250
},
{
"epoch": 0.09894062576989407,
"grad_norm": 1.2543876260436326,
"learning_rate": 1.9763779527559057e-05,
"loss": 1.5515,
"step": 251
},
{
"epoch": 0.09933481152993348,
"grad_norm": 1.0907032902576081,
"learning_rate": 1.984251968503937e-05,
"loss": 1.4944,
"step": 252
},
{
"epoch": 0.0997289972899729,
"grad_norm": 0.9800946085411166,
"learning_rate": 1.9921259842519688e-05,
"loss": 1.4594,
"step": 253
},
{
"epoch": 0.10012318305001232,
"grad_norm": 1.005840927677052,
"learning_rate": 2e-05,
"loss": 1.5125,
"step": 254
},
{
"epoch": 0.10051736881005174,
"grad_norm": 0.9877177677204181,
"learning_rate": 1.9999990523708736e-05,
"loss": 1.4953,
"step": 255
},
{
"epoch": 0.10091155457009116,
"grad_norm": 1.101690731617668,
"learning_rate": 1.999996209485289e-05,
"loss": 1.5291,
"step": 256
},
{
"epoch": 0.10130574033013058,
"grad_norm": 1.056828743252167,
"learning_rate": 1.9999914713486344e-05,
"loss": 1.546,
"step": 257
},
{
"epoch": 0.10169992609016999,
"grad_norm": 1.0379730842348571,
"learning_rate": 1.9999848379698906e-05,
"loss": 1.5252,
"step": 258
},
{
"epoch": 0.10209411185020942,
"grad_norm": 0.9403586150467369,
"learning_rate": 1.999976309361629e-05,
"loss": 1.4487,
"step": 259
},
{
"epoch": 0.10248829761024883,
"grad_norm": 0.9899974982933676,
"learning_rate": 1.9999658855400135e-05,
"loss": 1.4721,
"step": 260
},
{
"epoch": 0.10288248337028826,
"grad_norm": 1.8364244542987356,
"learning_rate": 1.9999535665248e-05,
"loss": 1.5609,
"step": 261
},
{
"epoch": 0.10327666913032767,
"grad_norm": 1.0844452490408925,
"learning_rate": 1.9999393523393365e-05,
"loss": 1.4418,
"step": 262
},
{
"epoch": 0.10367085489036709,
"grad_norm": 0.9972732800206876,
"learning_rate": 1.9999232430105618e-05,
"loss": 1.4595,
"step": 263
},
{
"epoch": 0.1040650406504065,
"grad_norm": 1.0507646311810663,
"learning_rate": 1.999905238569008e-05,
"loss": 1.5172,
"step": 264
},
{
"epoch": 0.10445922641044593,
"grad_norm": 1.095556225355519,
"learning_rate": 1.999885339048798e-05,
"loss": 1.4543,
"step": 265
},
{
"epoch": 0.10485341217048534,
"grad_norm": 1.5429221372847546,
"learning_rate": 1.999863544487646e-05,
"loss": 1.4856,
"step": 266
},
{
"epoch": 0.10524759793052477,
"grad_norm": 1.2099357188247561,
"learning_rate": 1.9998398549268594e-05,
"loss": 1.5493,
"step": 267
},
{
"epoch": 0.10564178369056418,
"grad_norm": 0.935834153327994,
"learning_rate": 1.999814270411335e-05,
"loss": 1.4679,
"step": 268
},
{
"epoch": 0.1060359694506036,
"grad_norm": 0.9438202964074678,
"learning_rate": 1.9997867909895626e-05,
"loss": 1.4995,
"step": 269
},
{
"epoch": 0.10643015521064302,
"grad_norm": 1.033515015322255,
"learning_rate": 1.9997574167136225e-05,
"loss": 1.5551,
"step": 270
},
{
"epoch": 0.10682434097068244,
"grad_norm": 0.9370254571893236,
"learning_rate": 1.9997261476391867e-05,
"loss": 1.4224,
"step": 271
},
{
"epoch": 0.10721852673072185,
"grad_norm": 0.8669854368917412,
"learning_rate": 1.999692983825518e-05,
"loss": 1.4123,
"step": 272
},
{
"epoch": 0.10761271249076128,
"grad_norm": 0.944767717267722,
"learning_rate": 1.999657925335471e-05,
"loss": 1.4617,
"step": 273
},
{
"epoch": 0.10800689825080069,
"grad_norm": 0.8918613394976922,
"learning_rate": 1.9996209722354896e-05,
"loss": 1.4717,
"step": 274
},
{
"epoch": 0.10840108401084012,
"grad_norm": 0.8601703235721511,
"learning_rate": 1.99958212459561e-05,
"loss": 1.4932,
"step": 275
},
{
"epoch": 0.10879526977087953,
"grad_norm": 0.8947009718973543,
"learning_rate": 1.9995413824894593e-05,
"loss": 1.4279,
"step": 276
},
{
"epoch": 0.10918945553091895,
"grad_norm": 0.9310105648146282,
"learning_rate": 1.9994987459942528e-05,
"loss": 1.4802,
"step": 277
},
{
"epoch": 0.10958364129095836,
"grad_norm": 0.8501846281501174,
"learning_rate": 1.9994542151907988e-05,
"loss": 1.4749,
"step": 278
},
{
"epoch": 0.10997782705099779,
"grad_norm": 1.0075642218200616,
"learning_rate": 1.999407790163494e-05,
"loss": 1.4024,
"step": 279
},
{
"epoch": 0.1103720128110372,
"grad_norm": 0.8724020295218536,
"learning_rate": 1.9993594710003262e-05,
"loss": 1.4781,
"step": 280
},
{
"epoch": 0.11076619857107663,
"grad_norm": 0.9028708477460494,
"learning_rate": 1.9993092577928725e-05,
"loss": 1.4662,
"step": 281
},
{
"epoch": 0.11116038433111604,
"grad_norm": 0.9000611147078907,
"learning_rate": 1.9992571506362997e-05,
"loss": 1.5075,
"step": 282
},
{
"epoch": 0.11155457009115546,
"grad_norm": 0.8987129723251234,
"learning_rate": 1.9992031496293652e-05,
"loss": 1.4287,
"step": 283
},
{
"epoch": 0.11194875585119488,
"grad_norm": 0.9407581537583124,
"learning_rate": 1.999147254874414e-05,
"loss": 1.4692,
"step": 284
},
{
"epoch": 0.1123429416112343,
"grad_norm": 0.8489305931721897,
"learning_rate": 1.999089466477381e-05,
"loss": 1.4033,
"step": 285
},
{
"epoch": 0.11273712737127371,
"grad_norm": 0.9701130113270408,
"learning_rate": 1.999029784547791e-05,
"loss": 1.4633,
"step": 286
},
{
"epoch": 0.11313131313131314,
"grad_norm": 0.9645372818337129,
"learning_rate": 1.9989682091987558e-05,
"loss": 1.4762,
"step": 287
},
{
"epoch": 0.11352549889135255,
"grad_norm": 0.8958997231087552,
"learning_rate": 1.9989047405469772e-05,
"loss": 1.4915,
"step": 288
},
{
"epoch": 0.11391968465139198,
"grad_norm": 0.8671815258371959,
"learning_rate": 1.9988393787127444e-05,
"loss": 1.4463,
"step": 289
},
{
"epoch": 0.11431387041143139,
"grad_norm": 0.8618517053204878,
"learning_rate": 1.9987721238199345e-05,
"loss": 1.4234,
"step": 290
},
{
"epoch": 0.11470805617147081,
"grad_norm": 0.8902785836218885,
"learning_rate": 1.9987029759960142e-05,
"loss": 1.4214,
"step": 291
},
{
"epoch": 0.11510224193151022,
"grad_norm": 0.8858117437885646,
"learning_rate": 1.9986319353720353e-05,
"loss": 1.3894,
"step": 292
},
{
"epoch": 0.11549642769154965,
"grad_norm": 0.8611263833038788,
"learning_rate": 1.9985590020826382e-05,
"loss": 1.4862,
"step": 293
},
{
"epoch": 0.11589061345158906,
"grad_norm": 0.8533778158931522,
"learning_rate": 1.9984841762660508e-05,
"loss": 1.4738,
"step": 294
},
{
"epoch": 0.11628479921162849,
"grad_norm": 0.9054080637678216,
"learning_rate": 1.998407458064087e-05,
"loss": 1.4873,
"step": 295
},
{
"epoch": 0.1166789849716679,
"grad_norm": 0.8562878911122067,
"learning_rate": 1.9983288476221482e-05,
"loss": 1.4897,
"step": 296
},
{
"epoch": 0.11707317073170732,
"grad_norm": 0.8857579006622172,
"learning_rate": 1.9982483450892206e-05,
"loss": 1.4916,
"step": 297
},
{
"epoch": 0.11746735649174674,
"grad_norm": 0.8253228858932441,
"learning_rate": 1.9981659506178778e-05,
"loss": 1.3489,
"step": 298
},
{
"epoch": 0.11786154225178616,
"grad_norm": 0.9323194384008091,
"learning_rate": 1.9980816643642787e-05,
"loss": 1.5008,
"step": 299
},
{
"epoch": 0.11825572801182557,
"grad_norm": 1.0570822985529353,
"learning_rate": 1.9979954864881672e-05,
"loss": 1.4554,
"step": 300
},
{
"epoch": 0.118649913771865,
"grad_norm": 0.9247735264199164,
"learning_rate": 1.997907417152873e-05,
"loss": 1.4352,
"step": 301
},
{
"epoch": 0.11904409953190441,
"grad_norm": 0.9467585491612563,
"learning_rate": 1.9978174565253096e-05,
"loss": 1.4937,
"step": 302
},
{
"epoch": 0.11943828529194384,
"grad_norm": 0.9054242752625036,
"learning_rate": 1.9977256047759765e-05,
"loss": 1.4672,
"step": 303
},
{
"epoch": 0.11983247105198325,
"grad_norm": 0.8664782098266539,
"learning_rate": 1.997631862078956e-05,
"loss": 1.4183,
"step": 304
},
{
"epoch": 0.12022665681202267,
"grad_norm": 0.8736218550959834,
"learning_rate": 1.9975362286119145e-05,
"loss": 1.4379,
"step": 305
},
{
"epoch": 0.12062084257206208,
"grad_norm": 0.899159416016424,
"learning_rate": 1.9974387045561022e-05,
"loss": 1.4688,
"step": 306
},
{
"epoch": 0.12101502833210151,
"grad_norm": 0.9132102225776563,
"learning_rate": 1.997339290096353e-05,
"loss": 1.4195,
"step": 307
},
{
"epoch": 0.12140921409214092,
"grad_norm": 0.9022509743935889,
"learning_rate": 1.9972379854210824e-05,
"loss": 1.5341,
"step": 308
},
{
"epoch": 0.12180339985218035,
"grad_norm": 0.8909667554707213,
"learning_rate": 1.997134790722289e-05,
"loss": 1.3896,
"step": 309
},
{
"epoch": 0.12219758561221976,
"grad_norm": 0.810957265048853,
"learning_rate": 1.9970297061955533e-05,
"loss": 1.3607,
"step": 310
},
{
"epoch": 0.12259177137225918,
"grad_norm": 0.8624805968721132,
"learning_rate": 1.996922732040038e-05,
"loss": 1.433,
"step": 311
},
{
"epoch": 0.1229859571322986,
"grad_norm": 0.9012262047132807,
"learning_rate": 1.9968138684584862e-05,
"loss": 1.4337,
"step": 312
},
{
"epoch": 0.12338014289233802,
"grad_norm": 0.8600494551649118,
"learning_rate": 1.9967031156572233e-05,
"loss": 1.3947,
"step": 313
},
{
"epoch": 0.12377432865237743,
"grad_norm": 0.8744528870589704,
"learning_rate": 1.9965904738461534e-05,
"loss": 1.4945,
"step": 314
},
{
"epoch": 0.12416851441241686,
"grad_norm": 0.8875872891561535,
"learning_rate": 1.9964759432387626e-05,
"loss": 1.4542,
"step": 315
},
{
"epoch": 0.12456270017245627,
"grad_norm": 0.8538438066807553,
"learning_rate": 1.9963595240521158e-05,
"loss": 1.4219,
"step": 316
},
{
"epoch": 0.1249568859324957,
"grad_norm": 0.8583935860681176,
"learning_rate": 1.9962412165068575e-05,
"loss": 1.3834,
"step": 317
},
{
"epoch": 0.12535107169253512,
"grad_norm": 0.9046850234763439,
"learning_rate": 1.996121020827211e-05,
"loss": 1.4378,
"step": 318
},
{
"epoch": 0.12574525745257453,
"grad_norm": 0.8757680720234807,
"learning_rate": 1.9959989372409777e-05,
"loss": 1.4239,
"step": 319
},
{
"epoch": 0.12613944321261394,
"grad_norm": 1.1494791062386092,
"learning_rate": 1.9958749659795382e-05,
"loss": 1.407,
"step": 320
},
{
"epoch": 0.12653362897265336,
"grad_norm": 0.8689927196254672,
"learning_rate": 1.99574910727785e-05,
"loss": 1.3873,
"step": 321
},
{
"epoch": 0.1269278147326928,
"grad_norm": 0.8754813889657387,
"learning_rate": 1.995621361374447e-05,
"loss": 1.522,
"step": 322
},
{
"epoch": 0.1273220004927322,
"grad_norm": 0.8486986093611717,
"learning_rate": 1.9954917285114418e-05,
"loss": 1.3494,
"step": 323
},
{
"epoch": 0.12771618625277162,
"grad_norm": 0.9722206329399001,
"learning_rate": 1.9953602089345215e-05,
"loss": 1.4088,
"step": 324
},
{
"epoch": 0.12811037201281103,
"grad_norm": 0.8967214452714534,
"learning_rate": 1.9952268028929497e-05,
"loss": 1.4024,
"step": 325
},
{
"epoch": 0.12850455777285047,
"grad_norm": 0.964703154180979,
"learning_rate": 1.995091510639566e-05,
"loss": 1.4126,
"step": 326
},
{
"epoch": 0.12889874353288988,
"grad_norm": 0.9392746691898846,
"learning_rate": 1.9949543324307828e-05,
"loss": 1.405,
"step": 327
},
{
"epoch": 0.1292929292929293,
"grad_norm": 0.7628618547760365,
"learning_rate": 1.9948152685265896e-05,
"loss": 1.3899,
"step": 328
},
{
"epoch": 0.1296871150529687,
"grad_norm": 0.8699311844515389,
"learning_rate": 1.9946743191905473e-05,
"loss": 1.3766,
"step": 329
},
{
"epoch": 0.13008130081300814,
"grad_norm": 0.935450510994964,
"learning_rate": 1.9945314846897922e-05,
"loss": 1.3913,
"step": 330
},
{
"epoch": 0.13047548657304756,
"grad_norm": 0.8529532741122805,
"learning_rate": 1.9943867652950323e-05,
"loss": 1.3947,
"step": 331
},
{
"epoch": 0.13086967233308697,
"grad_norm": 0.9341157491415716,
"learning_rate": 1.9942401612805478e-05,
"loss": 1.4517,
"step": 332
},
{
"epoch": 0.13126385809312638,
"grad_norm": 0.8302844629086936,
"learning_rate": 1.9940916729241918e-05,
"loss": 1.3977,
"step": 333
},
{
"epoch": 0.13165804385316582,
"grad_norm": 0.8260253123890825,
"learning_rate": 1.9939413005073873e-05,
"loss": 1.4048,
"step": 334
},
{
"epoch": 0.13205222961320523,
"grad_norm": 0.8509245010253166,
"learning_rate": 1.9937890443151294e-05,
"loss": 1.3836,
"step": 335
},
{
"epoch": 0.13244641537324464,
"grad_norm": 0.9759926385519552,
"learning_rate": 1.9936349046359833e-05,
"loss": 1.4606,
"step": 336
},
{
"epoch": 0.13284060113328405,
"grad_norm": 0.8472765912232332,
"learning_rate": 1.9934788817620827e-05,
"loss": 1.3585,
"step": 337
},
{
"epoch": 0.1332347868933235,
"grad_norm": 0.8448284766692432,
"learning_rate": 1.9933209759891318e-05,
"loss": 1.3559,
"step": 338
},
{
"epoch": 0.1336289726533629,
"grad_norm": 0.8980105866822069,
"learning_rate": 1.9931611876164024e-05,
"loss": 1.3884,
"step": 339
},
{
"epoch": 0.13402315841340232,
"grad_norm": 0.8035875577985496,
"learning_rate": 1.9929995169467346e-05,
"loss": 1.4183,
"step": 340
},
{
"epoch": 0.13441734417344173,
"grad_norm": 0.8436688045262849,
"learning_rate": 1.992835964286537e-05,
"loss": 1.3847,
"step": 341
},
{
"epoch": 0.13481152993348117,
"grad_norm": 0.9086794949433027,
"learning_rate": 1.992670529945783e-05,
"loss": 1.454,
"step": 342
},
{
"epoch": 0.13520571569352058,
"grad_norm": 0.8037193631752932,
"learning_rate": 1.9925032142380144e-05,
"loss": 1.4566,
"step": 343
},
{
"epoch": 0.13559990145356,
"grad_norm": 0.9238628826502602,
"learning_rate": 1.992334017480337e-05,
"loss": 1.4551,
"step": 344
},
{
"epoch": 0.1359940872135994,
"grad_norm": 0.8954578526881097,
"learning_rate": 1.9921629399934224e-05,
"loss": 1.3993,
"step": 345
},
{
"epoch": 0.13638827297363884,
"grad_norm": 0.8298423164388818,
"learning_rate": 1.9919899821015066e-05,
"loss": 1.4251,
"step": 346
},
{
"epoch": 0.13678245873367825,
"grad_norm": 0.9558363388772838,
"learning_rate": 1.99181514413239e-05,
"loss": 1.4025,
"step": 347
},
{
"epoch": 0.13717664449371766,
"grad_norm": 0.8459196123850001,
"learning_rate": 1.9916384264174354e-05,
"loss": 1.3976,
"step": 348
},
{
"epoch": 0.13757083025375708,
"grad_norm": 0.9082414240992348,
"learning_rate": 1.9914598292915684e-05,
"loss": 1.4128,
"step": 349
},
{
"epoch": 0.13796501601379652,
"grad_norm": 0.8807624601189884,
"learning_rate": 1.9912793530932765e-05,
"loss": 1.4642,
"step": 350
},
{
"epoch": 0.13835920177383593,
"grad_norm": 0.8479509653794212,
"learning_rate": 1.991096998164609e-05,
"loss": 1.4292,
"step": 351
},
{
"epoch": 0.13875338753387534,
"grad_norm": 0.8571495642628604,
"learning_rate": 1.9909127648511758e-05,
"loss": 1.4185,
"step": 352
},
{
"epoch": 0.13914757329391475,
"grad_norm": 0.8394513200646011,
"learning_rate": 1.9907266535021465e-05,
"loss": 1.3907,
"step": 353
},
{
"epoch": 0.1395417590539542,
"grad_norm": 0.8719559245356892,
"learning_rate": 1.9905386644702495e-05,
"loss": 1.4522,
"step": 354
},
{
"epoch": 0.1399359448139936,
"grad_norm": 0.8304933398455792,
"learning_rate": 1.9903487981117732e-05,
"loss": 1.37,
"step": 355
},
{
"epoch": 0.140330130574033,
"grad_norm": 1.0554645194699375,
"learning_rate": 1.990157054786563e-05,
"loss": 1.3502,
"step": 356
},
{
"epoch": 0.14072431633407242,
"grad_norm": 0.7811763156565412,
"learning_rate": 1.9899634348580226e-05,
"loss": 1.3615,
"step": 357
},
{
"epoch": 0.14111850209411186,
"grad_norm": 0.941990212474433,
"learning_rate": 1.9897679386931115e-05,
"loss": 1.3639,
"step": 358
},
{
"epoch": 0.14151268785415128,
"grad_norm": 0.814954847959052,
"learning_rate": 1.989570566662345e-05,
"loss": 1.3888,
"step": 359
},
{
"epoch": 0.1419068736141907,
"grad_norm": 0.8608043228373365,
"learning_rate": 1.9893713191397944e-05,
"loss": 1.3935,
"step": 360
},
{
"epoch": 0.1423010593742301,
"grad_norm": 0.890892455025287,
"learning_rate": 1.9891701965030855e-05,
"loss": 1.4008,
"step": 361
},
{
"epoch": 0.14269524513426954,
"grad_norm": 0.8356857849278824,
"learning_rate": 1.9889671991333976e-05,
"loss": 1.4298,
"step": 362
},
{
"epoch": 0.14308943089430895,
"grad_norm": 0.9106567824779971,
"learning_rate": 1.9887623274154623e-05,
"loss": 1.3618,
"step": 363
},
{
"epoch": 0.14348361665434836,
"grad_norm": 0.9437928820477995,
"learning_rate": 1.9885555817375656e-05,
"loss": 1.4348,
"step": 364
},
{
"epoch": 0.14387780241438777,
"grad_norm": 0.8738867727854848,
"learning_rate": 1.988346962491543e-05,
"loss": 1.4119,
"step": 365
},
{
"epoch": 0.1442719881744272,
"grad_norm": 0.8544123455118898,
"learning_rate": 1.9881364700727827e-05,
"loss": 1.3921,
"step": 366
},
{
"epoch": 0.14466617393446662,
"grad_norm": 0.8937019344654401,
"learning_rate": 1.9879241048802213e-05,
"loss": 1.3936,
"step": 367
},
{
"epoch": 0.14506035969450604,
"grad_norm": 0.8284420958345725,
"learning_rate": 1.987709867316346e-05,
"loss": 1.4026,
"step": 368
},
{
"epoch": 0.14545454545454545,
"grad_norm": 0.989819294325302,
"learning_rate": 1.9874937577871928e-05,
"loss": 1.389,
"step": 369
},
{
"epoch": 0.1458487312145849,
"grad_norm": 0.7893349138684312,
"learning_rate": 1.9872757767023445e-05,
"loss": 1.3721,
"step": 370
},
{
"epoch": 0.1462429169746243,
"grad_norm": 0.7968967018164466,
"learning_rate": 1.9870559244749317e-05,
"loss": 1.4324,
"step": 371
},
{
"epoch": 0.1466371027346637,
"grad_norm": 0.8953034923734662,
"learning_rate": 1.9868342015216312e-05,
"loss": 1.466,
"step": 372
},
{
"epoch": 0.14703128849470312,
"grad_norm": 0.8501443759421378,
"learning_rate": 1.986610608262665e-05,
"loss": 1.3055,
"step": 373
},
{
"epoch": 0.14742547425474256,
"grad_norm": 0.8315201315122736,
"learning_rate": 1.9863851451218006e-05,
"loss": 1.3872,
"step": 374
},
{
"epoch": 0.14781966001478197,
"grad_norm": 0.8236250547602466,
"learning_rate": 1.9861578125263484e-05,
"loss": 1.3778,
"step": 375
},
{
"epoch": 0.14821384577482138,
"grad_norm": 0.8467290646865842,
"learning_rate": 1.9859286109071626e-05,
"loss": 1.3848,
"step": 376
},
{
"epoch": 0.1486080315348608,
"grad_norm": 0.8755206588442915,
"learning_rate": 1.98569754069864e-05,
"loss": 1.4124,
"step": 377
},
{
"epoch": 0.14900221729490024,
"grad_norm": 0.8238920848534587,
"learning_rate": 1.9854646023387173e-05,
"loss": 1.3724,
"step": 378
},
{
"epoch": 0.14939640305493965,
"grad_norm": 0.8349137252265575,
"learning_rate": 1.985229796268873e-05,
"loss": 1.3722,
"step": 379
},
{
"epoch": 0.14979058881497906,
"grad_norm": 0.8217741172908753,
"learning_rate": 1.9849931229341258e-05,
"loss": 1.4549,
"step": 380
},
{
"epoch": 0.15018477457501847,
"grad_norm": 0.9356658298644844,
"learning_rate": 1.9847545827830327e-05,
"loss": 1.3605,
"step": 381
},
{
"epoch": 0.1505789603350579,
"grad_norm": 0.8507506609004069,
"learning_rate": 1.9845141762676885e-05,
"loss": 1.3447,
"step": 382
},
{
"epoch": 0.15097314609509732,
"grad_norm": 0.8752380208196286,
"learning_rate": 1.984271903843726e-05,
"loss": 1.4148,
"step": 383
},
{
"epoch": 0.15136733185513673,
"grad_norm": 0.9244928793694986,
"learning_rate": 1.9840277659703138e-05,
"loss": 1.4949,
"step": 384
},
{
"epoch": 0.15176151761517614,
"grad_norm": 0.7660534270592588,
"learning_rate": 1.983781763110156e-05,
"loss": 1.345,
"step": 385
},
{
"epoch": 0.15215570337521558,
"grad_norm": 0.84775600235801,
"learning_rate": 1.983533895729492e-05,
"loss": 1.4457,
"step": 386
},
{
"epoch": 0.152549889135255,
"grad_norm": 0.823703175205359,
"learning_rate": 1.9832841642980948e-05,
"loss": 1.4155,
"step": 387
},
{
"epoch": 0.1529440748952944,
"grad_norm": 0.779646685693002,
"learning_rate": 1.983032569289269e-05,
"loss": 1.459,
"step": 388
},
{
"epoch": 0.15333826065533382,
"grad_norm": 0.8240076846457852,
"learning_rate": 1.9827791111798526e-05,
"loss": 1.3924,
"step": 389
},
{
"epoch": 0.15373244641537326,
"grad_norm": 0.8625913690976503,
"learning_rate": 1.9825237904502143e-05,
"loss": 1.3492,
"step": 390
},
{
"epoch": 0.15412663217541267,
"grad_norm": 0.8365353230811579,
"learning_rate": 1.9822666075842527e-05,
"loss": 1.4228,
"step": 391
},
{
"epoch": 0.15452081793545208,
"grad_norm": 0.8259908671120344,
"learning_rate": 1.9820075630693955e-05,
"loss": 1.4015,
"step": 392
},
{
"epoch": 0.1549150036954915,
"grad_norm": 0.8637531603835769,
"learning_rate": 1.9817466573965996e-05,
"loss": 1.4159,
"step": 393
},
{
"epoch": 0.15530918945553093,
"grad_norm": 0.7939363512701786,
"learning_rate": 1.981483891060348e-05,
"loss": 1.304,
"step": 394
},
{
"epoch": 0.15570337521557034,
"grad_norm": 0.8866031449788612,
"learning_rate": 1.981219264558651e-05,
"loss": 1.3626,
"step": 395
},
{
"epoch": 0.15609756097560976,
"grad_norm": 0.8228072983791562,
"learning_rate": 1.9809527783930444e-05,
"loss": 1.3833,
"step": 396
},
{
"epoch": 0.15649174673564917,
"grad_norm": 0.7978736951343444,
"learning_rate": 1.980684433068588e-05,
"loss": 1.3489,
"step": 397
},
{
"epoch": 0.1568859324956886,
"grad_norm": 0.8786273761217978,
"learning_rate": 1.9804142290938654e-05,
"loss": 1.3743,
"step": 398
},
{
"epoch": 0.15728011825572802,
"grad_norm": 0.86249011323067,
"learning_rate": 1.9801421669809833e-05,
"loss": 1.3764,
"step": 399
},
{
"epoch": 0.15767430401576743,
"grad_norm": 0.8732648413397713,
"learning_rate": 1.9798682472455694e-05,
"loss": 1.4046,
"step": 400
},
{
"epoch": 0.15806848977580684,
"grad_norm": 0.8151084661992906,
"learning_rate": 1.979592470406772e-05,
"loss": 1.368,
"step": 401
},
{
"epoch": 0.15846267553584628,
"grad_norm": 0.9192834088778115,
"learning_rate": 1.97931483698726e-05,
"loss": 1.4211,
"step": 402
},
{
"epoch": 0.1588568612958857,
"grad_norm": 0.8163024312946099,
"learning_rate": 1.9790353475132206e-05,
"loss": 1.3405,
"step": 403
},
{
"epoch": 0.1592510470559251,
"grad_norm": 0.8199261685516072,
"learning_rate": 1.9787540025143576e-05,
"loss": 1.4079,
"step": 404
},
{
"epoch": 0.15964523281596452,
"grad_norm": 0.8218955327149928,
"learning_rate": 1.9784708025238935e-05,
"loss": 1.3838,
"step": 405
},
{
"epoch": 0.16003941857600396,
"grad_norm": 0.8208820007455779,
"learning_rate": 1.9781857480785645e-05,
"loss": 1.3688,
"step": 406
},
{
"epoch": 0.16043360433604337,
"grad_norm": 0.8771326041021362,
"learning_rate": 1.977898839718623e-05,
"loss": 1.4101,
"step": 407
},
{
"epoch": 0.16082779009608278,
"grad_norm": 0.7558042393459081,
"learning_rate": 1.9776100779878344e-05,
"loss": 1.425,
"step": 408
},
{
"epoch": 0.1612219758561222,
"grad_norm": 0.8739591869924033,
"learning_rate": 1.9773194634334764e-05,
"loss": 1.379,
"step": 409
},
{
"epoch": 0.16161616161616163,
"grad_norm": 0.7847266820417704,
"learning_rate": 1.977026996606339e-05,
"loss": 1.3367,
"step": 410
},
{
"epoch": 0.16201034737620104,
"grad_norm": 0.8477635650808805,
"learning_rate": 1.9767326780607218e-05,
"loss": 1.3511,
"step": 411
},
{
"epoch": 0.16240453313624045,
"grad_norm": 0.8632845728066261,
"learning_rate": 1.976436508354435e-05,
"loss": 1.3313,
"step": 412
},
{
"epoch": 0.16279871889627986,
"grad_norm": 0.7873959773662924,
"learning_rate": 1.9761384880487967e-05,
"loss": 1.3409,
"step": 413
},
{
"epoch": 0.1631929046563193,
"grad_norm": 0.818419644861465,
"learning_rate": 1.9758386177086324e-05,
"loss": 1.4273,
"step": 414
},
{
"epoch": 0.16358709041635872,
"grad_norm": 0.8843790656491963,
"learning_rate": 1.9755368979022734e-05,
"loss": 1.4058,
"step": 415
},
{
"epoch": 0.16398127617639813,
"grad_norm": 0.8545938358336401,
"learning_rate": 1.9752333292015565e-05,
"loss": 1.4021,
"step": 416
},
{
"epoch": 0.16437546193643754,
"grad_norm": 0.9263197519347521,
"learning_rate": 1.9749279121818235e-05,
"loss": 1.3893,
"step": 417
},
{
"epoch": 0.16476964769647698,
"grad_norm": 0.7667419924633587,
"learning_rate": 1.9746206474219182e-05,
"loss": 1.3335,
"step": 418
},
{
"epoch": 0.1651638334565164,
"grad_norm": 0.8481486595457164,
"learning_rate": 1.9743115355041868e-05,
"loss": 1.3288,
"step": 419
},
{
"epoch": 0.1655580192165558,
"grad_norm": 0.7727894220848658,
"learning_rate": 1.9740005770144762e-05,
"loss": 1.333,
"step": 420
},
{
"epoch": 0.1659522049765952,
"grad_norm": 0.8607077475883066,
"learning_rate": 1.9736877725421325e-05,
"loss": 1.4611,
"step": 421
},
{
"epoch": 0.16634639073663465,
"grad_norm": 0.7998454699496479,
"learning_rate": 1.9733731226800016e-05,
"loss": 1.3622,
"step": 422
},
{
"epoch": 0.16674057649667406,
"grad_norm": 0.7314193043164695,
"learning_rate": 1.9730566280244256e-05,
"loss": 1.3375,
"step": 423
},
{
"epoch": 0.16713476225671348,
"grad_norm": 0.777752765207413,
"learning_rate": 1.9727382891752446e-05,
"loss": 1.38,
"step": 424
},
{
"epoch": 0.1675289480167529,
"grad_norm": 0.8338395199460101,
"learning_rate": 1.9724181067357918e-05,
"loss": 1.3022,
"step": 425
},
{
"epoch": 0.16792313377679233,
"grad_norm": 0.8380585348678756,
"learning_rate": 1.9720960813128966e-05,
"loss": 1.3745,
"step": 426
},
{
"epoch": 0.16831731953683174,
"grad_norm": 0.8412709090344273,
"learning_rate": 1.9717722135168796e-05,
"loss": 1.3487,
"step": 427
},
{
"epoch": 0.16871150529687115,
"grad_norm": 0.8188807655558134,
"learning_rate": 1.9714465039615545e-05,
"loss": 1.4046,
"step": 428
},
{
"epoch": 0.16910569105691056,
"grad_norm": 0.7873789728209534,
"learning_rate": 1.9711189532642244e-05,
"loss": 1.3695,
"step": 429
},
{
"epoch": 0.16949987681695,
"grad_norm": 0.8380079010888628,
"learning_rate": 1.9707895620456832e-05,
"loss": 1.4121,
"step": 430
},
{
"epoch": 0.1698940625769894,
"grad_norm": 0.7464093486132232,
"learning_rate": 1.9704583309302115e-05,
"loss": 1.3383,
"step": 431
},
{
"epoch": 0.17028824833702882,
"grad_norm": 0.7745574128518233,
"learning_rate": 1.970125260545579e-05,
"loss": 1.4293,
"step": 432
},
{
"epoch": 0.17068243409706824,
"grad_norm": 0.7923250648359519,
"learning_rate": 1.9697903515230387e-05,
"loss": 1.3816,
"step": 433
},
{
"epoch": 0.17107661985710768,
"grad_norm": 0.7828760994144639,
"learning_rate": 1.9694536044973303e-05,
"loss": 1.3682,
"step": 434
},
{
"epoch": 0.1714708056171471,
"grad_norm": 0.7535267581618733,
"learning_rate": 1.9691150201066765e-05,
"loss": 1.4415,
"step": 435
},
{
"epoch": 0.1718649913771865,
"grad_norm": 0.7719938628460055,
"learning_rate": 1.9687745989927823e-05,
"loss": 1.3261,
"step": 436
},
{
"epoch": 0.1722591771372259,
"grad_norm": 0.7985396893057591,
"learning_rate": 1.968432341800833e-05,
"loss": 1.3384,
"step": 437
},
{
"epoch": 0.17265336289726535,
"grad_norm": 0.7864913353035174,
"learning_rate": 1.9680882491794953e-05,
"loss": 1.4198,
"step": 438
},
{
"epoch": 0.17304754865730476,
"grad_norm": 0.7652857695438825,
"learning_rate": 1.9677423217809127e-05,
"loss": 1.4451,
"step": 439
},
{
"epoch": 0.17344173441734417,
"grad_norm": 0.7779886907598241,
"learning_rate": 1.9673945602607073e-05,
"loss": 1.445,
"step": 440
},
{
"epoch": 0.17383592017738358,
"grad_norm": 0.7526833753446838,
"learning_rate": 1.967044965277977e-05,
"loss": 1.3715,
"step": 441
},
{
"epoch": 0.17423010593742302,
"grad_norm": 0.7613651093452684,
"learning_rate": 1.9666935374952946e-05,
"loss": 1.3418,
"step": 442
},
{
"epoch": 0.17462429169746244,
"grad_norm": 0.7407113533991782,
"learning_rate": 1.9663402775787066e-05,
"loss": 1.3176,
"step": 443
},
{
"epoch": 0.17501847745750185,
"grad_norm": 0.8511077778073948,
"learning_rate": 1.9659851861977316e-05,
"loss": 1.3712,
"step": 444
},
{
"epoch": 0.17541266321754126,
"grad_norm": 0.7637296441923789,
"learning_rate": 1.965628264025359e-05,
"loss": 1.3138,
"step": 445
},
{
"epoch": 0.1758068489775807,
"grad_norm": 0.7688575868311163,
"learning_rate": 1.9652695117380496e-05,
"loss": 1.3478,
"step": 446
},
{
"epoch": 0.1762010347376201,
"grad_norm": 0.8112254863467798,
"learning_rate": 1.9649089300157307e-05,
"loss": 1.3199,
"step": 447
},
{
"epoch": 0.17659522049765952,
"grad_norm": 0.7773958932143377,
"learning_rate": 1.9645465195417986e-05,
"loss": 1.3729,
"step": 448
},
{
"epoch": 0.17698940625769893,
"grad_norm": 0.7925758880473086,
"learning_rate": 1.9641822810031135e-05,
"loss": 1.3545,
"step": 449
},
{
"epoch": 0.17738359201773837,
"grad_norm": 0.7629015638547695,
"learning_rate": 1.9638162150900028e-05,
"loss": 1.3425,
"step": 450
},
{
"epoch": 0.17777777777777778,
"grad_norm": 0.7832983576510374,
"learning_rate": 1.9634483224962555e-05,
"loss": 1.3347,
"step": 451
},
{
"epoch": 0.1781719635378172,
"grad_norm": 0.8341313973861934,
"learning_rate": 1.963078603919123e-05,
"loss": 1.3995,
"step": 452
},
{
"epoch": 0.1785661492978566,
"grad_norm": 0.7778224652767618,
"learning_rate": 1.9627070600593172e-05,
"loss": 1.2996,
"step": 453
},
{
"epoch": 0.17896033505789605,
"grad_norm": 0.8243076810986155,
"learning_rate": 1.96233369162101e-05,
"loss": 1.3893,
"step": 454
},
{
"epoch": 0.17935452081793546,
"grad_norm": 0.8654955959896804,
"learning_rate": 1.9619584993118308e-05,
"loss": 1.3232,
"step": 455
},
{
"epoch": 0.17974870657797487,
"grad_norm": 0.804527846282048,
"learning_rate": 1.9615814838428662e-05,
"loss": 1.3656,
"step": 456
},
{
"epoch": 0.18014289233801428,
"grad_norm": 0.7962448753036495,
"learning_rate": 1.961202645928658e-05,
"loss": 1.3637,
"step": 457
},
{
"epoch": 0.18053707809805372,
"grad_norm": 0.8354245092920538,
"learning_rate": 1.960821986287201e-05,
"loss": 1.3867,
"step": 458
},
{
"epoch": 0.18093126385809313,
"grad_norm": 0.8345477417237376,
"learning_rate": 1.960439505639945e-05,
"loss": 1.3931,
"step": 459
},
{
"epoch": 0.18132544961813254,
"grad_norm": 0.9026625490600573,
"learning_rate": 1.9600552047117883e-05,
"loss": 1.3355,
"step": 460
},
{
"epoch": 0.18171963537817196,
"grad_norm": 0.7381101689953861,
"learning_rate": 1.9596690842310807e-05,
"loss": 1.3469,
"step": 461
},
{
"epoch": 0.1821138211382114,
"grad_norm": 0.8146270963359201,
"learning_rate": 1.9592811449296206e-05,
"loss": 1.3754,
"step": 462
},
{
"epoch": 0.1825080068982508,
"grad_norm": 0.7583095033222406,
"learning_rate": 1.9588913875426532e-05,
"loss": 1.3674,
"step": 463
},
{
"epoch": 0.18290219265829022,
"grad_norm": 0.7547653358304839,
"learning_rate": 1.9584998128088686e-05,
"loss": 1.3402,
"step": 464
},
{
"epoch": 0.18329637841832963,
"grad_norm": 0.8068714500814903,
"learning_rate": 1.958106421470403e-05,
"loss": 1.3792,
"step": 465
},
{
"epoch": 0.18369056417836907,
"grad_norm": 0.7623764190926223,
"learning_rate": 1.957711214272834e-05,
"loss": 1.3683,
"step": 466
},
{
"epoch": 0.18408474993840848,
"grad_norm": 0.7327762464326012,
"learning_rate": 1.957314191965182e-05,
"loss": 1.3321,
"step": 467
},
{
"epoch": 0.1844789356984479,
"grad_norm": 0.8050214138929509,
"learning_rate": 1.9569153552999057e-05,
"loss": 1.4045,
"step": 468
},
{
"epoch": 0.1848731214584873,
"grad_norm": 0.7931062968671917,
"learning_rate": 1.9565147050329046e-05,
"loss": 1.3676,
"step": 469
},
{
"epoch": 0.18526730721852674,
"grad_norm": 0.7329041782778525,
"learning_rate": 1.9561122419235137e-05,
"loss": 1.3468,
"step": 470
},
{
"epoch": 0.18566149297856616,
"grad_norm": 0.7706739838708203,
"learning_rate": 1.955707966734505e-05,
"loss": 1.3456,
"step": 471
},
{
"epoch": 0.18605567873860557,
"grad_norm": 0.7721590455864087,
"learning_rate": 1.9553018802320843e-05,
"loss": 1.383,
"step": 472
},
{
"epoch": 0.18644986449864498,
"grad_norm": 0.7426283570331748,
"learning_rate": 1.95489398318589e-05,
"loss": 1.3125,
"step": 473
},
{
"epoch": 0.18684405025868442,
"grad_norm": 2.063311743166772,
"learning_rate": 1.9544842763689928e-05,
"loss": 1.4202,
"step": 474
},
{
"epoch": 0.18723823601872383,
"grad_norm": 0.7311089489840802,
"learning_rate": 1.954072760557893e-05,
"loss": 1.2622,
"step": 475
},
{
"epoch": 0.18763242177876324,
"grad_norm": 0.781806989985732,
"learning_rate": 1.953659436532519e-05,
"loss": 1.3805,
"step": 476
},
{
"epoch": 0.18802660753880265,
"grad_norm": 0.8019278871709516,
"learning_rate": 1.9532443050762265e-05,
"loss": 1.3006,
"step": 477
},
{
"epoch": 0.1884207932988421,
"grad_norm": 0.7493676971003281,
"learning_rate": 1.9528273669757974e-05,
"loss": 1.2912,
"step": 478
},
{
"epoch": 0.1888149790588815,
"grad_norm": 0.8268984543433072,
"learning_rate": 1.9524086230214366e-05,
"loss": 1.3565,
"step": 479
},
{
"epoch": 0.18920916481892092,
"grad_norm": 0.7801443400096512,
"learning_rate": 1.951988074006772e-05,
"loss": 1.371,
"step": 480
},
{
"epoch": 0.18960335057896033,
"grad_norm": 0.7539695626008661,
"learning_rate": 1.9515657207288528e-05,
"loss": 1.3721,
"step": 481
},
{
"epoch": 0.18999753633899977,
"grad_norm": 0.7703572570935576,
"learning_rate": 1.9511415639881474e-05,
"loss": 1.4442,
"step": 482
},
{
"epoch": 0.19039172209903918,
"grad_norm": 0.7742745558792156,
"learning_rate": 1.9507156045885423e-05,
"loss": 1.2905,
"step": 483
},
{
"epoch": 0.1907859078590786,
"grad_norm": 0.7359869825956976,
"learning_rate": 1.950287843337341e-05,
"loss": 1.3254,
"step": 484
},
{
"epoch": 0.191180093619118,
"grad_norm": 0.7544568408416208,
"learning_rate": 1.9498582810452607e-05,
"loss": 1.3154,
"step": 485
},
{
"epoch": 0.19157427937915744,
"grad_norm": 0.7769753768513467,
"learning_rate": 1.949426918526434e-05,
"loss": 1.3628,
"step": 486
},
{
"epoch": 0.19196846513919685,
"grad_norm": 0.7834189136520097,
"learning_rate": 1.9489937565984033e-05,
"loss": 1.3554,
"step": 487
},
{
"epoch": 0.19236265089923626,
"grad_norm": 0.7796538796113698,
"learning_rate": 1.948558796082123e-05,
"loss": 1.2925,
"step": 488
},
{
"epoch": 0.19275683665927568,
"grad_norm": 1.0372440968179562,
"learning_rate": 1.9481220378019553e-05,
"loss": 1.309,
"step": 489
},
{
"epoch": 0.19315102241931512,
"grad_norm": 0.727717117732363,
"learning_rate": 1.9476834825856696e-05,
"loss": 1.353,
"step": 490
},
{
"epoch": 0.19354520817935453,
"grad_norm": 0.7330989067981496,
"learning_rate": 1.947243131264442e-05,
"loss": 1.3326,
"step": 491
},
{
"epoch": 0.19393939393939394,
"grad_norm": 0.8625663326931535,
"learning_rate": 1.9468009846728515e-05,
"loss": 1.3795,
"step": 492
},
{
"epoch": 0.19433357969943335,
"grad_norm": 0.7442872681943762,
"learning_rate": 1.9463570436488803e-05,
"loss": 1.3343,
"step": 493
},
{
"epoch": 0.1947277654594728,
"grad_norm": 0.7892831285816906,
"learning_rate": 1.9459113090339107e-05,
"loss": 1.4112,
"step": 494
},
{
"epoch": 0.1951219512195122,
"grad_norm": 0.7915084905242407,
"learning_rate": 1.945463781672726e-05,
"loss": 1.3867,
"step": 495
},
{
"epoch": 0.1955161369795516,
"grad_norm": 0.7558768011341099,
"learning_rate": 1.945014462413505e-05,
"loss": 1.2735,
"step": 496
},
{
"epoch": 0.19591032273959103,
"grad_norm": 0.7918551795385935,
"learning_rate": 1.9445633521078246e-05,
"loss": 1.366,
"step": 497
},
{
"epoch": 0.19630450849963046,
"grad_norm": 0.7632462761447605,
"learning_rate": 1.944110451610655e-05,
"loss": 1.2919,
"step": 498
},
{
"epoch": 0.19669869425966988,
"grad_norm": 0.8619242283408518,
"learning_rate": 1.9436557617803594e-05,
"loss": 1.3433,
"step": 499
},
{
"epoch": 0.1970928800197093,
"grad_norm": 0.7486074296088833,
"learning_rate": 1.943199283478693e-05,
"loss": 1.3718,
"step": 500
},
{
"epoch": 0.1974870657797487,
"grad_norm": 0.7844981757900801,
"learning_rate": 1.9427410175707993e-05,
"loss": 1.3615,
"step": 501
},
{
"epoch": 0.19788125153978814,
"grad_norm": 0.7861270837445861,
"learning_rate": 1.942280964925211e-05,
"loss": 1.4269,
"step": 502
},
{
"epoch": 0.19827543729982755,
"grad_norm": 0.7771387444238573,
"learning_rate": 1.9418191264138468e-05,
"loss": 1.3861,
"step": 503
},
{
"epoch": 0.19866962305986696,
"grad_norm": 0.7840229669644916,
"learning_rate": 1.94135550291201e-05,
"loss": 1.3508,
"step": 504
},
{
"epoch": 0.19906380881990637,
"grad_norm": 0.7578091088675099,
"learning_rate": 1.940890095298386e-05,
"loss": 1.3,
"step": 505
},
{
"epoch": 0.1994579945799458,
"grad_norm": 0.7955186622031103,
"learning_rate": 1.9404229044550432e-05,
"loss": 1.3877,
"step": 506
},
{
"epoch": 0.19985218033998522,
"grad_norm": 0.7600697521641491,
"learning_rate": 1.939953931267429e-05,
"loss": 1.3083,
"step": 507
},
{
"epoch": 0.20024636610002464,
"grad_norm": 0.7997760910789501,
"learning_rate": 1.9394831766243688e-05,
"loss": 1.3574,
"step": 508
},
{
"epoch": 0.20064055186006405,
"grad_norm": 0.8324601470930124,
"learning_rate": 1.9390106414180635e-05,
"loss": 1.3314,
"step": 509
},
{
"epoch": 0.2010347376201035,
"grad_norm": 0.7986181347574611,
"learning_rate": 1.9385363265440896e-05,
"loss": 1.3701,
"step": 510
},
{
"epoch": 0.2014289233801429,
"grad_norm": 0.8390387581661004,
"learning_rate": 1.9380602329013967e-05,
"loss": 1.3278,
"step": 511
},
{
"epoch": 0.2018231091401823,
"grad_norm": 0.7756267484264265,
"learning_rate": 1.937582361392305e-05,
"loss": 1.2902,
"step": 512
},
{
"epoch": 0.20221729490022172,
"grad_norm": 0.8280742083628098,
"learning_rate": 1.9371027129225042e-05,
"loss": 1.3954,
"step": 513
},
{
"epoch": 0.20261148066026116,
"grad_norm": 0.7557033928381056,
"learning_rate": 1.9366212884010523e-05,
"loss": 1.3245,
"step": 514
},
{
"epoch": 0.20300566642030057,
"grad_norm": 0.7339490880913666,
"learning_rate": 1.9361380887403726e-05,
"loss": 1.3314,
"step": 515
},
{
"epoch": 0.20339985218033999,
"grad_norm": 0.759110598024447,
"learning_rate": 1.935653114856254e-05,
"loss": 1.3075,
"step": 516
},
{
"epoch": 0.2037940379403794,
"grad_norm": 0.7330136521742119,
"learning_rate": 1.9351663676678465e-05,
"loss": 1.3105,
"step": 517
},
{
"epoch": 0.20418822370041884,
"grad_norm": 0.8396501916315762,
"learning_rate": 1.9346778480976626e-05,
"loss": 1.3555,
"step": 518
},
{
"epoch": 0.20458240946045825,
"grad_norm": 0.7833213499224854,
"learning_rate": 1.9341875570715723e-05,
"loss": 1.393,
"step": 519
},
{
"epoch": 0.20497659522049766,
"grad_norm": 0.788388912099959,
"learning_rate": 1.9336954955188042e-05,
"loss": 1.3548,
"step": 520
},
{
"epoch": 0.20537078098053707,
"grad_norm": 0.7944142250573871,
"learning_rate": 1.9332016643719413e-05,
"loss": 1.3167,
"step": 521
},
{
"epoch": 0.2057649667405765,
"grad_norm": 0.7185170009516036,
"learning_rate": 1.932706064566922e-05,
"loss": 1.2763,
"step": 522
},
{
"epoch": 0.20615915250061592,
"grad_norm": 0.7625422306230389,
"learning_rate": 1.9322086970430355e-05,
"loss": 1.2991,
"step": 523
},
{
"epoch": 0.20655333826065533,
"grad_norm": 0.7528804400146271,
"learning_rate": 1.9317095627429215e-05,
"loss": 1.2744,
"step": 524
},
{
"epoch": 0.20694752402069475,
"grad_norm": 0.7235339004181085,
"learning_rate": 1.931208662612569e-05,
"loss": 1.3023,
"step": 525
},
{
"epoch": 0.20734170978073418,
"grad_norm": 0.7485454145610042,
"learning_rate": 1.930705997601313e-05,
"loss": 1.2737,
"step": 526
},
{
"epoch": 0.2077358955407736,
"grad_norm": 0.7616817297855956,
"learning_rate": 1.9302015686618328e-05,
"loss": 1.3331,
"step": 527
},
{
"epoch": 0.208130081300813,
"grad_norm": 0.7224963273000136,
"learning_rate": 1.929695376750152e-05,
"loss": 1.3113,
"step": 528
},
{
"epoch": 0.20852426706085242,
"grad_norm": 0.7117066935208167,
"learning_rate": 1.9291874228256355e-05,
"loss": 1.3536,
"step": 529
},
{
"epoch": 0.20891845282089186,
"grad_norm": 0.7620668487908003,
"learning_rate": 1.928677707850986e-05,
"loss": 1.3847,
"step": 530
},
{
"epoch": 0.20931263858093127,
"grad_norm": 0.7762645227174237,
"learning_rate": 1.9281662327922458e-05,
"loss": 1.3838,
"step": 531
},
{
"epoch": 0.20970682434097068,
"grad_norm": 0.7486355068094747,
"learning_rate": 1.9276529986187925e-05,
"loss": 1.2929,
"step": 532
},
{
"epoch": 0.2101010101010101,
"grad_norm": 0.7850761598989443,
"learning_rate": 1.9271380063033368e-05,
"loss": 1.3511,
"step": 533
},
{
"epoch": 0.21049519586104953,
"grad_norm": 0.7306901593960397,
"learning_rate": 1.9266212568219223e-05,
"loss": 1.3223,
"step": 534
},
{
"epoch": 0.21088938162108894,
"grad_norm": 0.8035850088778281,
"learning_rate": 1.9261027511539227e-05,
"loss": 1.3615,
"step": 535
},
{
"epoch": 0.21128356738112836,
"grad_norm": 0.7359933674500054,
"learning_rate": 1.9255824902820403e-05,
"loss": 1.3733,
"step": 536
},
{
"epoch": 0.21167775314116777,
"grad_norm": 0.7361755019126336,
"learning_rate": 1.9250604751923035e-05,
"loss": 1.2759,
"step": 537
},
{
"epoch": 0.2120719389012072,
"grad_norm": 0.7731391184456793,
"learning_rate": 1.9245367068740664e-05,
"loss": 1.3493,
"step": 538
},
{
"epoch": 0.21246612466124662,
"grad_norm": 0.7070141898804634,
"learning_rate": 1.9240111863200047e-05,
"loss": 1.3316,
"step": 539
},
{
"epoch": 0.21286031042128603,
"grad_norm": 0.7047293130221922,
"learning_rate": 1.9234839145261154e-05,
"loss": 1.309,
"step": 540
},
{
"epoch": 0.21325449618132544,
"grad_norm": 0.7787357081571815,
"learning_rate": 1.9229548924917146e-05,
"loss": 1.3572,
"step": 541
},
{
"epoch": 0.21364868194136488,
"grad_norm": 0.7390906175625679,
"learning_rate": 1.9224241212194364e-05,
"loss": 1.3855,
"step": 542
},
{
"epoch": 0.2140428677014043,
"grad_norm": 0.7348457458913636,
"learning_rate": 1.9218916017152292e-05,
"loss": 1.3093,
"step": 543
},
{
"epoch": 0.2144370534614437,
"grad_norm": 0.752656550237857,
"learning_rate": 1.9213573349883545e-05,
"loss": 1.4028,
"step": 544
},
{
"epoch": 0.21483123922148312,
"grad_norm": 0.7244840658804366,
"learning_rate": 1.9208213220513866e-05,
"loss": 1.2963,
"step": 545
},
{
"epoch": 0.21522542498152256,
"grad_norm": 0.770992566259173,
"learning_rate": 1.9202835639202075e-05,
"loss": 1.2926,
"step": 546
},
{
"epoch": 0.21561961074156197,
"grad_norm": 0.7643194008638872,
"learning_rate": 1.919744061614008e-05,
"loss": 1.3145,
"step": 547
},
{
"epoch": 0.21601379650160138,
"grad_norm": 0.7366196627549643,
"learning_rate": 1.9192028161552848e-05,
"loss": 1.3536,
"step": 548
},
{
"epoch": 0.2164079822616408,
"grad_norm": 0.6968551530472608,
"learning_rate": 1.9186598285698373e-05,
"loss": 1.3063,
"step": 549
},
{
"epoch": 0.21680216802168023,
"grad_norm": 0.7641280477443396,
"learning_rate": 1.9181150998867674e-05,
"loss": 1.3252,
"step": 550
},
{
"epoch": 0.21719635378171964,
"grad_norm": 0.7864006183375085,
"learning_rate": 1.9175686311384763e-05,
"loss": 1.2925,
"step": 551
},
{
"epoch": 0.21759053954175905,
"grad_norm": 0.7510317585657532,
"learning_rate": 1.917020423360664e-05,
"loss": 1.3147,
"step": 552
},
{
"epoch": 0.21798472530179847,
"grad_norm": 0.759753668019818,
"learning_rate": 1.9164704775923258e-05,
"loss": 1.2949,
"step": 553
},
{
"epoch": 0.2183789110618379,
"grad_norm": 0.7730004582439941,
"learning_rate": 1.9159187948757503e-05,
"loss": 1.2885,
"step": 554
},
{
"epoch": 0.21877309682187732,
"grad_norm": 0.7672020235507695,
"learning_rate": 1.915365376256519e-05,
"loss": 1.3914,
"step": 555
},
{
"epoch": 0.21916728258191673,
"grad_norm": 0.752157061906444,
"learning_rate": 1.9148102227835033e-05,
"loss": 1.3487,
"step": 556
},
{
"epoch": 0.21956146834195614,
"grad_norm": 0.7278798351850428,
"learning_rate": 1.9142533355088628e-05,
"loss": 1.3303,
"step": 557
},
{
"epoch": 0.21995565410199558,
"grad_norm": 0.7104471440585667,
"learning_rate": 1.9136947154880413e-05,
"loss": 1.3193,
"step": 558
},
{
"epoch": 0.220349839862035,
"grad_norm": 0.7800638989095695,
"learning_rate": 1.9131343637797695e-05,
"loss": 1.3536,
"step": 559
},
{
"epoch": 0.2207440256220744,
"grad_norm": 0.7109099389345059,
"learning_rate": 1.9125722814460582e-05,
"loss": 1.2976,
"step": 560
},
{
"epoch": 0.22113821138211381,
"grad_norm": 0.709861315894559,
"learning_rate": 1.912008469552198e-05,
"loss": 1.3534,
"step": 561
},
{
"epoch": 0.22153239714215325,
"grad_norm": 0.7625065746820054,
"learning_rate": 1.9114429291667583e-05,
"loss": 1.3593,
"step": 562
},
{
"epoch": 0.22192658290219267,
"grad_norm": 0.8957024180712038,
"learning_rate": 1.9108756613615846e-05,
"loss": 1.2796,
"step": 563
},
{
"epoch": 0.22232076866223208,
"grad_norm": 0.756013792651535,
"learning_rate": 1.9103066672117957e-05,
"loss": 1.2989,
"step": 564
},
{
"epoch": 0.2227149544222715,
"grad_norm": 0.7162732062615748,
"learning_rate": 1.9097359477957825e-05,
"loss": 1.2601,
"step": 565
},
{
"epoch": 0.22310914018231093,
"grad_norm": 0.7436938571603158,
"learning_rate": 1.9091635041952052e-05,
"loss": 1.3151,
"step": 566
},
{
"epoch": 0.22350332594235034,
"grad_norm": 0.7610549683893325,
"learning_rate": 1.9085893374949926e-05,
"loss": 1.2972,
"step": 567
},
{
"epoch": 0.22389751170238975,
"grad_norm": 0.7558082450692344,
"learning_rate": 1.9080134487833393e-05,
"loss": 1.3793,
"step": 568
},
{
"epoch": 0.22429169746242916,
"grad_norm": 0.7719491717906157,
"learning_rate": 1.9074358391517026e-05,
"loss": 1.3779,
"step": 569
},
{
"epoch": 0.2246858832224686,
"grad_norm": 0.7374690493690355,
"learning_rate": 1.9068565096948017e-05,
"loss": 1.3406,
"step": 570
},
{
"epoch": 0.225080068982508,
"grad_norm": 0.7538369331733002,
"learning_rate": 1.9062754615106162e-05,
"loss": 1.2936,
"step": 571
},
{
"epoch": 0.22547425474254743,
"grad_norm": 0.7296271125635926,
"learning_rate": 1.905692695700382e-05,
"loss": 1.3447,
"step": 572
},
{
"epoch": 0.22586844050258684,
"grad_norm": 0.8084596790033229,
"learning_rate": 1.905108213368591e-05,
"loss": 1.2637,
"step": 573
},
{
"epoch": 0.22626262626262628,
"grad_norm": 0.7557777464040102,
"learning_rate": 1.904522015622988e-05,
"loss": 1.3563,
"step": 574
},
{
"epoch": 0.2266568120226657,
"grad_norm": 0.7483236106401496,
"learning_rate": 1.9039341035745696e-05,
"loss": 1.2815,
"step": 575
},
{
"epoch": 0.2270509977827051,
"grad_norm": 0.8169659004896286,
"learning_rate": 1.9033444783375806e-05,
"loss": 1.2968,
"step": 576
},
{
"epoch": 0.2274451835427445,
"grad_norm": 0.7564345089200964,
"learning_rate": 1.9027531410295128e-05,
"loss": 1.2903,
"step": 577
},
{
"epoch": 0.22783936930278395,
"grad_norm": 0.740064034653702,
"learning_rate": 1.9021600927711037e-05,
"loss": 1.3115,
"step": 578
},
{
"epoch": 0.22823355506282336,
"grad_norm": 0.7536666281291825,
"learning_rate": 1.9015653346863322e-05,
"loss": 1.2815,
"step": 579
},
{
"epoch": 0.22862774082286277,
"grad_norm": 0.7332255399421099,
"learning_rate": 1.900968867902419e-05,
"loss": 1.2896,
"step": 580
},
{
"epoch": 0.22902192658290219,
"grad_norm": 0.7215272966131613,
"learning_rate": 1.9003706935498233e-05,
"loss": 1.3181,
"step": 581
},
{
"epoch": 0.22941611234294162,
"grad_norm": 0.8275893204395051,
"learning_rate": 1.8997708127622384e-05,
"loss": 1.293,
"step": 582
},
{
"epoch": 0.22981029810298104,
"grad_norm": 0.7495958353788804,
"learning_rate": 1.8991692266765947e-05,
"loss": 1.2679,
"step": 583
},
{
"epoch": 0.23020448386302045,
"grad_norm": 0.7772101723875109,
"learning_rate": 1.8985659364330522e-05,
"loss": 1.325,
"step": 584
},
{
"epoch": 0.23059866962305986,
"grad_norm": 0.7489454768012945,
"learning_rate": 1.8979609431750025e-05,
"loss": 1.2757,
"step": 585
},
{
"epoch": 0.2309928553830993,
"grad_norm": 0.7612569479113607,
"learning_rate": 1.8973542480490636e-05,
"loss": 1.3161,
"step": 586
},
{
"epoch": 0.2313870411431387,
"grad_norm": 0.8016105305619344,
"learning_rate": 1.89674585220508e-05,
"loss": 1.3373,
"step": 587
},
{
"epoch": 0.23178122690317812,
"grad_norm": 0.7552521095717978,
"learning_rate": 1.8961357567961182e-05,
"loss": 1.3341,
"step": 588
},
{
"epoch": 0.23217541266321753,
"grad_norm": 0.8077575349160561,
"learning_rate": 1.8955239629784667e-05,
"loss": 1.3828,
"step": 589
},
{
"epoch": 0.23256959842325697,
"grad_norm": 0.7734481164743204,
"learning_rate": 1.8949104719116334e-05,
"loss": 1.2494,
"step": 590
},
{
"epoch": 0.23296378418329639,
"grad_norm": 0.7239243239882402,
"learning_rate": 1.8942952847583417e-05,
"loss": 1.3492,
"step": 591
},
{
"epoch": 0.2333579699433358,
"grad_norm": 0.7392668666857419,
"learning_rate": 1.8936784026845304e-05,
"loss": 1.2988,
"step": 592
},
{
"epoch": 0.2337521557033752,
"grad_norm": 0.737345549169784,
"learning_rate": 1.8930598268593503e-05,
"loss": 1.3593,
"step": 593
},
{
"epoch": 0.23414634146341465,
"grad_norm": 0.7739820026696098,
"learning_rate": 1.8924395584551624e-05,
"loss": 1.2917,
"step": 594
},
{
"epoch": 0.23454052722345406,
"grad_norm": 0.7370299572384036,
"learning_rate": 1.891817598647535e-05,
"loss": 1.3188,
"step": 595
},
{
"epoch": 0.23493471298349347,
"grad_norm": 0.7045735291814132,
"learning_rate": 1.8911939486152433e-05,
"loss": 1.2999,
"step": 596
},
{
"epoch": 0.23532889874353288,
"grad_norm": 0.7318502745854408,
"learning_rate": 1.8905686095402648e-05,
"loss": 1.2973,
"step": 597
},
{
"epoch": 0.23572308450357232,
"grad_norm": 0.6992717345016547,
"learning_rate": 1.8899415826077784e-05,
"loss": 1.2562,
"step": 598
},
{
"epoch": 0.23611727026361173,
"grad_norm": 0.7855449422876546,
"learning_rate": 1.8893128690061625e-05,
"loss": 1.3331,
"step": 599
},
{
"epoch": 0.23651145602365115,
"grad_norm": 0.7330330982965301,
"learning_rate": 1.8886824699269916e-05,
"loss": 1.2719,
"step": 600
},
{
"epoch": 0.23690564178369056,
"grad_norm": 0.7235999574209688,
"learning_rate": 1.888050386565034e-05,
"loss": 1.2848,
"step": 601
},
{
"epoch": 0.23729982754373,
"grad_norm": 0.7259572083243264,
"learning_rate": 1.8874166201182526e-05,
"loss": 1.2901,
"step": 602
},
{
"epoch": 0.2376940133037694,
"grad_norm": 0.738733374260345,
"learning_rate": 1.8867811717877966e-05,
"loss": 1.2949,
"step": 603
},
{
"epoch": 0.23808819906380882,
"grad_norm": 0.7293917944233541,
"learning_rate": 1.886144042778006e-05,
"loss": 1.2738,
"step": 604
},
{
"epoch": 0.23848238482384823,
"grad_norm": 0.7004391383451308,
"learning_rate": 1.885505234296404e-05,
"loss": 1.2703,
"step": 605
},
{
"epoch": 0.23887657058388767,
"grad_norm": 0.7664560785377862,
"learning_rate": 1.884864747553698e-05,
"loss": 1.3647,
"step": 606
},
{
"epoch": 0.23927075634392708,
"grad_norm": 0.8048750538355759,
"learning_rate": 1.8842225837637765e-05,
"loss": 1.4858,
"step": 607
},
{
"epoch": 0.2396649421039665,
"grad_norm": 0.7886892188335735,
"learning_rate": 1.8835787441437043e-05,
"loss": 1.3808,
"step": 608
},
{
"epoch": 0.2400591278640059,
"grad_norm": 0.700691895354596,
"learning_rate": 1.8829332299137245e-05,
"loss": 1.3073,
"step": 609
},
{
"epoch": 0.24045331362404535,
"grad_norm": 0.749597801010302,
"learning_rate": 1.882286042297254e-05,
"loss": 1.3656,
"step": 610
},
{
"epoch": 0.24084749938408476,
"grad_norm": 0.7481923330312744,
"learning_rate": 1.881637182520879e-05,
"loss": 1.3272,
"step": 611
},
{
"epoch": 0.24124168514412417,
"grad_norm": 0.6957757781146582,
"learning_rate": 1.880986651814357e-05,
"loss": 1.2368,
"step": 612
},
{
"epoch": 0.24163587090416358,
"grad_norm": 0.7428959152728734,
"learning_rate": 1.8803344514106123e-05,
"loss": 1.3561,
"step": 613
},
{
"epoch": 0.24203005666420302,
"grad_norm": 0.733482247697521,
"learning_rate": 1.8796805825457324e-05,
"loss": 1.3296,
"step": 614
},
{
"epoch": 0.24242424242424243,
"grad_norm": 0.7941648551428049,
"learning_rate": 1.8790250464589676e-05,
"loss": 1.3018,
"step": 615
},
{
"epoch": 0.24281842818428184,
"grad_norm": 0.7864984021030504,
"learning_rate": 1.8783678443927282e-05,
"loss": 1.3507,
"step": 616
},
{
"epoch": 0.24321261394432125,
"grad_norm": 0.7607319722931054,
"learning_rate": 1.8777089775925822e-05,
"loss": 1.3028,
"step": 617
},
{
"epoch": 0.2436067997043607,
"grad_norm": 0.7531520087715251,
"learning_rate": 1.8770484473072518e-05,
"loss": 1.337,
"step": 618
},
{
"epoch": 0.2440009854644001,
"grad_norm": 0.7227583108021773,
"learning_rate": 1.8763862547886133e-05,
"loss": 1.3006,
"step": 619
},
{
"epoch": 0.24439517122443952,
"grad_norm": 0.7244215425325586,
"learning_rate": 1.8757224012916913e-05,
"loss": 1.3111,
"step": 620
},
{
"epoch": 0.24478935698447893,
"grad_norm": 0.726809176042967,
"learning_rate": 1.8750568880746606e-05,
"loss": 1.2595,
"step": 621
},
{
"epoch": 0.24518354274451837,
"grad_norm": 0.7409190065458727,
"learning_rate": 1.87438971639884e-05,
"loss": 1.2985,
"step": 622
},
{
"epoch": 0.24557772850455778,
"grad_norm": 0.7027463402470976,
"learning_rate": 1.8737208875286933e-05,
"loss": 1.2993,
"step": 623
},
{
"epoch": 0.2459719142645972,
"grad_norm": 0.7354741797652073,
"learning_rate": 1.8730504027318223e-05,
"loss": 1.3101,
"step": 624
},
{
"epoch": 0.2463661000246366,
"grad_norm": 0.7151055215992336,
"learning_rate": 1.87237826327897e-05,
"loss": 1.3016,
"step": 625
},
{
"epoch": 0.24676028578467604,
"grad_norm": 0.7346955837306206,
"learning_rate": 1.871704470444014e-05,
"loss": 1.3026,
"step": 626
},
{
"epoch": 0.24715447154471545,
"grad_norm": 0.7087046803059532,
"learning_rate": 1.8710290255039654e-05,
"loss": 1.3149,
"step": 627
},
{
"epoch": 0.24754865730475487,
"grad_norm": 0.7301865796459245,
"learning_rate": 1.870351929738967e-05,
"loss": 1.2857,
"step": 628
},
{
"epoch": 0.24794284306479428,
"grad_norm": 0.7189028712874932,
"learning_rate": 1.86967318443229e-05,
"loss": 1.3185,
"step": 629
},
{
"epoch": 0.24833702882483372,
"grad_norm": 0.6879300842588244,
"learning_rate": 1.8689927908703325e-05,
"loss": 1.2882,
"step": 630
},
{
"epoch": 0.24873121458487313,
"grad_norm": 0.6980954368807367,
"learning_rate": 1.8683107503426158e-05,
"loss": 1.2522,
"step": 631
},
{
"epoch": 0.24912540034491254,
"grad_norm": 0.7545776954574633,
"learning_rate": 1.8676270641417824e-05,
"loss": 1.322,
"step": 632
},
{
"epoch": 0.24951958610495195,
"grad_norm": 0.7115077185501087,
"learning_rate": 1.8669417335635946e-05,
"loss": 1.2723,
"step": 633
},
{
"epoch": 0.2499137718649914,
"grad_norm": 0.7379949770472353,
"learning_rate": 1.866254759906931e-05,
"loss": 1.4362,
"step": 634
},
{
"epoch": 0.2503079576250308,
"grad_norm": 0.7573308426125499,
"learning_rate": 1.8655661444737835e-05,
"loss": 1.3177,
"step": 635
},
{
"epoch": 0.25070214338507024,
"grad_norm": 0.7257743669215548,
"learning_rate": 1.864875888569257e-05,
"loss": 1.3062,
"step": 636
},
{
"epoch": 0.25109632914510965,
"grad_norm": 0.6940203952508667,
"learning_rate": 1.864183993501564e-05,
"loss": 1.2652,
"step": 637
},
{
"epoch": 0.25149051490514907,
"grad_norm": 0.8172564591114041,
"learning_rate": 1.863490460582025e-05,
"loss": 1.3199,
"step": 638
},
{
"epoch": 0.2518847006651885,
"grad_norm": 0.7226317764207526,
"learning_rate": 1.8627952911250632e-05,
"loss": 1.3106,
"step": 639
},
{
"epoch": 0.2522788864252279,
"grad_norm": 0.7438657902645007,
"learning_rate": 1.8620984864482046e-05,
"loss": 1.2981,
"step": 640
},
{
"epoch": 0.2526730721852673,
"grad_norm": 0.7422399467375352,
"learning_rate": 1.8614000478720743e-05,
"loss": 1.3406,
"step": 641
},
{
"epoch": 0.2530672579453067,
"grad_norm": 0.7811618617681046,
"learning_rate": 1.860699976720393e-05,
"loss": 1.3105,
"step": 642
},
{
"epoch": 0.2534614437053461,
"grad_norm": 0.7398963519463426,
"learning_rate": 1.8599982743199775e-05,
"loss": 1.3194,
"step": 643
},
{
"epoch": 0.2538556294653856,
"grad_norm": 0.7614275275857106,
"learning_rate": 1.859294942000734e-05,
"loss": 1.2825,
"step": 644
},
{
"epoch": 0.254249815225425,
"grad_norm": 0.7495597529607684,
"learning_rate": 1.85858998109566e-05,
"loss": 1.2941,
"step": 645
},
{
"epoch": 0.2546440009854644,
"grad_norm": 0.76715001759035,
"learning_rate": 1.857883392940837e-05,
"loss": 1.3126,
"step": 646
},
{
"epoch": 0.2550381867455038,
"grad_norm": 0.7357189271424588,
"learning_rate": 1.8571751788754336e-05,
"loss": 1.3363,
"step": 647
},
{
"epoch": 0.25543237250554324,
"grad_norm": 0.7382893718452418,
"learning_rate": 1.856465340241697e-05,
"loss": 1.2237,
"step": 648
},
{
"epoch": 0.25582655826558265,
"grad_norm": 0.7377308175335368,
"learning_rate": 1.8557538783849555e-05,
"loss": 1.2561,
"step": 649
},
{
"epoch": 0.25622074402562206,
"grad_norm": 0.7792573574030509,
"learning_rate": 1.8550407946536127e-05,
"loss": 1.2835,
"step": 650
},
{
"epoch": 0.25661492978566147,
"grad_norm": 0.8268845473577122,
"learning_rate": 1.8543260903991467e-05,
"loss": 1.2624,
"step": 651
},
{
"epoch": 0.25700911554570094,
"grad_norm": 0.7139020431429061,
"learning_rate": 1.8536097669761066e-05,
"loss": 1.2767,
"step": 652
},
{
"epoch": 0.25740330130574035,
"grad_norm": 0.836771495489938,
"learning_rate": 1.85289182574211e-05,
"loss": 1.2564,
"step": 653
},
{
"epoch": 0.25779748706577976,
"grad_norm": 0.7744188165849301,
"learning_rate": 1.8521722680578413e-05,
"loss": 1.3551,
"step": 654
},
{
"epoch": 0.2581916728258192,
"grad_norm": 0.7733400605257766,
"learning_rate": 1.851451095287048e-05,
"loss": 1.3511,
"step": 655
},
{
"epoch": 0.2585858585858586,
"grad_norm": 0.7813471536798385,
"learning_rate": 1.850728308796539e-05,
"loss": 1.2426,
"step": 656
},
{
"epoch": 0.258980044345898,
"grad_norm": 0.7708022669200939,
"learning_rate": 1.8500039099561807e-05,
"loss": 1.2708,
"step": 657
},
{
"epoch": 0.2593742301059374,
"grad_norm": 0.7838881723591813,
"learning_rate": 1.8492779001388964e-05,
"loss": 1.3396,
"step": 658
},
{
"epoch": 0.2597684158659768,
"grad_norm": 0.7443818910969162,
"learning_rate": 1.8485502807206624e-05,
"loss": 1.3021,
"step": 659
},
{
"epoch": 0.2601626016260163,
"grad_norm": 0.7268444207695822,
"learning_rate": 1.847821053080505e-05,
"loss": 1.3232,
"step": 660
},
{
"epoch": 0.2605567873860557,
"grad_norm": 0.7145438455342924,
"learning_rate": 1.8470902186004995e-05,
"loss": 1.2762,
"step": 661
},
{
"epoch": 0.2609509731460951,
"grad_norm": 0.798127221257281,
"learning_rate": 1.8463577786657653e-05,
"loss": 1.3434,
"step": 662
},
{
"epoch": 0.2613451589061345,
"grad_norm": 0.8286302645386731,
"learning_rate": 1.845623734664465e-05,
"loss": 1.3648,
"step": 663
},
{
"epoch": 0.26173934466617393,
"grad_norm": 0.7056475119658424,
"learning_rate": 1.8448880879878026e-05,
"loss": 1.2664,
"step": 664
},
{
"epoch": 0.26213353042621335,
"grad_norm": 0.7486227238349661,
"learning_rate": 1.844150840030018e-05,
"loss": 1.3144,
"step": 665
},
{
"epoch": 0.26252771618625276,
"grad_norm": 0.7252618893757948,
"learning_rate": 1.8434119921883865e-05,
"loss": 1.2523,
"step": 666
},
{
"epoch": 0.26292190194629217,
"grad_norm": 0.7522705686940889,
"learning_rate": 1.8426715458632154e-05,
"loss": 1.3312,
"step": 667
},
{
"epoch": 0.26331608770633164,
"grad_norm": 0.7442803975025406,
"learning_rate": 1.8419295024578417e-05,
"loss": 1.3162,
"step": 668
},
{
"epoch": 0.26371027346637105,
"grad_norm": 0.7428662761759469,
"learning_rate": 1.8411858633786298e-05,
"loss": 1.3616,
"step": 669
},
{
"epoch": 0.26410445922641046,
"grad_norm": 0.6883090253519637,
"learning_rate": 1.8404406300349673e-05,
"loss": 1.2775,
"step": 670
},
{
"epoch": 0.26449864498644987,
"grad_norm": 0.7298650894749236,
"learning_rate": 1.8396938038392636e-05,
"loss": 1.2973,
"step": 671
},
{
"epoch": 0.2648928307464893,
"grad_norm": 0.7210785949379522,
"learning_rate": 1.838945386206948e-05,
"loss": 1.2651,
"step": 672
},
{
"epoch": 0.2652870165065287,
"grad_norm": 0.7455429622427832,
"learning_rate": 1.8381953785564653e-05,
"loss": 1.2784,
"step": 673
},
{
"epoch": 0.2656812022665681,
"grad_norm": 0.7101554754335506,
"learning_rate": 1.8374437823092726e-05,
"loss": 1.2153,
"step": 674
},
{
"epoch": 0.2660753880266075,
"grad_norm": 0.7052828798902647,
"learning_rate": 1.836690598889839e-05,
"loss": 1.2874,
"step": 675
},
{
"epoch": 0.266469573786647,
"grad_norm": 0.7102957673047738,
"learning_rate": 1.835935829725643e-05,
"loss": 1.3323,
"step": 676
},
{
"epoch": 0.2668637595466864,
"grad_norm": 0.7113208099408921,
"learning_rate": 1.8351794762471656e-05,
"loss": 1.2808,
"step": 677
},
{
"epoch": 0.2672579453067258,
"grad_norm": 0.713012458638494,
"learning_rate": 1.8344215398878925e-05,
"loss": 1.2499,
"step": 678
},
{
"epoch": 0.2676521310667652,
"grad_norm": 0.7458478391351581,
"learning_rate": 1.833662022084309e-05,
"loss": 1.2379,
"step": 679
},
{
"epoch": 0.26804631682680463,
"grad_norm": 0.6955091694637261,
"learning_rate": 1.8329009242758977e-05,
"loss": 1.2148,
"step": 680
},
{
"epoch": 0.26844050258684404,
"grad_norm": 0.7331960366798272,
"learning_rate": 1.832138247905135e-05,
"loss": 1.3051,
"step": 681
},
{
"epoch": 0.26883468834688345,
"grad_norm": 0.7207567261465225,
"learning_rate": 1.8313739944174894e-05,
"loss": 1.3065,
"step": 682
},
{
"epoch": 0.26922887410692287,
"grad_norm": 0.7148277245246873,
"learning_rate": 1.8306081652614192e-05,
"loss": 1.2788,
"step": 683
},
{
"epoch": 0.26962305986696233,
"grad_norm": 0.7155577906316034,
"learning_rate": 1.829840761888368e-05,
"loss": 1.2429,
"step": 684
},
{
"epoch": 0.27001724562700175,
"grad_norm": 0.696356161317749,
"learning_rate": 1.829071785752764e-05,
"loss": 1.2729,
"step": 685
},
{
"epoch": 0.27041143138704116,
"grad_norm": 0.7128716614175701,
"learning_rate": 1.8283012383120148e-05,
"loss": 1.3227,
"step": 686
},
{
"epoch": 0.27080561714708057,
"grad_norm": 0.7465800322640285,
"learning_rate": 1.827529121026507e-05,
"loss": 1.3252,
"step": 687
},
{
"epoch": 0.27119980290712,
"grad_norm": 0.8172136430700996,
"learning_rate": 1.8267554353596027e-05,
"loss": 1.2756,
"step": 688
},
{
"epoch": 0.2715939886671594,
"grad_norm": 0.7347557447163089,
"learning_rate": 1.8259801827776358e-05,
"loss": 1.2878,
"step": 689
},
{
"epoch": 0.2719881744271988,
"grad_norm": 0.6960464962207745,
"learning_rate": 1.82520336474991e-05,
"loss": 1.2508,
"step": 690
},
{
"epoch": 0.2723823601872382,
"grad_norm": 0.7323542648353354,
"learning_rate": 1.8244249827486962e-05,
"loss": 1.3276,
"step": 691
},
{
"epoch": 0.2727765459472777,
"grad_norm": 0.7334410491777583,
"learning_rate": 1.8236450382492293e-05,
"loss": 1.2446,
"step": 692
},
{
"epoch": 0.2731707317073171,
"grad_norm": 0.7700697100142729,
"learning_rate": 1.8228635327297054e-05,
"loss": 1.2647,
"step": 693
},
{
"epoch": 0.2735649174673565,
"grad_norm": 0.6868021899359485,
"learning_rate": 1.8220804676712797e-05,
"loss": 1.2585,
"step": 694
},
{
"epoch": 0.2739591032273959,
"grad_norm": 0.7056110870773941,
"learning_rate": 1.8212958445580623e-05,
"loss": 1.2978,
"step": 695
},
{
"epoch": 0.27435328898743533,
"grad_norm": 0.7042929029435405,
"learning_rate": 1.8205096648771166e-05,
"loss": 1.2778,
"step": 696
},
{
"epoch": 0.27474747474747474,
"grad_norm": 0.7960978757280552,
"learning_rate": 1.8197219301184565e-05,
"loss": 1.3364,
"step": 697
},
{
"epoch": 0.27514166050751415,
"grad_norm": 0.7288353276886701,
"learning_rate": 1.818932641775043e-05,
"loss": 1.3099,
"step": 698
},
{
"epoch": 0.27553584626755356,
"grad_norm": 0.7479924057933423,
"learning_rate": 1.81814180134278e-05,
"loss": 1.3429,
"step": 699
},
{
"epoch": 0.27593003202759303,
"grad_norm": 0.7715814930725846,
"learning_rate": 1.817349410320516e-05,
"loss": 1.2634,
"step": 700
},
{
"epoch": 0.27632421778763244,
"grad_norm": 0.7186502326915973,
"learning_rate": 1.816555470210036e-05,
"loss": 1.2677,
"step": 701
},
{
"epoch": 0.27671840354767185,
"grad_norm": 0.6963815556934851,
"learning_rate": 1.815759982516061e-05,
"loss": 1.2738,
"step": 702
},
{
"epoch": 0.27711258930771127,
"grad_norm": 0.725935134036574,
"learning_rate": 1.8149629487462466e-05,
"loss": 1.3357,
"step": 703
},
{
"epoch": 0.2775067750677507,
"grad_norm": 0.7440336010726357,
"learning_rate": 1.814164370411177e-05,
"loss": 1.3394,
"step": 704
},
{
"epoch": 0.2779009608277901,
"grad_norm": 0.7144497832774677,
"learning_rate": 1.8133642490243642e-05,
"loss": 1.3247,
"step": 705
},
{
"epoch": 0.2782951465878295,
"grad_norm": 0.7330387391854017,
"learning_rate": 1.8125625861022455e-05,
"loss": 1.3037,
"step": 706
},
{
"epoch": 0.2786893323478689,
"grad_norm": 0.7408644571783576,
"learning_rate": 1.8117593831641788e-05,
"loss": 1.2714,
"step": 707
},
{
"epoch": 0.2790835181079084,
"grad_norm": 0.7538056025050238,
"learning_rate": 1.810954641732441e-05,
"loss": 1.2744,
"step": 708
},
{
"epoch": 0.2794777038679478,
"grad_norm": 0.7178383604389642,
"learning_rate": 1.8101483633322255e-05,
"loss": 1.3522,
"step": 709
},
{
"epoch": 0.2798718896279872,
"grad_norm": 0.7286512088304942,
"learning_rate": 1.8093405494916373e-05,
"loss": 1.2913,
"step": 710
},
{
"epoch": 0.2802660753880266,
"grad_norm": 0.7524538518197109,
"learning_rate": 1.8085312017416926e-05,
"loss": 1.3544,
"step": 711
},
{
"epoch": 0.280660261148066,
"grad_norm": 0.7789095889944275,
"learning_rate": 1.8077203216163145e-05,
"loss": 1.3328,
"step": 712
},
{
"epoch": 0.28105444690810544,
"grad_norm": 0.7027682398341476,
"learning_rate": 1.8069079106523303e-05,
"loss": 1.316,
"step": 713
},
{
"epoch": 0.28144863266814485,
"grad_norm": 0.71974038692439,
"learning_rate": 1.8060939703894684e-05,
"loss": 1.3089,
"step": 714
},
{
"epoch": 0.28184281842818426,
"grad_norm": 0.750073440309824,
"learning_rate": 1.805278502370356e-05,
"loss": 1.28,
"step": 715
},
{
"epoch": 0.28223700418822373,
"grad_norm": 0.7157617956836964,
"learning_rate": 1.8044615081405153e-05,
"loss": 1.2604,
"step": 716
},
{
"epoch": 0.28263118994826314,
"grad_norm": 0.7094277876635081,
"learning_rate": 1.8036429892483615e-05,
"loss": 1.2041,
"step": 717
},
{
"epoch": 0.28302537570830255,
"grad_norm": 0.6869213238799484,
"learning_rate": 1.8028229472451994e-05,
"loss": 1.2326,
"step": 718
},
{
"epoch": 0.28341956146834196,
"grad_norm": 0.7609339774943211,
"learning_rate": 1.80200138368522e-05,
"loss": 1.2778,
"step": 719
},
{
"epoch": 0.2838137472283814,
"grad_norm": 0.7445388720919836,
"learning_rate": 1.801178300125499e-05,
"loss": 1.3466,
"step": 720
},
{
"epoch": 0.2842079329884208,
"grad_norm": 0.75543054063603,
"learning_rate": 1.800353698125992e-05,
"loss": 1.2684,
"step": 721
},
{
"epoch": 0.2846021187484602,
"grad_norm": 0.7126562502264812,
"learning_rate": 1.7995275792495327e-05,
"loss": 1.3145,
"step": 722
},
{
"epoch": 0.2849963045084996,
"grad_norm": 0.750515516790499,
"learning_rate": 1.7986999450618295e-05,
"loss": 1.2766,
"step": 723
},
{
"epoch": 0.2853904902685391,
"grad_norm": 0.7302431877687291,
"learning_rate": 1.7978707971314636e-05,
"loss": 1.2127,
"step": 724
},
{
"epoch": 0.2857846760285785,
"grad_norm": 0.7122551920492798,
"learning_rate": 1.797040137029884e-05,
"loss": 1.2589,
"step": 725
},
{
"epoch": 0.2861788617886179,
"grad_norm": 0.7938703124948006,
"learning_rate": 1.796207966331406e-05,
"loss": 1.3729,
"step": 726
},
{
"epoch": 0.2865730475486573,
"grad_norm": 0.7541217984200421,
"learning_rate": 1.7953742866132082e-05,
"loss": 1.2927,
"step": 727
},
{
"epoch": 0.2869672333086967,
"grad_norm": 0.7255479166779722,
"learning_rate": 1.794539099455329e-05,
"loss": 1.3431,
"step": 728
},
{
"epoch": 0.28736141906873613,
"grad_norm": 0.7453202011835943,
"learning_rate": 1.7937024064406637e-05,
"loss": 1.2764,
"step": 729
},
{
"epoch": 0.28775560482877555,
"grad_norm": 0.7449089241310055,
"learning_rate": 1.7928642091549616e-05,
"loss": 1.2666,
"step": 730
},
{
"epoch": 0.28814979058881496,
"grad_norm": 0.688535746874336,
"learning_rate": 1.792024509186823e-05,
"loss": 1.2396,
"step": 731
},
{
"epoch": 0.2885439763488544,
"grad_norm": 0.7179660403513343,
"learning_rate": 1.7911833081276962e-05,
"loss": 1.2404,
"step": 732
},
{
"epoch": 0.28893816210889384,
"grad_norm": 0.6957846541829211,
"learning_rate": 1.7903406075718744e-05,
"loss": 1.3032,
"step": 733
},
{
"epoch": 0.28933234786893325,
"grad_norm": 0.7453327673964074,
"learning_rate": 1.7894964091164932e-05,
"loss": 1.3043,
"step": 734
},
{
"epoch": 0.28972653362897266,
"grad_norm": 0.6889929678498284,
"learning_rate": 1.788650714361526e-05,
"loss": 1.2273,
"step": 735
},
{
"epoch": 0.29012071938901207,
"grad_norm": 0.7514828515828875,
"learning_rate": 1.787803524909783e-05,
"loss": 1.232,
"step": 736
},
{
"epoch": 0.2905149051490515,
"grad_norm": 0.69838877253169,
"learning_rate": 1.7869548423669075e-05,
"loss": 1.1814,
"step": 737
},
{
"epoch": 0.2909090909090909,
"grad_norm": 0.7028140683366864,
"learning_rate": 1.7861046683413717e-05,
"loss": 1.3324,
"step": 738
},
{
"epoch": 0.2913032766691303,
"grad_norm": 0.7609333767596239,
"learning_rate": 1.785253004444475e-05,
"loss": 1.3309,
"step": 739
},
{
"epoch": 0.2916974624291698,
"grad_norm": 0.6993070009969047,
"learning_rate": 1.78439985229034e-05,
"loss": 1.2958,
"step": 740
},
{
"epoch": 0.2920916481892092,
"grad_norm": 0.7895491591304246,
"learning_rate": 1.7835452134959112e-05,
"loss": 1.2721,
"step": 741
},
{
"epoch": 0.2924858339492486,
"grad_norm": 0.7484581002135297,
"learning_rate": 1.7826890896809492e-05,
"loss": 1.2696,
"step": 742
},
{
"epoch": 0.292880019709288,
"grad_norm": 0.7180118235912724,
"learning_rate": 1.78183148246803e-05,
"loss": 1.3026,
"step": 743
},
{
"epoch": 0.2932742054693274,
"grad_norm": 0.7821323900052215,
"learning_rate": 1.7809723934825405e-05,
"loss": 1.244,
"step": 744
},
{
"epoch": 0.29366839122936683,
"grad_norm": 0.731279597221484,
"learning_rate": 1.7801118243526764e-05,
"loss": 1.2841,
"step": 745
},
{
"epoch": 0.29406257698940624,
"grad_norm": 0.7328987907210074,
"learning_rate": 1.7792497767094384e-05,
"loss": 1.2574,
"step": 746
},
{
"epoch": 0.29445676274944566,
"grad_norm": 0.7546401708479835,
"learning_rate": 1.7783862521866296e-05,
"loss": 1.2514,
"step": 747
},
{
"epoch": 0.2948509485094851,
"grad_norm": 0.6961282567593424,
"learning_rate": 1.7775212524208513e-05,
"loss": 1.2659,
"step": 748
},
{
"epoch": 0.29524513426952453,
"grad_norm": 0.7069163112336031,
"learning_rate": 1.776654779051502e-05,
"loss": 1.2231,
"step": 749
},
{
"epoch": 0.29563932002956395,
"grad_norm": 0.7257978049323676,
"learning_rate": 1.775786833720773e-05,
"loss": 1.2728,
"step": 750
},
{
"epoch": 0.29603350578960336,
"grad_norm": 0.7560009441390841,
"learning_rate": 1.7749174180736443e-05,
"loss": 1.2819,
"step": 751
},
{
"epoch": 0.29642769154964277,
"grad_norm": 0.6956575266835414,
"learning_rate": 1.7740465337578823e-05,
"loss": 1.3005,
"step": 752
},
{
"epoch": 0.2968218773096822,
"grad_norm": 0.7079492136542035,
"learning_rate": 1.7731741824240385e-05,
"loss": 1.227,
"step": 753
},
{
"epoch": 0.2972160630697216,
"grad_norm": 0.7184097566051775,
"learning_rate": 1.7723003657254447e-05,
"loss": 1.2924,
"step": 754
},
{
"epoch": 0.297610248829761,
"grad_norm": 0.6854141387606205,
"learning_rate": 1.771425085318208e-05,
"loss": 1.2557,
"step": 755
},
{
"epoch": 0.29800443458980047,
"grad_norm": 0.6879860581907943,
"learning_rate": 1.7705483428612114e-05,
"loss": 1.2204,
"step": 756
},
{
"epoch": 0.2983986203498399,
"grad_norm": 0.7067053556944854,
"learning_rate": 1.7696701400161077e-05,
"loss": 1.2709,
"step": 757
},
{
"epoch": 0.2987928061098793,
"grad_norm": 0.6684898845941895,
"learning_rate": 1.768790478447319e-05,
"loss": 1.2379,
"step": 758
},
{
"epoch": 0.2991869918699187,
"grad_norm": 0.7669440743034426,
"learning_rate": 1.7679093598220305e-05,
"loss": 1.2965,
"step": 759
},
{
"epoch": 0.2995811776299581,
"grad_norm": 0.7264067182866932,
"learning_rate": 1.7670267858101895e-05,
"loss": 1.3299,
"step": 760
},
{
"epoch": 0.29997536338999753,
"grad_norm": 0.7154874058477277,
"learning_rate": 1.766142758084502e-05,
"loss": 1.2714,
"step": 761
},
{
"epoch": 0.30036954915003694,
"grad_norm": 0.7339691526122842,
"learning_rate": 1.7652572783204286e-05,
"loss": 1.2567,
"step": 762
},
{
"epoch": 0.30076373491007635,
"grad_norm": 0.7113428916700398,
"learning_rate": 1.764370348196183e-05,
"loss": 1.2466,
"step": 763
},
{
"epoch": 0.3011579206701158,
"grad_norm": 0.7468376219349876,
"learning_rate": 1.7634819693927254e-05,
"loss": 1.2894,
"step": 764
},
{
"epoch": 0.30155210643015523,
"grad_norm": 0.706632725084111,
"learning_rate": 1.762592143593764e-05,
"loss": 1.2872,
"step": 765
},
{
"epoch": 0.30194629219019464,
"grad_norm": 0.6794782711352044,
"learning_rate": 1.761700872485748e-05,
"loss": 1.2807,
"step": 766
},
{
"epoch": 0.30234047795023405,
"grad_norm": 0.7244853098320986,
"learning_rate": 1.7608081577578665e-05,
"loss": 1.2835,
"step": 767
},
{
"epoch": 0.30273466371027347,
"grad_norm": 0.778447414784227,
"learning_rate": 1.759914001102045e-05,
"loss": 1.2765,
"step": 768
},
{
"epoch": 0.3031288494703129,
"grad_norm": 0.6969578931450477,
"learning_rate": 1.7590184042129406e-05,
"loss": 1.231,
"step": 769
},
{
"epoch": 0.3035230352303523,
"grad_norm": 0.6772342269604559,
"learning_rate": 1.758121368787941e-05,
"loss": 1.2599,
"step": 770
},
{
"epoch": 0.3039172209903917,
"grad_norm": 0.7659352446323853,
"learning_rate": 1.7572228965271595e-05,
"loss": 1.2728,
"step": 771
},
{
"epoch": 0.30431140675043117,
"grad_norm": 0.7140083484092759,
"learning_rate": 1.756322989133434e-05,
"loss": 1.273,
"step": 772
},
{
"epoch": 0.3047055925104706,
"grad_norm": 0.7580395855737478,
"learning_rate": 1.7554216483123205e-05,
"loss": 1.257,
"step": 773
},
{
"epoch": 0.30509977827051,
"grad_norm": 0.7139671098163918,
"learning_rate": 1.7545188757720933e-05,
"loss": 1.2526,
"step": 774
},
{
"epoch": 0.3054939640305494,
"grad_norm": 0.7180915186637021,
"learning_rate": 1.753614673223739e-05,
"loss": 1.284,
"step": 775
},
{
"epoch": 0.3058881497905888,
"grad_norm": 0.6906674260442509,
"learning_rate": 1.7527090423809553e-05,
"loss": 1.3048,
"step": 776
},
{
"epoch": 0.3062823355506282,
"grad_norm": 0.6975851458655973,
"learning_rate": 1.7518019849601466e-05,
"loss": 1.2902,
"step": 777
},
{
"epoch": 0.30667652131066764,
"grad_norm": 0.7046928833082814,
"learning_rate": 1.7508935026804202e-05,
"loss": 1.2339,
"step": 778
},
{
"epoch": 0.30707070707070705,
"grad_norm": 0.7051521547776037,
"learning_rate": 1.749983597263586e-05,
"loss": 1.2921,
"step": 779
},
{
"epoch": 0.3074648928307465,
"grad_norm": 0.6736469006003648,
"learning_rate": 1.749072270434148e-05,
"loss": 1.271,
"step": 780
},
{
"epoch": 0.30785907859078593,
"grad_norm": 1.9120037647074484,
"learning_rate": 1.7481595239193073e-05,
"loss": 1.2196,
"step": 781
},
{
"epoch": 0.30825326435082534,
"grad_norm": 0.72077851804003,
"learning_rate": 1.747245359448954e-05,
"loss": 1.2623,
"step": 782
},
{
"epoch": 0.30864745011086475,
"grad_norm": 0.6879089595866057,
"learning_rate": 1.7463297787556656e-05,
"loss": 1.2604,
"step": 783
},
{
"epoch": 0.30904163587090416,
"grad_norm": 0.7126887694269388,
"learning_rate": 1.745412783574704e-05,
"loss": 1.2688,
"step": 784
},
{
"epoch": 0.3094358216309436,
"grad_norm": 0.6783349938024574,
"learning_rate": 1.744494375644012e-05,
"loss": 1.2142,
"step": 785
},
{
"epoch": 0.309830007390983,
"grad_norm": 0.7591782870663694,
"learning_rate": 1.7435745567042096e-05,
"loss": 1.3246,
"step": 786
},
{
"epoch": 0.3102241931510224,
"grad_norm": 0.7137080648341777,
"learning_rate": 1.7426533284985912e-05,
"loss": 1.256,
"step": 787
},
{
"epoch": 0.31061837891106187,
"grad_norm": 0.712242808651282,
"learning_rate": 1.7417306927731226e-05,
"loss": 1.2504,
"step": 788
},
{
"epoch": 0.3110125646711013,
"grad_norm": 0.7706834788124493,
"learning_rate": 1.7408066512764365e-05,
"loss": 1.2842,
"step": 789
},
{
"epoch": 0.3114067504311407,
"grad_norm": 0.6756575206343757,
"learning_rate": 1.73988120575983e-05,
"loss": 1.2302,
"step": 790
},
{
"epoch": 0.3118009361911801,
"grad_norm": 0.7172786293209685,
"learning_rate": 1.7389543579772613e-05,
"loss": 1.2746,
"step": 791
},
{
"epoch": 0.3121951219512195,
"grad_norm": 0.7114990921157863,
"learning_rate": 1.738026109685347e-05,
"loss": 1.247,
"step": 792
},
{
"epoch": 0.3125893077112589,
"grad_norm": 0.7464653029721845,
"learning_rate": 1.737096462643357e-05,
"loss": 1.2843,
"step": 793
},
{
"epoch": 0.31298349347129834,
"grad_norm": 0.7246251283451155,
"learning_rate": 1.736165418613212e-05,
"loss": 1.2896,
"step": 794
},
{
"epoch": 0.31337767923133775,
"grad_norm": 0.709039744798614,
"learning_rate": 1.7352329793594817e-05,
"loss": 1.2729,
"step": 795
},
{
"epoch": 0.3137718649913772,
"grad_norm": 0.7184347792609641,
"learning_rate": 1.7342991466493785e-05,
"loss": 1.3516,
"step": 796
},
{
"epoch": 0.3141660507514166,
"grad_norm": 0.677698026889925,
"learning_rate": 1.7333639222527572e-05,
"loss": 1.2565,
"step": 797
},
{
"epoch": 0.31456023651145604,
"grad_norm": 0.7345054222302991,
"learning_rate": 1.732427307942109e-05,
"loss": 1.2509,
"step": 798
},
{
"epoch": 0.31495442227149545,
"grad_norm": 0.7766755838188357,
"learning_rate": 1.7314893054925604e-05,
"loss": 1.2766,
"step": 799
},
{
"epoch": 0.31534860803153486,
"grad_norm": 0.8110496899704974,
"learning_rate": 1.730549916681868e-05,
"loss": 1.3387,
"step": 800
},
{
"epoch": 0.31574279379157427,
"grad_norm": 0.7332603361275668,
"learning_rate": 1.7296091432904164e-05,
"loss": 1.3232,
"step": 801
},
{
"epoch": 0.3161369795516137,
"grad_norm": 0.7406352642846648,
"learning_rate": 1.728666987101214e-05,
"loss": 1.2996,
"step": 802
},
{
"epoch": 0.3165311653116531,
"grad_norm": 0.7257385239706662,
"learning_rate": 1.7277234498998897e-05,
"loss": 1.2809,
"step": 803
},
{
"epoch": 0.31692535107169256,
"grad_norm": 0.7450615958562268,
"learning_rate": 1.726778533474691e-05,
"loss": 1.2937,
"step": 804
},
{
"epoch": 0.317319536831732,
"grad_norm": 0.7062517786301892,
"learning_rate": 1.725832239616478e-05,
"loss": 1.3006,
"step": 805
},
{
"epoch": 0.3177137225917714,
"grad_norm": 0.7080667822251828,
"learning_rate": 1.724884570118722e-05,
"loss": 1.2349,
"step": 806
},
{
"epoch": 0.3181079083518108,
"grad_norm": 0.7066931019098044,
"learning_rate": 1.723935526777502e-05,
"loss": 1.2272,
"step": 807
},
{
"epoch": 0.3185020941118502,
"grad_norm": 0.6946668338018744,
"learning_rate": 1.722985111391499e-05,
"loss": 1.2962,
"step": 808
},
{
"epoch": 0.3188962798718896,
"grad_norm": 0.6796597060520128,
"learning_rate": 1.7220333257619967e-05,
"loss": 1.3037,
"step": 809
},
{
"epoch": 0.31929046563192903,
"grad_norm": 1.6609616990291973,
"learning_rate": 1.721080171692874e-05,
"loss": 1.3676,
"step": 810
},
{
"epoch": 0.31968465139196844,
"grad_norm": 0.7455397950852571,
"learning_rate": 1.720125650990605e-05,
"loss": 1.2693,
"step": 811
},
{
"epoch": 0.3200788371520079,
"grad_norm": 1.8102002609851213,
"learning_rate": 1.7191697654642517e-05,
"loss": 1.443,
"step": 812
},
{
"epoch": 0.3204730229120473,
"grad_norm": 1.6105677014342337,
"learning_rate": 1.7182125169254646e-05,
"loss": 1.3548,
"step": 813
},
{
"epoch": 0.32086720867208673,
"grad_norm": 1.9398768550889596,
"learning_rate": 1.717253907188477e-05,
"loss": 1.3585,
"step": 814
},
{
"epoch": 0.32126139443212615,
"grad_norm": 1.628604424489859,
"learning_rate": 1.716293938070102e-05,
"loss": 1.3206,
"step": 815
},
{
"epoch": 0.32165558019216556,
"grad_norm": 2.801181103409832,
"learning_rate": 1.7153326113897286e-05,
"loss": 1.4204,
"step": 816
},
{
"epoch": 0.32204976595220497,
"grad_norm": 1.0130939786005846,
"learning_rate": 1.7143699289693193e-05,
"loss": 1.2738,
"step": 817
},
{
"epoch": 0.3224439517122444,
"grad_norm": 6.872564216473981,
"learning_rate": 1.7134058926334063e-05,
"loss": 1.262,
"step": 818
},
{
"epoch": 0.3228381374722838,
"grad_norm": 1.4200836123074054,
"learning_rate": 1.7124405042090865e-05,
"loss": 1.3799,
"step": 819
},
{
"epoch": 0.32323232323232326,
"grad_norm": 5.08400535142629,
"learning_rate": 1.711473765526021e-05,
"loss": 1.3092,
"step": 820
},
{
"epoch": 0.32362650899236267,
"grad_norm": 1.5849311506474677,
"learning_rate": 1.7105056784164295e-05,
"loss": 1.2599,
"step": 821
},
{
"epoch": 0.3240206947524021,
"grad_norm": 1.0013431185133732,
"learning_rate": 1.7095362447150866e-05,
"loss": 1.3207,
"step": 822
},
{
"epoch": 0.3244148805124415,
"grad_norm": 0.6866727508748066,
"learning_rate": 1.7085654662593192e-05,
"loss": 1.2265,
"step": 823
},
{
"epoch": 0.3248090662724809,
"grad_norm": 0.7423237770798616,
"learning_rate": 1.7075933448890037e-05,
"loss": 1.2494,
"step": 824
},
{
"epoch": 0.3252032520325203,
"grad_norm": 0.7327984292482648,
"learning_rate": 1.706619882446561e-05,
"loss": 1.2826,
"step": 825
},
{
"epoch": 0.32559743779255973,
"grad_norm": 0.8307141447009255,
"learning_rate": 1.7056450807769543e-05,
"loss": 1.3328,
"step": 826
},
{
"epoch": 0.32599162355259914,
"grad_norm": 0.7685568008883157,
"learning_rate": 1.7046689417276836e-05,
"loss": 1.2668,
"step": 827
},
{
"epoch": 0.3263858093126386,
"grad_norm": 0.7143149682827579,
"learning_rate": 1.7036914671487854e-05,
"loss": 1.3147,
"step": 828
},
{
"epoch": 0.326779995072678,
"grad_norm": 0.7441227072240346,
"learning_rate": 1.7027126588928255e-05,
"loss": 1.2662,
"step": 829
},
{
"epoch": 0.32717418083271743,
"grad_norm": 0.8549422472836754,
"learning_rate": 1.701732518814899e-05,
"loss": 1.2276,
"step": 830
},
{
"epoch": 0.32756836659275684,
"grad_norm": 0.7104822685684634,
"learning_rate": 1.7007510487726247e-05,
"loss": 1.2174,
"step": 831
},
{
"epoch": 0.32796255235279625,
"grad_norm": 0.7990258038527759,
"learning_rate": 1.699768250626141e-05,
"loss": 1.2084,
"step": 832
},
{
"epoch": 0.32835673811283567,
"grad_norm": 0.7941920583151476,
"learning_rate": 1.698784126238105e-05,
"loss": 1.3014,
"step": 833
},
{
"epoch": 0.3287509238728751,
"grad_norm": 0.7565823644252784,
"learning_rate": 1.697798677473686e-05,
"loss": 1.3198,
"step": 834
},
{
"epoch": 0.3291451096329145,
"grad_norm": 0.776895609925856,
"learning_rate": 1.6968119062005644e-05,
"loss": 1.3171,
"step": 835
},
{
"epoch": 0.32953929539295396,
"grad_norm": 0.7511145926401521,
"learning_rate": 1.6958238142889258e-05,
"loss": 1.2645,
"step": 836
},
{
"epoch": 0.32993348115299337,
"grad_norm": 0.8590843085742348,
"learning_rate": 1.6948344036114604e-05,
"loss": 1.2381,
"step": 837
},
{
"epoch": 0.3303276669130328,
"grad_norm": 0.7298728955089272,
"learning_rate": 1.6938436760433565e-05,
"loss": 1.2919,
"step": 838
},
{
"epoch": 0.3307218526730722,
"grad_norm": 0.723873691001796,
"learning_rate": 1.6928516334622988e-05,
"loss": 1.2859,
"step": 839
},
{
"epoch": 0.3311160384331116,
"grad_norm": 0.6739547357750979,
"learning_rate": 1.6918582777484642e-05,
"loss": 1.2698,
"step": 840
},
{
"epoch": 0.331510224193151,
"grad_norm": 0.7603942315040987,
"learning_rate": 1.690863610784518e-05,
"loss": 1.3326,
"step": 841
},
{
"epoch": 0.3319044099531904,
"grad_norm": 0.7428516273827751,
"learning_rate": 1.689867634455612e-05,
"loss": 1.3044,
"step": 842
},
{
"epoch": 0.33229859571322984,
"grad_norm": 0.6987204595473288,
"learning_rate": 1.6888703506493774e-05,
"loss": 1.2418,
"step": 843
},
{
"epoch": 0.3326927814732693,
"grad_norm": 0.6798174720438129,
"learning_rate": 1.687871761255925e-05,
"loss": 1.2692,
"step": 844
},
{
"epoch": 0.3330869672333087,
"grad_norm": 0.6812029162107662,
"learning_rate": 1.6868718681678397e-05,
"loss": 1.2651,
"step": 845
},
{
"epoch": 0.33348115299334813,
"grad_norm": 5.833213521596053,
"learning_rate": 1.6858706732801767e-05,
"loss": 1.2184,
"step": 846
},
{
"epoch": 0.33387533875338754,
"grad_norm": 2.1210809511503856,
"learning_rate": 1.6848681784904597e-05,
"loss": 1.3386,
"step": 847
},
{
"epoch": 0.33426952451342695,
"grad_norm": 4.2587995536151135,
"learning_rate": 1.6838643856986746e-05,
"loss": 1.2538,
"step": 848
},
{
"epoch": 0.33466371027346636,
"grad_norm": 0.814091566447592,
"learning_rate": 1.682859296807268e-05,
"loss": 1.2472,
"step": 849
},
{
"epoch": 0.3350578960335058,
"grad_norm": 0.7308070804439674,
"learning_rate": 1.6818529137211427e-05,
"loss": 1.222,
"step": 850
},
{
"epoch": 0.3354520817935452,
"grad_norm": 0.733680332929859,
"learning_rate": 1.680845238347655e-05,
"loss": 1.2992,
"step": 851
},
{
"epoch": 0.33584626755358465,
"grad_norm": 0.7265681835122267,
"learning_rate": 1.6798362725966102e-05,
"loss": 1.2956,
"step": 852
},
{
"epoch": 0.33624045331362407,
"grad_norm": 0.7402397151917712,
"learning_rate": 1.6788260183802586e-05,
"loss": 1.3171,
"step": 853
},
{
"epoch": 0.3366346390736635,
"grad_norm": 0.7137092615288991,
"learning_rate": 1.6778144776132927e-05,
"loss": 1.2102,
"step": 854
},
{
"epoch": 0.3370288248337029,
"grad_norm": 0.7156854110239057,
"learning_rate": 1.6768016522128435e-05,
"loss": 1.3038,
"step": 855
},
{
"epoch": 0.3374230105937423,
"grad_norm": 0.711623409866771,
"learning_rate": 1.675787544098477e-05,
"loss": 1.2436,
"step": 856
},
{
"epoch": 0.3378171963537817,
"grad_norm": 0.7171571327488878,
"learning_rate": 1.6747721551921894e-05,
"loss": 1.2316,
"step": 857
},
{
"epoch": 0.3382113821138211,
"grad_norm": 0.8547583498487163,
"learning_rate": 1.6737554874184058e-05,
"loss": 1.2736,
"step": 858
},
{
"epoch": 0.33860556787386054,
"grad_norm": 0.7302470996316592,
"learning_rate": 1.6727375427039734e-05,
"loss": 1.3211,
"step": 859
},
{
"epoch": 0.3389997536339,
"grad_norm": 0.8374723063663263,
"learning_rate": 1.671718322978161e-05,
"loss": 1.22,
"step": 860
},
{
"epoch": 0.3393939393939394,
"grad_norm": 0.6807758814646102,
"learning_rate": 1.6706978301726523e-05,
"loss": 1.1737,
"step": 861
},
{
"epoch": 0.3397881251539788,
"grad_norm": 0.8925191795209313,
"learning_rate": 1.6696760662215457e-05,
"loss": 1.3089,
"step": 862
},
{
"epoch": 0.34018231091401824,
"grad_norm": 0.7669197207119955,
"learning_rate": 1.6686530330613472e-05,
"loss": 1.2567,
"step": 863
},
{
"epoch": 0.34057649667405765,
"grad_norm": 0.7863821853939692,
"learning_rate": 1.6676287326309684e-05,
"loss": 1.2913,
"step": 864
},
{
"epoch": 0.34097068243409706,
"grad_norm": 0.7288234899543948,
"learning_rate": 1.6666031668717246e-05,
"loss": 1.2282,
"step": 865
},
{
"epoch": 0.3413648681941365,
"grad_norm": 0.7392427569586649,
"learning_rate": 1.6655763377273258e-05,
"loss": 1.2523,
"step": 866
},
{
"epoch": 0.3417590539541759,
"grad_norm": 0.773906001259452,
"learning_rate": 1.6645482471438805e-05,
"loss": 1.2792,
"step": 867
},
{
"epoch": 0.34215323971421535,
"grad_norm": 0.7307235011238918,
"learning_rate": 1.6635188970698843e-05,
"loss": 1.2767,
"step": 868
},
{
"epoch": 0.34254742547425476,
"grad_norm": 0.7781474135830119,
"learning_rate": 1.662488289456222e-05,
"loss": 1.2846,
"step": 869
},
{
"epoch": 0.3429416112342942,
"grad_norm": 0.7962078143230832,
"learning_rate": 1.661456426256161e-05,
"loss": 1.256,
"step": 870
},
{
"epoch": 0.3433357969943336,
"grad_norm": 0.6984713096930648,
"learning_rate": 1.660423309425349e-05,
"loss": 1.2114,
"step": 871
},
{
"epoch": 0.343729982754373,
"grad_norm": 0.9653083144870128,
"learning_rate": 1.6593889409218084e-05,
"loss": 1.27,
"step": 872
},
{
"epoch": 0.3441241685144124,
"grad_norm": 0.7327421492980511,
"learning_rate": 1.6583533227059353e-05,
"loss": 1.2789,
"step": 873
},
{
"epoch": 0.3445183542744518,
"grad_norm": 0.7398126983540253,
"learning_rate": 1.657316456740494e-05,
"loss": 1.3085,
"step": 874
},
{
"epoch": 0.34491254003449123,
"grad_norm": 0.7299557711967728,
"learning_rate": 1.656278344990612e-05,
"loss": 1.2173,
"step": 875
},
{
"epoch": 0.3453067257945307,
"grad_norm": 0.6863138322240955,
"learning_rate": 1.6552389894237806e-05,
"loss": 1.2902,
"step": 876
},
{
"epoch": 0.3457009115545701,
"grad_norm": 0.7199868674478724,
"learning_rate": 1.6541983920098462e-05,
"loss": 1.2807,
"step": 877
},
{
"epoch": 0.3460950973146095,
"grad_norm": 0.7634746076633273,
"learning_rate": 1.6531565547210095e-05,
"loss": 1.2891,
"step": 878
},
{
"epoch": 0.34648928307464893,
"grad_norm": 0.7334440482002302,
"learning_rate": 1.6521134795318214e-05,
"loss": 1.2927,
"step": 879
},
{
"epoch": 0.34688346883468835,
"grad_norm": 0.7223249271668641,
"learning_rate": 1.6510691684191795e-05,
"loss": 1.328,
"step": 880
},
{
"epoch": 0.34727765459472776,
"grad_norm": 0.7283270674703335,
"learning_rate": 1.650023623362322e-05,
"loss": 1.2518,
"step": 881
},
{
"epoch": 0.34767184035476717,
"grad_norm": 0.6859617703188744,
"learning_rate": 1.648976846342827e-05,
"loss": 1.2036,
"step": 882
},
{
"epoch": 0.3480660261148066,
"grad_norm": 0.743057000636584,
"learning_rate": 1.647928839344608e-05,
"loss": 1.1975,
"step": 883
},
{
"epoch": 0.34846021187484605,
"grad_norm": 0.8879799533842352,
"learning_rate": 1.6468796043539082e-05,
"loss": 1.2689,
"step": 884
},
{
"epoch": 0.34885439763488546,
"grad_norm": 0.8750572793943686,
"learning_rate": 1.645829143359299e-05,
"loss": 1.2318,
"step": 885
},
{
"epoch": 0.34924858339492487,
"grad_norm": 0.7446142347770219,
"learning_rate": 1.6447774583516756e-05,
"loss": 1.2977,
"step": 886
},
{
"epoch": 0.3496427691549643,
"grad_norm": 0.7504423660668825,
"learning_rate": 1.6437245513242523e-05,
"loss": 1.2924,
"step": 887
},
{
"epoch": 0.3500369549150037,
"grad_norm": 0.7101861154635718,
"learning_rate": 1.6426704242725603e-05,
"loss": 1.2577,
"step": 888
},
{
"epoch": 0.3504311406750431,
"grad_norm": 0.747939528808994,
"learning_rate": 1.6416150791944422e-05,
"loss": 1.258,
"step": 889
},
{
"epoch": 0.3508253264350825,
"grad_norm": 0.8886537060733,
"learning_rate": 1.640558518090049e-05,
"loss": 1.2302,
"step": 890
},
{
"epoch": 0.35121951219512193,
"grad_norm": 0.7590526147979498,
"learning_rate": 1.639500742961838e-05,
"loss": 1.2814,
"step": 891
},
{
"epoch": 0.3516136979551614,
"grad_norm": 0.7361888142899841,
"learning_rate": 1.6384417558145654e-05,
"loss": 1.284,
"step": 892
},
{
"epoch": 0.3520078837152008,
"grad_norm": 0.7328949864046489,
"learning_rate": 1.637381558655286e-05,
"loss": 1.2238,
"step": 893
},
{
"epoch": 0.3524020694752402,
"grad_norm": 0.7763585243100655,
"learning_rate": 1.6363201534933465e-05,
"loss": 1.2669,
"step": 894
},
{
"epoch": 0.35279625523527963,
"grad_norm": 0.7724373870079227,
"learning_rate": 1.635257542340384e-05,
"loss": 1.2572,
"step": 895
},
{
"epoch": 0.35319044099531904,
"grad_norm": 0.7384217206450774,
"learning_rate": 1.6341937272103213e-05,
"loss": 1.2394,
"step": 896
},
{
"epoch": 0.35358462675535846,
"grad_norm": 0.910247717689576,
"learning_rate": 1.6331287101193625e-05,
"loss": 1.2368,
"step": 897
},
{
"epoch": 0.35397881251539787,
"grad_norm": 0.7158162891901805,
"learning_rate": 1.6320624930859905e-05,
"loss": 1.2402,
"step": 898
},
{
"epoch": 0.3543729982754373,
"grad_norm": 0.8329732085362143,
"learning_rate": 1.6309950781309612e-05,
"loss": 1.2966,
"step": 899
},
{
"epoch": 0.35476718403547675,
"grad_norm": 0.8155246854171831,
"learning_rate": 1.6299264672773025e-05,
"loss": 1.2497,
"step": 900
},
{
"epoch": 0.35516136979551616,
"grad_norm": 0.7837030128107672,
"learning_rate": 1.6288566625503076e-05,
"loss": 1.2868,
"step": 901
},
{
"epoch": 0.35555555555555557,
"grad_norm": 0.7235353047172081,
"learning_rate": 1.627785665977532e-05,
"loss": 1.3201,
"step": 902
},
{
"epoch": 0.355949741315595,
"grad_norm": 0.7380179619209855,
"learning_rate": 1.6267134795887914e-05,
"loss": 1.3081,
"step": 903
},
{
"epoch": 0.3563439270756344,
"grad_norm": 0.7592157290500411,
"learning_rate": 1.6256401054161565e-05,
"loss": 1.1903,
"step": 904
},
{
"epoch": 0.3567381128356738,
"grad_norm": 0.7467318769646345,
"learning_rate": 1.6245655454939474e-05,
"loss": 1.2442,
"step": 905
},
{
"epoch": 0.3571322985957132,
"grad_norm": 0.8375209294354106,
"learning_rate": 1.6234898018587336e-05,
"loss": 1.3645,
"step": 906
},
{
"epoch": 0.3575264843557526,
"grad_norm": 0.6897682274849407,
"learning_rate": 1.622412876549327e-05,
"loss": 1.2427,
"step": 907
},
{
"epoch": 0.3579206701157921,
"grad_norm": 0.6863050257352118,
"learning_rate": 1.621334771606778e-05,
"loss": 1.2618,
"step": 908
},
{
"epoch": 0.3583148558758315,
"grad_norm": 0.7753517670222771,
"learning_rate": 1.6202554890743754e-05,
"loss": 1.3007,
"step": 909
},
{
"epoch": 0.3587090416358709,
"grad_norm": 0.7259581040390859,
"learning_rate": 1.619175030997638e-05,
"loss": 1.2528,
"step": 910
},
{
"epoch": 0.35910322739591033,
"grad_norm": 0.7718789856797308,
"learning_rate": 1.6180933994243123e-05,
"loss": 1.3085,
"step": 911
},
{
"epoch": 0.35949741315594974,
"grad_norm": 0.7146087165544308,
"learning_rate": 1.6170105964043698e-05,
"loss": 1.2306,
"step": 912
},
{
"epoch": 0.35989159891598915,
"grad_norm": 0.7346445190650487,
"learning_rate": 1.6159266239900015e-05,
"loss": 1.2984,
"step": 913
},
{
"epoch": 0.36028578467602856,
"grad_norm": 0.6888116952571305,
"learning_rate": 1.614841484235616e-05,
"loss": 1.2657,
"step": 914
},
{
"epoch": 0.360679970436068,
"grad_norm": 0.6989568807671639,
"learning_rate": 1.6137551791978325e-05,
"loss": 1.2347,
"step": 915
},
{
"epoch": 0.36107415619610744,
"grad_norm": 0.6755063819703383,
"learning_rate": 1.61266771093548e-05,
"loss": 1.2551,
"step": 916
},
{
"epoch": 0.36146834195614685,
"grad_norm": 0.6534472286383475,
"learning_rate": 1.6115790815095914e-05,
"loss": 1.1829,
"step": 917
},
{
"epoch": 0.36186252771618627,
"grad_norm": 0.7262958248573816,
"learning_rate": 1.610489292983401e-05,
"loss": 1.31,
"step": 918
},
{
"epoch": 0.3622567134762257,
"grad_norm": 0.7648471804581862,
"learning_rate": 1.6093983474223392e-05,
"loss": 1.259,
"step": 919
},
{
"epoch": 0.3626508992362651,
"grad_norm": 0.7020781761512667,
"learning_rate": 1.6083062468940297e-05,
"loss": 1.3028,
"step": 920
},
{
"epoch": 0.3630450849963045,
"grad_norm": 0.6839393628121689,
"learning_rate": 1.6072129934682847e-05,
"loss": 1.2558,
"step": 921
},
{
"epoch": 0.3634392707563439,
"grad_norm": 0.7058988465923998,
"learning_rate": 1.606118589217102e-05,
"loss": 1.2582,
"step": 922
},
{
"epoch": 0.3638334565163833,
"grad_norm": 0.6791475273873648,
"learning_rate": 1.605023036214661e-05,
"loss": 1.2142,
"step": 923
},
{
"epoch": 0.3642276422764228,
"grad_norm": 0.6970350336814236,
"learning_rate": 1.6039263365373167e-05,
"loss": 1.2528,
"step": 924
},
{
"epoch": 0.3646218280364622,
"grad_norm": 0.6699695799738228,
"learning_rate": 1.602828492263598e-05,
"loss": 1.1959,
"step": 925
},
{
"epoch": 0.3650160137965016,
"grad_norm": 0.663408412743378,
"learning_rate": 1.6017295054742045e-05,
"loss": 1.288,
"step": 926
},
{
"epoch": 0.365410199556541,
"grad_norm": 0.7158290886170531,
"learning_rate": 1.6006293782519988e-05,
"loss": 1.2376,
"step": 927
},
{
"epoch": 0.36580438531658044,
"grad_norm": 0.7543773228580308,
"learning_rate": 1.5995281126820067e-05,
"loss": 1.2899,
"step": 928
},
{
"epoch": 0.36619857107661985,
"grad_norm": 0.744149002729838,
"learning_rate": 1.5984257108514107e-05,
"loss": 1.3389,
"step": 929
},
{
"epoch": 0.36659275683665926,
"grad_norm": 0.7182715748702388,
"learning_rate": 1.5973221748495472e-05,
"loss": 1.3381,
"step": 930
},
{
"epoch": 0.3669869425966987,
"grad_norm": 0.7001237272757365,
"learning_rate": 1.5962175067679013e-05,
"loss": 1.2702,
"step": 931
},
{
"epoch": 0.36738112835673814,
"grad_norm": 0.7077959320676287,
"learning_rate": 1.5951117087001048e-05,
"loss": 1.2647,
"step": 932
},
{
"epoch": 0.36777531411677755,
"grad_norm": 0.693416521429882,
"learning_rate": 1.5940047827419305e-05,
"loss": 1.307,
"step": 933
},
{
"epoch": 0.36816949987681696,
"grad_norm": 5.313983840642,
"learning_rate": 1.592896730991289e-05,
"loss": 1.3227,
"step": 934
},
{
"epoch": 0.3685636856368564,
"grad_norm": 0.7174526675424638,
"learning_rate": 1.591787555548225e-05,
"loss": 1.2003,
"step": 935
},
{
"epoch": 0.3689578713968958,
"grad_norm": 0.7620783078614348,
"learning_rate": 1.590677258514911e-05,
"loss": 1.2984,
"step": 936
},
{
"epoch": 0.3693520571569352,
"grad_norm": 0.7102280092234018,
"learning_rate": 1.5895658419956485e-05,
"loss": 1.1827,
"step": 937
},
{
"epoch": 0.3697462429169746,
"grad_norm": 0.7106880003780766,
"learning_rate": 1.588453308096857e-05,
"loss": 1.2557,
"step": 938
},
{
"epoch": 0.370140428677014,
"grad_norm": 0.7113617397724621,
"learning_rate": 1.587339658927077e-05,
"loss": 1.2874,
"step": 939
},
{
"epoch": 0.3705346144370535,
"grad_norm": 0.7043522365953943,
"learning_rate": 1.5862248965969604e-05,
"loss": 1.2596,
"step": 940
},
{
"epoch": 0.3709288001970929,
"grad_norm": 0.7433597815080879,
"learning_rate": 1.5851090232192704e-05,
"loss": 1.3157,
"step": 941
},
{
"epoch": 0.3713229859571323,
"grad_norm": 0.6920086062528787,
"learning_rate": 1.5839920409088743e-05,
"loss": 1.2526,
"step": 942
},
{
"epoch": 0.3717171717171717,
"grad_norm": 0.6806330894798819,
"learning_rate": 1.5828739517827426e-05,
"loss": 1.2665,
"step": 943
},
{
"epoch": 0.37211135747721114,
"grad_norm": 0.693773375915683,
"learning_rate": 1.5817547579599436e-05,
"loss": 1.2284,
"step": 944
},
{
"epoch": 0.37250554323725055,
"grad_norm": 0.679887610966136,
"learning_rate": 1.5806344615616375e-05,
"loss": 1.2231,
"step": 945
},
{
"epoch": 0.37289972899728996,
"grad_norm": 0.6898748206285744,
"learning_rate": 1.5795130647110755e-05,
"loss": 1.3302,
"step": 946
},
{
"epoch": 0.37329391475732937,
"grad_norm": 0.7348938348769922,
"learning_rate": 1.5783905695335947e-05,
"loss": 1.2388,
"step": 947
},
{
"epoch": 0.37368810051736884,
"grad_norm": 0.7160016591377841,
"learning_rate": 1.577266978156613e-05,
"loss": 1.2105,
"step": 948
},
{
"epoch": 0.37408228627740825,
"grad_norm": 0.840969755169091,
"learning_rate": 1.5761422927096268e-05,
"loss": 1.3243,
"step": 949
},
{
"epoch": 0.37447647203744766,
"grad_norm": 0.6987504047644173,
"learning_rate": 1.5750165153242048e-05,
"loss": 1.28,
"step": 950
},
{
"epoch": 0.3748706577974871,
"grad_norm": 0.6995543811490563,
"learning_rate": 1.5738896481339857e-05,
"loss": 1.2808,
"step": 951
},
{
"epoch": 0.3752648435575265,
"grad_norm": 0.7027815016727716,
"learning_rate": 1.5727616932746748e-05,
"loss": 1.348,
"step": 952
},
{
"epoch": 0.3756590293175659,
"grad_norm": 0.7080676371673893,
"learning_rate": 1.5716326528840374e-05,
"loss": 1.2808,
"step": 953
},
{
"epoch": 0.3760532150776053,
"grad_norm": 0.6906991486703912,
"learning_rate": 1.570502529101896e-05,
"loss": 1.2822,
"step": 954
},
{
"epoch": 0.3764474008376447,
"grad_norm": 0.667842860069977,
"learning_rate": 1.569371324070128e-05,
"loss": 1.3153,
"step": 955
},
{
"epoch": 0.3768415865976842,
"grad_norm": 0.6680351163338653,
"learning_rate": 1.5682390399326585e-05,
"loss": 1.2659,
"step": 956
},
{
"epoch": 0.3772357723577236,
"grad_norm": 0.6839204182409985,
"learning_rate": 1.5671056788354583e-05,
"loss": 1.2726,
"step": 957
},
{
"epoch": 0.377629958117763,
"grad_norm": 0.6663129665848542,
"learning_rate": 1.5659712429265403e-05,
"loss": 1.2778,
"step": 958
},
{
"epoch": 0.3780241438778024,
"grad_norm": 0.693810071056339,
"learning_rate": 1.5648357343559518e-05,
"loss": 1.313,
"step": 959
},
{
"epoch": 0.37841832963784183,
"grad_norm": 0.7242639411060869,
"learning_rate": 1.5636991552757762e-05,
"loss": 1.229,
"step": 960
},
{
"epoch": 0.37881251539788124,
"grad_norm": 0.6902168937478176,
"learning_rate": 1.5625615078401244e-05,
"loss": 1.2342,
"step": 961
},
{
"epoch": 0.37920670115792066,
"grad_norm": 0.6978251892798721,
"learning_rate": 1.561422794205131e-05,
"loss": 1.3456,
"step": 962
},
{
"epoch": 0.37960088691796007,
"grad_norm": 0.710891024016947,
"learning_rate": 1.5602830165289536e-05,
"loss": 1.2539,
"step": 963
},
{
"epoch": 0.37999507267799953,
"grad_norm": 0.6933794057288072,
"learning_rate": 1.5591421769717642e-05,
"loss": 1.2406,
"step": 964
},
{
"epoch": 0.38038925843803895,
"grad_norm": 0.6512417427643563,
"learning_rate": 1.5580002776957493e-05,
"loss": 1.2212,
"step": 965
},
{
"epoch": 0.38078344419807836,
"grad_norm": 0.6798711415370834,
"learning_rate": 1.5568573208651027e-05,
"loss": 1.2299,
"step": 966
},
{
"epoch": 0.38117762995811777,
"grad_norm": 0.7169966010210781,
"learning_rate": 1.555713308646022e-05,
"loss": 1.2823,
"step": 967
},
{
"epoch": 0.3815718157181572,
"grad_norm": 0.7176225879361188,
"learning_rate": 1.5545682432067068e-05,
"loss": 1.3277,
"step": 968
},
{
"epoch": 0.3819660014781966,
"grad_norm": 0.6634455855323579,
"learning_rate": 1.5534221267173513e-05,
"loss": 1.2707,
"step": 969
},
{
"epoch": 0.382360187238236,
"grad_norm": 0.6523220060774133,
"learning_rate": 1.5522749613501424e-05,
"loss": 1.2224,
"step": 970
},
{
"epoch": 0.3827543729982754,
"grad_norm": 0.697086935286512,
"learning_rate": 1.551126749279255e-05,
"loss": 1.2247,
"step": 971
},
{
"epoch": 0.3831485587583149,
"grad_norm": 0.6605814150970358,
"learning_rate": 1.5499774926808468e-05,
"loss": 1.2624,
"step": 972
},
{
"epoch": 0.3835427445183543,
"grad_norm": 0.7011947342499778,
"learning_rate": 1.5488271937330562e-05,
"loss": 1.2972,
"step": 973
},
{
"epoch": 0.3839369302783937,
"grad_norm": 0.693697524489148,
"learning_rate": 1.5476758546159966e-05,
"loss": 1.2054,
"step": 974
},
{
"epoch": 0.3843311160384331,
"grad_norm": 0.6700050469107739,
"learning_rate": 1.5465234775117538e-05,
"loss": 1.2642,
"step": 975
},
{
"epoch": 0.38472530179847253,
"grad_norm": 0.6977970028023794,
"learning_rate": 1.5453700646043793e-05,
"loss": 1.2929,
"step": 976
},
{
"epoch": 0.38511948755851194,
"grad_norm": 0.7256704791236026,
"learning_rate": 1.5442156180798883e-05,
"loss": 1.2111,
"step": 977
},
{
"epoch": 0.38551367331855135,
"grad_norm": 0.6833079658478705,
"learning_rate": 1.5430601401262554e-05,
"loss": 1.3011,
"step": 978
},
{
"epoch": 0.38590785907859076,
"grad_norm": 0.6451358367434681,
"learning_rate": 1.54190363293341e-05,
"loss": 1.1995,
"step": 979
},
{
"epoch": 0.38630204483863023,
"grad_norm": 0.6686548263294536,
"learning_rate": 1.540746098693231e-05,
"loss": 1.2538,
"step": 980
},
{
"epoch": 0.38669623059866964,
"grad_norm": 0.6858165127408108,
"learning_rate": 1.5395875395995456e-05,
"loss": 1.3015,
"step": 981
},
{
"epoch": 0.38709041635870906,
"grad_norm": 0.6603138490124963,
"learning_rate": 1.5384279578481223e-05,
"loss": 1.2443,
"step": 982
},
{
"epoch": 0.38748460211874847,
"grad_norm": 0.6594884559786018,
"learning_rate": 1.537267355636668e-05,
"loss": 1.2314,
"step": 983
},
{
"epoch": 0.3878787878787879,
"grad_norm": 0.6918016955048513,
"learning_rate": 1.536105735164823e-05,
"loss": 1.2714,
"step": 984
},
{
"epoch": 0.3882729736388273,
"grad_norm": 0.75800219367767,
"learning_rate": 1.5349430986341588e-05,
"loss": 1.2889,
"step": 985
},
{
"epoch": 0.3886671593988667,
"grad_norm": 0.7150281786878397,
"learning_rate": 1.5337794482481714e-05,
"loss": 1.2301,
"step": 986
},
{
"epoch": 0.3890613451589061,
"grad_norm": 0.6864306072281939,
"learning_rate": 1.5326147862122796e-05,
"loss": 1.2146,
"step": 987
},
{
"epoch": 0.3894555309189456,
"grad_norm": 0.7281857146660934,
"learning_rate": 1.531449114733818e-05,
"loss": 1.2998,
"step": 988
},
{
"epoch": 0.389849716678985,
"grad_norm": 0.7064026433919306,
"learning_rate": 1.5302824360220352e-05,
"loss": 1.213,
"step": 989
},
{
"epoch": 0.3902439024390244,
"grad_norm": 0.678827373077648,
"learning_rate": 1.5291147522880887e-05,
"loss": 1.2899,
"step": 990
},
{
"epoch": 0.3906380881990638,
"grad_norm": 0.6825912010344036,
"learning_rate": 1.5279460657450408e-05,
"loss": 1.2508,
"step": 991
},
{
"epoch": 0.3910322739591032,
"grad_norm": 0.6897275293582734,
"learning_rate": 1.5267763786078544e-05,
"loss": 1.3103,
"step": 992
},
{
"epoch": 0.39142645971914264,
"grad_norm": 0.6889677484856918,
"learning_rate": 1.5256056930933884e-05,
"loss": 1.2385,
"step": 993
},
{
"epoch": 0.39182064547918205,
"grad_norm": 0.6756715938128258,
"learning_rate": 1.5244340114203946e-05,
"loss": 1.2811,
"step": 994
},
{
"epoch": 0.39221483123922146,
"grad_norm": 0.6588263063642222,
"learning_rate": 1.5232613358095121e-05,
"loss": 1.2008,
"step": 995
},
{
"epoch": 0.39260901699926093,
"grad_norm": 0.6649629443766613,
"learning_rate": 1.522087668483264e-05,
"loss": 1.2887,
"step": 996
},
{
"epoch": 0.39300320275930034,
"grad_norm": 0.69537586560042,
"learning_rate": 1.5209130116660532e-05,
"loss": 1.2318,
"step": 997
},
{
"epoch": 0.39339738851933975,
"grad_norm": 0.6548532801163026,
"learning_rate": 1.5197373675841572e-05,
"loss": 1.2321,
"step": 998
},
{
"epoch": 0.39379157427937916,
"grad_norm": 0.6789611198366031,
"learning_rate": 1.5185607384657257e-05,
"loss": 1.2501,
"step": 999
},
{
"epoch": 0.3941857600394186,
"grad_norm": 0.669469647081716,
"learning_rate": 1.5173831265407749e-05,
"loss": 1.2316,
"step": 1000
},
{
"epoch": 0.394579945799458,
"grad_norm": 0.6441524856006325,
"learning_rate": 1.5162045340411826e-05,
"loss": 1.2215,
"step": 1001
},
{
"epoch": 0.3949741315594974,
"grad_norm": 0.6585151163796467,
"learning_rate": 1.5150249632006871e-05,
"loss": 1.2364,
"step": 1002
},
{
"epoch": 0.3953683173195368,
"grad_norm": 0.6590764235984096,
"learning_rate": 1.5138444162548791e-05,
"loss": 1.2507,
"step": 1003
},
{
"epoch": 0.3957625030795763,
"grad_norm": 0.6746142261487992,
"learning_rate": 1.5126628954412002e-05,
"loss": 1.3095,
"step": 1004
},
{
"epoch": 0.3961566888396157,
"grad_norm": 0.6425820917957424,
"learning_rate": 1.5114804029989372e-05,
"loss": 1.2455,
"step": 1005
},
{
"epoch": 0.3965508745996551,
"grad_norm": 0.6885768302093563,
"learning_rate": 1.5102969411692186e-05,
"loss": 1.2067,
"step": 1006
},
{
"epoch": 0.3969450603596945,
"grad_norm": 0.6715538405865114,
"learning_rate": 1.5091125121950105e-05,
"loss": 1.2723,
"step": 1007
},
{
"epoch": 0.3973392461197339,
"grad_norm": 0.6572204758977973,
"learning_rate": 1.5079271183211118e-05,
"loss": 1.2676,
"step": 1008
},
{
"epoch": 0.39773343187977334,
"grad_norm": 0.6913182919431603,
"learning_rate": 1.5067407617941499e-05,
"loss": 1.2723,
"step": 1009
},
{
"epoch": 0.39812761763981275,
"grad_norm": 0.6859364323759741,
"learning_rate": 1.5055534448625766e-05,
"loss": 1.2672,
"step": 1010
},
{
"epoch": 0.39852180339985216,
"grad_norm": 0.6924966624789022,
"learning_rate": 1.5043651697766642e-05,
"loss": 1.2032,
"step": 1011
},
{
"epoch": 0.3989159891598916,
"grad_norm": 0.696108235634334,
"learning_rate": 1.5031759387885008e-05,
"loss": 1.2286,
"step": 1012
},
{
"epoch": 0.39931017491993104,
"grad_norm": 0.683816830333667,
"learning_rate": 1.5019857541519866e-05,
"loss": 1.2596,
"step": 1013
},
{
"epoch": 0.39970436067997045,
"grad_norm": 0.6544409734476196,
"learning_rate": 1.5007946181228286e-05,
"loss": 1.1861,
"step": 1014
},
{
"epoch": 0.40009854644000986,
"grad_norm": 0.6828313055454289,
"learning_rate": 1.4996025329585368e-05,
"loss": 1.2627,
"step": 1015
},
{
"epoch": 0.4004927322000493,
"grad_norm": 0.7238896612698483,
"learning_rate": 1.4984095009184215e-05,
"loss": 1.2237,
"step": 1016
},
{
"epoch": 0.4008869179600887,
"grad_norm": 0.7255960311346755,
"learning_rate": 1.4972155242635853e-05,
"loss": 1.2553,
"step": 1017
},
{
"epoch": 0.4012811037201281,
"grad_norm": 0.6462351578732584,
"learning_rate": 1.496020605256923e-05,
"loss": 1.1924,
"step": 1018
},
{
"epoch": 0.4016752894801675,
"grad_norm": 0.6627446653808322,
"learning_rate": 1.4948247461631148e-05,
"loss": 1.237,
"step": 1019
},
{
"epoch": 0.402069475240207,
"grad_norm": 0.6825306611455508,
"learning_rate": 1.4936279492486222e-05,
"loss": 1.2397,
"step": 1020
},
{
"epoch": 0.4024636610002464,
"grad_norm": 0.7150438816039062,
"learning_rate": 1.4924302167816845e-05,
"loss": 1.2152,
"step": 1021
},
{
"epoch": 0.4028578467602858,
"grad_norm": 0.7093178992414255,
"learning_rate": 1.4912315510323138e-05,
"loss": 1.2576,
"step": 1022
},
{
"epoch": 0.4032520325203252,
"grad_norm": 0.6985543458898392,
"learning_rate": 1.4900319542722921e-05,
"loss": 1.2673,
"step": 1023
},
{
"epoch": 0.4036462182803646,
"grad_norm": 0.6831019226556653,
"learning_rate": 1.488831428775164e-05,
"loss": 1.2049,
"step": 1024
},
{
"epoch": 0.40404040404040403,
"grad_norm": 0.6567400662964415,
"learning_rate": 1.4876299768162361e-05,
"loss": 1.1799,
"step": 1025
},
{
"epoch": 0.40443458980044344,
"grad_norm": 0.6954618711419809,
"learning_rate": 1.48642760067257e-05,
"loss": 1.329,
"step": 1026
},
{
"epoch": 0.40482877556048286,
"grad_norm": 0.7107685604813471,
"learning_rate": 1.4852243026229787e-05,
"loss": 1.2487,
"step": 1027
},
{
"epoch": 0.4052229613205223,
"grad_norm": 0.674580720557361,
"learning_rate": 1.4840200849480226e-05,
"loss": 1.2157,
"step": 1028
},
{
"epoch": 0.40561714708056174,
"grad_norm": 0.6638304289674144,
"learning_rate": 1.4828149499300061e-05,
"loss": 1.314,
"step": 1029
},
{
"epoch": 0.40601133284060115,
"grad_norm": 0.6757193376832867,
"learning_rate": 1.4816088998529707e-05,
"loss": 1.1997,
"step": 1030
},
{
"epoch": 0.40640551860064056,
"grad_norm": 0.7111016241633684,
"learning_rate": 1.4804019370026927e-05,
"loss": 1.2307,
"step": 1031
},
{
"epoch": 0.40679970436067997,
"grad_norm": 0.6336887603576372,
"learning_rate": 1.4791940636666785e-05,
"loss": 1.2429,
"step": 1032
},
{
"epoch": 0.4071938901207194,
"grad_norm": 0.7121301295945476,
"learning_rate": 1.47798528213416e-05,
"loss": 1.2347,
"step": 1033
},
{
"epoch": 0.4075880758807588,
"grad_norm": 0.6798719496665275,
"learning_rate": 1.4767755946960902e-05,
"loss": 1.214,
"step": 1034
},
{
"epoch": 0.4079822616407982,
"grad_norm": 0.672163959841733,
"learning_rate": 1.4755650036451397e-05,
"loss": 1.2129,
"step": 1035
},
{
"epoch": 0.40837644740083767,
"grad_norm": 0.6580322284929199,
"learning_rate": 1.474353511275691e-05,
"loss": 1.233,
"step": 1036
},
{
"epoch": 0.4087706331608771,
"grad_norm": 0.8559124631644651,
"learning_rate": 1.4731411198838346e-05,
"loss": 1.3092,
"step": 1037
},
{
"epoch": 0.4091648189209165,
"grad_norm": 0.6612192406553391,
"learning_rate": 1.4719278317673655e-05,
"loss": 1.255,
"step": 1038
},
{
"epoch": 0.4095590046809559,
"grad_norm": 0.6480565858040689,
"learning_rate": 1.4707136492257783e-05,
"loss": 1.1938,
"step": 1039
},
{
"epoch": 0.4099531904409953,
"grad_norm": 0.6847017126697683,
"learning_rate": 1.4694985745602623e-05,
"loss": 1.2823,
"step": 1040
},
{
"epoch": 0.41034737620103473,
"grad_norm": 0.6625824656368514,
"learning_rate": 1.4682826100736973e-05,
"loss": 1.2196,
"step": 1041
},
{
"epoch": 0.41074156196107414,
"grad_norm": 0.6520046231301477,
"learning_rate": 1.4670657580706511e-05,
"loss": 1.2129,
"step": 1042
},
{
"epoch": 0.41113574772111355,
"grad_norm": 0.6568163192077175,
"learning_rate": 1.4658480208573717e-05,
"loss": 1.205,
"step": 1043
},
{
"epoch": 0.411529933481153,
"grad_norm": 0.7355354070775183,
"learning_rate": 1.4646294007417858e-05,
"loss": 1.2509,
"step": 1044
},
{
"epoch": 0.41192411924119243,
"grad_norm": 0.6584335682341751,
"learning_rate": 1.4634099000334932e-05,
"loss": 1.2131,
"step": 1045
},
{
"epoch": 0.41231830500123184,
"grad_norm": 0.6787385568676211,
"learning_rate": 1.4621895210437627e-05,
"loss": 1.2844,
"step": 1046
},
{
"epoch": 0.41271249076127126,
"grad_norm": 0.6534106417043676,
"learning_rate": 1.4609682660855277e-05,
"loss": 1.2036,
"step": 1047
},
{
"epoch": 0.41310667652131067,
"grad_norm": 0.6670476383359956,
"learning_rate": 1.4597461374733817e-05,
"loss": 1.2027,
"step": 1048
},
{
"epoch": 0.4135008622813501,
"grad_norm": 0.6869267202912966,
"learning_rate": 1.458523137523574e-05,
"loss": 1.2417,
"step": 1049
},
{
"epoch": 0.4138950480413895,
"grad_norm": 0.6825156046026267,
"learning_rate": 1.4572992685540057e-05,
"loss": 1.2732,
"step": 1050
},
{
"epoch": 0.4142892338014289,
"grad_norm": 0.6393859537214149,
"learning_rate": 1.4560745328842238e-05,
"loss": 1.2022,
"step": 1051
},
{
"epoch": 0.41468341956146837,
"grad_norm": 0.6783345452247255,
"learning_rate": 1.4548489328354197e-05,
"loss": 1.2039,
"step": 1052
},
{
"epoch": 0.4150776053215078,
"grad_norm": 0.6856742550565621,
"learning_rate": 1.4536224707304209e-05,
"loss": 1.2333,
"step": 1053
},
{
"epoch": 0.4154717910815472,
"grad_norm": 0.6797781228333333,
"learning_rate": 1.4523951488936905e-05,
"loss": 1.2458,
"step": 1054
},
{
"epoch": 0.4158659768415866,
"grad_norm": 0.6687542124726085,
"learning_rate": 1.4511669696513206e-05,
"loss": 1.2859,
"step": 1055
},
{
"epoch": 0.416260162601626,
"grad_norm": 0.654994598290333,
"learning_rate": 1.4499379353310275e-05,
"loss": 1.2514,
"step": 1056
},
{
"epoch": 0.4166543483616654,
"grad_norm": 0.6710277195302214,
"learning_rate": 1.4487080482621485e-05,
"loss": 1.1726,
"step": 1057
},
{
"epoch": 0.41704853412170484,
"grad_norm": 0.6975157864795727,
"learning_rate": 1.4474773107756379e-05,
"loss": 1.3039,
"step": 1058
},
{
"epoch": 0.41744271988174425,
"grad_norm": 0.6847631484475221,
"learning_rate": 1.4462457252040606e-05,
"loss": 1.2934,
"step": 1059
},
{
"epoch": 0.4178369056417837,
"grad_norm": 0.6569155149197007,
"learning_rate": 1.4450132938815896e-05,
"loss": 1.2399,
"step": 1060
},
{
"epoch": 0.41823109140182313,
"grad_norm": 0.6551116832105975,
"learning_rate": 1.443780019144e-05,
"loss": 1.2549,
"step": 1061
},
{
"epoch": 0.41862527716186254,
"grad_norm": 0.6908963315449874,
"learning_rate": 1.4425459033286664e-05,
"loss": 1.2723,
"step": 1062
},
{
"epoch": 0.41901946292190195,
"grad_norm": 0.669999734161243,
"learning_rate": 1.4413109487745571e-05,
"loss": 1.2034,
"step": 1063
},
{
"epoch": 0.41941364868194136,
"grad_norm": 0.6569047790921405,
"learning_rate": 1.4400751578222293e-05,
"loss": 1.2124,
"step": 1064
},
{
"epoch": 0.4198078344419808,
"grad_norm": 0.6641788447379324,
"learning_rate": 1.438838532813827e-05,
"loss": 1.2311,
"step": 1065
},
{
"epoch": 0.4202020202020202,
"grad_norm": 0.6421382945573415,
"learning_rate": 1.437601076093073e-05,
"loss": 1.2624,
"step": 1066
},
{
"epoch": 0.4205962059620596,
"grad_norm": 0.6987072260804941,
"learning_rate": 1.4363627900052676e-05,
"loss": 1.2533,
"step": 1067
},
{
"epoch": 0.42099039172209907,
"grad_norm": 0.7038543852283208,
"learning_rate": 1.435123676897283e-05,
"loss": 1.2362,
"step": 1068
},
{
"epoch": 0.4213845774821385,
"grad_norm": 0.6582422377441999,
"learning_rate": 1.4338837391175582e-05,
"loss": 1.2929,
"step": 1069
},
{
"epoch": 0.4217787632421779,
"grad_norm": 0.6549666509553242,
"learning_rate": 1.4326429790160958e-05,
"loss": 1.2912,
"step": 1070
},
{
"epoch": 0.4221729490022173,
"grad_norm": 0.6609389567208854,
"learning_rate": 1.4314013989444566e-05,
"loss": 1.2242,
"step": 1071
},
{
"epoch": 0.4225671347622567,
"grad_norm": 0.6742945321694513,
"learning_rate": 1.4301590012557553e-05,
"loss": 1.2606,
"step": 1072
},
{
"epoch": 0.4229613205222961,
"grad_norm": 0.6841196388200714,
"learning_rate": 1.4289157883046567e-05,
"loss": 1.1914,
"step": 1073
},
{
"epoch": 0.42335550628233554,
"grad_norm": 0.6781835047036432,
"learning_rate": 1.4276717624473697e-05,
"loss": 1.2149,
"step": 1074
},
{
"epoch": 0.42374969204237495,
"grad_norm": 0.6384771187611207,
"learning_rate": 1.4264269260416455e-05,
"loss": 1.194,
"step": 1075
},
{
"epoch": 0.4241438778024144,
"grad_norm": 0.6392205051998697,
"learning_rate": 1.4251812814467701e-05,
"loss": 1.2314,
"step": 1076
},
{
"epoch": 0.4245380635624538,
"grad_norm": 0.6789060040382907,
"learning_rate": 1.4239348310235613e-05,
"loss": 1.2207,
"step": 1077
},
{
"epoch": 0.42493224932249324,
"grad_norm": 0.6479589435408246,
"learning_rate": 1.4226875771343656e-05,
"loss": 1.2104,
"step": 1078
},
{
"epoch": 0.42532643508253265,
"grad_norm": 0.6575432784037729,
"learning_rate": 1.4214395221430501e-05,
"loss": 1.2749,
"step": 1079
},
{
"epoch": 0.42572062084257206,
"grad_norm": 0.701850378214208,
"learning_rate": 1.420190668415002e-05,
"loss": 1.2202,
"step": 1080
},
{
"epoch": 0.4261148066026115,
"grad_norm": 2.0536053216353896,
"learning_rate": 1.4189410183171214e-05,
"loss": 1.1963,
"step": 1081
},
{
"epoch": 0.4265089923626509,
"grad_norm": 0.6609999350419868,
"learning_rate": 1.417690574217818e-05,
"loss": 1.2504,
"step": 1082
},
{
"epoch": 0.4269031781226903,
"grad_norm": 0.6612267333571307,
"learning_rate": 1.4164393384870065e-05,
"loss": 1.2665,
"step": 1083
},
{
"epoch": 0.42729736388272976,
"grad_norm": 0.6757638887255789,
"learning_rate": 1.4151873134961014e-05,
"loss": 1.1514,
"step": 1084
},
{
"epoch": 0.4276915496427692,
"grad_norm": 0.683456163531099,
"learning_rate": 1.4139345016180135e-05,
"loss": 1.3079,
"step": 1085
},
{
"epoch": 0.4280857354028086,
"grad_norm": 0.8513875836873347,
"learning_rate": 1.4126809052271453e-05,
"loss": 1.2724,
"step": 1086
},
{
"epoch": 0.428479921162848,
"grad_norm": 0.6442638283664752,
"learning_rate": 1.4114265266993847e-05,
"loss": 1.2173,
"step": 1087
},
{
"epoch": 0.4288741069228874,
"grad_norm": 0.6509895157275494,
"learning_rate": 1.4101713684121042e-05,
"loss": 1.2479,
"step": 1088
},
{
"epoch": 0.4292682926829268,
"grad_norm": 0.6474693228576278,
"learning_rate": 1.408915432744152e-05,
"loss": 1.2125,
"step": 1089
},
{
"epoch": 0.42966247844296623,
"grad_norm": 0.6735783131189829,
"learning_rate": 1.407658722075851e-05,
"loss": 1.2068,
"step": 1090
},
{
"epoch": 0.43005666420300565,
"grad_norm": 0.6537663595057571,
"learning_rate": 1.406401238788992e-05,
"loss": 1.2156,
"step": 1091
},
{
"epoch": 0.4304508499630451,
"grad_norm": 0.6544657627047221,
"learning_rate": 1.4051429852668312e-05,
"loss": 1.2576,
"step": 1092
},
{
"epoch": 0.4308450357230845,
"grad_norm": 0.6301328044253675,
"learning_rate": 1.4038839638940835e-05,
"loss": 1.1426,
"step": 1093
},
{
"epoch": 0.43123922148312394,
"grad_norm": 0.6847962737010194,
"learning_rate": 1.4026241770569198e-05,
"loss": 1.1885,
"step": 1094
},
{
"epoch": 0.43163340724316335,
"grad_norm": 0.6471962172332811,
"learning_rate": 1.4013636271429612e-05,
"loss": 1.2111,
"step": 1095
},
{
"epoch": 0.43202759300320276,
"grad_norm": 0.6655421827524571,
"learning_rate": 1.4001023165412754e-05,
"loss": 1.2754,
"step": 1096
},
{
"epoch": 0.43242177876324217,
"grad_norm": 0.6748073371066969,
"learning_rate": 1.3988402476423722e-05,
"loss": 1.254,
"step": 1097
},
{
"epoch": 0.4328159645232816,
"grad_norm": 0.6557610559912413,
"learning_rate": 1.3975774228381975e-05,
"loss": 1.2439,
"step": 1098
},
{
"epoch": 0.433210150283321,
"grad_norm": 0.6632658788983514,
"learning_rate": 1.3963138445221311e-05,
"loss": 1.2516,
"step": 1099
},
{
"epoch": 0.43360433604336046,
"grad_norm": 0.6491486867598589,
"learning_rate": 1.3950495150889793e-05,
"loss": 1.2335,
"step": 1100
},
{
"epoch": 0.4339985218033999,
"grad_norm": 0.6517729673881756,
"learning_rate": 1.3937844369349736e-05,
"loss": 1.2167,
"step": 1101
},
{
"epoch": 0.4343927075634393,
"grad_norm": 0.6782382384926667,
"learning_rate": 1.3925186124577639e-05,
"loss": 1.2425,
"step": 1102
},
{
"epoch": 0.4347868933234787,
"grad_norm": 0.6591309286023143,
"learning_rate": 1.3912520440564139e-05,
"loss": 1.2043,
"step": 1103
},
{
"epoch": 0.4351810790835181,
"grad_norm": 0.6546464680178252,
"learning_rate": 1.3899847341313982e-05,
"loss": 1.1904,
"step": 1104
},
{
"epoch": 0.4355752648435575,
"grad_norm": 0.6446542186074286,
"learning_rate": 1.3887166850845963e-05,
"loss": 1.1976,
"step": 1105
},
{
"epoch": 0.43596945060359693,
"grad_norm": 0.6591279097552126,
"learning_rate": 1.3874478993192886e-05,
"loss": 1.2711,
"step": 1106
},
{
"epoch": 0.43636363636363634,
"grad_norm": 0.6877144132235246,
"learning_rate": 1.386178379240152e-05,
"loss": 1.2061,
"step": 1107
},
{
"epoch": 0.4367578221236758,
"grad_norm": 0.6207199280492006,
"learning_rate": 1.3849081272532545e-05,
"loss": 1.1999,
"step": 1108
},
{
"epoch": 0.4371520078837152,
"grad_norm": 0.6863520493826831,
"learning_rate": 1.383637145766052e-05,
"loss": 1.2781,
"step": 1109
},
{
"epoch": 0.43754619364375463,
"grad_norm": 0.6329597392455102,
"learning_rate": 1.3823654371873827e-05,
"loss": 1.2203,
"step": 1110
},
{
"epoch": 0.43794037940379404,
"grad_norm": 0.6453430853174527,
"learning_rate": 1.3810930039274626e-05,
"loss": 1.2341,
"step": 1111
},
{
"epoch": 0.43833456516383346,
"grad_norm": 0.7008614015575915,
"learning_rate": 1.3798198483978816e-05,
"loss": 1.3045,
"step": 1112
},
{
"epoch": 0.43872875092387287,
"grad_norm": 0.6526995169723234,
"learning_rate": 1.3785459730115975e-05,
"loss": 1.2444,
"step": 1113
},
{
"epoch": 0.4391229366839123,
"grad_norm": 0.6648665882412224,
"learning_rate": 1.3772713801829338e-05,
"loss": 1.2346,
"step": 1114
},
{
"epoch": 0.4395171224439517,
"grad_norm": 0.6521080562166568,
"learning_rate": 1.375996072327573e-05,
"loss": 1.2473,
"step": 1115
},
{
"epoch": 0.43991130820399116,
"grad_norm": 0.6354275169637564,
"learning_rate": 1.374720051862553e-05,
"loss": 1.2316,
"step": 1116
},
{
"epoch": 0.44030549396403057,
"grad_norm": 0.6614840460671958,
"learning_rate": 1.3734433212062617e-05,
"loss": 1.2004,
"step": 1117
},
{
"epoch": 0.44069967972407,
"grad_norm": 0.6662537159779596,
"learning_rate": 1.3721658827784335e-05,
"loss": 1.2901,
"step": 1118
},
{
"epoch": 0.4410938654841094,
"grad_norm": 0.6687056517988047,
"learning_rate": 1.3708877390001442e-05,
"loss": 1.2539,
"step": 1119
},
{
"epoch": 0.4414880512441488,
"grad_norm": 0.6733214755511964,
"learning_rate": 1.3696088922938065e-05,
"loss": 1.2515,
"step": 1120
},
{
"epoch": 0.4418822370041882,
"grad_norm": 0.6535655596127494,
"learning_rate": 1.3683293450831649e-05,
"loss": 1.2305,
"step": 1121
},
{
"epoch": 0.44227642276422763,
"grad_norm": 0.710139241305188,
"learning_rate": 1.3670490997932922e-05,
"loss": 1.3349,
"step": 1122
},
{
"epoch": 0.44267060852426704,
"grad_norm": 0.6301043045063337,
"learning_rate": 1.3657681588505835e-05,
"loss": 1.1704,
"step": 1123
},
{
"epoch": 0.4430647942843065,
"grad_norm": 0.6659655009342225,
"learning_rate": 1.3644865246827528e-05,
"loss": 1.2175,
"step": 1124
},
{
"epoch": 0.4434589800443459,
"grad_norm": 0.6562665211091786,
"learning_rate": 1.3632041997188278e-05,
"loss": 1.298,
"step": 1125
},
{
"epoch": 0.44385316580438533,
"grad_norm": 0.6649159181775033,
"learning_rate": 1.3619211863891458e-05,
"loss": 1.2194,
"step": 1126
},
{
"epoch": 0.44424735156442474,
"grad_norm": 0.6563076400799585,
"learning_rate": 1.3606374871253474e-05,
"loss": 1.2257,
"step": 1127
},
{
"epoch": 0.44464153732446415,
"grad_norm": 0.6289604646597672,
"learning_rate": 1.3593531043603756e-05,
"loss": 1.2144,
"step": 1128
},
{
"epoch": 0.44503572308450356,
"grad_norm": 1.1206270057176397,
"learning_rate": 1.3580680405284666e-05,
"loss": 1.1742,
"step": 1129
},
{
"epoch": 0.445429908844543,
"grad_norm": 0.7010573881465098,
"learning_rate": 1.3567822980651481e-05,
"loss": 1.2557,
"step": 1130
},
{
"epoch": 0.4458240946045824,
"grad_norm": 0.6819687881969332,
"learning_rate": 1.3554958794072346e-05,
"loss": 1.2628,
"step": 1131
},
{
"epoch": 0.44621828036462186,
"grad_norm": 0.6631424239254387,
"learning_rate": 1.3542087869928215e-05,
"loss": 1.2664,
"step": 1132
},
{
"epoch": 0.44661246612466127,
"grad_norm": 0.6884792830902806,
"learning_rate": 1.3529210232612815e-05,
"loss": 1.2151,
"step": 1133
},
{
"epoch": 0.4470066518847007,
"grad_norm": 0.6743020797905825,
"learning_rate": 1.3516325906532592e-05,
"loss": 1.2173,
"step": 1134
},
{
"epoch": 0.4474008376447401,
"grad_norm": 0.6748726425122616,
"learning_rate": 1.350343491610667e-05,
"loss": 1.2951,
"step": 1135
},
{
"epoch": 0.4477950234047795,
"grad_norm": 0.6790188323448472,
"learning_rate": 1.3490537285766809e-05,
"loss": 1.2548,
"step": 1136
},
{
"epoch": 0.4481892091648189,
"grad_norm": 0.7188066208980596,
"learning_rate": 1.3477633039957346e-05,
"loss": 1.3093,
"step": 1137
},
{
"epoch": 0.4485833949248583,
"grad_norm": 0.6778429503766523,
"learning_rate": 1.3464722203135164e-05,
"loss": 1.253,
"step": 1138
},
{
"epoch": 0.44897758068489774,
"grad_norm": 0.6610758959536769,
"learning_rate": 1.3451804799769625e-05,
"loss": 1.1997,
"step": 1139
},
{
"epoch": 0.4493717664449372,
"grad_norm": 0.6661694419731813,
"learning_rate": 1.3438880854342552e-05,
"loss": 1.2346,
"step": 1140
},
{
"epoch": 0.4497659522049766,
"grad_norm": 0.6668706103840563,
"learning_rate": 1.3425950391348154e-05,
"loss": 1.2652,
"step": 1141
},
{
"epoch": 0.450160137965016,
"grad_norm": 0.653413813618824,
"learning_rate": 1.3413013435293004e-05,
"loss": 1.1574,
"step": 1142
},
{
"epoch": 0.45055432372505544,
"grad_norm": 0.6626392658566362,
"learning_rate": 1.3400070010695966e-05,
"loss": 1.2584,
"step": 1143
},
{
"epoch": 0.45094850948509485,
"grad_norm": 0.6612645982158664,
"learning_rate": 1.3387120142088182e-05,
"loss": 1.3095,
"step": 1144
},
{
"epoch": 0.45134269524513426,
"grad_norm": 0.6343193781713191,
"learning_rate": 1.3374163854012987e-05,
"loss": 1.1738,
"step": 1145
},
{
"epoch": 0.4517368810051737,
"grad_norm": 0.6914178485118841,
"learning_rate": 1.33612011710259e-05,
"loss": 1.2289,
"step": 1146
},
{
"epoch": 0.4521310667652131,
"grad_norm": 0.6349842783208113,
"learning_rate": 1.3348232117694555e-05,
"loss": 1.1942,
"step": 1147
},
{
"epoch": 0.45252525252525255,
"grad_norm": 0.6878005677404854,
"learning_rate": 1.333525671859865e-05,
"loss": 1.2197,
"step": 1148
},
{
"epoch": 0.45291943828529196,
"grad_norm": 0.708515154245003,
"learning_rate": 1.3322274998329925e-05,
"loss": 1.217,
"step": 1149
},
{
"epoch": 0.4533136240453314,
"grad_norm": 0.6654307895746174,
"learning_rate": 1.3309286981492084e-05,
"loss": 1.2182,
"step": 1150
},
{
"epoch": 0.4537078098053708,
"grad_norm": 0.6849958565571799,
"learning_rate": 1.3296292692700781e-05,
"loss": 1.262,
"step": 1151
},
{
"epoch": 0.4541019955654102,
"grad_norm": 0.661458414456228,
"learning_rate": 1.3283292156583542e-05,
"loss": 1.2237,
"step": 1152
},
{
"epoch": 0.4544961813254496,
"grad_norm": 0.6445694725984406,
"learning_rate": 1.3270285397779743e-05,
"loss": 1.2046,
"step": 1153
},
{
"epoch": 0.454890367085489,
"grad_norm": 0.6880572438702209,
"learning_rate": 1.3257272440940559e-05,
"loss": 1.2517,
"step": 1154
},
{
"epoch": 0.45528455284552843,
"grad_norm": 0.6462853469948439,
"learning_rate": 1.324425331072889e-05,
"loss": 1.1937,
"step": 1155
},
{
"epoch": 0.4556787386055679,
"grad_norm": 0.6937504964864099,
"learning_rate": 1.3231228031819358e-05,
"loss": 1.2315,
"step": 1156
},
{
"epoch": 0.4560729243656073,
"grad_norm": 0.6935002768528703,
"learning_rate": 1.3218196628898232e-05,
"loss": 1.2941,
"step": 1157
},
{
"epoch": 0.4564671101256467,
"grad_norm": 0.6646155460144206,
"learning_rate": 1.320515912666338e-05,
"loss": 1.1961,
"step": 1158
},
{
"epoch": 0.45686129588568614,
"grad_norm": 0.675642433429094,
"learning_rate": 1.319211554982424e-05,
"loss": 1.1793,
"step": 1159
},
{
"epoch": 0.45725548164572555,
"grad_norm": 0.6626358544782226,
"learning_rate": 1.3179065923101759e-05,
"loss": 1.2279,
"step": 1160
},
{
"epoch": 0.45764966740576496,
"grad_norm": 0.6633366399850951,
"learning_rate": 1.3166010271228347e-05,
"loss": 1.2472,
"step": 1161
},
{
"epoch": 0.45804385316580437,
"grad_norm": 0.6572172161629819,
"learning_rate": 1.3152948618947839e-05,
"loss": 1.2959,
"step": 1162
},
{
"epoch": 0.4584380389258438,
"grad_norm": 0.6234010246471685,
"learning_rate": 1.3139880991015432e-05,
"loss": 1.1878,
"step": 1163
},
{
"epoch": 0.45883222468588325,
"grad_norm": 0.6445399860459299,
"learning_rate": 1.3126807412197666e-05,
"loss": 1.2468,
"step": 1164
},
{
"epoch": 0.45922641044592266,
"grad_norm": 0.6746604279800079,
"learning_rate": 1.3113727907272341e-05,
"loss": 1.2452,
"step": 1165
},
{
"epoch": 0.4596205962059621,
"grad_norm": 0.6634669603961608,
"learning_rate": 1.3100642501028502e-05,
"loss": 1.2124,
"step": 1166
},
{
"epoch": 0.4600147819660015,
"grad_norm": 0.6589031509633928,
"learning_rate": 1.3087551218266373e-05,
"loss": 1.2681,
"step": 1167
},
{
"epoch": 0.4604089677260409,
"grad_norm": 0.6488880528092997,
"learning_rate": 1.307445408379731e-05,
"loss": 1.2313,
"step": 1168
},
{
"epoch": 0.4608031534860803,
"grad_norm": 0.6461518831877928,
"learning_rate": 1.3061351122443774e-05,
"loss": 1.173,
"step": 1169
},
{
"epoch": 0.4611973392461197,
"grad_norm": 0.6719867860616543,
"learning_rate": 1.304824235903925e-05,
"loss": 1.2363,
"step": 1170
},
{
"epoch": 0.46159152500615913,
"grad_norm": 0.6720218506435118,
"learning_rate": 1.3035127818428239e-05,
"loss": 1.2999,
"step": 1171
},
{
"epoch": 0.4619857107661986,
"grad_norm": 0.6216405882359431,
"learning_rate": 1.302200752546618e-05,
"loss": 1.1873,
"step": 1172
},
{
"epoch": 0.462379896526238,
"grad_norm": 0.6615993873842473,
"learning_rate": 1.3008881505019413e-05,
"loss": 1.2329,
"step": 1173
},
{
"epoch": 0.4627740822862774,
"grad_norm": 0.6332451929136712,
"learning_rate": 1.2995749781965139e-05,
"loss": 1.1945,
"step": 1174
},
{
"epoch": 0.46316826804631683,
"grad_norm": 0.6600204388313866,
"learning_rate": 1.2982612381191368e-05,
"loss": 1.1736,
"step": 1175
},
{
"epoch": 0.46356245380635624,
"grad_norm": 0.6700748596784245,
"learning_rate": 1.296946932759686e-05,
"loss": 1.2847,
"step": 1176
},
{
"epoch": 0.46395663956639566,
"grad_norm": 0.6650184197669182,
"learning_rate": 1.2956320646091106e-05,
"loss": 1.2097,
"step": 1177
},
{
"epoch": 0.46435082532643507,
"grad_norm": 0.6626476795340289,
"learning_rate": 1.2943166361594242e-05,
"loss": 1.2041,
"step": 1178
},
{
"epoch": 0.4647450110864745,
"grad_norm": 0.6475300925870908,
"learning_rate": 1.293000649903704e-05,
"loss": 1.2847,
"step": 1179
},
{
"epoch": 0.46513919684651395,
"grad_norm": 0.6563755699385965,
"learning_rate": 1.2916841083360836e-05,
"loss": 1.2188,
"step": 1180
},
{
"epoch": 0.46553338260655336,
"grad_norm": 0.6558206126815487,
"learning_rate": 1.2903670139517495e-05,
"loss": 1.2171,
"step": 1181
},
{
"epoch": 0.46592756836659277,
"grad_norm": 0.6366861432284558,
"learning_rate": 1.2890493692469357e-05,
"loss": 1.2451,
"step": 1182
},
{
"epoch": 0.4663217541266322,
"grad_norm": 0.6759773243408979,
"learning_rate": 1.2877311767189192e-05,
"loss": 1.2673,
"step": 1183
},
{
"epoch": 0.4667159398866716,
"grad_norm": 0.6419744413255126,
"learning_rate": 1.2864124388660148e-05,
"loss": 1.1927,
"step": 1184
},
{
"epoch": 0.467110125646711,
"grad_norm": 0.6665800678685042,
"learning_rate": 1.2850931581875723e-05,
"loss": 1.241,
"step": 1185
},
{
"epoch": 0.4675043114067504,
"grad_norm": 0.647473022755396,
"learning_rate": 1.283773337183968e-05,
"loss": 1.2654,
"step": 1186
},
{
"epoch": 0.46789849716678983,
"grad_norm": 0.6627384520276431,
"learning_rate": 1.2824529783566044e-05,
"loss": 1.2103,
"step": 1187
},
{
"epoch": 0.4682926829268293,
"grad_norm": 0.6984420515522787,
"learning_rate": 1.2811320842079026e-05,
"loss": 1.2189,
"step": 1188
},
{
"epoch": 0.4686868686868687,
"grad_norm": 0.6838425822588616,
"learning_rate": 1.2798106572412973e-05,
"loss": 1.2817,
"step": 1189
},
{
"epoch": 0.4690810544469081,
"grad_norm": 0.6918032431384864,
"learning_rate": 1.278488699961235e-05,
"loss": 1.2529,
"step": 1190
},
{
"epoch": 0.46947524020694753,
"grad_norm": 0.6948726963202924,
"learning_rate": 1.2771662148731653e-05,
"loss": 1.2411,
"step": 1191
},
{
"epoch": 0.46986942596698694,
"grad_norm": 0.6429092095036071,
"learning_rate": 1.275843204483539e-05,
"loss": 1.2295,
"step": 1192
},
{
"epoch": 0.47026361172702635,
"grad_norm": 0.6351964026733381,
"learning_rate": 1.2745196712998032e-05,
"loss": 1.2073,
"step": 1193
},
{
"epoch": 0.47065779748706577,
"grad_norm": 0.6921674003382929,
"learning_rate": 1.2731956178303941e-05,
"loss": 1.2549,
"step": 1194
},
{
"epoch": 0.4710519832471052,
"grad_norm": 0.6322772440878668,
"learning_rate": 1.2718710465847355e-05,
"loss": 1.2263,
"step": 1195
},
{
"epoch": 0.47144616900714464,
"grad_norm": 0.6452486149856621,
"learning_rate": 1.2705459600732319e-05,
"loss": 1.2562,
"step": 1196
},
{
"epoch": 0.47184035476718406,
"grad_norm": 0.6629534381246308,
"learning_rate": 1.2692203608072646e-05,
"loss": 1.2418,
"step": 1197
},
{
"epoch": 0.47223454052722347,
"grad_norm": 0.6619087288650083,
"learning_rate": 1.2678942512991865e-05,
"loss": 1.1517,
"step": 1198
},
{
"epoch": 0.4726287262872629,
"grad_norm": 0.6639361742877278,
"learning_rate": 1.2665676340623172e-05,
"loss": 1.1919,
"step": 1199
},
{
"epoch": 0.4730229120473023,
"grad_norm": 0.6771450309425207,
"learning_rate": 1.2652405116109394e-05,
"loss": 1.2983,
"step": 1200
},
{
"epoch": 0.4734170978073417,
"grad_norm": 0.6592820641641075,
"learning_rate": 1.2639128864602932e-05,
"loss": 1.2035,
"step": 1201
},
{
"epoch": 0.4738112835673811,
"grad_norm": 0.6754237204338704,
"learning_rate": 1.2625847611265703e-05,
"loss": 1.2545,
"step": 1202
},
{
"epoch": 0.4742054693274205,
"grad_norm": 0.6746663309712343,
"learning_rate": 1.2612561381269113e-05,
"loss": 1.167,
"step": 1203
},
{
"epoch": 0.47459965508746,
"grad_norm": 0.6499219261911088,
"learning_rate": 1.2599270199794008e-05,
"loss": 1.2697,
"step": 1204
},
{
"epoch": 0.4749938408474994,
"grad_norm": 0.6496215506080194,
"learning_rate": 1.2585974092030597e-05,
"loss": 1.2177,
"step": 1205
},
{
"epoch": 0.4753880266075388,
"grad_norm": 0.6507804232904032,
"learning_rate": 1.2572673083178448e-05,
"loss": 1.2166,
"step": 1206
},
{
"epoch": 0.47578221236757823,
"grad_norm": 0.6350993220502519,
"learning_rate": 1.2559367198446401e-05,
"loss": 1.1809,
"step": 1207
},
{
"epoch": 0.47617639812761764,
"grad_norm": 0.6638184807925088,
"learning_rate": 1.254605646305255e-05,
"loss": 1.3182,
"step": 1208
},
{
"epoch": 0.47657058388765705,
"grad_norm": 0.638690190001186,
"learning_rate": 1.2532740902224171e-05,
"loss": 1.219,
"step": 1209
},
{
"epoch": 0.47696476964769646,
"grad_norm": 0.6431222064327176,
"learning_rate": 1.2519420541197696e-05,
"loss": 1.2105,
"step": 1210
},
{
"epoch": 0.4773589554077359,
"grad_norm": 0.6385515617572074,
"learning_rate": 1.2506095405218646e-05,
"loss": 1.2066,
"step": 1211
},
{
"epoch": 0.47775314116777534,
"grad_norm": 0.6625298662888042,
"learning_rate": 1.249276551954159e-05,
"loss": 1.2048,
"step": 1212
},
{
"epoch": 0.47814732692781475,
"grad_norm": 0.6511188776236311,
"learning_rate": 1.2479430909430109e-05,
"loss": 1.2683,
"step": 1213
},
{
"epoch": 0.47854151268785416,
"grad_norm": 0.6431132536314119,
"learning_rate": 1.2466091600156736e-05,
"loss": 1.2451,
"step": 1214
},
{
"epoch": 0.4789356984478936,
"grad_norm": 0.6639747730945537,
"learning_rate": 1.2452747617002902e-05,
"loss": 1.2442,
"step": 1215
},
{
"epoch": 0.479329884207933,
"grad_norm": 0.6533976794673589,
"learning_rate": 1.24393989852589e-05,
"loss": 1.2325,
"step": 1216
},
{
"epoch": 0.4797240699679724,
"grad_norm": 0.6457330805526268,
"learning_rate": 1.2426045730223842e-05,
"loss": 1.2082,
"step": 1217
},
{
"epoch": 0.4801182557280118,
"grad_norm": 0.6610877473382107,
"learning_rate": 1.2412687877205587e-05,
"loss": 1.2377,
"step": 1218
},
{
"epoch": 0.4805124414880512,
"grad_norm": 0.6592577931155573,
"learning_rate": 1.2399325451520718e-05,
"loss": 1.2529,
"step": 1219
},
{
"epoch": 0.4809066272480907,
"grad_norm": 0.6661159851544838,
"learning_rate": 1.2385958478494487e-05,
"loss": 1.3026,
"step": 1220
},
{
"epoch": 0.4813008130081301,
"grad_norm": 0.6643157743331228,
"learning_rate": 1.2372586983460755e-05,
"loss": 1.1742,
"step": 1221
},
{
"epoch": 0.4816949987681695,
"grad_norm": 0.6520829662785887,
"learning_rate": 1.2359210991761958e-05,
"loss": 1.2212,
"step": 1222
},
{
"epoch": 0.4820891845282089,
"grad_norm": 0.6421284812980386,
"learning_rate": 1.2345830528749059e-05,
"loss": 1.2352,
"step": 1223
},
{
"epoch": 0.48248337028824834,
"grad_norm": 0.6474967726372801,
"learning_rate": 1.233244561978149e-05,
"loss": 1.1619,
"step": 1224
},
{
"epoch": 0.48287755604828775,
"grad_norm": 0.6621910058206888,
"learning_rate": 1.2319056290227106e-05,
"loss": 1.2398,
"step": 1225
},
{
"epoch": 0.48327174180832716,
"grad_norm": 0.5884735021292232,
"learning_rate": 1.2305662565462146e-05,
"loss": 1.2038,
"step": 1226
},
{
"epoch": 0.48366592756836657,
"grad_norm": 0.641700494355378,
"learning_rate": 1.2292264470871183e-05,
"loss": 1.2872,
"step": 1227
},
{
"epoch": 0.48406011332840604,
"grad_norm": 0.6360792810507947,
"learning_rate": 1.2278862031847061e-05,
"loss": 1.237,
"step": 1228
},
{
"epoch": 0.48445429908844545,
"grad_norm": 0.6242051518141506,
"learning_rate": 1.226545527379086e-05,
"loss": 1.1896,
"step": 1229
},
{
"epoch": 0.48484848484848486,
"grad_norm": 0.6506990087447501,
"learning_rate": 1.2252044222111859e-05,
"loss": 1.1949,
"step": 1230
},
{
"epoch": 0.4852426706085243,
"grad_norm": 0.6592019538150893,
"learning_rate": 1.2238628902227454e-05,
"loss": 1.1833,
"step": 1231
},
{
"epoch": 0.4856368563685637,
"grad_norm": 0.6880800573570197,
"learning_rate": 1.2225209339563144e-05,
"loss": 1.2481,
"step": 1232
},
{
"epoch": 0.4860310421286031,
"grad_norm": 0.6700259002004992,
"learning_rate": 1.2211785559552472e-05,
"loss": 1.27,
"step": 1233
},
{
"epoch": 0.4864252278886425,
"grad_norm": 0.6679202540830845,
"learning_rate": 1.2198357587636958e-05,
"loss": 1.182,
"step": 1234
},
{
"epoch": 0.4868194136486819,
"grad_norm": 0.6583277626537555,
"learning_rate": 1.2184925449266083e-05,
"loss": 1.2575,
"step": 1235
},
{
"epoch": 0.4872135994087214,
"grad_norm": 0.6510891521467633,
"learning_rate": 1.2171489169897217e-05,
"loss": 1.216,
"step": 1236
},
{
"epoch": 0.4876077851687608,
"grad_norm": 0.697605524032823,
"learning_rate": 1.215804877499558e-05,
"loss": 1.2935,
"step": 1237
},
{
"epoch": 0.4880019709288002,
"grad_norm": 0.6752644934446952,
"learning_rate": 1.2144604290034193e-05,
"loss": 1.1875,
"step": 1238
},
{
"epoch": 0.4883961566888396,
"grad_norm": 0.6290688021299883,
"learning_rate": 1.2131155740493816e-05,
"loss": 1.1835,
"step": 1239
},
{
"epoch": 0.48879034244887903,
"grad_norm": 0.6341260406172561,
"learning_rate": 1.211770315186294e-05,
"loss": 1.2685,
"step": 1240
},
{
"epoch": 0.48918452820891845,
"grad_norm": 0.6299349925825592,
"learning_rate": 1.2104246549637683e-05,
"loss": 1.2167,
"step": 1241
},
{
"epoch": 0.48957871396895786,
"grad_norm": 0.6372753688281468,
"learning_rate": 1.2090785959321783e-05,
"loss": 1.2302,
"step": 1242
},
{
"epoch": 0.48997289972899727,
"grad_norm": 0.6420141409041106,
"learning_rate": 1.2077321406426542e-05,
"loss": 1.1826,
"step": 1243
},
{
"epoch": 0.49036708548903674,
"grad_norm": 0.6693778503790639,
"learning_rate": 1.2063852916470755e-05,
"loss": 1.2352,
"step": 1244
},
{
"epoch": 0.49076127124907615,
"grad_norm": 0.6667762505796914,
"learning_rate": 1.2050380514980697e-05,
"loss": 1.2304,
"step": 1245
},
{
"epoch": 0.49115545700911556,
"grad_norm": 0.6574623314489658,
"learning_rate": 1.2036904227490043e-05,
"loss": 1.2036,
"step": 1246
},
{
"epoch": 0.49154964276915497,
"grad_norm": 0.6576866899161838,
"learning_rate": 1.2023424079539841e-05,
"loss": 1.2693,
"step": 1247
},
{
"epoch": 0.4919438285291944,
"grad_norm": 0.6854866850287104,
"learning_rate": 1.2009940096678451e-05,
"loss": 1.2331,
"step": 1248
},
{
"epoch": 0.4923380142892338,
"grad_norm": 0.6591589410360849,
"learning_rate": 1.1996452304461502e-05,
"loss": 1.1481,
"step": 1249
},
{
"epoch": 0.4927322000492732,
"grad_norm": 0.657166055362852,
"learning_rate": 1.1982960728451847e-05,
"loss": 1.2066,
"step": 1250
},
{
"epoch": 0.4931263858093126,
"grad_norm": 0.6500616754839462,
"learning_rate": 1.1969465394219503e-05,
"loss": 1.2311,
"step": 1251
},
{
"epoch": 0.4935205715693521,
"grad_norm": 0.7215977353713153,
"learning_rate": 1.1955966327341614e-05,
"loss": 1.2991,
"step": 1252
},
{
"epoch": 0.4939147573293915,
"grad_norm": 0.6380629207396062,
"learning_rate": 1.1942463553402407e-05,
"loss": 1.1492,
"step": 1253
},
{
"epoch": 0.4943089430894309,
"grad_norm": 0.6438522141604093,
"learning_rate": 1.192895709799311e-05,
"loss": 1.2256,
"step": 1254
},
{
"epoch": 0.4947031288494703,
"grad_norm": 0.6829774495136759,
"learning_rate": 1.1915446986711953e-05,
"loss": 1.2092,
"step": 1255
},
{
"epoch": 0.49509731460950973,
"grad_norm": 0.6414485475773434,
"learning_rate": 1.1901933245164085e-05,
"loss": 1.1672,
"step": 1256
},
{
"epoch": 0.49549150036954914,
"grad_norm": 0.6353044864393161,
"learning_rate": 1.1888415898961538e-05,
"loss": 1.2124,
"step": 1257
},
{
"epoch": 0.49588568612958855,
"grad_norm": 0.6459942965869777,
"learning_rate": 1.1874894973723173e-05,
"loss": 1.2434,
"step": 1258
},
{
"epoch": 0.49627987188962797,
"grad_norm": 0.6455190632225122,
"learning_rate": 1.1861370495074631e-05,
"loss": 1.1948,
"step": 1259
},
{
"epoch": 0.49667405764966743,
"grad_norm": 0.6611317837642312,
"learning_rate": 1.1847842488648296e-05,
"loss": 1.2226,
"step": 1260
},
{
"epoch": 0.49706824340970684,
"grad_norm": 0.6438093407353985,
"learning_rate": 1.1834310980083234e-05,
"loss": 1.1885,
"step": 1261
},
{
"epoch": 0.49746242916974626,
"grad_norm": 0.6724323601652606,
"learning_rate": 1.1820775995025147e-05,
"loss": 1.2409,
"step": 1262
},
{
"epoch": 0.49785661492978567,
"grad_norm": 0.6748553238124116,
"learning_rate": 1.1807237559126325e-05,
"loss": 1.2272,
"step": 1263
},
{
"epoch": 0.4982508006898251,
"grad_norm": 0.6139036537344899,
"learning_rate": 1.1793695698045606e-05,
"loss": 1.2306,
"step": 1264
},
{
"epoch": 0.4986449864498645,
"grad_norm": 0.6274786131500468,
"learning_rate": 1.1780150437448308e-05,
"loss": 1.2436,
"step": 1265
},
{
"epoch": 0.4990391722099039,
"grad_norm": 0.6947108304184417,
"learning_rate": 1.1766601803006204e-05,
"loss": 1.2404,
"step": 1266
},
{
"epoch": 0.4994333579699433,
"grad_norm": 0.6330610294257072,
"learning_rate": 1.1753049820397449e-05,
"loss": 1.2661,
"step": 1267
},
{
"epoch": 0.4998275437299828,
"grad_norm": 0.6526188172174275,
"learning_rate": 1.1739494515306553e-05,
"loss": 1.2404,
"step": 1268
},
{
"epoch": 0.5002217294900222,
"grad_norm": 0.6669476058696817,
"learning_rate": 1.172593591342432e-05,
"loss": 1.2259,
"step": 1269
},
{
"epoch": 0.5006159152500615,
"grad_norm": 0.6632364458454981,
"learning_rate": 1.1712374040447802e-05,
"loss": 1.2059,
"step": 1270
},
{
"epoch": 0.501010101010101,
"grad_norm": 0.6580075066736768,
"learning_rate": 1.1698808922080248e-05,
"loss": 1.2125,
"step": 1271
},
{
"epoch": 0.5014042867701405,
"grad_norm": 0.6477489624350686,
"learning_rate": 1.1685240584031068e-05,
"loss": 1.2346,
"step": 1272
},
{
"epoch": 0.5017984725301798,
"grad_norm": 0.6536067797543117,
"learning_rate": 1.1671669052015757e-05,
"loss": 1.2087,
"step": 1273
},
{
"epoch": 0.5021926582902193,
"grad_norm": 0.6652544869437115,
"learning_rate": 1.1658094351755883e-05,
"loss": 1.2333,
"step": 1274
},
{
"epoch": 0.5025868440502587,
"grad_norm": 0.6600451654966094,
"learning_rate": 1.1644516508978998e-05,
"loss": 1.213,
"step": 1275
},
{
"epoch": 0.5029810298102981,
"grad_norm": 0.6590398336514781,
"learning_rate": 1.1630935549418627e-05,
"loss": 1.2184,
"step": 1276
},
{
"epoch": 0.5033752155703375,
"grad_norm": 0.660891374872714,
"learning_rate": 1.1617351498814199e-05,
"loss": 1.2451,
"step": 1277
},
{
"epoch": 0.503769401330377,
"grad_norm": 0.6091765102262902,
"learning_rate": 1.1603764382910989e-05,
"loss": 1.1412,
"step": 1278
},
{
"epoch": 0.5041635870904163,
"grad_norm": 0.6735824808082984,
"learning_rate": 1.1590174227460098e-05,
"loss": 1.1786,
"step": 1279
},
{
"epoch": 0.5045577728504558,
"grad_norm": 0.6532363704591942,
"learning_rate": 1.1576581058218375e-05,
"loss": 1.1864,
"step": 1280
},
{
"epoch": 0.5049519586104952,
"grad_norm": 0.6606502828456684,
"learning_rate": 1.156298490094839e-05,
"loss": 1.1888,
"step": 1281
},
{
"epoch": 0.5053461443705346,
"grad_norm": 0.6342921397541668,
"learning_rate": 1.1549385781418372e-05,
"loss": 1.2213,
"step": 1282
},
{
"epoch": 0.5057403301305741,
"grad_norm": 0.6689825246282982,
"learning_rate": 1.1535783725402163e-05,
"loss": 1.2618,
"step": 1283
},
{
"epoch": 0.5061345158906134,
"grad_norm": 0.640115147587615,
"learning_rate": 1.1522178758679172e-05,
"loss": 1.222,
"step": 1284
},
{
"epoch": 0.5065287016506529,
"grad_norm": 0.6676485619547307,
"learning_rate": 1.1508570907034325e-05,
"loss": 1.2239,
"step": 1285
},
{
"epoch": 0.5069228874106922,
"grad_norm": 0.6584471811582958,
"learning_rate": 1.1494960196258016e-05,
"loss": 1.2261,
"step": 1286
},
{
"epoch": 0.5073170731707317,
"grad_norm": 0.6313871712156794,
"learning_rate": 1.1481346652146057e-05,
"loss": 1.2352,
"step": 1287
},
{
"epoch": 0.5077112589307712,
"grad_norm": 0.6192657373849317,
"learning_rate": 1.1467730300499626e-05,
"loss": 1.2161,
"step": 1288
},
{
"epoch": 0.5081054446908105,
"grad_norm": 0.661823259158885,
"learning_rate": 1.1454111167125231e-05,
"loss": 1.1869,
"step": 1289
},
{
"epoch": 0.50849963045085,
"grad_norm": 0.6581281171795876,
"learning_rate": 1.1440489277834645e-05,
"loss": 1.2408,
"step": 1290
},
{
"epoch": 0.5088938162108894,
"grad_norm": 0.673672216319801,
"learning_rate": 1.1426864658444865e-05,
"loss": 1.2423,
"step": 1291
},
{
"epoch": 0.5092880019709288,
"grad_norm": 0.6709234458079614,
"learning_rate": 1.1413237334778064e-05,
"loss": 1.2092,
"step": 1292
},
{
"epoch": 0.5096821877309682,
"grad_norm": 0.6704668753810613,
"learning_rate": 1.139960733266154e-05,
"loss": 1.2005,
"step": 1293
},
{
"epoch": 0.5100763734910077,
"grad_norm": 0.6665476817077829,
"learning_rate": 1.1385974677927667e-05,
"loss": 1.2879,
"step": 1294
},
{
"epoch": 0.510470559251047,
"grad_norm": 0.6491129692417508,
"learning_rate": 1.1372339396413845e-05,
"loss": 1.2029,
"step": 1295
},
{
"epoch": 0.5108647450110865,
"grad_norm": 0.6370912475464865,
"learning_rate": 1.1358701513962457e-05,
"loss": 1.2327,
"step": 1296
},
{
"epoch": 0.5112589307711259,
"grad_norm": 0.648157038901389,
"learning_rate": 1.134506105642081e-05,
"loss": 1.2124,
"step": 1297
},
{
"epoch": 0.5116531165311653,
"grad_norm": 0.6461266035285687,
"learning_rate": 1.1331418049641091e-05,
"loss": 1.1982,
"step": 1298
},
{
"epoch": 0.5120473022912048,
"grad_norm": 0.6281200807330076,
"learning_rate": 1.1317772519480328e-05,
"loss": 1.2601,
"step": 1299
},
{
"epoch": 0.5124414880512441,
"grad_norm": 0.6422476551253151,
"learning_rate": 1.130412449180032e-05,
"loss": 1.1964,
"step": 1300
},
{
"epoch": 0.5128356738112836,
"grad_norm": 0.63650842337126,
"learning_rate": 1.1290473992467607e-05,
"loss": 1.2076,
"step": 1301
},
{
"epoch": 0.5132298595713229,
"grad_norm": 0.6773389045891938,
"learning_rate": 1.1276821047353403e-05,
"loss": 1.2352,
"step": 1302
},
{
"epoch": 0.5136240453313624,
"grad_norm": 0.6309296879156464,
"learning_rate": 1.1263165682333577e-05,
"loss": 1.1772,
"step": 1303
},
{
"epoch": 0.5140182310914019,
"grad_norm": 0.6765478799067353,
"learning_rate": 1.1249507923288563e-05,
"loss": 1.2115,
"step": 1304
},
{
"epoch": 0.5144124168514412,
"grad_norm": 0.6831067353554151,
"learning_rate": 1.1235847796103345e-05,
"loss": 1.2322,
"step": 1305
},
{
"epoch": 0.5148066026114807,
"grad_norm": 0.6680880986848273,
"learning_rate": 1.122218532666739e-05,
"loss": 1.2728,
"step": 1306
},
{
"epoch": 0.5152007883715201,
"grad_norm": 0.645405977896472,
"learning_rate": 1.1208520540874607e-05,
"loss": 1.2003,
"step": 1307
},
{
"epoch": 0.5155949741315595,
"grad_norm": 0.6696823139879742,
"learning_rate": 1.1194853464623294e-05,
"loss": 1.1981,
"step": 1308
},
{
"epoch": 0.5159891598915989,
"grad_norm": 0.6530439594705855,
"learning_rate": 1.1181184123816092e-05,
"loss": 1.1805,
"step": 1309
},
{
"epoch": 0.5163833456516383,
"grad_norm": 0.662122019391009,
"learning_rate": 1.1167512544359929e-05,
"loss": 1.2935,
"step": 1310
},
{
"epoch": 0.5167775314116777,
"grad_norm": 0.6515187138374906,
"learning_rate": 1.115383875216598e-05,
"loss": 1.236,
"step": 1311
},
{
"epoch": 0.5171717171717172,
"grad_norm": 0.6514508648345718,
"learning_rate": 1.1140162773149612e-05,
"loss": 1.1743,
"step": 1312
},
{
"epoch": 0.5175659029317566,
"grad_norm": 0.6440703774811735,
"learning_rate": 1.112648463323034e-05,
"loss": 1.2221,
"step": 1313
},
{
"epoch": 0.517960088691796,
"grad_norm": 0.6644581716811222,
"learning_rate": 1.1112804358331766e-05,
"loss": 1.1723,
"step": 1314
},
{
"epoch": 0.5183542744518355,
"grad_norm": 0.647476681026034,
"learning_rate": 1.1099121974381546e-05,
"loss": 1.2043,
"step": 1315
},
{
"epoch": 0.5187484602118748,
"grad_norm": 0.6615768891463015,
"learning_rate": 1.108543750731134e-05,
"loss": 1.1933,
"step": 1316
},
{
"epoch": 0.5191426459719143,
"grad_norm": 0.6352447330049817,
"learning_rate": 1.1071750983056733e-05,
"loss": 1.1965,
"step": 1317
},
{
"epoch": 0.5195368317319536,
"grad_norm": 0.6515803618281081,
"learning_rate": 1.105806242755723e-05,
"loss": 1.2412,
"step": 1318
},
{
"epoch": 0.5199310174919931,
"grad_norm": 0.6408728168852139,
"learning_rate": 1.1044371866756178e-05,
"loss": 1.2595,
"step": 1319
},
{
"epoch": 0.5203252032520326,
"grad_norm": 0.6136018250584243,
"learning_rate": 1.1030679326600726e-05,
"loss": 1.1597,
"step": 1320
},
{
"epoch": 0.5207193890120719,
"grad_norm": 0.6341434671207334,
"learning_rate": 1.1016984833041773e-05,
"loss": 1.1992,
"step": 1321
},
{
"epoch": 0.5211135747721114,
"grad_norm": 0.6539064660047773,
"learning_rate": 1.1003288412033923e-05,
"loss": 1.1332,
"step": 1322
},
{
"epoch": 0.5215077605321508,
"grad_norm": 0.6232171122795831,
"learning_rate": 1.0989590089535426e-05,
"loss": 1.2388,
"step": 1323
},
{
"epoch": 0.5219019462921902,
"grad_norm": 0.6877295201168714,
"learning_rate": 1.097588989150815e-05,
"loss": 1.2525,
"step": 1324
},
{
"epoch": 0.5222961320522296,
"grad_norm": 0.7115352113501258,
"learning_rate": 1.0962187843917498e-05,
"loss": 1.2115,
"step": 1325
},
{
"epoch": 0.522690317812269,
"grad_norm": 0.642946361400015,
"learning_rate": 1.0948483972732395e-05,
"loss": 1.2129,
"step": 1326
},
{
"epoch": 0.5230845035723084,
"grad_norm": 0.634552641474732,
"learning_rate": 1.0934778303925214e-05,
"loss": 1.1845,
"step": 1327
},
{
"epoch": 0.5234786893323479,
"grad_norm": 0.6716816812404441,
"learning_rate": 1.0921070863471732e-05,
"loss": 1.2202,
"step": 1328
},
{
"epoch": 0.5238728750923873,
"grad_norm": 0.6403984245235527,
"learning_rate": 1.09073616773511e-05,
"loss": 1.2436,
"step": 1329
},
{
"epoch": 0.5242670608524267,
"grad_norm": 0.6426802290331379,
"learning_rate": 1.089365077154576e-05,
"loss": 1.1759,
"step": 1330
},
{
"epoch": 0.5246612466124662,
"grad_norm": 0.6528320428327657,
"learning_rate": 1.0879938172041415e-05,
"loss": 1.234,
"step": 1331
},
{
"epoch": 0.5250554323725055,
"grad_norm": 0.6343235957872947,
"learning_rate": 1.0866223904826992e-05,
"loss": 1.1482,
"step": 1332
},
{
"epoch": 0.525449618132545,
"grad_norm": 0.635182058088562,
"learning_rate": 1.0852507995894558e-05,
"loss": 1.2054,
"step": 1333
},
{
"epoch": 0.5258438038925843,
"grad_norm": 0.6367031967484378,
"learning_rate": 1.0838790471239314e-05,
"loss": 1.1575,
"step": 1334
},
{
"epoch": 0.5262379896526238,
"grad_norm": 0.6402983704212438,
"learning_rate": 1.0825071356859502e-05,
"loss": 1.1966,
"step": 1335
},
{
"epoch": 0.5266321754126633,
"grad_norm": 0.6558137431376323,
"learning_rate": 1.0811350678756392e-05,
"loss": 1.2003,
"step": 1336
},
{
"epoch": 0.5270263611727026,
"grad_norm": 0.6387053585661903,
"learning_rate": 1.0797628462934214e-05,
"loss": 1.2108,
"step": 1337
},
{
"epoch": 0.5274205469327421,
"grad_norm": 0.6086598757639083,
"learning_rate": 1.0783904735400103e-05,
"loss": 1.1663,
"step": 1338
},
{
"epoch": 0.5278147326927815,
"grad_norm": 0.6399532215520667,
"learning_rate": 1.0770179522164079e-05,
"loss": 1.2112,
"step": 1339
},
{
"epoch": 0.5282089184528209,
"grad_norm": 0.6676098681703231,
"learning_rate": 1.0756452849238955e-05,
"loss": 1.2461,
"step": 1340
},
{
"epoch": 0.5286031042128603,
"grad_norm": 0.6540029616620948,
"learning_rate": 1.0742724742640323e-05,
"loss": 1.2397,
"step": 1341
},
{
"epoch": 0.5289972899728997,
"grad_norm": 0.6538972674770378,
"learning_rate": 1.0728995228386496e-05,
"loss": 1.2309,
"step": 1342
},
{
"epoch": 0.5293914757329391,
"grad_norm": 0.6772694870371185,
"learning_rate": 1.0715264332498445e-05,
"loss": 1.258,
"step": 1343
},
{
"epoch": 0.5297856614929786,
"grad_norm": 0.6376355859195808,
"learning_rate": 1.0701532080999762e-05,
"loss": 1.2376,
"step": 1344
},
{
"epoch": 0.530179847253018,
"grad_norm": 0.663394682115222,
"learning_rate": 1.0687798499916613e-05,
"loss": 1.2073,
"step": 1345
},
{
"epoch": 0.5305740330130574,
"grad_norm": 0.6701564343777298,
"learning_rate": 1.0674063615277681e-05,
"loss": 1.2365,
"step": 1346
},
{
"epoch": 0.5309682187730969,
"grad_norm": 0.6464607961695173,
"learning_rate": 1.0660327453114118e-05,
"loss": 1.1761,
"step": 1347
},
{
"epoch": 0.5313624045331362,
"grad_norm": 0.6383382398982943,
"learning_rate": 1.0646590039459499e-05,
"loss": 1.2069,
"step": 1348
},
{
"epoch": 0.5317565902931757,
"grad_norm": 0.7250328811363568,
"learning_rate": 1.063285140034977e-05,
"loss": 1.2748,
"step": 1349
},
{
"epoch": 0.532150776053215,
"grad_norm": 0.6218566182573235,
"learning_rate": 1.0619111561823208e-05,
"loss": 1.1792,
"step": 1350
},
{
"epoch": 0.5325449618132545,
"grad_norm": 0.6491294616401706,
"learning_rate": 1.060537054992034e-05,
"loss": 1.214,
"step": 1351
},
{
"epoch": 0.532939147573294,
"grad_norm": 0.6218758954772929,
"learning_rate": 1.0591628390683945e-05,
"loss": 1.1642,
"step": 1352
},
{
"epoch": 0.5333333333333333,
"grad_norm": 0.6423851142416096,
"learning_rate": 1.0577885110158959e-05,
"loss": 1.2269,
"step": 1353
},
{
"epoch": 0.5337275190933728,
"grad_norm": 0.6619276692624474,
"learning_rate": 1.0564140734392445e-05,
"loss": 1.2517,
"step": 1354
},
{
"epoch": 0.5341217048534121,
"grad_norm": 0.6486156036656686,
"learning_rate": 1.0550395289433553e-05,
"loss": 1.2318,
"step": 1355
},
{
"epoch": 0.5345158906134516,
"grad_norm": 0.6207033641119062,
"learning_rate": 1.0536648801333443e-05,
"loss": 1.22,
"step": 1356
},
{
"epoch": 0.534910076373491,
"grad_norm": 0.6286210196563511,
"learning_rate": 1.0522901296145263e-05,
"loss": 1.2087,
"step": 1357
},
{
"epoch": 0.5353042621335304,
"grad_norm": 0.6425274380062405,
"learning_rate": 1.0509152799924085e-05,
"loss": 1.2117,
"step": 1358
},
{
"epoch": 0.5356984478935698,
"grad_norm": 0.6192586936021032,
"learning_rate": 1.0495403338726862e-05,
"loss": 1.1948,
"step": 1359
},
{
"epoch": 0.5360926336536093,
"grad_norm": 0.6377697560605069,
"learning_rate": 1.0481652938612374e-05,
"loss": 1.2518,
"step": 1360
},
{
"epoch": 0.5364868194136487,
"grad_norm": 0.6359977533800316,
"learning_rate": 1.0467901625641174e-05,
"loss": 1.1883,
"step": 1361
},
{
"epoch": 0.5368810051736881,
"grad_norm": 0.6266522995098218,
"learning_rate": 1.045414942587556e-05,
"loss": 1.1223,
"step": 1362
},
{
"epoch": 0.5372751909337276,
"grad_norm": 0.6358734881969099,
"learning_rate": 1.0440396365379496e-05,
"loss": 1.2248,
"step": 1363
},
{
"epoch": 0.5376693766937669,
"grad_norm": 0.6182266673498269,
"learning_rate": 1.0426642470218587e-05,
"loss": 1.205,
"step": 1364
},
{
"epoch": 0.5380635624538064,
"grad_norm": 0.6485743617703122,
"learning_rate": 1.0412887766460017e-05,
"loss": 1.1979,
"step": 1365
},
{
"epoch": 0.5384577482138457,
"grad_norm": 0.6392709807479522,
"learning_rate": 1.0399132280172494e-05,
"loss": 1.2084,
"step": 1366
},
{
"epoch": 0.5388519339738852,
"grad_norm": 0.6545405852048852,
"learning_rate": 1.0385376037426227e-05,
"loss": 1.265,
"step": 1367
},
{
"epoch": 0.5392461197339247,
"grad_norm": 0.6496693130292205,
"learning_rate": 1.0371619064292844e-05,
"loss": 1.2467,
"step": 1368
},
{
"epoch": 0.539640305493964,
"grad_norm": 0.6835306554548173,
"learning_rate": 1.035786138684536e-05,
"loss": 1.2406,
"step": 1369
},
{
"epoch": 0.5400344912540035,
"grad_norm": 0.6433918833824575,
"learning_rate": 1.034410303115813e-05,
"loss": 1.2708,
"step": 1370
},
{
"epoch": 0.5404286770140428,
"grad_norm": 0.6391881556502016,
"learning_rate": 1.0330344023306791e-05,
"loss": 1.229,
"step": 1371
},
{
"epoch": 0.5408228627740823,
"grad_norm": 0.6778620828218745,
"learning_rate": 1.0316584389368213e-05,
"loss": 1.2611,
"step": 1372
},
{
"epoch": 0.5412170485341217,
"grad_norm": 0.6574985715883013,
"learning_rate": 1.0302824155420464e-05,
"loss": 1.2234,
"step": 1373
},
{
"epoch": 0.5416112342941611,
"grad_norm": 0.6714841683370039,
"learning_rate": 1.0289063347542727e-05,
"loss": 1.2057,
"step": 1374
},
{
"epoch": 0.5420054200542005,
"grad_norm": 0.646623331729815,
"learning_rate": 1.0275301991815299e-05,
"loss": 1.2366,
"step": 1375
},
{
"epoch": 0.54239960581424,
"grad_norm": 0.6267893952077622,
"learning_rate": 1.02615401143195e-05,
"loss": 1.2157,
"step": 1376
},
{
"epoch": 0.5427937915742794,
"grad_norm": 0.6430429787610838,
"learning_rate": 1.0247777741137636e-05,
"loss": 1.2459,
"step": 1377
},
{
"epoch": 0.5431879773343188,
"grad_norm": 0.6315063466990641,
"learning_rate": 1.0234014898352966e-05,
"loss": 1.2342,
"step": 1378
},
{
"epoch": 0.5435821630943583,
"grad_norm": 0.7220865603750691,
"learning_rate": 1.022025161204963e-05,
"loss": 1.2154,
"step": 1379
},
{
"epoch": 0.5439763488543976,
"grad_norm": 0.6377801583000084,
"learning_rate": 1.0206487908312607e-05,
"loss": 1.206,
"step": 1380
},
{
"epoch": 0.5443705346144371,
"grad_norm": 0.6319172744640024,
"learning_rate": 1.0192723813227672e-05,
"loss": 1.1919,
"step": 1381
},
{
"epoch": 0.5447647203744764,
"grad_norm": 0.6364897393407957,
"learning_rate": 1.0178959352881337e-05,
"loss": 1.2146,
"step": 1382
},
{
"epoch": 0.5451589061345159,
"grad_norm": 0.6688375716623369,
"learning_rate": 1.0165194553360813e-05,
"loss": 1.2469,
"step": 1383
},
{
"epoch": 0.5455530918945554,
"grad_norm": 0.662719310669721,
"learning_rate": 1.0151429440753948e-05,
"loss": 1.3032,
"step": 1384
},
{
"epoch": 0.5459472776545947,
"grad_norm": 0.6431824004552453,
"learning_rate": 1.0137664041149187e-05,
"loss": 1.2224,
"step": 1385
},
{
"epoch": 0.5463414634146342,
"grad_norm": 0.6397813243923787,
"learning_rate": 1.0123898380635515e-05,
"loss": 1.1647,
"step": 1386
},
{
"epoch": 0.5467356491746735,
"grad_norm": 0.6349500431531321,
"learning_rate": 1.011013248530241e-05,
"loss": 1.2286,
"step": 1387
},
{
"epoch": 0.547129834934713,
"grad_norm": 0.6355731398653511,
"learning_rate": 1.0096366381239808e-05,
"loss": 1.1548,
"step": 1388
},
{
"epoch": 0.5475240206947524,
"grad_norm": 0.6272297906309461,
"learning_rate": 1.0082600094538029e-05,
"loss": 1.2372,
"step": 1389
},
{
"epoch": 0.5479182064547918,
"grad_norm": 0.6514286635524038,
"learning_rate": 1.0068833651287736e-05,
"loss": 1.1854,
"step": 1390
},
{
"epoch": 0.5483123922148312,
"grad_norm": 0.6434159221463395,
"learning_rate": 1.0055067077579894e-05,
"loss": 1.1649,
"step": 1391
},
{
"epoch": 0.5487065779748707,
"grad_norm": 0.6534616096140339,
"learning_rate": 1.0041300399505724e-05,
"loss": 1.2058,
"step": 1392
},
{
"epoch": 0.5491007637349101,
"grad_norm": 0.6385843361048341,
"learning_rate": 1.0027533643156629e-05,
"loss": 1.206,
"step": 1393
},
{
"epoch": 0.5494949494949495,
"grad_norm": 0.654135497386305,
"learning_rate": 1.0013766834624168e-05,
"loss": 1.2947,
"step": 1394
},
{
"epoch": 0.549889135254989,
"grad_norm": 0.6527260856281124,
"learning_rate": 1e-05,
"loss": 1.2067,
"step": 1395
},
{
"epoch": 0.5502833210150283,
"grad_norm": 0.6456506343549768,
"learning_rate": 9.986233165375837e-06,
"loss": 1.2799,
"step": 1396
},
{
"epoch": 0.5506775067750678,
"grad_norm": 0.7246957748680044,
"learning_rate": 9.972466356843375e-06,
"loss": 1.3271,
"step": 1397
},
{
"epoch": 0.5510716925351071,
"grad_norm": 0.6399327077783894,
"learning_rate": 9.95869960049428e-06,
"loss": 1.2443,
"step": 1398
},
{
"epoch": 0.5514658782951466,
"grad_norm": 0.6241508398727628,
"learning_rate": 9.944932922420109e-06,
"loss": 1.2007,
"step": 1399
},
{
"epoch": 0.5518600640551861,
"grad_norm": 0.614559476153416,
"learning_rate": 9.931166348712268e-06,
"loss": 1.1704,
"step": 1400
},
{
"epoch": 0.5522542498152254,
"grad_norm": 0.6304080966033335,
"learning_rate": 9.917399905461974e-06,
"loss": 1.1869,
"step": 1401
},
{
"epoch": 0.5526484355752649,
"grad_norm": 0.6412439956786309,
"learning_rate": 9.903633618760195e-06,
"loss": 1.1782,
"step": 1402
},
{
"epoch": 0.5530426213353042,
"grad_norm": 0.6557358908407644,
"learning_rate": 9.889867514697591e-06,
"loss": 1.225,
"step": 1403
},
{
"epoch": 0.5534368070953437,
"grad_norm": 0.6212875821927828,
"learning_rate": 9.876101619364487e-06,
"loss": 1.196,
"step": 1404
},
{
"epoch": 0.5538309928553831,
"grad_norm": 0.613555231324674,
"learning_rate": 9.862335958850816e-06,
"loss": 1.1592,
"step": 1405
},
{
"epoch": 0.5542251786154225,
"grad_norm": 0.6745935115478964,
"learning_rate": 9.848570559246055e-06,
"loss": 1.1877,
"step": 1406
},
{
"epoch": 0.5546193643754619,
"grad_norm": 0.6410977347319441,
"learning_rate": 9.834805446639187e-06,
"loss": 1.1612,
"step": 1407
},
{
"epoch": 0.5550135501355014,
"grad_norm": 0.6309144641717204,
"learning_rate": 9.821040647118666e-06,
"loss": 1.1425,
"step": 1408
},
{
"epoch": 0.5554077358955408,
"grad_norm": 0.6299676272735365,
"learning_rate": 9.807276186772335e-06,
"loss": 1.208,
"step": 1409
},
{
"epoch": 0.5558019216555802,
"grad_norm": 0.6178102722375627,
"learning_rate": 9.793512091687396e-06,
"loss": 1.1846,
"step": 1410
},
{
"epoch": 0.5561961074156196,
"grad_norm": 0.622166600700565,
"learning_rate": 9.779748387950372e-06,
"loss": 1.1662,
"step": 1411
},
{
"epoch": 0.556590293175659,
"grad_norm": 0.6600214723637224,
"learning_rate": 9.765985101647037e-06,
"loss": 1.2892,
"step": 1412
},
{
"epoch": 0.5569844789356985,
"grad_norm": 0.6176714958995365,
"learning_rate": 9.752222258862364e-06,
"loss": 1.1706,
"step": 1413
},
{
"epoch": 0.5573786646957378,
"grad_norm": 0.5939231448625044,
"learning_rate": 9.738459885680502e-06,
"loss": 1.1488,
"step": 1414
},
{
"epoch": 0.5577728504557773,
"grad_norm": 0.6352717829639574,
"learning_rate": 9.724698008184705e-06,
"loss": 1.2017,
"step": 1415
},
{
"epoch": 0.5581670362158168,
"grad_norm": 0.6167223796720016,
"learning_rate": 9.710936652457276e-06,
"loss": 1.1228,
"step": 1416
},
{
"epoch": 0.5585612219758561,
"grad_norm": 0.6213254460946624,
"learning_rate": 9.69717584457954e-06,
"loss": 1.184,
"step": 1417
},
{
"epoch": 0.5589554077358956,
"grad_norm": 0.6131341167960235,
"learning_rate": 9.683415610631788e-06,
"loss": 1.161,
"step": 1418
},
{
"epoch": 0.5593495934959349,
"grad_norm": 0.6296617155093078,
"learning_rate": 9.669655976693214e-06,
"loss": 1.1642,
"step": 1419
},
{
"epoch": 0.5597437792559744,
"grad_norm": 0.6153554191014486,
"learning_rate": 9.655896968841873e-06,
"loss": 1.2156,
"step": 1420
},
{
"epoch": 0.5601379650160138,
"grad_norm": 0.6392439227341541,
"learning_rate": 9.642138613154643e-06,
"loss": 1.1957,
"step": 1421
},
{
"epoch": 0.5605321507760532,
"grad_norm": 0.6260052735651341,
"learning_rate": 9.62838093570716e-06,
"loss": 1.1974,
"step": 1422
},
{
"epoch": 0.5609263365360926,
"grad_norm": 0.6334362558009554,
"learning_rate": 9.614623962573776e-06,
"loss": 1.1965,
"step": 1423
},
{
"epoch": 0.561320522296132,
"grad_norm": 0.6179635946785395,
"learning_rate": 9.600867719827507e-06,
"loss": 1.1606,
"step": 1424
},
{
"epoch": 0.5617147080561715,
"grad_norm": 0.675892965228182,
"learning_rate": 9.587112233539988e-06,
"loss": 1.2698,
"step": 1425
},
{
"epoch": 0.5621088938162109,
"grad_norm": 0.6269199497256357,
"learning_rate": 9.573357529781413e-06,
"loss": 1.1738,
"step": 1426
},
{
"epoch": 0.5625030795762503,
"grad_norm": 0.6206668162899066,
"learning_rate": 9.559603634620505e-06,
"loss": 1.1545,
"step": 1427
},
{
"epoch": 0.5628972653362897,
"grad_norm": 0.6392518680745602,
"learning_rate": 9.545850574124444e-06,
"loss": 1.2394,
"step": 1428
},
{
"epoch": 0.5632914510963292,
"grad_norm": 0.6554357478989767,
"learning_rate": 9.532098374358828e-06,
"loss": 1.2056,
"step": 1429
},
{
"epoch": 0.5636856368563685,
"grad_norm": 0.6321993644191258,
"learning_rate": 9.518347061387629e-06,
"loss": 1.2424,
"step": 1430
},
{
"epoch": 0.564079822616408,
"grad_norm": 0.6342077276536365,
"learning_rate": 9.504596661273141e-06,
"loss": 1.216,
"step": 1431
},
{
"epoch": 0.5644740083764475,
"grad_norm": 0.655567194868911,
"learning_rate": 9.490847200075919e-06,
"loss": 1.2236,
"step": 1432
},
{
"epoch": 0.5648681941364868,
"grad_norm": 0.6452206424611665,
"learning_rate": 9.47709870385474e-06,
"loss": 1.1493,
"step": 1433
},
{
"epoch": 0.5652623798965263,
"grad_norm": 0.6551732071227462,
"learning_rate": 9.46335119866656e-06,
"loss": 1.2243,
"step": 1434
},
{
"epoch": 0.5656565656565656,
"grad_norm": 0.638292981830309,
"learning_rate": 9.449604710566452e-06,
"loss": 1.2154,
"step": 1435
},
{
"epoch": 0.5660507514166051,
"grad_norm": 0.6434536189993397,
"learning_rate": 9.435859265607555e-06,
"loss": 1.2622,
"step": 1436
},
{
"epoch": 0.5664449371766445,
"grad_norm": 0.6235727133771496,
"learning_rate": 9.422114889841045e-06,
"loss": 1.2097,
"step": 1437
},
{
"epoch": 0.5668391229366839,
"grad_norm": 0.6380544846865114,
"learning_rate": 9.40837160931606e-06,
"loss": 1.1931,
"step": 1438
},
{
"epoch": 0.5672333086967233,
"grad_norm": 0.6070307134735536,
"learning_rate": 9.394629450079661e-06,
"loss": 1.1728,
"step": 1439
},
{
"epoch": 0.5676274944567627,
"grad_norm": 0.6261762404486911,
"learning_rate": 9.380888438176797e-06,
"loss": 1.2047,
"step": 1440
},
{
"epoch": 0.5680216802168022,
"grad_norm": 0.6148402557876401,
"learning_rate": 9.367148599650231e-06,
"loss": 1.1782,
"step": 1441
},
{
"epoch": 0.5684158659768416,
"grad_norm": 0.6153367707877275,
"learning_rate": 9.353409960540506e-06,
"loss": 1.1333,
"step": 1442
},
{
"epoch": 0.568810051736881,
"grad_norm": 0.6401365387127351,
"learning_rate": 9.339672546885885e-06,
"loss": 1.2479,
"step": 1443
},
{
"epoch": 0.5692042374969204,
"grad_norm": 0.6301673949669812,
"learning_rate": 9.325936384722322e-06,
"loss": 1.2015,
"step": 1444
},
{
"epoch": 0.5695984232569599,
"grad_norm": 0.6286144736358145,
"learning_rate": 9.312201500083392e-06,
"loss": 1.2487,
"step": 1445
},
{
"epoch": 0.5699926090169992,
"grad_norm": 0.6171822342295599,
"learning_rate": 9.29846791900024e-06,
"loss": 1.1904,
"step": 1446
},
{
"epoch": 0.5703867947770387,
"grad_norm": 0.6428565759737676,
"learning_rate": 9.284735667501558e-06,
"loss": 1.1679,
"step": 1447
},
{
"epoch": 0.5707809805370782,
"grad_norm": 0.6151703289847316,
"learning_rate": 9.271004771613509e-06,
"loss": 1.1246,
"step": 1448
},
{
"epoch": 0.5711751662971175,
"grad_norm": 0.6398686829564575,
"learning_rate": 9.257275257359679e-06,
"loss": 1.1657,
"step": 1449
},
{
"epoch": 0.571569352057157,
"grad_norm": 0.6243382952424049,
"learning_rate": 9.243547150761047e-06,
"loss": 1.1966,
"step": 1450
},
{
"epoch": 0.5719635378171963,
"grad_norm": 0.6408741873334287,
"learning_rate": 9.229820477835926e-06,
"loss": 1.2205,
"step": 1451
},
{
"epoch": 0.5723577235772358,
"grad_norm": 0.633552764994025,
"learning_rate": 9.216095264599895e-06,
"loss": 1.2252,
"step": 1452
},
{
"epoch": 0.5727519093372752,
"grad_norm": 0.6511108996685305,
"learning_rate": 9.202371537065788e-06,
"loss": 1.2656,
"step": 1453
},
{
"epoch": 0.5731460950973146,
"grad_norm": 0.6529280803122515,
"learning_rate": 9.18864932124361e-06,
"loss": 1.2239,
"step": 1454
},
{
"epoch": 0.573540280857354,
"grad_norm": 0.647401441010935,
"learning_rate": 9.1749286431405e-06,
"loss": 1.2716,
"step": 1455
},
{
"epoch": 0.5739344666173934,
"grad_norm": 0.642622817859945,
"learning_rate": 9.161209528760691e-06,
"loss": 1.2222,
"step": 1456
},
{
"epoch": 0.5743286523774329,
"grad_norm": 0.6320811079325271,
"learning_rate": 9.147492004105443e-06,
"loss": 1.2481,
"step": 1457
},
{
"epoch": 0.5747228381374723,
"grad_norm": 0.6326782165239981,
"learning_rate": 9.133776095173015e-06,
"loss": 1.2739,
"step": 1458
},
{
"epoch": 0.5751170238975117,
"grad_norm": 0.6625216988220546,
"learning_rate": 9.120061827958586e-06,
"loss": 1.2355,
"step": 1459
},
{
"epoch": 0.5755112096575511,
"grad_norm": 0.6213952483408215,
"learning_rate": 9.106349228454242e-06,
"loss": 1.1701,
"step": 1460
},
{
"epoch": 0.5759053954175906,
"grad_norm": 0.6158204977575528,
"learning_rate": 9.092638322648904e-06,
"loss": 1.2463,
"step": 1461
},
{
"epoch": 0.5762995811776299,
"grad_norm": 0.6128069866736511,
"learning_rate": 9.078929136528267e-06,
"loss": 1.1581,
"step": 1462
},
{
"epoch": 0.5766937669376694,
"grad_norm": 0.6618087745723823,
"learning_rate": 9.06522169607479e-06,
"loss": 1.1823,
"step": 1463
},
{
"epoch": 0.5770879526977089,
"grad_norm": 0.6783150244501504,
"learning_rate": 9.05151602726761e-06,
"loss": 1.2302,
"step": 1464
},
{
"epoch": 0.5774821384577482,
"grad_norm": 0.6503369713306525,
"learning_rate": 9.037812156082503e-06,
"loss": 1.2407,
"step": 1465
},
{
"epoch": 0.5778763242177877,
"grad_norm": 0.6456712064826,
"learning_rate": 9.024110108491855e-06,
"loss": 1.1609,
"step": 1466
},
{
"epoch": 0.578270509977827,
"grad_norm": 0.6486197805925519,
"learning_rate": 9.010409910464575e-06,
"loss": 1.2222,
"step": 1467
},
{
"epoch": 0.5786646957378665,
"grad_norm": 0.7436596366499776,
"learning_rate": 8.996711587966079e-06,
"loss": 1.2581,
"step": 1468
},
{
"epoch": 0.5790588814979059,
"grad_norm": 0.6261635281880413,
"learning_rate": 8.983015166958228e-06,
"loss": 1.2161,
"step": 1469
},
{
"epoch": 0.5794530672579453,
"grad_norm": 0.6443605688870468,
"learning_rate": 8.969320673399276e-06,
"loss": 1.1791,
"step": 1470
},
{
"epoch": 0.5798472530179847,
"grad_norm": 0.671825587927519,
"learning_rate": 8.955628133243828e-06,
"loss": 1.218,
"step": 1471
},
{
"epoch": 0.5802414387780241,
"grad_norm": 0.6434248476334178,
"learning_rate": 8.941937572442773e-06,
"loss": 1.1846,
"step": 1472
},
{
"epoch": 0.5806356245380636,
"grad_norm": 0.6254667200582976,
"learning_rate": 8.92824901694327e-06,
"loss": 1.2353,
"step": 1473
},
{
"epoch": 0.581029810298103,
"grad_norm": 0.6232654021330023,
"learning_rate": 8.914562492688667e-06,
"loss": 1.114,
"step": 1474
},
{
"epoch": 0.5814239960581424,
"grad_norm": 0.6299635353186261,
"learning_rate": 8.900878025618453e-06,
"loss": 1.2504,
"step": 1475
},
{
"epoch": 0.5818181818181818,
"grad_norm": 0.6833411898307228,
"learning_rate": 8.887195641668235e-06,
"loss": 1.2404,
"step": 1476
},
{
"epoch": 0.5822123675782213,
"grad_norm": 0.6669528413277209,
"learning_rate": 8.873515366769666e-06,
"loss": 1.1557,
"step": 1477
},
{
"epoch": 0.5826065533382606,
"grad_norm": 0.6340389941502457,
"learning_rate": 8.85983722685039e-06,
"loss": 1.1978,
"step": 1478
},
{
"epoch": 0.5830007390983001,
"grad_norm": 0.6504266413875779,
"learning_rate": 8.846161247834024e-06,
"loss": 1.2026,
"step": 1479
},
{
"epoch": 0.5833949248583395,
"grad_norm": 0.623448080239467,
"learning_rate": 8.832487455640074e-06,
"loss": 1.1968,
"step": 1480
},
{
"epoch": 0.5837891106183789,
"grad_norm": 0.6377332989581492,
"learning_rate": 8.81881587618391e-06,
"loss": 1.1794,
"step": 1481
},
{
"epoch": 0.5841832963784184,
"grad_norm": 0.6487050264881453,
"learning_rate": 8.805146535376709e-06,
"loss": 1.2329,
"step": 1482
},
{
"epoch": 0.5845774821384577,
"grad_norm": 0.6866850553685105,
"learning_rate": 8.791479459125396e-06,
"loss": 1.2786,
"step": 1483
},
{
"epoch": 0.5849716678984972,
"grad_norm": 0.6241541462965179,
"learning_rate": 8.777814673332615e-06,
"loss": 1.1997,
"step": 1484
},
{
"epoch": 0.5853658536585366,
"grad_norm": 0.6488269216574984,
"learning_rate": 8.764152203896658e-06,
"loss": 1.1873,
"step": 1485
},
{
"epoch": 0.585760039418576,
"grad_norm": 0.6518659909159534,
"learning_rate": 8.750492076711439e-06,
"loss": 1.1964,
"step": 1486
},
{
"epoch": 0.5861542251786154,
"grad_norm": 0.6379498327658182,
"learning_rate": 8.736834317666428e-06,
"loss": 1.19,
"step": 1487
},
{
"epoch": 0.5865484109386548,
"grad_norm": 0.6073862610155873,
"learning_rate": 8.723178952646597e-06,
"loss": 1.1497,
"step": 1488
},
{
"epoch": 0.5869425966986943,
"grad_norm": 0.6335121996922907,
"learning_rate": 8.709526007532396e-06,
"loss": 1.1905,
"step": 1489
},
{
"epoch": 0.5873367824587337,
"grad_norm": 0.6478757542846147,
"learning_rate": 8.695875508199683e-06,
"loss": 1.1726,
"step": 1490
},
{
"epoch": 0.5877309682187731,
"grad_norm": 0.6801004693955225,
"learning_rate": 8.682227480519672e-06,
"loss": 1.1956,
"step": 1491
},
{
"epoch": 0.5881251539788125,
"grad_norm": 0.6869506155016226,
"learning_rate": 8.66858195035891e-06,
"loss": 1.2158,
"step": 1492
},
{
"epoch": 0.588519339738852,
"grad_norm": 0.6328860065449554,
"learning_rate": 8.654938943579194e-06,
"loss": 1.1986,
"step": 1493
},
{
"epoch": 0.5889135254988913,
"grad_norm": 0.6966371382556359,
"learning_rate": 8.641298486037543e-06,
"loss": 1.2219,
"step": 1494
},
{
"epoch": 0.5893077112589308,
"grad_norm": 0.6706456600510302,
"learning_rate": 8.627660603586157e-06,
"loss": 1.2992,
"step": 1495
},
{
"epoch": 0.5897018970189702,
"grad_norm": 0.6634528939701451,
"learning_rate": 8.614025322072338e-06,
"loss": 1.2412,
"step": 1496
},
{
"epoch": 0.5900960827790096,
"grad_norm": 0.6101971245071337,
"learning_rate": 8.600392667338465e-06,
"loss": 1.1347,
"step": 1497
},
{
"epoch": 0.5904902685390491,
"grad_norm": 0.640682969790413,
"learning_rate": 8.58676266522194e-06,
"loss": 1.2015,
"step": 1498
},
{
"epoch": 0.5908844542990884,
"grad_norm": 0.648892739773898,
"learning_rate": 8.573135341555138e-06,
"loss": 1.1751,
"step": 1499
},
{
"epoch": 0.5912786400591279,
"grad_norm": 0.6497240357012373,
"learning_rate": 8.55951072216536e-06,
"loss": 1.2231,
"step": 1500
},
{
"epoch": 0.5916728258191672,
"grad_norm": 0.653343396545042,
"learning_rate": 8.54588883287477e-06,
"loss": 1.1746,
"step": 1501
},
{
"epoch": 0.5920670115792067,
"grad_norm": 0.6432488267867399,
"learning_rate": 8.532269699500377e-06,
"loss": 1.1574,
"step": 1502
},
{
"epoch": 0.5924611973392461,
"grad_norm": 0.6545865486299587,
"learning_rate": 8.518653347853948e-06,
"loss": 1.2443,
"step": 1503
},
{
"epoch": 0.5928553830992855,
"grad_norm": 0.7869569426495164,
"learning_rate": 8.505039803741985e-06,
"loss": 1.2115,
"step": 1504
},
{
"epoch": 0.593249568859325,
"grad_norm": 0.61279157223736,
"learning_rate": 8.491429092965677e-06,
"loss": 1.1301,
"step": 1505
},
{
"epoch": 0.5936437546193644,
"grad_norm": 0.6584615054581199,
"learning_rate": 8.477821241320831e-06,
"loss": 1.1872,
"step": 1506
},
{
"epoch": 0.5940379403794038,
"grad_norm": 0.6651037222509211,
"learning_rate": 8.464216274597839e-06,
"loss": 1.1699,
"step": 1507
},
{
"epoch": 0.5944321261394432,
"grad_norm": 0.6192362295929023,
"learning_rate": 8.450614218581631e-06,
"loss": 1.2301,
"step": 1508
},
{
"epoch": 0.5948263118994827,
"grad_norm": 0.6063957302686086,
"learning_rate": 8.437015099051613e-06,
"loss": 1.1558,
"step": 1509
},
{
"epoch": 0.595220497659522,
"grad_norm": 0.6463493132821347,
"learning_rate": 8.42341894178163e-06,
"loss": 1.2595,
"step": 1510
},
{
"epoch": 0.5956146834195615,
"grad_norm": 0.6177688405321609,
"learning_rate": 8.409825772539905e-06,
"loss": 1.174,
"step": 1511
},
{
"epoch": 0.5960088691796009,
"grad_norm": 0.6181575708603189,
"learning_rate": 8.396235617089013e-06,
"loss": 1.1953,
"step": 1512
},
{
"epoch": 0.5964030549396403,
"grad_norm": 0.6232523590903218,
"learning_rate": 8.382648501185806e-06,
"loss": 1.2131,
"step": 1513
},
{
"epoch": 0.5967972406996798,
"grad_norm": 0.6853964780387746,
"learning_rate": 8.369064450581374e-06,
"loss": 1.2397,
"step": 1514
},
{
"epoch": 0.5971914264597191,
"grad_norm": 0.638261822593998,
"learning_rate": 8.355483491021007e-06,
"loss": 1.1697,
"step": 1515
},
{
"epoch": 0.5975856122197586,
"grad_norm": 0.6345858720982844,
"learning_rate": 8.341905648244122e-06,
"loss": 1.198,
"step": 1516
},
{
"epoch": 0.597979797979798,
"grad_norm": 0.6205371649965156,
"learning_rate": 8.328330947984243e-06,
"loss": 1.1509,
"step": 1517
},
{
"epoch": 0.5983739837398374,
"grad_norm": 0.6780688159415363,
"learning_rate": 8.314759415968936e-06,
"loss": 1.2359,
"step": 1518
},
{
"epoch": 0.5987681694998768,
"grad_norm": 0.6375070575615467,
"learning_rate": 8.301191077919753e-06,
"loss": 1.2035,
"step": 1519
},
{
"epoch": 0.5991623552599162,
"grad_norm": 0.622909906771207,
"learning_rate": 8.2876259595522e-06,
"loss": 1.2104,
"step": 1520
},
{
"epoch": 0.5995565410199557,
"grad_norm": 0.6094392519833095,
"learning_rate": 8.274064086575682e-06,
"loss": 1.1475,
"step": 1521
},
{
"epoch": 0.5999507267799951,
"grad_norm": 0.621252910798821,
"learning_rate": 8.260505484693449e-06,
"loss": 1.1864,
"step": 1522
},
{
"epoch": 0.6003449125400345,
"grad_norm": 0.6698438223208214,
"learning_rate": 8.246950179602554e-06,
"loss": 1.1991,
"step": 1523
},
{
"epoch": 0.6007390983000739,
"grad_norm": 0.6520795365380274,
"learning_rate": 8.2333981969938e-06,
"loss": 1.1769,
"step": 1524
},
{
"epoch": 0.6011332840601133,
"grad_norm": 0.6522360114294746,
"learning_rate": 8.219849562551695e-06,
"loss": 1.2025,
"step": 1525
},
{
"epoch": 0.6015274698201527,
"grad_norm": 0.6295823752577447,
"learning_rate": 8.206304301954397e-06,
"loss": 1.1339,
"step": 1526
},
{
"epoch": 0.6019216555801922,
"grad_norm": 0.6483586741712484,
"learning_rate": 8.192762440873675e-06,
"loss": 1.1893,
"step": 1527
},
{
"epoch": 0.6023158413402316,
"grad_norm": 0.6574976200875523,
"learning_rate": 8.179224004974857e-06,
"loss": 1.1948,
"step": 1528
},
{
"epoch": 0.602710027100271,
"grad_norm": 0.6592927070571326,
"learning_rate": 8.165689019916769e-06,
"loss": 1.1865,
"step": 1529
},
{
"epoch": 0.6031042128603105,
"grad_norm": 0.6602088196871608,
"learning_rate": 8.152157511351704e-06,
"loss": 1.2788,
"step": 1530
},
{
"epoch": 0.6034983986203498,
"grad_norm": 0.5966682622148229,
"learning_rate": 8.138629504925372e-06,
"loss": 1.1035,
"step": 1531
},
{
"epoch": 0.6038925843803893,
"grad_norm": 0.6472735298836796,
"learning_rate": 8.125105026276832e-06,
"loss": 1.2211,
"step": 1532
},
{
"epoch": 0.6042867701404286,
"grad_norm": 0.647741738867434,
"learning_rate": 8.111584101038462e-06,
"loss": 1.2187,
"step": 1533
},
{
"epoch": 0.6046809559004681,
"grad_norm": 0.6404826084219543,
"learning_rate": 8.098066754835916e-06,
"loss": 1.1788,
"step": 1534
},
{
"epoch": 0.6050751416605075,
"grad_norm": 0.6124100298486728,
"learning_rate": 8.084553013288048e-06,
"loss": 1.1426,
"step": 1535
},
{
"epoch": 0.6054693274205469,
"grad_norm": 0.6344901181171149,
"learning_rate": 8.071042902006896e-06,
"loss": 1.2431,
"step": 1536
},
{
"epoch": 0.6058635131805864,
"grad_norm": 0.6328920930143503,
"learning_rate": 8.057536446597598e-06,
"loss": 1.2025,
"step": 1537
},
{
"epoch": 0.6062576989406258,
"grad_norm": 0.6519280491300705,
"learning_rate": 8.044033672658387e-06,
"loss": 1.2351,
"step": 1538
},
{
"epoch": 0.6066518847006652,
"grad_norm": 0.6725946251767152,
"learning_rate": 8.0305346057805e-06,
"loss": 1.2485,
"step": 1539
},
{
"epoch": 0.6070460704607046,
"grad_norm": 0.657229000221368,
"learning_rate": 8.017039271548154e-06,
"loss": 1.1958,
"step": 1540
},
{
"epoch": 0.607440256220744,
"grad_norm": 0.63930798917721,
"learning_rate": 8.0035476955385e-06,
"loss": 1.2539,
"step": 1541
},
{
"epoch": 0.6078344419807834,
"grad_norm": 0.6356269105691521,
"learning_rate": 7.990059903321554e-06,
"loss": 1.174,
"step": 1542
},
{
"epoch": 0.6082286277408229,
"grad_norm": 0.6421402197109457,
"learning_rate": 7.97657592046016e-06,
"loss": 1.2085,
"step": 1543
},
{
"epoch": 0.6086228135008623,
"grad_norm": 0.6489422328975518,
"learning_rate": 7.96309577250996e-06,
"loss": 1.2387,
"step": 1544
},
{
"epoch": 0.6090169992609017,
"grad_norm": 0.6530006388057895,
"learning_rate": 7.949619485019307e-06,
"loss": 1.2009,
"step": 1545
},
{
"epoch": 0.6094111850209412,
"grad_norm": 0.6416958127168939,
"learning_rate": 7.936147083529245e-06,
"loss": 1.2154,
"step": 1546
},
{
"epoch": 0.6098053707809805,
"grad_norm": 0.6337303333525649,
"learning_rate": 7.922678593573462e-06,
"loss": 1.1974,
"step": 1547
},
{
"epoch": 0.61019955654102,
"grad_norm": 0.6637031259257837,
"learning_rate": 7.90921404067822e-06,
"loss": 1.2052,
"step": 1548
},
{
"epoch": 0.6105937423010593,
"grad_norm": 0.6473009660413165,
"learning_rate": 7.89575345036232e-06,
"loss": 1.2473,
"step": 1549
},
{
"epoch": 0.6109879280610988,
"grad_norm": 0.6261555671205469,
"learning_rate": 7.882296848137063e-06,
"loss": 1.2066,
"step": 1550
},
{
"epoch": 0.6113821138211382,
"grad_norm": 0.6177349103271258,
"learning_rate": 7.868844259506186e-06,
"loss": 1.1547,
"step": 1551
},
{
"epoch": 0.6117762995811776,
"grad_norm": 0.6264274304099752,
"learning_rate": 7.855395709965814e-06,
"loss": 1.2039,
"step": 1552
},
{
"epoch": 0.6121704853412171,
"grad_norm": 0.6208965372231373,
"learning_rate": 7.84195122500442e-06,
"loss": 1.1659,
"step": 1553
},
{
"epoch": 0.6125646711012565,
"grad_norm": 0.6182902432180839,
"learning_rate": 7.828510830102785e-06,
"loss": 1.1802,
"step": 1554
},
{
"epoch": 0.6129588568612959,
"grad_norm": 0.6010062493402437,
"learning_rate": 7.815074550733919e-06,
"loss": 1.1624,
"step": 1555
},
{
"epoch": 0.6133530426213353,
"grad_norm": 0.6100632398399762,
"learning_rate": 7.801642412363042e-06,
"loss": 1.1588,
"step": 1556
},
{
"epoch": 0.6137472283813747,
"grad_norm": 0.6244968785224004,
"learning_rate": 7.788214440447532e-06,
"loss": 1.16,
"step": 1557
},
{
"epoch": 0.6141414141414141,
"grad_norm": 0.6262394381187797,
"learning_rate": 7.774790660436857e-06,
"loss": 1.1379,
"step": 1558
},
{
"epoch": 0.6145355999014536,
"grad_norm": 0.6268360201286511,
"learning_rate": 7.761371097772548e-06,
"loss": 1.1632,
"step": 1559
},
{
"epoch": 0.614929785661493,
"grad_norm": 0.6450865669879012,
"learning_rate": 7.747955777888145e-06,
"loss": 1.1762,
"step": 1560
},
{
"epoch": 0.6153239714215324,
"grad_norm": 0.6424738031868468,
"learning_rate": 7.734544726209143e-06,
"loss": 1.1559,
"step": 1561
},
{
"epoch": 0.6157181571815719,
"grad_norm": 0.637950698301497,
"learning_rate": 7.721137968152944e-06,
"loss": 1.1831,
"step": 1562
},
{
"epoch": 0.6161123429416112,
"grad_norm": 0.6186538417807995,
"learning_rate": 7.707735529128819e-06,
"loss": 1.1962,
"step": 1563
},
{
"epoch": 0.6165065287016507,
"grad_norm": 0.6181805636977189,
"learning_rate": 7.694337434537856e-06,
"loss": 1.1768,
"step": 1564
},
{
"epoch": 0.61690071446169,
"grad_norm": 0.6254768111350152,
"learning_rate": 7.680943709772899e-06,
"loss": 1.1604,
"step": 1565
},
{
"epoch": 0.6172949002217295,
"grad_norm": 0.644104659671372,
"learning_rate": 7.667554380218513e-06,
"loss": 1.2107,
"step": 1566
},
{
"epoch": 0.6176890859817689,
"grad_norm": 0.6537180884599917,
"learning_rate": 7.654169471250945e-06,
"loss": 1.2834,
"step": 1567
},
{
"epoch": 0.6180832717418083,
"grad_norm": 0.6361808370235917,
"learning_rate": 7.640789008238044e-06,
"loss": 1.1062,
"step": 1568
},
{
"epoch": 0.6184774575018478,
"grad_norm": 0.6523288827402758,
"learning_rate": 7.627413016539247e-06,
"loss": 1.1986,
"step": 1569
},
{
"epoch": 0.6188716432618871,
"grad_norm": 0.6285054549406514,
"learning_rate": 7.614041521505517e-06,
"loss": 1.1758,
"step": 1570
},
{
"epoch": 0.6192658290219266,
"grad_norm": 0.6272952169331758,
"learning_rate": 7.6006745484792855e-06,
"loss": 1.1788,
"step": 1571
},
{
"epoch": 0.619660014781966,
"grad_norm": 0.6500656109205114,
"learning_rate": 7.587312122794414e-06,
"loss": 1.2231,
"step": 1572
},
{
"epoch": 0.6200542005420054,
"grad_norm": 0.6954118875061881,
"learning_rate": 7.5739542697761615e-06,
"loss": 1.2549,
"step": 1573
},
{
"epoch": 0.6204483863020448,
"grad_norm": 0.6226893727767379,
"learning_rate": 7.560601014741103e-06,
"loss": 1.1388,
"step": 1574
},
{
"epoch": 0.6208425720620843,
"grad_norm": 0.6505634755873115,
"learning_rate": 7.547252382997101e-06,
"loss": 1.2098,
"step": 1575
},
{
"epoch": 0.6212367578221237,
"grad_norm": 0.6498328807173522,
"learning_rate": 7.533908399843266e-06,
"loss": 1.1734,
"step": 1576
},
{
"epoch": 0.6216309435821631,
"grad_norm": 0.6761129099478455,
"learning_rate": 7.520569090569894e-06,
"loss": 1.1757,
"step": 1577
},
{
"epoch": 0.6220251293422026,
"grad_norm": 0.6971630762485974,
"learning_rate": 7.507234480458414e-06,
"loss": 1.2566,
"step": 1578
},
{
"epoch": 0.6224193151022419,
"grad_norm": 0.6237942794960373,
"learning_rate": 7.493904594781358e-06,
"loss": 1.1296,
"step": 1579
},
{
"epoch": 0.6228135008622814,
"grad_norm": 0.6295586177215396,
"learning_rate": 7.4805794588023086e-06,
"loss": 1.1169,
"step": 1580
},
{
"epoch": 0.6232076866223207,
"grad_norm": 0.6408732189903159,
"learning_rate": 7.4672590977758295e-06,
"loss": 1.1301,
"step": 1581
},
{
"epoch": 0.6236018723823602,
"grad_norm": 0.6771354689742808,
"learning_rate": 7.45394353694745e-06,
"loss": 1.2348,
"step": 1582
},
{
"epoch": 0.6239960581423996,
"grad_norm": 0.640613127950835,
"learning_rate": 7.4406328015536e-06,
"loss": 1.196,
"step": 1583
},
{
"epoch": 0.624390243902439,
"grad_norm": 0.650879151108994,
"learning_rate": 7.427326916821557e-06,
"loss": 1.1784,
"step": 1584
},
{
"epoch": 0.6247844296624785,
"grad_norm": 0.6596072847031024,
"learning_rate": 7.414025907969404e-06,
"loss": 1.2214,
"step": 1585
},
{
"epoch": 0.6251786154225178,
"grad_norm": 0.6278635059421687,
"learning_rate": 7.4007298002059965e-06,
"loss": 1.1567,
"step": 1586
},
{
"epoch": 0.6255728011825573,
"grad_norm": 0.6225891858209661,
"learning_rate": 7.387438618730891e-06,
"loss": 1.1644,
"step": 1587
},
{
"epoch": 0.6259669869425967,
"grad_norm": 0.6387712671736495,
"learning_rate": 7.3741523887343015e-06,
"loss": 1.1932,
"step": 1588
},
{
"epoch": 0.6263611727026361,
"grad_norm": 0.6731157388955487,
"learning_rate": 7.360871135397072e-06,
"loss": 1.2878,
"step": 1589
},
{
"epoch": 0.6267553584626755,
"grad_norm": 0.6067881423807671,
"learning_rate": 7.347594883890608e-06,
"loss": 1.1341,
"step": 1590
},
{
"epoch": 0.627149544222715,
"grad_norm": 0.6315807367438574,
"learning_rate": 7.3343236593768295e-06,
"loss": 1.15,
"step": 1591
},
{
"epoch": 0.6275437299827544,
"grad_norm": 0.6828787333827238,
"learning_rate": 7.321057487008136e-06,
"loss": 1.2797,
"step": 1592
},
{
"epoch": 0.6279379157427938,
"grad_norm": 0.636378285588495,
"learning_rate": 7.307796391927356e-06,
"loss": 1.2114,
"step": 1593
},
{
"epoch": 0.6283321015028333,
"grad_norm": 0.6227706869499603,
"learning_rate": 7.294540399267682e-06,
"loss": 1.2107,
"step": 1594
},
{
"epoch": 0.6287262872628726,
"grad_norm": 0.6542527940502086,
"learning_rate": 7.281289534152644e-06,
"loss": 1.1301,
"step": 1595
},
{
"epoch": 0.6291204730229121,
"grad_norm": 0.6481496871980028,
"learning_rate": 7.268043821696062e-06,
"loss": 1.2319,
"step": 1596
},
{
"epoch": 0.6295146587829514,
"grad_norm": 0.6445223927771241,
"learning_rate": 7.254803287001975e-06,
"loss": 1.2334,
"step": 1597
},
{
"epoch": 0.6299088445429909,
"grad_norm": 0.6329838727914758,
"learning_rate": 7.24156795516461e-06,
"loss": 1.1496,
"step": 1598
},
{
"epoch": 0.6303030303030303,
"grad_norm": 0.6299335180741068,
"learning_rate": 7.22833785126835e-06,
"loss": 1.184,
"step": 1599
},
{
"epoch": 0.6306972160630697,
"grad_norm": 0.6284096678702693,
"learning_rate": 7.215113000387654e-06,
"loss": 1.254,
"step": 1600
},
{
"epoch": 0.6310914018231092,
"grad_norm": 0.6324689716112708,
"learning_rate": 7.201893427587026e-06,
"loss": 1.1721,
"step": 1601
},
{
"epoch": 0.6314855875831485,
"grad_norm": 0.6858753419716495,
"learning_rate": 7.188679157920977e-06,
"loss": 1.1898,
"step": 1602
},
{
"epoch": 0.631879773343188,
"grad_norm": 0.6556988105872994,
"learning_rate": 7.1754702164339575e-06,
"loss": 1.2545,
"step": 1603
},
{
"epoch": 0.6322739591032274,
"grad_norm": 0.6195080831875678,
"learning_rate": 7.1622666281603235e-06,
"loss": 1.2272,
"step": 1604
},
{
"epoch": 0.6326681448632668,
"grad_norm": 0.6586395858980946,
"learning_rate": 7.149068418124281e-06,
"loss": 1.2194,
"step": 1605
},
{
"epoch": 0.6330623306233062,
"grad_norm": 0.6447888871223056,
"learning_rate": 7.1358756113398545e-06,
"loss": 1.2575,
"step": 1606
},
{
"epoch": 0.6334565163833457,
"grad_norm": 0.60959438103777,
"learning_rate": 7.122688232810815e-06,
"loss": 1.2215,
"step": 1607
},
{
"epoch": 0.6338507021433851,
"grad_norm": 0.6336168777241095,
"learning_rate": 7.109506307530646e-06,
"loss": 1.2274,
"step": 1608
},
{
"epoch": 0.6342448879034245,
"grad_norm": 0.6166032302997211,
"learning_rate": 7.096329860482507e-06,
"loss": 1.2061,
"step": 1609
},
{
"epoch": 0.634639073663464,
"grad_norm": 0.6674971360893448,
"learning_rate": 7.083158916639169e-06,
"loss": 1.3014,
"step": 1610
},
{
"epoch": 0.6350332594235033,
"grad_norm": 0.6542997563204203,
"learning_rate": 7.069993500962964e-06,
"loss": 1.139,
"step": 1611
},
{
"epoch": 0.6354274451835428,
"grad_norm": 0.6233870945052585,
"learning_rate": 7.056833638405762e-06,
"loss": 1.1705,
"step": 1612
},
{
"epoch": 0.6358216309435821,
"grad_norm": 0.6532480222627909,
"learning_rate": 7.043679353908901e-06,
"loss": 1.2109,
"step": 1613
},
{
"epoch": 0.6362158167036216,
"grad_norm": 0.6249185015676082,
"learning_rate": 7.0305306724031396e-06,
"loss": 1.1821,
"step": 1614
},
{
"epoch": 0.636610002463661,
"grad_norm": 0.6218410031542252,
"learning_rate": 7.017387618808634e-06,
"loss": 1.1483,
"step": 1615
},
{
"epoch": 0.6370041882237004,
"grad_norm": 0.6490684142962722,
"learning_rate": 7.0042502180348635e-06,
"loss": 1.2157,
"step": 1616
},
{
"epoch": 0.6373983739837399,
"grad_norm": 0.6034827634471542,
"learning_rate": 6.991118494980591e-06,
"loss": 1.1842,
"step": 1617
},
{
"epoch": 0.6377925597437792,
"grad_norm": 0.6274462711346118,
"learning_rate": 6.977992474533823e-06,
"loss": 1.2361,
"step": 1618
},
{
"epoch": 0.6381867455038187,
"grad_norm": 0.6760850255550227,
"learning_rate": 6.964872181571765e-06,
"loss": 1.1862,
"step": 1619
},
{
"epoch": 0.6385809312638581,
"grad_norm": 0.6396402151072694,
"learning_rate": 6.9517576409607545e-06,
"loss": 1.2231,
"step": 1620
},
{
"epoch": 0.6389751170238975,
"grad_norm": 0.6338829150069218,
"learning_rate": 6.938648877556231e-06,
"loss": 1.2246,
"step": 1621
},
{
"epoch": 0.6393693027839369,
"grad_norm": 0.6473593135129597,
"learning_rate": 6.925545916202692e-06,
"loss": 1.2431,
"step": 1622
},
{
"epoch": 0.6397634885439764,
"grad_norm": 0.6401312934763702,
"learning_rate": 6.912448781733633e-06,
"loss": 1.2157,
"step": 1623
},
{
"epoch": 0.6401576743040158,
"grad_norm": 0.6399148681302655,
"learning_rate": 6.8993574989714995e-06,
"loss": 1.1838,
"step": 1624
},
{
"epoch": 0.6405518600640552,
"grad_norm": 0.5966358662573188,
"learning_rate": 6.88627209272766e-06,
"loss": 1.1593,
"step": 1625
},
{
"epoch": 0.6409460458240946,
"grad_norm": 0.6516019968106155,
"learning_rate": 6.87319258780234e-06,
"loss": 1.1743,
"step": 1626
},
{
"epoch": 0.641340231584134,
"grad_norm": 0.623888477031532,
"learning_rate": 6.860119008984569e-06,
"loss": 1.2352,
"step": 1627
},
{
"epoch": 0.6417344173441735,
"grad_norm": 0.6462585435255515,
"learning_rate": 6.847051381052165e-06,
"loss": 1.1955,
"step": 1628
},
{
"epoch": 0.6421286031042128,
"grad_norm": 0.6285337684977241,
"learning_rate": 6.833989728771657e-06,
"loss": 1.2102,
"step": 1629
},
{
"epoch": 0.6425227888642523,
"grad_norm": 0.6313390139589669,
"learning_rate": 6.820934076898247e-06,
"loss": 1.209,
"step": 1630
},
{
"epoch": 0.6429169746242916,
"grad_norm": 0.6219389731857671,
"learning_rate": 6.8078844501757625e-06,
"loss": 1.1647,
"step": 1631
},
{
"epoch": 0.6433111603843311,
"grad_norm": 0.6255385020113866,
"learning_rate": 6.794840873336622e-06,
"loss": 1.2185,
"step": 1632
},
{
"epoch": 0.6437053461443706,
"grad_norm": 0.6214536562298445,
"learning_rate": 6.781803371101774e-06,
"loss": 1.2235,
"step": 1633
},
{
"epoch": 0.6440995319044099,
"grad_norm": 0.6520907124359351,
"learning_rate": 6.768771968180643e-06,
"loss": 1.2638,
"step": 1634
},
{
"epoch": 0.6444937176644494,
"grad_norm": 0.6349696744735929,
"learning_rate": 6.755746689271112e-06,
"loss": 1.2064,
"step": 1635
},
{
"epoch": 0.6448879034244888,
"grad_norm": 0.6202351218573725,
"learning_rate": 6.742727559059448e-06,
"loss": 1.2017,
"step": 1636
},
{
"epoch": 0.6452820891845282,
"grad_norm": 0.6114039580216786,
"learning_rate": 6.729714602220256e-06,
"loss": 1.1862,
"step": 1637
},
{
"epoch": 0.6456762749445676,
"grad_norm": 0.6747317843915315,
"learning_rate": 6.71670784341646e-06,
"loss": 1.2687,
"step": 1638
},
{
"epoch": 0.646070460704607,
"grad_norm": 0.6221379676750881,
"learning_rate": 6.703707307299224e-06,
"loss": 1.1739,
"step": 1639
},
{
"epoch": 0.6464646464646465,
"grad_norm": 0.6067484985660325,
"learning_rate": 6.690713018507917e-06,
"loss": 1.1716,
"step": 1640
},
{
"epoch": 0.6468588322246859,
"grad_norm": 0.6646806120765326,
"learning_rate": 6.677725001670078e-06,
"loss": 1.2563,
"step": 1641
},
{
"epoch": 0.6472530179847253,
"grad_norm": 0.6381676236429237,
"learning_rate": 6.664743281401351e-06,
"loss": 1.2079,
"step": 1642
},
{
"epoch": 0.6476472037447647,
"grad_norm": 0.6325821061959688,
"learning_rate": 6.651767882305447e-06,
"loss": 1.1695,
"step": 1643
},
{
"epoch": 0.6480413895048042,
"grad_norm": 0.6475669717517898,
"learning_rate": 6.6387988289741e-06,
"loss": 1.2316,
"step": 1644
},
{
"epoch": 0.6484355752648435,
"grad_norm": 0.6328642670845832,
"learning_rate": 6.625836145987015e-06,
"loss": 1.187,
"step": 1645
},
{
"epoch": 0.648829761024883,
"grad_norm": 0.6356937277383269,
"learning_rate": 6.612879857911825e-06,
"loss": 1.1713,
"step": 1646
},
{
"epoch": 0.6492239467849223,
"grad_norm": 0.6286143776886958,
"learning_rate": 6.599929989304034e-06,
"loss": 1.1949,
"step": 1647
},
{
"epoch": 0.6496181325449618,
"grad_norm": 0.6251531191060387,
"learning_rate": 6.5869865647069995e-06,
"loss": 1.1918,
"step": 1648
},
{
"epoch": 0.6500123183050013,
"grad_norm": 0.6111849191258597,
"learning_rate": 6.574049608651849e-06,
"loss": 1.1922,
"step": 1649
},
{
"epoch": 0.6504065040650406,
"grad_norm": 0.6172328892977227,
"learning_rate": 6.561119145657451e-06,
"loss": 1.2013,
"step": 1650
},
{
"epoch": 0.6508006898250801,
"grad_norm": 0.6563068727145971,
"learning_rate": 6.548195200230376e-06,
"loss": 1.1936,
"step": 1651
},
{
"epoch": 0.6511948755851195,
"grad_norm": 0.6451511184566149,
"learning_rate": 6.535277796864842e-06,
"loss": 1.1765,
"step": 1652
},
{
"epoch": 0.6515890613451589,
"grad_norm": 0.6148495858039739,
"learning_rate": 6.522366960042654e-06,
"loss": 1.1506,
"step": 1653
},
{
"epoch": 0.6519832471051983,
"grad_norm": 0.6125300863917666,
"learning_rate": 6.509462714233194e-06,
"loss": 1.1669,
"step": 1654
},
{
"epoch": 0.6523774328652377,
"grad_norm": 0.630309988193399,
"learning_rate": 6.496565083893333e-06,
"loss": 1.1889,
"step": 1655
},
{
"epoch": 0.6527716186252772,
"grad_norm": 0.6634157824387188,
"learning_rate": 6.483674093467409e-06,
"loss": 1.2278,
"step": 1656
},
{
"epoch": 0.6531658043853166,
"grad_norm": 0.631045534805432,
"learning_rate": 6.470789767387188e-06,
"loss": 1.1569,
"step": 1657
},
{
"epoch": 0.653559990145356,
"grad_norm": 0.6445024253655253,
"learning_rate": 6.457912130071786e-06,
"loss": 1.2291,
"step": 1658
},
{
"epoch": 0.6539541759053954,
"grad_norm": 0.6295685120939664,
"learning_rate": 6.445041205927658e-06,
"loss": 1.1953,
"step": 1659
},
{
"epoch": 0.6543483616654349,
"grad_norm": 0.6095510411838025,
"learning_rate": 6.432177019348521e-06,
"loss": 1.2001,
"step": 1660
},
{
"epoch": 0.6547425474254742,
"grad_norm": 0.6444146297988372,
"learning_rate": 6.419319594715338e-06,
"loss": 1.244,
"step": 1661
},
{
"epoch": 0.6551367331855137,
"grad_norm": 0.6104207832263667,
"learning_rate": 6.4064689563962505e-06,
"loss": 1.1556,
"step": 1662
},
{
"epoch": 0.655530918945553,
"grad_norm": 0.6326952360287978,
"learning_rate": 6.393625128746527e-06,
"loss": 1.1521,
"step": 1663
},
{
"epoch": 0.6559251047055925,
"grad_norm": 0.640334858610275,
"learning_rate": 6.3807881361085465e-06,
"loss": 1.181,
"step": 1664
},
{
"epoch": 0.656319290465632,
"grad_norm": 0.6504217808929613,
"learning_rate": 6.367958002811726e-06,
"loss": 1.1974,
"step": 1665
},
{
"epoch": 0.6567134762256713,
"grad_norm": 0.6529534715347126,
"learning_rate": 6.355134753172474e-06,
"loss": 1.1889,
"step": 1666
},
{
"epoch": 0.6571076619857108,
"grad_norm": 0.6654769765183821,
"learning_rate": 6.3423184114941686e-06,
"loss": 1.1865,
"step": 1667
},
{
"epoch": 0.6575018477457502,
"grad_norm": 0.6436155169730803,
"learning_rate": 6.32950900206708e-06,
"loss": 1.1647,
"step": 1668
},
{
"epoch": 0.6578960335057896,
"grad_norm": 0.6503660356165931,
"learning_rate": 6.31670654916835e-06,
"loss": 1.1674,
"step": 1669
},
{
"epoch": 0.658290219265829,
"grad_norm": 0.6608765081904892,
"learning_rate": 6.303911077061937e-06,
"loss": 1.2069,
"step": 1670
},
{
"epoch": 0.6586844050258684,
"grad_norm": 0.6417814536413016,
"learning_rate": 6.291122609998559e-06,
"loss": 1.2464,
"step": 1671
},
{
"epoch": 0.6590785907859079,
"grad_norm": 0.6676289218023853,
"learning_rate": 6.278341172215669e-06,
"loss": 1.2228,
"step": 1672
},
{
"epoch": 0.6594727765459473,
"grad_norm": 0.6280886790009287,
"learning_rate": 6.265566787937386e-06,
"loss": 1.1968,
"step": 1673
},
{
"epoch": 0.6598669623059867,
"grad_norm": 0.6483564238116941,
"learning_rate": 6.252799481374472e-06,
"loss": 1.2109,
"step": 1674
},
{
"epoch": 0.6602611480660261,
"grad_norm": 0.6189215649081374,
"learning_rate": 6.240039276724273e-06,
"loss": 1.196,
"step": 1675
},
{
"epoch": 0.6606553338260656,
"grad_norm": 0.6496483405660746,
"learning_rate": 6.227286198170663e-06,
"loss": 1.2246,
"step": 1676
},
{
"epoch": 0.6610495195861049,
"grad_norm": 0.6436584140179482,
"learning_rate": 6.214540269884026e-06,
"loss": 1.2284,
"step": 1677
},
{
"epoch": 0.6614437053461444,
"grad_norm": 0.6076777270904066,
"learning_rate": 6.20180151602119e-06,
"loss": 1.1942,
"step": 1678
},
{
"epoch": 0.6618378911061837,
"grad_norm": 0.636033416189757,
"learning_rate": 6.189069960725375e-06,
"loss": 1.1675,
"step": 1679
},
{
"epoch": 0.6622320768662232,
"grad_norm": 0.6396164730580286,
"learning_rate": 6.176345628126176e-06,
"loss": 1.1487,
"step": 1680
},
{
"epoch": 0.6626262626262627,
"grad_norm": 0.6015028228353986,
"learning_rate": 6.163628542339482e-06,
"loss": 1.1619,
"step": 1681
},
{
"epoch": 0.663020448386302,
"grad_norm": 0.6749292049019211,
"learning_rate": 6.150918727467455e-06,
"loss": 1.254,
"step": 1682
},
{
"epoch": 0.6634146341463415,
"grad_norm": 0.6328636162023467,
"learning_rate": 6.138216207598484e-06,
"loss": 1.2299,
"step": 1683
},
{
"epoch": 0.6638088199063809,
"grad_norm": 0.6214587756005278,
"learning_rate": 6.125521006807116e-06,
"loss": 1.2219,
"step": 1684
},
{
"epoch": 0.6642030056664203,
"grad_norm": 0.6537286104808447,
"learning_rate": 6.112833149154042e-06,
"loss": 1.2113,
"step": 1685
},
{
"epoch": 0.6645971914264597,
"grad_norm": 0.609872538457475,
"learning_rate": 6.10015265868602e-06,
"loss": 1.1715,
"step": 1686
},
{
"epoch": 0.6649913771864991,
"grad_norm": 0.6494731629680189,
"learning_rate": 6.0874795594358635e-06,
"loss": 1.2314,
"step": 1687
},
{
"epoch": 0.6653855629465386,
"grad_norm": 0.632923311793017,
"learning_rate": 6.0748138754223665e-06,
"loss": 1.1768,
"step": 1688
},
{
"epoch": 0.665779748706578,
"grad_norm": 0.6247202140755514,
"learning_rate": 6.062155630650265e-06,
"loss": 1.1812,
"step": 1689
},
{
"epoch": 0.6661739344666174,
"grad_norm": 0.631382377815529,
"learning_rate": 6.04950484911021e-06,
"loss": 1.1885,
"step": 1690
},
{
"epoch": 0.6665681202266568,
"grad_norm": 0.6138459038575285,
"learning_rate": 6.036861554778695e-06,
"loss": 1.1024,
"step": 1691
},
{
"epoch": 0.6669623059866963,
"grad_norm": 0.6265529929087996,
"learning_rate": 6.024225771618024e-06,
"loss": 1.1803,
"step": 1692
},
{
"epoch": 0.6673564917467356,
"grad_norm": 0.6227616940366973,
"learning_rate": 6.01159752357628e-06,
"loss": 1.2006,
"step": 1693
},
{
"epoch": 0.6677506775067751,
"grad_norm": 0.6558790947502295,
"learning_rate": 5.998976834587246e-06,
"loss": 1.2862,
"step": 1694
},
{
"epoch": 0.6681448632668144,
"grad_norm": 0.6304744900349945,
"learning_rate": 5.98636372857039e-06,
"loss": 1.1633,
"step": 1695
},
{
"epoch": 0.6685390490268539,
"grad_norm": 0.6318297859034908,
"learning_rate": 5.973758229430806e-06,
"loss": 1.2295,
"step": 1696
},
{
"epoch": 0.6689332347868934,
"grad_norm": 0.5988437549278761,
"learning_rate": 5.961160361059168e-06,
"loss": 1.1157,
"step": 1697
},
{
"epoch": 0.6693274205469327,
"grad_norm": 0.6137920151619946,
"learning_rate": 5.9485701473316925e-06,
"loss": 1.1448,
"step": 1698
},
{
"epoch": 0.6697216063069722,
"grad_norm": 0.6329970134758367,
"learning_rate": 5.935987612110081e-06,
"loss": 1.1792,
"step": 1699
},
{
"epoch": 0.6701157920670116,
"grad_norm": 0.6102586025760833,
"learning_rate": 5.923412779241493e-06,
"loss": 1.1214,
"step": 1700
},
{
"epoch": 0.670509977827051,
"grad_norm": 0.6016261422928656,
"learning_rate": 5.910845672558483e-06,
"loss": 1.1718,
"step": 1701
},
{
"epoch": 0.6709041635870904,
"grad_norm": 0.6144263728280865,
"learning_rate": 5.8982863158789605e-06,
"loss": 1.1613,
"step": 1702
},
{
"epoch": 0.6712983493471298,
"grad_norm": 0.621741539871381,
"learning_rate": 5.8857347330061545e-06,
"loss": 1.2034,
"step": 1703
},
{
"epoch": 0.6716925351071693,
"grad_norm": 0.6395204468391608,
"learning_rate": 5.873190947728552e-06,
"loss": 1.2198,
"step": 1704
},
{
"epoch": 0.6720867208672087,
"grad_norm": 0.606550147222352,
"learning_rate": 5.860654983819865e-06,
"loss": 1.1776,
"step": 1705
},
{
"epoch": 0.6724809066272481,
"grad_norm": 0.61755989526117,
"learning_rate": 5.84812686503899e-06,
"loss": 1.2269,
"step": 1706
},
{
"epoch": 0.6728750923872875,
"grad_norm": 0.7087998957119107,
"learning_rate": 5.83560661512994e-06,
"loss": 1.2204,
"step": 1707
},
{
"epoch": 0.673269278147327,
"grad_norm": 0.6413367764373633,
"learning_rate": 5.823094257821822e-06,
"loss": 1.1834,
"step": 1708
},
{
"epoch": 0.6736634639073663,
"grad_norm": 0.6157486461013707,
"learning_rate": 5.810589816828786e-06,
"loss": 1.1602,
"step": 1709
},
{
"epoch": 0.6740576496674058,
"grad_norm": 0.6342496529809019,
"learning_rate": 5.798093315849984e-06,
"loss": 1.2135,
"step": 1710
},
{
"epoch": 0.6744518354274451,
"grad_norm": 0.6117339478605194,
"learning_rate": 5.785604778569505e-06,
"loss": 1.177,
"step": 1711
},
{
"epoch": 0.6748460211874846,
"grad_norm": 0.6360723349056584,
"learning_rate": 5.773124228656348e-06,
"loss": 1.2873,
"step": 1712
},
{
"epoch": 0.6752402069475241,
"grad_norm": 0.6302819005649393,
"learning_rate": 5.76065168976439e-06,
"loss": 1.1972,
"step": 1713
},
{
"epoch": 0.6756343927075634,
"grad_norm": 0.6224162266525995,
"learning_rate": 5.748187185532306e-06,
"loss": 1.1855,
"step": 1714
},
{
"epoch": 0.6760285784676029,
"grad_norm": 0.6281722704464516,
"learning_rate": 5.73573073958355e-06,
"loss": 1.1815,
"step": 1715
},
{
"epoch": 0.6764227642276422,
"grad_norm": 0.6081887852352087,
"learning_rate": 5.723282375526302e-06,
"loss": 1.1804,
"step": 1716
},
{
"epoch": 0.6768169499876817,
"grad_norm": 0.6352236721472015,
"learning_rate": 5.7108421169534376e-06,
"loss": 1.1534,
"step": 1717
},
{
"epoch": 0.6772111357477211,
"grad_norm": 0.5979382590678716,
"learning_rate": 5.698409987442448e-06,
"loss": 1.1452,
"step": 1718
},
{
"epoch": 0.6776053215077605,
"grad_norm": 0.6036448112025448,
"learning_rate": 5.685986010555437e-06,
"loss": 1.1876,
"step": 1719
},
{
"epoch": 0.6779995072678,
"grad_norm": 0.6219506058018258,
"learning_rate": 5.6735702098390454e-06,
"loss": 1.2324,
"step": 1720
},
{
"epoch": 0.6783936930278394,
"grad_norm": 0.6263654931652052,
"learning_rate": 5.66116260882442e-06,
"loss": 1.1572,
"step": 1721
},
{
"epoch": 0.6787878787878788,
"grad_norm": 0.6278411193914041,
"learning_rate": 5.648763231027171e-06,
"loss": 1.1307,
"step": 1722
},
{
"epoch": 0.6791820645479182,
"grad_norm": 0.6294069087185388,
"learning_rate": 5.636372099947327e-06,
"loss": 1.2278,
"step": 1723
},
{
"epoch": 0.6795762503079577,
"grad_norm": 0.6296558801771532,
"learning_rate": 5.623989239069275e-06,
"loss": 1.1627,
"step": 1724
},
{
"epoch": 0.679970436067997,
"grad_norm": 0.6385637803835064,
"learning_rate": 5.611614671861733e-06,
"loss": 1.1481,
"step": 1725
},
{
"epoch": 0.6803646218280365,
"grad_norm": 0.6307923826155407,
"learning_rate": 5.5992484217777074e-06,
"loss": 1.2114,
"step": 1726
},
{
"epoch": 0.6807588075880758,
"grad_norm": 0.6040246463542289,
"learning_rate": 5.5868905122544344e-06,
"loss": 1.2137,
"step": 1727
},
{
"epoch": 0.6811529933481153,
"grad_norm": 0.6139446753066389,
"learning_rate": 5.574540966713338e-06,
"loss": 1.1472,
"step": 1728
},
{
"epoch": 0.6815471791081548,
"grad_norm": 0.6430020863098516,
"learning_rate": 5.562199808560001e-06,
"loss": 1.2109,
"step": 1729
},
{
"epoch": 0.6819413648681941,
"grad_norm": 0.6061201727927807,
"learning_rate": 5.549867061184108e-06,
"loss": 1.1718,
"step": 1730
},
{
"epoch": 0.6823355506282336,
"grad_norm": 0.6422178072097416,
"learning_rate": 5.5375427479593945e-06,
"loss": 1.1794,
"step": 1731
},
{
"epoch": 0.682729736388273,
"grad_norm": 0.6458731861630423,
"learning_rate": 5.525226892243623e-06,
"loss": 1.2502,
"step": 1732
},
{
"epoch": 0.6831239221483124,
"grad_norm": 0.631975611730984,
"learning_rate": 5.5129195173785184e-06,
"loss": 1.224,
"step": 1733
},
{
"epoch": 0.6835181079083518,
"grad_norm": 0.639062643993908,
"learning_rate": 5.50062064668973e-06,
"loss": 1.2374,
"step": 1734
},
{
"epoch": 0.6839122936683912,
"grad_norm": 0.6153286588995233,
"learning_rate": 5.488330303486795e-06,
"loss": 1.1532,
"step": 1735
},
{
"epoch": 0.6843064794284307,
"grad_norm": 0.6095750520956184,
"learning_rate": 5.4760485110630956e-06,
"loss": 1.1539,
"step": 1736
},
{
"epoch": 0.6847006651884701,
"grad_norm": 0.6242095926386367,
"learning_rate": 5.46377529269579e-06,
"loss": 1.1842,
"step": 1737
},
{
"epoch": 0.6850948509485095,
"grad_norm": 0.6373500217851757,
"learning_rate": 5.451510671645806e-06,
"loss": 1.2564,
"step": 1738
},
{
"epoch": 0.6854890367085489,
"grad_norm": 0.6528326441972604,
"learning_rate": 5.439254671157764e-06,
"loss": 1.2031,
"step": 1739
},
{
"epoch": 0.6858832224685883,
"grad_norm": 0.6265646534423697,
"learning_rate": 5.427007314459949e-06,
"loss": 1.2276,
"step": 1740
},
{
"epoch": 0.6862774082286277,
"grad_norm": 0.6155975267249686,
"learning_rate": 5.414768624764262e-06,
"loss": 1.168,
"step": 1741
},
{
"epoch": 0.6866715939886672,
"grad_norm": 0.6407827075088298,
"learning_rate": 5.402538625266184e-06,
"loss": 1.2118,
"step": 1742
},
{
"epoch": 0.6870657797487065,
"grad_norm": 0.6203929435962302,
"learning_rate": 5.390317339144726e-06,
"loss": 1.1711,
"step": 1743
},
{
"epoch": 0.687459965508746,
"grad_norm": 0.6296758413992221,
"learning_rate": 5.378104789562373e-06,
"loss": 1.1671,
"step": 1744
},
{
"epoch": 0.6878541512687855,
"grad_norm": 0.6402560327012314,
"learning_rate": 5.3659009996650704e-06,
"loss": 1.2331,
"step": 1745
},
{
"epoch": 0.6882483370288248,
"grad_norm": 0.6352813958888808,
"learning_rate": 5.353705992582147e-06,
"loss": 1.171,
"step": 1746
},
{
"epoch": 0.6886425227888643,
"grad_norm": 0.6173013307650468,
"learning_rate": 5.341519791426285e-06,
"loss": 1.1872,
"step": 1747
},
{
"epoch": 0.6890367085489036,
"grad_norm": 0.6300579221159313,
"learning_rate": 5.329342419293488e-06,
"loss": 1.1538,
"step": 1748
},
{
"epoch": 0.6894308943089431,
"grad_norm": 0.6452484286067051,
"learning_rate": 5.3171738992630266e-06,
"loss": 1.1983,
"step": 1749
},
{
"epoch": 0.6898250800689825,
"grad_norm": 0.6351697766210709,
"learning_rate": 5.305014254397378e-06,
"loss": 1.2099,
"step": 1750
},
{
"epoch": 0.6902192658290219,
"grad_norm": 0.6059437488402356,
"learning_rate": 5.292863507742218e-06,
"loss": 1.1429,
"step": 1751
},
{
"epoch": 0.6906134515890614,
"grad_norm": 0.6375500404238919,
"learning_rate": 5.280721682326349e-06,
"loss": 1.195,
"step": 1752
},
{
"epoch": 0.6910076373491008,
"grad_norm": 0.6214302914583397,
"learning_rate": 5.268588801161661e-06,
"loss": 1.1562,
"step": 1753
},
{
"epoch": 0.6914018231091402,
"grad_norm": 0.6233573649742591,
"learning_rate": 5.256464887243095e-06,
"loss": 1.1784,
"step": 1754
},
{
"epoch": 0.6917960088691796,
"grad_norm": 0.6057486309866048,
"learning_rate": 5.244349963548603e-06,
"loss": 1.1841,
"step": 1755
},
{
"epoch": 0.692190194629219,
"grad_norm": 0.6262495769486762,
"learning_rate": 5.232244053039099e-06,
"loss": 1.2069,
"step": 1756
},
{
"epoch": 0.6925843803892584,
"grad_norm": 0.6244256499974958,
"learning_rate": 5.220147178658401e-06,
"loss": 1.2099,
"step": 1757
},
{
"epoch": 0.6929785661492979,
"grad_norm": 0.5987132658245882,
"learning_rate": 5.208059363333218e-06,
"loss": 1.1172,
"step": 1758
},
{
"epoch": 0.6933727519093372,
"grad_norm": 0.6204462023553633,
"learning_rate": 5.195980629973077e-06,
"loss": 1.1287,
"step": 1759
},
{
"epoch": 0.6937669376693767,
"grad_norm": 0.616887618107624,
"learning_rate": 5.183911001470296e-06,
"loss": 1.1707,
"step": 1760
},
{
"epoch": 0.6941611234294162,
"grad_norm": 0.6131588350689924,
"learning_rate": 5.171850500699942e-06,
"loss": 1.1913,
"step": 1761
},
{
"epoch": 0.6945553091894555,
"grad_norm": 0.6220240105240659,
"learning_rate": 5.159799150519773e-06,
"loss": 1.1752,
"step": 1762
},
{
"epoch": 0.694949494949495,
"grad_norm": 0.6474411617934912,
"learning_rate": 5.147756973770215e-06,
"loss": 1.1685,
"step": 1763
},
{
"epoch": 0.6953436807095343,
"grad_norm": 0.6074241395347293,
"learning_rate": 5.135723993274304e-06,
"loss": 1.1274,
"step": 1764
},
{
"epoch": 0.6957378664695738,
"grad_norm": 0.6257258438943853,
"learning_rate": 5.123700231837643e-06,
"loss": 1.1876,
"step": 1765
},
{
"epoch": 0.6961320522296132,
"grad_norm": 0.6240327119384406,
"learning_rate": 5.111685712248364e-06,
"loss": 1.1356,
"step": 1766
},
{
"epoch": 0.6965262379896526,
"grad_norm": 0.6058794807211466,
"learning_rate": 5.099680457277083e-06,
"loss": 1.1859,
"step": 1767
},
{
"epoch": 0.6969204237496921,
"grad_norm": 0.6130830438069458,
"learning_rate": 5.087684489676862e-06,
"loss": 1.1917,
"step": 1768
},
{
"epoch": 0.6973146095097315,
"grad_norm": 0.6307417343281665,
"learning_rate": 5.07569783218316e-06,
"loss": 1.2297,
"step": 1769
},
{
"epoch": 0.6977087952697709,
"grad_norm": 0.6127737313603762,
"learning_rate": 5.063720507513781e-06,
"loss": 1.1673,
"step": 1770
},
{
"epoch": 0.6981029810298103,
"grad_norm": 0.624666994089622,
"learning_rate": 5.051752538368855e-06,
"loss": 1.2133,
"step": 1771
},
{
"epoch": 0.6984971667898497,
"grad_norm": 0.612192851855714,
"learning_rate": 5.039793947430774e-06,
"loss": 1.1894,
"step": 1772
},
{
"epoch": 0.6988913525498891,
"grad_norm": 0.6163484499307348,
"learning_rate": 5.02784475736415e-06,
"loss": 1.1901,
"step": 1773
},
{
"epoch": 0.6992855383099286,
"grad_norm": 0.6189253804729046,
"learning_rate": 5.015904990815792e-06,
"loss": 1.1852,
"step": 1774
},
{
"epoch": 0.6996797240699679,
"grad_norm": 0.6315133839229915,
"learning_rate": 5.003974670414633e-06,
"loss": 1.2218,
"step": 1775
},
{
"epoch": 0.7000739098300074,
"grad_norm": 0.6143569728327692,
"learning_rate": 4.992053818771715e-06,
"loss": 1.1698,
"step": 1776
},
{
"epoch": 0.7004680955900469,
"grad_norm": 0.6023568254933535,
"learning_rate": 4.980142458480136e-06,
"loss": 1.1618,
"step": 1777
},
{
"epoch": 0.7008622813500862,
"grad_norm": 0.620427287297367,
"learning_rate": 4.968240612114995e-06,
"loss": 1.1812,
"step": 1778
},
{
"epoch": 0.7012564671101257,
"grad_norm": 0.6169377500547716,
"learning_rate": 4.956348302233364e-06,
"loss": 1.1729,
"step": 1779
},
{
"epoch": 0.701650652870165,
"grad_norm": 0.6119581164148135,
"learning_rate": 4.944465551374238e-06,
"loss": 1.1942,
"step": 1780
},
{
"epoch": 0.7020448386302045,
"grad_norm": 0.6207029111041957,
"learning_rate": 4.932592382058503e-06,
"loss": 1.1841,
"step": 1781
},
{
"epoch": 0.7024390243902439,
"grad_norm": 0.6274557767427725,
"learning_rate": 4.920728816788885e-06,
"loss": 1.2241,
"step": 1782
},
{
"epoch": 0.7028332101502833,
"grad_norm": 0.6251490097972446,
"learning_rate": 4.908874878049894e-06,
"loss": 1.1746,
"step": 1783
},
{
"epoch": 0.7032273959103228,
"grad_norm": 0.6421558996903795,
"learning_rate": 4.897030588307816e-06,
"loss": 1.1599,
"step": 1784
},
{
"epoch": 0.7036215816703622,
"grad_norm": 0.6580529776636076,
"learning_rate": 4.885195970010634e-06,
"loss": 1.1876,
"step": 1785
},
{
"epoch": 0.7040157674304016,
"grad_norm": 0.7799716182595261,
"learning_rate": 4.873371045588002e-06,
"loss": 1.1619,
"step": 1786
},
{
"epoch": 0.704409953190441,
"grad_norm": 0.6034015555793384,
"learning_rate": 4.861555837451213e-06,
"loss": 1.1339,
"step": 1787
},
{
"epoch": 0.7048041389504804,
"grad_norm": 0.6354298706812905,
"learning_rate": 4.84975036799313e-06,
"loss": 1.1904,
"step": 1788
},
{
"epoch": 0.7051983247105198,
"grad_norm": 0.656808882761667,
"learning_rate": 4.837954659588172e-06,
"loss": 1.2118,
"step": 1789
},
{
"epoch": 0.7055925104705593,
"grad_norm": 0.6354068123945864,
"learning_rate": 4.826168734592254e-06,
"loss": 1.2657,
"step": 1790
},
{
"epoch": 0.7059866962305986,
"grad_norm": 0.6135559463093657,
"learning_rate": 4.814392615342746e-06,
"loss": 1.218,
"step": 1791
},
{
"epoch": 0.7063808819906381,
"grad_norm": 0.6190332303953764,
"learning_rate": 4.802626324158432e-06,
"loss": 1.1298,
"step": 1792
},
{
"epoch": 0.7067750677506776,
"grad_norm": 0.6261895312898496,
"learning_rate": 4.790869883339473e-06,
"loss": 1.2229,
"step": 1793
},
{
"epoch": 0.7071692535107169,
"grad_norm": 0.6499346687616555,
"learning_rate": 4.779123315167362e-06,
"loss": 1.2436,
"step": 1794
},
{
"epoch": 0.7075634392707564,
"grad_norm": 0.7112549120650247,
"learning_rate": 4.767386641904883e-06,
"loss": 1.1948,
"step": 1795
},
{
"epoch": 0.7079576250307957,
"grad_norm": 0.6187195781334022,
"learning_rate": 4.755659885796054e-06,
"loss": 1.2253,
"step": 1796
},
{
"epoch": 0.7083518107908352,
"grad_norm": 0.616576163504054,
"learning_rate": 4.743943069066118e-06,
"loss": 1.1448,
"step": 1797
},
{
"epoch": 0.7087459965508746,
"grad_norm": 0.614300702515973,
"learning_rate": 4.73223621392146e-06,
"loss": 1.181,
"step": 1798
},
{
"epoch": 0.709140182310914,
"grad_norm": 0.6141034301455051,
"learning_rate": 4.720539342549594e-06,
"loss": 1.1788,
"step": 1799
},
{
"epoch": 0.7095343680709535,
"grad_norm": 0.6073756603898747,
"learning_rate": 4.708852477119117e-06,
"loss": 1.1848,
"step": 1800
},
{
"epoch": 0.7099285538309928,
"grad_norm": 0.6344185849187683,
"learning_rate": 4.6971756397796506e-06,
"loss": 1.1721,
"step": 1801
},
{
"epoch": 0.7103227395910323,
"grad_norm": 0.6248360198993864,
"learning_rate": 4.6855088526618205e-06,
"loss": 1.1565,
"step": 1802
},
{
"epoch": 0.7107169253510717,
"grad_norm": 0.6152420860002373,
"learning_rate": 4.6738521378772066e-06,
"loss": 1.1702,
"step": 1803
},
{
"epoch": 0.7111111111111111,
"grad_norm": 0.6168160579182377,
"learning_rate": 4.662205517518286e-06,
"loss": 1.1988,
"step": 1804
},
{
"epoch": 0.7115052968711505,
"grad_norm": 0.6199790217466414,
"learning_rate": 4.650569013658417e-06,
"loss": 1.2058,
"step": 1805
},
{
"epoch": 0.71189948263119,
"grad_norm": 0.6176228890841313,
"learning_rate": 4.638942648351774e-06,
"loss": 1.1612,
"step": 1806
},
{
"epoch": 0.7122936683912293,
"grad_norm": 0.5959975381441662,
"learning_rate": 4.627326443633327e-06,
"loss": 1.1628,
"step": 1807
},
{
"epoch": 0.7126878541512688,
"grad_norm": 0.6189398958365385,
"learning_rate": 4.61572042151878e-06,
"loss": 1.1928,
"step": 1808
},
{
"epoch": 0.7130820399113083,
"grad_norm": 0.6271163010563219,
"learning_rate": 4.604124604004544e-06,
"loss": 1.2124,
"step": 1809
},
{
"epoch": 0.7134762256713476,
"grad_norm": 0.6000046568229123,
"learning_rate": 4.592539013067692e-06,
"loss": 1.153,
"step": 1810
},
{
"epoch": 0.7138704114313871,
"grad_norm": 0.5989067172216591,
"learning_rate": 4.580963670665906e-06,
"loss": 1.1537,
"step": 1811
},
{
"epoch": 0.7142645971914264,
"grad_norm": 0.65003150237445,
"learning_rate": 4.569398598737448e-06,
"loss": 1.2302,
"step": 1812
},
{
"epoch": 0.7146587829514659,
"grad_norm": 0.6224236372159876,
"learning_rate": 4.557843819201121e-06,
"loss": 1.2191,
"step": 1813
},
{
"epoch": 0.7150529687115053,
"grad_norm": 0.6360681967059407,
"learning_rate": 4.546299353956211e-06,
"loss": 1.1782,
"step": 1814
},
{
"epoch": 0.7154471544715447,
"grad_norm": 0.6134230197484926,
"learning_rate": 4.534765224882463e-06,
"loss": 1.2106,
"step": 1815
},
{
"epoch": 0.7158413402315842,
"grad_norm": 0.6176737002203802,
"learning_rate": 4.5232414538400336e-06,
"loss": 1.2175,
"step": 1816
},
{
"epoch": 0.7162355259916235,
"grad_norm": 0.6202906864487361,
"learning_rate": 4.511728062669443e-06,
"loss": 1.1807,
"step": 1817
},
{
"epoch": 0.716629711751663,
"grad_norm": 0.6212585444516489,
"learning_rate": 4.50022507319154e-06,
"loss": 1.1958,
"step": 1818
},
{
"epoch": 0.7170238975117024,
"grad_norm": 0.6142126146314887,
"learning_rate": 4.488732507207457e-06,
"loss": 1.189,
"step": 1819
},
{
"epoch": 0.7174180832717418,
"grad_norm": 0.6301160963451029,
"learning_rate": 4.477250386498582e-06,
"loss": 1.2383,
"step": 1820
},
{
"epoch": 0.7178122690317812,
"grad_norm": 0.6238993246895916,
"learning_rate": 4.46577873282649e-06,
"loss": 1.1642,
"step": 1821
},
{
"epoch": 0.7182064547918207,
"grad_norm": 0.5954902888936976,
"learning_rate": 4.4543175679329345e-06,
"loss": 1.1319,
"step": 1822
},
{
"epoch": 0.71860064055186,
"grad_norm": 0.5975113333384684,
"learning_rate": 4.442866913539783e-06,
"loss": 1.1692,
"step": 1823
},
{
"epoch": 0.7189948263118995,
"grad_norm": 0.6361387072646193,
"learning_rate": 4.431426791348981e-06,
"loss": 1.2058,
"step": 1824
},
{
"epoch": 0.719389012071939,
"grad_norm": 0.6206879841575946,
"learning_rate": 4.419997223042509e-06,
"loss": 1.1892,
"step": 1825
},
{
"epoch": 0.7197831978319783,
"grad_norm": 0.6187188924722868,
"learning_rate": 4.408578230282361e-06,
"loss": 1.2343,
"step": 1826
},
{
"epoch": 0.7201773835920178,
"grad_norm": 0.6099133549608606,
"learning_rate": 4.397169834710467e-06,
"loss": 1.1874,
"step": 1827
},
{
"epoch": 0.7205715693520571,
"grad_norm": 0.6218762750404337,
"learning_rate": 4.38577205794869e-06,
"loss": 1.2522,
"step": 1828
},
{
"epoch": 0.7209657551120966,
"grad_norm": 0.6122795104171647,
"learning_rate": 4.37438492159876e-06,
"loss": 1.1989,
"step": 1829
},
{
"epoch": 0.721359940872136,
"grad_norm": 0.6015290594639533,
"learning_rate": 4.36300844724224e-06,
"loss": 1.1714,
"step": 1830
},
{
"epoch": 0.7217541266321754,
"grad_norm": 0.6252355128162509,
"learning_rate": 4.351642656440482e-06,
"loss": 1.1703,
"step": 1831
},
{
"epoch": 0.7221483123922149,
"grad_norm": 0.6111637339804932,
"learning_rate": 4.340287570734604e-06,
"loss": 1.152,
"step": 1832
},
{
"epoch": 0.7225424981522542,
"grad_norm": 0.6101267108124663,
"learning_rate": 4.32894321164542e-06,
"loss": 1.184,
"step": 1833
},
{
"epoch": 0.7229366839122937,
"grad_norm": 0.6424270287758459,
"learning_rate": 4.317609600673418e-06,
"loss": 1.1703,
"step": 1834
},
{
"epoch": 0.7233308696723331,
"grad_norm": 0.6224326912866733,
"learning_rate": 4.306286759298721e-06,
"loss": 1.1925,
"step": 1835
},
{
"epoch": 0.7237250554323725,
"grad_norm": 0.5990540447824775,
"learning_rate": 4.294974708981041e-06,
"loss": 1.1549,
"step": 1836
},
{
"epoch": 0.7241192411924119,
"grad_norm": 0.6304187409365657,
"learning_rate": 4.283673471159632e-06,
"loss": 1.1974,
"step": 1837
},
{
"epoch": 0.7245134269524514,
"grad_norm": 0.6236344446716869,
"learning_rate": 4.272383067253254e-06,
"loss": 1.1704,
"step": 1838
},
{
"epoch": 0.7249076127124907,
"grad_norm": 0.6183536446735383,
"learning_rate": 4.2611035186601445e-06,
"loss": 1.2539,
"step": 1839
},
{
"epoch": 0.7253017984725302,
"grad_norm": 0.6381015795817223,
"learning_rate": 4.2498348467579555e-06,
"loss": 1.1772,
"step": 1840
},
{
"epoch": 0.7256959842325696,
"grad_norm": 0.6196633330398633,
"learning_rate": 4.2385770729037336e-06,
"loss": 1.1597,
"step": 1841
},
{
"epoch": 0.726090169992609,
"grad_norm": 0.6402144565991683,
"learning_rate": 4.22733021843387e-06,
"loss": 1.2207,
"step": 1842
},
{
"epoch": 0.7264843557526485,
"grad_norm": 0.6134635440909342,
"learning_rate": 4.216094304664056e-06,
"loss": 1.2303,
"step": 1843
},
{
"epoch": 0.7268785415126878,
"grad_norm": 0.6170474770272091,
"learning_rate": 4.204869352889246e-06,
"loss": 1.1897,
"step": 1844
},
{
"epoch": 0.7272727272727273,
"grad_norm": 0.625150589347141,
"learning_rate": 4.193655384383631e-06,
"loss": 1.1273,
"step": 1845
},
{
"epoch": 0.7276669130327666,
"grad_norm": 0.6702486495437785,
"learning_rate": 4.182452420400571e-06,
"loss": 1.2604,
"step": 1846
},
{
"epoch": 0.7280610987928061,
"grad_norm": 1.1398019367962655,
"learning_rate": 4.171260482172574e-06,
"loss": 1.151,
"step": 1847
},
{
"epoch": 0.7284552845528456,
"grad_norm": 0.6232712417738132,
"learning_rate": 4.160079590911257e-06,
"loss": 1.1928,
"step": 1848
},
{
"epoch": 0.7288494703128849,
"grad_norm": 0.6346597753210788,
"learning_rate": 4.1489097678073e-06,
"loss": 1.2134,
"step": 1849
},
{
"epoch": 0.7292436560729244,
"grad_norm": 0.622479343337929,
"learning_rate": 4.1377510340304e-06,
"loss": 1.1351,
"step": 1850
},
{
"epoch": 0.7296378418329638,
"grad_norm": 0.6095396783729989,
"learning_rate": 4.126603410729232e-06,
"loss": 1.1835,
"step": 1851
},
{
"epoch": 0.7300320275930032,
"grad_norm": 0.6007947259934253,
"learning_rate": 4.1154669190314315e-06,
"loss": 1.1361,
"step": 1852
},
{
"epoch": 0.7304262133530426,
"grad_norm": 0.6392450529455237,
"learning_rate": 4.104341580043518e-06,
"loss": 1.2352,
"step": 1853
},
{
"epoch": 0.730820399113082,
"grad_norm": 0.6088170301748977,
"learning_rate": 4.093227414850887e-06,
"loss": 1.1555,
"step": 1854
},
{
"epoch": 0.7312145848731214,
"grad_norm": 0.611940955223257,
"learning_rate": 4.0821244445177535e-06,
"loss": 1.1035,
"step": 1855
},
{
"epoch": 0.7316087706331609,
"grad_norm": 0.6429334370137534,
"learning_rate": 4.071032690087111e-06,
"loss": 1.2077,
"step": 1856
},
{
"epoch": 0.7320029563932003,
"grad_norm": 0.6199867856316763,
"learning_rate": 4.059952172580694e-06,
"loss": 1.1898,
"step": 1857
},
{
"epoch": 0.7323971421532397,
"grad_norm": 0.682925719480743,
"learning_rate": 4.0488829129989536e-06,
"loss": 1.1796,
"step": 1858
},
{
"epoch": 0.7327913279132792,
"grad_norm": 0.6300326280908697,
"learning_rate": 4.0378249323209915e-06,
"loss": 1.1821,
"step": 1859
},
{
"epoch": 0.7331855136733185,
"grad_norm": 0.6188854368428854,
"learning_rate": 4.026778251504533e-06,
"loss": 1.212,
"step": 1860
},
{
"epoch": 0.733579699433358,
"grad_norm": 0.7209116321064022,
"learning_rate": 4.015742891485893e-06,
"loss": 1.2115,
"step": 1861
},
{
"epoch": 0.7339738851933973,
"grad_norm": 0.6377551509793858,
"learning_rate": 4.0047188731799345e-06,
"loss": 1.2223,
"step": 1862
},
{
"epoch": 0.7343680709534368,
"grad_norm": 0.6709121309342012,
"learning_rate": 3.993706217480015e-06,
"loss": 1.2369,
"step": 1863
},
{
"epoch": 0.7347622567134763,
"grad_norm": 0.6610392131221031,
"learning_rate": 3.982704945257957e-06,
"loss": 1.238,
"step": 1864
},
{
"epoch": 0.7351564424735156,
"grad_norm": 0.6314301850508148,
"learning_rate": 3.97171507736402e-06,
"loss": 1.1694,
"step": 1865
},
{
"epoch": 0.7355506282335551,
"grad_norm": 0.6075680590520474,
"learning_rate": 3.960736634626838e-06,
"loss": 1.1627,
"step": 1866
},
{
"epoch": 0.7359448139935945,
"grad_norm": 0.6341926480920811,
"learning_rate": 3.949769637853393e-06,
"loss": 1.1434,
"step": 1867
},
{
"epoch": 0.7363389997536339,
"grad_norm": 0.621486685123361,
"learning_rate": 3.9388141078289775e-06,
"loss": 1.1946,
"step": 1868
},
{
"epoch": 0.7367331855136733,
"grad_norm": 0.6464204738071503,
"learning_rate": 3.927870065317156e-06,
"loss": 1.1774,
"step": 1869
},
{
"epoch": 0.7371273712737128,
"grad_norm": 0.6718388040792097,
"learning_rate": 3.916937531059706e-06,
"loss": 1.161,
"step": 1870
},
{
"epoch": 0.7375215570337521,
"grad_norm": 0.6323822736177052,
"learning_rate": 3.9060165257766116e-06,
"loss": 1.2166,
"step": 1871
},
{
"epoch": 0.7379157427937916,
"grad_norm": 0.6289704307488232,
"learning_rate": 3.895107070165995e-06,
"loss": 1.1657,
"step": 1872
},
{
"epoch": 0.738309928553831,
"grad_norm": 0.6262746372052379,
"learning_rate": 3.884209184904088e-06,
"loss": 1.2249,
"step": 1873
},
{
"epoch": 0.7387041143138704,
"grad_norm": 0.6184529013832247,
"learning_rate": 3.873322890645202e-06,
"loss": 1.1515,
"step": 1874
},
{
"epoch": 0.7390983000739099,
"grad_norm": 0.6290711060233826,
"learning_rate": 3.862448208021677e-06,
"loss": 1.1834,
"step": 1875
},
{
"epoch": 0.7394924858339492,
"grad_norm": 0.5895476413662796,
"learning_rate": 3.851585157643845e-06,
"loss": 1.1234,
"step": 1876
},
{
"epoch": 0.7398866715939887,
"grad_norm": 0.6107335830258855,
"learning_rate": 3.840733760099985e-06,
"loss": 1.1639,
"step": 1877
},
{
"epoch": 0.740280857354028,
"grad_norm": 0.6322945602429125,
"learning_rate": 3.829894035956306e-06,
"loss": 1.2427,
"step": 1878
},
{
"epoch": 0.7406750431140675,
"grad_norm": 0.6323335943798655,
"learning_rate": 3.819066005756883e-06,
"loss": 1.2223,
"step": 1879
},
{
"epoch": 0.741069228874107,
"grad_norm": 0.6078450616507315,
"learning_rate": 3.8082496900236244e-06,
"loss": 1.1706,
"step": 1880
},
{
"epoch": 0.7414634146341463,
"grad_norm": 0.6221466682968542,
"learning_rate": 3.7974451092562447e-06,
"loss": 1.2046,
"step": 1881
},
{
"epoch": 0.7418576003941858,
"grad_norm": 0.6049678464198069,
"learning_rate": 3.7866522839322207e-06,
"loss": 1.1767,
"step": 1882
},
{
"epoch": 0.7422517861542252,
"grad_norm": 0.6295952461868448,
"learning_rate": 3.775871234506734e-06,
"loss": 1.2225,
"step": 1883
},
{
"epoch": 0.7426459719142646,
"grad_norm": 0.6394412262692781,
"learning_rate": 3.7651019814126656e-06,
"loss": 1.214,
"step": 1884
},
{
"epoch": 0.743040157674304,
"grad_norm": 0.610513027873533,
"learning_rate": 3.754344545060529e-06,
"loss": 1.1537,
"step": 1885
},
{
"epoch": 0.7434343434343434,
"grad_norm": 0.5956769595890598,
"learning_rate": 3.743598945838438e-06,
"loss": 1.1758,
"step": 1886
},
{
"epoch": 0.7438285291943828,
"grad_norm": 0.6417078515489372,
"learning_rate": 3.732865204112084e-06,
"loss": 1.1991,
"step": 1887
},
{
"epoch": 0.7442227149544223,
"grad_norm": 0.6291270205503651,
"learning_rate": 3.722143340224682e-06,
"loss": 1.2203,
"step": 1888
},
{
"epoch": 0.7446169007144617,
"grad_norm": 0.6143214199994612,
"learning_rate": 3.7114333744969312e-06,
"loss": 1.2053,
"step": 1889
},
{
"epoch": 0.7450110864745011,
"grad_norm": 0.6247493772614575,
"learning_rate": 3.7007353272269764e-06,
"loss": 1.187,
"step": 1890
},
{
"epoch": 0.7454052722345406,
"grad_norm": 0.6280559082279741,
"learning_rate": 3.6900492186903893e-06,
"loss": 1.2001,
"step": 1891
},
{
"epoch": 0.7457994579945799,
"grad_norm": 0.6656868801405882,
"learning_rate": 3.6793750691400996e-06,
"loss": 1.2266,
"step": 1892
},
{
"epoch": 0.7461936437546194,
"grad_norm": 0.6290134544837587,
"learning_rate": 3.6687128988063768e-06,
"loss": 1.2643,
"step": 1893
},
{
"epoch": 0.7465878295146587,
"grad_norm": 0.6046720210188277,
"learning_rate": 3.6580627278967883e-06,
"loss": 1.1329,
"step": 1894
},
{
"epoch": 0.7469820152746982,
"grad_norm": 0.6132109677638092,
"learning_rate": 3.6474245765961623e-06,
"loss": 1.1802,
"step": 1895
},
{
"epoch": 0.7473762010347377,
"grad_norm": 0.6215636460183582,
"learning_rate": 3.636798465066537e-06,
"loss": 1.161,
"step": 1896
},
{
"epoch": 0.747770386794777,
"grad_norm": 0.6324476045738789,
"learning_rate": 3.6261844134471434e-06,
"loss": 1.2743,
"step": 1897
},
{
"epoch": 0.7481645725548165,
"grad_norm": 0.6229098227690751,
"learning_rate": 3.6155824418543482e-06,
"loss": 1.1813,
"step": 1898
},
{
"epoch": 0.7485587583148559,
"grad_norm": 0.6090812575135249,
"learning_rate": 3.604992570381621e-06,
"loss": 1.1345,
"step": 1899
},
{
"epoch": 0.7489529440748953,
"grad_norm": 0.6175559157353252,
"learning_rate": 3.5944148190995077e-06,
"loss": 1.2318,
"step": 1900
},
{
"epoch": 0.7493471298349347,
"grad_norm": 0.6151430132474782,
"learning_rate": 3.583849208055582e-06,
"loss": 1.1515,
"step": 1901
},
{
"epoch": 0.7497413155949741,
"grad_norm": 0.6150817757122007,
"learning_rate": 3.573295757274401e-06,
"loss": 1.1709,
"step": 1902
},
{
"epoch": 0.7501355013550135,
"grad_norm": 0.6206530860937504,
"learning_rate": 3.562754486757477e-06,
"loss": 1.2368,
"step": 1903
},
{
"epoch": 0.750529687115053,
"grad_norm": 0.6187559303708384,
"learning_rate": 3.5522254164832458e-06,
"loss": 1.166,
"step": 1904
},
{
"epoch": 0.7509238728750924,
"grad_norm": 0.6050479857846883,
"learning_rate": 3.5417085664070127e-06,
"loss": 1.1884,
"step": 1905
},
{
"epoch": 0.7513180586351318,
"grad_norm": 0.6168601224584902,
"learning_rate": 3.5312039564609203e-06,
"loss": 1.179,
"step": 1906
},
{
"epoch": 0.7517122443951713,
"grad_norm": 0.6626157674267323,
"learning_rate": 3.5207116065539214e-06,
"loss": 1.2784,
"step": 1907
},
{
"epoch": 0.7521064301552106,
"grad_norm": 0.6204622203986804,
"learning_rate": 3.510231536571731e-06,
"loss": 1.1545,
"step": 1908
},
{
"epoch": 0.7525006159152501,
"grad_norm": 0.6025298592606017,
"learning_rate": 3.4997637663767827e-06,
"loss": 1.1623,
"step": 1909
},
{
"epoch": 0.7528948016752894,
"grad_norm": 0.6686746729115949,
"learning_rate": 3.4893083158082096e-06,
"loss": 1.225,
"step": 1910
},
{
"epoch": 0.7532889874353289,
"grad_norm": 0.6770303268213698,
"learning_rate": 3.4788652046817885e-06,
"loss": 1.1987,
"step": 1911
},
{
"epoch": 0.7536831731953684,
"grad_norm": 0.6169292952669728,
"learning_rate": 3.4684344527899117e-06,
"loss": 1.1413,
"step": 1912
},
{
"epoch": 0.7540773589554077,
"grad_norm": 0.6485841260675642,
"learning_rate": 3.458016079901544e-06,
"loss": 1.1747,
"step": 1913
},
{
"epoch": 0.7544715447154472,
"grad_norm": 0.644634311279479,
"learning_rate": 3.447610105762197e-06,
"loss": 1.1688,
"step": 1914
},
{
"epoch": 0.7548657304754866,
"grad_norm": 0.5954331888752692,
"learning_rate": 3.4372165500938813e-06,
"loss": 1.1999,
"step": 1915
},
{
"epoch": 0.755259916235526,
"grad_norm": 0.617923959960479,
"learning_rate": 3.4268354325950637e-06,
"loss": 1.2101,
"step": 1916
},
{
"epoch": 0.7556541019955654,
"grad_norm": 0.6202978534151761,
"learning_rate": 3.4164667729406487e-06,
"loss": 1.1168,
"step": 1917
},
{
"epoch": 0.7560482877556048,
"grad_norm": 0.6139453726018187,
"learning_rate": 3.4061105907819202e-06,
"loss": 1.107,
"step": 1918
},
{
"epoch": 0.7564424735156442,
"grad_norm": 0.6199465940139608,
"learning_rate": 3.395766905746515e-06,
"loss": 1.2331,
"step": 1919
},
{
"epoch": 0.7568366592756837,
"grad_norm": 0.6121258940736186,
"learning_rate": 3.3854357374383905e-06,
"loss": 1.1512,
"step": 1920
},
{
"epoch": 0.7572308450357231,
"grad_norm": 0.6192952901355329,
"learning_rate": 3.375117105437784e-06,
"loss": 1.1992,
"step": 1921
},
{
"epoch": 0.7576250307957625,
"grad_norm": 0.6428452093914235,
"learning_rate": 3.3648110293011592e-06,
"loss": 1.2009,
"step": 1922
},
{
"epoch": 0.758019216555802,
"grad_norm": 0.632857445152661,
"learning_rate": 3.3545175285611986e-06,
"loss": 1.2031,
"step": 1923
},
{
"epoch": 0.7584134023158413,
"grad_norm": 0.61203461189701,
"learning_rate": 3.344236622726743e-06,
"loss": 1.128,
"step": 1924
},
{
"epoch": 0.7588075880758808,
"grad_norm": 0.5940930582433119,
"learning_rate": 3.333968331282759e-06,
"loss": 1.1638,
"step": 1925
},
{
"epoch": 0.7592017738359201,
"grad_norm": 0.6128730590023086,
"learning_rate": 3.3237126736903168e-06,
"loss": 1.1636,
"step": 1926
},
{
"epoch": 0.7595959595959596,
"grad_norm": 0.6453501409856305,
"learning_rate": 3.313469669386532e-06,
"loss": 1.2196,
"step": 1927
},
{
"epoch": 0.7599901453559991,
"grad_norm": 0.6462479993428716,
"learning_rate": 3.303239337784547e-06,
"loss": 1.1757,
"step": 1928
},
{
"epoch": 0.7603843311160384,
"grad_norm": 0.6223443320198161,
"learning_rate": 3.2930216982734775e-06,
"loss": 1.2022,
"step": 1929
},
{
"epoch": 0.7607785168760779,
"grad_norm": 0.6012467834584495,
"learning_rate": 3.2828167702183945e-06,
"loss": 1.1624,
"step": 1930
},
{
"epoch": 0.7611727026361172,
"grad_norm": 0.6212867293615743,
"learning_rate": 3.272624572960269e-06,
"loss": 1.1469,
"step": 1931
},
{
"epoch": 0.7615668883961567,
"grad_norm": 0.623426678936357,
"learning_rate": 3.262445125815945e-06,
"loss": 1.2142,
"step": 1932
},
{
"epoch": 0.7619610741561961,
"grad_norm": 0.6174911641351716,
"learning_rate": 3.2522784480781057e-06,
"loss": 1.229,
"step": 1933
},
{
"epoch": 0.7623552599162355,
"grad_norm": 0.6458478147860737,
"learning_rate": 3.242124559015234e-06,
"loss": 1.2307,
"step": 1934
},
{
"epoch": 0.7627494456762749,
"grad_norm": 0.6139695821784812,
"learning_rate": 3.2319834778715662e-06,
"loss": 1.1993,
"step": 1935
},
{
"epoch": 0.7631436314363144,
"grad_norm": 0.6244967897448498,
"learning_rate": 3.221855223867076e-06,
"loss": 1.1983,
"step": 1936
},
{
"epoch": 0.7635378171963538,
"grad_norm": 0.6167092879774253,
"learning_rate": 3.211739816197419e-06,
"loss": 1.139,
"step": 1937
},
{
"epoch": 0.7639320029563932,
"grad_norm": 0.6253757235990433,
"learning_rate": 3.2016372740339e-06,
"loss": 1.2246,
"step": 1938
},
{
"epoch": 0.7643261887164327,
"grad_norm": 0.625945816934853,
"learning_rate": 3.1915476165234505e-06,
"loss": 1.1534,
"step": 1939
},
{
"epoch": 0.764720374476472,
"grad_norm": 0.6294175091707643,
"learning_rate": 3.1814708627885736e-06,
"loss": 1.2087,
"step": 1940
},
{
"epoch": 0.7651145602365115,
"grad_norm": 0.6174964988395791,
"learning_rate": 3.171407031927325e-06,
"loss": 1.2108,
"step": 1941
},
{
"epoch": 0.7655087459965508,
"grad_norm": 0.6692493984724502,
"learning_rate": 3.161356143013258e-06,
"loss": 1.2602,
"step": 1942
},
{
"epoch": 0.7659029317565903,
"grad_norm": 0.6049874736921799,
"learning_rate": 3.1513182150954067e-06,
"loss": 1.1283,
"step": 1943
},
{
"epoch": 0.7662971175166298,
"grad_norm": 0.6170567402312764,
"learning_rate": 3.1412932671982368e-06,
"loss": 1.1787,
"step": 1944
},
{
"epoch": 0.7666913032766691,
"grad_norm": 0.5939532563374448,
"learning_rate": 3.131281318321607e-06,
"loss": 1.1134,
"step": 1945
},
{
"epoch": 0.7670854890367086,
"grad_norm": 0.6073844909969783,
"learning_rate": 3.1212823874407517e-06,
"loss": 1.1714,
"step": 1946
},
{
"epoch": 0.767479674796748,
"grad_norm": 0.6102814200245192,
"learning_rate": 3.1112964935062297e-06,
"loss": 1.172,
"step": 1947
},
{
"epoch": 0.7678738605567874,
"grad_norm": 0.6156593525633267,
"learning_rate": 3.101323655443882e-06,
"loss": 1.2028,
"step": 1948
},
{
"epoch": 0.7682680463168268,
"grad_norm": 0.630439880503606,
"learning_rate": 3.0913638921548195e-06,
"loss": 1.1547,
"step": 1949
},
{
"epoch": 0.7686622320768662,
"grad_norm": 0.596623146889128,
"learning_rate": 3.0814172225153626e-06,
"loss": 1.1191,
"step": 1950
},
{
"epoch": 0.7690564178369056,
"grad_norm": 0.6035005020079766,
"learning_rate": 3.0714836653770153e-06,
"loss": 1.1602,
"step": 1951
},
{
"epoch": 0.7694506035969451,
"grad_norm": 0.6229719405653049,
"learning_rate": 3.0615632395664395e-06,
"loss": 1.2358,
"step": 1952
},
{
"epoch": 0.7698447893569845,
"grad_norm": 0.6172825849164519,
"learning_rate": 3.051655963885398e-06,
"loss": 1.1966,
"step": 1953
},
{
"epoch": 0.7702389751170239,
"grad_norm": 0.6286383648446865,
"learning_rate": 3.0417618571107443e-06,
"loss": 1.1964,
"step": 1954
},
{
"epoch": 0.7706331608770634,
"grad_norm": 0.6108360343185555,
"learning_rate": 3.0318809379943594e-06,
"loss": 1.1728,
"step": 1955
},
{
"epoch": 0.7710273466371027,
"grad_norm": 0.6362153250389974,
"learning_rate": 3.022013225263142e-06,
"loss": 1.2236,
"step": 1956
},
{
"epoch": 0.7714215323971422,
"grad_norm": 0.6344908938517139,
"learning_rate": 3.0121587376189544e-06,
"loss": 1.2053,
"step": 1957
},
{
"epoch": 0.7718157181571815,
"grad_norm": 0.6201739659408967,
"learning_rate": 3.00231749373859e-06,
"loss": 1.1537,
"step": 1958
},
{
"epoch": 0.772209903917221,
"grad_norm": 0.6100774811460168,
"learning_rate": 2.992489512273754e-06,
"loss": 1.1984,
"step": 1959
},
{
"epoch": 0.7726040896772605,
"grad_norm": 0.6232200126606358,
"learning_rate": 2.9826748118510107e-06,
"loss": 1.2338,
"step": 1960
},
{
"epoch": 0.7729982754372998,
"grad_norm": 0.6325714051449248,
"learning_rate": 2.972873411071745e-06,
"loss": 1.1917,
"step": 1961
},
{
"epoch": 0.7733924611973393,
"grad_norm": 0.6152245310127229,
"learning_rate": 2.9630853285121506e-06,
"loss": 1.2181,
"step": 1962
},
{
"epoch": 0.7737866469573786,
"grad_norm": 0.6382727314998073,
"learning_rate": 2.9533105827231677e-06,
"loss": 1.2374,
"step": 1963
},
{
"epoch": 0.7741808327174181,
"grad_norm": 0.6093019906684419,
"learning_rate": 2.9435491922304603e-06,
"loss": 1.2039,
"step": 1964
},
{
"epoch": 0.7745750184774575,
"grad_norm": 0.6466162600658065,
"learning_rate": 2.933801175534392e-06,
"loss": 1.2507,
"step": 1965
},
{
"epoch": 0.7749692042374969,
"grad_norm": 0.6172944871295347,
"learning_rate": 2.9240665511099643e-06,
"loss": 1.1777,
"step": 1966
},
{
"epoch": 0.7753633899975363,
"grad_norm": 0.6025058965161826,
"learning_rate": 2.914345337406812e-06,
"loss": 1.1488,
"step": 1967
},
{
"epoch": 0.7757575757575758,
"grad_norm": 0.6283140418676793,
"learning_rate": 2.9046375528491378e-06,
"loss": 1.2246,
"step": 1968
},
{
"epoch": 0.7761517615176152,
"grad_norm": 0.6174686412053484,
"learning_rate": 2.8949432158357083e-06,
"loss": 1.1603,
"step": 1969
},
{
"epoch": 0.7765459472776546,
"grad_norm": 0.6249876696519094,
"learning_rate": 2.885262344739792e-06,
"loss": 1.2378,
"step": 1970
},
{
"epoch": 0.776940133037694,
"grad_norm": 0.6155008238993236,
"learning_rate": 2.875594957909136e-06,
"loss": 1.1734,
"step": 1971
},
{
"epoch": 0.7773343187977334,
"grad_norm": 0.6070997737354649,
"learning_rate": 2.865941073665942e-06,
"loss": 1.1533,
"step": 1972
},
{
"epoch": 0.7777285045577729,
"grad_norm": 0.6285112446428368,
"learning_rate": 2.8563007103068075e-06,
"loss": 1.2374,
"step": 1973
},
{
"epoch": 0.7781226903178122,
"grad_norm": 0.6292319074803627,
"learning_rate": 2.8466738861027143e-06,
"loss": 1.1764,
"step": 1974
},
{
"epoch": 0.7785168760778517,
"grad_norm": 0.6280895354859987,
"learning_rate": 2.8370606192989826e-06,
"loss": 1.2332,
"step": 1975
},
{
"epoch": 0.7789110618378912,
"grad_norm": 0.6392848234961054,
"learning_rate": 2.8274609281152322e-06,
"loss": 1.1681,
"step": 1976
},
{
"epoch": 0.7793052475979305,
"grad_norm": 0.6422553395733501,
"learning_rate": 2.8178748307453552e-06,
"loss": 1.1967,
"step": 1977
},
{
"epoch": 0.77969943335797,
"grad_norm": 0.6448664268947002,
"learning_rate": 2.8083023453574867e-06,
"loss": 1.1637,
"step": 1978
},
{
"epoch": 0.7800936191180093,
"grad_norm": 0.6268688830101503,
"learning_rate": 2.7987434900939537e-06,
"loss": 1.1992,
"step": 1979
},
{
"epoch": 0.7804878048780488,
"grad_norm": 0.6270584497214332,
"learning_rate": 2.7891982830712614e-06,
"loss": 1.215,
"step": 1980
},
{
"epoch": 0.7808819906380882,
"grad_norm": 0.6136390949207409,
"learning_rate": 2.779666742380035e-06,
"loss": 1.1842,
"step": 1981
},
{
"epoch": 0.7812761763981276,
"grad_norm": 0.6160721779555592,
"learning_rate": 2.7701488860850134e-06,
"loss": 1.1465,
"step": 1982
},
{
"epoch": 0.781670362158167,
"grad_norm": 0.6229572690437215,
"learning_rate": 2.7606447322249876e-06,
"loss": 1.1872,
"step": 1983
},
{
"epoch": 0.7820645479182065,
"grad_norm": 0.6120891016882081,
"learning_rate": 2.7511542988127815e-06,
"loss": 1.1933,
"step": 1984
},
{
"epoch": 0.7824587336782459,
"grad_norm": 0.6396299966743912,
"learning_rate": 2.7416776038352246e-06,
"loss": 1.2268,
"step": 1985
},
{
"epoch": 0.7828529194382853,
"grad_norm": 0.620606681831229,
"learning_rate": 2.732214665253092e-06,
"loss": 1.18,
"step": 1986
},
{
"epoch": 0.7832471051983247,
"grad_norm": 0.6172045847652757,
"learning_rate": 2.7227655010011034e-06,
"loss": 1.2072,
"step": 1987
},
{
"epoch": 0.7836412909583641,
"grad_norm": 0.6174655713344509,
"learning_rate": 2.7133301289878644e-06,
"loss": 1.1981,
"step": 1988
},
{
"epoch": 0.7840354767184036,
"grad_norm": 0.6453151721553436,
"learning_rate": 2.703908567095841e-06,
"loss": 1.2319,
"step": 1989
},
{
"epoch": 0.7844296624784429,
"grad_norm": 0.6143239403662212,
"learning_rate": 2.694500833181323e-06,
"loss": 1.1539,
"step": 1990
},
{
"epoch": 0.7848238482384824,
"grad_norm": 0.6118518639087388,
"learning_rate": 2.6851069450743996e-06,
"loss": 1.136,
"step": 1991
},
{
"epoch": 0.7852180339985219,
"grad_norm": 0.621523302552173,
"learning_rate": 2.6757269205789118e-06,
"loss": 1.1884,
"step": 1992
},
{
"epoch": 0.7856122197585612,
"grad_norm": 0.6177501269477549,
"learning_rate": 2.666360777472432e-06,
"loss": 1.1697,
"step": 1993
},
{
"epoch": 0.7860064055186007,
"grad_norm": 0.6169578769905575,
"learning_rate": 2.6570085335062166e-06,
"loss": 1.149,
"step": 1994
},
{
"epoch": 0.78640059127864,
"grad_norm": 0.6384469724904461,
"learning_rate": 2.6476702064051873e-06,
"loss": 1.215,
"step": 1995
},
{
"epoch": 0.7867947770386795,
"grad_norm": 0.6526331509523849,
"learning_rate": 2.638345813867883e-06,
"loss": 1.1834,
"step": 1996
},
{
"epoch": 0.7871889627987189,
"grad_norm": 0.6384058053206544,
"learning_rate": 2.629035373566433e-06,
"loss": 1.2679,
"step": 1997
},
{
"epoch": 0.7875831485587583,
"grad_norm": 0.6173186289000027,
"learning_rate": 2.6197389031465328e-06,
"loss": 1.1497,
"step": 1998
},
{
"epoch": 0.7879773343187977,
"grad_norm": 0.6179494011323186,
"learning_rate": 2.610456420227386e-06,
"loss": 1.155,
"step": 1999
},
{
"epoch": 0.7883715200788372,
"grad_norm": 0.6495295068681656,
"learning_rate": 2.6011879424017006e-06,
"loss": 1.1627,
"step": 2000
},
{
"epoch": 0.7887657058388766,
"grad_norm": 0.6124764762909571,
"learning_rate": 2.5919334872356384e-06,
"loss": 1.2092,
"step": 2001
},
{
"epoch": 0.789159891598916,
"grad_norm": 0.6267862591887654,
"learning_rate": 2.582693072268778e-06,
"loss": 1.2324,
"step": 2002
},
{
"epoch": 0.7895540773589554,
"grad_norm": 0.640938297681364,
"learning_rate": 2.573466715014089e-06,
"loss": 1.1638,
"step": 2003
},
{
"epoch": 0.7899482631189948,
"grad_norm": 0.6319357561158305,
"learning_rate": 2.5642544329579088e-06,
"loss": 1.1436,
"step": 2004
},
{
"epoch": 0.7903424488790343,
"grad_norm": 0.6599757389441551,
"learning_rate": 2.5550562435598834e-06,
"loss": 1.1859,
"step": 2005
},
{
"epoch": 0.7907366346390736,
"grad_norm": 0.6261460556185046,
"learning_rate": 2.5458721642529637e-06,
"loss": 1.2276,
"step": 2006
},
{
"epoch": 0.7911308203991131,
"grad_norm": 0.6368615447497923,
"learning_rate": 2.536702212443345e-06,
"loss": 1.126,
"step": 2007
},
{
"epoch": 0.7915250061591526,
"grad_norm": 0.6065232945787534,
"learning_rate": 2.5275464055104615e-06,
"loss": 1.1566,
"step": 2008
},
{
"epoch": 0.7919191919191919,
"grad_norm": 0.6260924052346492,
"learning_rate": 2.5184047608069283e-06,
"loss": 1.2301,
"step": 2009
},
{
"epoch": 0.7923133776792314,
"grad_norm": 0.5961679029421411,
"learning_rate": 2.509277295658521e-06,
"loss": 1.1195,
"step": 2010
},
{
"epoch": 0.7927075634392707,
"grad_norm": 0.6880173744181591,
"learning_rate": 2.500164027364147e-06,
"loss": 1.1852,
"step": 2011
},
{
"epoch": 0.7931017491993102,
"grad_norm": 0.591725360802608,
"learning_rate": 2.491064973195798e-06,
"loss": 1.1237,
"step": 2012
},
{
"epoch": 0.7934959349593496,
"grad_norm": 0.5975825860792612,
"learning_rate": 2.4819801503985365e-06,
"loss": 1.1518,
"step": 2013
},
{
"epoch": 0.793890120719389,
"grad_norm": 0.6221206271257661,
"learning_rate": 2.4729095761904487e-06,
"loss": 1.1838,
"step": 2014
},
{
"epoch": 0.7942843064794284,
"grad_norm": 0.6271650798589434,
"learning_rate": 2.4638532677626124e-06,
"loss": 1.1672,
"step": 2015
},
{
"epoch": 0.7946784922394678,
"grad_norm": 0.6395665538753358,
"learning_rate": 2.4548112422790695e-06,
"loss": 1.2002,
"step": 2016
},
{
"epoch": 0.7950726779995073,
"grad_norm": 0.6087288790926827,
"learning_rate": 2.4457835168767975e-06,
"loss": 1.1194,
"step": 2017
},
{
"epoch": 0.7954668637595467,
"grad_norm": 0.6099991672736873,
"learning_rate": 2.4367701086656625e-06,
"loss": 1.141,
"step": 2018
},
{
"epoch": 0.7958610495195861,
"grad_norm": 0.6055519755469221,
"learning_rate": 2.4277710347284035e-06,
"loss": 1.1506,
"step": 2019
},
{
"epoch": 0.7962552352796255,
"grad_norm": 0.653125514461312,
"learning_rate": 2.4187863121205933e-06,
"loss": 1.1804,
"step": 2020
},
{
"epoch": 0.796649421039665,
"grad_norm": 0.6025409266602508,
"learning_rate": 2.409815957870597e-06,
"loss": 1.1893,
"step": 2021
},
{
"epoch": 0.7970436067997043,
"grad_norm": 0.6126866642525495,
"learning_rate": 2.400859988979555e-06,
"loss": 1.186,
"step": 2022
},
{
"epoch": 0.7974377925597438,
"grad_norm": 0.6286983033908643,
"learning_rate": 2.3919184224213354e-06,
"loss": 1.1655,
"step": 2023
},
{
"epoch": 0.7978319783197833,
"grad_norm": 0.5932553711308323,
"learning_rate": 2.3829912751425244e-06,
"loss": 1.1778,
"step": 2024
},
{
"epoch": 0.7982261640798226,
"grad_norm": 0.633166520052366,
"learning_rate": 2.374078564062364e-06,
"loss": 1.1589,
"step": 2025
},
{
"epoch": 0.7986203498398621,
"grad_norm": 0.6299341383892152,
"learning_rate": 2.3651803060727484e-06,
"loss": 1.1603,
"step": 2026
},
{
"epoch": 0.7990145355999014,
"grad_norm": 0.6223977799816698,
"learning_rate": 2.3562965180381746e-06,
"loss": 1.2036,
"step": 2027
},
{
"epoch": 0.7994087213599409,
"grad_norm": 0.6214882966307388,
"learning_rate": 2.3474272167957144e-06,
"loss": 1.1902,
"step": 2028
},
{
"epoch": 0.7998029071199803,
"grad_norm": 0.6261786382679704,
"learning_rate": 2.3385724191549807e-06,
"loss": 1.1596,
"step": 2029
},
{
"epoch": 0.8001970928800197,
"grad_norm": 0.6179261386167846,
"learning_rate": 2.3297321418981077e-06,
"loss": 1.1601,
"step": 2030
},
{
"epoch": 0.8005912786400591,
"grad_norm": 0.6067017257945441,
"learning_rate": 2.3209064017797014e-06,
"loss": 1.1052,
"step": 2031
},
{
"epoch": 0.8009854644000985,
"grad_norm": 0.6030346397003117,
"learning_rate": 2.312095215526814e-06,
"loss": 1.1272,
"step": 2032
},
{
"epoch": 0.801379650160138,
"grad_norm": 0.6187228819182855,
"learning_rate": 2.3032985998389236e-06,
"loss": 1.2039,
"step": 2033
},
{
"epoch": 0.8017738359201774,
"grad_norm": 0.6190809264452526,
"learning_rate": 2.29451657138789e-06,
"loss": 1.2414,
"step": 2034
},
{
"epoch": 0.8021680216802168,
"grad_norm": 0.6083179570546223,
"learning_rate": 2.285749146817924e-06,
"loss": 1.1508,
"step": 2035
},
{
"epoch": 0.8025622074402562,
"grad_norm": 0.5937926599332075,
"learning_rate": 2.2769963427455555e-06,
"loss": 1.0988,
"step": 2036
},
{
"epoch": 0.8029563932002957,
"grad_norm": 0.6173897531116277,
"learning_rate": 2.2682581757596144e-06,
"loss": 1.1962,
"step": 2037
},
{
"epoch": 0.803350578960335,
"grad_norm": 0.5854683327803459,
"learning_rate": 2.259534662421179e-06,
"loss": 1.1119,
"step": 2038
},
{
"epoch": 0.8037447647203745,
"grad_norm": 0.6170817511105888,
"learning_rate": 2.2508258192635614e-06,
"loss": 1.1889,
"step": 2039
},
{
"epoch": 0.804138950480414,
"grad_norm": 0.6159894762027561,
"learning_rate": 2.242131662792272e-06,
"loss": 1.1667,
"step": 2040
},
{
"epoch": 0.8045331362404533,
"grad_norm": 0.6118649548400591,
"learning_rate": 2.2334522094849798e-06,
"loss": 1.1371,
"step": 2041
},
{
"epoch": 0.8049273220004928,
"grad_norm": 0.6392916794711796,
"learning_rate": 2.2247874757914865e-06,
"loss": 1.1846,
"step": 2042
},
{
"epoch": 0.8053215077605321,
"grad_norm": 0.5941927210409212,
"learning_rate": 2.2161374781337084e-06,
"loss": 1.1291,
"step": 2043
},
{
"epoch": 0.8057156935205716,
"grad_norm": 0.6294242082032777,
"learning_rate": 2.2075022329056193e-06,
"loss": 1.2009,
"step": 2044
},
{
"epoch": 0.806109879280611,
"grad_norm": 0.6422605646655121,
"learning_rate": 2.198881756473238e-06,
"loss": 1.2299,
"step": 2045
},
{
"epoch": 0.8065040650406504,
"grad_norm": 0.6563848866016602,
"learning_rate": 2.190276065174596e-06,
"loss": 1.2258,
"step": 2046
},
{
"epoch": 0.8068982508006898,
"grad_norm": 0.6448012423504815,
"learning_rate": 2.1816851753197023e-06,
"loss": 1.1881,
"step": 2047
},
{
"epoch": 0.8072924365607292,
"grad_norm": 0.597728406050263,
"learning_rate": 2.1731091031905118e-06,
"loss": 1.1688,
"step": 2048
},
{
"epoch": 0.8076866223207687,
"grad_norm": 0.5886841825944683,
"learning_rate": 2.164547865040889e-06,
"loss": 1.124,
"step": 2049
},
{
"epoch": 0.8080808080808081,
"grad_norm": 0.6142796262742458,
"learning_rate": 2.156001477096601e-06,
"loss": 1.2032,
"step": 2050
},
{
"epoch": 0.8084749938408475,
"grad_norm": 0.6175251461681956,
"learning_rate": 2.1474699555552527e-06,
"loss": 1.1787,
"step": 2051
},
{
"epoch": 0.8088691796008869,
"grad_norm": 0.6139100518824416,
"learning_rate": 2.138953316586283e-06,
"loss": 1.1953,
"step": 2052
},
{
"epoch": 0.8092633653609264,
"grad_norm": 0.6430044371047359,
"learning_rate": 2.130451576330925e-06,
"loss": 1.2208,
"step": 2053
},
{
"epoch": 0.8096575511209657,
"grad_norm": 0.6111371447533479,
"learning_rate": 2.12196475090217e-06,
"loss": 1.1537,
"step": 2054
},
{
"epoch": 0.8100517368810052,
"grad_norm": 0.6150669801063049,
"learning_rate": 2.113492856384741e-06,
"loss": 1.1211,
"step": 2055
},
{
"epoch": 0.8104459226410446,
"grad_norm": 0.6290841991274971,
"learning_rate": 2.1050359088350724e-06,
"loss": 1.2084,
"step": 2056
},
{
"epoch": 0.810840108401084,
"grad_norm": 0.6053161669582096,
"learning_rate": 2.0965939242812594e-06,
"loss": 1.1343,
"step": 2057
},
{
"epoch": 0.8112342941611235,
"grad_norm": 0.623034572056998,
"learning_rate": 2.0881669187230415e-06,
"loss": 1.1616,
"step": 2058
},
{
"epoch": 0.8116284799211628,
"grad_norm": 0.6122769163475099,
"learning_rate": 2.0797549081317724e-06,
"loss": 1.1639,
"step": 2059
},
{
"epoch": 0.8120226656812023,
"grad_norm": 0.6241014032007793,
"learning_rate": 2.0713579084503877e-06,
"loss": 1.2213,
"step": 2060
},
{
"epoch": 0.8124168514412416,
"grad_norm": 0.6054665326241209,
"learning_rate": 2.0629759355933665e-06,
"loss": 1.183,
"step": 2061
},
{
"epoch": 0.8128110372012811,
"grad_norm": 0.6131850542325953,
"learning_rate": 2.0546090054467118e-06,
"loss": 1.1867,
"step": 2062
},
{
"epoch": 0.8132052229613205,
"grad_norm": 0.5905612318597147,
"learning_rate": 2.0462571338679204e-06,
"loss": 1.1652,
"step": 2063
},
{
"epoch": 0.8135994087213599,
"grad_norm": 0.6086745867605593,
"learning_rate": 2.0379203366859413e-06,
"loss": 1.1749,
"step": 2064
},
{
"epoch": 0.8139935944813994,
"grad_norm": 0.6547726012282458,
"learning_rate": 2.0295986297011603e-06,
"loss": 1.2606,
"step": 2065
},
{
"epoch": 0.8143877802414388,
"grad_norm": 0.6176365863473255,
"learning_rate": 2.0212920286853656e-06,
"loss": 1.1631,
"step": 2066
},
{
"epoch": 0.8147819660014782,
"grad_norm": 0.5969133841837041,
"learning_rate": 2.0130005493817063e-06,
"loss": 1.1818,
"step": 2067
},
{
"epoch": 0.8151761517615176,
"grad_norm": 0.6095137689005168,
"learning_rate": 2.004724207504675e-06,
"loss": 1.1147,
"step": 2068
},
{
"epoch": 0.815570337521557,
"grad_norm": 0.6149824366682144,
"learning_rate": 1.9964630187400834e-06,
"loss": 1.1667,
"step": 2069
},
{
"epoch": 0.8159645232815964,
"grad_norm": 0.6076416587106072,
"learning_rate": 1.988216998745014e-06,
"loss": 1.1657,
"step": 2070
},
{
"epoch": 0.8163587090416359,
"grad_norm": 0.6378102035141168,
"learning_rate": 1.9799861631478013e-06,
"loss": 1.1748,
"step": 2071
},
{
"epoch": 0.8167528948016753,
"grad_norm": 0.6018846786576992,
"learning_rate": 1.971770527548008e-06,
"loss": 1.1243,
"step": 2072
},
{
"epoch": 0.8171470805617147,
"grad_norm": 0.6072693290996355,
"learning_rate": 1.9635701075163884e-06,
"loss": 1.1456,
"step": 2073
},
{
"epoch": 0.8175412663217542,
"grad_norm": 0.6188901773945752,
"learning_rate": 1.9553849185948514e-06,
"loss": 1.2303,
"step": 2074
},
{
"epoch": 0.8179354520817935,
"grad_norm": 0.6652688896175301,
"learning_rate": 1.947214976296443e-06,
"loss": 1.2502,
"step": 2075
},
{
"epoch": 0.818329637841833,
"grad_norm": 0.6180903878734494,
"learning_rate": 1.9390602961053194e-06,
"loss": 1.156,
"step": 2076
},
{
"epoch": 0.8187238236018723,
"grad_norm": 0.6125254270472376,
"learning_rate": 1.930920893476701e-06,
"loss": 1.1941,
"step": 2077
},
{
"epoch": 0.8191180093619118,
"grad_norm": 0.623138908331946,
"learning_rate": 1.9227967838368566e-06,
"loss": 1.1965,
"step": 2078
},
{
"epoch": 0.8195121951219512,
"grad_norm": 0.615972707734638,
"learning_rate": 1.9146879825830753e-06,
"loss": 1.1691,
"step": 2079
},
{
"epoch": 0.8199063808819906,
"grad_norm": 0.6000820870368339,
"learning_rate": 1.9065945050836299e-06,
"loss": 1.1169,
"step": 2080
},
{
"epoch": 0.8203005666420301,
"grad_norm": 0.609742615231763,
"learning_rate": 1.8985163666777473e-06,
"loss": 1.1694,
"step": 2081
},
{
"epoch": 0.8206947524020695,
"grad_norm": 0.6200332366286192,
"learning_rate": 1.890453582675591e-06,
"loss": 1.1225,
"step": 2082
},
{
"epoch": 0.8210889381621089,
"grad_norm": 0.6145307042295974,
"learning_rate": 1.882406168358215e-06,
"loss": 1.1893,
"step": 2083
},
{
"epoch": 0.8214831239221483,
"grad_norm": 0.613663996359055,
"learning_rate": 1.8743741389775472e-06,
"loss": 1.2003,
"step": 2084
},
{
"epoch": 0.8218773096821878,
"grad_norm": 0.6163140729383925,
"learning_rate": 1.866357509756358e-06,
"loss": 1.1625,
"step": 2085
},
{
"epoch": 0.8222714954422271,
"grad_norm": 0.6093496583736225,
"learning_rate": 1.8583562958882329e-06,
"loss": 1.1604,
"step": 2086
},
{
"epoch": 0.8226656812022666,
"grad_norm": 0.6112581505765976,
"learning_rate": 1.8503705125375382e-06,
"loss": 1.12,
"step": 2087
},
{
"epoch": 0.823059866962306,
"grad_norm": 0.6187957102380715,
"learning_rate": 1.8424001748393905e-06,
"loss": 1.2006,
"step": 2088
},
{
"epoch": 0.8234540527223454,
"grad_norm": 0.6131303613972927,
"learning_rate": 1.8344452978996441e-06,
"loss": 1.1182,
"step": 2089
},
{
"epoch": 0.8238482384823849,
"grad_norm": 0.6096435231696508,
"learning_rate": 1.8265058967948434e-06,
"loss": 1.0993,
"step": 2090
},
{
"epoch": 0.8242424242424242,
"grad_norm": 0.6188414868551905,
"learning_rate": 1.818581986572201e-06,
"loss": 1.2266,
"step": 2091
},
{
"epoch": 0.8246366100024637,
"grad_norm": 0.6187428595993414,
"learning_rate": 1.8106735822495746e-06,
"loss": 1.2269,
"step": 2092
},
{
"epoch": 0.825030795762503,
"grad_norm": 0.6158407049072168,
"learning_rate": 1.8027806988154373e-06,
"loss": 1.1678,
"step": 2093
},
{
"epoch": 0.8254249815225425,
"grad_norm": 0.6274441437082312,
"learning_rate": 1.794903351228835e-06,
"loss": 1.2211,
"step": 2094
},
{
"epoch": 0.8258191672825819,
"grad_norm": 0.6161979943389017,
"learning_rate": 1.7870415544193808e-06,
"loss": 1.1381,
"step": 2095
},
{
"epoch": 0.8262133530426213,
"grad_norm": 0.6192811967277538,
"learning_rate": 1.7791953232872083e-06,
"loss": 1.1739,
"step": 2096
},
{
"epoch": 0.8266075388026608,
"grad_norm": 0.6261988603055474,
"learning_rate": 1.7713646727029476e-06,
"loss": 1.1864,
"step": 2097
},
{
"epoch": 0.8270017245627002,
"grad_norm": 0.6383885993657525,
"learning_rate": 1.7635496175077082e-06,
"loss": 1.1576,
"step": 2098
},
{
"epoch": 0.8273959103227396,
"grad_norm": 0.6401328645174053,
"learning_rate": 1.755750172513041e-06,
"loss": 1.1973,
"step": 2099
},
{
"epoch": 0.827790096082779,
"grad_norm": 0.6520856363314526,
"learning_rate": 1.747966352500904e-06,
"loss": 1.2282,
"step": 2100
},
{
"epoch": 0.8281842818428184,
"grad_norm": 0.6338910603246662,
"learning_rate": 1.7401981722236438e-06,
"loss": 1.175,
"step": 2101
},
{
"epoch": 0.8285784676028578,
"grad_norm": 0.614780711742896,
"learning_rate": 1.7324456464039751e-06,
"loss": 1.219,
"step": 2102
},
{
"epoch": 0.8289726533628973,
"grad_norm": 0.6320193678396515,
"learning_rate": 1.7247087897349334e-06,
"loss": 1.234,
"step": 2103
},
{
"epoch": 0.8293668391229367,
"grad_norm": 0.6148462845714023,
"learning_rate": 1.7169876168798561e-06,
"loss": 1.207,
"step": 2104
},
{
"epoch": 0.8297610248829761,
"grad_norm": 0.6183637939087829,
"learning_rate": 1.7092821424723637e-06,
"loss": 1.191,
"step": 2105
},
{
"epoch": 0.8301552106430156,
"grad_norm": 0.6242963838055702,
"learning_rate": 1.7015923811163225e-06,
"loss": 1.2022,
"step": 2106
},
{
"epoch": 0.8305493964030549,
"grad_norm": 0.5988324551990205,
"learning_rate": 1.6939183473858101e-06,
"loss": 1.1113,
"step": 2107
},
{
"epoch": 0.8309435821630944,
"grad_norm": 0.6110399627678608,
"learning_rate": 1.6862600558251097e-06,
"loss": 1.14,
"step": 2108
},
{
"epoch": 0.8313377679231337,
"grad_norm": 0.6048300072512719,
"learning_rate": 1.6786175209486565e-06,
"loss": 1.1364,
"step": 2109
},
{
"epoch": 0.8317319536831732,
"grad_norm": 0.6191088800533002,
"learning_rate": 1.6709907572410266e-06,
"loss": 1.1591,
"step": 2110
},
{
"epoch": 0.8321261394432126,
"grad_norm": 0.6374165341976098,
"learning_rate": 1.6633797791569085e-06,
"loss": 1.1927,
"step": 2111
},
{
"epoch": 0.832520325203252,
"grad_norm": 0.6047378641330573,
"learning_rate": 1.6557846011210753e-06,
"loss": 1.1895,
"step": 2112
},
{
"epoch": 0.8329145109632915,
"grad_norm": 0.6180978122031335,
"learning_rate": 1.6482052375283442e-06,
"loss": 1.1932,
"step": 2113
},
{
"epoch": 0.8333086967233309,
"grad_norm": 0.6187193373594739,
"learning_rate": 1.6406417027435728e-06,
"loss": 1.2001,
"step": 2114
},
{
"epoch": 0.8337028824833703,
"grad_norm": 0.6055455770427833,
"learning_rate": 1.6330940111016103e-06,
"loss": 1.2135,
"step": 2115
},
{
"epoch": 0.8340970682434097,
"grad_norm": 0.6226585371397162,
"learning_rate": 1.6255621769072805e-06,
"loss": 1.2023,
"step": 2116
},
{
"epoch": 0.8344912540034491,
"grad_norm": 0.5949274417124252,
"learning_rate": 1.6180462144353526e-06,
"loss": 1.1744,
"step": 2117
},
{
"epoch": 0.8348854397634885,
"grad_norm": 0.6339414631453146,
"learning_rate": 1.6105461379305187e-06,
"loss": 1.1836,
"step": 2118
},
{
"epoch": 0.835279625523528,
"grad_norm": 0.6095339519814128,
"learning_rate": 1.6030619616073628e-06,
"loss": 1.1468,
"step": 2119
},
{
"epoch": 0.8356738112835674,
"grad_norm": 0.6227699723957059,
"learning_rate": 1.5955936996503285e-06,
"loss": 1.1617,
"step": 2120
},
{
"epoch": 0.8360679970436068,
"grad_norm": 0.6058636715701863,
"learning_rate": 1.5881413662137047e-06,
"loss": 1.2089,
"step": 2121
},
{
"epoch": 0.8364621828036463,
"grad_norm": 0.6345005146493108,
"learning_rate": 1.580704975421584e-06,
"loss": 1.2159,
"step": 2122
},
{
"epoch": 0.8368563685636856,
"grad_norm": 0.6425121234333704,
"learning_rate": 1.5732845413678477e-06,
"loss": 1.1546,
"step": 2123
},
{
"epoch": 0.8372505543237251,
"grad_norm": 0.6217776321143101,
"learning_rate": 1.5658800781161365e-06,
"loss": 1.1201,
"step": 2124
},
{
"epoch": 0.8376447400837644,
"grad_norm": 0.6291793073582329,
"learning_rate": 1.5584915996998217e-06,
"loss": 1.2199,
"step": 2125
},
{
"epoch": 0.8380389258438039,
"grad_norm": 0.6413491306262445,
"learning_rate": 1.5511191201219733e-06,
"loss": 1.1387,
"step": 2126
},
{
"epoch": 0.8384331116038433,
"grad_norm": 0.5968787571090911,
"learning_rate": 1.5437626533553497e-06,
"loss": 1.1677,
"step": 2127
},
{
"epoch": 0.8388272973638827,
"grad_norm": 0.6266812335989616,
"learning_rate": 1.5364222133423523e-06,
"loss": 1.1488,
"step": 2128
},
{
"epoch": 0.8392214831239222,
"grad_norm": 0.6179499573451991,
"learning_rate": 1.5290978139950108e-06,
"loss": 1.1462,
"step": 2129
},
{
"epoch": 0.8396156688839616,
"grad_norm": 0.6020456787105313,
"learning_rate": 1.521789469194952e-06,
"loss": 1.1895,
"step": 2130
},
{
"epoch": 0.840009854644001,
"grad_norm": 0.6142152475528356,
"learning_rate": 1.514497192793377e-06,
"loss": 1.1928,
"step": 2131
},
{
"epoch": 0.8404040404040404,
"grad_norm": 0.6418120903036971,
"learning_rate": 1.5072209986110376e-06,
"loss": 1.1873,
"step": 2132
},
{
"epoch": 0.8407982261640798,
"grad_norm": 0.6022912765250543,
"learning_rate": 1.4999609004381944e-06,
"loss": 1.1693,
"step": 2133
},
{
"epoch": 0.8411924119241192,
"grad_norm": 0.6241117050709148,
"learning_rate": 1.492716912034614e-06,
"loss": 1.1556,
"step": 2134
},
{
"epoch": 0.8415865976841587,
"grad_norm": 0.6088366197098409,
"learning_rate": 1.4854890471295225e-06,
"loss": 1.2307,
"step": 2135
},
{
"epoch": 0.8419807834441981,
"grad_norm": 0.626345154331026,
"learning_rate": 1.4782773194215883e-06,
"loss": 1.1245,
"step": 2136
},
{
"epoch": 0.8423749692042375,
"grad_norm": 0.6214268575987325,
"learning_rate": 1.4710817425789015e-06,
"loss": 1.1974,
"step": 2137
},
{
"epoch": 0.842769154964277,
"grad_norm": 0.6157509713525812,
"learning_rate": 1.4639023302389366e-06,
"loss": 1.1889,
"step": 2138
},
{
"epoch": 0.8431633407243163,
"grad_norm": 0.6351261747898632,
"learning_rate": 1.4567390960085325e-06,
"loss": 1.1981,
"step": 2139
},
{
"epoch": 0.8435575264843558,
"grad_norm": 0.6067571512713051,
"learning_rate": 1.4495920534638741e-06,
"loss": 1.1582,
"step": 2140
},
{
"epoch": 0.8439517122443951,
"grad_norm": 0.607006794382876,
"learning_rate": 1.4424612161504482e-06,
"loss": 1.1623,
"step": 2141
},
{
"epoch": 0.8443458980044346,
"grad_norm": 0.5784739791881964,
"learning_rate": 1.435346597583034e-06,
"loss": 1.116,
"step": 2142
},
{
"epoch": 0.844740083764474,
"grad_norm": 0.6124576542474655,
"learning_rate": 1.4282482112456686e-06,
"loss": 1.1986,
"step": 2143
},
{
"epoch": 0.8451342695245134,
"grad_norm": 0.6311729127767527,
"learning_rate": 1.4211660705916286e-06,
"loss": 1.2564,
"step": 2144
},
{
"epoch": 0.8455284552845529,
"grad_norm": 0.6337920894968637,
"learning_rate": 1.4141001890434035e-06,
"loss": 1.2245,
"step": 2145
},
{
"epoch": 0.8459226410445922,
"grad_norm": 0.5962616813895122,
"learning_rate": 1.407050579992658e-06,
"loss": 1.1572,
"step": 2146
},
{
"epoch": 0.8463168268046317,
"grad_norm": 0.6077208562957639,
"learning_rate": 1.4000172568002268e-06,
"loss": 1.1588,
"step": 2147
},
{
"epoch": 0.8467110125646711,
"grad_norm": 0.6206827599971425,
"learning_rate": 1.3930002327960702e-06,
"loss": 1.2329,
"step": 2148
},
{
"epoch": 0.8471051983247105,
"grad_norm": 0.6031727874430762,
"learning_rate": 1.385999521279261e-06,
"loss": 1.1409,
"step": 2149
},
{
"epoch": 0.8474993840847499,
"grad_norm": 0.6034983041379499,
"learning_rate": 1.3790151355179581e-06,
"loss": 1.2088,
"step": 2150
},
{
"epoch": 0.8478935698447894,
"grad_norm": 0.5944921464470333,
"learning_rate": 1.372047088749372e-06,
"loss": 1.1279,
"step": 2151
},
{
"epoch": 0.8482877556048288,
"grad_norm": 0.6214516653434409,
"learning_rate": 1.365095394179754e-06,
"loss": 1.2763,
"step": 2152
},
{
"epoch": 0.8486819413648682,
"grad_norm": 0.6442848968344648,
"learning_rate": 1.3581600649843617e-06,
"loss": 1.2047,
"step": 2153
},
{
"epoch": 0.8490761271249077,
"grad_norm": 0.6069453066470716,
"learning_rate": 1.3512411143074333e-06,
"loss": 1.1663,
"step": 2154
},
{
"epoch": 0.849470312884947,
"grad_norm": 0.632212528850588,
"learning_rate": 1.344338555262168e-06,
"loss": 1.1797,
"step": 2155
},
{
"epoch": 0.8498644986449865,
"grad_norm": 0.6551418490552343,
"learning_rate": 1.3374524009306944e-06,
"loss": 1.2136,
"step": 2156
},
{
"epoch": 0.8502586844050258,
"grad_norm": 0.6182185289441392,
"learning_rate": 1.3305826643640552e-06,
"loss": 1.1878,
"step": 2157
},
{
"epoch": 0.8506528701650653,
"grad_norm": 0.6177346028571237,
"learning_rate": 1.3237293585821786e-06,
"loss": 1.1659,
"step": 2158
},
{
"epoch": 0.8510470559251047,
"grad_norm": 0.6174374468477092,
"learning_rate": 1.316892496573845e-06,
"loss": 1.1553,
"step": 2159
},
{
"epoch": 0.8514412416851441,
"grad_norm": 0.6130949007768408,
"learning_rate": 1.310072091296677e-06,
"loss": 1.1732,
"step": 2160
},
{
"epoch": 0.8518354274451836,
"grad_norm": 0.6061989244208447,
"learning_rate": 1.303268155677101e-06,
"loss": 1.1714,
"step": 2161
},
{
"epoch": 0.852229613205223,
"grad_norm": 0.6088152483466427,
"learning_rate": 1.296480702610332e-06,
"loss": 1.1614,
"step": 2162
},
{
"epoch": 0.8526237989652624,
"grad_norm": 0.6410096353876902,
"learning_rate": 1.2897097449603491e-06,
"loss": 1.243,
"step": 2163
},
{
"epoch": 0.8530179847253018,
"grad_norm": 0.6215005861175246,
"learning_rate": 1.2829552955598623e-06,
"loss": 1.2266,
"step": 2164
},
{
"epoch": 0.8534121704853412,
"grad_norm": 0.6308618646844184,
"learning_rate": 1.2762173672102996e-06,
"loss": 1.2355,
"step": 2165
},
{
"epoch": 0.8538063562453806,
"grad_norm": 0.611573077552191,
"learning_rate": 1.269495972681777e-06,
"loss": 1.1797,
"step": 2166
},
{
"epoch": 0.8542005420054201,
"grad_norm": 0.6275131772886295,
"learning_rate": 1.2627911247130709e-06,
"loss": 1.1919,
"step": 2167
},
{
"epoch": 0.8545947277654595,
"grad_norm": 0.5993315352532911,
"learning_rate": 1.2561028360116002e-06,
"loss": 1.1554,
"step": 2168
},
{
"epoch": 0.8549889135254989,
"grad_norm": 0.6007090422412275,
"learning_rate": 1.2494311192533958e-06,
"loss": 1.1593,
"step": 2169
},
{
"epoch": 0.8553830992855384,
"grad_norm": 0.6260215764887312,
"learning_rate": 1.242775987083088e-06,
"loss": 1.1785,
"step": 2170
},
{
"epoch": 0.8557772850455777,
"grad_norm": 0.6072634488679926,
"learning_rate": 1.2361374521138724e-06,
"loss": 1.1744,
"step": 2171
},
{
"epoch": 0.8561714708056172,
"grad_norm": 0.6121816712097319,
"learning_rate": 1.2295155269274827e-06,
"loss": 1.1959,
"step": 2172
},
{
"epoch": 0.8565656565656565,
"grad_norm": 0.60232884933228,
"learning_rate": 1.2229102240741819e-06,
"loss": 1.1909,
"step": 2173
},
{
"epoch": 0.856959842325696,
"grad_norm": 0.6219022324990678,
"learning_rate": 1.2163215560727215e-06,
"loss": 1.2573,
"step": 2174
},
{
"epoch": 0.8573540280857354,
"grad_norm": 0.6432583376483387,
"learning_rate": 1.2097495354103284e-06,
"loss": 1.153,
"step": 2175
},
{
"epoch": 0.8577482138457748,
"grad_norm": 0.6057914024761237,
"learning_rate": 1.2031941745426824e-06,
"loss": 1.1835,
"step": 2176
},
{
"epoch": 0.8581423996058143,
"grad_norm": 0.5896128109103955,
"learning_rate": 1.1966554858938805e-06,
"loss": 1.1695,
"step": 2177
},
{
"epoch": 0.8585365853658536,
"grad_norm": 0.611114769313689,
"learning_rate": 1.1901334818564291e-06,
"loss": 1.1891,
"step": 2178
},
{
"epoch": 0.8589307711258931,
"grad_norm": 0.6057440341466516,
"learning_rate": 1.1836281747912125e-06,
"loss": 1.1829,
"step": 2179
},
{
"epoch": 0.8593249568859325,
"grad_norm": 0.6070873449171827,
"learning_rate": 1.1771395770274653e-06,
"loss": 1.1444,
"step": 2180
},
{
"epoch": 0.8597191426459719,
"grad_norm": 0.6173928300019214,
"learning_rate": 1.1706677008627564e-06,
"loss": 1.1758,
"step": 2181
},
{
"epoch": 0.8601133284060113,
"grad_norm": 0.620761797942304,
"learning_rate": 1.1642125585629593e-06,
"loss": 1.2022,
"step": 2182
},
{
"epoch": 0.8605075141660508,
"grad_norm": 0.6296457077216101,
"learning_rate": 1.1577741623622407e-06,
"loss": 1.1907,
"step": 2183
},
{
"epoch": 0.8609016999260902,
"grad_norm": 0.6203549213795299,
"learning_rate": 1.1513525244630198e-06,
"loss": 1.2293,
"step": 2184
},
{
"epoch": 0.8612958856861296,
"grad_norm": 0.6120086583589758,
"learning_rate": 1.1449476570359608e-06,
"loss": 1.118,
"step": 2185
},
{
"epoch": 0.861690071446169,
"grad_norm": 0.6044150524885432,
"learning_rate": 1.1385595722199438e-06,
"loss": 1.1275,
"step": 2186
},
{
"epoch": 0.8620842572062084,
"grad_norm": 0.6216834948320731,
"learning_rate": 1.1321882821220375e-06,
"loss": 1.2583,
"step": 2187
},
{
"epoch": 0.8624784429662479,
"grad_norm": 0.6314861381611362,
"learning_rate": 1.1258337988174794e-06,
"loss": 1.1917,
"step": 2188
},
{
"epoch": 0.8628726287262872,
"grad_norm": 0.6086856686806165,
"learning_rate": 1.1194961343496603e-06,
"loss": 1.2272,
"step": 2189
},
{
"epoch": 0.8632668144863267,
"grad_norm": 0.5983542589167679,
"learning_rate": 1.1131753007300884e-06,
"loss": 1.1747,
"step": 2190
},
{
"epoch": 0.863661000246366,
"grad_norm": 0.6196216583286488,
"learning_rate": 1.1068713099383754e-06,
"loss": 1.1563,
"step": 2191
},
{
"epoch": 0.8640551860064055,
"grad_norm": 0.622973730967306,
"learning_rate": 1.1005841739222166e-06,
"loss": 1.1721,
"step": 2192
},
{
"epoch": 0.864449371766445,
"grad_norm": 0.6084922385739949,
"learning_rate": 1.094313904597355e-06,
"loss": 1.2149,
"step": 2193
},
{
"epoch": 0.8648435575264843,
"grad_norm": 0.6017658686517071,
"learning_rate": 1.0880605138475708e-06,
"loss": 1.1582,
"step": 2194
},
{
"epoch": 0.8652377432865238,
"grad_norm": 0.6242920242129635,
"learning_rate": 1.0818240135246528e-06,
"loss": 1.2032,
"step": 2195
},
{
"epoch": 0.8656319290465632,
"grad_norm": 0.627892199233753,
"learning_rate": 1.0756044154483813e-06,
"loss": 1.2027,
"step": 2196
},
{
"epoch": 0.8660261148066026,
"grad_norm": 0.630460438152927,
"learning_rate": 1.0694017314064997e-06,
"loss": 1.2043,
"step": 2197
},
{
"epoch": 0.866420300566642,
"grad_norm": 0.5912369379567544,
"learning_rate": 1.0632159731546965e-06,
"loss": 1.1947,
"step": 2198
},
{
"epoch": 0.8668144863266815,
"grad_norm": 0.6032500593156851,
"learning_rate": 1.057047152416585e-06,
"loss": 1.229,
"step": 2199
},
{
"epoch": 0.8672086720867209,
"grad_norm": 0.6224700658910649,
"learning_rate": 1.0508952808836682e-06,
"loss": 1.1966,
"step": 2200
},
{
"epoch": 0.8676028578467603,
"grad_norm": 0.5995356945189887,
"learning_rate": 1.044760370215333e-06,
"loss": 1.1371,
"step": 2201
},
{
"epoch": 0.8679970436067997,
"grad_norm": 0.6264831422167915,
"learning_rate": 1.038642432038821e-06,
"loss": 1.1853,
"step": 2202
},
{
"epoch": 0.8683912293668391,
"grad_norm": 0.6112624994424279,
"learning_rate": 1.0325414779492028e-06,
"loss": 1.1631,
"step": 2203
},
{
"epoch": 0.8687854151268786,
"grad_norm": 0.6028695555356325,
"learning_rate": 1.0264575195093628e-06,
"loss": 1.1203,
"step": 2204
},
{
"epoch": 0.8691796008869179,
"grad_norm": 0.5908979194467311,
"learning_rate": 1.020390568249976e-06,
"loss": 1.1464,
"step": 2205
},
{
"epoch": 0.8695737866469574,
"grad_norm": 0.6020405748750884,
"learning_rate": 1.0143406356694797e-06,
"loss": 1.1964,
"step": 2206
},
{
"epoch": 0.8699679724069967,
"grad_norm": 0.5976257450496796,
"learning_rate": 1.0083077332340563e-06,
"loss": 1.1588,
"step": 2207
},
{
"epoch": 0.8703621581670362,
"grad_norm": 0.5924445023992051,
"learning_rate": 1.0022918723776175e-06,
"loss": 1.1257,
"step": 2208
},
{
"epoch": 0.8707563439270757,
"grad_norm": 0.6344444392731119,
"learning_rate": 9.962930645017731e-07,
"loss": 1.1801,
"step": 2209
},
{
"epoch": 0.871150529687115,
"grad_norm": 0.6241397033723098,
"learning_rate": 9.903113209758098e-07,
"loss": 1.1347,
"step": 2210
},
{
"epoch": 0.8715447154471545,
"grad_norm": 0.6092226491641914,
"learning_rate": 9.843466531366774e-07,
"loss": 1.0919,
"step": 2211
},
{
"epoch": 0.8719389012071939,
"grad_norm": 0.6236635571749678,
"learning_rate": 9.783990722889658e-07,
"loss": 1.231,
"step": 2212
},
{
"epoch": 0.8723330869672333,
"grad_norm": 0.6136904911563315,
"learning_rate": 9.724685897048747e-07,
"loss": 1.2087,
"step": 2213
},
{
"epoch": 0.8727272727272727,
"grad_norm": 0.6091769703428004,
"learning_rate": 9.665552166241965e-07,
"loss": 1.1516,
"step": 2214
},
{
"epoch": 0.8731214584873122,
"grad_norm": 0.5777422885075877,
"learning_rate": 9.606589642543064e-07,
"loss": 1.1211,
"step": 2215
},
{
"epoch": 0.8735156442473516,
"grad_norm": 0.6279241245367188,
"learning_rate": 9.547798437701194e-07,
"loss": 1.1701,
"step": 2216
},
{
"epoch": 0.873909830007391,
"grad_norm": 0.6055169158607546,
"learning_rate": 9.489178663140897e-07,
"loss": 1.1508,
"step": 2217
},
{
"epoch": 0.8743040157674304,
"grad_norm": 0.6227455138805572,
"learning_rate": 9.43073042996181e-07,
"loss": 1.1853,
"step": 2218
},
{
"epoch": 0.8746982015274698,
"grad_norm": 0.6205644720521007,
"learning_rate": 9.372453848938401e-07,
"loss": 1.1604,
"step": 2219
},
{
"epoch": 0.8750923872875093,
"grad_norm": 0.5946939883094988,
"learning_rate": 9.314349030519843e-07,
"loss": 1.1243,
"step": 2220
},
{
"epoch": 0.8754865730475486,
"grad_norm": 0.6057397264443781,
"learning_rate": 9.256416084829778e-07,
"loss": 1.141,
"step": 2221
},
{
"epoch": 0.8758807588075881,
"grad_norm": 0.6080411686477221,
"learning_rate": 9.198655121666111e-07,
"loss": 1.1783,
"step": 2222
},
{
"epoch": 0.8762749445676274,
"grad_norm": 0.6005470900378805,
"learning_rate": 9.141066250500741e-07,
"loss": 1.147,
"step": 2223
},
{
"epoch": 0.8766691303276669,
"grad_norm": 0.5945362980985712,
"learning_rate": 9.083649580479493e-07,
"loss": 1.1036,
"step": 2224
},
{
"epoch": 0.8770633160877064,
"grad_norm": 0.6099070701658922,
"learning_rate": 9.026405220421785e-07,
"loss": 1.155,
"step": 2225
},
{
"epoch": 0.8774575018477457,
"grad_norm": 0.6137077181265143,
"learning_rate": 8.969333278820447e-07,
"loss": 1.1849,
"step": 2226
},
{
"epoch": 0.8778516876077852,
"grad_norm": 0.6082519323627844,
"learning_rate": 8.912433863841541e-07,
"loss": 1.1608,
"step": 2227
},
{
"epoch": 0.8782458733678246,
"grad_norm": 0.604418332046713,
"learning_rate": 8.855707083324183e-07,
"loss": 1.1366,
"step": 2228
},
{
"epoch": 0.878640059127864,
"grad_norm": 0.58974397331068,
"learning_rate": 8.799153044780229e-07,
"loss": 1.1366,
"step": 2229
},
{
"epoch": 0.8790342448879034,
"grad_norm": 0.652855576695134,
"learning_rate": 8.742771855394205e-07,
"loss": 1.2052,
"step": 2230
},
{
"epoch": 0.8794284306479428,
"grad_norm": 0.606150321404692,
"learning_rate": 8.686563622023059e-07,
"loss": 1.1637,
"step": 2231
},
{
"epoch": 0.8798226164079823,
"grad_norm": 0.5985881774469998,
"learning_rate": 8.630528451195874e-07,
"loss": 1.1659,
"step": 2232
},
{
"epoch": 0.8802168021680217,
"grad_norm": 0.6204340076356355,
"learning_rate": 8.574666449113766e-07,
"loss": 1.1584,
"step": 2233
},
{
"epoch": 0.8806109879280611,
"grad_norm": 0.6270054382615008,
"learning_rate": 8.518977721649679e-07,
"loss": 1.2141,
"step": 2234
},
{
"epoch": 0.8810051736881005,
"grad_norm": 0.6090284700913406,
"learning_rate": 8.46346237434813e-07,
"loss": 1.1922,
"step": 2235
},
{
"epoch": 0.88139935944814,
"grad_norm": 0.6667233953406846,
"learning_rate": 8.408120512425e-07,
"loss": 1.267,
"step": 2236
},
{
"epoch": 0.8817935452081793,
"grad_norm": 0.6099197043950569,
"learning_rate": 8.352952240767453e-07,
"loss": 1.1661,
"step": 2237
},
{
"epoch": 0.8821877309682188,
"grad_norm": 0.6353214535694008,
"learning_rate": 8.297957663933609e-07,
"loss": 1.2521,
"step": 2238
},
{
"epoch": 0.8825819167282581,
"grad_norm": 0.5822802452492017,
"learning_rate": 8.243136886152381e-07,
"loss": 1.1051,
"step": 2239
},
{
"epoch": 0.8829761024882976,
"grad_norm": 0.6024284924891233,
"learning_rate": 8.188490011323291e-07,
"loss": 1.1844,
"step": 2240
},
{
"epoch": 0.8833702882483371,
"grad_norm": 0.6218166801091192,
"learning_rate": 8.134017143016304e-07,
"loss": 1.2239,
"step": 2241
},
{
"epoch": 0.8837644740083764,
"grad_norm": 0.5982021682698988,
"learning_rate": 8.079718384471557e-07,
"loss": 1.1807,
"step": 2242
},
{
"epoch": 0.8841586597684159,
"grad_norm": 0.6167445078039492,
"learning_rate": 8.025593838599221e-07,
"loss": 1.1514,
"step": 2243
},
{
"epoch": 0.8845528455284553,
"grad_norm": 0.6267698758553212,
"learning_rate": 7.971643607979273e-07,
"loss": 1.1775,
"step": 2244
},
{
"epoch": 0.8849470312884947,
"grad_norm": 0.6007524051589882,
"learning_rate": 7.917867794861378e-07,
"loss": 1.1715,
"step": 2245
},
{
"epoch": 0.8853412170485341,
"grad_norm": 0.5867075125001983,
"learning_rate": 7.864266501164541e-07,
"loss": 1.142,
"step": 2246
},
{
"epoch": 0.8857354028085735,
"grad_norm": 0.6117682983819526,
"learning_rate": 7.810839828477101e-07,
"loss": 1.1969,
"step": 2247
},
{
"epoch": 0.886129588568613,
"grad_norm": 0.6205037469861255,
"learning_rate": 7.757587878056372e-07,
"loss": 1.2472,
"step": 2248
},
{
"epoch": 0.8865237743286524,
"grad_norm": 0.6737180765038134,
"learning_rate": 7.704510750828542e-07,
"loss": 1.2256,
"step": 2249
},
{
"epoch": 0.8869179600886918,
"grad_norm": 0.5977988152478557,
"learning_rate": 7.651608547388489e-07,
"loss": 1.2092,
"step": 2250
},
{
"epoch": 0.8873121458487312,
"grad_norm": 0.5870543672858427,
"learning_rate": 7.598881367999566e-07,
"loss": 1.1694,
"step": 2251
},
{
"epoch": 0.8877063316087707,
"grad_norm": 0.604325426385001,
"learning_rate": 7.546329312593382e-07,
"loss": 1.2068,
"step": 2252
},
{
"epoch": 0.88810051736881,
"grad_norm": 0.5858794646282535,
"learning_rate": 7.49395248076964e-07,
"loss": 1.1019,
"step": 2253
},
{
"epoch": 0.8884947031288495,
"grad_norm": 0.6284533960586269,
"learning_rate": 7.441750971795991e-07,
"loss": 1.1827,
"step": 2254
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.6073835508191624,
"learning_rate": 7.389724884607763e-07,
"loss": 1.1928,
"step": 2255
},
{
"epoch": 0.8892830746489283,
"grad_norm": 0.6052990860959455,
"learning_rate": 7.337874317807803e-07,
"loss": 1.1328,
"step": 2256
},
{
"epoch": 0.8896772604089678,
"grad_norm": 0.6426366735903185,
"learning_rate": 7.286199369666346e-07,
"loss": 1.184,
"step": 2257
},
{
"epoch": 0.8900714461690071,
"grad_norm": 0.6135398558082623,
"learning_rate": 7.234700138120776e-07,
"loss": 1.1567,
"step": 2258
},
{
"epoch": 0.8904656319290466,
"grad_norm": 0.611697930631017,
"learning_rate": 7.183376720775415e-07,
"loss": 1.1767,
"step": 2259
},
{
"epoch": 0.890859817689086,
"grad_norm": 0.6047260944980717,
"learning_rate": 7.13222921490142e-07,
"loss": 1.142,
"step": 2260
},
{
"epoch": 0.8912540034491254,
"grad_norm": 0.6137118230173922,
"learning_rate": 7.081257717436507e-07,
"loss": 1.2122,
"step": 2261
},
{
"epoch": 0.8916481892091648,
"grad_norm": 0.5862917774633897,
"learning_rate": 7.030462324984821e-07,
"loss": 1.1464,
"step": 2262
},
{
"epoch": 0.8920423749692042,
"grad_norm": 0.5985229585325247,
"learning_rate": 6.979843133816744e-07,
"loss": 1.1538,
"step": 2263
},
{
"epoch": 0.8924365607292437,
"grad_norm": 0.6215683066351476,
"learning_rate": 6.929400239868745e-07,
"loss": 1.2066,
"step": 2264
},
{
"epoch": 0.8928307464892831,
"grad_norm": 0.6090688114535339,
"learning_rate": 6.879133738743116e-07,
"loss": 1.1246,
"step": 2265
},
{
"epoch": 0.8932249322493225,
"grad_norm": 0.6182971626634737,
"learning_rate": 6.829043725707852e-07,
"loss": 1.1638,
"step": 2266
},
{
"epoch": 0.8936191180093619,
"grad_norm": 0.6028753226292936,
"learning_rate": 6.779130295696479e-07,
"loss": 1.15,
"step": 2267
},
{
"epoch": 0.8940133037694014,
"grad_norm": 0.6138311069551523,
"learning_rate": 6.729393543307838e-07,
"loss": 1.1561,
"step": 2268
},
{
"epoch": 0.8944074895294407,
"grad_norm": 0.5975576303249758,
"learning_rate": 6.679833562805882e-07,
"loss": 1.1286,
"step": 2269
},
{
"epoch": 0.8948016752894802,
"grad_norm": 0.6059495772680955,
"learning_rate": 6.630450448119618e-07,
"loss": 1.1959,
"step": 2270
},
{
"epoch": 0.8951958610495195,
"grad_norm": 0.6797523524629732,
"learning_rate": 6.581244292842792e-07,
"loss": 1.1897,
"step": 2271
},
{
"epoch": 0.895590046809559,
"grad_norm": 0.6011492268276885,
"learning_rate": 6.532215190233748e-07,
"loss": 1.1667,
"step": 2272
},
{
"epoch": 0.8959842325695985,
"grad_norm": 0.6084758528762907,
"learning_rate": 6.483363233215345e-07,
"loss": 1.1592,
"step": 2273
},
{
"epoch": 0.8963784183296378,
"grad_norm": 0.6140227903857725,
"learning_rate": 6.434688514374632e-07,
"loss": 1.1743,
"step": 2274
},
{
"epoch": 0.8967726040896773,
"grad_norm": 0.6351263385074363,
"learning_rate": 6.386191125962749e-07,
"loss": 1.1728,
"step": 2275
},
{
"epoch": 0.8971667898497167,
"grad_norm": 0.6076012012521917,
"learning_rate": 6.337871159894804e-07,
"loss": 1.1741,
"step": 2276
},
{
"epoch": 0.8975609756097561,
"grad_norm": 0.6197330976983585,
"learning_rate": 6.289728707749609e-07,
"loss": 1.1687,
"step": 2277
},
{
"epoch": 0.8979551613697955,
"grad_norm": 0.6046508571791555,
"learning_rate": 6.241763860769535e-07,
"loss": 1.1977,
"step": 2278
},
{
"epoch": 0.8983493471298349,
"grad_norm": 0.6131957231184164,
"learning_rate": 6.193976709860339e-07,
"loss": 1.2021,
"step": 2279
},
{
"epoch": 0.8987435328898744,
"grad_norm": 0.6067811323591332,
"learning_rate": 6.146367345591053e-07,
"loss": 1.1561,
"step": 2280
},
{
"epoch": 0.8991377186499138,
"grad_norm": 0.5867337322284969,
"learning_rate": 6.098935858193688e-07,
"loss": 1.1449,
"step": 2281
},
{
"epoch": 0.8995319044099532,
"grad_norm": 0.6098231620125031,
"learning_rate": 6.051682337563158e-07,
"loss": 1.1462,
"step": 2282
},
{
"epoch": 0.8999260901699926,
"grad_norm": 0.6230967996752504,
"learning_rate": 6.004606873257101e-07,
"loss": 1.1426,
"step": 2283
},
{
"epoch": 0.900320275930032,
"grad_norm": 0.5958004624605026,
"learning_rate": 5.957709554495683e-07,
"loss": 1.1797,
"step": 2284
},
{
"epoch": 0.9007144616900714,
"grad_norm": 0.6079824292132843,
"learning_rate": 5.910990470161416e-07,
"loss": 1.2281,
"step": 2285
},
{
"epoch": 0.9011086474501109,
"grad_norm": 0.5984385548123256,
"learning_rate": 5.864449708799059e-07,
"loss": 1.1619,
"step": 2286
},
{
"epoch": 0.9015028332101502,
"grad_norm": 0.591664056006518,
"learning_rate": 5.818087358615354e-07,
"loss": 1.139,
"step": 2287
},
{
"epoch": 0.9018970189701897,
"grad_norm": 0.6275372827109235,
"learning_rate": 5.771903507478915e-07,
"loss": 1.2364,
"step": 2288
},
{
"epoch": 0.9022912047302292,
"grad_norm": 0.5975540870267736,
"learning_rate": 5.725898242920092e-07,
"loss": 1.1527,
"step": 2289
},
{
"epoch": 0.9026853904902685,
"grad_norm": 0.6050375583531165,
"learning_rate": 5.680071652130736e-07,
"loss": 1.1666,
"step": 2290
},
{
"epoch": 0.903079576250308,
"grad_norm": 0.6259743502880166,
"learning_rate": 5.634423821964074e-07,
"loss": 1.2275,
"step": 2291
},
{
"epoch": 0.9034737620103473,
"grad_norm": 0.6231031649083622,
"learning_rate": 5.588954838934523e-07,
"loss": 1.1716,
"step": 2292
},
{
"epoch": 0.9038679477703868,
"grad_norm": 0.6216418043768527,
"learning_rate": 5.543664789217562e-07,
"loss": 1.1871,
"step": 2293
},
{
"epoch": 0.9042621335304262,
"grad_norm": 0.583945627934862,
"learning_rate": 5.498553758649516e-07,
"loss": 1.1614,
"step": 2294
},
{
"epoch": 0.9046563192904656,
"grad_norm": 0.5974644710894348,
"learning_rate": 5.45362183272743e-07,
"loss": 1.1295,
"step": 2295
},
{
"epoch": 0.9050505050505051,
"grad_norm": 0.579085452767809,
"learning_rate": 5.408869096608926e-07,
"loss": 1.1105,
"step": 2296
},
{
"epoch": 0.9054446908105445,
"grad_norm": 0.5929251833508978,
"learning_rate": 5.364295635112016e-07,
"loss": 1.1386,
"step": 2297
},
{
"epoch": 0.9058388765705839,
"grad_norm": 0.5974271999517115,
"learning_rate": 5.319901532714877e-07,
"loss": 1.142,
"step": 2298
},
{
"epoch": 0.9062330623306233,
"grad_norm": 0.6188389973115496,
"learning_rate": 5.27568687355583e-07,
"loss": 1.2045,
"step": 2299
},
{
"epoch": 0.9066272480906628,
"grad_norm": 0.6234466396061988,
"learning_rate": 5.231651741433063e-07,
"loss": 1.1656,
"step": 2300
},
{
"epoch": 0.9070214338507021,
"grad_norm": 0.6316349387146205,
"learning_rate": 5.187796219804508e-07,
"loss": 1.1759,
"step": 2301
},
{
"epoch": 0.9074156196107416,
"grad_norm": 0.6119904812276791,
"learning_rate": 5.144120391787732e-07,
"loss": 1.1648,
"step": 2302
},
{
"epoch": 0.9078098053707809,
"grad_norm": 0.5992707761677788,
"learning_rate": 5.100624340159676e-07,
"loss": 1.1705,
"step": 2303
},
{
"epoch": 0.9082039911308204,
"grad_norm": 0.6125355457119835,
"learning_rate": 5.057308147356632e-07,
"loss": 1.1878,
"step": 2304
},
{
"epoch": 0.9085981768908599,
"grad_norm": 0.5987001014690438,
"learning_rate": 5.014171895473929e-07,
"loss": 1.1728,
"step": 2305
},
{
"epoch": 0.9089923626508992,
"grad_norm": 0.6233596220905993,
"learning_rate": 4.971215666265939e-07,
"loss": 1.1682,
"step": 2306
},
{
"epoch": 0.9093865484109387,
"grad_norm": 0.6120680988346603,
"learning_rate": 4.928439541145802e-07,
"loss": 1.154,
"step": 2307
},
{
"epoch": 0.909780734170978,
"grad_norm": 0.6159172688282434,
"learning_rate": 4.885843601185291e-07,
"loss": 1.1545,
"step": 2308
},
{
"epoch": 0.9101749199310175,
"grad_norm": 0.6561541537105161,
"learning_rate": 4.843427927114752e-07,
"loss": 1.2581,
"step": 2309
},
{
"epoch": 0.9105691056910569,
"grad_norm": 0.6397314727277476,
"learning_rate": 4.801192599322835e-07,
"loss": 1.2649,
"step": 2310
},
{
"epoch": 0.9109632914510963,
"grad_norm": 0.5968063081167863,
"learning_rate": 4.759137697856364e-07,
"loss": 1.1411,
"step": 2311
},
{
"epoch": 0.9113574772111358,
"grad_norm": 0.6046846431473332,
"learning_rate": 4.717263302420283e-07,
"loss": 1.2202,
"step": 2312
},
{
"epoch": 0.9117516629711752,
"grad_norm": 0.6213044733495849,
"learning_rate": 4.675569492377363e-07,
"loss": 1.1844,
"step": 2313
},
{
"epoch": 0.9121458487312146,
"grad_norm": 0.6145028852257042,
"learning_rate": 4.634056346748117e-07,
"loss": 1.2235,
"step": 2314
},
{
"epoch": 0.912540034491254,
"grad_norm": 0.6041076227153636,
"learning_rate": 4.5927239442107306e-07,
"loss": 1.1794,
"step": 2315
},
{
"epoch": 0.9129342202512934,
"grad_norm": 0.5917377858853244,
"learning_rate": 4.551572363100731e-07,
"loss": 1.1421,
"step": 2316
},
{
"epoch": 0.9133284060113328,
"grad_norm": 0.57962701939227,
"learning_rate": 4.5106016814110197e-07,
"loss": 1.1574,
"step": 2317
},
{
"epoch": 0.9137225917713723,
"grad_norm": 0.6010271614392757,
"learning_rate": 4.469811976791605e-07,
"loss": 1.1287,
"step": 2318
},
{
"epoch": 0.9141167775314116,
"grad_norm": 0.6304038957433044,
"learning_rate": 4.429203326549525e-07,
"loss": 1.1971,
"step": 2319
},
{
"epoch": 0.9145109632914511,
"grad_norm": 0.6078465285882131,
"learning_rate": 4.3887758076486597e-07,
"loss": 1.175,
"step": 2320
},
{
"epoch": 0.9149051490514906,
"grad_norm": 0.6058022551406895,
"learning_rate": 4.3485294967095747e-07,
"loss": 1.1782,
"step": 2321
},
{
"epoch": 0.9152993348115299,
"grad_norm": 0.6222158541213707,
"learning_rate": 4.308464470009432e-07,
"loss": 1.2142,
"step": 2322
},
{
"epoch": 0.9156935205715694,
"grad_norm": 0.5967586046808354,
"learning_rate": 4.2685808034818366e-07,
"loss": 1.1787,
"step": 2323
},
{
"epoch": 0.9160877063316087,
"grad_norm": 0.6168581167404708,
"learning_rate": 4.228878572716588e-07,
"loss": 1.1771,
"step": 2324
},
{
"epoch": 0.9164818920916482,
"grad_norm": 0.6140349806295636,
"learning_rate": 4.189357852959708e-07,
"loss": 1.1865,
"step": 2325
},
{
"epoch": 0.9168760778516876,
"grad_norm": 0.616944566915736,
"learning_rate": 4.150018719113147e-07,
"loss": 1.0969,
"step": 2326
},
{
"epoch": 0.917270263611727,
"grad_norm": 0.6129659770559598,
"learning_rate": 4.110861245734721e-07,
"loss": 1.1765,
"step": 2327
},
{
"epoch": 0.9176644493717665,
"grad_norm": 0.6033445957652277,
"learning_rate": 4.0718855070379535e-07,
"loss": 1.2008,
"step": 2328
},
{
"epoch": 0.9180586351318059,
"grad_norm": 0.6190874106262034,
"learning_rate": 4.0330915768919454e-07,
"loss": 1.2122,
"step": 2329
},
{
"epoch": 0.9184528208918453,
"grad_norm": 0.6012965614913941,
"learning_rate": 3.9944795288212047e-07,
"loss": 1.1824,
"step": 2330
},
{
"epoch": 0.9188470066518847,
"grad_norm": 0.5999458716930699,
"learning_rate": 3.956049436005538e-07,
"loss": 1.1437,
"step": 2331
},
{
"epoch": 0.9192411924119241,
"grad_norm": 0.6010551580255399,
"learning_rate": 3.917801371279895e-07,
"loss": 1.1636,
"step": 2332
},
{
"epoch": 0.9196353781719635,
"grad_norm": 0.6265717559201462,
"learning_rate": 3.8797354071342443e-07,
"loss": 1.1524,
"step": 2333
},
{
"epoch": 0.920029563932003,
"grad_norm": 0.5933108670825852,
"learning_rate": 3.841851615713399e-07,
"loss": 1.1646,
"step": 2334
},
{
"epoch": 0.9204237496920423,
"grad_norm": 0.6057802305576383,
"learning_rate": 3.8041500688169253e-07,
"loss": 1.1538,
"step": 2335
},
{
"epoch": 0.9208179354520818,
"grad_norm": 0.6237793034270526,
"learning_rate": 3.766630837899032e-07,
"loss": 1.1886,
"step": 2336
},
{
"epoch": 0.9212121212121213,
"grad_norm": 0.6198812448884538,
"learning_rate": 3.729293994068306e-07,
"loss": 1.1955,
"step": 2337
},
{
"epoch": 0.9216063069721606,
"grad_norm": 0.6247300075084717,
"learning_rate": 3.6921396080877414e-07,
"loss": 1.2292,
"step": 2338
},
{
"epoch": 0.9220004927322001,
"grad_norm": 0.6062053891469021,
"learning_rate": 3.6551677503744776e-07,
"loss": 1.1789,
"step": 2339
},
{
"epoch": 0.9223946784922394,
"grad_norm": 0.6105135332217473,
"learning_rate": 3.618378490999719e-07,
"loss": 1.1439,
"step": 2340
},
{
"epoch": 0.9227888642522789,
"grad_norm": 0.5768948920273077,
"learning_rate": 3.581771899688646e-07,
"loss": 1.1398,
"step": 2341
},
{
"epoch": 0.9231830500123183,
"grad_norm": 0.6233702760949931,
"learning_rate": 3.545348045820174e-07,
"loss": 1.2338,
"step": 2342
},
{
"epoch": 0.9235772357723577,
"grad_norm": 0.6293178839378355,
"learning_rate": 3.5091069984269366e-07,
"loss": 1.284,
"step": 2343
},
{
"epoch": 0.9239714215323972,
"grad_norm": 0.6012639840259887,
"learning_rate": 3.473048826195058e-07,
"loss": 1.1688,
"step": 2344
},
{
"epoch": 0.9243656072924366,
"grad_norm": 0.6260153598558462,
"learning_rate": 3.4371735974641053e-07,
"loss": 1.2185,
"step": 2345
},
{
"epoch": 0.924759793052476,
"grad_norm": 0.6268091346400951,
"learning_rate": 3.40148138022689e-07,
"loss": 1.2058,
"step": 2346
},
{
"epoch": 0.9251539788125154,
"grad_norm": 0.6019494923660028,
"learning_rate": 3.365972242129378e-07,
"loss": 1.1248,
"step": 2347
},
{
"epoch": 0.9255481645725548,
"grad_norm": 0.6127790785927769,
"learning_rate": 3.3306462504705706e-07,
"loss": 1.1704,
"step": 2348
},
{
"epoch": 0.9259423503325942,
"grad_norm": 0.6434642793289438,
"learning_rate": 3.2955034722023214e-07,
"loss": 1.1639,
"step": 2349
},
{
"epoch": 0.9263365360926337,
"grad_norm": 0.6160741690407769,
"learning_rate": 3.2605439739292863e-07,
"loss": 1.181,
"step": 2350
},
{
"epoch": 0.926730721852673,
"grad_norm": 0.6040626337033564,
"learning_rate": 3.2257678219087543e-07,
"loss": 1.1359,
"step": 2351
},
{
"epoch": 0.9271249076127125,
"grad_norm": 0.6569697201075794,
"learning_rate": 3.191175082050502e-07,
"loss": 1.1525,
"step": 2352
},
{
"epoch": 0.927519093372752,
"grad_norm": 0.600173226578904,
"learning_rate": 3.156765819916696e-07,
"loss": 1.1436,
"step": 2353
},
{
"epoch": 0.9279132791327913,
"grad_norm": 0.5975529599005833,
"learning_rate": 3.122540100721794e-07,
"loss": 1.1291,
"step": 2354
},
{
"epoch": 0.9283074648928308,
"grad_norm": 0.6182461879570678,
"learning_rate": 3.088497989332351e-07,
"loss": 1.1686,
"step": 2355
},
{
"epoch": 0.9287016506528701,
"grad_norm": 0.6027606375147575,
"learning_rate": 3.05463955026698e-07,
"loss": 1.132,
"step": 2356
},
{
"epoch": 0.9290958364129096,
"grad_norm": 0.6211823263235605,
"learning_rate": 3.020964847696151e-07,
"loss": 1.2116,
"step": 2357
},
{
"epoch": 0.929490022172949,
"grad_norm": 0.8055569064292696,
"learning_rate": 2.987473945442143e-07,
"loss": 1.1802,
"step": 2358
},
{
"epoch": 0.9298842079329884,
"grad_norm": 0.63319663534154,
"learning_rate": 2.9541669069788505e-07,
"loss": 1.1735,
"step": 2359
},
{
"epoch": 0.9302783936930279,
"grad_norm": 0.6092240457871959,
"learning_rate": 2.9210437954316997e-07,
"loss": 1.1769,
"step": 2360
},
{
"epoch": 0.9306725794530673,
"grad_norm": 0.5994634449672462,
"learning_rate": 2.888104673577574e-07,
"loss": 1.1217,
"step": 2361
},
{
"epoch": 0.9310667652131067,
"grad_norm": 0.6129161824755393,
"learning_rate": 2.8553496038445707e-07,
"loss": 1.1949,
"step": 2362
},
{
"epoch": 0.9314609509731461,
"grad_norm": 0.5946581674891636,
"learning_rate": 2.8227786483120523e-07,
"loss": 1.1596,
"step": 2363
},
{
"epoch": 0.9318551367331855,
"grad_norm": 0.6220408417857064,
"learning_rate": 2.790391868710374e-07,
"loss": 1.1697,
"step": 2364
},
{
"epoch": 0.9322493224932249,
"grad_norm": 0.611301302747428,
"learning_rate": 2.7581893264208346e-07,
"loss": 1.1655,
"step": 2365
},
{
"epoch": 0.9326435082532644,
"grad_norm": 0.6303361299231326,
"learning_rate": 2.7261710824755814e-07,
"loss": 1.1762,
"step": 2366
},
{
"epoch": 0.9330376940133037,
"grad_norm": 0.613809194427214,
"learning_rate": 2.694337197557462e-07,
"loss": 1.217,
"step": 2367
},
{
"epoch": 0.9334318797733432,
"grad_norm": 0.5947788641950997,
"learning_rate": 2.66268773199988e-07,
"loss": 1.2082,
"step": 2368
},
{
"epoch": 0.9338260655333827,
"grad_norm": 0.6342184771933248,
"learning_rate": 2.631222745786788e-07,
"loss": 1.2426,
"step": 2369
},
{
"epoch": 0.934220251293422,
"grad_norm": 0.6238792637987063,
"learning_rate": 2.5999422985524157e-07,
"loss": 1.2235,
"step": 2370
},
{
"epoch": 0.9346144370534615,
"grad_norm": 0.6601808628731608,
"learning_rate": 2.5688464495813304e-07,
"loss": 1.2687,
"step": 2371
},
{
"epoch": 0.9350086228135008,
"grad_norm": 0.591780101499758,
"learning_rate": 2.537935257808177e-07,
"loss": 1.1727,
"step": 2372
},
{
"epoch": 0.9354028085735403,
"grad_norm": 0.6004908722208354,
"learning_rate": 2.507208781817638e-07,
"loss": 1.1644,
"step": 2373
},
{
"epoch": 0.9357969943335797,
"grad_norm": 0.6213717940339839,
"learning_rate": 2.4766670798443414e-07,
"loss": 1.1808,
"step": 2374
},
{
"epoch": 0.9361911800936191,
"grad_norm": 0.6088482849843166,
"learning_rate": 2.4463102097726843e-07,
"loss": 1.1679,
"step": 2375
},
{
"epoch": 0.9365853658536586,
"grad_norm": 0.5797582430181196,
"learning_rate": 2.4161382291367776e-07,
"loss": 1.1257,
"step": 2376
},
{
"epoch": 0.936979551613698,
"grad_norm": 0.619020334020193,
"learning_rate": 2.386151195120323e-07,
"loss": 1.1419,
"step": 2377
},
{
"epoch": 0.9373737373737374,
"grad_norm": 0.5946052196409608,
"learning_rate": 2.356349164556493e-07,
"loss": 1.1304,
"step": 2378
},
{
"epoch": 0.9377679231337768,
"grad_norm": 0.6091945754012382,
"learning_rate": 2.3267321939278277e-07,
"loss": 1.2201,
"step": 2379
},
{
"epoch": 0.9381621088938162,
"grad_norm": 0.6170932843567667,
"learning_rate": 2.2973003393661374e-07,
"loss": 1.2362,
"step": 2380
},
{
"epoch": 0.9385562946538556,
"grad_norm": 0.6012825687735323,
"learning_rate": 2.2680536566523802e-07,
"loss": 1.15,
"step": 2381
},
{
"epoch": 0.9389504804138951,
"grad_norm": 0.6255938234171833,
"learning_rate": 2.2389922012165944e-07,
"loss": 1.2223,
"step": 2382
},
{
"epoch": 0.9393446661739344,
"grad_norm": 0.5876733837374598,
"learning_rate": 2.2101160281377098e-07,
"loss": 1.141,
"step": 2383
},
{
"epoch": 0.9397388519339739,
"grad_norm": 0.6146179783064085,
"learning_rate": 2.1814251921435603e-07,
"loss": 1.1977,
"step": 2384
},
{
"epoch": 0.9401330376940134,
"grad_norm": 0.5988256998213285,
"learning_rate": 2.1529197476106821e-07,
"loss": 1.1755,
"step": 2385
},
{
"epoch": 0.9405272234540527,
"grad_norm": 0.619835334128145,
"learning_rate": 2.124599748564249e-07,
"loss": 1.1283,
"step": 2386
},
{
"epoch": 0.9409214092140922,
"grad_norm": 0.598162178135982,
"learning_rate": 2.0964652486779814e-07,
"loss": 1.1926,
"step": 2387
},
{
"epoch": 0.9413155949741315,
"grad_norm": 0.613362224923904,
"learning_rate": 2.0685163012740039e-07,
"loss": 1.1947,
"step": 2388
},
{
"epoch": 0.941709780734171,
"grad_norm": 0.5975727904035542,
"learning_rate": 2.0407529593228114e-07,
"loss": 1.1629,
"step": 2389
},
{
"epoch": 0.9421039664942104,
"grad_norm": 0.6139860108767166,
"learning_rate": 2.013175275443102e-07,
"loss": 1.2471,
"step": 2390
},
{
"epoch": 0.9424981522542498,
"grad_norm": 0.585425153613225,
"learning_rate": 1.9857833019017004e-07,
"loss": 1.0983,
"step": 2391
},
{
"epoch": 0.9428923380142893,
"grad_norm": 0.6118000826090201,
"learning_rate": 1.9585770906134671e-07,
"loss": 1.1331,
"step": 2392
},
{
"epoch": 0.9432865237743286,
"grad_norm": 0.5921590656780138,
"learning_rate": 1.9315566931412233e-07,
"loss": 1.1126,
"step": 2393
},
{
"epoch": 0.9436807095343681,
"grad_norm": 0.6165903484277372,
"learning_rate": 1.9047221606955713e-07,
"loss": 1.198,
"step": 2394
},
{
"epoch": 0.9440748952944075,
"grad_norm": 0.6368352242306206,
"learning_rate": 1.8780735441348842e-07,
"loss": 1.2699,
"step": 2395
},
{
"epoch": 0.9444690810544469,
"grad_norm": 0.6099076721349784,
"learning_rate": 1.8516108939651945e-07,
"loss": 1.2367,
"step": 2396
},
{
"epoch": 0.9448632668144863,
"grad_norm": 0.6085928656086841,
"learning_rate": 1.8253342603400503e-07,
"loss": 1.1395,
"step": 2397
},
{
"epoch": 0.9452574525745258,
"grad_norm": 0.6174687470746002,
"learning_rate": 1.7992436930604484e-07,
"loss": 1.1651,
"step": 2398
},
{
"epoch": 0.9456516383345651,
"grad_norm": 0.6129685190288655,
"learning_rate": 1.7733392415747452e-07,
"loss": 1.1806,
"step": 2399
},
{
"epoch": 0.9460458240946046,
"grad_norm": 0.5836621907525494,
"learning_rate": 1.7476209549785906e-07,
"loss": 1.1498,
"step": 2400
},
{
"epoch": 0.946440009854644,
"grad_norm": 0.5996938824902894,
"learning_rate": 1.7220888820147607e-07,
"loss": 1.1156,
"step": 2401
},
{
"epoch": 0.9468341956146834,
"grad_norm": 0.6162536454834876,
"learning_rate": 1.6967430710731258e-07,
"loss": 1.1963,
"step": 2402
},
{
"epoch": 0.9472283813747229,
"grad_norm": 0.6280127586386618,
"learning_rate": 1.6715835701905604e-07,
"loss": 1.2415,
"step": 2403
},
{
"epoch": 0.9476225671347622,
"grad_norm": 0.6202334141414314,
"learning_rate": 1.6466104270508099e-07,
"loss": 1.1966,
"step": 2404
},
{
"epoch": 0.9480167528948017,
"grad_norm": 0.6122489081297163,
"learning_rate": 1.6218236889844142e-07,
"loss": 1.1671,
"step": 2405
},
{
"epoch": 0.948410938654841,
"grad_norm": 0.6035232347033065,
"learning_rate": 1.5972234029686617e-07,
"loss": 1.0962,
"step": 2406
},
{
"epoch": 0.9488051244148805,
"grad_norm": 0.6496961489577563,
"learning_rate": 1.5728096156274353e-07,
"loss": 1.2318,
"step": 2407
},
{
"epoch": 0.94919931017492,
"grad_norm": 0.6147346192870907,
"learning_rate": 1.5485823732311777e-07,
"loss": 1.0982,
"step": 2408
},
{
"epoch": 0.9495934959349593,
"grad_norm": 0.6303713451636969,
"learning_rate": 1.5245417216967596e-07,
"loss": 1.2279,
"step": 2409
},
{
"epoch": 0.9499876816949988,
"grad_norm": 0.5889090939067558,
"learning_rate": 1.5006877065874338e-07,
"loss": 1.169,
"step": 2410
},
{
"epoch": 0.9503818674550382,
"grad_norm": 0.6019171279270943,
"learning_rate": 1.477020373112714e-07,
"loss": 1.1254,
"step": 2411
},
{
"epoch": 0.9507760532150776,
"grad_norm": 0.6157755932202649,
"learning_rate": 1.4535397661283092e-07,
"loss": 1.1132,
"step": 2412
},
{
"epoch": 0.951170238975117,
"grad_norm": 0.6132084756622929,
"learning_rate": 1.4302459301360428e-07,
"loss": 1.1932,
"step": 2413
},
{
"epoch": 0.9515644247351565,
"grad_norm": 0.6249158834646313,
"learning_rate": 1.4071389092837339e-07,
"loss": 1.2299,
"step": 2414
},
{
"epoch": 0.9519586104951958,
"grad_norm": 0.6183091225952251,
"learning_rate": 1.3842187473651626e-07,
"loss": 1.1556,
"step": 2415
},
{
"epoch": 0.9523527962552353,
"grad_norm": 0.5918073875966923,
"learning_rate": 1.3614854878199578e-07,
"loss": 1.1273,
"step": 2416
},
{
"epoch": 0.9527469820152747,
"grad_norm": 0.5982357040080991,
"learning_rate": 1.3389391737335112e-07,
"loss": 1.1114,
"step": 2417
},
{
"epoch": 0.9531411677753141,
"grad_norm": 0.5883507787023478,
"learning_rate": 1.3165798478369184e-07,
"loss": 1.1184,
"step": 2418
},
{
"epoch": 0.9535353535353536,
"grad_norm": 0.6182981301693431,
"learning_rate": 1.2944075525068712e-07,
"loss": 1.1803,
"step": 2419
},
{
"epoch": 0.9539295392953929,
"grad_norm": 0.6185455523897264,
"learning_rate": 1.272422329765588e-07,
"loss": 1.1795,
"step": 2420
},
{
"epoch": 0.9543237250554324,
"grad_norm": 0.6220883345091087,
"learning_rate": 1.2506242212807607e-07,
"loss": 1.2235,
"step": 2421
},
{
"epoch": 0.9547179108154717,
"grad_norm": 0.6098949505020008,
"learning_rate": 1.2290132683654087e-07,
"loss": 1.1566,
"step": 2422
},
{
"epoch": 0.9551120965755112,
"grad_norm": 0.6015695706886922,
"learning_rate": 1.2075895119779025e-07,
"loss": 1.1703,
"step": 2423
},
{
"epoch": 0.9555062823355507,
"grad_norm": 0.6332300803609152,
"learning_rate": 1.1863529927217731e-07,
"loss": 1.1943,
"step": 2424
},
{
"epoch": 0.95590046809559,
"grad_norm": 0.612260563852357,
"learning_rate": 1.1653037508457032e-07,
"loss": 1.1732,
"step": 2425
},
{
"epoch": 0.9562946538556295,
"grad_norm": 0.5999781512649874,
"learning_rate": 1.1444418262434587e-07,
"loss": 1.1752,
"step": 2426
},
{
"epoch": 0.9566888396156689,
"grad_norm": 0.6008667456915643,
"learning_rate": 1.1237672584537673e-07,
"loss": 1.1495,
"step": 2427
},
{
"epoch": 0.9570830253757083,
"grad_norm": 0.6153244050308969,
"learning_rate": 1.1032800866602633e-07,
"loss": 1.1937,
"step": 2428
},
{
"epoch": 0.9574772111357477,
"grad_norm": 0.5959829809552201,
"learning_rate": 1.0829803496914537e-07,
"loss": 1.1581,
"step": 2429
},
{
"epoch": 0.9578713968957872,
"grad_norm": 0.6077619966859046,
"learning_rate": 1.062868086020552e-07,
"loss": 1.1725,
"step": 2430
},
{
"epoch": 0.9582655826558265,
"grad_norm": 0.6047743581903363,
"learning_rate": 1.0429433337655115e-07,
"loss": 1.1331,
"step": 2431
},
{
"epoch": 0.958659768415866,
"grad_norm": 0.6201599918518463,
"learning_rate": 1.0232061306888918e-07,
"loss": 1.1858,
"step": 2432
},
{
"epoch": 0.9590539541759054,
"grad_norm": 0.6231710616869747,
"learning_rate": 1.0036565141977594e-07,
"loss": 1.2016,
"step": 2433
},
{
"epoch": 0.9594481399359448,
"grad_norm": 0.6448288343953715,
"learning_rate": 9.842945213437094e-08,
"loss": 1.2158,
"step": 2434
},
{
"epoch": 0.9598423256959843,
"grad_norm": 0.6167891303410092,
"learning_rate": 9.651201888227102e-08,
"loss": 1.1559,
"step": 2435
},
{
"epoch": 0.9602365114560236,
"grad_norm": 0.6038868590043498,
"learning_rate": 9.461335529750815e-08,
"loss": 1.1601,
"step": 2436
},
{
"epoch": 0.9606306972160631,
"grad_norm": 0.6077888775853522,
"learning_rate": 9.273346497854052e-08,
"loss": 1.1977,
"step": 2437
},
{
"epoch": 0.9610248829761024,
"grad_norm": 0.603082429453148,
"learning_rate": 9.08723514882437e-08,
"loss": 1.1205,
"step": 2438
},
{
"epoch": 0.9614190687361419,
"grad_norm": 0.6010255915248192,
"learning_rate": 8.903001835390946e-08,
"loss": 1.1565,
"step": 2439
},
{
"epoch": 0.9618132544961814,
"grad_norm": 0.5911163710697771,
"learning_rate": 8.720646906723585e-08,
"loss": 1.1529,
"step": 2440
},
{
"epoch": 0.9622074402562207,
"grad_norm": 0.6227655050280417,
"learning_rate": 8.540170708431716e-08,
"loss": 1.2165,
"step": 2441
},
{
"epoch": 0.9626016260162602,
"grad_norm": 0.626494521422824,
"learning_rate": 8.36157358256473e-08,
"loss": 1.2108,
"step": 2442
},
{
"epoch": 0.9629958117762996,
"grad_norm": 0.5903062085449574,
"learning_rate": 8.184855867609976e-08,
"loss": 1.1558,
"step": 2443
},
{
"epoch": 0.963389997536339,
"grad_norm": 0.6107447987815348,
"learning_rate": 8.010017898493316e-08,
"loss": 1.159,
"step": 2444
},
{
"epoch": 0.9637841832963784,
"grad_norm": 0.608930442078416,
"learning_rate": 7.837060006577801e-08,
"loss": 1.1968,
"step": 2445
},
{
"epoch": 0.9641783690564178,
"grad_norm": 0.594295975968586,
"learning_rate": 7.665982519663329e-08,
"loss": 1.1405,
"step": 2446
},
{
"epoch": 0.9645725548164572,
"grad_norm": 0.5973153401367114,
"learning_rate": 7.49678576198587e-08,
"loss": 1.1439,
"step": 2447
},
{
"epoch": 0.9649667405764967,
"grad_norm": 0.5985621492583797,
"learning_rate": 7.329470054217024e-08,
"loss": 1.1717,
"step": 2448
},
{
"epoch": 0.9653609263365361,
"grad_norm": 0.602845907873701,
"learning_rate": 7.164035713463358e-08,
"loss": 1.1579,
"step": 2449
},
{
"epoch": 0.9657551120965755,
"grad_norm": 0.6205834350913317,
"learning_rate": 7.000483053265506e-08,
"loss": 1.2058,
"step": 2450
},
{
"epoch": 0.966149297856615,
"grad_norm": 0.6363339379587928,
"learning_rate": 6.838812383597959e-08,
"loss": 1.2335,
"step": 2451
},
{
"epoch": 0.9665434836166543,
"grad_norm": 0.6717079440212176,
"learning_rate": 6.679024010868617e-08,
"loss": 1.1835,
"step": 2452
},
{
"epoch": 0.9669376693766938,
"grad_norm": 0.6013068431470037,
"learning_rate": 6.521118237917456e-08,
"loss": 1.1285,
"step": 2453
},
{
"epoch": 0.9673318551367331,
"grad_norm": 0.5951721146532576,
"learning_rate": 6.365095364016971e-08,
"loss": 1.1539,
"step": 2454
},
{
"epoch": 0.9677260408967726,
"grad_norm": 0.6658577073295611,
"learning_rate": 6.210955684870512e-08,
"loss": 1.2482,
"step": 2455
},
{
"epoch": 0.9681202266568121,
"grad_norm": 0.6300768133578355,
"learning_rate": 6.058699492612841e-08,
"loss": 1.2359,
"step": 2456
},
{
"epoch": 0.9685144124168514,
"grad_norm": 0.6082556264479969,
"learning_rate": 5.9083270758085733e-08,
"loss": 1.1134,
"step": 2457
},
{
"epoch": 0.9689085981768909,
"grad_norm": 0.6185300650907809,
"learning_rate": 5.759838719452404e-08,
"loss": 1.2206,
"step": 2458
},
{
"epoch": 0.9693027839369303,
"grad_norm": 0.6117970900606814,
"learning_rate": 5.6132347049679955e-08,
"loss": 1.1647,
"step": 2459
},
{
"epoch": 0.9696969696969697,
"grad_norm": 0.5976874867227856,
"learning_rate": 5.468515310207867e-08,
"loss": 1.1589,
"step": 2460
},
{
"epoch": 0.9700911554570091,
"grad_norm": 0.6304288708508361,
"learning_rate": 5.3256808094527266e-08,
"loss": 1.1898,
"step": 2461
},
{
"epoch": 0.9704853412170485,
"grad_norm": 0.6311672116169154,
"learning_rate": 5.184731473410698e-08,
"loss": 1.1659,
"step": 2462
},
{
"epoch": 0.9708795269770879,
"grad_norm": 0.58587149930154,
"learning_rate": 5.045667569217316e-08,
"loss": 1.1655,
"step": 2463
},
{
"epoch": 0.9712737127371274,
"grad_norm": 0.6010861112474221,
"learning_rate": 4.9084893604344205e-08,
"loss": 1.1392,
"step": 2464
},
{
"epoch": 0.9716678984971668,
"grad_norm": 0.6136708610607174,
"learning_rate": 4.7731971070503754e-08,
"loss": 1.1839,
"step": 2465
},
{
"epoch": 0.9720620842572062,
"grad_norm": 0.5941054001607767,
"learning_rate": 4.639791065478738e-08,
"loss": 1.1675,
"step": 2466
},
{
"epoch": 0.9724562700172457,
"grad_norm": 0.6082606108108427,
"learning_rate": 4.508271488558369e-08,
"loss": 1.1678,
"step": 2467
},
{
"epoch": 0.972850455777285,
"grad_norm": 0.63694043332642,
"learning_rate": 4.3786386255531e-08,
"loss": 1.2357,
"step": 2468
},
{
"epoch": 0.9732446415373245,
"grad_norm": 0.6218499921470892,
"learning_rate": 4.250892722150401e-08,
"loss": 1.1817,
"step": 2469
},
{
"epoch": 0.9736388272973638,
"grad_norm": 0.618351384803128,
"learning_rate": 4.1250340204619375e-08,
"loss": 1.1498,
"step": 2470
},
{
"epoch": 0.9740330130574033,
"grad_norm": 0.6221821265806511,
"learning_rate": 4.001062759022456e-08,
"loss": 1.1812,
"step": 2471
},
{
"epoch": 0.9744271988174428,
"grad_norm": 0.6350605796642136,
"learning_rate": 3.878979172789454e-08,
"loss": 1.2148,
"step": 2472
},
{
"epoch": 0.9748213845774821,
"grad_norm": 0.6203025166705224,
"learning_rate": 3.758783493142737e-08,
"loss": 1.1737,
"step": 2473
},
{
"epoch": 0.9752155703375216,
"grad_norm": 0.6008544551965036,
"learning_rate": 3.640475947884303e-08,
"loss": 1.1266,
"step": 2474
},
{
"epoch": 0.975609756097561,
"grad_norm": 0.6113341557887032,
"learning_rate": 3.5240567612375706e-08,
"loss": 1.2014,
"step": 2475
},
{
"epoch": 0.9760039418576004,
"grad_norm": 0.603617063644902,
"learning_rate": 3.4095261538468204e-08,
"loss": 1.166,
"step": 2476
},
{
"epoch": 0.9763981276176398,
"grad_norm": 0.6271623067160851,
"learning_rate": 3.2968843427770844e-08,
"loss": 1.201,
"step": 2477
},
{
"epoch": 0.9767923133776792,
"grad_norm": 0.5896479252918767,
"learning_rate": 3.186131541513926e-08,
"loss": 1.1689,
"step": 2478
},
{
"epoch": 0.9771864991377186,
"grad_norm": 0.6139597394243195,
"learning_rate": 3.0772679599623266e-08,
"loss": 1.1962,
"step": 2479
},
{
"epoch": 0.9775806848977581,
"grad_norm": 0.6298030226727921,
"learning_rate": 2.9702938044468e-08,
"loss": 1.1874,
"step": 2480
},
{
"epoch": 0.9779748706577975,
"grad_norm": 0.5794413704040846,
"learning_rate": 2.865209277711167e-08,
"loss": 1.1074,
"step": 2481
},
{
"epoch": 0.9783690564178369,
"grad_norm": 0.5885716516364036,
"learning_rate": 2.7620145789177823e-08,
"loss": 1.125,
"step": 2482
},
{
"epoch": 0.9787632421778764,
"grad_norm": 0.6320208790946613,
"learning_rate": 2.6607099036470853e-08,
"loss": 1.2337,
"step": 2483
},
{
"epoch": 0.9791574279379157,
"grad_norm": 0.6070406774043791,
"learning_rate": 2.5612954438977155e-08,
"loss": 1.1309,
"step": 2484
},
{
"epoch": 0.9795516136979552,
"grad_norm": 0.6061624110898025,
"learning_rate": 2.463771388085623e-08,
"loss": 1.161,
"step": 2485
},
{
"epoch": 0.9799457994579945,
"grad_norm": 0.6181129393801446,
"learning_rate": 2.368137921044289e-08,
"loss": 1.152,
"step": 2486
},
{
"epoch": 0.980339985218034,
"grad_norm": 0.6053023110866588,
"learning_rate": 2.274395224023618e-08,
"loss": 1.2039,
"step": 2487
},
{
"epoch": 0.9807341709780735,
"grad_norm": 0.611443540064316,
"learning_rate": 2.1825434746903794e-08,
"loss": 1.2308,
"step": 2488
},
{
"epoch": 0.9811283567381128,
"grad_norm": 0.5983940583235254,
"learning_rate": 2.0925828471272115e-08,
"loss": 1.1492,
"step": 2489
},
{
"epoch": 0.9815225424981523,
"grad_norm": 0.6070581145638013,
"learning_rate": 2.0045135118328397e-08,
"loss": 1.1946,
"step": 2490
},
{
"epoch": 0.9819167282581917,
"grad_norm": 0.6080141003498726,
"learning_rate": 1.9183356357215242e-08,
"loss": 1.1755,
"step": 2491
},
{
"epoch": 0.9823109140182311,
"grad_norm": 0.6183949984566449,
"learning_rate": 1.8340493821222827e-08,
"loss": 1.234,
"step": 2492
},
{
"epoch": 0.9827050997782705,
"grad_norm": 0.6158791546765815,
"learning_rate": 1.7516549107795543e-08,
"loss": 1.1807,
"step": 2493
},
{
"epoch": 0.9830992855383099,
"grad_norm": 0.6008031176354653,
"learning_rate": 1.671152377852092e-08,
"loss": 1.1555,
"step": 2494
},
{
"epoch": 0.9834934712983493,
"grad_norm": 0.6243823889960919,
"learning_rate": 1.5925419359130723e-08,
"loss": 1.1506,
"step": 2495
},
{
"epoch": 0.9838876570583888,
"grad_norm": 0.6092824290673818,
"learning_rate": 1.5158237339494285e-08,
"loss": 1.1245,
"step": 2496
},
{
"epoch": 0.9842818428184282,
"grad_norm": 0.6173876535957193,
"learning_rate": 1.4409979173620747e-08,
"loss": 1.1329,
"step": 2497
},
{
"epoch": 0.9846760285784676,
"grad_norm": 0.6081775209074783,
"learning_rate": 1.3680646279651266e-08,
"loss": 1.1479,
"step": 2498
},
{
"epoch": 0.985070214338507,
"grad_norm": 0.6202693029416111,
"learning_rate": 1.2970240039861248e-08,
"loss": 1.2072,
"step": 2499
},
{
"epoch": 0.9854644000985464,
"grad_norm": 0.6008870570624699,
"learning_rate": 1.2278761800653682e-08,
"loss": 1.1418,
"step": 2500
},
{
"epoch": 0.9858585858585859,
"grad_norm": 0.624028333998548,
"learning_rate": 1.1606212872559142e-08,
"loss": 1.2152,
"step": 2501
},
{
"epoch": 0.9862527716186252,
"grad_norm": 0.6239253652188765,
"learning_rate": 1.0952594530230232e-08,
"loss": 1.2422,
"step": 2502
},
{
"epoch": 0.9866469573786647,
"grad_norm": 0.6066337975290457,
"learning_rate": 1.0317908012442701e-08,
"loss": 1.1602,
"step": 2503
},
{
"epoch": 0.9870411431387042,
"grad_norm": 0.6377500814670377,
"learning_rate": 9.702154522092111e-09,
"loss": 1.2192,
"step": 2504
},
{
"epoch": 0.9874353288987435,
"grad_norm": 0.5987907515887436,
"learning_rate": 9.105335226190504e-09,
"loss": 1.1616,
"step": 2505
},
{
"epoch": 0.987829514658783,
"grad_norm": 0.6172014036158203,
"learning_rate": 8.527451255863073e-09,
"loss": 1.225,
"step": 2506
},
{
"epoch": 0.9882237004188223,
"grad_norm": 0.6077694286293223,
"learning_rate": 7.968503706350384e-09,
"loss": 1.1987,
"step": 2507
},
{
"epoch": 0.9886178861788618,
"grad_norm": 0.6138556064349517,
"learning_rate": 7.42849363700282e-09,
"loss": 1.1483,
"step": 2508
},
{
"epoch": 0.9890120719389012,
"grad_norm": 0.6120940708596503,
"learning_rate": 6.907422071278369e-09,
"loss": 1.1581,
"step": 2509
},
{
"epoch": 0.9894062576989406,
"grad_norm": 0.5962048270770236,
"learning_rate": 6.405289996741504e-09,
"loss": 1.1662,
"step": 2510
},
{
"epoch": 0.98980044345898,
"grad_norm": 0.6184599584147658,
"learning_rate": 5.922098365063189e-09,
"loss": 1.1495,
"step": 2511
},
{
"epoch": 0.9901946292190195,
"grad_norm": 0.6296776196488952,
"learning_rate": 5.457848092015328e-09,
"loss": 1.1905,
"step": 2512
},
{
"epoch": 0.9905888149790589,
"grad_norm": 0.6131588421344288,
"learning_rate": 5.012540057474091e-09,
"loss": 1.1818,
"step": 2513
},
{
"epoch": 0.9909830007390983,
"grad_norm": 0.5964517876857598,
"learning_rate": 4.586175105411039e-09,
"loss": 1.1824,
"step": 2514
},
{
"epoch": 0.9913771864991378,
"grad_norm": 0.6158891574168905,
"learning_rate": 4.178754043898669e-09,
"loss": 1.1601,
"step": 2515
},
{
"epoch": 0.9917713722591771,
"grad_norm": 0.6048009237523553,
"learning_rate": 3.790277645104867e-09,
"loss": 1.1299,
"step": 2516
},
{
"epoch": 0.9921655580192166,
"grad_norm": 0.6238556971612192,
"learning_rate": 3.420746645292905e-09,
"loss": 1.1244,
"step": 2517
},
{
"epoch": 0.9925597437792559,
"grad_norm": 0.6155143754125697,
"learning_rate": 3.0701617448203325e-09,
"loss": 1.1856,
"step": 2518
},
{
"epoch": 0.9929539295392954,
"grad_norm": 0.6156379383507039,
"learning_rate": 2.738523608135646e-09,
"loss": 1.1921,
"step": 2519
},
{
"epoch": 0.9933481152993349,
"grad_norm": 0.6287557362309201,
"learning_rate": 2.4258328637771776e-09,
"loss": 1.1696,
"step": 2520
},
{
"epoch": 0.9937423010593742,
"grad_norm": 0.6035984671210802,
"learning_rate": 2.1320901043764276e-09,
"loss": 1.1752,
"step": 2521
},
{
"epoch": 0.9941364868194137,
"grad_norm": 0.6095120389983935,
"learning_rate": 1.8572958866514e-09,
"loss": 1.1458,
"step": 2522
},
{
"epoch": 0.994530672579453,
"grad_norm": 0.7589305669134696,
"learning_rate": 1.6014507314077165e-09,
"loss": 1.1667,
"step": 2523
},
{
"epoch": 0.9949248583394925,
"grad_norm": 0.6114552923969634,
"learning_rate": 1.3645551235386134e-09,
"loss": 1.1621,
"step": 2524
},
{
"epoch": 0.9953190440995319,
"grad_norm": 0.6058392606138625,
"learning_rate": 1.1466095120216126e-09,
"loss": 1.1241,
"step": 2525
},
{
"epoch": 0.9957132298595713,
"grad_norm": 0.5936603980813377,
"learning_rate": 9.476143099207414e-10,
"loss": 1.1423,
"step": 2526
},
{
"epoch": 0.9961074156196107,
"grad_norm": 0.5977975525192136,
"learning_rate": 7.67569894382092e-10,
"loss": 1.1964,
"step": 2527
},
{
"epoch": 0.9965016013796502,
"grad_norm": 0.5957259774856952,
"learning_rate": 6.064766066382622e-10,
"loss": 1.1949,
"step": 2528
},
{
"epoch": 0.9968957871396896,
"grad_norm": 0.6094021396471523,
"learning_rate": 4.643347520005836e-10,
"loss": 1.2123,
"step": 2529
},
{
"epoch": 0.997289972899729,
"grad_norm": 0.6120542827325469,
"learning_rate": 3.4114459986689386e-10,
"loss": 1.1313,
"step": 2530
},
{
"epoch": 0.9976841586597684,
"grad_norm": 0.6151506851061069,
"learning_rate": 2.369063837115437e-10,
"loss": 1.2058,
"step": 2531
},
{
"epoch": 0.9980783444198078,
"grad_norm": 0.6008592003001969,
"learning_rate": 1.5162030109538982e-10,
"loss": 1.151,
"step": 2532
},
{
"epoch": 0.9984725301798473,
"grad_norm": 0.5857304461429403,
"learning_rate": 8.528651365580232e-11,
"loss": 1.1576,
"step": 2533
},
{
"epoch": 0.9988667159398866,
"grad_norm": 0.6021334182290597,
"learning_rate": 3.790514711332627e-11,
"loss": 1.195,
"step": 2534
},
{
"epoch": 0.9992609016999261,
"grad_norm": 0.6202010114249676,
"learning_rate": 9.476291268351035e-12,
"loss": 1.192,
"step": 2535
},
{
"epoch": 0.9996550874599656,
"grad_norm": 0.6201498827195971,
"learning_rate": 0.0,
"loss": 1.1993,
"step": 2536
},
{
"epoch": 0.9996550874599656,
"eval_loss": 1.168265700340271,
"eval_runtime": 2983.6589,
"eval_samples_per_second": 5.508,
"eval_steps_per_second": 0.689,
"step": 2536
},
{
"epoch": 0.9996550874599656,
"step": 2536,
"total_flos": 661690545340416.0,
"train_loss": 1.391600751820423,
"train_runtime": 151844.1268,
"train_samples_per_second": 1.069,
"train_steps_per_second": 0.017
}
],
"logging_steps": 1,
"max_steps": 2536,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 661690545340416.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}