Qwen2.5-1.5B-Open-R1-Distill / trainer_state.json
ZyKINvice's picture
Model save
ca14c8b verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 5437,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0009196247930844216,
"grad_norm": 4.087223679622462,
"learning_rate": 9.191176470588236e-07,
"loss": 1.3446,
"mean_token_accuracy": 0.6661458969116211,
"step": 5
},
{
"epoch": 0.0018392495861688431,
"grad_norm": 3.3376471514991324,
"learning_rate": 1.8382352941176471e-06,
"loss": 1.2534,
"mean_token_accuracy": 0.6856188654899598,
"step": 10
},
{
"epoch": 0.0027588743792532648,
"grad_norm": 3.1883807133419646,
"learning_rate": 2.7573529411764708e-06,
"loss": 1.2495,
"mean_token_accuracy": 0.6844112038612366,
"step": 15
},
{
"epoch": 0.0036784991723376862,
"grad_norm": 2.5757356327081826,
"learning_rate": 3.6764705882352942e-06,
"loss": 1.1962,
"mean_token_accuracy": 0.6918170928955079,
"step": 20
},
{
"epoch": 0.004598123965422108,
"grad_norm": 2.3971194855376092,
"learning_rate": 4.595588235294118e-06,
"loss": 1.2274,
"mean_token_accuracy": 0.6844529986381531,
"step": 25
},
{
"epoch": 0.0055177487585065296,
"grad_norm": 2.00434532423879,
"learning_rate": 5.5147058823529415e-06,
"loss": 1.1506,
"mean_token_accuracy": 0.697660756111145,
"step": 30
},
{
"epoch": 0.006437373551590951,
"grad_norm": 2.0663662496595543,
"learning_rate": 6.433823529411764e-06,
"loss": 1.1278,
"mean_token_accuracy": 0.6973050832748413,
"step": 35
},
{
"epoch": 0.0073569983446753725,
"grad_norm": 1.9519049901829761,
"learning_rate": 7.3529411764705884e-06,
"loss": 1.102,
"mean_token_accuracy": 0.7046478033065796,
"step": 40
},
{
"epoch": 0.008276623137759793,
"grad_norm": 1.8451875842176761,
"learning_rate": 8.272058823529413e-06,
"loss": 1.125,
"mean_token_accuracy": 0.6951346158981323,
"step": 45
},
{
"epoch": 0.009196247930844215,
"grad_norm": 2.000034845742239,
"learning_rate": 9.191176470588236e-06,
"loss": 1.0295,
"mean_token_accuracy": 0.7154734015464783,
"step": 50
},
{
"epoch": 0.010115872723928637,
"grad_norm": 1.621484821283711,
"learning_rate": 1.011029411764706e-05,
"loss": 1.0762,
"mean_token_accuracy": 0.706468117237091,
"step": 55
},
{
"epoch": 0.011035497517013059,
"grad_norm": 1.753826025706781,
"learning_rate": 1.1029411764705883e-05,
"loss": 1.0394,
"mean_token_accuracy": 0.7156139016151428,
"step": 60
},
{
"epoch": 0.011955122310097481,
"grad_norm": 1.6505676536191385,
"learning_rate": 1.1948529411764707e-05,
"loss": 1.0338,
"mean_token_accuracy": 0.7132004976272583,
"step": 65
},
{
"epoch": 0.012874747103181901,
"grad_norm": 1.8513933357249144,
"learning_rate": 1.2867647058823528e-05,
"loss": 0.9804,
"mean_token_accuracy": 0.7274341702461242,
"step": 70
},
{
"epoch": 0.013794371896266323,
"grad_norm": 2.4070230665851993,
"learning_rate": 1.3786764705882355e-05,
"loss": 1.0398,
"mean_token_accuracy": 0.7116599082946777,
"step": 75
},
{
"epoch": 0.014713996689350745,
"grad_norm": 1.798866895809756,
"learning_rate": 1.4705882352941177e-05,
"loss": 0.9922,
"mean_token_accuracy": 0.720504081249237,
"step": 80
},
{
"epoch": 0.015633621482435165,
"grad_norm": 1.709611126629724,
"learning_rate": 1.5625e-05,
"loss": 0.9938,
"mean_token_accuracy": 0.7247263193130493,
"step": 85
},
{
"epoch": 0.016553246275519587,
"grad_norm": 1.7626425485303618,
"learning_rate": 1.6544117647058825e-05,
"loss": 1.0122,
"mean_token_accuracy": 0.717292582988739,
"step": 90
},
{
"epoch": 0.01747287106860401,
"grad_norm": 2.036503882503329,
"learning_rate": 1.7463235294117647e-05,
"loss": 1.0109,
"mean_token_accuracy": 0.7172105073928833,
"step": 95
},
{
"epoch": 0.01839249586168843,
"grad_norm": 1.927409741133158,
"learning_rate": 1.8382352941176472e-05,
"loss": 1.0434,
"mean_token_accuracy": 0.7078547954559327,
"step": 100
},
{
"epoch": 0.019312120654772853,
"grad_norm": 2.079665033278075,
"learning_rate": 1.9301470588235298e-05,
"loss": 0.9959,
"mean_token_accuracy": 0.7182355523109436,
"step": 105
},
{
"epoch": 0.020231745447857274,
"grad_norm": 1.8479982769163703,
"learning_rate": 2.022058823529412e-05,
"loss": 1.0194,
"mean_token_accuracy": 0.7173629522323608,
"step": 110
},
{
"epoch": 0.021151370240941696,
"grad_norm": 1.831806807070413,
"learning_rate": 2.113970588235294e-05,
"loss": 0.9569,
"mean_token_accuracy": 0.7312556385993958,
"step": 115
},
{
"epoch": 0.022070995034026118,
"grad_norm": 1.7952413093248756,
"learning_rate": 2.2058823529411766e-05,
"loss": 1.0149,
"mean_token_accuracy": 0.7192024111747741,
"step": 120
},
{
"epoch": 0.02299061982711054,
"grad_norm": 1.6441769080980864,
"learning_rate": 2.2977941176470588e-05,
"loss": 0.9668,
"mean_token_accuracy": 0.7280102610588074,
"step": 125
},
{
"epoch": 0.023910244620194962,
"grad_norm": 1.7182187182460715,
"learning_rate": 2.3897058823529413e-05,
"loss": 1.025,
"mean_token_accuracy": 0.7164386153221131,
"step": 130
},
{
"epoch": 0.02482986941327938,
"grad_norm": 1.7665031820505241,
"learning_rate": 2.4816176470588238e-05,
"loss": 0.9879,
"mean_token_accuracy": 0.7216517567634583,
"step": 135
},
{
"epoch": 0.025749494206363802,
"grad_norm": 1.65781753659198,
"learning_rate": 2.5735294117647057e-05,
"loss": 1.0204,
"mean_token_accuracy": 0.7183511853218079,
"step": 140
},
{
"epoch": 0.026669118999448224,
"grad_norm": 1.5947996494100198,
"learning_rate": 2.6654411764705882e-05,
"loss": 0.9915,
"mean_token_accuracy": 0.7210009098052979,
"step": 145
},
{
"epoch": 0.027588743792532646,
"grad_norm": 1.6195741488866147,
"learning_rate": 2.757352941176471e-05,
"loss": 0.9609,
"mean_token_accuracy": 0.7290344476699829,
"step": 150
},
{
"epoch": 0.028508368585617068,
"grad_norm": 1.700795937176488,
"learning_rate": 2.849264705882353e-05,
"loss": 1.0017,
"mean_token_accuracy": 0.7190845251083374,
"step": 155
},
{
"epoch": 0.02942799337870149,
"grad_norm": 1.6626957868958252,
"learning_rate": 2.9411764705882354e-05,
"loss": 0.9801,
"mean_token_accuracy": 0.7264268517494201,
"step": 160
},
{
"epoch": 0.03034761817178591,
"grad_norm": 1.646176772035618,
"learning_rate": 3.0330882352941176e-05,
"loss": 0.9819,
"mean_token_accuracy": 0.7258347868919373,
"step": 165
},
{
"epoch": 0.03126724296487033,
"grad_norm": 1.7051406597026453,
"learning_rate": 3.125e-05,
"loss": 1.0021,
"mean_token_accuracy": 0.7193678379058838,
"step": 170
},
{
"epoch": 0.032186867757954755,
"grad_norm": 1.6583599673202631,
"learning_rate": 3.2169117647058826e-05,
"loss": 0.9863,
"mean_token_accuracy": 0.7218608260154724,
"step": 175
},
{
"epoch": 0.033106492551039174,
"grad_norm": 1.6811054631655953,
"learning_rate": 3.308823529411765e-05,
"loss": 0.9776,
"mean_token_accuracy": 0.7252245903015136,
"step": 180
},
{
"epoch": 0.0340261173441236,
"grad_norm": 1.6005295960642778,
"learning_rate": 3.4007352941176476e-05,
"loss": 0.952,
"mean_token_accuracy": 0.7300998091697692,
"step": 185
},
{
"epoch": 0.03494574213720802,
"grad_norm": 1.884741061084924,
"learning_rate": 3.4926470588235294e-05,
"loss": 1.0216,
"mean_token_accuracy": 0.7144460439682007,
"step": 190
},
{
"epoch": 0.03586536693029244,
"grad_norm": 1.61333499821342,
"learning_rate": 3.584558823529412e-05,
"loss": 1.0067,
"mean_token_accuracy": 0.7160724878311158,
"step": 195
},
{
"epoch": 0.03678499172337686,
"grad_norm": 1.592957572722435,
"learning_rate": 3.6764705882352945e-05,
"loss": 0.9367,
"mean_token_accuracy": 0.7348474979400634,
"step": 200
},
{
"epoch": 0.03770461651646129,
"grad_norm": 1.7666690880786284,
"learning_rate": 3.768382352941176e-05,
"loss": 0.9545,
"mean_token_accuracy": 0.7297826528549194,
"step": 205
},
{
"epoch": 0.038624241309545705,
"grad_norm": 1.5696177739032589,
"learning_rate": 3.8602941176470595e-05,
"loss": 1.0076,
"mean_token_accuracy": 0.7160616636276245,
"step": 210
},
{
"epoch": 0.039543866102630124,
"grad_norm": 1.5375849975431441,
"learning_rate": 3.952205882352941e-05,
"loss": 1.0082,
"mean_token_accuracy": 0.7139402985572815,
"step": 215
},
{
"epoch": 0.04046349089571455,
"grad_norm": 1.6613621558577687,
"learning_rate": 4.044117647058824e-05,
"loss": 1.0047,
"mean_token_accuracy": 0.7157810091972351,
"step": 220
},
{
"epoch": 0.04138311568879897,
"grad_norm": 1.6712866586887962,
"learning_rate": 4.136029411764706e-05,
"loss": 0.9841,
"mean_token_accuracy": 0.7261144757270813,
"step": 225
},
{
"epoch": 0.04230274048188339,
"grad_norm": 1.5868739813391535,
"learning_rate": 4.227941176470588e-05,
"loss": 1.0063,
"mean_token_accuracy": 0.7146228194236756,
"step": 230
},
{
"epoch": 0.04322236527496781,
"grad_norm": 1.4745940440239442,
"learning_rate": 4.319852941176471e-05,
"loss": 0.9895,
"mean_token_accuracy": 0.7205227255821228,
"step": 235
},
{
"epoch": 0.044141990068052236,
"grad_norm": 1.565812920746474,
"learning_rate": 4.411764705882353e-05,
"loss": 0.9883,
"mean_token_accuracy": 0.7221224546432495,
"step": 240
},
{
"epoch": 0.045061614861136655,
"grad_norm": 1.579279007990175,
"learning_rate": 4.503676470588236e-05,
"loss": 1.0339,
"mean_token_accuracy": 0.7140692472457886,
"step": 245
},
{
"epoch": 0.04598123965422108,
"grad_norm": 1.550674625710887,
"learning_rate": 4.5955882352941176e-05,
"loss": 1.009,
"mean_token_accuracy": 0.717827045917511,
"step": 250
},
{
"epoch": 0.0469008644473055,
"grad_norm": 1.494069442893164,
"learning_rate": 4.6875e-05,
"loss": 1.0163,
"mean_token_accuracy": 0.7157993316650391,
"step": 255
},
{
"epoch": 0.047820489240389924,
"grad_norm": 1.585433590429472,
"learning_rate": 4.7794117647058826e-05,
"loss": 0.9662,
"mean_token_accuracy": 0.7260660767555237,
"step": 260
},
{
"epoch": 0.04874011403347434,
"grad_norm": 1.5561077784742092,
"learning_rate": 4.871323529411765e-05,
"loss": 1.0521,
"mean_token_accuracy": 0.7059531569480896,
"step": 265
},
{
"epoch": 0.04965973882655876,
"grad_norm": 1.3842507274813078,
"learning_rate": 4.9632352941176476e-05,
"loss": 0.96,
"mean_token_accuracy": 0.7317641496658325,
"step": 270
},
{
"epoch": 0.050579363619643186,
"grad_norm": 1.4379239878799341,
"learning_rate": 4.999996254118754e-05,
"loss": 0.972,
"mean_token_accuracy": 0.7297493696212769,
"step": 275
},
{
"epoch": 0.051498988412727605,
"grad_norm": 1.3761784967587591,
"learning_rate": 4.999973362667417e-05,
"loss": 0.9844,
"mean_token_accuracy": 0.724224853515625,
"step": 280
},
{
"epoch": 0.05241861320581203,
"grad_norm": 1.4249636066532947,
"learning_rate": 4.999929661021346e-05,
"loss": 0.9974,
"mean_token_accuracy": 0.7186186075210571,
"step": 285
},
{
"epoch": 0.05333823799889645,
"grad_norm": 1.6467747117004,
"learning_rate": 4.9998651495847435e-05,
"loss": 1.0296,
"mean_token_accuracy": 0.7110173583030701,
"step": 290
},
{
"epoch": 0.054257862791980874,
"grad_norm": 1.3761801455599358,
"learning_rate": 4.9997798289542816e-05,
"loss": 1.0209,
"mean_token_accuracy": 0.7124481081962586,
"step": 295
},
{
"epoch": 0.05517748758506529,
"grad_norm": 1.4585308096786376,
"learning_rate": 4.9996736999190965e-05,
"loss": 1.0248,
"mean_token_accuracy": 0.7100600242614746,
"step": 300
},
{
"epoch": 0.05609711237814972,
"grad_norm": 1.4301378065367794,
"learning_rate": 4.999546763460785e-05,
"loss": 0.9864,
"mean_token_accuracy": 0.7253738522529602,
"step": 305
},
{
"epoch": 0.057016737171234136,
"grad_norm": 1.4586102770676173,
"learning_rate": 4.999399020753393e-05,
"loss": 0.9541,
"mean_token_accuracy": 0.7308779239654541,
"step": 310
},
{
"epoch": 0.05793636196431856,
"grad_norm": 1.5007400960218442,
"learning_rate": 4.999230473163406e-05,
"loss": 1.0123,
"mean_token_accuracy": 0.7142405152320862,
"step": 315
},
{
"epoch": 0.05885598675740298,
"grad_norm": 1.4247385882584611,
"learning_rate": 4.999041122249735e-05,
"loss": 1.0097,
"mean_token_accuracy": 0.7164065957069397,
"step": 320
},
{
"epoch": 0.0597756115504874,
"grad_norm": 1.4338281584111965,
"learning_rate": 4.9988309697637025e-05,
"loss": 1.0381,
"mean_token_accuracy": 0.7093045115470886,
"step": 325
},
{
"epoch": 0.06069523634357182,
"grad_norm": 1.3206321897141915,
"learning_rate": 4.9986000176490264e-05,
"loss": 1.0378,
"mean_token_accuracy": 0.7081658363342285,
"step": 330
},
{
"epoch": 0.06161486113665624,
"grad_norm": 1.4771390057019052,
"learning_rate": 4.998348268041803e-05,
"loss": 1.0473,
"mean_token_accuracy": 0.7044042825698853,
"step": 335
},
{
"epoch": 0.06253448592974066,
"grad_norm": 1.410427294901373,
"learning_rate": 4.9980757232704836e-05,
"loss": 1.0476,
"mean_token_accuracy": 0.7044672727584839,
"step": 340
},
{
"epoch": 0.06345411072282509,
"grad_norm": 1.293731368317575,
"learning_rate": 4.997782385855862e-05,
"loss": 0.9809,
"mean_token_accuracy": 0.7207650065422058,
"step": 345
},
{
"epoch": 0.06437373551590951,
"grad_norm": 1.373213488697433,
"learning_rate": 4.9974682585110375e-05,
"loss": 1.0238,
"mean_token_accuracy": 0.713714337348938,
"step": 350
},
{
"epoch": 0.06529336030899394,
"grad_norm": 1.4173612737543944,
"learning_rate": 4.997133344141402e-05,
"loss": 0.9995,
"mean_token_accuracy": 0.7182128310203553,
"step": 355
},
{
"epoch": 0.06621298510207835,
"grad_norm": 1.4208487527297817,
"learning_rate": 4.9967776458446067e-05,
"loss": 1.0247,
"mean_token_accuracy": 0.7120985150337219,
"step": 360
},
{
"epoch": 0.06713260989516277,
"grad_norm": 1.3468936690832556,
"learning_rate": 4.996401166910535e-05,
"loss": 1.0257,
"mean_token_accuracy": 0.711448609828949,
"step": 365
},
{
"epoch": 0.0680522346882472,
"grad_norm": 1.3418384776624692,
"learning_rate": 4.996003910821273e-05,
"loss": 0.9908,
"mean_token_accuracy": 0.7198069810867309,
"step": 370
},
{
"epoch": 0.06897185948133161,
"grad_norm": 1.2757020291626893,
"learning_rate": 4.995585881251076e-05,
"loss": 1.0029,
"mean_token_accuracy": 0.7165916681289672,
"step": 375
},
{
"epoch": 0.06989148427441604,
"grad_norm": 1.2215136508098425,
"learning_rate": 4.995147082066335e-05,
"loss": 1.0071,
"mean_token_accuracy": 0.7161303281784057,
"step": 380
},
{
"epoch": 0.07081110906750046,
"grad_norm": 1.5100364277085054,
"learning_rate": 4.9946875173255405e-05,
"loss": 0.9808,
"mean_token_accuracy": 0.7223702430725097,
"step": 385
},
{
"epoch": 0.07173073386058489,
"grad_norm": 1.3193074150499653,
"learning_rate": 4.9942071912792463e-05,
"loss": 0.9692,
"mean_token_accuracy": 0.7253165245056152,
"step": 390
},
{
"epoch": 0.0726503586536693,
"grad_norm": 1.360795639773644,
"learning_rate": 4.9937061083700286e-05,
"loss": 0.9248,
"mean_token_accuracy": 0.738149356842041,
"step": 395
},
{
"epoch": 0.07356998344675372,
"grad_norm": 1.3934617241628962,
"learning_rate": 4.993184273232445e-05,
"loss": 1.0174,
"mean_token_accuracy": 0.7140317440032959,
"step": 400
},
{
"epoch": 0.07448960823983815,
"grad_norm": 1.3755761090465115,
"learning_rate": 4.9926416906929954e-05,
"loss": 0.9371,
"mean_token_accuracy": 0.7347567915916443,
"step": 405
},
{
"epoch": 0.07540923303292257,
"grad_norm": 1.3123084901189321,
"learning_rate": 4.9920783657700685e-05,
"loss": 1.0494,
"mean_token_accuracy": 0.7046082258224488,
"step": 410
},
{
"epoch": 0.07632885782600698,
"grad_norm": 1.26236320940822,
"learning_rate": 4.9914943036739075e-05,
"loss": 0.9813,
"mean_token_accuracy": 0.7248732924461365,
"step": 415
},
{
"epoch": 0.07724848261909141,
"grad_norm": 1.4072657383382854,
"learning_rate": 4.99088950980655e-05,
"loss": 1.0041,
"mean_token_accuracy": 0.7161918520927429,
"step": 420
},
{
"epoch": 0.07816810741217584,
"grad_norm": 1.4142932157820918,
"learning_rate": 4.9902639897617876e-05,
"loss": 1.0343,
"mean_token_accuracy": 0.7073235511779785,
"step": 425
},
{
"epoch": 0.07908773220526025,
"grad_norm": 1.2620775477382082,
"learning_rate": 4.9896177493251065e-05,
"loss": 0.9773,
"mean_token_accuracy": 0.724228036403656,
"step": 430
},
{
"epoch": 0.08000735699834467,
"grad_norm": 1.2299977431090294,
"learning_rate": 4.9889507944736405e-05,
"loss": 0.9921,
"mean_token_accuracy": 0.7193984985351562,
"step": 435
},
{
"epoch": 0.0809269817914291,
"grad_norm": 1.272005618491772,
"learning_rate": 4.9882631313761116e-05,
"loss": 1.0266,
"mean_token_accuracy": 0.7106949806213378,
"step": 440
},
{
"epoch": 0.08184660658451352,
"grad_norm": 1.3368998742271194,
"learning_rate": 4.9875547663927744e-05,
"loss": 0.9945,
"mean_token_accuracy": 0.7178430318832397,
"step": 445
},
{
"epoch": 0.08276623137759793,
"grad_norm": 1.2395804635484349,
"learning_rate": 4.986825706075357e-05,
"loss": 0.9614,
"mean_token_accuracy": 0.7270126938819885,
"step": 450
},
{
"epoch": 0.08368585617068236,
"grad_norm": 1.2355105682399337,
"learning_rate": 4.9860759571669987e-05,
"loss": 1.017,
"mean_token_accuracy": 0.7113536357879638,
"step": 455
},
{
"epoch": 0.08460548096376679,
"grad_norm": 1.2769471363849882,
"learning_rate": 4.985305526602192e-05,
"loss": 0.9841,
"mean_token_accuracy": 0.7207873582839965,
"step": 460
},
{
"epoch": 0.08552510575685121,
"grad_norm": 1.3105851965485462,
"learning_rate": 4.984514421506715e-05,
"loss": 1.0238,
"mean_token_accuracy": 0.7113570213317871,
"step": 465
},
{
"epoch": 0.08644473054993562,
"grad_norm": 1.2226583029739935,
"learning_rate": 4.983702649197565e-05,
"loss": 1.0026,
"mean_token_accuracy": 0.7175478458404541,
"step": 470
},
{
"epoch": 0.08736435534302005,
"grad_norm": 1.3032963672614144,
"learning_rate": 4.982870217182893e-05,
"loss": 1.0102,
"mean_token_accuracy": 0.7142111778259277,
"step": 475
},
{
"epoch": 0.08828398013610447,
"grad_norm": 1.276533355049304,
"learning_rate": 4.9820171331619343e-05,
"loss": 1.0175,
"mean_token_accuracy": 0.7140154242515564,
"step": 480
},
{
"epoch": 0.08920360492918888,
"grad_norm": 1.3275369586760475,
"learning_rate": 4.981143405024936e-05,
"loss": 0.9664,
"mean_token_accuracy": 0.7251969814300537,
"step": 485
},
{
"epoch": 0.09012322972227331,
"grad_norm": 1.322475452296982,
"learning_rate": 4.980249040853081e-05,
"loss": 0.9572,
"mean_token_accuracy": 0.7284212589263916,
"step": 490
},
{
"epoch": 0.09104285451535774,
"grad_norm": 1.2219967426964762,
"learning_rate": 4.979334048918422e-05,
"loss": 1.0265,
"mean_token_accuracy": 0.7094637989997864,
"step": 495
},
{
"epoch": 0.09196247930844216,
"grad_norm": 1.2500649142513325,
"learning_rate": 4.978398437683797e-05,
"loss": 0.9429,
"mean_token_accuracy": 0.7309910893440247,
"step": 500
},
{
"epoch": 0.09288210410152657,
"grad_norm": 1.2382649121413325,
"learning_rate": 4.977442215802753e-05,
"loss": 1.0142,
"mean_token_accuracy": 0.7163145303726196,
"step": 505
},
{
"epoch": 0.093801728894611,
"grad_norm": 1.2494735942714719,
"learning_rate": 4.976465392119467e-05,
"loss": 0.9711,
"mean_token_accuracy": 0.7253948450088501,
"step": 510
},
{
"epoch": 0.09472135368769542,
"grad_norm": 1.1320102641208292,
"learning_rate": 4.9754679756686654e-05,
"loss": 0.9754,
"mean_token_accuracy": 0.7240365982055664,
"step": 515
},
{
"epoch": 0.09564097848077985,
"grad_norm": 1.2636397583226155,
"learning_rate": 4.974449975675538e-05,
"loss": 0.9683,
"mean_token_accuracy": 0.7268050789833069,
"step": 520
},
{
"epoch": 0.09656060327386426,
"grad_norm": 1.2638605012202537,
"learning_rate": 4.9734114015556506e-05,
"loss": 0.994,
"mean_token_accuracy": 0.7192271828651429,
"step": 525
},
{
"epoch": 0.09748022806694868,
"grad_norm": 1.3539672940723328,
"learning_rate": 4.972352262914867e-05,
"loss": 1.0219,
"mean_token_accuracy": 0.712011969089508,
"step": 530
},
{
"epoch": 0.09839985286003311,
"grad_norm": 1.2622022574950933,
"learning_rate": 4.971272569549246e-05,
"loss": 0.9993,
"mean_token_accuracy": 0.717021644115448,
"step": 535
},
{
"epoch": 0.09931947765311752,
"grad_norm": 1.2498621609285703,
"learning_rate": 4.970172331444968e-05,
"loss": 0.9869,
"mean_token_accuracy": 0.7201068043708801,
"step": 540
},
{
"epoch": 0.10023910244620195,
"grad_norm": 1.2563183037951813,
"learning_rate": 4.969051558778226e-05,
"loss": 1.0328,
"mean_token_accuracy": 0.7072706580162048,
"step": 545
},
{
"epoch": 0.10115872723928637,
"grad_norm": 1.1583096373701225,
"learning_rate": 4.967910261915142e-05,
"loss": 1.0073,
"mean_token_accuracy": 0.7176116108894348,
"step": 550
},
{
"epoch": 0.1020783520323708,
"grad_norm": 1.2337310449325847,
"learning_rate": 4.966748451411668e-05,
"loss": 1.0075,
"mean_token_accuracy": 0.7166797518730164,
"step": 555
},
{
"epoch": 0.10299797682545521,
"grad_norm": 1.187463601840395,
"learning_rate": 4.9655661380134874e-05,
"loss": 0.9978,
"mean_token_accuracy": 0.7187446594238281,
"step": 560
},
{
"epoch": 0.10391760161853963,
"grad_norm": 1.1950175317081544,
"learning_rate": 4.964363332655918e-05,
"loss": 1.0127,
"mean_token_accuracy": 0.7141183018684387,
"step": 565
},
{
"epoch": 0.10483722641162406,
"grad_norm": 1.1797983108141703,
"learning_rate": 4.9631400464638074e-05,
"loss": 1.0058,
"mean_token_accuracy": 0.7147095799446106,
"step": 570
},
{
"epoch": 0.10575685120470849,
"grad_norm": 1.3194739883489515,
"learning_rate": 4.961896290751434e-05,
"loss": 1.0125,
"mean_token_accuracy": 0.7156966686248779,
"step": 575
},
{
"epoch": 0.1066764759977929,
"grad_norm": 1.232197096442626,
"learning_rate": 4.960632077022402e-05,
"loss": 1.0096,
"mean_token_accuracy": 0.7136348843574524,
"step": 580
},
{
"epoch": 0.10759610079087732,
"grad_norm": 1.1109964489025674,
"learning_rate": 4.959347416969529e-05,
"loss": 0.9782,
"mean_token_accuracy": 0.7218139052391053,
"step": 585
},
{
"epoch": 0.10851572558396175,
"grad_norm": 1.1118328480221105,
"learning_rate": 4.958042322474747e-05,
"loss": 0.9138,
"mean_token_accuracy": 0.7406689524650574,
"step": 590
},
{
"epoch": 0.10943535037704616,
"grad_norm": 1.1550688598895895,
"learning_rate": 4.956716805608984e-05,
"loss": 1.0123,
"mean_token_accuracy": 0.7150320529937744,
"step": 595
},
{
"epoch": 0.11035497517013058,
"grad_norm": 1.2400379075265455,
"learning_rate": 4.955370878632058e-05,
"loss": 0.9642,
"mean_token_accuracy": 0.7274539470672607,
"step": 600
},
{
"epoch": 0.11127459996321501,
"grad_norm": 1.1266451881904362,
"learning_rate": 4.954004553992564e-05,
"loss": 0.9597,
"mean_token_accuracy": 0.7269688129425049,
"step": 605
},
{
"epoch": 0.11219422475629943,
"grad_norm": 1.195410688726218,
"learning_rate": 4.952617844327753e-05,
"loss": 0.9667,
"mean_token_accuracy": 0.7273669600486755,
"step": 610
},
{
"epoch": 0.11311384954938385,
"grad_norm": 1.2168436664941074,
"learning_rate": 4.951210762463421e-05,
"loss": 0.981,
"mean_token_accuracy": 0.7224032163619996,
"step": 615
},
{
"epoch": 0.11403347434246827,
"grad_norm": 1.1158577605300688,
"learning_rate": 4.949783321413787e-05,
"loss": 1.0133,
"mean_token_accuracy": 0.7140767455101014,
"step": 620
},
{
"epoch": 0.1149530991355527,
"grad_norm": 1.2227500677211205,
"learning_rate": 4.948335534381375e-05,
"loss": 1.0178,
"mean_token_accuracy": 0.7107774257659912,
"step": 625
},
{
"epoch": 0.11587272392863712,
"grad_norm": 1.1733820093333545,
"learning_rate": 4.9468674147568906e-05,
"loss": 0.9496,
"mean_token_accuracy": 0.7264823913574219,
"step": 630
},
{
"epoch": 0.11679234872172153,
"grad_norm": 1.1456005644666878,
"learning_rate": 4.945378976119096e-05,
"loss": 1.0301,
"mean_token_accuracy": 0.7111668229103089,
"step": 635
},
{
"epoch": 0.11771197351480596,
"grad_norm": 1.176194033859284,
"learning_rate": 4.943870232234688e-05,
"loss": 0.9904,
"mean_token_accuracy": 0.7183448076248169,
"step": 640
},
{
"epoch": 0.11863159830789038,
"grad_norm": 1.1767555657667275,
"learning_rate": 4.9423411970581656e-05,
"loss": 0.9565,
"mean_token_accuracy": 0.7282203912734986,
"step": 645
},
{
"epoch": 0.1195512231009748,
"grad_norm": 1.1593918150017006,
"learning_rate": 4.940791884731706e-05,
"loss": 0.9629,
"mean_token_accuracy": 0.7265506267547608,
"step": 650
},
{
"epoch": 0.12047084789405922,
"grad_norm": 1.1809244906539653,
"learning_rate": 4.939222309585029e-05,
"loss": 0.9506,
"mean_token_accuracy": 0.7299855709075928,
"step": 655
},
{
"epoch": 0.12139047268714365,
"grad_norm": 1.187342482868558,
"learning_rate": 4.93763248613527e-05,
"loss": 0.9873,
"mean_token_accuracy": 0.7208028793334961,
"step": 660
},
{
"epoch": 0.12231009748022807,
"grad_norm": 1.1643370561641233,
"learning_rate": 4.936022429086841e-05,
"loss": 1.019,
"mean_token_accuracy": 0.7111838817596435,
"step": 665
},
{
"epoch": 0.12322972227331248,
"grad_norm": 1.1548281507110767,
"learning_rate": 4.9343921533312955e-05,
"loss": 0.949,
"mean_token_accuracy": 0.7271883249282837,
"step": 670
},
{
"epoch": 0.12414934706639691,
"grad_norm": 1.1323282418083014,
"learning_rate": 4.9327416739471935e-05,
"loss": 0.9269,
"mean_token_accuracy": 0.737087082862854,
"step": 675
},
{
"epoch": 0.12506897185948132,
"grad_norm": 1.2363897419233494,
"learning_rate": 4.9310710061999575e-05,
"loss": 1.0061,
"mean_token_accuracy": 0.714658522605896,
"step": 680
},
{
"epoch": 0.12598859665256576,
"grad_norm": 1.15808211817011,
"learning_rate": 4.9293801655417366e-05,
"loss": 0.9426,
"mean_token_accuracy": 0.7324698209762573,
"step": 685
},
{
"epoch": 0.12690822144565017,
"grad_norm": 1.168156282468429,
"learning_rate": 4.927669167611259e-05,
"loss": 0.9516,
"mean_token_accuracy": 0.726858627796173,
"step": 690
},
{
"epoch": 0.12782784623873458,
"grad_norm": 1.1708412963628498,
"learning_rate": 4.92593802823369e-05,
"loss": 0.9565,
"mean_token_accuracy": 0.7281310319900512,
"step": 695
},
{
"epoch": 0.12874747103181902,
"grad_norm": 1.150205433303024,
"learning_rate": 4.924186763420486e-05,
"loss": 0.9966,
"mean_token_accuracy": 0.7196317195892334,
"step": 700
},
{
"epoch": 0.12966709582490343,
"grad_norm": 1.1412449351652514,
"learning_rate": 4.922415389369243e-05,
"loss": 0.9393,
"mean_token_accuracy": 0.7308167576789856,
"step": 705
},
{
"epoch": 0.13058672061798787,
"grad_norm": 1.2590368311590696,
"learning_rate": 4.9206239224635486e-05,
"loss": 0.9961,
"mean_token_accuracy": 0.7167337894439697,
"step": 710
},
{
"epoch": 0.13150634541107228,
"grad_norm": 1.1862573902159457,
"learning_rate": 4.9188123792728344e-05,
"loss": 0.9991,
"mean_token_accuracy": 0.71655353307724,
"step": 715
},
{
"epoch": 0.1324259702041567,
"grad_norm": 1.1728642333915622,
"learning_rate": 4.916980776552218e-05,
"loss": 0.9354,
"mean_token_accuracy": 0.734131133556366,
"step": 720
},
{
"epoch": 0.13334559499724113,
"grad_norm": 1.208191683152181,
"learning_rate": 4.915129131242345e-05,
"loss": 0.9578,
"mean_token_accuracy": 0.7278777837753296,
"step": 725
},
{
"epoch": 0.13426521979032555,
"grad_norm": 1.138309077411327,
"learning_rate": 4.913257460469243e-05,
"loss": 0.9448,
"mean_token_accuracy": 0.7303597450256347,
"step": 730
},
{
"epoch": 0.13518484458340996,
"grad_norm": 1.1410024150973699,
"learning_rate": 4.911365781544153e-05,
"loss": 0.9765,
"mean_token_accuracy": 0.7208934783935547,
"step": 735
},
{
"epoch": 0.1361044693764944,
"grad_norm": 1.135207319109893,
"learning_rate": 4.9094541119633756e-05,
"loss": 0.9625,
"mean_token_accuracy": 0.7279266119003296,
"step": 740
},
{
"epoch": 0.1370240941695788,
"grad_norm": 1.1470179542343784,
"learning_rate": 4.907522469408103e-05,
"loss": 1.0099,
"mean_token_accuracy": 0.7129136681556701,
"step": 745
},
{
"epoch": 0.13794371896266322,
"grad_norm": 1.1186516076443083,
"learning_rate": 4.905570871744262e-05,
"loss": 0.9492,
"mean_token_accuracy": 0.7295220971107483,
"step": 750
},
{
"epoch": 0.13886334375574766,
"grad_norm": 1.188235501807293,
"learning_rate": 4.903599337022345e-05,
"loss": 0.9158,
"mean_token_accuracy": 0.7392297148704529,
"step": 755
},
{
"epoch": 0.13978296854883207,
"grad_norm": 1.156585568722138,
"learning_rate": 4.9016078834772436e-05,
"loss": 1.0069,
"mean_token_accuracy": 0.7133058428764343,
"step": 760
},
{
"epoch": 0.1407025933419165,
"grad_norm": 1.0550430464679208,
"learning_rate": 4.899596529528083e-05,
"loss": 0.9804,
"mean_token_accuracy": 0.7237313628196717,
"step": 765
},
{
"epoch": 0.14162221813500092,
"grad_norm": 1.0828080346302627,
"learning_rate": 4.897565293778045e-05,
"loss": 0.9398,
"mean_token_accuracy": 0.7297361016273498,
"step": 770
},
{
"epoch": 0.14254184292808533,
"grad_norm": 1.0748821988518662,
"learning_rate": 4.895514195014201e-05,
"loss": 0.9512,
"mean_token_accuracy": 0.727254593372345,
"step": 775
},
{
"epoch": 0.14346146772116977,
"grad_norm": 1.1000801031665166,
"learning_rate": 4.893443252207339e-05,
"loss": 0.96,
"mean_token_accuracy": 0.7277865290641785,
"step": 780
},
{
"epoch": 0.14438109251425418,
"grad_norm": 1.1979288214254857,
"learning_rate": 4.891352484511783e-05,
"loss": 0.9904,
"mean_token_accuracy": 0.7203876137733459,
"step": 785
},
{
"epoch": 0.1453007173073386,
"grad_norm": 1.0336978471065938,
"learning_rate": 4.889241911265224e-05,
"loss": 0.9512,
"mean_token_accuracy": 0.7298694252967834,
"step": 790
},
{
"epoch": 0.14622034210042303,
"grad_norm": 1.093196247221492,
"learning_rate": 4.887111551988531e-05,
"loss": 1.0404,
"mean_token_accuracy": 0.7045328140258789,
"step": 795
},
{
"epoch": 0.14713996689350745,
"grad_norm": 1.224732532168464,
"learning_rate": 4.884961426385578e-05,
"loss": 1.0189,
"mean_token_accuracy": 0.7101276278495788,
"step": 800
},
{
"epoch": 0.14805959168659186,
"grad_norm": 1.1751595598375444,
"learning_rate": 4.8827915543430604e-05,
"loss": 0.9166,
"mean_token_accuracy": 0.7369141817092896,
"step": 805
},
{
"epoch": 0.1489792164796763,
"grad_norm": 1.0711984590567727,
"learning_rate": 4.880601955930308e-05,
"loss": 0.9528,
"mean_token_accuracy": 0.7275946021080018,
"step": 810
},
{
"epoch": 0.1498988412727607,
"grad_norm": 1.1523849563074238,
"learning_rate": 4.878392651399103e-05,
"loss": 0.9724,
"mean_token_accuracy": 0.72748943567276,
"step": 815
},
{
"epoch": 0.15081846606584515,
"grad_norm": 1.1385592224893888,
"learning_rate": 4.8761636611834906e-05,
"loss": 0.9423,
"mean_token_accuracy": 0.7338582873344421,
"step": 820
},
{
"epoch": 0.15173809085892956,
"grad_norm": 1.171019568482894,
"learning_rate": 4.873915005899591e-05,
"loss": 0.9823,
"mean_token_accuracy": 0.7215001463890076,
"step": 825
},
{
"epoch": 0.15265771565201397,
"grad_norm": 1.1181637038875023,
"learning_rate": 4.871646706345407e-05,
"loss": 0.9696,
"mean_token_accuracy": 0.7244228839874267,
"step": 830
},
{
"epoch": 0.1535773404450984,
"grad_norm": 1.140111709793846,
"learning_rate": 4.869358783500634e-05,
"loss": 0.9691,
"mean_token_accuracy": 0.7219241619110107,
"step": 835
},
{
"epoch": 0.15449696523818282,
"grad_norm": 1.1035668632214553,
"learning_rate": 4.867051258526466e-05,
"loss": 0.9216,
"mean_token_accuracy": 0.7362164258956909,
"step": 840
},
{
"epoch": 0.15541659003126723,
"grad_norm": 1.0632498704772437,
"learning_rate": 4.864724152765396e-05,
"loss": 0.9319,
"mean_token_accuracy": 0.7335481762886047,
"step": 845
},
{
"epoch": 0.15633621482435167,
"grad_norm": 1.1360641167900578,
"learning_rate": 4.8623774877410235e-05,
"loss": 0.998,
"mean_token_accuracy": 0.7165634036064148,
"step": 850
},
{
"epoch": 0.15725583961743608,
"grad_norm": 1.1574648839544697,
"learning_rate": 4.860011285157852e-05,
"loss": 0.9983,
"mean_token_accuracy": 0.7154228448867798,
"step": 855
},
{
"epoch": 0.1581754644105205,
"grad_norm": 1.1103379240939366,
"learning_rate": 4.857625566901091e-05,
"loss": 0.9606,
"mean_token_accuracy": 0.7255040884017945,
"step": 860
},
{
"epoch": 0.15909508920360493,
"grad_norm": 1.3478355454379694,
"learning_rate": 4.85522035503645e-05,
"loss": 0.9643,
"mean_token_accuracy": 0.7249020457267761,
"step": 865
},
{
"epoch": 0.16001471399668935,
"grad_norm": 1.129020628766503,
"learning_rate": 4.852795671809941e-05,
"loss": 0.9341,
"mean_token_accuracy": 0.7329063415527344,
"step": 870
},
{
"epoch": 0.16093433878977378,
"grad_norm": 1.1322677948976352,
"learning_rate": 4.850351539647661e-05,
"loss": 0.9977,
"mean_token_accuracy": 0.7172942876815795,
"step": 875
},
{
"epoch": 0.1618539635828582,
"grad_norm": 1.120014190171844,
"learning_rate": 4.8478879811555986e-05,
"loss": 0.9283,
"mean_token_accuracy": 0.7341889500617981,
"step": 880
},
{
"epoch": 0.1627735883759426,
"grad_norm": 1.1336097713701254,
"learning_rate": 4.845405019119414e-05,
"loss": 1.0008,
"mean_token_accuracy": 0.7151533484458923,
"step": 885
},
{
"epoch": 0.16369321316902705,
"grad_norm": 0.9922793909516228,
"learning_rate": 4.842902676504235e-05,
"loss": 0.9039,
"mean_token_accuracy": 0.7395052313804626,
"step": 890
},
{
"epoch": 0.16461283796211146,
"grad_norm": 1.2309806920357915,
"learning_rate": 4.840380976454441e-05,
"loss": 0.9143,
"mean_token_accuracy": 0.7372842311859131,
"step": 895
},
{
"epoch": 0.16553246275519587,
"grad_norm": 1.058725560363019,
"learning_rate": 4.837839942293449e-05,
"loss": 1.0122,
"mean_token_accuracy": 0.7113693952560425,
"step": 900
},
{
"epoch": 0.1664520875482803,
"grad_norm": 1.1050666066281727,
"learning_rate": 4.835279597523501e-05,
"loss": 0.9691,
"mean_token_accuracy": 0.7241552948951722,
"step": 905
},
{
"epoch": 0.16737171234136472,
"grad_norm": 1.1281645078253164,
"learning_rate": 4.832699965825443e-05,
"loss": 0.9783,
"mean_token_accuracy": 0.7210159540176392,
"step": 910
},
{
"epoch": 0.16829133713444913,
"grad_norm": 1.1049918709083206,
"learning_rate": 4.830101071058506e-05,
"loss": 0.9529,
"mean_token_accuracy": 0.726420772075653,
"step": 915
},
{
"epoch": 0.16921096192753357,
"grad_norm": 1.1589903082257091,
"learning_rate": 4.82748293726009e-05,
"loss": 1.0162,
"mean_token_accuracy": 0.7134600043296814,
"step": 920
},
{
"epoch": 0.17013058672061798,
"grad_norm": 1.0648743038360364,
"learning_rate": 4.824845588645538e-05,
"loss": 0.931,
"mean_token_accuracy": 0.7355116486549378,
"step": 925
},
{
"epoch": 0.17105021151370242,
"grad_norm": 1.0563630156850699,
"learning_rate": 4.822189049607909e-05,
"loss": 0.9303,
"mean_token_accuracy": 0.7332427501678467,
"step": 930
},
{
"epoch": 0.17196983630678683,
"grad_norm": 1.0946637430016075,
"learning_rate": 4.819513344717759e-05,
"loss": 0.9805,
"mean_token_accuracy": 0.7218296766281128,
"step": 935
},
{
"epoch": 0.17288946109987124,
"grad_norm": 1.218450386345206,
"learning_rate": 4.8168184987229104e-05,
"loss": 1.0025,
"mean_token_accuracy": 0.7138312220573425,
"step": 940
},
{
"epoch": 0.17380908589295568,
"grad_norm": 1.1265660437743932,
"learning_rate": 4.814104536548222e-05,
"loss": 0.9901,
"mean_token_accuracy": 0.7183592796325684,
"step": 945
},
{
"epoch": 0.1747287106860401,
"grad_norm": 1.1519197604777511,
"learning_rate": 4.811371483295361e-05,
"loss": 0.9677,
"mean_token_accuracy": 0.723106038570404,
"step": 950
},
{
"epoch": 0.1756483354791245,
"grad_norm": 1.0668603888469903,
"learning_rate": 4.808619364242569e-05,
"loss": 0.9428,
"mean_token_accuracy": 0.7298098564147949,
"step": 955
},
{
"epoch": 0.17656796027220895,
"grad_norm": 1.0617094358031158,
"learning_rate": 4.805848204844427e-05,
"loss": 0.9794,
"mean_token_accuracy": 0.7198897957801819,
"step": 960
},
{
"epoch": 0.17748758506529336,
"grad_norm": 1.1638181916029056,
"learning_rate": 4.803058030731627e-05,
"loss": 1.0356,
"mean_token_accuracy": 0.7055891275405883,
"step": 965
},
{
"epoch": 0.17840720985837777,
"grad_norm": 1.0804274338945197,
"learning_rate": 4.800248867710724e-05,
"loss": 0.9551,
"mean_token_accuracy": 0.7267025232315063,
"step": 970
},
{
"epoch": 0.1793268346514622,
"grad_norm": 1.1002302515677742,
"learning_rate": 4.797420741763906e-05,
"loss": 0.9513,
"mean_token_accuracy": 0.727520763874054,
"step": 975
},
{
"epoch": 0.18024645944454662,
"grad_norm": 1.0807257658531308,
"learning_rate": 4.794573679048751e-05,
"loss": 0.9667,
"mean_token_accuracy": 0.7254797458648682,
"step": 980
},
{
"epoch": 0.18116608423763106,
"grad_norm": 1.1423934429361384,
"learning_rate": 4.791707705897982e-05,
"loss": 0.9289,
"mean_token_accuracy": 0.7316087126731873,
"step": 985
},
{
"epoch": 0.18208570903071547,
"grad_norm": 1.0732201976252709,
"learning_rate": 4.7888228488192294e-05,
"loss": 0.9826,
"mean_token_accuracy": 0.7205982804298401,
"step": 990
},
{
"epoch": 0.18300533382379988,
"grad_norm": 1.0026696776201605,
"learning_rate": 4.7859191344947804e-05,
"loss": 0.9289,
"mean_token_accuracy": 0.7336562752723694,
"step": 995
},
{
"epoch": 0.18392495861688432,
"grad_norm": 1.138379913644609,
"learning_rate": 4.782996589781337e-05,
"loss": 0.9497,
"mean_token_accuracy": 0.729135024547577,
"step": 1000
},
{
"epoch": 0.18484458340996873,
"grad_norm": 1.107580666472087,
"learning_rate": 4.780055241709762e-05,
"loss": 0.9048,
"mean_token_accuracy": 0.7381602048873901,
"step": 1005
},
{
"epoch": 0.18576420820305314,
"grad_norm": 1.0667620674465943,
"learning_rate": 4.7770951174848335e-05,
"loss": 0.9742,
"mean_token_accuracy": 0.7205707669258118,
"step": 1010
},
{
"epoch": 0.18668383299613758,
"grad_norm": 1.0940019385189808,
"learning_rate": 4.774116244484993e-05,
"loss": 0.9857,
"mean_token_accuracy": 0.718968415260315,
"step": 1015
},
{
"epoch": 0.187603457789222,
"grad_norm": 1.0279044112611866,
"learning_rate": 4.7711186502620894e-05,
"loss": 1.0084,
"mean_token_accuracy": 0.7144084692001342,
"step": 1020
},
{
"epoch": 0.1885230825823064,
"grad_norm": 1.0751882464256728,
"learning_rate": 4.768102362541126e-05,
"loss": 0.9353,
"mean_token_accuracy": 0.7318849921226501,
"step": 1025
},
{
"epoch": 0.18944270737539085,
"grad_norm": 1.1701748750390102,
"learning_rate": 4.765067409220004e-05,
"loss": 0.957,
"mean_token_accuracy": 0.7275319814682006,
"step": 1030
},
{
"epoch": 0.19036233216847526,
"grad_norm": 1.0512353267451773,
"learning_rate": 4.762013818369266e-05,
"loss": 0.9367,
"mean_token_accuracy": 0.7317106485366821,
"step": 1035
},
{
"epoch": 0.1912819569615597,
"grad_norm": 1.1085851412035923,
"learning_rate": 4.7589416182318305e-05,
"loss": 0.9416,
"mean_token_accuracy": 0.7324359536170959,
"step": 1040
},
{
"epoch": 0.1922015817546441,
"grad_norm": 1.094731274119514,
"learning_rate": 4.755850837222739e-05,
"loss": 0.9474,
"mean_token_accuracy": 0.7309187650680542,
"step": 1045
},
{
"epoch": 0.19312120654772852,
"grad_norm": 1.0610610405848808,
"learning_rate": 4.7527415039288874e-05,
"loss": 0.9638,
"mean_token_accuracy": 0.7251871824264526,
"step": 1050
},
{
"epoch": 0.19404083134081296,
"grad_norm": 1.0919916417692772,
"learning_rate": 4.749613647108764e-05,
"loss": 1.0008,
"mean_token_accuracy": 0.7152180433273315,
"step": 1055
},
{
"epoch": 0.19496045613389737,
"grad_norm": 1.0847298297852,
"learning_rate": 4.7464672956921814e-05,
"loss": 0.9366,
"mean_token_accuracy": 0.7313546657562255,
"step": 1060
},
{
"epoch": 0.19588008092698178,
"grad_norm": 1.0912787695821449,
"learning_rate": 4.743302478780011e-05,
"loss": 0.945,
"mean_token_accuracy": 0.728658664226532,
"step": 1065
},
{
"epoch": 0.19679970572006622,
"grad_norm": 1.052195400658314,
"learning_rate": 4.7401192256439144e-05,
"loss": 0.9793,
"mean_token_accuracy": 0.7213846921920777,
"step": 1070
},
{
"epoch": 0.19771933051315063,
"grad_norm": 1.1107870405998106,
"learning_rate": 4.736917565726069e-05,
"loss": 0.9313,
"mean_token_accuracy": 0.735443937778473,
"step": 1075
},
{
"epoch": 0.19863895530623504,
"grad_norm": 1.1399365300090571,
"learning_rate": 4.7336975286389e-05,
"loss": 0.9717,
"mean_token_accuracy": 0.7237229943275452,
"step": 1080
},
{
"epoch": 0.19955858009931948,
"grad_norm": 1.0983682734144682,
"learning_rate": 4.730459144164802e-05,
"loss": 0.9306,
"mean_token_accuracy": 0.733622133731842,
"step": 1085
},
{
"epoch": 0.2004782048924039,
"grad_norm": 1.1053704101564246,
"learning_rate": 4.727202442255871e-05,
"loss": 0.9936,
"mean_token_accuracy": 0.718384611606598,
"step": 1090
},
{
"epoch": 0.20139782968548833,
"grad_norm": 1.0858488860538602,
"learning_rate": 4.723927453033619e-05,
"loss": 0.9548,
"mean_token_accuracy": 0.7286873102188111,
"step": 1095
},
{
"epoch": 0.20231745447857274,
"grad_norm": 1.0232898856111519,
"learning_rate": 4.720634206788697e-05,
"loss": 0.9804,
"mean_token_accuracy": 0.7218252301216126,
"step": 1100
},
{
"epoch": 0.20323707927165716,
"grad_norm": 1.1548447631409977,
"learning_rate": 4.717322733980622e-05,
"loss": 0.931,
"mean_token_accuracy": 0.7311301946640014,
"step": 1105
},
{
"epoch": 0.2041567040647416,
"grad_norm": 1.1168183831474872,
"learning_rate": 4.713993065237486e-05,
"loss": 0.9718,
"mean_token_accuracy": 0.7235833764076233,
"step": 1110
},
{
"epoch": 0.205076328857826,
"grad_norm": 1.1111836320920656,
"learning_rate": 4.710645231355678e-05,
"loss": 0.9855,
"mean_token_accuracy": 0.7195135593414307,
"step": 1115
},
{
"epoch": 0.20599595365091042,
"grad_norm": 1.0024638729648838,
"learning_rate": 4.707279263299598e-05,
"loss": 0.9729,
"mean_token_accuracy": 0.7219846963882446,
"step": 1120
},
{
"epoch": 0.20691557844399486,
"grad_norm": 1.0121762272601764,
"learning_rate": 4.703895192201372e-05,
"loss": 0.9459,
"mean_token_accuracy": 0.7269375443458557,
"step": 1125
},
{
"epoch": 0.20783520323707927,
"grad_norm": 1.0470465876428376,
"learning_rate": 4.7004930493605573e-05,
"loss": 1.0105,
"mean_token_accuracy": 0.7086774349212647,
"step": 1130
},
{
"epoch": 0.20875482803016368,
"grad_norm": 1.0632837126367782,
"learning_rate": 4.697072866243866e-05,
"loss": 0.9412,
"mean_token_accuracy": 0.7307331085205078,
"step": 1135
},
{
"epoch": 0.20967445282324812,
"grad_norm": 1.0768863946202714,
"learning_rate": 4.69363467448486e-05,
"loss": 0.9674,
"mean_token_accuracy": 0.7221316814422607,
"step": 1140
},
{
"epoch": 0.21059407761633253,
"grad_norm": 1.1181930167961487,
"learning_rate": 4.6901785058836675e-05,
"loss": 0.955,
"mean_token_accuracy": 0.725222361087799,
"step": 1145
},
{
"epoch": 0.21151370240941697,
"grad_norm": 1.0688002319746086,
"learning_rate": 4.686704392406685e-05,
"loss": 0.9687,
"mean_token_accuracy": 0.7218108892440795,
"step": 1150
},
{
"epoch": 0.21243332720250138,
"grad_norm": 1.1052965038670703,
"learning_rate": 4.6832123661862835e-05,
"loss": 0.9516,
"mean_token_accuracy": 0.7287932515144349,
"step": 1155
},
{
"epoch": 0.2133529519955858,
"grad_norm": 1.0349887525202925,
"learning_rate": 4.6797024595205104e-05,
"loss": 0.9599,
"mean_token_accuracy": 0.7228366494178772,
"step": 1160
},
{
"epoch": 0.21427257678867023,
"grad_norm": 1.052123043795087,
"learning_rate": 4.6761747048727907e-05,
"loss": 0.9833,
"mean_token_accuracy": 0.714729118347168,
"step": 1165
},
{
"epoch": 0.21519220158175464,
"grad_norm": 1.0646750046566955,
"learning_rate": 4.672629134871625e-05,
"loss": 0.98,
"mean_token_accuracy": 0.7194055676460266,
"step": 1170
},
{
"epoch": 0.21611182637483906,
"grad_norm": 1.072675922430035,
"learning_rate": 4.669065782310294e-05,
"loss": 0.9661,
"mean_token_accuracy": 0.7228956103324891,
"step": 1175
},
{
"epoch": 0.2170314511679235,
"grad_norm": 1.0475965649186345,
"learning_rate": 4.665484680146546e-05,
"loss": 0.9168,
"mean_token_accuracy": 0.7354954957962037,
"step": 1180
},
{
"epoch": 0.2179510759610079,
"grad_norm": 1.0183550500547607,
"learning_rate": 4.6618858615023e-05,
"loss": 0.9268,
"mean_token_accuracy": 0.731166672706604,
"step": 1185
},
{
"epoch": 0.21887070075409232,
"grad_norm": 1.0894438583208028,
"learning_rate": 4.658269359663336e-05,
"loss": 0.9134,
"mean_token_accuracy": 0.7400953650474549,
"step": 1190
},
{
"epoch": 0.21979032554717676,
"grad_norm": 0.9962620966267176,
"learning_rate": 4.6546352080789854e-05,
"loss": 0.9472,
"mean_token_accuracy": 0.7283522963523865,
"step": 1195
},
{
"epoch": 0.22070995034026117,
"grad_norm": 1.0767144498287804,
"learning_rate": 4.650983440361825e-05,
"loss": 0.9798,
"mean_token_accuracy": 0.7208079814910888,
"step": 1200
},
{
"epoch": 0.2216295751333456,
"grad_norm": 1.0451151540293229,
"learning_rate": 4.6473140902873666e-05,
"loss": 0.9735,
"mean_token_accuracy": 0.7223762154579163,
"step": 1205
},
{
"epoch": 0.22254919992643002,
"grad_norm": 0.9904423090265289,
"learning_rate": 4.643627191793737e-05,
"loss": 0.9416,
"mean_token_accuracy": 0.7333443641662598,
"step": 1210
},
{
"epoch": 0.22346882471951443,
"grad_norm": 1.0324822073086444,
"learning_rate": 4.639922778981377e-05,
"loss": 0.9096,
"mean_token_accuracy": 0.7366245865821839,
"step": 1215
},
{
"epoch": 0.22438844951259887,
"grad_norm": 1.00961392870682,
"learning_rate": 4.636200886112714e-05,
"loss": 0.9647,
"mean_token_accuracy": 0.7272518515586853,
"step": 1220
},
{
"epoch": 0.22530807430568328,
"grad_norm": 1.041598639678359,
"learning_rate": 4.63246154761185e-05,
"loss": 0.982,
"mean_token_accuracy": 0.7185810923576355,
"step": 1225
},
{
"epoch": 0.2262276990987677,
"grad_norm": 1.0574278162856792,
"learning_rate": 4.628704798064247e-05,
"loss": 0.9442,
"mean_token_accuracy": 0.7297179222106933,
"step": 1230
},
{
"epoch": 0.22714732389185213,
"grad_norm": 1.060076765820854,
"learning_rate": 4.624930672216399e-05,
"loss": 0.9614,
"mean_token_accuracy": 0.7244118571281433,
"step": 1235
},
{
"epoch": 0.22806694868493654,
"grad_norm": 1.0123003105589568,
"learning_rate": 4.621139204975516e-05,
"loss": 0.9169,
"mean_token_accuracy": 0.7362489700317383,
"step": 1240
},
{
"epoch": 0.22898657347802095,
"grad_norm": 1.1490153575204947,
"learning_rate": 4.617330431409201e-05,
"loss": 0.9929,
"mean_token_accuracy": 0.7166203141212464,
"step": 1245
},
{
"epoch": 0.2299061982711054,
"grad_norm": 1.0270625785191527,
"learning_rate": 4.6135043867451255e-05,
"loss": 0.9325,
"mean_token_accuracy": 0.7311270833015442,
"step": 1250
},
{
"epoch": 0.2308258230641898,
"grad_norm": 1.030694744170465,
"learning_rate": 4.609661106370701e-05,
"loss": 0.9228,
"mean_token_accuracy": 0.7355565190315246,
"step": 1255
},
{
"epoch": 0.23174544785727424,
"grad_norm": 1.0190672056189127,
"learning_rate": 4.605800625832753e-05,
"loss": 0.9577,
"mean_token_accuracy": 0.7273682594299317,
"step": 1260
},
{
"epoch": 0.23266507265035866,
"grad_norm": 1.025832787786935,
"learning_rate": 4.6019229808371945e-05,
"loss": 0.9291,
"mean_token_accuracy": 0.7325186491012573,
"step": 1265
},
{
"epoch": 0.23358469744344307,
"grad_norm": 1.0254402284447273,
"learning_rate": 4.598028207248693e-05,
"loss": 0.9681,
"mean_token_accuracy": 0.7215327501296998,
"step": 1270
},
{
"epoch": 0.2345043222365275,
"grad_norm": 1.043519079594266,
"learning_rate": 4.5941163410903406e-05,
"loss": 0.9565,
"mean_token_accuracy": 0.7248036026954651,
"step": 1275
},
{
"epoch": 0.23542394702961192,
"grad_norm": 0.9811685630848649,
"learning_rate": 4.590187418543321e-05,
"loss": 0.9204,
"mean_token_accuracy": 0.7338666915893555,
"step": 1280
},
{
"epoch": 0.23634357182269633,
"grad_norm": 1.0355767679745649,
"learning_rate": 4.586241475946571e-05,
"loss": 0.9824,
"mean_token_accuracy": 0.7212961316108704,
"step": 1285
},
{
"epoch": 0.23726319661578077,
"grad_norm": 0.9995187864598916,
"learning_rate": 4.582278549796448e-05,
"loss": 0.914,
"mean_token_accuracy": 0.7355898737907409,
"step": 1290
},
{
"epoch": 0.23818282140886518,
"grad_norm": 1.0163621938165361,
"learning_rate": 4.5782986767463946e-05,
"loss": 0.9614,
"mean_token_accuracy": 0.7241615772247314,
"step": 1295
},
{
"epoch": 0.2391024462019496,
"grad_norm": 1.0913821743861445,
"learning_rate": 4.574301893606594e-05,
"loss": 0.8839,
"mean_token_accuracy": 0.7434832811355591,
"step": 1300
},
{
"epoch": 0.24002207099503403,
"grad_norm": 1.0399223484753735,
"learning_rate": 4.570288237343632e-05,
"loss": 0.9104,
"mean_token_accuracy": 0.7378169417381286,
"step": 1305
},
{
"epoch": 0.24094169578811844,
"grad_norm": 1.011671028641558,
"learning_rate": 4.5662577450801576e-05,
"loss": 0.9595,
"mean_token_accuracy": 0.7230379819869995,
"step": 1310
},
{
"epoch": 0.24186132058120288,
"grad_norm": 1.008990928095214,
"learning_rate": 4.562210454094535e-05,
"loss": 0.9363,
"mean_token_accuracy": 0.7295035600662232,
"step": 1315
},
{
"epoch": 0.2427809453742873,
"grad_norm": 1.059357744292348,
"learning_rate": 4.558146401820502e-05,
"loss": 0.9569,
"mean_token_accuracy": 0.7264422059059144,
"step": 1320
},
{
"epoch": 0.2437005701673717,
"grad_norm": 1.0224904321964083,
"learning_rate": 4.554065625846825e-05,
"loss": 0.9838,
"mean_token_accuracy": 0.7178040146827698,
"step": 1325
},
{
"epoch": 0.24462019496045614,
"grad_norm": 1.0737296876090594,
"learning_rate": 4.549968163916946e-05,
"loss": 0.976,
"mean_token_accuracy": 0.7180652141571044,
"step": 1330
},
{
"epoch": 0.24553981975354056,
"grad_norm": 1.0129242243093401,
"learning_rate": 4.545854053928639e-05,
"loss": 0.9394,
"mean_token_accuracy": 0.7314478039741517,
"step": 1335
},
{
"epoch": 0.24645944454662497,
"grad_norm": 0.9860304727584566,
"learning_rate": 4.541723333933657e-05,
"loss": 0.9595,
"mean_token_accuracy": 0.7271197676658631,
"step": 1340
},
{
"epoch": 0.2473790693397094,
"grad_norm": 1.0235437508308431,
"learning_rate": 4.5375760421373796e-05,
"loss": 0.9888,
"mean_token_accuracy": 0.7178149104118348,
"step": 1345
},
{
"epoch": 0.24829869413279382,
"grad_norm": 1.076473129213084,
"learning_rate": 4.533412216898461e-05,
"loss": 0.9374,
"mean_token_accuracy": 0.7287054538726807,
"step": 1350
},
{
"epoch": 0.24921831892587823,
"grad_norm": 1.027000741915809,
"learning_rate": 4.529231896728474e-05,
"loss": 0.9098,
"mean_token_accuracy": 0.7352772355079651,
"step": 1355
},
{
"epoch": 0.25013794371896264,
"grad_norm": 1.0980991489181584,
"learning_rate": 4.525035120291557e-05,
"loss": 0.9613,
"mean_token_accuracy": 0.7250553727149963,
"step": 1360
},
{
"epoch": 0.2510575685120471,
"grad_norm": 1.0105378261394609,
"learning_rate": 4.520821926404049e-05,
"loss": 0.9232,
"mean_token_accuracy": 0.7339854836463928,
"step": 1365
},
{
"epoch": 0.2519771933051315,
"grad_norm": 1.0465671126237865,
"learning_rate": 4.516592354034138e-05,
"loss": 0.9578,
"mean_token_accuracy": 0.7243474960327149,
"step": 1370
},
{
"epoch": 0.2528968180982159,
"grad_norm": 1.0721948067984564,
"learning_rate": 4.512346442301501e-05,
"loss": 0.9305,
"mean_token_accuracy": 0.7290533304214477,
"step": 1375
},
{
"epoch": 0.25381644289130034,
"grad_norm": 1.083352961545848,
"learning_rate": 4.5080842304769345e-05,
"loss": 0.9338,
"mean_token_accuracy": 0.733627998828888,
"step": 1380
},
{
"epoch": 0.2547360676843848,
"grad_norm": 0.979913773136715,
"learning_rate": 4.503805757981997e-05,
"loss": 0.9012,
"mean_token_accuracy": 0.7409675002098084,
"step": 1385
},
{
"epoch": 0.25565569247746917,
"grad_norm": 1.1174510417210128,
"learning_rate": 4.499511064388645e-05,
"loss": 0.8754,
"mean_token_accuracy": 0.7447872519493103,
"step": 1390
},
{
"epoch": 0.2565753172705536,
"grad_norm": 1.0562227070300527,
"learning_rate": 4.495200189418864e-05,
"loss": 0.9505,
"mean_token_accuracy": 0.7265227913856507,
"step": 1395
},
{
"epoch": 0.25749494206363804,
"grad_norm": 1.0550543313489833,
"learning_rate": 4.490873172944303e-05,
"loss": 0.9096,
"mean_token_accuracy": 0.7342225193977356,
"step": 1400
},
{
"epoch": 0.2584145668567225,
"grad_norm": 1.0844914008772555,
"learning_rate": 4.486530054985905e-05,
"loss": 0.9643,
"mean_token_accuracy": 0.7227702975273133,
"step": 1405
},
{
"epoch": 0.25933419164980687,
"grad_norm": 1.11030675175993,
"learning_rate": 4.482170875713536e-05,
"loss": 0.98,
"mean_token_accuracy": 0.7210663437843323,
"step": 1410
},
{
"epoch": 0.2602538164428913,
"grad_norm": 1.0678730599548856,
"learning_rate": 4.477795675445616e-05,
"loss": 0.9248,
"mean_token_accuracy": 0.7327564835548401,
"step": 1415
},
{
"epoch": 0.26117344123597575,
"grad_norm": 0.9866628204231362,
"learning_rate": 4.473404494648744e-05,
"loss": 0.9216,
"mean_token_accuracy": 0.7343960881233216,
"step": 1420
},
{
"epoch": 0.26209306602906013,
"grad_norm": 0.9895263110250994,
"learning_rate": 4.4689973739373244e-05,
"loss": 0.9123,
"mean_token_accuracy": 0.7354090452194214,
"step": 1425
},
{
"epoch": 0.26301269082214457,
"grad_norm": 0.9560958289104061,
"learning_rate": 4.46457435407319e-05,
"loss": 0.9494,
"mean_token_accuracy": 0.725600802898407,
"step": 1430
},
{
"epoch": 0.263932315615229,
"grad_norm": 1.0418751893863187,
"learning_rate": 4.460135475965227e-05,
"loss": 0.887,
"mean_token_accuracy": 0.744392192363739,
"step": 1435
},
{
"epoch": 0.2648519404083134,
"grad_norm": 1.0270767884123133,
"learning_rate": 4.455680780668997e-05,
"loss": 0.98,
"mean_token_accuracy": 0.717594051361084,
"step": 1440
},
{
"epoch": 0.26577156520139783,
"grad_norm": 1.0194372684867639,
"learning_rate": 4.4512103093863555e-05,
"loss": 0.9145,
"mean_token_accuracy": 0.7369788885116577,
"step": 1445
},
{
"epoch": 0.26669118999448227,
"grad_norm": 1.0981284825838393,
"learning_rate": 4.44672410346507e-05,
"loss": 0.9519,
"mean_token_accuracy": 0.7260895729064941,
"step": 1450
},
{
"epoch": 0.26761081478756665,
"grad_norm": 1.0207625075556366,
"learning_rate": 4.442222204398441e-05,
"loss": 0.9555,
"mean_token_accuracy": 0.7227967500686645,
"step": 1455
},
{
"epoch": 0.2685304395806511,
"grad_norm": 0.98393868791661,
"learning_rate": 4.437704653824915e-05,
"loss": 0.8831,
"mean_token_accuracy": 0.7438354253768921,
"step": 1460
},
{
"epoch": 0.26945006437373553,
"grad_norm": 0.9817630950075087,
"learning_rate": 4.433171493527701e-05,
"loss": 0.9404,
"mean_token_accuracy": 0.728731095790863,
"step": 1465
},
{
"epoch": 0.2703696891668199,
"grad_norm": 1.0298652072064594,
"learning_rate": 4.428622765434383e-05,
"loss": 0.9136,
"mean_token_accuracy": 0.7356218695640564,
"step": 1470
},
{
"epoch": 0.27128931395990435,
"grad_norm": 0.981553092264934,
"learning_rate": 4.4240585116165334e-05,
"loss": 0.8555,
"mean_token_accuracy": 0.753374171257019,
"step": 1475
},
{
"epoch": 0.2722089387529888,
"grad_norm": 1.172918257192198,
"learning_rate": 4.419478774289325e-05,
"loss": 0.998,
"mean_token_accuracy": 0.713919198513031,
"step": 1480
},
{
"epoch": 0.2731285635460732,
"grad_norm": 1.003409782978005,
"learning_rate": 4.414883595811136e-05,
"loss": 0.8782,
"mean_token_accuracy": 0.7452871680259705,
"step": 1485
},
{
"epoch": 0.2740481883391576,
"grad_norm": 1.0316918646250515,
"learning_rate": 4.410273018683163e-05,
"loss": 0.9242,
"mean_token_accuracy": 0.7311699628829956,
"step": 1490
},
{
"epoch": 0.27496781313224206,
"grad_norm": 0.978003437149563,
"learning_rate": 4.405647085549025e-05,
"loss": 0.9241,
"mean_token_accuracy": 0.7328976273536683,
"step": 1495
},
{
"epoch": 0.27588743792532644,
"grad_norm": 1.0070406181231344,
"learning_rate": 4.40100583919437e-05,
"loss": 0.9001,
"mean_token_accuracy": 0.7395057559013367,
"step": 1500
},
{
"epoch": 0.2768070627184109,
"grad_norm": 0.9873878935159346,
"learning_rate": 4.3963493225464817e-05,
"loss": 0.9258,
"mean_token_accuracy": 0.7336387634277344,
"step": 1505
},
{
"epoch": 0.2777266875114953,
"grad_norm": 0.9521695030248521,
"learning_rate": 4.3916775786738754e-05,
"loss": 0.914,
"mean_token_accuracy": 0.7378314137458801,
"step": 1510
},
{
"epoch": 0.27864631230457976,
"grad_norm": 0.9502896850196428,
"learning_rate": 4.3869906507859096e-05,
"loss": 0.8987,
"mean_token_accuracy": 0.7417943596839904,
"step": 1515
},
{
"epoch": 0.27956593709766414,
"grad_norm": 0.991426828614557,
"learning_rate": 4.382288582232376e-05,
"loss": 0.9106,
"mean_token_accuracy": 0.7390964746475219,
"step": 1520
},
{
"epoch": 0.2804855618907486,
"grad_norm": 1.0581857743606324,
"learning_rate": 4.377571416503108e-05,
"loss": 0.9179,
"mean_token_accuracy": 0.7379998922348022,
"step": 1525
},
{
"epoch": 0.281405186683833,
"grad_norm": 0.9872377385823925,
"learning_rate": 4.372839197227571e-05,
"loss": 0.8848,
"mean_token_accuracy": 0.7446985721588135,
"step": 1530
},
{
"epoch": 0.2823248114769174,
"grad_norm": 1.0976151495403408,
"learning_rate": 4.368091968174463e-05,
"loss": 0.9632,
"mean_token_accuracy": 0.723613953590393,
"step": 1535
},
{
"epoch": 0.28324443627000184,
"grad_norm": 1.013680671037777,
"learning_rate": 4.363329773251309e-05,
"loss": 0.866,
"mean_token_accuracy": 0.750942587852478,
"step": 1540
},
{
"epoch": 0.2841640610630863,
"grad_norm": 1.1182733077200029,
"learning_rate": 4.3585526565040543e-05,
"loss": 0.9995,
"mean_token_accuracy": 0.7137303233146668,
"step": 1545
},
{
"epoch": 0.28508368585617067,
"grad_norm": 0.9779737007515391,
"learning_rate": 4.353760662116658e-05,
"loss": 0.9369,
"mean_token_accuracy": 0.7336580872535705,
"step": 1550
},
{
"epoch": 0.2860033106492551,
"grad_norm": 1.0260468281394197,
"learning_rate": 4.348953834410683e-05,
"loss": 0.9678,
"mean_token_accuracy": 0.7206373929977417,
"step": 1555
},
{
"epoch": 0.28692293544233954,
"grad_norm": 1.0263096637333005,
"learning_rate": 4.3441322178448856e-05,
"loss": 0.9572,
"mean_token_accuracy": 0.7260561943054199,
"step": 1560
},
{
"epoch": 0.2878425602354239,
"grad_norm": 0.9619383230028783,
"learning_rate": 4.339295857014809e-05,
"loss": 0.9501,
"mean_token_accuracy": 0.7264659523963928,
"step": 1565
},
{
"epoch": 0.28876218502850837,
"grad_norm": 0.9946060524217067,
"learning_rate": 4.3344447966523634e-05,
"loss": 0.9887,
"mean_token_accuracy": 0.7160560727119446,
"step": 1570
},
{
"epoch": 0.2896818098215928,
"grad_norm": 1.0275376139203307,
"learning_rate": 4.3295790816254195e-05,
"loss": 0.9262,
"mean_token_accuracy": 0.734666109085083,
"step": 1575
},
{
"epoch": 0.2906014346146772,
"grad_norm": 1.1276042923218728,
"learning_rate": 4.324698756937388e-05,
"loss": 0.9378,
"mean_token_accuracy": 0.7300173878669739,
"step": 1580
},
{
"epoch": 0.29152105940776163,
"grad_norm": 0.9552400868458645,
"learning_rate": 4.319803867726807e-05,
"loss": 0.8879,
"mean_token_accuracy": 0.7425481796264648,
"step": 1585
},
{
"epoch": 0.29244068420084607,
"grad_norm": 0.9486514468425481,
"learning_rate": 4.3148944592669234e-05,
"loss": 0.9613,
"mean_token_accuracy": 0.7219538450241089,
"step": 1590
},
{
"epoch": 0.29336030899393045,
"grad_norm": 0.9567962674802902,
"learning_rate": 4.30997057696527e-05,
"loss": 0.8741,
"mean_token_accuracy": 0.7477473855018616,
"step": 1595
},
{
"epoch": 0.2942799337870149,
"grad_norm": 0.9667609260469084,
"learning_rate": 4.3050322663632564e-05,
"loss": 0.9568,
"mean_token_accuracy": 0.7255883097648621,
"step": 1600
},
{
"epoch": 0.29519955858009933,
"grad_norm": 0.9920073647296315,
"learning_rate": 4.3000795731357333e-05,
"loss": 0.9237,
"mean_token_accuracy": 0.7383288621902466,
"step": 1605
},
{
"epoch": 0.2961191833731837,
"grad_norm": 1.0604465170326072,
"learning_rate": 4.295112543090584e-05,
"loss": 0.9609,
"mean_token_accuracy": 0.7225096940994262,
"step": 1610
},
{
"epoch": 0.29703880816626815,
"grad_norm": 1.0688037490276023,
"learning_rate": 4.290131222168289e-05,
"loss": 1.0008,
"mean_token_accuracy": 0.7138909697532654,
"step": 1615
},
{
"epoch": 0.2979584329593526,
"grad_norm": 1.143629206489082,
"learning_rate": 4.2851356564415086e-05,
"loss": 0.9867,
"mean_token_accuracy": 0.7165561437606811,
"step": 1620
},
{
"epoch": 0.29887805775243703,
"grad_norm": 1.0438745750713756,
"learning_rate": 4.280125892114656e-05,
"loss": 0.9434,
"mean_token_accuracy": 0.7298865675926208,
"step": 1625
},
{
"epoch": 0.2997976825455214,
"grad_norm": 1.0251559106803514,
"learning_rate": 4.2751019755234664e-05,
"loss": 0.935,
"mean_token_accuracy": 0.7299148678779602,
"step": 1630
},
{
"epoch": 0.30071730733860585,
"grad_norm": 0.9900961445552091,
"learning_rate": 4.27006395313457e-05,
"loss": 0.9963,
"mean_token_accuracy": 0.7131295561790466,
"step": 1635
},
{
"epoch": 0.3016369321316903,
"grad_norm": 1.040210108998438,
"learning_rate": 4.265011871545066e-05,
"loss": 0.9412,
"mean_token_accuracy": 0.7279941439628601,
"step": 1640
},
{
"epoch": 0.3025565569247747,
"grad_norm": 1.0262950854145634,
"learning_rate": 4.259945777482085e-05,
"loss": 0.9239,
"mean_token_accuracy": 0.7327239632606506,
"step": 1645
},
{
"epoch": 0.3034761817178591,
"grad_norm": 0.9969469234100081,
"learning_rate": 4.25486571780236e-05,
"loss": 0.9462,
"mean_token_accuracy": 0.7269651889801025,
"step": 1650
},
{
"epoch": 0.30439580651094356,
"grad_norm": 1.0021703198417462,
"learning_rate": 4.249771739491795e-05,
"loss": 0.9003,
"mean_token_accuracy": 0.7421126961708069,
"step": 1655
},
{
"epoch": 0.30531543130402794,
"grad_norm": 1.0255704189414308,
"learning_rate": 4.24466388966503e-05,
"loss": 0.9249,
"mean_token_accuracy": 0.7345858454704285,
"step": 1660
},
{
"epoch": 0.3062350560971124,
"grad_norm": 0.9438771845720968,
"learning_rate": 4.239542215565e-05,
"loss": 0.9749,
"mean_token_accuracy": 0.7182752847671509,
"step": 1665
},
{
"epoch": 0.3071546808901968,
"grad_norm": 0.9878451650581643,
"learning_rate": 4.2344067645625036e-05,
"loss": 0.9455,
"mean_token_accuracy": 0.7264060854911805,
"step": 1670
},
{
"epoch": 0.3080743056832812,
"grad_norm": 1.1287364443586523,
"learning_rate": 4.229257584155765e-05,
"loss": 0.9218,
"mean_token_accuracy": 0.7332573175430298,
"step": 1675
},
{
"epoch": 0.30899393047636564,
"grad_norm": 0.971666072350275,
"learning_rate": 4.2240947219699895e-05,
"loss": 0.8756,
"mean_token_accuracy": 0.7459922909736634,
"step": 1680
},
{
"epoch": 0.3099135552694501,
"grad_norm": 0.9593974583897734,
"learning_rate": 4.2189182257569285e-05,
"loss": 0.9329,
"mean_token_accuracy": 0.730040967464447,
"step": 1685
},
{
"epoch": 0.31083318006253446,
"grad_norm": 0.943158273064518,
"learning_rate": 4.213728143394436e-05,
"loss": 0.8839,
"mean_token_accuracy": 0.7458212971687317,
"step": 1690
},
{
"epoch": 0.3117528048556189,
"grad_norm": 1.050902490407755,
"learning_rate": 4.208524522886022e-05,
"loss": 0.9443,
"mean_token_accuracy": 0.7311147809028625,
"step": 1695
},
{
"epoch": 0.31267242964870334,
"grad_norm": 1.0074348860409519,
"learning_rate": 4.203307412360418e-05,
"loss": 0.9201,
"mean_token_accuracy": 0.7326057314872741,
"step": 1700
},
{
"epoch": 0.3135920544417877,
"grad_norm": 1.0039288385867127,
"learning_rate": 4.1980768600711194e-05,
"loss": 0.9169,
"mean_token_accuracy": 0.736884355545044,
"step": 1705
},
{
"epoch": 0.31451167923487217,
"grad_norm": 0.9456279018137994,
"learning_rate": 4.1928329143959506e-05,
"loss": 0.9198,
"mean_token_accuracy": 0.7341038465499878,
"step": 1710
},
{
"epoch": 0.3154313040279566,
"grad_norm": 0.969219875361889,
"learning_rate": 4.18757562383661e-05,
"loss": 0.9586,
"mean_token_accuracy": 0.7229322910308837,
"step": 1715
},
{
"epoch": 0.316350928821041,
"grad_norm": 0.9823553221239351,
"learning_rate": 4.182305037018224e-05,
"loss": 0.8674,
"mean_token_accuracy": 0.7455045938491821,
"step": 1720
},
{
"epoch": 0.31727055361412543,
"grad_norm": 0.9614849491835867,
"learning_rate": 4.1770212026888974e-05,
"loss": 0.8978,
"mean_token_accuracy": 0.7393216609954834,
"step": 1725
},
{
"epoch": 0.31819017840720987,
"grad_norm": 1.0298443865011644,
"learning_rate": 4.1717241697192636e-05,
"loss": 0.9046,
"mean_token_accuracy": 0.7390219569206238,
"step": 1730
},
{
"epoch": 0.3191098032002943,
"grad_norm": 0.9675044814332657,
"learning_rate": 4.166413987102031e-05,
"loss": 0.9014,
"mean_token_accuracy": 0.7412125468254089,
"step": 1735
},
{
"epoch": 0.3200294279933787,
"grad_norm": 0.9558901216962499,
"learning_rate": 4.161090703951528e-05,
"loss": 0.8915,
"mean_token_accuracy": 0.7442119359970093,
"step": 1740
},
{
"epoch": 0.32094905278646313,
"grad_norm": 1.0231471726772243,
"learning_rate": 4.155754369503254e-05,
"loss": 0.9508,
"mean_token_accuracy": 0.7272051572799683,
"step": 1745
},
{
"epoch": 0.32186867757954757,
"grad_norm": 0.971225693001968,
"learning_rate": 4.1504050331134186e-05,
"loss": 0.9271,
"mean_token_accuracy": 0.7334083676338196,
"step": 1750
},
{
"epoch": 0.32278830237263195,
"grad_norm": 0.9487975621871125,
"learning_rate": 4.1450427442584885e-05,
"loss": 0.9231,
"mean_token_accuracy": 0.7330006003379822,
"step": 1755
},
{
"epoch": 0.3237079271657164,
"grad_norm": 1.080234485746019,
"learning_rate": 4.13966755253473e-05,
"loss": 0.8934,
"mean_token_accuracy": 0.7371908903121949,
"step": 1760
},
{
"epoch": 0.32462755195880083,
"grad_norm": 1.0042744657060512,
"learning_rate": 4.134279507657746e-05,
"loss": 0.9357,
"mean_token_accuracy": 0.7307947874069214,
"step": 1765
},
{
"epoch": 0.3255471767518852,
"grad_norm": 1.0167454318885076,
"learning_rate": 4.1288786594620224e-05,
"loss": 0.9522,
"mean_token_accuracy": 0.7250777244567871,
"step": 1770
},
{
"epoch": 0.32646680154496965,
"grad_norm": 1.0378785371682158,
"learning_rate": 4.123465057900463e-05,
"loss": 0.8991,
"mean_token_accuracy": 0.7383182883262634,
"step": 1775
},
{
"epoch": 0.3273864263380541,
"grad_norm": 0.975574798117687,
"learning_rate": 4.118038753043927e-05,
"loss": 0.8962,
"mean_token_accuracy": 0.7391498327255249,
"step": 1780
},
{
"epoch": 0.3283060511311385,
"grad_norm": 0.9785593634297269,
"learning_rate": 4.112599795080771e-05,
"loss": 0.8976,
"mean_token_accuracy": 0.7406945347785949,
"step": 1785
},
{
"epoch": 0.3292256759242229,
"grad_norm": 0.9506069452238485,
"learning_rate": 4.107148234316378e-05,
"loss": 0.9792,
"mean_token_accuracy": 0.7183930397033691,
"step": 1790
},
{
"epoch": 0.33014530071730736,
"grad_norm": 0.9568388159915644,
"learning_rate": 4.101684121172696e-05,
"loss": 0.9445,
"mean_token_accuracy": 0.7280240654945374,
"step": 1795
},
{
"epoch": 0.33106492551039174,
"grad_norm": 1.022357456314008,
"learning_rate": 4.096207506187773e-05,
"loss": 0.9394,
"mean_token_accuracy": 0.7300898432731628,
"step": 1800
},
{
"epoch": 0.3319845503034762,
"grad_norm": 0.993312074550177,
"learning_rate": 4.090718440015285e-05,
"loss": 0.8857,
"mean_token_accuracy": 0.7397880554199219,
"step": 1805
},
{
"epoch": 0.3329041750965606,
"grad_norm": 0.9393217165901138,
"learning_rate": 4.0852169734240715e-05,
"loss": 0.9055,
"mean_token_accuracy": 0.7397056937217712,
"step": 1810
},
{
"epoch": 0.333823799889645,
"grad_norm": 1.0286146516865022,
"learning_rate": 4.0797031572976644e-05,
"loss": 0.9486,
"mean_token_accuracy": 0.7270653247833252,
"step": 1815
},
{
"epoch": 0.33474342468272944,
"grad_norm": 1.0433673618214743,
"learning_rate": 4.074177042633818e-05,
"loss": 0.8654,
"mean_token_accuracy": 0.7493741869926452,
"step": 1820
},
{
"epoch": 0.3356630494758139,
"grad_norm": 0.9978374983290279,
"learning_rate": 4.068638680544035e-05,
"loss": 0.9434,
"mean_token_accuracy": 0.7284141898155212,
"step": 1825
},
{
"epoch": 0.33658267426889826,
"grad_norm": 0.9268570875914646,
"learning_rate": 4.063088122253096e-05,
"loss": 0.9323,
"mean_token_accuracy": 0.7292568445205688,
"step": 1830
},
{
"epoch": 0.3375022990619827,
"grad_norm": 1.0098370277606412,
"learning_rate": 4.05752541909859e-05,
"loss": 0.8831,
"mean_token_accuracy": 0.7427129149436951,
"step": 1835
},
{
"epoch": 0.33842192385506714,
"grad_norm": 0.9840521255378257,
"learning_rate": 4.0519506225304266e-05,
"loss": 0.9129,
"mean_token_accuracy": 0.7376075983047485,
"step": 1840
},
{
"epoch": 0.3393415486481516,
"grad_norm": 0.9706147022595509,
"learning_rate": 4.046363784110375e-05,
"loss": 0.8867,
"mean_token_accuracy": 0.7421358585357666,
"step": 1845
},
{
"epoch": 0.34026117344123596,
"grad_norm": 1.0544553608523015,
"learning_rate": 4.040764955511577e-05,
"loss": 0.9404,
"mean_token_accuracy": 0.7300120830535889,
"step": 1850
},
{
"epoch": 0.3411807982343204,
"grad_norm": 0.9771051625951763,
"learning_rate": 4.035154188518076e-05,
"loss": 0.92,
"mean_token_accuracy": 0.7353024840354919,
"step": 1855
},
{
"epoch": 0.34210042302740484,
"grad_norm": 0.9612601058837731,
"learning_rate": 4.02953153502433e-05,
"loss": 0.8822,
"mean_token_accuracy": 0.7446259975433349,
"step": 1860
},
{
"epoch": 0.3430200478204892,
"grad_norm": 1.0790844365415948,
"learning_rate": 4.0238970470347404e-05,
"loss": 0.9243,
"mean_token_accuracy": 0.7315137147903442,
"step": 1865
},
{
"epoch": 0.34393967261357367,
"grad_norm": 0.9988868690440261,
"learning_rate": 4.018250776663164e-05,
"loss": 0.8875,
"mean_token_accuracy": 0.7421119809150696,
"step": 1870
},
{
"epoch": 0.3448592974066581,
"grad_norm": 1.0571095915292046,
"learning_rate": 4.012592776132435e-05,
"loss": 0.9273,
"mean_token_accuracy": 0.731085193157196,
"step": 1875
},
{
"epoch": 0.3457789221997425,
"grad_norm": 1.135743652086019,
"learning_rate": 4.0069230977738826e-05,
"loss": 0.9534,
"mean_token_accuracy": 0.7248372554779052,
"step": 1880
},
{
"epoch": 0.34669854699282693,
"grad_norm": 0.9715071563775657,
"learning_rate": 4.001241794026842e-05,
"loss": 0.94,
"mean_token_accuracy": 0.731473171710968,
"step": 1885
},
{
"epoch": 0.34761817178591137,
"grad_norm": 0.9942342778662301,
"learning_rate": 3.9955489174381746e-05,
"loss": 0.9329,
"mean_token_accuracy": 0.7310616850852967,
"step": 1890
},
{
"epoch": 0.34853779657899575,
"grad_norm": 1.0075175249825896,
"learning_rate": 3.989844520661779e-05,
"loss": 0.9438,
"mean_token_accuracy": 0.7262274742126464,
"step": 1895
},
{
"epoch": 0.3494574213720802,
"grad_norm": 0.9753954477573876,
"learning_rate": 3.984128656458106e-05,
"loss": 0.9702,
"mean_token_accuracy": 0.7193968415260314,
"step": 1900
},
{
"epoch": 0.35037704616516463,
"grad_norm": 1.0133558076382343,
"learning_rate": 3.978401377693669e-05,
"loss": 0.873,
"mean_token_accuracy": 0.7490906119346619,
"step": 1905
},
{
"epoch": 0.351296670958249,
"grad_norm": 1.0343688728685794,
"learning_rate": 3.9726627373405544e-05,
"loss": 0.9308,
"mean_token_accuracy": 0.7297749042510986,
"step": 1910
},
{
"epoch": 0.35221629575133345,
"grad_norm": 0.9695668089988693,
"learning_rate": 3.966912788475937e-05,
"loss": 0.9028,
"mean_token_accuracy": 0.7381954431533814,
"step": 1915
},
{
"epoch": 0.3531359205444179,
"grad_norm": 0.9832664588504738,
"learning_rate": 3.961151584281581e-05,
"loss": 0.8815,
"mean_token_accuracy": 0.7429476737976074,
"step": 1920
},
{
"epoch": 0.3540555453375023,
"grad_norm": 0.963687599953708,
"learning_rate": 3.955379178043352e-05,
"loss": 0.9823,
"mean_token_accuracy": 0.7177613019943238,
"step": 1925
},
{
"epoch": 0.3549751701305867,
"grad_norm": 0.9479437389842555,
"learning_rate": 3.9495956231507266e-05,
"loss": 0.9274,
"mean_token_accuracy": 0.7312801122665405,
"step": 1930
},
{
"epoch": 0.35589479492367115,
"grad_norm": 0.938691928481946,
"learning_rate": 3.943800973096296e-05,
"loss": 0.9017,
"mean_token_accuracy": 0.7394131779670715,
"step": 1935
},
{
"epoch": 0.35681441971675554,
"grad_norm": 0.967769246759337,
"learning_rate": 3.937995281475269e-05,
"loss": 0.9216,
"mean_token_accuracy": 0.7352214097976685,
"step": 1940
},
{
"epoch": 0.35773404450984,
"grad_norm": 0.9613349378582403,
"learning_rate": 3.932178601984982e-05,
"loss": 0.8861,
"mean_token_accuracy": 0.7429886102676392,
"step": 1945
},
{
"epoch": 0.3586536693029244,
"grad_norm": 0.9739202222729397,
"learning_rate": 3.926350988424397e-05,
"loss": 0.8628,
"mean_token_accuracy": 0.7480137705802917,
"step": 1950
},
{
"epoch": 0.35957329409600886,
"grad_norm": 1.00417983410191,
"learning_rate": 3.920512494693607e-05,
"loss": 0.879,
"mean_token_accuracy": 0.7440518856048584,
"step": 1955
},
{
"epoch": 0.36049291888909324,
"grad_norm": 1.0098406374163094,
"learning_rate": 3.9146631747933366e-05,
"loss": 0.8329,
"mean_token_accuracy": 0.759476363658905,
"step": 1960
},
{
"epoch": 0.3614125436821777,
"grad_norm": 0.9962046099940254,
"learning_rate": 3.908803082824441e-05,
"loss": 0.8369,
"mean_token_accuracy": 0.7543352007865906,
"step": 1965
},
{
"epoch": 0.3623321684752621,
"grad_norm": 1.0229275697874085,
"learning_rate": 3.9029322729874104e-05,
"loss": 0.9319,
"mean_token_accuracy": 0.7315138220787049,
"step": 1970
},
{
"epoch": 0.3632517932683465,
"grad_norm": 0.9131833883898176,
"learning_rate": 3.8970507995818636e-05,
"loss": 0.8373,
"mean_token_accuracy": 0.754296875,
"step": 1975
},
{
"epoch": 0.36417141806143094,
"grad_norm": 0.9558351857573911,
"learning_rate": 3.891158717006046e-05,
"loss": 0.892,
"mean_token_accuracy": 0.7430965900421143,
"step": 1980
},
{
"epoch": 0.3650910428545154,
"grad_norm": 0.9446973659937214,
"learning_rate": 3.885256079756331e-05,
"loss": 0.9394,
"mean_token_accuracy": 0.7250162839889527,
"step": 1985
},
{
"epoch": 0.36601066764759976,
"grad_norm": 0.9202948815573198,
"learning_rate": 3.879342942426711e-05,
"loss": 0.9124,
"mean_token_accuracy": 0.7363432049751282,
"step": 1990
},
{
"epoch": 0.3669302924406842,
"grad_norm": 0.9507433703052857,
"learning_rate": 3.8734193597082964e-05,
"loss": 0.9265,
"mean_token_accuracy": 0.7309059858322143,
"step": 1995
},
{
"epoch": 0.36784991723376864,
"grad_norm": 0.9721403940210892,
"learning_rate": 3.867485386388806e-05,
"loss": 0.9368,
"mean_token_accuracy": 0.7331580281257629,
"step": 2000
},
{
"epoch": 0.368769542026853,
"grad_norm": 0.9405505899400793,
"learning_rate": 3.8615410773520635e-05,
"loss": 0.9138,
"mean_token_accuracy": 0.7358463048934937,
"step": 2005
},
{
"epoch": 0.36968916681993746,
"grad_norm": 0.963025470188593,
"learning_rate": 3.8555864875774885e-05,
"loss": 0.9019,
"mean_token_accuracy": 0.7384212732315063,
"step": 2010
},
{
"epoch": 0.3706087916130219,
"grad_norm": 0.9907971594256944,
"learning_rate": 3.849621672139588e-05,
"loss": 0.8763,
"mean_token_accuracy": 0.7444020867347717,
"step": 2015
},
{
"epoch": 0.3715284164061063,
"grad_norm": 0.981696155165083,
"learning_rate": 3.843646686207445e-05,
"loss": 0.9202,
"mean_token_accuracy": 0.7325111865997315,
"step": 2020
},
{
"epoch": 0.3724480411991907,
"grad_norm": 0.990078628199776,
"learning_rate": 3.837661585044211e-05,
"loss": 0.9045,
"mean_token_accuracy": 0.7379343152046204,
"step": 2025
},
{
"epoch": 0.37336766599227517,
"grad_norm": 0.9302652014201332,
"learning_rate": 3.831666424006598e-05,
"loss": 0.9145,
"mean_token_accuracy": 0.7369246363639832,
"step": 2030
},
{
"epoch": 0.37428729078535955,
"grad_norm": 1.0127134327540788,
"learning_rate": 3.825661258544358e-05,
"loss": 0.8949,
"mean_token_accuracy": 0.740783178806305,
"step": 2035
},
{
"epoch": 0.375206915578444,
"grad_norm": 0.9456025309406082,
"learning_rate": 3.819646144199777e-05,
"loss": 0.8635,
"mean_token_accuracy": 0.749360203742981,
"step": 2040
},
{
"epoch": 0.37612654037152843,
"grad_norm": 0.9458510607283644,
"learning_rate": 3.813621136607157e-05,
"loss": 0.9212,
"mean_token_accuracy": 0.7321518301963806,
"step": 2045
},
{
"epoch": 0.3770461651646128,
"grad_norm": 0.995792214246869,
"learning_rate": 3.8075862914923074e-05,
"loss": 0.9529,
"mean_token_accuracy": 0.7222961544990539,
"step": 2050
},
{
"epoch": 0.37796578995769725,
"grad_norm": 0.931780686224964,
"learning_rate": 3.801541664672021e-05,
"loss": 0.9068,
"mean_token_accuracy": 0.7373356938362121,
"step": 2055
},
{
"epoch": 0.3788854147507817,
"grad_norm": 1.032699719779323,
"learning_rate": 3.795487312053566e-05,
"loss": 0.8428,
"mean_token_accuracy": 0.754009485244751,
"step": 2060
},
{
"epoch": 0.37980503954386613,
"grad_norm": 1.0082536583803767,
"learning_rate": 3.789423289634163e-05,
"loss": 0.8877,
"mean_token_accuracy": 0.7419803261756897,
"step": 2065
},
{
"epoch": 0.3807246643369505,
"grad_norm": 0.9922794484448726,
"learning_rate": 3.783349653500472e-05,
"loss": 0.9549,
"mean_token_accuracy": 0.7244602799415588,
"step": 2070
},
{
"epoch": 0.38164428913003495,
"grad_norm": 0.9289765959162268,
"learning_rate": 3.777266459828067e-05,
"loss": 0.9049,
"mean_token_accuracy": 0.7346539378166199,
"step": 2075
},
{
"epoch": 0.3825639139231194,
"grad_norm": 0.9418822148176986,
"learning_rate": 3.7711737648809255e-05,
"loss": 0.8631,
"mean_token_accuracy": 0.7498388290405273,
"step": 2080
},
{
"epoch": 0.3834835387162038,
"grad_norm": 0.9739714347813362,
"learning_rate": 3.765071625010899e-05,
"loss": 0.8642,
"mean_token_accuracy": 0.7496488690376282,
"step": 2085
},
{
"epoch": 0.3844031635092882,
"grad_norm": 0.9876318304111896,
"learning_rate": 3.758960096657197e-05,
"loss": 0.9409,
"mean_token_accuracy": 0.7231215476989746,
"step": 2090
},
{
"epoch": 0.38532278830237265,
"grad_norm": 0.9391298182307426,
"learning_rate": 3.752839236345866e-05,
"loss": 0.9321,
"mean_token_accuracy": 0.7299721479415894,
"step": 2095
},
{
"epoch": 0.38624241309545704,
"grad_norm": 0.9975883406823954,
"learning_rate": 3.746709100689263e-05,
"loss": 0.9119,
"mean_token_accuracy": 0.7372664332389831,
"step": 2100
},
{
"epoch": 0.3871620378885415,
"grad_norm": 0.9585598143365737,
"learning_rate": 3.740569746385531e-05,
"loss": 0.9511,
"mean_token_accuracy": 0.7252285242080688,
"step": 2105
},
{
"epoch": 0.3880816626816259,
"grad_norm": 0.9708930878655039,
"learning_rate": 3.7344212302180807e-05,
"loss": 0.9021,
"mean_token_accuracy": 0.7373741269111633,
"step": 2110
},
{
"epoch": 0.3890012874747103,
"grad_norm": 0.9842480657825518,
"learning_rate": 3.7282636090550613e-05,
"loss": 0.9155,
"mean_token_accuracy": 0.7346144676208496,
"step": 2115
},
{
"epoch": 0.38992091226779474,
"grad_norm": 1.010319909401371,
"learning_rate": 3.722096939848833e-05,
"loss": 0.8251,
"mean_token_accuracy": 0.7569172263145447,
"step": 2120
},
{
"epoch": 0.3908405370608792,
"grad_norm": 1.0232782350312868,
"learning_rate": 3.7159212796354425e-05,
"loss": 0.9061,
"mean_token_accuracy": 0.7363372683525086,
"step": 2125
},
{
"epoch": 0.39176016185396356,
"grad_norm": 0.9853933308782586,
"learning_rate": 3.7097366855340974e-05,
"loss": 0.9281,
"mean_token_accuracy": 0.7297635912895203,
"step": 2130
},
{
"epoch": 0.392679786647048,
"grad_norm": 1.0085562594833883,
"learning_rate": 3.703543214746632e-05,
"loss": 0.9345,
"mean_token_accuracy": 0.7267664670944214,
"step": 2135
},
{
"epoch": 0.39359941144013244,
"grad_norm": 0.9907065624349415,
"learning_rate": 3.6973409245569846e-05,
"loss": 0.9017,
"mean_token_accuracy": 0.7393394112586975,
"step": 2140
},
{
"epoch": 0.3945190362332168,
"grad_norm": 0.9488707860528096,
"learning_rate": 3.691129872330663e-05,
"loss": 0.9373,
"mean_token_accuracy": 0.728193199634552,
"step": 2145
},
{
"epoch": 0.39543866102630126,
"grad_norm": 0.9103606197233259,
"learning_rate": 3.684910115514218e-05,
"loss": 0.897,
"mean_token_accuracy": 0.7412585973739624,
"step": 2150
},
{
"epoch": 0.3963582858193857,
"grad_norm": 0.965709462156266,
"learning_rate": 3.678681711634708e-05,
"loss": 0.8715,
"mean_token_accuracy": 0.74575275182724,
"step": 2155
},
{
"epoch": 0.3972779106124701,
"grad_norm": 1.0272326947622106,
"learning_rate": 3.67244471829917e-05,
"loss": 0.8789,
"mean_token_accuracy": 0.7422020196914673,
"step": 2160
},
{
"epoch": 0.3981975354055545,
"grad_norm": 0.9300588922771316,
"learning_rate": 3.6661991931940856e-05,
"loss": 0.8945,
"mean_token_accuracy": 0.7385678648948669,
"step": 2165
},
{
"epoch": 0.39911716019863896,
"grad_norm": 1.002757392159615,
"learning_rate": 3.6599451940848446e-05,
"loss": 0.8993,
"mean_token_accuracy": 0.7361081838607788,
"step": 2170
},
{
"epoch": 0.4000367849917234,
"grad_norm": 1.1036859227862066,
"learning_rate": 3.6536827788152176e-05,
"loss": 0.9308,
"mean_token_accuracy": 0.7304606318473816,
"step": 2175
},
{
"epoch": 0.4009564097848078,
"grad_norm": 0.9701793563305904,
"learning_rate": 3.6474120053068164e-05,
"loss": 0.8472,
"mean_token_accuracy": 0.7498792171478271,
"step": 2180
},
{
"epoch": 0.4018760345778922,
"grad_norm": 1.041733702997736,
"learning_rate": 3.641132931558556e-05,
"loss": 0.9581,
"mean_token_accuracy": 0.7201631188392639,
"step": 2185
},
{
"epoch": 0.40279565937097667,
"grad_norm": 1.0348942168040987,
"learning_rate": 3.634845615646123e-05,
"loss": 0.9393,
"mean_token_accuracy": 0.7280836224555969,
"step": 2190
},
{
"epoch": 0.40371528416406105,
"grad_norm": 1.0131734961320986,
"learning_rate": 3.628550115721437e-05,
"loss": 0.927,
"mean_token_accuracy": 0.729682469367981,
"step": 2195
},
{
"epoch": 0.4046349089571455,
"grad_norm": 1.025738826571974,
"learning_rate": 3.622246490012111e-05,
"loss": 0.9357,
"mean_token_accuracy": 0.724788224697113,
"step": 2200
},
{
"epoch": 0.40555453375022993,
"grad_norm": 0.9501914998942569,
"learning_rate": 3.615934796820915e-05,
"loss": 0.8978,
"mean_token_accuracy": 0.7385434865951538,
"step": 2205
},
{
"epoch": 0.4064741585433143,
"grad_norm": 1.0106650660729533,
"learning_rate": 3.609615094525235e-05,
"loss": 0.952,
"mean_token_accuracy": 0.7243346452713013,
"step": 2210
},
{
"epoch": 0.40739378333639875,
"grad_norm": 0.9301771755028939,
"learning_rate": 3.6032874415765344e-05,
"loss": 0.8633,
"mean_token_accuracy": 0.7481309175491333,
"step": 2215
},
{
"epoch": 0.4083134081294832,
"grad_norm": 0.9662316400458029,
"learning_rate": 3.596951896499813e-05,
"loss": 0.8931,
"mean_token_accuracy": 0.7380975484848022,
"step": 2220
},
{
"epoch": 0.4092330329225676,
"grad_norm": 0.9612362754674141,
"learning_rate": 3.590608517893065e-05,
"loss": 0.8787,
"mean_token_accuracy": 0.743196439743042,
"step": 2225
},
{
"epoch": 0.410152657715652,
"grad_norm": 0.9923328807528666,
"learning_rate": 3.584257364426738e-05,
"loss": 0.942,
"mean_token_accuracy": 0.7252677202224731,
"step": 2230
},
{
"epoch": 0.41107228250873645,
"grad_norm": 0.9797715702136052,
"learning_rate": 3.577898494843191e-05,
"loss": 0.9523,
"mean_token_accuracy": 0.7244603157043457,
"step": 2235
},
{
"epoch": 0.41199190730182084,
"grad_norm": 0.9048445218025765,
"learning_rate": 3.571531967956147e-05,
"loss": 0.9136,
"mean_token_accuracy": 0.7320458292961121,
"step": 2240
},
{
"epoch": 0.4129115320949053,
"grad_norm": 0.9649058945655278,
"learning_rate": 3.565157842650154e-05,
"loss": 0.9041,
"mean_token_accuracy": 0.7362257719039917,
"step": 2245
},
{
"epoch": 0.4138311568879897,
"grad_norm": 0.9147474250541198,
"learning_rate": 3.55877617788004e-05,
"loss": 0.9155,
"mean_token_accuracy": 0.7333362221717834,
"step": 2250
},
{
"epoch": 0.4147507816810741,
"grad_norm": 0.876619458906422,
"learning_rate": 3.5523870326703635e-05,
"loss": 0.8492,
"mean_token_accuracy": 0.7528911828994751,
"step": 2255
},
{
"epoch": 0.41567040647415854,
"grad_norm": 1.0036194468259731,
"learning_rate": 3.545990466114871e-05,
"loss": 0.9137,
"mean_token_accuracy": 0.734946858882904,
"step": 2260
},
{
"epoch": 0.416590031267243,
"grad_norm": 0.9978348158615458,
"learning_rate": 3.5395865373759504e-05,
"loss": 0.8815,
"mean_token_accuracy": 0.742937445640564,
"step": 2265
},
{
"epoch": 0.41750965606032736,
"grad_norm": 0.9799485166888982,
"learning_rate": 3.533175305684081e-05,
"loss": 0.8857,
"mean_token_accuracy": 0.7412702798843384,
"step": 2270
},
{
"epoch": 0.4184292808534118,
"grad_norm": 0.9766101000667111,
"learning_rate": 3.5267568303372914e-05,
"loss": 0.8934,
"mean_token_accuracy": 0.7409379720687866,
"step": 2275
},
{
"epoch": 0.41934890564649624,
"grad_norm": 0.9775807722195559,
"learning_rate": 3.520331170700605e-05,
"loss": 0.9067,
"mean_token_accuracy": 0.7377767205238343,
"step": 2280
},
{
"epoch": 0.4202685304395807,
"grad_norm": 0.9690742278243399,
"learning_rate": 3.513898386205491e-05,
"loss": 0.9032,
"mean_token_accuracy": 0.7356434345245362,
"step": 2285
},
{
"epoch": 0.42118815523266506,
"grad_norm": 0.965511424805927,
"learning_rate": 3.507458536349323e-05,
"loss": 0.9157,
"mean_token_accuracy": 0.7343951106071472,
"step": 2290
},
{
"epoch": 0.4221077800257495,
"grad_norm": 0.9486968791577164,
"learning_rate": 3.5010116806948166e-05,
"loss": 0.901,
"mean_token_accuracy": 0.7399522423744201,
"step": 2295
},
{
"epoch": 0.42302740481883394,
"grad_norm": 0.9414293890579761,
"learning_rate": 3.4945578788694894e-05,
"loss": 0.9179,
"mean_token_accuracy": 0.7342228889465332,
"step": 2300
},
{
"epoch": 0.4239470296119183,
"grad_norm": 0.9896377940060639,
"learning_rate": 3.4880971905651016e-05,
"loss": 0.8784,
"mean_token_accuracy": 0.7457787752151489,
"step": 2305
},
{
"epoch": 0.42486665440500276,
"grad_norm": 0.9655527131977069,
"learning_rate": 3.481629675537108e-05,
"loss": 0.863,
"mean_token_accuracy": 0.7453173756599426,
"step": 2310
},
{
"epoch": 0.4257862791980872,
"grad_norm": 0.8936296988219236,
"learning_rate": 3.475155393604104e-05,
"loss": 0.8856,
"mean_token_accuracy": 0.7441475629806519,
"step": 2315
},
{
"epoch": 0.4267059039911716,
"grad_norm": 0.9149916486904485,
"learning_rate": 3.468674404647273e-05,
"loss": 0.8532,
"mean_token_accuracy": 0.7507219910621643,
"step": 2320
},
{
"epoch": 0.427625528784256,
"grad_norm": 0.9750792604803812,
"learning_rate": 3.462186768609834e-05,
"loss": 0.863,
"mean_token_accuracy": 0.7469933509826661,
"step": 2325
},
{
"epoch": 0.42854515357734047,
"grad_norm": 0.980901247745682,
"learning_rate": 3.455692545496483e-05,
"loss": 0.837,
"mean_token_accuracy": 0.7545093297958374,
"step": 2330
},
{
"epoch": 0.42946477837042485,
"grad_norm": 0.9686839306544004,
"learning_rate": 3.4491917953728396e-05,
"loss": 0.8885,
"mean_token_accuracy": 0.7428396463394165,
"step": 2335
},
{
"epoch": 0.4303844031635093,
"grad_norm": 0.9388350160272184,
"learning_rate": 3.442684578364897e-05,
"loss": 0.8951,
"mean_token_accuracy": 0.7408537268638611,
"step": 2340
},
{
"epoch": 0.4313040279565937,
"grad_norm": 0.8933385447401438,
"learning_rate": 3.4361709546584545e-05,
"loss": 0.8689,
"mean_token_accuracy": 0.7458449006080627,
"step": 2345
},
{
"epoch": 0.4322236527496781,
"grad_norm": 0.9411177313363235,
"learning_rate": 3.429650984498573e-05,
"loss": 0.8417,
"mean_token_accuracy": 0.7528134107589721,
"step": 2350
},
{
"epoch": 0.43314327754276255,
"grad_norm": 0.9359109119006161,
"learning_rate": 3.423124728189009e-05,
"loss": 0.8737,
"mean_token_accuracy": 0.7434362411499024,
"step": 2355
},
{
"epoch": 0.434062902335847,
"grad_norm": 0.966957214742338,
"learning_rate": 3.4165922460916635e-05,
"loss": 0.8946,
"mean_token_accuracy": 0.7397825956344605,
"step": 2360
},
{
"epoch": 0.4349825271289314,
"grad_norm": 0.9950941777576424,
"learning_rate": 3.410053598626016e-05,
"loss": 0.8833,
"mean_token_accuracy": 0.7447291493415833,
"step": 2365
},
{
"epoch": 0.4359021519220158,
"grad_norm": 0.963560335329199,
"learning_rate": 3.403508846268574e-05,
"loss": 0.8675,
"mean_token_accuracy": 0.7479366779327392,
"step": 2370
},
{
"epoch": 0.43682177671510025,
"grad_norm": 0.9286384422364868,
"learning_rate": 3.396958049552307e-05,
"loss": 0.9171,
"mean_token_accuracy": 0.7304298520088196,
"step": 2375
},
{
"epoch": 0.43774140150818464,
"grad_norm": 0.9750119805406471,
"learning_rate": 3.39040126906609e-05,
"loss": 0.8858,
"mean_token_accuracy": 0.742851734161377,
"step": 2380
},
{
"epoch": 0.4386610263012691,
"grad_norm": 0.9160809046368507,
"learning_rate": 3.383838565454144e-05,
"loss": 0.9062,
"mean_token_accuracy": 0.7335192441940308,
"step": 2385
},
{
"epoch": 0.4395806510943535,
"grad_norm": 0.9668435486381742,
"learning_rate": 3.37726999941547e-05,
"loss": 0.9243,
"mean_token_accuracy": 0.7276196122169495,
"step": 2390
},
{
"epoch": 0.4405002758874379,
"grad_norm": 0.9935097247563913,
"learning_rate": 3.3706956317032954e-05,
"loss": 0.8678,
"mean_token_accuracy": 0.7438644409179688,
"step": 2395
},
{
"epoch": 0.44141990068052234,
"grad_norm": 0.9939894791042586,
"learning_rate": 3.364115523124503e-05,
"loss": 0.8904,
"mean_token_accuracy": 0.7412869215011597,
"step": 2400
},
{
"epoch": 0.4423395254736068,
"grad_norm": 0.9937645932689831,
"learning_rate": 3.357529734539079e-05,
"loss": 0.8455,
"mean_token_accuracy": 0.7517339706420898,
"step": 2405
},
{
"epoch": 0.4432591502666912,
"grad_norm": 0.9375114941684974,
"learning_rate": 3.350938326859539e-05,
"loss": 0.8468,
"mean_token_accuracy": 0.7528372883796692,
"step": 2410
},
{
"epoch": 0.4441787750597756,
"grad_norm": 0.8973960962242926,
"learning_rate": 3.3443413610503735e-05,
"loss": 0.878,
"mean_token_accuracy": 0.7442919254302979,
"step": 2415
},
{
"epoch": 0.44509839985286004,
"grad_norm": 1.0080330285869648,
"learning_rate": 3.337738898127479e-05,
"loss": 0.8785,
"mean_token_accuracy": 0.7428927779197693,
"step": 2420
},
{
"epoch": 0.4460180246459445,
"grad_norm": 0.8985281228115014,
"learning_rate": 3.331130999157597e-05,
"loss": 0.8644,
"mean_token_accuracy": 0.7480224132537842,
"step": 2425
},
{
"epoch": 0.44693764943902886,
"grad_norm": 0.9291069202904676,
"learning_rate": 3.3245177252577454e-05,
"loss": 0.8976,
"mean_token_accuracy": 0.7383280873298645,
"step": 2430
},
{
"epoch": 0.4478572742321133,
"grad_norm": 0.9623008963786942,
"learning_rate": 3.317899137594656e-05,
"loss": 0.9593,
"mean_token_accuracy": 0.7246118664741517,
"step": 2435
},
{
"epoch": 0.44877689902519774,
"grad_norm": 0.9234507163948065,
"learning_rate": 3.311275297384208e-05,
"loss": 0.8413,
"mean_token_accuracy": 0.7528854846954346,
"step": 2440
},
{
"epoch": 0.4496965238182821,
"grad_norm": 0.979267043456503,
"learning_rate": 3.3046462658908636e-05,
"loss": 0.845,
"mean_token_accuracy": 0.7532721877098083,
"step": 2445
},
{
"epoch": 0.45061614861136656,
"grad_norm": 0.9032231134895651,
"learning_rate": 3.298012104427097e-05,
"loss": 0.895,
"mean_token_accuracy": 0.7396630644798279,
"step": 2450
},
{
"epoch": 0.451535773404451,
"grad_norm": 0.9383158653652773,
"learning_rate": 3.291372874352832e-05,
"loss": 0.8943,
"mean_token_accuracy": 0.73899405002594,
"step": 2455
},
{
"epoch": 0.4524553981975354,
"grad_norm": 0.9664126873169693,
"learning_rate": 3.284728637074869e-05,
"loss": 0.869,
"mean_token_accuracy": 0.746407687664032,
"step": 2460
},
{
"epoch": 0.4533750229906198,
"grad_norm": 0.993853088939543,
"learning_rate": 3.278079454046325e-05,
"loss": 0.9011,
"mean_token_accuracy": 0.7388368129730225,
"step": 2465
},
{
"epoch": 0.45429464778370426,
"grad_norm": 0.8741206209918251,
"learning_rate": 3.271425386766058e-05,
"loss": 0.8388,
"mean_token_accuracy": 0.7533232569694519,
"step": 2470
},
{
"epoch": 0.45521427257678865,
"grad_norm": 0.9447835076472045,
"learning_rate": 3.2647664967781035e-05,
"loss": 0.8228,
"mean_token_accuracy": 0.7583665132522583,
"step": 2475
},
{
"epoch": 0.4561338973698731,
"grad_norm": 1.0045001891415821,
"learning_rate": 3.258102845671097e-05,
"loss": 0.8934,
"mean_token_accuracy": 0.7414227366447449,
"step": 2480
},
{
"epoch": 0.4570535221629575,
"grad_norm": 0.9475063098055461,
"learning_rate": 3.251434495077716e-05,
"loss": 0.9182,
"mean_token_accuracy": 0.7303388476371765,
"step": 2485
},
{
"epoch": 0.4579731469560419,
"grad_norm": 0.9775463234456495,
"learning_rate": 3.2447615066741004e-05,
"loss": 0.9361,
"mean_token_accuracy": 0.7293364763259887,
"step": 2490
},
{
"epoch": 0.45889277174912635,
"grad_norm": 0.9174334893241889,
"learning_rate": 3.238083942179288e-05,
"loss": 0.8474,
"mean_token_accuracy": 0.7529029250144958,
"step": 2495
},
{
"epoch": 0.4598123965422108,
"grad_norm": 0.9021239390235616,
"learning_rate": 3.2314018633546375e-05,
"loss": 0.8314,
"mean_token_accuracy": 0.7585980296134949,
"step": 2500
},
{
"epoch": 0.46073202133529517,
"grad_norm": 0.9231622515184421,
"learning_rate": 3.224715332003265e-05,
"loss": 0.8498,
"mean_token_accuracy": 0.7502579808235168,
"step": 2505
},
{
"epoch": 0.4616516461283796,
"grad_norm": 0.9279166556927757,
"learning_rate": 3.218024409969468e-05,
"loss": 0.899,
"mean_token_accuracy": 0.7380064010620118,
"step": 2510
},
{
"epoch": 0.46257127092146405,
"grad_norm": 0.9333611856920211,
"learning_rate": 3.2113291591381516e-05,
"loss": 0.9113,
"mean_token_accuracy": 0.7354224920272827,
"step": 2515
},
{
"epoch": 0.4634908957145485,
"grad_norm": 0.9585859302538061,
"learning_rate": 3.204629641434259e-05,
"loss": 0.912,
"mean_token_accuracy": 0.7332522869110107,
"step": 2520
},
{
"epoch": 0.4644105205076329,
"grad_norm": 1.0072945032594127,
"learning_rate": 3.197925918822199e-05,
"loss": 0.8615,
"mean_token_accuracy": 0.7460902214050293,
"step": 2525
},
{
"epoch": 0.4653301453007173,
"grad_norm": 0.9703474311506037,
"learning_rate": 3.1912180533052716e-05,
"loss": 0.9391,
"mean_token_accuracy": 0.7272826433181763,
"step": 2530
},
{
"epoch": 0.46624977009380175,
"grad_norm": 0.9701812144923739,
"learning_rate": 3.184506106925094e-05,
"loss": 0.8677,
"mean_token_accuracy": 0.747051191329956,
"step": 2535
},
{
"epoch": 0.46716939488688614,
"grad_norm": 0.9672451609696705,
"learning_rate": 3.177790141761029e-05,
"loss": 0.8627,
"mean_token_accuracy": 0.7482078075408936,
"step": 2540
},
{
"epoch": 0.4680890196799706,
"grad_norm": 0.9530973638849749,
"learning_rate": 3.1710702199296085e-05,
"loss": 0.8492,
"mean_token_accuracy": 0.7528972029685974,
"step": 2545
},
{
"epoch": 0.469008644473055,
"grad_norm": 0.9084239076489461,
"learning_rate": 3.16434640358396e-05,
"loss": 0.8653,
"mean_token_accuracy": 0.746622622013092,
"step": 2550
},
{
"epoch": 0.4699282692661394,
"grad_norm": 0.9998420571855022,
"learning_rate": 3.157618754913233e-05,
"loss": 0.8975,
"mean_token_accuracy": 0.738722312450409,
"step": 2555
},
{
"epoch": 0.47084789405922384,
"grad_norm": 0.9250250902872688,
"learning_rate": 3.15088733614202e-05,
"loss": 0.8551,
"mean_token_accuracy": 0.750208032131195,
"step": 2560
},
{
"epoch": 0.4717675188523083,
"grad_norm": 1.0106796436372896,
"learning_rate": 3.144152209529786e-05,
"loss": 0.9079,
"mean_token_accuracy": 0.7350385189056396,
"step": 2565
},
{
"epoch": 0.47268714364539266,
"grad_norm": 0.9619558970415346,
"learning_rate": 3.137413437370289e-05,
"loss": 0.91,
"mean_token_accuracy": 0.7369326472282409,
"step": 2570
},
{
"epoch": 0.4736067684384771,
"grad_norm": 1.0109885841238913,
"learning_rate": 3.130671081991005e-05,
"loss": 0.9084,
"mean_token_accuracy": 0.7353306174278259,
"step": 2575
},
{
"epoch": 0.47452639323156154,
"grad_norm": 0.9779190292756188,
"learning_rate": 3.123925205752552e-05,
"loss": 0.8556,
"mean_token_accuracy": 0.7515247583389282,
"step": 2580
},
{
"epoch": 0.4754460180246459,
"grad_norm": 0.9645840220644,
"learning_rate": 3.1171758710481096e-05,
"loss": 0.8755,
"mean_token_accuracy": 0.7436783194541932,
"step": 2585
},
{
"epoch": 0.47636564281773036,
"grad_norm": 1.001058541812525,
"learning_rate": 3.110423140302852e-05,
"loss": 0.9096,
"mean_token_accuracy": 0.7341774582862854,
"step": 2590
},
{
"epoch": 0.4772852676108148,
"grad_norm": 0.8974468409856537,
"learning_rate": 3.103667075973356e-05,
"loss": 0.9083,
"mean_token_accuracy": 0.7359666705131531,
"step": 2595
},
{
"epoch": 0.4782048924038992,
"grad_norm": 1.0374371477545201,
"learning_rate": 3.096907740547036e-05,
"loss": 0.9111,
"mean_token_accuracy": 0.7324892163276673,
"step": 2600
},
{
"epoch": 0.4791245171969836,
"grad_norm": 0.9405864234939062,
"learning_rate": 3.0901451965415595e-05,
"loss": 0.812,
"mean_token_accuracy": 0.7602822542190552,
"step": 2605
},
{
"epoch": 0.48004414199006806,
"grad_norm": 0.9654353230874346,
"learning_rate": 3.08337950650427e-05,
"loss": 0.8978,
"mean_token_accuracy": 0.7364333510398865,
"step": 2610
},
{
"epoch": 0.48096376678315245,
"grad_norm": 1.0011041381512356,
"learning_rate": 3.076610733011609e-05,
"loss": 0.9049,
"mean_token_accuracy": 0.7363562822341919,
"step": 2615
},
{
"epoch": 0.4818833915762369,
"grad_norm": 0.9686831090055986,
"learning_rate": 3.069838938668538e-05,
"loss": 0.8898,
"mean_token_accuracy": 0.7398189902305603,
"step": 2620
},
{
"epoch": 0.4828030163693213,
"grad_norm": 0.9318085356157495,
"learning_rate": 3.063064186107957e-05,
"loss": 0.8791,
"mean_token_accuracy": 0.7449330806732177,
"step": 2625
},
{
"epoch": 0.48372264116240576,
"grad_norm": 0.8934228857530689,
"learning_rate": 3.056286537990129e-05,
"loss": 0.8632,
"mean_token_accuracy": 0.7459052681922913,
"step": 2630
},
{
"epoch": 0.48464226595549015,
"grad_norm": 0.9725972260652284,
"learning_rate": 3.049506057002098e-05,
"loss": 0.8541,
"mean_token_accuracy": 0.7478031516075134,
"step": 2635
},
{
"epoch": 0.4855618907485746,
"grad_norm": 0.9452628770649284,
"learning_rate": 3.042722805857106e-05,
"loss": 0.8555,
"mean_token_accuracy": 0.746888279914856,
"step": 2640
},
{
"epoch": 0.486481515541659,
"grad_norm": 0.8806175124503305,
"learning_rate": 3.0359368472940208e-05,
"loss": 0.9035,
"mean_token_accuracy": 0.7369076132774353,
"step": 2645
},
{
"epoch": 0.4874011403347434,
"grad_norm": 0.8988265278259941,
"learning_rate": 3.029148244076749e-05,
"loss": 0.8643,
"mean_token_accuracy": 0.7449605345726014,
"step": 2650
},
{
"epoch": 0.48832076512782785,
"grad_norm": 0.9176861265880045,
"learning_rate": 3.022357058993657e-05,
"loss": 0.8643,
"mean_token_accuracy": 0.7462789297103882,
"step": 2655
},
{
"epoch": 0.4892403899209123,
"grad_norm": 0.9232400004776917,
"learning_rate": 3.0155633548569955e-05,
"loss": 0.903,
"mean_token_accuracy": 0.7353234887123108,
"step": 2660
},
{
"epoch": 0.4901600147139967,
"grad_norm": 0.9476269194909095,
"learning_rate": 3.008767194502309e-05,
"loss": 0.9035,
"mean_token_accuracy": 0.7386479258537293,
"step": 2665
},
{
"epoch": 0.4910796395070811,
"grad_norm": 0.931067111141978,
"learning_rate": 3.0019686407878617e-05,
"loss": 0.8883,
"mean_token_accuracy": 0.7414939045906067,
"step": 2670
},
{
"epoch": 0.49199926430016555,
"grad_norm": 0.9153445295986272,
"learning_rate": 2.995167756594055e-05,
"loss": 0.8625,
"mean_token_accuracy": 0.7501867294311524,
"step": 2675
},
{
"epoch": 0.49291888909324993,
"grad_norm": 0.9210143810764434,
"learning_rate": 2.988364604822845e-05,
"loss": 0.8972,
"mean_token_accuracy": 0.7386625647544861,
"step": 2680
},
{
"epoch": 0.4938385138863344,
"grad_norm": 0.9925053868796728,
"learning_rate": 2.9815592483971584e-05,
"loss": 0.8458,
"mean_token_accuracy": 0.751643443107605,
"step": 2685
},
{
"epoch": 0.4947581386794188,
"grad_norm": 1.006336852347141,
"learning_rate": 2.9747517502603167e-05,
"loss": 0.8721,
"mean_token_accuracy": 0.7480525851249695,
"step": 2690
},
{
"epoch": 0.4956777634725032,
"grad_norm": 0.9701598502406181,
"learning_rate": 2.967942173375447e-05,
"loss": 0.8818,
"mean_token_accuracy": 0.740173089504242,
"step": 2695
},
{
"epoch": 0.49659738826558764,
"grad_norm": 0.9431128523024928,
"learning_rate": 2.9611305807249052e-05,
"loss": 0.8344,
"mean_token_accuracy": 0.7551051139831543,
"step": 2700
},
{
"epoch": 0.4975170130586721,
"grad_norm": 0.9346714282194056,
"learning_rate": 2.95431703530969e-05,
"loss": 0.835,
"mean_token_accuracy": 0.7544684171676636,
"step": 2705
},
{
"epoch": 0.49843663785175646,
"grad_norm": 0.9358393411052466,
"learning_rate": 2.9475016001488608e-05,
"loss": 0.8906,
"mean_token_accuracy": 0.7427068829536438,
"step": 2710
},
{
"epoch": 0.4993562626448409,
"grad_norm": 0.8867163340537708,
"learning_rate": 2.9406843382789583e-05,
"loss": 0.8719,
"mean_token_accuracy": 0.745942211151123,
"step": 2715
},
{
"epoch": 0.5002758874379253,
"grad_norm": 0.9212664551640851,
"learning_rate": 2.9338653127534148e-05,
"loss": 0.8562,
"mean_token_accuracy": 0.7497703909873963,
"step": 2720
},
{
"epoch": 0.5011955122310098,
"grad_norm": 0.9432905808331339,
"learning_rate": 2.9270445866419766e-05,
"loss": 0.8741,
"mean_token_accuracy": 0.7432116866111755,
"step": 2725
},
{
"epoch": 0.5021151370240942,
"grad_norm": 0.9512906709412812,
"learning_rate": 2.92022222303012e-05,
"loss": 0.8818,
"mean_token_accuracy": 0.7435823440551758,
"step": 2730
},
{
"epoch": 0.5030347618171785,
"grad_norm": 0.9468765725989278,
"learning_rate": 2.9133982850184645e-05,
"loss": 0.8627,
"mean_token_accuracy": 0.748947024345398,
"step": 2735
},
{
"epoch": 0.503954386610263,
"grad_norm": 1.0112504748902342,
"learning_rate": 2.9065728357221927e-05,
"loss": 0.8508,
"mean_token_accuracy": 0.7537087440490723,
"step": 2740
},
{
"epoch": 0.5048740114033474,
"grad_norm": 0.9649262010355393,
"learning_rate": 2.899745938270465e-05,
"loss": 0.8819,
"mean_token_accuracy": 0.7414289236068725,
"step": 2745
},
{
"epoch": 0.5057936361964318,
"grad_norm": 0.9373961423715033,
"learning_rate": 2.8929176558058352e-05,
"loss": 0.8876,
"mean_token_accuracy": 0.741254198551178,
"step": 2750
},
{
"epoch": 0.5067132609895163,
"grad_norm": 0.9616567239953456,
"learning_rate": 2.8860880514836687e-05,
"loss": 0.8826,
"mean_token_accuracy": 0.7436172485351562,
"step": 2755
},
{
"epoch": 0.5076328857826007,
"grad_norm": 0.9367792403626876,
"learning_rate": 2.8792571884715546e-05,
"loss": 0.8482,
"mean_token_accuracy": 0.7529447674751282,
"step": 2760
},
{
"epoch": 0.5085525105756851,
"grad_norm": 0.9104599971108884,
"learning_rate": 2.8724251299487263e-05,
"loss": 0.8753,
"mean_token_accuracy": 0.7427584528923035,
"step": 2765
},
{
"epoch": 0.5094721353687696,
"grad_norm": 1.0105096627504964,
"learning_rate": 2.8655919391054732e-05,
"loss": 0.8641,
"mean_token_accuracy": 0.7479874610900878,
"step": 2770
},
{
"epoch": 0.510391760161854,
"grad_norm": 0.9279979512504474,
"learning_rate": 2.8587576791425568e-05,
"loss": 0.8317,
"mean_token_accuracy": 0.7535252571105957,
"step": 2775
},
{
"epoch": 0.5113113849549383,
"grad_norm": 0.9297465828114925,
"learning_rate": 2.8519224132706297e-05,
"loss": 0.8774,
"mean_token_accuracy": 0.7402622103691101,
"step": 2780
},
{
"epoch": 0.5122310097480228,
"grad_norm": 0.9452271860575534,
"learning_rate": 2.845086204709645e-05,
"loss": 0.8771,
"mean_token_accuracy": 0.744519031047821,
"step": 2785
},
{
"epoch": 0.5131506345411072,
"grad_norm": 0.9830981203343458,
"learning_rate": 2.838249116688277e-05,
"loss": 0.9289,
"mean_token_accuracy": 0.7298115253448486,
"step": 2790
},
{
"epoch": 0.5140702593341917,
"grad_norm": 1.041430018260559,
"learning_rate": 2.8314112124433334e-05,
"loss": 0.9045,
"mean_token_accuracy": 0.7383831977844239,
"step": 2795
},
{
"epoch": 0.5149898841272761,
"grad_norm": 0.9620402098071436,
"learning_rate": 2.8245725552191703e-05,
"loss": 0.8634,
"mean_token_accuracy": 0.746962821483612,
"step": 2800
},
{
"epoch": 0.5159095089203605,
"grad_norm": 0.9015921123510985,
"learning_rate": 2.8177332082671117e-05,
"loss": 0.853,
"mean_token_accuracy": 0.7487654685974121,
"step": 2805
},
{
"epoch": 0.516829133713445,
"grad_norm": 0.9007228615494444,
"learning_rate": 2.8108932348448553e-05,
"loss": 0.8428,
"mean_token_accuracy": 0.7535581469535828,
"step": 2810
},
{
"epoch": 0.5177487585065293,
"grad_norm": 0.9827577309973088,
"learning_rate": 2.8040526982158993e-05,
"loss": 0.8789,
"mean_token_accuracy": 0.7432992815971374,
"step": 2815
},
{
"epoch": 0.5186683832996137,
"grad_norm": 0.9633925171762643,
"learning_rate": 2.7972116616489464e-05,
"loss": 0.8397,
"mean_token_accuracy": 0.752094304561615,
"step": 2820
},
{
"epoch": 0.5195880080926982,
"grad_norm": 0.9281148435495344,
"learning_rate": 2.790370188417324e-05,
"loss": 0.8596,
"mean_token_accuracy": 0.7485750317573547,
"step": 2825
},
{
"epoch": 0.5205076328857826,
"grad_norm": 1.0029136932204825,
"learning_rate": 2.7835283417984005e-05,
"loss": 0.8718,
"mean_token_accuracy": 0.7433583855628967,
"step": 2830
},
{
"epoch": 0.521427257678867,
"grad_norm": 0.9621263162970809,
"learning_rate": 2.7766861850729958e-05,
"loss": 0.8955,
"mean_token_accuracy": 0.7394774556159973,
"step": 2835
},
{
"epoch": 0.5223468824719515,
"grad_norm": 0.9670299071015823,
"learning_rate": 2.7698437815247995e-05,
"loss": 0.8529,
"mean_token_accuracy": 0.7500015497207642,
"step": 2840
},
{
"epoch": 0.5232665072650359,
"grad_norm": 0.9398184622397476,
"learning_rate": 2.763001194439782e-05,
"loss": 0.8447,
"mean_token_accuracy": 0.7504964828491211,
"step": 2845
},
{
"epoch": 0.5241861320581203,
"grad_norm": 0.8869891271688453,
"learning_rate": 2.756158487105613e-05,
"loss": 0.8404,
"mean_token_accuracy": 0.7549336075782775,
"step": 2850
},
{
"epoch": 0.5251057568512048,
"grad_norm": 0.9965820824716972,
"learning_rate": 2.749315722811073e-05,
"loss": 0.9179,
"mean_token_accuracy": 0.7317790746688843,
"step": 2855
},
{
"epoch": 0.5260253816442891,
"grad_norm": 0.9304946857092635,
"learning_rate": 2.7424729648454717e-05,
"loss": 0.8874,
"mean_token_accuracy": 0.7398088812828064,
"step": 2860
},
{
"epoch": 0.5269450064373735,
"grad_norm": 0.9880649590404676,
"learning_rate": 2.735630276498058e-05,
"loss": 0.8738,
"mean_token_accuracy": 0.7432942867279053,
"step": 2865
},
{
"epoch": 0.527864631230458,
"grad_norm": 0.9350070938993663,
"learning_rate": 2.728787721057437e-05,
"loss": 0.8758,
"mean_token_accuracy": 0.7431787729263306,
"step": 2870
},
{
"epoch": 0.5287842560235424,
"grad_norm": 0.8997664568286488,
"learning_rate": 2.7219453618109853e-05,
"loss": 0.842,
"mean_token_accuracy": 0.7523634552955627,
"step": 2875
},
{
"epoch": 0.5297038808166268,
"grad_norm": 0.9519585493296138,
"learning_rate": 2.715103262044265e-05,
"loss": 0.8744,
"mean_token_accuracy": 0.7417232871055603,
"step": 2880
},
{
"epoch": 0.5306235056097113,
"grad_norm": 0.8836119550117293,
"learning_rate": 2.708261485040439e-05,
"loss": 0.856,
"mean_token_accuracy": 0.7496297836303711,
"step": 2885
},
{
"epoch": 0.5315431304027957,
"grad_norm": 0.9589883589041829,
"learning_rate": 2.7014200940796824e-05,
"loss": 0.8418,
"mean_token_accuracy": 0.7520057439804078,
"step": 2890
},
{
"epoch": 0.53246275519588,
"grad_norm": 0.9563207815434712,
"learning_rate": 2.694579152438601e-05,
"loss": 0.8936,
"mean_token_accuracy": 0.7398610949516297,
"step": 2895
},
{
"epoch": 0.5333823799889645,
"grad_norm": 0.9233468769288075,
"learning_rate": 2.6877387233896472e-05,
"loss": 0.8634,
"mean_token_accuracy": 0.745741093158722,
"step": 2900
},
{
"epoch": 0.5343020047820489,
"grad_norm": 0.9541286928919233,
"learning_rate": 2.6808988702005285e-05,
"loss": 0.868,
"mean_token_accuracy": 0.7439489006996155,
"step": 2905
},
{
"epoch": 0.5352216295751333,
"grad_norm": 0.9922987370495847,
"learning_rate": 2.6740596561336275e-05,
"loss": 0.8482,
"mean_token_accuracy": 0.7504428863525391,
"step": 2910
},
{
"epoch": 0.5361412543682178,
"grad_norm": 0.9722831543231532,
"learning_rate": 2.667221144445418e-05,
"loss": 0.8177,
"mean_token_accuracy": 0.7608316302299499,
"step": 2915
},
{
"epoch": 0.5370608791613022,
"grad_norm": 1.0275441684092577,
"learning_rate": 2.6603833983858738e-05,
"loss": 0.9398,
"mean_token_accuracy": 0.7276052117347718,
"step": 2920
},
{
"epoch": 0.5379805039543866,
"grad_norm": 1.0068511170391965,
"learning_rate": 2.6535464811978894e-05,
"loss": 0.8424,
"mean_token_accuracy": 0.7531503081321717,
"step": 2925
},
{
"epoch": 0.5389001287474711,
"grad_norm": 0.9554905959505885,
"learning_rate": 2.6467104561166927e-05,
"loss": 0.8671,
"mean_token_accuracy": 0.7456499934196472,
"step": 2930
},
{
"epoch": 0.5398197535405554,
"grad_norm": 0.9318421761107843,
"learning_rate": 2.639875386369261e-05,
"loss": 0.8674,
"mean_token_accuracy": 0.7474814653396606,
"step": 2935
},
{
"epoch": 0.5407393783336398,
"grad_norm": 0.9797586514540253,
"learning_rate": 2.6330413351737336e-05,
"loss": 0.893,
"mean_token_accuracy": 0.7371798276901245,
"step": 2940
},
{
"epoch": 0.5416590031267243,
"grad_norm": 0.9627863342351398,
"learning_rate": 2.626208365738831e-05,
"loss": 0.8662,
"mean_token_accuracy": 0.7450501322746277,
"step": 2945
},
{
"epoch": 0.5425786279198087,
"grad_norm": 0.9378560834404903,
"learning_rate": 2.6193765412632677e-05,
"loss": 0.8427,
"mean_token_accuracy": 0.750009298324585,
"step": 2950
},
{
"epoch": 0.5434982527128931,
"grad_norm": 0.9349477883280783,
"learning_rate": 2.6125459249351697e-05,
"loss": 0.8908,
"mean_token_accuracy": 0.7386453747749329,
"step": 2955
},
{
"epoch": 0.5444178775059776,
"grad_norm": 0.9298587181804499,
"learning_rate": 2.6057165799314854e-05,
"loss": 0.855,
"mean_token_accuracy": 0.7491998553276062,
"step": 2960
},
{
"epoch": 0.545337502299062,
"grad_norm": 0.9026144571758381,
"learning_rate": 2.5988885694174085e-05,
"loss": 0.8786,
"mean_token_accuracy": 0.7437506198883057,
"step": 2965
},
{
"epoch": 0.5462571270921464,
"grad_norm": 0.9408107824152944,
"learning_rate": 2.5920619565457877e-05,
"loss": 0.8758,
"mean_token_accuracy": 0.7427832961082459,
"step": 2970
},
{
"epoch": 0.5471767518852308,
"grad_norm": 0.9195819021761746,
"learning_rate": 2.5852368044565452e-05,
"loss": 0.9277,
"mean_token_accuracy": 0.7323094010353088,
"step": 2975
},
{
"epoch": 0.5480963766783152,
"grad_norm": 0.9586681296133412,
"learning_rate": 2.5784131762760922e-05,
"loss": 0.8334,
"mean_token_accuracy": 0.7566598057746887,
"step": 2980
},
{
"epoch": 0.5490160014713996,
"grad_norm": 0.9092467816987784,
"learning_rate": 2.5715911351167465e-05,
"loss": 0.9014,
"mean_token_accuracy": 0.7390154361724853,
"step": 2985
},
{
"epoch": 0.5499356262644841,
"grad_norm": 0.966449128998816,
"learning_rate": 2.564770744076144e-05,
"loss": 0.8959,
"mean_token_accuracy": 0.7373208284378052,
"step": 2990
},
{
"epoch": 0.5508552510575685,
"grad_norm": 1.0269176653506933,
"learning_rate": 2.5579520662366618e-05,
"loss": 0.8626,
"mean_token_accuracy": 0.7471036791801453,
"step": 2995
},
{
"epoch": 0.5517748758506529,
"grad_norm": 0.9705454615801481,
"learning_rate": 2.5511351646648324e-05,
"loss": 0.8761,
"mean_token_accuracy": 0.7408113241195678,
"step": 3000
},
{
"epoch": 0.5526945006437374,
"grad_norm": 0.9683019669667483,
"learning_rate": 2.5443201024107537e-05,
"loss": 0.8974,
"mean_token_accuracy": 0.7345914959907531,
"step": 3005
},
{
"epoch": 0.5536141254368218,
"grad_norm": 0.9328296833493311,
"learning_rate": 2.5375069425075176e-05,
"loss": 0.8629,
"mean_token_accuracy": 0.7468894720077515,
"step": 3010
},
{
"epoch": 0.5545337502299063,
"grad_norm": 0.9565417579373001,
"learning_rate": 2.5306957479706196e-05,
"loss": 0.8914,
"mean_token_accuracy": 0.7373947501182556,
"step": 3015
},
{
"epoch": 0.5554533750229906,
"grad_norm": 0.9439811181197841,
"learning_rate": 2.5238865817973735e-05,
"loss": 0.8264,
"mean_token_accuracy": 0.7566876411437988,
"step": 3020
},
{
"epoch": 0.556372999816075,
"grad_norm": 0.8918377804941932,
"learning_rate": 2.5170795069663374e-05,
"loss": 0.8384,
"mean_token_accuracy": 0.7532538652420044,
"step": 3025
},
{
"epoch": 0.5572926246091595,
"grad_norm": 0.9531681758263391,
"learning_rate": 2.510274586436725e-05,
"loss": 0.9137,
"mean_token_accuracy": 0.7336269617080688,
"step": 3030
},
{
"epoch": 0.5582122494022439,
"grad_norm": 0.9547809224031603,
"learning_rate": 2.5034718831478236e-05,
"loss": 0.8121,
"mean_token_accuracy": 0.7607084512710571,
"step": 3035
},
{
"epoch": 0.5591318741953283,
"grad_norm": 0.9101416039188879,
"learning_rate": 2.496671460018414e-05,
"loss": 0.8374,
"mean_token_accuracy": 0.7512237310409546,
"step": 3040
},
{
"epoch": 0.5600514989884128,
"grad_norm": 0.9591588974138807,
"learning_rate": 2.4898733799461866e-05,
"loss": 0.8691,
"mean_token_accuracy": 0.7475574612617493,
"step": 3045
},
{
"epoch": 0.5609711237814972,
"grad_norm": 0.9481182124754315,
"learning_rate": 2.4830777058071623e-05,
"loss": 0.8541,
"mean_token_accuracy": 0.7470650672912598,
"step": 3050
},
{
"epoch": 0.5618907485745815,
"grad_norm": 0.8991567391844545,
"learning_rate": 2.4762845004551077e-05,
"loss": 0.834,
"mean_token_accuracy": 0.7513617157936097,
"step": 3055
},
{
"epoch": 0.562810373367666,
"grad_norm": 0.8993594505060807,
"learning_rate": 2.4694938267209567e-05,
"loss": 0.8302,
"mean_token_accuracy": 0.7539983510971069,
"step": 3060
},
{
"epoch": 0.5637299981607504,
"grad_norm": 0.9212463554308379,
"learning_rate": 2.4627057474122273e-05,
"loss": 0.8598,
"mean_token_accuracy": 0.747953188419342,
"step": 3065
},
{
"epoch": 0.5646496229538348,
"grad_norm": 0.9155845020709076,
"learning_rate": 2.4559203253124407e-05,
"loss": 0.8728,
"mean_token_accuracy": 0.7440886616706848,
"step": 3070
},
{
"epoch": 0.5655692477469193,
"grad_norm": 0.9376543570110895,
"learning_rate": 2.4491376231805428e-05,
"loss": 0.8529,
"mean_token_accuracy": 0.7518376111984253,
"step": 3075
},
{
"epoch": 0.5664888725400037,
"grad_norm": 0.9720221730313491,
"learning_rate": 2.442357703750322e-05,
"loss": 0.8423,
"mean_token_accuracy": 0.7525236487388611,
"step": 3080
},
{
"epoch": 0.5674084973330881,
"grad_norm": 0.9013738631587733,
"learning_rate": 2.4355806297298296e-05,
"loss": 0.8422,
"mean_token_accuracy": 0.7528858304023742,
"step": 3085
},
{
"epoch": 0.5683281221261726,
"grad_norm": 0.9524358228393591,
"learning_rate": 2.4288064638007974e-05,
"loss": 0.8672,
"mean_token_accuracy": 0.7468002319335938,
"step": 3090
},
{
"epoch": 0.569247746919257,
"grad_norm": 0.9505409858129935,
"learning_rate": 2.4220352686180613e-05,
"loss": 0.8416,
"mean_token_accuracy": 0.7486450433731079,
"step": 3095
},
{
"epoch": 0.5701673717123413,
"grad_norm": 0.9615751645550065,
"learning_rate": 2.415267106808983e-05,
"loss": 0.803,
"mean_token_accuracy": 0.7603586912155151,
"step": 3100
},
{
"epoch": 0.5710869965054258,
"grad_norm": 0.9458073029155306,
"learning_rate": 2.4085020409728633e-05,
"loss": 0.8614,
"mean_token_accuracy": 0.7483598232269287,
"step": 3105
},
{
"epoch": 0.5720066212985102,
"grad_norm": 0.959427274017189,
"learning_rate": 2.4017401336803713e-05,
"loss": 0.8795,
"mean_token_accuracy": 0.7383235573768616,
"step": 3110
},
{
"epoch": 0.5729262460915946,
"grad_norm": 0.9688058239251538,
"learning_rate": 2.394981447472963e-05,
"loss": 0.8854,
"mean_token_accuracy": 0.7413538813591003,
"step": 3115
},
{
"epoch": 0.5738458708846791,
"grad_norm": 0.9543674760330169,
"learning_rate": 2.3882260448623002e-05,
"loss": 0.8924,
"mean_token_accuracy": 0.739243483543396,
"step": 3120
},
{
"epoch": 0.5747654956777635,
"grad_norm": 0.9565581088949338,
"learning_rate": 2.381473988329675e-05,
"loss": 0.8878,
"mean_token_accuracy": 0.737128746509552,
"step": 3125
},
{
"epoch": 0.5756851204708479,
"grad_norm": 0.9446263148140598,
"learning_rate": 2.374725340325433e-05,
"loss": 0.8771,
"mean_token_accuracy": 0.7424870610237122,
"step": 3130
},
{
"epoch": 0.5766047452639324,
"grad_norm": 0.9235345865848048,
"learning_rate": 2.3679801632683927e-05,
"loss": 0.8791,
"mean_token_accuracy": 0.7413055062294006,
"step": 3135
},
{
"epoch": 0.5775243700570167,
"grad_norm": 0.931358306977097,
"learning_rate": 2.3612385195452687e-05,
"loss": 0.8864,
"mean_token_accuracy": 0.7415070414543152,
"step": 3140
},
{
"epoch": 0.5784439948501011,
"grad_norm": 0.9366462545353926,
"learning_rate": 2.3545004715100966e-05,
"loss": 0.8791,
"mean_token_accuracy": 0.7428970575332642,
"step": 3145
},
{
"epoch": 0.5793636196431856,
"grad_norm": 0.9312216076414869,
"learning_rate": 2.3477660814836562e-05,
"loss": 0.8318,
"mean_token_accuracy": 0.7540540814399719,
"step": 3150
},
{
"epoch": 0.58028324443627,
"grad_norm": 0.9058432741408705,
"learning_rate": 2.3410354117528904e-05,
"loss": 0.9128,
"mean_token_accuracy": 0.7328131318092346,
"step": 3155
},
{
"epoch": 0.5812028692293544,
"grad_norm": 0.92693757568253,
"learning_rate": 2.3343085245703373e-05,
"loss": 0.8356,
"mean_token_accuracy": 0.754761004447937,
"step": 3160
},
{
"epoch": 0.5821224940224389,
"grad_norm": 0.9685552745916727,
"learning_rate": 2.3275854821535476e-05,
"loss": 0.8696,
"mean_token_accuracy": 0.7423434615135193,
"step": 3165
},
{
"epoch": 0.5830421188155233,
"grad_norm": 0.9530016316914325,
"learning_rate": 2.3208663466845108e-05,
"loss": 0.8239,
"mean_token_accuracy": 0.7581414461135865,
"step": 3170
},
{
"epoch": 0.5839617436086076,
"grad_norm": 0.9912981010776241,
"learning_rate": 2.3141511803090815e-05,
"loss": 0.8784,
"mean_token_accuracy": 0.743216586112976,
"step": 3175
},
{
"epoch": 0.5848813684016921,
"grad_norm": 0.8897494823501038,
"learning_rate": 2.3074400451364048e-05,
"loss": 0.8771,
"mean_token_accuracy": 0.7422731041908264,
"step": 3180
},
{
"epoch": 0.5858009931947765,
"grad_norm": 0.9087254524604537,
"learning_rate": 2.300733003238339e-05,
"loss": 0.8249,
"mean_token_accuracy": 0.75495365858078,
"step": 3185
},
{
"epoch": 0.5867206179878609,
"grad_norm": 0.9615326948623956,
"learning_rate": 2.2940301166488846e-05,
"loss": 0.7821,
"mean_token_accuracy": 0.7687617659568786,
"step": 3190
},
{
"epoch": 0.5876402427809454,
"grad_norm": 0.9239773147706558,
"learning_rate": 2.28733144736361e-05,
"loss": 0.8034,
"mean_token_accuracy": 0.7630661010742188,
"step": 3195
},
{
"epoch": 0.5885598675740298,
"grad_norm": 0.9271354944208791,
"learning_rate": 2.2806370573390745e-05,
"loss": 0.8377,
"mean_token_accuracy": 0.7517584562301636,
"step": 3200
},
{
"epoch": 0.5894794923671142,
"grad_norm": 0.9307261567222711,
"learning_rate": 2.2739470084922608e-05,
"loss": 0.9145,
"mean_token_accuracy": 0.7307730317115784,
"step": 3205
},
{
"epoch": 0.5903991171601987,
"grad_norm": 0.8708186634436479,
"learning_rate": 2.2672613626999994e-05,
"loss": 0.8495,
"mean_token_accuracy": 0.7486128211021423,
"step": 3210
},
{
"epoch": 0.591318741953283,
"grad_norm": 0.9473141853732495,
"learning_rate": 2.2605801817983958e-05,
"loss": 0.8341,
"mean_token_accuracy": 0.7518749475479126,
"step": 3215
},
{
"epoch": 0.5922383667463674,
"grad_norm": 0.9382593885727152,
"learning_rate": 2.253903527582259e-05,
"loss": 0.8447,
"mean_token_accuracy": 0.7506359577178955,
"step": 3220
},
{
"epoch": 0.5931579915394519,
"grad_norm": 0.9696123819996886,
"learning_rate": 2.247231461804532e-05,
"loss": 0.8266,
"mean_token_accuracy": 0.7562480688095092,
"step": 3225
},
{
"epoch": 0.5940776163325363,
"grad_norm": 0.8949351423802622,
"learning_rate": 2.2405640461757176e-05,
"loss": 0.814,
"mean_token_accuracy": 0.7592174887657166,
"step": 3230
},
{
"epoch": 0.5949972411256208,
"grad_norm": 0.9615311548799811,
"learning_rate": 2.2339013423633083e-05,
"loss": 0.8503,
"mean_token_accuracy": 0.7499252796173096,
"step": 3235
},
{
"epoch": 0.5959168659187052,
"grad_norm": 0.9086052926810453,
"learning_rate": 2.2272434119912184e-05,
"loss": 0.8754,
"mean_token_accuracy": 0.7434251546859741,
"step": 3240
},
{
"epoch": 0.5968364907117896,
"grad_norm": 0.9221742878259598,
"learning_rate": 2.2205903166392113e-05,
"loss": 0.8477,
"mean_token_accuracy": 0.7485897660255432,
"step": 3245
},
{
"epoch": 0.5977561155048741,
"grad_norm": 0.967041034869552,
"learning_rate": 2.2139421178423307e-05,
"loss": 0.8225,
"mean_token_accuracy": 0.7570245742797852,
"step": 3250
},
{
"epoch": 0.5986757402979584,
"grad_norm": 0.981067205830958,
"learning_rate": 2.207298877090333e-05,
"loss": 0.8701,
"mean_token_accuracy": 0.7440281748771668,
"step": 3255
},
{
"epoch": 0.5995953650910428,
"grad_norm": 0.989973298607582,
"learning_rate": 2.2006606558271142e-05,
"loss": 0.8713,
"mean_token_accuracy": 0.7413482785224914,
"step": 3260
},
{
"epoch": 0.6005149898841273,
"grad_norm": 0.8672144464089592,
"learning_rate": 2.1940275154501482e-05,
"loss": 0.87,
"mean_token_accuracy": 0.743138313293457,
"step": 3265
},
{
"epoch": 0.6014346146772117,
"grad_norm": 0.9653292378844739,
"learning_rate": 2.187399517309914e-05,
"loss": 0.8575,
"mean_token_accuracy": 0.7464121103286743,
"step": 3270
},
{
"epoch": 0.6023542394702961,
"grad_norm": 0.9239524199502155,
"learning_rate": 2.1807767227093268e-05,
"loss": 0.8236,
"mean_token_accuracy": 0.7573307991027832,
"step": 3275
},
{
"epoch": 0.6032738642633806,
"grad_norm": 0.9806975126747703,
"learning_rate": 2.1741591929031795e-05,
"loss": 0.878,
"mean_token_accuracy": 0.7407856106758117,
"step": 3280
},
{
"epoch": 0.604193489056465,
"grad_norm": 0.9640808408127749,
"learning_rate": 2.167546989097566e-05,
"loss": 0.8638,
"mean_token_accuracy": 0.7459958910942077,
"step": 3285
},
{
"epoch": 0.6051131138495494,
"grad_norm": 0.9656473527433518,
"learning_rate": 2.16094017244932e-05,
"loss": 0.8783,
"mean_token_accuracy": 0.7419638872146607,
"step": 3290
},
{
"epoch": 0.6060327386426339,
"grad_norm": 0.9930014003610543,
"learning_rate": 2.154338804065451e-05,
"loss": 0.8615,
"mean_token_accuracy": 0.7456332087516785,
"step": 3295
},
{
"epoch": 0.6069523634357182,
"grad_norm": 0.9330196848152268,
"learning_rate": 2.1477429450025767e-05,
"loss": 0.8352,
"mean_token_accuracy": 0.7517044901847839,
"step": 3300
},
{
"epoch": 0.6078719882288026,
"grad_norm": 0.8777553334567131,
"learning_rate": 2.1411526562663554e-05,
"loss": 0.8364,
"mean_token_accuracy": 0.7501665949821472,
"step": 3305
},
{
"epoch": 0.6087916130218871,
"grad_norm": 0.9315142599796349,
"learning_rate": 2.1345679988109284e-05,
"loss": 0.8378,
"mean_token_accuracy": 0.7534802198410034,
"step": 3310
},
{
"epoch": 0.6097112378149715,
"grad_norm": 0.9385962221597601,
"learning_rate": 2.1279890335383534e-05,
"loss": 0.8876,
"mean_token_accuracy": 0.7398653388023376,
"step": 3315
},
{
"epoch": 0.6106308626080559,
"grad_norm": 0.9451857651632474,
"learning_rate": 2.1214158212980366e-05,
"loss": 0.7988,
"mean_token_accuracy": 0.7636669516563416,
"step": 3320
},
{
"epoch": 0.6115504874011404,
"grad_norm": 0.9310680714278403,
"learning_rate": 2.114848422886177e-05,
"loss": 0.8417,
"mean_token_accuracy": 0.7545873999595643,
"step": 3325
},
{
"epoch": 0.6124701121942248,
"grad_norm": 0.9555284993925652,
"learning_rate": 2.108286899045202e-05,
"loss": 0.8906,
"mean_token_accuracy": 0.7384588122367859,
"step": 3330
},
{
"epoch": 0.6133897369873091,
"grad_norm": 0.9525478437560697,
"learning_rate": 2.1017313104632003e-05,
"loss": 0.844,
"mean_token_accuracy": 0.7497392654418945,
"step": 3335
},
{
"epoch": 0.6143093617803936,
"grad_norm": 0.9657934498214388,
"learning_rate": 2.0951817177733684e-05,
"loss": 0.8748,
"mean_token_accuracy": 0.7426393389701843,
"step": 3340
},
{
"epoch": 0.615228986573478,
"grad_norm": 0.9174407552166862,
"learning_rate": 2.088638181553446e-05,
"loss": 0.8727,
"mean_token_accuracy": 0.742801570892334,
"step": 3345
},
{
"epoch": 0.6161486113665624,
"grad_norm": 0.9106809477969502,
"learning_rate": 2.0821007623251564e-05,
"loss": 0.8227,
"mean_token_accuracy": 0.7550573825836182,
"step": 3350
},
{
"epoch": 0.6170682361596469,
"grad_norm": 0.8816231707997737,
"learning_rate": 2.075569520553643e-05,
"loss": 0.8066,
"mean_token_accuracy": 0.7590124368667602,
"step": 3355
},
{
"epoch": 0.6179878609527313,
"grad_norm": 0.9651791807712018,
"learning_rate": 2.0690445166469158e-05,
"loss": 0.8575,
"mean_token_accuracy": 0.7481630921363831,
"step": 3360
},
{
"epoch": 0.6189074857458157,
"grad_norm": 0.962161882798645,
"learning_rate": 2.0625258109552926e-05,
"loss": 0.8842,
"mean_token_accuracy": 0.743985378742218,
"step": 3365
},
{
"epoch": 0.6198271105389002,
"grad_norm": 0.955250281560398,
"learning_rate": 2.0560134637708334e-05,
"loss": 0.8413,
"mean_token_accuracy": 0.7497357606887818,
"step": 3370
},
{
"epoch": 0.6207467353319845,
"grad_norm": 1.0327175413319667,
"learning_rate": 2.0495075353267913e-05,
"loss": 0.8697,
"mean_token_accuracy": 0.7445659875869751,
"step": 3375
},
{
"epoch": 0.6216663601250689,
"grad_norm": 0.9525687098312168,
"learning_rate": 2.043008085797052e-05,
"loss": 0.8722,
"mean_token_accuracy": 0.7410041093826294,
"step": 3380
},
{
"epoch": 0.6225859849181534,
"grad_norm": 0.9275514977855014,
"learning_rate": 2.036515175295574e-05,
"loss": 0.8412,
"mean_token_accuracy": 0.7507887959480286,
"step": 3385
},
{
"epoch": 0.6235056097112378,
"grad_norm": 0.9493961658678648,
"learning_rate": 2.03002886387584e-05,
"loss": 0.8556,
"mean_token_accuracy": 0.7469261646270752,
"step": 3390
},
{
"epoch": 0.6244252345043222,
"grad_norm": 0.9292345545436532,
"learning_rate": 2.0235492115302944e-05,
"loss": 0.8301,
"mean_token_accuracy": 0.7550871014595032,
"step": 3395
},
{
"epoch": 0.6253448592974067,
"grad_norm": 0.9430411664378814,
"learning_rate": 2.017076278189794e-05,
"loss": 0.8321,
"mean_token_accuracy": 0.7533326983451843,
"step": 3400
},
{
"epoch": 0.6262644840904911,
"grad_norm": 0.8889521393845567,
"learning_rate": 2.0106101237230455e-05,
"loss": 0.8324,
"mean_token_accuracy": 0.7539088129997253,
"step": 3405
},
{
"epoch": 0.6271841088835755,
"grad_norm": 0.9180009901150891,
"learning_rate": 2.0041508079360634e-05,
"loss": 0.7898,
"mean_token_accuracy": 0.761493980884552,
"step": 3410
},
{
"epoch": 0.62810373367666,
"grad_norm": 0.9055995921329637,
"learning_rate": 1.997698390571608e-05,
"loss": 0.8419,
"mean_token_accuracy": 0.7503387928009033,
"step": 3415
},
{
"epoch": 0.6290233584697443,
"grad_norm": 0.9447591194939752,
"learning_rate": 1.991252931308633e-05,
"loss": 0.8692,
"mean_token_accuracy": 0.7452242970466614,
"step": 3420
},
{
"epoch": 0.6299429832628287,
"grad_norm": 0.9351426059072258,
"learning_rate": 1.9848144897617417e-05,
"loss": 0.8149,
"mean_token_accuracy": 0.7568124055862426,
"step": 3425
},
{
"epoch": 0.6308626080559132,
"grad_norm": 0.9168023134449134,
"learning_rate": 1.9783831254806257e-05,
"loss": 0.8157,
"mean_token_accuracy": 0.7554953694343567,
"step": 3430
},
{
"epoch": 0.6317822328489976,
"grad_norm": 1.027979530127791,
"learning_rate": 1.971958897949518e-05,
"loss": 0.8229,
"mean_token_accuracy": 0.7550533413887024,
"step": 3435
},
{
"epoch": 0.632701857642082,
"grad_norm": 0.8964633060914129,
"learning_rate": 1.9655418665866465e-05,
"loss": 0.7966,
"mean_token_accuracy": 0.7639833688735962,
"step": 3440
},
{
"epoch": 0.6336214824351665,
"grad_norm": 0.8702615238247585,
"learning_rate": 1.9591320907436782e-05,
"loss": 0.8502,
"mean_token_accuracy": 0.74614177942276,
"step": 3445
},
{
"epoch": 0.6345411072282509,
"grad_norm": 0.9157962896320851,
"learning_rate": 1.9527296297051765e-05,
"loss": 0.8026,
"mean_token_accuracy": 0.758307683467865,
"step": 3450
},
{
"epoch": 0.6354607320213354,
"grad_norm": 0.9465005665572019,
"learning_rate": 1.9463345426880448e-05,
"loss": 0.8036,
"mean_token_accuracy": 0.7617629647254944,
"step": 3455
},
{
"epoch": 0.6363803568144197,
"grad_norm": 0.9618417431183126,
"learning_rate": 1.939946888840986e-05,
"loss": 0.8819,
"mean_token_accuracy": 0.7395693898200989,
"step": 3460
},
{
"epoch": 0.6372999816075041,
"grad_norm": 0.9326022903907812,
"learning_rate": 1.933566727243956e-05,
"loss": 0.8384,
"mean_token_accuracy": 0.7497618556022644,
"step": 3465
},
{
"epoch": 0.6382196064005886,
"grad_norm": 0.942168299955769,
"learning_rate": 1.927194116907608e-05,
"loss": 0.8821,
"mean_token_accuracy": 0.7422310829162597,
"step": 3470
},
{
"epoch": 0.639139231193673,
"grad_norm": 0.930256851029374,
"learning_rate": 1.9208291167727576e-05,
"loss": 0.8293,
"mean_token_accuracy": 0.7561385631561279,
"step": 3475
},
{
"epoch": 0.6400588559867574,
"grad_norm": 0.8857746537604931,
"learning_rate": 1.9144717857098328e-05,
"loss": 0.8166,
"mean_token_accuracy": 0.7583439826965332,
"step": 3480
},
{
"epoch": 0.6409784807798419,
"grad_norm": 0.9519372824273006,
"learning_rate": 1.908122182518326e-05,
"loss": 0.8674,
"mean_token_accuracy": 0.741856062412262,
"step": 3485
},
{
"epoch": 0.6418981055729263,
"grad_norm": 0.9483959540274922,
"learning_rate": 1.9017803659262583e-05,
"loss": 0.8496,
"mean_token_accuracy": 0.7491413950920105,
"step": 3490
},
{
"epoch": 0.6428177303660106,
"grad_norm": 0.9729346329964175,
"learning_rate": 1.8954463945896293e-05,
"loss": 0.8554,
"mean_token_accuracy": 0.7483752846717835,
"step": 3495
},
{
"epoch": 0.6437373551590951,
"grad_norm": 0.910719020599245,
"learning_rate": 1.889120327091879e-05,
"loss": 0.8332,
"mean_token_accuracy": 0.753311276435852,
"step": 3500
},
{
"epoch": 0.6446569799521795,
"grad_norm": 0.8997078755147822,
"learning_rate": 1.8828022219433413e-05,
"loss": 0.8311,
"mean_token_accuracy": 0.7538302779197693,
"step": 3505
},
{
"epoch": 0.6455766047452639,
"grad_norm": 0.9097287217365273,
"learning_rate": 1.8764921375807083e-05,
"loss": 0.8573,
"mean_token_accuracy": 0.74767564535141,
"step": 3510
},
{
"epoch": 0.6464962295383484,
"grad_norm": 0.9420262116863728,
"learning_rate": 1.8701901323664863e-05,
"loss": 0.8551,
"mean_token_accuracy": 0.7479906916618347,
"step": 3515
},
{
"epoch": 0.6474158543314328,
"grad_norm": 0.9297816459092663,
"learning_rate": 1.8638962645884565e-05,
"loss": 0.8066,
"mean_token_accuracy": 0.7580268263816834,
"step": 3520
},
{
"epoch": 0.6483354791245172,
"grad_norm": 0.946031226164797,
"learning_rate": 1.8576105924591357e-05,
"loss": 0.8179,
"mean_token_accuracy": 0.7542472004890441,
"step": 3525
},
{
"epoch": 0.6492551039176017,
"grad_norm": 0.9036904422802344,
"learning_rate": 1.8513331741152412e-05,
"loss": 0.8261,
"mean_token_accuracy": 0.7552783608436584,
"step": 3530
},
{
"epoch": 0.650174728710686,
"grad_norm": 0.921905554132334,
"learning_rate": 1.8450640676171472e-05,
"loss": 0.8351,
"mean_token_accuracy": 0.752598226070404,
"step": 3535
},
{
"epoch": 0.6510943535037704,
"grad_norm": 1.0035005670649164,
"learning_rate": 1.8388033309483522e-05,
"loss": 0.8981,
"mean_token_accuracy": 0.7371325850486755,
"step": 3540
},
{
"epoch": 0.6520139782968549,
"grad_norm": 0.9724909600231612,
"learning_rate": 1.8325510220149413e-05,
"loss": 0.8327,
"mean_token_accuracy": 0.751532518863678,
"step": 3545
},
{
"epoch": 0.6529336030899393,
"grad_norm": 0.9664687506252672,
"learning_rate": 1.8263071986450524e-05,
"loss": 0.8336,
"mean_token_accuracy": 0.7516280770301819,
"step": 3550
},
{
"epoch": 0.6538532278830237,
"grad_norm": 0.9164445815967506,
"learning_rate": 1.8200719185883358e-05,
"loss": 0.8316,
"mean_token_accuracy": 0.7544404864311218,
"step": 3555
},
{
"epoch": 0.6547728526761082,
"grad_norm": 0.9293565126179983,
"learning_rate": 1.813845239515427e-05,
"loss": 0.8257,
"mean_token_accuracy": 0.7552899837493896,
"step": 3560
},
{
"epoch": 0.6556924774691926,
"grad_norm": 0.9010810987925738,
"learning_rate": 1.8076272190174115e-05,
"loss": 0.8201,
"mean_token_accuracy": 0.7565722703933716,
"step": 3565
},
{
"epoch": 0.656612102262277,
"grad_norm": 1.0075745989661558,
"learning_rate": 1.801417914605286e-05,
"loss": 0.869,
"mean_token_accuracy": 0.7453143835067749,
"step": 3570
},
{
"epoch": 0.6575317270553614,
"grad_norm": 0.935586367301874,
"learning_rate": 1.795217383709437e-05,
"loss": 0.8845,
"mean_token_accuracy": 0.7403179168701172,
"step": 3575
},
{
"epoch": 0.6584513518484458,
"grad_norm": 0.9872971011864189,
"learning_rate": 1.7890256836791008e-05,
"loss": 0.8052,
"mean_token_accuracy": 0.7629344463348389,
"step": 3580
},
{
"epoch": 0.6593709766415302,
"grad_norm": 0.9876503263464145,
"learning_rate": 1.7828428717818353e-05,
"loss": 0.8135,
"mean_token_accuracy": 0.7590724229812622,
"step": 3585
},
{
"epoch": 0.6602906014346147,
"grad_norm": 0.8811578706911977,
"learning_rate": 1.7766690052029944e-05,
"loss": 0.8221,
"mean_token_accuracy": 0.7560603976249695,
"step": 3590
},
{
"epoch": 0.6612102262276991,
"grad_norm": 0.9719326557742581,
"learning_rate": 1.770504141045194e-05,
"loss": 0.8342,
"mean_token_accuracy": 0.7510559558868408,
"step": 3595
},
{
"epoch": 0.6621298510207835,
"grad_norm": 1.0132470520749903,
"learning_rate": 1.7643483363277874e-05,
"loss": 0.8487,
"mean_token_accuracy": 0.7500616908073425,
"step": 3600
},
{
"epoch": 0.663049475813868,
"grad_norm": 1.0318932699213554,
"learning_rate": 1.7582016479863327e-05,
"loss": 0.8487,
"mean_token_accuracy": 0.7490703582763671,
"step": 3605
},
{
"epoch": 0.6639691006069524,
"grad_norm": 0.8658023921332224,
"learning_rate": 1.7520641328720756e-05,
"loss": 0.8238,
"mean_token_accuracy": 0.7564070224761963,
"step": 3610
},
{
"epoch": 0.6648887254000367,
"grad_norm": 0.9750052383478849,
"learning_rate": 1.7459358477514122e-05,
"loss": 0.8249,
"mean_token_accuracy": 0.7549832344055176,
"step": 3615
},
{
"epoch": 0.6658083501931212,
"grad_norm": 0.957114636285714,
"learning_rate": 1.7398168493053723e-05,
"loss": 0.7881,
"mean_token_accuracy": 0.7615378856658935,
"step": 3620
},
{
"epoch": 0.6667279749862056,
"grad_norm": 0.9148381033348181,
"learning_rate": 1.7337071941290944e-05,
"loss": 0.8196,
"mean_token_accuracy": 0.7577734112739563,
"step": 3625
},
{
"epoch": 0.66764759977929,
"grad_norm": 0.9583843198631806,
"learning_rate": 1.7276069387312955e-05,
"loss": 0.9,
"mean_token_accuracy": 0.7367844343185425,
"step": 3630
},
{
"epoch": 0.6685672245723745,
"grad_norm": 0.9525242256598431,
"learning_rate": 1.7215161395337572e-05,
"loss": 0.8351,
"mean_token_accuracy": 0.7536734580993653,
"step": 3635
},
{
"epoch": 0.6694868493654589,
"grad_norm": 0.9218486580963495,
"learning_rate": 1.7154348528707992e-05,
"loss": 0.8512,
"mean_token_accuracy": 0.7513302564620972,
"step": 3640
},
{
"epoch": 0.6704064741585433,
"grad_norm": 0.9497350819436411,
"learning_rate": 1.709363134988757e-05,
"loss": 0.8522,
"mean_token_accuracy": 0.747953987121582,
"step": 3645
},
{
"epoch": 0.6713260989516278,
"grad_norm": 0.9359833703344925,
"learning_rate": 1.7033010420454655e-05,
"loss": 0.8091,
"mean_token_accuracy": 0.7576663970947266,
"step": 3650
},
{
"epoch": 0.6722457237447121,
"grad_norm": 0.9884296155896105,
"learning_rate": 1.6972486301097376e-05,
"loss": 0.8185,
"mean_token_accuracy": 0.7578543424606323,
"step": 3655
},
{
"epoch": 0.6731653485377965,
"grad_norm": 0.885165473016121,
"learning_rate": 1.691205955160845e-05,
"loss": 0.8461,
"mean_token_accuracy": 0.7491200208663941,
"step": 3660
},
{
"epoch": 0.674084973330881,
"grad_norm": 0.9715821597591158,
"learning_rate": 1.6851730730880012e-05,
"loss": 0.8527,
"mean_token_accuracy": 0.7483757376670838,
"step": 3665
},
{
"epoch": 0.6750045981239654,
"grad_norm": 0.8871437133597592,
"learning_rate": 1.679150039689846e-05,
"loss": 0.8148,
"mean_token_accuracy": 0.7578411340713501,
"step": 3670
},
{
"epoch": 0.6759242229170498,
"grad_norm": 0.9530586600231223,
"learning_rate": 1.673136910673926e-05,
"loss": 0.8645,
"mean_token_accuracy": 0.7451423764228821,
"step": 3675
},
{
"epoch": 0.6768438477101343,
"grad_norm": 0.9427729850229866,
"learning_rate": 1.6671337416561817e-05,
"loss": 0.8432,
"mean_token_accuracy": 0.7509079575538635,
"step": 3680
},
{
"epoch": 0.6777634725032187,
"grad_norm": 0.9325142143827265,
"learning_rate": 1.661140588160435e-05,
"loss": 0.8347,
"mean_token_accuracy": 0.7516968011856079,
"step": 3685
},
{
"epoch": 0.6786830972963032,
"grad_norm": 0.9601757924065347,
"learning_rate": 1.6551575056178695e-05,
"loss": 0.8166,
"mean_token_accuracy": 0.7589465737342834,
"step": 3690
},
{
"epoch": 0.6796027220893875,
"grad_norm": 1.0086779966517565,
"learning_rate": 1.649184549366525e-05,
"loss": 0.8395,
"mean_token_accuracy": 0.7520246505737305,
"step": 3695
},
{
"epoch": 0.6805223468824719,
"grad_norm": 0.9707009645804029,
"learning_rate": 1.6432217746507814e-05,
"loss": 0.8382,
"mean_token_accuracy": 0.7533354997634888,
"step": 3700
},
{
"epoch": 0.6814419716755564,
"grad_norm": 0.9109669918450888,
"learning_rate": 1.6372692366208476e-05,
"loss": 0.8186,
"mean_token_accuracy": 0.7560298204421997,
"step": 3705
},
{
"epoch": 0.6823615964686408,
"grad_norm": 0.931556246223817,
"learning_rate": 1.6313269903322536e-05,
"loss": 0.8682,
"mean_token_accuracy": 0.7464072823524475,
"step": 3710
},
{
"epoch": 0.6832812212617252,
"grad_norm": 0.9316943141031991,
"learning_rate": 1.6253950907453414e-05,
"loss": 0.7891,
"mean_token_accuracy": 0.7643645644187927,
"step": 3715
},
{
"epoch": 0.6842008460548097,
"grad_norm": 0.9367407375514984,
"learning_rate": 1.619473592724752e-05,
"loss": 0.8489,
"mean_token_accuracy": 0.7488224864006042,
"step": 3720
},
{
"epoch": 0.6851204708478941,
"grad_norm": 0.96189736553831,
"learning_rate": 1.613562551038925e-05,
"loss": 0.7964,
"mean_token_accuracy": 0.7625237464904785,
"step": 3725
},
{
"epoch": 0.6860400956409785,
"grad_norm": 0.9170890141555628,
"learning_rate": 1.607662020359587e-05,
"loss": 0.8404,
"mean_token_accuracy": 0.7529777765274048,
"step": 3730
},
{
"epoch": 0.686959720434063,
"grad_norm": 0.9456438498787428,
"learning_rate": 1.6017720552612462e-05,
"loss": 0.8036,
"mean_token_accuracy": 0.7614395618438721,
"step": 3735
},
{
"epoch": 0.6878793452271473,
"grad_norm": 0.9544770877536788,
"learning_rate": 1.595892710220691e-05,
"loss": 0.8413,
"mean_token_accuracy": 0.7519929647445679,
"step": 3740
},
{
"epoch": 0.6887989700202317,
"grad_norm": 1.022115954707187,
"learning_rate": 1.5900240396164835e-05,
"loss": 0.8612,
"mean_token_accuracy": 0.747264850139618,
"step": 3745
},
{
"epoch": 0.6897185948133162,
"grad_norm": 0.9476824745559427,
"learning_rate": 1.584166097728455e-05,
"loss": 0.847,
"mean_token_accuracy": 0.7491350531578064,
"step": 3750
},
{
"epoch": 0.6906382196064006,
"grad_norm": 0.8827290010499629,
"learning_rate": 1.578318938737209e-05,
"loss": 0.8284,
"mean_token_accuracy": 0.7547004818916321,
"step": 3755
},
{
"epoch": 0.691557844399485,
"grad_norm": 0.9009975487421323,
"learning_rate": 1.5724826167236146e-05,
"loss": 0.8214,
"mean_token_accuracy": 0.7568115711212158,
"step": 3760
},
{
"epoch": 0.6924774691925695,
"grad_norm": 0.9187149873785133,
"learning_rate": 1.5666571856683116e-05,
"loss": 0.827,
"mean_token_accuracy": 0.7550323009490967,
"step": 3765
},
{
"epoch": 0.6933970939856539,
"grad_norm": 0.9280641474823987,
"learning_rate": 1.560842699451204e-05,
"loss": 0.7616,
"mean_token_accuracy": 0.7714649677276612,
"step": 3770
},
{
"epoch": 0.6943167187787382,
"grad_norm": 0.9038372482824055,
"learning_rate": 1.5550392118509705e-05,
"loss": 0.8028,
"mean_token_accuracy": 0.760212504863739,
"step": 3775
},
{
"epoch": 0.6952363435718227,
"grad_norm": 0.9201432901179558,
"learning_rate": 1.5492467765445613e-05,
"loss": 0.8241,
"mean_token_accuracy": 0.754262363910675,
"step": 3780
},
{
"epoch": 0.6961559683649071,
"grad_norm": 0.9031896471527984,
"learning_rate": 1.5434654471067007e-05,
"loss": 0.8078,
"mean_token_accuracy": 0.7623116612434387,
"step": 3785
},
{
"epoch": 0.6970755931579915,
"grad_norm": 0.928442088214151,
"learning_rate": 1.537695277009396e-05,
"loss": 0.8667,
"mean_token_accuracy": 0.7442408680915833,
"step": 3790
},
{
"epoch": 0.697995217951076,
"grad_norm": 0.9545685310758198,
"learning_rate": 1.5319363196214427e-05,
"loss": 0.8147,
"mean_token_accuracy": 0.757679283618927,
"step": 3795
},
{
"epoch": 0.6989148427441604,
"grad_norm": 0.957997913837239,
"learning_rate": 1.526188628207924e-05,
"loss": 0.8674,
"mean_token_accuracy": 0.7406766414642334,
"step": 3800
},
{
"epoch": 0.6998344675372448,
"grad_norm": 0.907233770113165,
"learning_rate": 1.5204522559297275e-05,
"loss": 0.8228,
"mean_token_accuracy": 0.7550997257232666,
"step": 3805
},
{
"epoch": 0.7007540923303293,
"grad_norm": 0.9753264400407652,
"learning_rate": 1.5147272558430472e-05,
"loss": 0.812,
"mean_token_accuracy": 0.7584111213684082,
"step": 3810
},
{
"epoch": 0.7016737171234136,
"grad_norm": 0.898583550613599,
"learning_rate": 1.509013680898896e-05,
"loss": 0.814,
"mean_token_accuracy": 0.7574291110038758,
"step": 3815
},
{
"epoch": 0.702593341916498,
"grad_norm": 0.9245046858803572,
"learning_rate": 1.5033115839426127e-05,
"loss": 0.8002,
"mean_token_accuracy": 0.7631544828414917,
"step": 3820
},
{
"epoch": 0.7035129667095825,
"grad_norm": 0.9501909113953771,
"learning_rate": 1.4976210177133764e-05,
"loss": 0.8284,
"mean_token_accuracy": 0.7537835121154786,
"step": 3825
},
{
"epoch": 0.7044325915026669,
"grad_norm": 0.9118736011138947,
"learning_rate": 1.4919420348437189e-05,
"loss": 0.8637,
"mean_token_accuracy": 0.746515440940857,
"step": 3830
},
{
"epoch": 0.7053522162957513,
"grad_norm": 0.9346208775326443,
"learning_rate": 1.4862746878590329e-05,
"loss": 0.8325,
"mean_token_accuracy": 0.7536684751510621,
"step": 3835
},
{
"epoch": 0.7062718410888358,
"grad_norm": 0.9644025251262837,
"learning_rate": 1.4806190291770932e-05,
"loss": 0.9199,
"mean_token_accuracy": 0.728544807434082,
"step": 3840
},
{
"epoch": 0.7071914658819202,
"grad_norm": 0.9316658230434494,
"learning_rate": 1.4749751111075682e-05,
"loss": 0.8478,
"mean_token_accuracy": 0.7476451396942139,
"step": 3845
},
{
"epoch": 0.7081110906750046,
"grad_norm": 0.8593875878005443,
"learning_rate": 1.469342985851534e-05,
"loss": 0.7931,
"mean_token_accuracy": 0.7640434741973877,
"step": 3850
},
{
"epoch": 0.709030715468089,
"grad_norm": 0.9379422901278587,
"learning_rate": 1.4637227055009962e-05,
"loss": 0.8228,
"mean_token_accuracy": 0.7573190450668335,
"step": 3855
},
{
"epoch": 0.7099503402611734,
"grad_norm": 0.9026485371540945,
"learning_rate": 1.4581143220384047e-05,
"loss": 0.82,
"mean_token_accuracy": 0.756511640548706,
"step": 3860
},
{
"epoch": 0.7108699650542578,
"grad_norm": 0.9796042273923296,
"learning_rate": 1.4525178873361756e-05,
"loss": 0.8242,
"mean_token_accuracy": 0.7555618524551392,
"step": 3865
},
{
"epoch": 0.7117895898473423,
"grad_norm": 0.9383990549827186,
"learning_rate": 1.4469334531562067e-05,
"loss": 0.8448,
"mean_token_accuracy": 0.7482100129127502,
"step": 3870
},
{
"epoch": 0.7127092146404267,
"grad_norm": 0.9602931261847705,
"learning_rate": 1.4413610711494058e-05,
"loss": 0.8365,
"mean_token_accuracy": 0.7580392360687256,
"step": 3875
},
{
"epoch": 0.7136288394335111,
"grad_norm": 0.943240285031073,
"learning_rate": 1.4358007928552075e-05,
"loss": 0.7861,
"mean_token_accuracy": 0.7667181611061096,
"step": 3880
},
{
"epoch": 0.7145484642265956,
"grad_norm": 0.9447898247986761,
"learning_rate": 1.4302526697010964e-05,
"loss": 0.8078,
"mean_token_accuracy": 0.7595344543457031,
"step": 3885
},
{
"epoch": 0.71546808901968,
"grad_norm": 0.9841983235190546,
"learning_rate": 1.424716753002136e-05,
"loss": 0.8597,
"mean_token_accuracy": 0.7481236219406128,
"step": 3890
},
{
"epoch": 0.7163877138127643,
"grad_norm": 0.9684153403690037,
"learning_rate": 1.4191930939604908e-05,
"loss": 0.8117,
"mean_token_accuracy": 0.7613986849784851,
"step": 3895
},
{
"epoch": 0.7173073386058488,
"grad_norm": 0.996877698893722,
"learning_rate": 1.4136817436649502e-05,
"loss": 0.8766,
"mean_token_accuracy": 0.738961935043335,
"step": 3900
},
{
"epoch": 0.7182269633989332,
"grad_norm": 0.9051545491177592,
"learning_rate": 1.4081827530904624e-05,
"loss": 0.8445,
"mean_token_accuracy": 0.749999487400055,
"step": 3905
},
{
"epoch": 0.7191465881920177,
"grad_norm": 0.9684927881965169,
"learning_rate": 1.4026961730976584e-05,
"loss": 0.8209,
"mean_token_accuracy": 0.7576812863349914,
"step": 3910
},
{
"epoch": 0.7200662129851021,
"grad_norm": 0.9610042841526357,
"learning_rate": 1.3972220544323832e-05,
"loss": 0.8131,
"mean_token_accuracy": 0.7582221627235413,
"step": 3915
},
{
"epoch": 0.7209858377781865,
"grad_norm": 0.9412320092723402,
"learning_rate": 1.3917604477252238e-05,
"loss": 0.7937,
"mean_token_accuracy": 0.7617234110832214,
"step": 3920
},
{
"epoch": 0.721905462571271,
"grad_norm": 0.9321659094215312,
"learning_rate": 1.3863114034910452e-05,
"loss": 0.8156,
"mean_token_accuracy": 0.7598451256752015,
"step": 3925
},
{
"epoch": 0.7228250873643554,
"grad_norm": 0.956577146254236,
"learning_rate": 1.3808749721285214e-05,
"loss": 0.8107,
"mean_token_accuracy": 0.757847785949707,
"step": 3930
},
{
"epoch": 0.7237447121574397,
"grad_norm": 0.9139917904820034,
"learning_rate": 1.3754512039196658e-05,
"loss": 0.8754,
"mean_token_accuracy": 0.7391230940818787,
"step": 3935
},
{
"epoch": 0.7246643369505242,
"grad_norm": 0.92757564731535,
"learning_rate": 1.3700401490293718e-05,
"loss": 0.8193,
"mean_token_accuracy": 0.7570781588554383,
"step": 3940
},
{
"epoch": 0.7255839617436086,
"grad_norm": 0.9533935473757719,
"learning_rate": 1.3646418575049475e-05,
"loss": 0.8244,
"mean_token_accuracy": 0.756612241268158,
"step": 3945
},
{
"epoch": 0.726503586536693,
"grad_norm": 0.9319033478082173,
"learning_rate": 1.3592563792756468e-05,
"loss": 0.7994,
"mean_token_accuracy": 0.7616767644882202,
"step": 3950
},
{
"epoch": 0.7274232113297775,
"grad_norm": 0.9659322616790049,
"learning_rate": 1.3538837641522172e-05,
"loss": 0.776,
"mean_token_accuracy": 0.7666900753974915,
"step": 3955
},
{
"epoch": 0.7283428361228619,
"grad_norm": 0.9715937702004781,
"learning_rate": 1.3485240618264322e-05,
"loss": 0.8707,
"mean_token_accuracy": 0.742601501941681,
"step": 3960
},
{
"epoch": 0.7292624609159463,
"grad_norm": 0.9279423695840053,
"learning_rate": 1.3431773218706336e-05,
"loss": 0.8435,
"mean_token_accuracy": 0.7503429889678955,
"step": 3965
},
{
"epoch": 0.7301820857090308,
"grad_norm": 0.9826978876425828,
"learning_rate": 1.3378435937372729e-05,
"loss": 0.8609,
"mean_token_accuracy": 0.7491580963134765,
"step": 3970
},
{
"epoch": 0.7311017105021151,
"grad_norm": 0.9333913123309906,
"learning_rate": 1.3325229267584549e-05,
"loss": 0.8771,
"mean_token_accuracy": 0.7425579071044922,
"step": 3975
},
{
"epoch": 0.7320213352951995,
"grad_norm": 0.9125063830711305,
"learning_rate": 1.3272153701454809e-05,
"loss": 0.8086,
"mean_token_accuracy": 0.7603332042694092,
"step": 3980
},
{
"epoch": 0.732940960088284,
"grad_norm": 0.9868481200984651,
"learning_rate": 1.3219209729883918e-05,
"loss": 0.7879,
"mean_token_accuracy": 0.7675115823745727,
"step": 3985
},
{
"epoch": 0.7338605848813684,
"grad_norm": 0.9006549103315062,
"learning_rate": 1.3166397842555175e-05,
"loss": 0.7923,
"mean_token_accuracy": 0.7659124851226806,
"step": 3990
},
{
"epoch": 0.7347802096744528,
"grad_norm": 0.9128416767290051,
"learning_rate": 1.3113718527930214e-05,
"loss": 0.8363,
"mean_token_accuracy": 0.751650869846344,
"step": 3995
},
{
"epoch": 0.7356998344675373,
"grad_norm": 0.93586974280188,
"learning_rate": 1.3061172273244477e-05,
"loss": 0.8634,
"mean_token_accuracy": 0.7428792953491211,
"step": 4000
},
{
"epoch": 0.7366194592606217,
"grad_norm": 0.9865948469992011,
"learning_rate": 1.3008759564502742e-05,
"loss": 0.8627,
"mean_token_accuracy": 0.7454355955123901,
"step": 4005
},
{
"epoch": 0.737539084053706,
"grad_norm": 0.9395366278250679,
"learning_rate": 1.2956480886474609e-05,
"loss": 0.8408,
"mean_token_accuracy": 0.7488868713378907,
"step": 4010
},
{
"epoch": 0.7384587088467905,
"grad_norm": 0.9259161411169768,
"learning_rate": 1.2904336722690013e-05,
"loss": 0.8474,
"mean_token_accuracy": 0.7509873270988464,
"step": 4015
},
{
"epoch": 0.7393783336398749,
"grad_norm": 0.8982963261004637,
"learning_rate": 1.2852327555434743e-05,
"loss": 0.8272,
"mean_token_accuracy": 0.7562850832939148,
"step": 4020
},
{
"epoch": 0.7402979584329593,
"grad_norm": 0.9145268063018638,
"learning_rate": 1.280045386574601e-05,
"loss": 0.7964,
"mean_token_accuracy": 0.7601189255714417,
"step": 4025
},
{
"epoch": 0.7412175832260438,
"grad_norm": 0.9417030319528836,
"learning_rate": 1.2748716133407985e-05,
"loss": 0.8243,
"mean_token_accuracy": 0.7563821077346802,
"step": 4030
},
{
"epoch": 0.7421372080191282,
"grad_norm": 0.9170391844634309,
"learning_rate": 1.269711483694733e-05,
"loss": 0.8071,
"mean_token_accuracy": 0.7610970735549927,
"step": 4035
},
{
"epoch": 0.7430568328122126,
"grad_norm": 0.927700931925603,
"learning_rate": 1.264565045362883e-05,
"loss": 0.83,
"mean_token_accuracy": 0.7542360424995422,
"step": 4040
},
{
"epoch": 0.7439764576052971,
"grad_norm": 0.902718257172033,
"learning_rate": 1.259432345945094e-05,
"loss": 0.8026,
"mean_token_accuracy": 0.7602586507797241,
"step": 4045
},
{
"epoch": 0.7448960823983815,
"grad_norm": 0.9732168765607019,
"learning_rate": 1.2543134329141382e-05,
"loss": 0.8166,
"mean_token_accuracy": 0.7585108041763305,
"step": 4050
},
{
"epoch": 0.7458157071914658,
"grad_norm": 0.9466993086607015,
"learning_rate": 1.2492083536152772e-05,
"loss": 0.8169,
"mean_token_accuracy": 0.758376932144165,
"step": 4055
},
{
"epoch": 0.7467353319845503,
"grad_norm": 0.9757475911083087,
"learning_rate": 1.2441171552658228e-05,
"loss": 0.8389,
"mean_token_accuracy": 0.7498653650283813,
"step": 4060
},
{
"epoch": 0.7476549567776347,
"grad_norm": 0.9151481291254611,
"learning_rate": 1.2390398849547023e-05,
"loss": 0.8006,
"mean_token_accuracy": 0.7613858461380005,
"step": 4065
},
{
"epoch": 0.7485745815707191,
"grad_norm": 0.8890653066533022,
"learning_rate": 1.2339765896420178e-05,
"loss": 0.8404,
"mean_token_accuracy": 0.7510004043579102,
"step": 4070
},
{
"epoch": 0.7494942063638036,
"grad_norm": 0.9533182704017102,
"learning_rate": 1.2289273161586194e-05,
"loss": 0.8234,
"mean_token_accuracy": 0.7551814436912536,
"step": 4075
},
{
"epoch": 0.750413831156888,
"grad_norm": 0.9407240854533703,
"learning_rate": 1.2238921112056663e-05,
"loss": 0.8635,
"mean_token_accuracy": 0.7466271042823791,
"step": 4080
},
{
"epoch": 0.7513334559499724,
"grad_norm": 0.8895247933273808,
"learning_rate": 1.2188710213541957e-05,
"loss": 0.8332,
"mean_token_accuracy": 0.752234959602356,
"step": 4085
},
{
"epoch": 0.7522530807430569,
"grad_norm": 0.9353802672482648,
"learning_rate": 1.213864093044695e-05,
"loss": 0.8448,
"mean_token_accuracy": 0.7497453451156616,
"step": 4090
},
{
"epoch": 0.7531727055361412,
"grad_norm": 0.946809122144392,
"learning_rate": 1.2088713725866696e-05,
"loss": 0.8088,
"mean_token_accuracy": 0.758155906200409,
"step": 4095
},
{
"epoch": 0.7540923303292256,
"grad_norm": 0.9340815348568988,
"learning_rate": 1.203892906158214e-05,
"loss": 0.8525,
"mean_token_accuracy": 0.7470645427703857,
"step": 4100
},
{
"epoch": 0.7550119551223101,
"grad_norm": 0.9903725518055015,
"learning_rate": 1.1989287398055874e-05,
"loss": 0.8406,
"mean_token_accuracy": 0.7499817609786987,
"step": 4105
},
{
"epoch": 0.7559315799153945,
"grad_norm": 0.9005006268013445,
"learning_rate": 1.193978919442787e-05,
"loss": 0.833,
"mean_token_accuracy": 0.7508885979652404,
"step": 4110
},
{
"epoch": 0.7568512047084789,
"grad_norm": 0.922000222155766,
"learning_rate": 1.1890434908511212e-05,
"loss": 0.8256,
"mean_token_accuracy": 0.7544254660606384,
"step": 4115
},
{
"epoch": 0.7577708295015634,
"grad_norm": 0.9147121717124462,
"learning_rate": 1.1841224996787876e-05,
"loss": 0.8119,
"mean_token_accuracy": 0.7572540044784546,
"step": 4120
},
{
"epoch": 0.7586904542946478,
"grad_norm": 0.9401032528457242,
"learning_rate": 1.1792159914404518e-05,
"loss": 0.8389,
"mean_token_accuracy": 0.7547949194908142,
"step": 4125
},
{
"epoch": 0.7596100790877323,
"grad_norm": 0.899746427074481,
"learning_rate": 1.1743240115168262e-05,
"loss": 0.8104,
"mean_token_accuracy": 0.7588290691375732,
"step": 4130
},
{
"epoch": 0.7605297038808166,
"grad_norm": 0.9377432106115406,
"learning_rate": 1.1694466051542473e-05,
"loss": 0.8155,
"mean_token_accuracy": 0.7565756559371948,
"step": 4135
},
{
"epoch": 0.761449328673901,
"grad_norm": 0.9436429623996605,
"learning_rate": 1.1645838174642614e-05,
"loss": 0.8167,
"mean_token_accuracy": 0.7574901819229126,
"step": 4140
},
{
"epoch": 0.7623689534669855,
"grad_norm": 0.9163014099905564,
"learning_rate": 1.1597356934232053e-05,
"loss": 0.8518,
"mean_token_accuracy": 0.7465153455734252,
"step": 4145
},
{
"epoch": 0.7632885782600699,
"grad_norm": 0.8716564591657281,
"learning_rate": 1.1549022778717888e-05,
"loss": 0.8572,
"mean_token_accuracy": 0.7444779276847839,
"step": 4150
},
{
"epoch": 0.7642082030531543,
"grad_norm": 0.9408396749893937,
"learning_rate": 1.1500836155146839e-05,
"loss": 0.83,
"mean_token_accuracy": 0.7533326983451843,
"step": 4155
},
{
"epoch": 0.7651278278462388,
"grad_norm": 0.9335839862612282,
"learning_rate": 1.1452797509201083e-05,
"loss": 0.8751,
"mean_token_accuracy": 0.7398134231567383,
"step": 4160
},
{
"epoch": 0.7660474526393232,
"grad_norm": 0.9850624435923674,
"learning_rate": 1.1404907285194125e-05,
"loss": 0.8523,
"mean_token_accuracy": 0.7461954593658447,
"step": 4165
},
{
"epoch": 0.7669670774324076,
"grad_norm": 0.9679449146346353,
"learning_rate": 1.1357165926066716e-05,
"loss": 0.7892,
"mean_token_accuracy": 0.7605505466461182,
"step": 4170
},
{
"epoch": 0.767886702225492,
"grad_norm": 0.9416265509404674,
"learning_rate": 1.130957387338275e-05,
"loss": 0.8221,
"mean_token_accuracy": 0.7559242844581604,
"step": 4175
},
{
"epoch": 0.7688063270185764,
"grad_norm": 0.909615601406411,
"learning_rate": 1.1262131567325163e-05,
"loss": 0.8357,
"mean_token_accuracy": 0.7517993927001954,
"step": 4180
},
{
"epoch": 0.7697259518116608,
"grad_norm": 0.9047722281799156,
"learning_rate": 1.1214839446691869e-05,
"loss": 0.8032,
"mean_token_accuracy": 0.7601001501083374,
"step": 4185
},
{
"epoch": 0.7706455766047453,
"grad_norm": 0.9246634008625312,
"learning_rate": 1.1167697948891707e-05,
"loss": 0.8249,
"mean_token_accuracy": 0.7536085605621338,
"step": 4190
},
{
"epoch": 0.7715652013978297,
"grad_norm": 0.9460638804791452,
"learning_rate": 1.1120707509940403e-05,
"loss": 0.8167,
"mean_token_accuracy": 0.7593476176261902,
"step": 4195
},
{
"epoch": 0.7724848261909141,
"grad_norm": 0.9221593736048895,
"learning_rate": 1.1073868564456503e-05,
"loss": 0.845,
"mean_token_accuracy": 0.7480282187461853,
"step": 4200
},
{
"epoch": 0.7734044509839986,
"grad_norm": 0.8888076192030434,
"learning_rate": 1.1027181545657403e-05,
"loss": 0.7794,
"mean_token_accuracy": 0.76693354845047,
"step": 4205
},
{
"epoch": 0.774324075777083,
"grad_norm": 0.8891810327123515,
"learning_rate": 1.0980646885355313e-05,
"loss": 0.7885,
"mean_token_accuracy": 0.7628621697425843,
"step": 4210
},
{
"epoch": 0.7752437005701673,
"grad_norm": 0.9743526817712896,
"learning_rate": 1.0934265013953239e-05,
"loss": 0.8478,
"mean_token_accuracy": 0.7504450678825378,
"step": 4215
},
{
"epoch": 0.7761633253632518,
"grad_norm": 0.9143999464853897,
"learning_rate": 1.0888036360441066e-05,
"loss": 0.8059,
"mean_token_accuracy": 0.7603421926498413,
"step": 4220
},
{
"epoch": 0.7770829501563362,
"grad_norm": 0.9734913517153475,
"learning_rate": 1.0841961352391522e-05,
"loss": 0.8159,
"mean_token_accuracy": 0.7574024796485901,
"step": 4225
},
{
"epoch": 0.7780025749494206,
"grad_norm": 0.935773373300799,
"learning_rate": 1.079604041595628e-05,
"loss": 0.8562,
"mean_token_accuracy": 0.7468973875045777,
"step": 4230
},
{
"epoch": 0.7789221997425051,
"grad_norm": 0.9031689337704597,
"learning_rate": 1.075027397586198e-05,
"loss": 0.8165,
"mean_token_accuracy": 0.7566033601760864,
"step": 4235
},
{
"epoch": 0.7798418245355895,
"grad_norm": 0.9138920947374664,
"learning_rate": 1.0704662455406309e-05,
"loss": 0.8137,
"mean_token_accuracy": 0.7558243870735168,
"step": 4240
},
{
"epoch": 0.7807614493286739,
"grad_norm": 0.942480721965923,
"learning_rate": 1.06592062764541e-05,
"loss": 0.8103,
"mean_token_accuracy": 0.7595886349678039,
"step": 4245
},
{
"epoch": 0.7816810741217584,
"grad_norm": 0.8995689595482391,
"learning_rate": 1.0613905859433412e-05,
"loss": 0.8158,
"mean_token_accuracy": 0.7546827673912049,
"step": 4250
},
{
"epoch": 0.7826006989148427,
"grad_norm": 0.8666864815369382,
"learning_rate": 1.0568761623331642e-05,
"loss": 0.8082,
"mean_token_accuracy": 0.7590071558952332,
"step": 4255
},
{
"epoch": 0.7835203237079271,
"grad_norm": 0.9696655409923509,
"learning_rate": 1.0523773985691673e-05,
"loss": 0.8556,
"mean_token_accuracy": 0.7452132105827332,
"step": 4260
},
{
"epoch": 0.7844399485010116,
"grad_norm": 0.9833829005536767,
"learning_rate": 1.0478943362607984e-05,
"loss": 0.8586,
"mean_token_accuracy": 0.7462344169616699,
"step": 4265
},
{
"epoch": 0.785359573294096,
"grad_norm": 0.9595206401213471,
"learning_rate": 1.0434270168722813e-05,
"loss": 0.8351,
"mean_token_accuracy": 0.7498462796211243,
"step": 4270
},
{
"epoch": 0.7862791980871804,
"grad_norm": 0.9261440611345254,
"learning_rate": 1.0389754817222325e-05,
"loss": 0.77,
"mean_token_accuracy": 0.7716120958328248,
"step": 4275
},
{
"epoch": 0.7871988228802649,
"grad_norm": 0.926036803637149,
"learning_rate": 1.0345397719832791e-05,
"loss": 0.8117,
"mean_token_accuracy": 0.75774165391922,
"step": 4280
},
{
"epoch": 0.7881184476733493,
"grad_norm": 0.9482199838406158,
"learning_rate": 1.0301199286816768e-05,
"loss": 0.7869,
"mean_token_accuracy": 0.7647076845169067,
"step": 4285
},
{
"epoch": 0.7890380724664336,
"grad_norm": 0.9249156078948935,
"learning_rate": 1.0257159926969315e-05,
"loss": 0.8379,
"mean_token_accuracy": 0.7494875431060791,
"step": 4290
},
{
"epoch": 0.7899576972595181,
"grad_norm": 0.9426764037549299,
"learning_rate": 1.0213280047614224e-05,
"loss": 0.8399,
"mean_token_accuracy": 0.748091197013855,
"step": 4295
},
{
"epoch": 0.7908773220526025,
"grad_norm": 0.9001227058548062,
"learning_rate": 1.016956005460021e-05,
"loss": 0.8151,
"mean_token_accuracy": 0.7553766012191773,
"step": 4300
},
{
"epoch": 0.7917969468456869,
"grad_norm": 0.9494070318147612,
"learning_rate": 1.0126000352297207e-05,
"loss": 0.8161,
"mean_token_accuracy": 0.7553802728652954,
"step": 4305
},
{
"epoch": 0.7927165716387714,
"grad_norm": 0.9634025237949015,
"learning_rate": 1.0082601343592613e-05,
"loss": 0.8375,
"mean_token_accuracy": 0.7490672588348388,
"step": 4310
},
{
"epoch": 0.7936361964318558,
"grad_norm": 0.918509774691625,
"learning_rate": 1.0039363429887526e-05,
"loss": 0.8027,
"mean_token_accuracy": 0.7611651062965393,
"step": 4315
},
{
"epoch": 0.7945558212249402,
"grad_norm": 0.9045021299622812,
"learning_rate": 9.996287011093095e-06,
"loss": 0.8194,
"mean_token_accuracy": 0.7530111193656921,
"step": 4320
},
{
"epoch": 0.7954754460180247,
"grad_norm": 0.9575102184844824,
"learning_rate": 9.95337248562677e-06,
"loss": 0.813,
"mean_token_accuracy": 0.7606404304504395,
"step": 4325
},
{
"epoch": 0.796395070811109,
"grad_norm": 0.9520723107616024,
"learning_rate": 9.910620250408654e-06,
"loss": 0.8219,
"mean_token_accuracy": 0.7527819633483886,
"step": 4330
},
{
"epoch": 0.7973146956041934,
"grad_norm": 0.9957772801943348,
"learning_rate": 9.868030700857786e-06,
"loss": 0.8527,
"mean_token_accuracy": 0.7474417209625244,
"step": 4335
},
{
"epoch": 0.7982343203972779,
"grad_norm": 0.9206334782903142,
"learning_rate": 9.825604230888534e-06,
"loss": 0.8013,
"mean_token_accuracy": 0.7611706376075744,
"step": 4340
},
{
"epoch": 0.7991539451903623,
"grad_norm": 0.9528692345244755,
"learning_rate": 9.783341232906929e-06,
"loss": 0.8452,
"mean_token_accuracy": 0.7476886630058288,
"step": 4345
},
{
"epoch": 0.8000735699834468,
"grad_norm": 0.9501814513029114,
"learning_rate": 9.741242097807015e-06,
"loss": 0.7998,
"mean_token_accuracy": 0.7616806149482727,
"step": 4350
},
{
"epoch": 0.8009931947765312,
"grad_norm": 0.9162860642484046,
"learning_rate": 9.699307214967278e-06,
"loss": 0.8154,
"mean_token_accuracy": 0.7584839701652527,
"step": 4355
},
{
"epoch": 0.8019128195696156,
"grad_norm": 1.0326738672670173,
"learning_rate": 9.657536972247011e-06,
"loss": 0.8364,
"mean_token_accuracy": 0.7505152702331543,
"step": 4360
},
{
"epoch": 0.8028324443627001,
"grad_norm": 0.9226495279325524,
"learning_rate": 9.615931755982732e-06,
"loss": 0.8249,
"mean_token_accuracy": 0.7548305869102478,
"step": 4365
},
{
"epoch": 0.8037520691557845,
"grad_norm": 0.9998522862414826,
"learning_rate": 9.574491950984617e-06,
"loss": 0.8713,
"mean_token_accuracy": 0.7403565168380737,
"step": 4370
},
{
"epoch": 0.8046716939488688,
"grad_norm": 0.9493513097435586,
"learning_rate": 9.533217940532952e-06,
"loss": 0.8295,
"mean_token_accuracy": 0.7500657081604004,
"step": 4375
},
{
"epoch": 0.8055913187419533,
"grad_norm": 0.9906056177459279,
"learning_rate": 9.492110106374562e-06,
"loss": 0.7962,
"mean_token_accuracy": 0.7624237060546875,
"step": 4380
},
{
"epoch": 0.8065109435350377,
"grad_norm": 0.9844968670498593,
"learning_rate": 9.451168828719293e-06,
"loss": 0.7978,
"mean_token_accuracy": 0.7625670194625854,
"step": 4385
},
{
"epoch": 0.8074305683281221,
"grad_norm": 0.9677134975970255,
"learning_rate": 9.410394486236498e-06,
"loss": 0.8635,
"mean_token_accuracy": 0.7404338598251343,
"step": 4390
},
{
"epoch": 0.8083501931212066,
"grad_norm": 0.9239280726012725,
"learning_rate": 9.369787456051545e-06,
"loss": 0.8134,
"mean_token_accuracy": 0.75517338514328,
"step": 4395
},
{
"epoch": 0.809269817914291,
"grad_norm": 0.9448230478695528,
"learning_rate": 9.329348113742293e-06,
"loss": 0.8304,
"mean_token_accuracy": 0.7514260888099671,
"step": 4400
},
{
"epoch": 0.8101894427073754,
"grad_norm": 0.9454127260499946,
"learning_rate": 9.289076833335659e-06,
"loss": 0.8097,
"mean_token_accuracy": 0.7581054925918579,
"step": 4405
},
{
"epoch": 0.8111090675004599,
"grad_norm": 0.9492270487120692,
"learning_rate": 9.24897398730414e-06,
"loss": 0.8527,
"mean_token_accuracy": 0.7465508818626404,
"step": 4410
},
{
"epoch": 0.8120286922935442,
"grad_norm": 0.9570757946856893,
"learning_rate": 9.209039946562354e-06,
"loss": 0.8267,
"mean_token_accuracy": 0.755340301990509,
"step": 4415
},
{
"epoch": 0.8129483170866286,
"grad_norm": 0.9284190475550864,
"learning_rate": 9.169275080463641e-06,
"loss": 0.7752,
"mean_token_accuracy": 0.7686259269714355,
"step": 4420
},
{
"epoch": 0.8138679418797131,
"grad_norm": 0.9501950391649288,
"learning_rate": 9.129679756796622e-06,
"loss": 0.8111,
"mean_token_accuracy": 0.7585479974746704,
"step": 4425
},
{
"epoch": 0.8147875666727975,
"grad_norm": 0.9046262111625721,
"learning_rate": 9.090254341781824e-06,
"loss": 0.802,
"mean_token_accuracy": 0.7600291728973388,
"step": 4430
},
{
"epoch": 0.8157071914658819,
"grad_norm": 0.9379329497256937,
"learning_rate": 9.05099920006824e-06,
"loss": 0.8206,
"mean_token_accuracy": 0.754150140285492,
"step": 4435
},
{
"epoch": 0.8166268162589664,
"grad_norm": 0.9034131325499937,
"learning_rate": 9.011914694730014e-06,
"loss": 0.7971,
"mean_token_accuracy": 0.7597368478775024,
"step": 4440
},
{
"epoch": 0.8175464410520508,
"grad_norm": 0.9338149471790205,
"learning_rate": 8.973001187263069e-06,
"loss": 0.8184,
"mean_token_accuracy": 0.7545792698860169,
"step": 4445
},
{
"epoch": 0.8184660658451351,
"grad_norm": 0.9541079918085381,
"learning_rate": 8.934259037581725e-06,
"loss": 0.8097,
"mean_token_accuracy": 0.7586872816085816,
"step": 4450
},
{
"epoch": 0.8193856906382196,
"grad_norm": 0.9233023020738409,
"learning_rate": 8.895688604015418e-06,
"loss": 0.8276,
"mean_token_accuracy": 0.7541133642196656,
"step": 4455
},
{
"epoch": 0.820305315431304,
"grad_norm": 0.9312024884427347,
"learning_rate": 8.857290243305372e-06,
"loss": 0.8242,
"mean_token_accuracy": 0.7540480494499207,
"step": 4460
},
{
"epoch": 0.8212249402243884,
"grad_norm": 0.9636521068626411,
"learning_rate": 8.819064310601274e-06,
"loss": 0.827,
"mean_token_accuracy": 0.754251503944397,
"step": 4465
},
{
"epoch": 0.8221445650174729,
"grad_norm": 0.9594804588793242,
"learning_rate": 8.78101115945803e-06,
"loss": 0.8195,
"mean_token_accuracy": 0.7567231893539429,
"step": 4470
},
{
"epoch": 0.8230641898105573,
"grad_norm": 0.946382911890805,
"learning_rate": 8.743131141832466e-06,
"loss": 0.8093,
"mean_token_accuracy": 0.7608936429023743,
"step": 4475
},
{
"epoch": 0.8239838146036417,
"grad_norm": 0.9662210178630657,
"learning_rate": 8.705424608080091e-06,
"loss": 0.845,
"mean_token_accuracy": 0.7482501983642578,
"step": 4480
},
{
"epoch": 0.8249034393967262,
"grad_norm": 1.0134277900865423,
"learning_rate": 8.667891906951822e-06,
"loss": 0.806,
"mean_token_accuracy": 0.7607534885406494,
"step": 4485
},
{
"epoch": 0.8258230641898106,
"grad_norm": 0.969259829449015,
"learning_rate": 8.63053338559081e-06,
"loss": 0.8301,
"mean_token_accuracy": 0.7495483517646789,
"step": 4490
},
{
"epoch": 0.8267426889828949,
"grad_norm": 0.973132836806053,
"learning_rate": 8.593349389529194e-06,
"loss": 0.8412,
"mean_token_accuracy": 0.7499716639518738,
"step": 4495
},
{
"epoch": 0.8276623137759794,
"grad_norm": 0.9074516956073079,
"learning_rate": 8.556340262684901e-06,
"loss": 0.8239,
"mean_token_accuracy": 0.7554465770721436,
"step": 4500
},
{
"epoch": 0.8285819385690638,
"grad_norm": 0.930234934487542,
"learning_rate": 8.519506347358495e-06,
"loss": 0.7947,
"mean_token_accuracy": 0.7629730701446533,
"step": 4505
},
{
"epoch": 0.8295015633621482,
"grad_norm": 0.8753133502304897,
"learning_rate": 8.482847984229992e-06,
"loss": 0.8461,
"mean_token_accuracy": 0.747829282283783,
"step": 4510
},
{
"epoch": 0.8304211881552327,
"grad_norm": 0.9490806269639048,
"learning_rate": 8.446365512355697e-06,
"loss": 0.809,
"mean_token_accuracy": 0.7590258955955506,
"step": 4515
},
{
"epoch": 0.8313408129483171,
"grad_norm": 0.945014272705201,
"learning_rate": 8.410059269165094e-06,
"loss": 0.858,
"mean_token_accuracy": 0.7476967573165894,
"step": 4520
},
{
"epoch": 0.8322604377414015,
"grad_norm": 0.9585805628825262,
"learning_rate": 8.37392959045771e-06,
"loss": 0.8276,
"mean_token_accuracy": 0.7536361336708068,
"step": 4525
},
{
"epoch": 0.833180062534486,
"grad_norm": 0.9798760065535969,
"learning_rate": 8.337976810400024e-06,
"loss": 0.8271,
"mean_token_accuracy": 0.7538176774978638,
"step": 4530
},
{
"epoch": 0.8340996873275703,
"grad_norm": 0.9885247811188054,
"learning_rate": 8.30220126152233e-06,
"loss": 0.8351,
"mean_token_accuracy": 0.7511208415031433,
"step": 4535
},
{
"epoch": 0.8350193121206547,
"grad_norm": 0.926636431875522,
"learning_rate": 8.266603274715734e-06,
"loss": 0.8536,
"mean_token_accuracy": 0.7437230348587036,
"step": 4540
},
{
"epoch": 0.8359389369137392,
"grad_norm": 0.9639989728106565,
"learning_rate": 8.231183179229041e-06,
"loss": 0.8337,
"mean_token_accuracy": 0.749656867980957,
"step": 4545
},
{
"epoch": 0.8368585617068236,
"grad_norm": 0.9810922714927505,
"learning_rate": 8.19594130266571e-06,
"loss": 0.8441,
"mean_token_accuracy": 0.7471103310585022,
"step": 4550
},
{
"epoch": 0.837778186499908,
"grad_norm": 0.940673214702186,
"learning_rate": 8.16087797098086e-06,
"loss": 0.8076,
"mean_token_accuracy": 0.757796049118042,
"step": 4555
},
{
"epoch": 0.8386978112929925,
"grad_norm": 0.9808241732647448,
"learning_rate": 8.125993508478222e-06,
"loss": 0.8107,
"mean_token_accuracy": 0.7570709705352783,
"step": 4560
},
{
"epoch": 0.8396174360860769,
"grad_norm": 0.9417309972023068,
"learning_rate": 8.091288237807148e-06,
"loss": 0.7918,
"mean_token_accuracy": 0.7627918124198914,
"step": 4565
},
{
"epoch": 0.8405370608791614,
"grad_norm": 0.9994759897340699,
"learning_rate": 8.05676247995964e-06,
"loss": 0.8308,
"mean_token_accuracy": 0.7522749185562134,
"step": 4570
},
{
"epoch": 0.8414566856722457,
"grad_norm": 0.9575333123064316,
"learning_rate": 8.022416554267361e-06,
"loss": 0.8249,
"mean_token_accuracy": 0.7555456757545471,
"step": 4575
},
{
"epoch": 0.8423763104653301,
"grad_norm": 0.9428369551875321,
"learning_rate": 7.988250778398704e-06,
"loss": 0.7799,
"mean_token_accuracy": 0.7657583713531494,
"step": 4580
},
{
"epoch": 0.8432959352584146,
"grad_norm": 0.9491493130691244,
"learning_rate": 7.95426546835582e-06,
"loss": 0.8463,
"mean_token_accuracy": 0.7497212409973144,
"step": 4585
},
{
"epoch": 0.844215560051499,
"grad_norm": 0.9279119840497574,
"learning_rate": 7.92046093847173e-06,
"loss": 0.7911,
"mean_token_accuracy": 0.7641847729682922,
"step": 4590
},
{
"epoch": 0.8451351848445834,
"grad_norm": 0.975196157389162,
"learning_rate": 7.88683750140741e-06,
"loss": 0.7829,
"mean_token_accuracy": 0.76539067029953,
"step": 4595
},
{
"epoch": 0.8460548096376679,
"grad_norm": 0.9630038826041202,
"learning_rate": 7.853395468148877e-06,
"loss": 0.8214,
"mean_token_accuracy": 0.7576993346214295,
"step": 4600
},
{
"epoch": 0.8469744344307523,
"grad_norm": 0.9547194790847711,
"learning_rate": 7.82013514800434e-06,
"loss": 0.8133,
"mean_token_accuracy": 0.7594569325447083,
"step": 4605
},
{
"epoch": 0.8478940592238366,
"grad_norm": 0.9804442806928446,
"learning_rate": 7.787056848601327e-06,
"loss": 0.826,
"mean_token_accuracy": 0.7542958974838256,
"step": 4610
},
{
"epoch": 0.8488136840169211,
"grad_norm": 0.987211519153664,
"learning_rate": 7.754160875883835e-06,
"loss": 0.859,
"mean_token_accuracy": 0.7447464466094971,
"step": 4615
},
{
"epoch": 0.8497333088100055,
"grad_norm": 0.9279113898182684,
"learning_rate": 7.721447534109509e-06,
"loss": 0.8318,
"mean_token_accuracy": 0.7507144689559937,
"step": 4620
},
{
"epoch": 0.8506529336030899,
"grad_norm": 0.9722340874170035,
"learning_rate": 7.688917125846836e-06,
"loss": 0.8354,
"mean_token_accuracy": 0.7506987690925598,
"step": 4625
},
{
"epoch": 0.8515725583961744,
"grad_norm": 0.9470559135859266,
"learning_rate": 7.65656995197231e-06,
"loss": 0.846,
"mean_token_accuracy": 0.7494428992271424,
"step": 4630
},
{
"epoch": 0.8524921831892588,
"grad_norm": 1.0085786438496558,
"learning_rate": 7.6244063116676965e-06,
"loss": 0.8048,
"mean_token_accuracy": 0.7590271830558777,
"step": 4635
},
{
"epoch": 0.8534118079823432,
"grad_norm": 0.9122173396588265,
"learning_rate": 7.592426502417235e-06,
"loss": 0.792,
"mean_token_accuracy": 0.7632818222045898,
"step": 4640
},
{
"epoch": 0.8543314327754277,
"grad_norm": 0.920428242471814,
"learning_rate": 7.560630820004905e-06,
"loss": 0.7682,
"mean_token_accuracy": 0.768799901008606,
"step": 4645
},
{
"epoch": 0.855251057568512,
"grad_norm": 0.9650658819203722,
"learning_rate": 7.529019558511664e-06,
"loss": 0.8591,
"mean_token_accuracy": 0.7465671896934509,
"step": 4650
},
{
"epoch": 0.8561706823615964,
"grad_norm": 0.941100631374564,
"learning_rate": 7.4975930103127575e-06,
"loss": 0.8133,
"mean_token_accuracy": 0.7577845811843872,
"step": 4655
},
{
"epoch": 0.8570903071546809,
"grad_norm": 0.911355294655365,
"learning_rate": 7.466351466075003e-06,
"loss": 0.776,
"mean_token_accuracy": 0.7704600811004638,
"step": 4660
},
{
"epoch": 0.8580099319477653,
"grad_norm": 0.9600196890925632,
"learning_rate": 7.43529521475409e-06,
"loss": 0.8356,
"mean_token_accuracy": 0.752436888217926,
"step": 4665
},
{
"epoch": 0.8589295567408497,
"grad_norm": 0.9096404947618868,
"learning_rate": 7.404424543591926e-06,
"loss": 0.8434,
"mean_token_accuracy": 0.749167013168335,
"step": 4670
},
{
"epoch": 0.8598491815339342,
"grad_norm": 0.9645413054824178,
"learning_rate": 7.37373973811398e-06,
"loss": 0.8422,
"mean_token_accuracy": 0.7523573756217956,
"step": 4675
},
{
"epoch": 0.8607688063270186,
"grad_norm": 0.9461536188211753,
"learning_rate": 7.343241082126609e-06,
"loss": 0.789,
"mean_token_accuracy": 0.7644837021827697,
"step": 4680
},
{
"epoch": 0.861688431120103,
"grad_norm": 0.9177981778366934,
"learning_rate": 7.312928857714484e-06,
"loss": 0.7912,
"mean_token_accuracy": 0.7650796055793763,
"step": 4685
},
{
"epoch": 0.8626080559131875,
"grad_norm": 0.9395263274096144,
"learning_rate": 7.282803345237937e-06,
"loss": 0.779,
"mean_token_accuracy": 0.766014575958252,
"step": 4690
},
{
"epoch": 0.8635276807062718,
"grad_norm": 0.974228845887035,
"learning_rate": 7.252864823330397e-06,
"loss": 0.8096,
"mean_token_accuracy": 0.7609816431999207,
"step": 4695
},
{
"epoch": 0.8644473054993562,
"grad_norm": 0.9138771854988429,
"learning_rate": 7.223113568895791e-06,
"loss": 0.8228,
"mean_token_accuracy": 0.7533741354942322,
"step": 4700
},
{
"epoch": 0.8653669302924407,
"grad_norm": 0.9230858356341091,
"learning_rate": 7.193549857105998e-06,
"loss": 0.7817,
"mean_token_accuracy": 0.7645957589149475,
"step": 4705
},
{
"epoch": 0.8662865550855251,
"grad_norm": 0.9248959407091435,
"learning_rate": 7.164173961398307e-06,
"loss": 0.8123,
"mean_token_accuracy": 0.758608341217041,
"step": 4710
},
{
"epoch": 0.8672061798786095,
"grad_norm": 0.920957739245226,
"learning_rate": 7.134986153472864e-06,
"loss": 0.8089,
"mean_token_accuracy": 0.7574970960617066,
"step": 4715
},
{
"epoch": 0.868125804671694,
"grad_norm": 0.9365387305302294,
"learning_rate": 7.105986703290185e-06,
"loss": 0.8207,
"mean_token_accuracy": 0.7519280552864075,
"step": 4720
},
{
"epoch": 0.8690454294647784,
"grad_norm": 0.9848472191309555,
"learning_rate": 7.077175879068652e-06,
"loss": 0.8318,
"mean_token_accuracy": 0.7514313578605651,
"step": 4725
},
{
"epoch": 0.8699650542578627,
"grad_norm": 0.9841439973977463,
"learning_rate": 7.04855394728202e-06,
"loss": 0.8254,
"mean_token_accuracy": 0.7536401510238647,
"step": 4730
},
{
"epoch": 0.8708846790509472,
"grad_norm": 0.9368690483918741,
"learning_rate": 7.020121172656971e-06,
"loss": 0.8079,
"mean_token_accuracy": 0.7589451789855957,
"step": 4735
},
{
"epoch": 0.8718043038440316,
"grad_norm": 0.9537367969880632,
"learning_rate": 6.991877818170647e-06,
"loss": 0.8105,
"mean_token_accuracy": 0.7570921540260315,
"step": 4740
},
{
"epoch": 0.872723928637116,
"grad_norm": 0.9771290706741976,
"learning_rate": 6.963824145048245e-06,
"loss": 0.8383,
"mean_token_accuracy": 0.7482818961143494,
"step": 4745
},
{
"epoch": 0.8736435534302005,
"grad_norm": 0.9167489506515816,
"learning_rate": 6.935960412760554e-06,
"loss": 0.7956,
"mean_token_accuracy": 0.7615381121635437,
"step": 4750
},
{
"epoch": 0.8745631782232849,
"grad_norm": 0.9509142520738616,
"learning_rate": 6.908286879021611e-06,
"loss": 0.8272,
"mean_token_accuracy": 0.7538857817649841,
"step": 4755
},
{
"epoch": 0.8754828030163693,
"grad_norm": 0.9492010037774332,
"learning_rate": 6.880803799786282e-06,
"loss": 0.8083,
"mean_token_accuracy": 0.7596304178237915,
"step": 4760
},
{
"epoch": 0.8764024278094538,
"grad_norm": 0.9879455089380224,
"learning_rate": 6.853511429247891e-06,
"loss": 0.8501,
"mean_token_accuracy": 0.7443594694137573,
"step": 4765
},
{
"epoch": 0.8773220526025381,
"grad_norm": 0.900884905164465,
"learning_rate": 6.826410019835897e-06,
"loss": 0.8388,
"mean_token_accuracy": 0.75017911195755,
"step": 4770
},
{
"epoch": 0.8782416773956225,
"grad_norm": 0.9347399353088925,
"learning_rate": 6.7994998222135415e-06,
"loss": 0.8338,
"mean_token_accuracy": 0.7503747582435608,
"step": 4775
},
{
"epoch": 0.879161302188707,
"grad_norm": 0.9313447849733553,
"learning_rate": 6.77278108527552e-06,
"loss": 0.8223,
"mean_token_accuracy": 0.7531881928443909,
"step": 4780
},
{
"epoch": 0.8800809269817914,
"grad_norm": 0.9749122247147805,
"learning_rate": 6.7462540561457035e-06,
"loss": 0.8078,
"mean_token_accuracy": 0.7597910761833191,
"step": 4785
},
{
"epoch": 0.8810005517748758,
"grad_norm": 0.9459726297921652,
"learning_rate": 6.719918980174842e-06,
"loss": 0.7735,
"mean_token_accuracy": 0.7680148124694824,
"step": 4790
},
{
"epoch": 0.8819201765679603,
"grad_norm": 0.9477334526426899,
"learning_rate": 6.6937761009382816e-06,
"loss": 0.8025,
"mean_token_accuracy": 0.759226131439209,
"step": 4795
},
{
"epoch": 0.8828398013610447,
"grad_norm": 0.9350684746914302,
"learning_rate": 6.667825660233736e-06,
"loss": 0.8141,
"mean_token_accuracy": 0.7565145611763,
"step": 4800
},
{
"epoch": 0.8837594261541292,
"grad_norm": 0.9492764392082258,
"learning_rate": 6.642067898079038e-06,
"loss": 0.8311,
"mean_token_accuracy": 0.7527845025062561,
"step": 4805
},
{
"epoch": 0.8846790509472136,
"grad_norm": 0.8598768439927121,
"learning_rate": 6.616503052709914e-06,
"loss": 0.7896,
"mean_token_accuracy": 0.7648340344429017,
"step": 4810
},
{
"epoch": 0.8855986757402979,
"grad_norm": 0.9446656437839204,
"learning_rate": 6.591131360577795e-06,
"loss": 0.8052,
"mean_token_accuracy": 0.7575154542922974,
"step": 4815
},
{
"epoch": 0.8865183005333824,
"grad_norm": 0.8652514268793213,
"learning_rate": 6.565953056347608e-06,
"loss": 0.7534,
"mean_token_accuracy": 0.7725171089172364,
"step": 4820
},
{
"epoch": 0.8874379253264668,
"grad_norm": 0.9422431334861092,
"learning_rate": 6.540968372895634e-06,
"loss": 0.7977,
"mean_token_accuracy": 0.7611649394035339,
"step": 4825
},
{
"epoch": 0.8883575501195512,
"grad_norm": 0.9384703132768932,
"learning_rate": 6.516177541307333e-06,
"loss": 0.7995,
"mean_token_accuracy": 0.7624763369560241,
"step": 4830
},
{
"epoch": 0.8892771749126357,
"grad_norm": 1.015847599195386,
"learning_rate": 6.491580790875209e-06,
"loss": 0.7916,
"mean_token_accuracy": 0.7621793508529663,
"step": 4835
},
{
"epoch": 0.8901967997057201,
"grad_norm": 0.9098096698494834,
"learning_rate": 6.4671783490966945e-06,
"loss": 0.8088,
"mean_token_accuracy": 0.7614699125289917,
"step": 4840
},
{
"epoch": 0.8911164244988045,
"grad_norm": 0.9558674059824713,
"learning_rate": 6.442970441672051e-06,
"loss": 0.8545,
"mean_token_accuracy": 0.7470506310462952,
"step": 4845
},
{
"epoch": 0.892036049291889,
"grad_norm": 0.9590352976202275,
"learning_rate": 6.4189572925022655e-06,
"loss": 0.8363,
"mean_token_accuracy": 0.7472939848899841,
"step": 4850
},
{
"epoch": 0.8929556740849733,
"grad_norm": 0.8982751392912057,
"learning_rate": 6.3951391236869985e-06,
"loss": 0.8259,
"mean_token_accuracy": 0.7548177719116211,
"step": 4855
},
{
"epoch": 0.8938752988780577,
"grad_norm": 0.9627549202883984,
"learning_rate": 6.371516155522513e-06,
"loss": 0.8035,
"mean_token_accuracy": 0.7578222513198852,
"step": 4860
},
{
"epoch": 0.8947949236711422,
"grad_norm": 0.962995623951893,
"learning_rate": 6.3480886064996484e-06,
"loss": 0.8119,
"mean_token_accuracy": 0.7579006910324096,
"step": 4865
},
{
"epoch": 0.8957145484642266,
"grad_norm": 0.99045632467858,
"learning_rate": 6.3248566933017975e-06,
"loss": 0.7942,
"mean_token_accuracy": 0.75965256690979,
"step": 4870
},
{
"epoch": 0.896634173257311,
"grad_norm": 0.9510071830298487,
"learning_rate": 6.3018206308028975e-06,
"loss": 0.8185,
"mean_token_accuracy": 0.7584743499755859,
"step": 4875
},
{
"epoch": 0.8975537980503955,
"grad_norm": 0.9703791789576997,
"learning_rate": 6.2789806320654456e-06,
"loss": 0.7816,
"mean_token_accuracy": 0.7649904489517212,
"step": 4880
},
{
"epoch": 0.8984734228434799,
"grad_norm": 0.9398378664335288,
"learning_rate": 6.256336908338531e-06,
"loss": 0.78,
"mean_token_accuracy": 0.767956817150116,
"step": 4885
},
{
"epoch": 0.8993930476365642,
"grad_norm": 0.987114293205303,
"learning_rate": 6.233889669055878e-06,
"loss": 0.8443,
"mean_token_accuracy": 0.7497469425201416,
"step": 4890
},
{
"epoch": 0.9003126724296487,
"grad_norm": 0.9343500174042304,
"learning_rate": 6.211639121833912e-06,
"loss": 0.7931,
"mean_token_accuracy": 0.763602340221405,
"step": 4895
},
{
"epoch": 0.9012322972227331,
"grad_norm": 0.9262644956755969,
"learning_rate": 6.189585472469829e-06,
"loss": 0.7792,
"mean_token_accuracy": 0.7697998642921448,
"step": 4900
},
{
"epoch": 0.9021519220158175,
"grad_norm": 0.9622834108867682,
"learning_rate": 6.167728924939705e-06,
"loss": 0.797,
"mean_token_accuracy": 0.7625941157341003,
"step": 4905
},
{
"epoch": 0.903071546808902,
"grad_norm": 0.9190192726730757,
"learning_rate": 6.146069681396612e-06,
"loss": 0.8253,
"mean_token_accuracy": 0.7542304992675781,
"step": 4910
},
{
"epoch": 0.9039911716019864,
"grad_norm": 0.9361246140345745,
"learning_rate": 6.124607942168726e-06,
"loss": 0.8031,
"mean_token_accuracy": 0.7584469556808472,
"step": 4915
},
{
"epoch": 0.9049107963950708,
"grad_norm": 0.9457716726884055,
"learning_rate": 6.1033439057574965e-06,
"loss": 0.8153,
"mean_token_accuracy": 0.758701741695404,
"step": 4920
},
{
"epoch": 0.9058304211881553,
"grad_norm": 0.8853750515926242,
"learning_rate": 6.082277768835807e-06,
"loss": 0.7921,
"mean_token_accuracy": 0.763675856590271,
"step": 4925
},
{
"epoch": 0.9067500459812396,
"grad_norm": 0.9702784866596219,
"learning_rate": 6.061409726246143e-06,
"loss": 0.7851,
"mean_token_accuracy": 0.7646818399429322,
"step": 4930
},
{
"epoch": 0.907669670774324,
"grad_norm": 0.9693421985103569,
"learning_rate": 6.040739970998802e-06,
"loss": 0.8346,
"mean_token_accuracy": 0.7530786991119385,
"step": 4935
},
{
"epoch": 0.9085892955674085,
"grad_norm": 0.8930655347204544,
"learning_rate": 6.020268694270109e-06,
"loss": 0.7966,
"mean_token_accuracy": 0.7641753435134888,
"step": 4940
},
{
"epoch": 0.9095089203604929,
"grad_norm": 0.908390221485836,
"learning_rate": 5.999996085400643e-06,
"loss": 0.7995,
"mean_token_accuracy": 0.7642928123474121,
"step": 4945
},
{
"epoch": 0.9104285451535773,
"grad_norm": 0.9291773666129768,
"learning_rate": 5.9799223318934765e-06,
"loss": 0.801,
"mean_token_accuracy": 0.7588168382644653,
"step": 4950
},
{
"epoch": 0.9113481699466618,
"grad_norm": 0.9290002720904244,
"learning_rate": 5.9600476194124675e-06,
"loss": 0.7973,
"mean_token_accuracy": 0.763935673236847,
"step": 4955
},
{
"epoch": 0.9122677947397462,
"grad_norm": 0.9446442087955222,
"learning_rate": 5.9403721317805245e-06,
"loss": 0.801,
"mean_token_accuracy": 0.7578533172607422,
"step": 4960
},
{
"epoch": 0.9131874195328306,
"grad_norm": 0.9568316679901518,
"learning_rate": 5.920896050977891e-06,
"loss": 0.8926,
"mean_token_accuracy": 0.7361096501350403,
"step": 4965
},
{
"epoch": 0.914107044325915,
"grad_norm": 0.9761363167639366,
"learning_rate": 5.901619557140502e-06,
"loss": 0.8302,
"mean_token_accuracy": 0.7517902731895447,
"step": 4970
},
{
"epoch": 0.9150266691189994,
"grad_norm": 0.9363921634925068,
"learning_rate": 5.882542828558286e-06,
"loss": 0.8066,
"mean_token_accuracy": 0.7580497026443481,
"step": 4975
},
{
"epoch": 0.9159462939120838,
"grad_norm": 0.9898749363112332,
"learning_rate": 5.86366604167352e-06,
"loss": 0.7785,
"mean_token_accuracy": 0.7676722645759583,
"step": 4980
},
{
"epoch": 0.9168659187051683,
"grad_norm": 0.9461120512925497,
"learning_rate": 5.844989371079215e-06,
"loss": 0.7655,
"mean_token_accuracy": 0.7703205943107605,
"step": 4985
},
{
"epoch": 0.9177855434982527,
"grad_norm": 0.9340964548547984,
"learning_rate": 5.826512989517478e-06,
"loss": 0.8243,
"mean_token_accuracy": 0.7529069542884826,
"step": 4990
},
{
"epoch": 0.9187051682913371,
"grad_norm": 0.9542091804584825,
"learning_rate": 5.808237067877942e-06,
"loss": 0.7869,
"mean_token_accuracy": 0.7639023303985596,
"step": 4995
},
{
"epoch": 0.9196247930844216,
"grad_norm": 0.9799469338180448,
"learning_rate": 5.790161775196144e-06,
"loss": 0.7942,
"mean_token_accuracy": 0.7624092340469361,
"step": 5000
},
{
"epoch": 0.920544417877506,
"grad_norm": 0.9533254080832144,
"learning_rate": 5.772287278652012e-06,
"loss": 0.8109,
"mean_token_accuracy": 0.7598010182380677,
"step": 5005
},
{
"epoch": 0.9214640426705903,
"grad_norm": 0.9311527277134242,
"learning_rate": 5.754613743568279e-06,
"loss": 0.7906,
"mean_token_accuracy": 0.7638931751251221,
"step": 5010
},
{
"epoch": 0.9223836674636748,
"grad_norm": 0.9812836116539834,
"learning_rate": 5.737141333408972e-06,
"loss": 0.8008,
"mean_token_accuracy": 0.7612162590026855,
"step": 5015
},
{
"epoch": 0.9233032922567592,
"grad_norm": 0.9745443553849291,
"learning_rate": 5.719870209777896e-06,
"loss": 0.8417,
"mean_token_accuracy": 0.7509512066841125,
"step": 5020
},
{
"epoch": 0.9242229170498437,
"grad_norm": 0.9530895065948418,
"learning_rate": 5.702800532417144e-06,
"loss": 0.7899,
"mean_token_accuracy": 0.7625620007514954,
"step": 5025
},
{
"epoch": 0.9251425418429281,
"grad_norm": 0.9106620317823355,
"learning_rate": 5.685932459205606e-06,
"loss": 0.8075,
"mean_token_accuracy": 0.7597783088684082,
"step": 5030
},
{
"epoch": 0.9260621666360125,
"grad_norm": 0.9016062622069709,
"learning_rate": 5.669266146157527e-06,
"loss": 0.7956,
"mean_token_accuracy": 0.7618203997612,
"step": 5035
},
{
"epoch": 0.926981791429097,
"grad_norm": 0.9311871037406105,
"learning_rate": 5.652801747421053e-06,
"loss": 0.7755,
"mean_token_accuracy": 0.7672530770301819,
"step": 5040
},
{
"epoch": 0.9279014162221814,
"grad_norm": 0.9289149914362874,
"learning_rate": 5.636539415276807e-06,
"loss": 0.7971,
"mean_token_accuracy": 0.7606992840766906,
"step": 5045
},
{
"epoch": 0.9288210410152657,
"grad_norm": 0.9265920738234094,
"learning_rate": 5.620479300136475e-06,
"loss": 0.7675,
"mean_token_accuracy": 0.7715546011924743,
"step": 5050
},
{
"epoch": 0.9297406658083502,
"grad_norm": 1.001963123510446,
"learning_rate": 5.604621550541429e-06,
"loss": 0.8426,
"mean_token_accuracy": 0.7474547743797302,
"step": 5055
},
{
"epoch": 0.9306602906014346,
"grad_norm": 0.9062392197653472,
"learning_rate": 5.5889663131613465e-06,
"loss": 0.8237,
"mean_token_accuracy": 0.7512851595878601,
"step": 5060
},
{
"epoch": 0.931579915394519,
"grad_norm": 0.9878466692235598,
"learning_rate": 5.5735137327928384e-06,
"loss": 0.8018,
"mean_token_accuracy": 0.7595331549644471,
"step": 5065
},
{
"epoch": 0.9324995401876035,
"grad_norm": 0.911756127989921,
"learning_rate": 5.558263952358139e-06,
"loss": 0.8146,
"mean_token_accuracy": 0.7572713255882263,
"step": 5070
},
{
"epoch": 0.9334191649806879,
"grad_norm": 0.9534452188147857,
"learning_rate": 5.543217112903766e-06,
"loss": 0.8092,
"mean_token_accuracy": 0.7591339111328125,
"step": 5075
},
{
"epoch": 0.9343387897737723,
"grad_norm": 0.94136690175154,
"learning_rate": 5.528373353599207e-06,
"loss": 0.7945,
"mean_token_accuracy": 0.7594197154045105,
"step": 5080
},
{
"epoch": 0.9352584145668568,
"grad_norm": 0.9367268234664168,
"learning_rate": 5.513732811735657e-06,
"loss": 0.8123,
"mean_token_accuracy": 0.7594240307807922,
"step": 5085
},
{
"epoch": 0.9361780393599411,
"grad_norm": 0.8975989192963018,
"learning_rate": 5.4992956227247345e-06,
"loss": 0.7715,
"mean_token_accuracy": 0.7677939176559448,
"step": 5090
},
{
"epoch": 0.9370976641530255,
"grad_norm": 0.9987125543689239,
"learning_rate": 5.48506192009722e-06,
"loss": 0.8051,
"mean_token_accuracy": 0.7597865104675293,
"step": 5095
},
{
"epoch": 0.93801728894611,
"grad_norm": 0.9396093256392507,
"learning_rate": 5.4710318355018435e-06,
"loss": 0.8248,
"mean_token_accuracy": 0.7557710766792297,
"step": 5100
},
{
"epoch": 0.9389369137391944,
"grad_norm": 0.907072734656757,
"learning_rate": 5.457205498704046e-06,
"loss": 0.8104,
"mean_token_accuracy": 0.7568627595901489,
"step": 5105
},
{
"epoch": 0.9398565385322788,
"grad_norm": 0.9498606808400206,
"learning_rate": 5.443583037584792e-06,
"loss": 0.829,
"mean_token_accuracy": 0.7537372469902038,
"step": 5110
},
{
"epoch": 0.9407761633253633,
"grad_norm": 0.9500188031150016,
"learning_rate": 5.430164578139382e-06,
"loss": 0.771,
"mean_token_accuracy": 0.7692322492599487,
"step": 5115
},
{
"epoch": 0.9416957881184477,
"grad_norm": 0.9133488515736051,
"learning_rate": 5.4169502444762836e-06,
"loss": 0.8203,
"mean_token_accuracy": 0.7578924179077149,
"step": 5120
},
{
"epoch": 0.9426154129115321,
"grad_norm": 0.9585342004886042,
"learning_rate": 5.403940158815996e-06,
"loss": 0.8209,
"mean_token_accuracy": 0.7570155620574951,
"step": 5125
},
{
"epoch": 0.9435350377046166,
"grad_norm": 0.9797939933864984,
"learning_rate": 5.391134441489905e-06,
"loss": 0.7937,
"mean_token_accuracy": 0.7618912696838379,
"step": 5130
},
{
"epoch": 0.9444546624977009,
"grad_norm": 0.9293935572688817,
"learning_rate": 5.378533210939176e-06,
"loss": 0.7948,
"mean_token_accuracy": 0.7596281886100769,
"step": 5135
},
{
"epoch": 0.9453742872907853,
"grad_norm": 0.9221042858985046,
"learning_rate": 5.366136583713665e-06,
"loss": 0.7717,
"mean_token_accuracy": 0.7698543071746826,
"step": 5140
},
{
"epoch": 0.9462939120838698,
"grad_norm": 1.025946124148099,
"learning_rate": 5.353944674470823e-06,
"loss": 0.8213,
"mean_token_accuracy": 0.7552660465240478,
"step": 5145
},
{
"epoch": 0.9472135368769542,
"grad_norm": 0.984504169212397,
"learning_rate": 5.341957595974662e-06,
"loss": 0.8392,
"mean_token_accuracy": 0.7498656630516052,
"step": 5150
},
{
"epoch": 0.9481331616700386,
"grad_norm": 0.9188252633726173,
"learning_rate": 5.3301754590946824e-06,
"loss": 0.8166,
"mean_token_accuracy": 0.7552522420883179,
"step": 5155
},
{
"epoch": 0.9490527864631231,
"grad_norm": 0.8673224532160614,
"learning_rate": 5.318598372804873e-06,
"loss": 0.7689,
"mean_token_accuracy": 0.7689907431602478,
"step": 5160
},
{
"epoch": 0.9499724112562075,
"grad_norm": 0.9392909148393203,
"learning_rate": 5.307226444182686e-06,
"loss": 0.7877,
"mean_token_accuracy": 0.7654459595680236,
"step": 5165
},
{
"epoch": 0.9508920360492918,
"grad_norm": 1.0092515399603914,
"learning_rate": 5.296059778408057e-06,
"loss": 0.8228,
"mean_token_accuracy": 0.7547815799713135,
"step": 5170
},
{
"epoch": 0.9518116608423763,
"grad_norm": 0.9724478118701938,
"learning_rate": 5.2850984787624264e-06,
"loss": 0.8068,
"mean_token_accuracy": 0.757933521270752,
"step": 5175
},
{
"epoch": 0.9527312856354607,
"grad_norm": 0.9595437776833703,
"learning_rate": 5.274342646627783e-06,
"loss": 0.8612,
"mean_token_accuracy": 0.7451163768768311,
"step": 5180
},
{
"epoch": 0.9536509104285451,
"grad_norm": 0.9035621461181421,
"learning_rate": 5.263792381485733e-06,
"loss": 0.7942,
"mean_token_accuracy": 0.7612574458122253,
"step": 5185
},
{
"epoch": 0.9545705352216296,
"grad_norm": 0.9369759529937411,
"learning_rate": 5.253447780916577e-06,
"loss": 0.8199,
"mean_token_accuracy": 0.755517327785492,
"step": 5190
},
{
"epoch": 0.955490160014714,
"grad_norm": 0.9223279306007958,
"learning_rate": 5.2433089405984e-06,
"loss": 0.7855,
"mean_token_accuracy": 0.7672001838684082,
"step": 5195
},
{
"epoch": 0.9564097848077984,
"grad_norm": 0.9093658718364905,
"learning_rate": 5.233375954306199e-06,
"loss": 0.7588,
"mean_token_accuracy": 0.7701982975006103,
"step": 5200
},
{
"epoch": 0.9573294096008829,
"grad_norm": 0.9756234794282658,
"learning_rate": 5.22364891391101e-06,
"loss": 0.8294,
"mean_token_accuracy": 0.75344318151474,
"step": 5205
},
{
"epoch": 0.9582490343939672,
"grad_norm": 0.910212786589889,
"learning_rate": 5.2141279093790575e-06,
"loss": 0.7894,
"mean_token_accuracy": 0.7678821444511413,
"step": 5210
},
{
"epoch": 0.9591686591870516,
"grad_norm": 0.9474929875705357,
"learning_rate": 5.204813028770913e-06,
"loss": 0.7891,
"mean_token_accuracy": 0.7625754833221435,
"step": 5215
},
{
"epoch": 0.9600882839801361,
"grad_norm": 0.9344552952746554,
"learning_rate": 5.195704358240704e-06,
"loss": 0.8059,
"mean_token_accuracy": 0.759453558921814,
"step": 5220
},
{
"epoch": 0.9610079087732205,
"grad_norm": 0.9060367178226402,
"learning_rate": 5.186801982035298e-06,
"loss": 0.7846,
"mean_token_accuracy": 0.7654222846031189,
"step": 5225
},
{
"epoch": 0.9619275335663049,
"grad_norm": 0.9799737312884412,
"learning_rate": 5.178105982493528e-06,
"loss": 0.813,
"mean_token_accuracy": 0.7591325879096985,
"step": 5230
},
{
"epoch": 0.9628471583593894,
"grad_norm": 0.9419373863409995,
"learning_rate": 5.169616440045433e-06,
"loss": 0.7933,
"mean_token_accuracy": 0.7605907201766968,
"step": 5235
},
{
"epoch": 0.9637667831524738,
"grad_norm": 0.904753211539841,
"learning_rate": 5.16133343321151e-06,
"loss": 0.796,
"mean_token_accuracy": 0.7628448724746704,
"step": 5240
},
{
"epoch": 0.9646864079455583,
"grad_norm": 0.9588441625989744,
"learning_rate": 5.1532570386019944e-06,
"loss": 0.7746,
"mean_token_accuracy": 0.7675014138221741,
"step": 5245
},
{
"epoch": 0.9656060327386427,
"grad_norm": 0.8875696215604679,
"learning_rate": 5.145387330916144e-06,
"loss": 0.7988,
"mean_token_accuracy": 0.7614070296287536,
"step": 5250
},
{
"epoch": 0.966525657531727,
"grad_norm": 0.9405630235157387,
"learning_rate": 5.137724382941557e-06,
"loss": 0.7918,
"mean_token_accuracy": 0.7650785088539124,
"step": 5255
},
{
"epoch": 0.9674452823248115,
"grad_norm": 0.9562043810312459,
"learning_rate": 5.130268265553487e-06,
"loss": 0.8144,
"mean_token_accuracy": 0.7557086706161499,
"step": 5260
},
{
"epoch": 0.9683649071178959,
"grad_norm": 0.9274811086930055,
"learning_rate": 5.123019047714198e-06,
"loss": 0.7576,
"mean_token_accuracy": 0.7753474235534668,
"step": 5265
},
{
"epoch": 0.9692845319109803,
"grad_norm": 0.9409745943869224,
"learning_rate": 5.115976796472322e-06,
"loss": 0.8328,
"mean_token_accuracy": 0.7535906672477722,
"step": 5270
},
{
"epoch": 0.9702041567040648,
"grad_norm": 0.919927159373234,
"learning_rate": 5.109141576962239e-06,
"loss": 0.7912,
"mean_token_accuracy": 0.7655844688415527,
"step": 5275
},
{
"epoch": 0.9711237814971492,
"grad_norm": 0.951329112362283,
"learning_rate": 5.102513452403473e-06,
"loss": 0.7683,
"mean_token_accuracy": 0.7696467399597168,
"step": 5280
},
{
"epoch": 0.9720434062902336,
"grad_norm": 0.9201946233258363,
"learning_rate": 5.0960924841001155e-06,
"loss": 0.7988,
"mean_token_accuracy": 0.7610312700271606,
"step": 5285
},
{
"epoch": 0.972963031083318,
"grad_norm": 1.0032717462292577,
"learning_rate": 5.089878731440241e-06,
"loss": 0.821,
"mean_token_accuracy": 0.7543939590454102,
"step": 5290
},
{
"epoch": 0.9738826558764024,
"grad_norm": 0.9429172545610519,
"learning_rate": 5.0838722518953816e-06,
"loss": 0.7989,
"mean_token_accuracy": 0.7595749855041504,
"step": 5295
},
{
"epoch": 0.9748022806694868,
"grad_norm": 0.9007616401314099,
"learning_rate": 5.078073101019974e-06,
"loss": 0.8083,
"mean_token_accuracy": 0.7579713940620423,
"step": 5300
},
{
"epoch": 0.9757219054625713,
"grad_norm": 0.8990406462252963,
"learning_rate": 5.072481332450857e-06,
"loss": 0.8114,
"mean_token_accuracy": 0.7577333807945251,
"step": 5305
},
{
"epoch": 0.9766415302556557,
"grad_norm": 0.9615340254243923,
"learning_rate": 5.067096997906774e-06,
"loss": 0.7715,
"mean_token_accuracy": 0.7705414056777954,
"step": 5310
},
{
"epoch": 0.9775611550487401,
"grad_norm": 0.8455749234692341,
"learning_rate": 5.06192014718789e-06,
"loss": 0.7642,
"mean_token_accuracy": 0.7697661995887757,
"step": 5315
},
{
"epoch": 0.9784807798418246,
"grad_norm": 0.9292612449999305,
"learning_rate": 5.05695082817534e-06,
"loss": 0.7789,
"mean_token_accuracy": 0.7671653866767884,
"step": 5320
},
{
"epoch": 0.979400404634909,
"grad_norm": 0.9275056123774931,
"learning_rate": 5.052189086830779e-06,
"loss": 0.8018,
"mean_token_accuracy": 0.7623230576515198,
"step": 5325
},
{
"epoch": 0.9803200294279933,
"grad_norm": 0.9703545231339168,
"learning_rate": 5.047634967195952e-06,
"loss": 0.7877,
"mean_token_accuracy": 0.7638481616973877,
"step": 5330
},
{
"epoch": 0.9812396542210778,
"grad_norm": 0.955542417327297,
"learning_rate": 5.043288511392302e-06,
"loss": 0.7891,
"mean_token_accuracy": 0.7614734530448913,
"step": 5335
},
{
"epoch": 0.9821592790141622,
"grad_norm": 0.9645172124378145,
"learning_rate": 5.039149759620569e-06,
"loss": 0.7624,
"mean_token_accuracy": 0.7724639177322388,
"step": 5340
},
{
"epoch": 0.9830789038072466,
"grad_norm": 0.9734387825498484,
"learning_rate": 5.0352187501604155e-06,
"loss": 0.8579,
"mean_token_accuracy": 0.746760880947113,
"step": 5345
},
{
"epoch": 0.9839985286003311,
"grad_norm": 0.9730228991663388,
"learning_rate": 5.031495519370083e-06,
"loss": 0.8102,
"mean_token_accuracy": 0.758979082107544,
"step": 5350
},
{
"epoch": 0.9849181533934155,
"grad_norm": 1.0013660074202417,
"learning_rate": 5.027980101686053e-06,
"loss": 0.8396,
"mean_token_accuracy": 0.7509408593177795,
"step": 5355
},
{
"epoch": 0.9858377781864999,
"grad_norm": 0.9817157587290055,
"learning_rate": 5.024672529622717e-06,
"loss": 0.7935,
"mean_token_accuracy": 0.7596516370773315,
"step": 5360
},
{
"epoch": 0.9867574029795844,
"grad_norm": 0.9800745490721745,
"learning_rate": 5.0215728337720955e-06,
"loss": 0.7491,
"mean_token_accuracy": 0.7768563270568848,
"step": 5365
},
{
"epoch": 0.9876770277726687,
"grad_norm": 0.99189390574119,
"learning_rate": 5.018681042803533e-06,
"loss": 0.7759,
"mean_token_accuracy": 0.7670275330543518,
"step": 5370
},
{
"epoch": 0.9885966525657531,
"grad_norm": 0.9673022649880465,
"learning_rate": 5.0159971834634545e-06,
"loss": 0.7867,
"mean_token_accuracy": 0.764349353313446,
"step": 5375
},
{
"epoch": 0.9895162773588376,
"grad_norm": 1.0182176113772272,
"learning_rate": 5.013521280575099e-06,
"loss": 0.799,
"mean_token_accuracy": 0.7618956327438354,
"step": 5380
},
{
"epoch": 0.990435902151922,
"grad_norm": 0.9959171759739962,
"learning_rate": 5.011253357038306e-06,
"loss": 0.8392,
"mean_token_accuracy": 0.7527823686599732,
"step": 5385
},
{
"epoch": 0.9913555269450064,
"grad_norm": 0.8997528487054468,
"learning_rate": 5.0091934338292915e-06,
"loss": 0.7615,
"mean_token_accuracy": 0.7715205192565918,
"step": 5390
},
{
"epoch": 0.9922751517380909,
"grad_norm": 0.919462849827096,
"learning_rate": 5.00734153000046e-06,
"loss": 0.7409,
"mean_token_accuracy": 0.77668297290802,
"step": 5395
},
{
"epoch": 0.9931947765311753,
"grad_norm": 0.984326555402561,
"learning_rate": 5.005697662680227e-06,
"loss": 0.7989,
"mean_token_accuracy": 0.7626922607421875,
"step": 5400
},
{
"epoch": 0.9941144013242597,
"grad_norm": 0.9499542228497883,
"learning_rate": 5.004261847072863e-06,
"loss": 0.8283,
"mean_token_accuracy": 0.7542143225669861,
"step": 5405
},
{
"epoch": 0.9950340261173442,
"grad_norm": 0.9585799297597308,
"learning_rate": 5.003034096458347e-06,
"loss": 0.835,
"mean_token_accuracy": 0.7544377326965332,
"step": 5410
},
{
"epoch": 0.9959536509104285,
"grad_norm": 0.9165677599227604,
"learning_rate": 5.0020144221922466e-06,
"loss": 0.8013,
"mean_token_accuracy": 0.7582892417907715,
"step": 5415
},
{
"epoch": 0.9968732757035129,
"grad_norm": 0.9449991405622632,
"learning_rate": 5.001202833705621e-06,
"loss": 0.8352,
"mean_token_accuracy": 0.7502840042114258,
"step": 5420
},
{
"epoch": 0.9977929004965974,
"grad_norm": 0.9827477783752422,
"learning_rate": 5.000599338504916e-06,
"loss": 0.7931,
"mean_token_accuracy": 0.762959897518158,
"step": 5425
},
{
"epoch": 0.9987125252896818,
"grad_norm": 0.9751233701044131,
"learning_rate": 5.0002039421719105e-06,
"loss": 0.7978,
"mean_token_accuracy": 0.7619426846504211,
"step": 5430
},
{
"epoch": 0.9996321500827662,
"grad_norm": 0.971614941671036,
"learning_rate": 5.000016648363663e-06,
"loss": 0.801,
"mean_token_accuracy": 0.7594120621681213,
"step": 5435
},
{
"epoch": 1.0,
"mean_token_accuracy": 0.779580146074295,
"step": 5437,
"total_flos": 77442066677760.0,
"train_loss": 0.8871173100675843,
"train_runtime": 5515.7519,
"train_samples_per_second": 15.771,
"train_steps_per_second": 0.986
}
],
"logging_steps": 5,
"max_steps": 5437,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 77442066677760.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}