Qwen-code-7B-SFT-500k / trainer_state.json
zhuangxialie
Model save
beea65f verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.9986530690783146,
"eval_steps": 500,
"global_step": 10392,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0019241870309794113,
"grad_norm": 1.963148361362126,
"learning_rate": 2.403846153846154e-07,
"loss": 0.5398,
"mean_token_accuracy": 0.8655876636505127,
"step": 5
},
{
"epoch": 0.0038483740619588225,
"grad_norm": 1.9157009818195054,
"learning_rate": 4.807692307692308e-07,
"loss": 0.5307,
"mean_token_accuracy": 0.8676515460014343,
"step": 10
},
{
"epoch": 0.005772561092938233,
"grad_norm": 1.5111307334029855,
"learning_rate": 7.211538461538462e-07,
"loss": 0.5196,
"mean_token_accuracy": 0.8695880591869354,
"step": 15
},
{
"epoch": 0.007696748123917645,
"grad_norm": 0.853529638874443,
"learning_rate": 9.615384615384617e-07,
"loss": 0.5095,
"mean_token_accuracy": 0.8688256680965424,
"step": 20
},
{
"epoch": 0.009620935154897056,
"grad_norm": 0.7077984713233373,
"learning_rate": 1.201923076923077e-06,
"loss": 0.4858,
"mean_token_accuracy": 0.8708780348300934,
"step": 25
},
{
"epoch": 0.011545122185876467,
"grad_norm": 0.7200684137615945,
"learning_rate": 1.4423076923076924e-06,
"loss": 0.4602,
"mean_token_accuracy": 0.8745088517665863,
"step": 30
},
{
"epoch": 0.01346930921685588,
"grad_norm": 0.4688698486503959,
"learning_rate": 1.6826923076923077e-06,
"loss": 0.4473,
"mean_token_accuracy": 0.8767399728298187,
"step": 35
},
{
"epoch": 0.01539349624783529,
"grad_norm": 0.45791546768368196,
"learning_rate": 1.9230769230769234e-06,
"loss": 0.4325,
"mean_token_accuracy": 0.8795706152915954,
"step": 40
},
{
"epoch": 0.0173176832788147,
"grad_norm": 0.2883581646319124,
"learning_rate": 2.1634615384615387e-06,
"loss": 0.4035,
"mean_token_accuracy": 0.8868738651275635,
"step": 45
},
{
"epoch": 0.01924187030979411,
"grad_norm": 0.2975988257840724,
"learning_rate": 2.403846153846154e-06,
"loss": 0.4,
"mean_token_accuracy": 0.8870362162590026,
"step": 50
},
{
"epoch": 0.021166057340773523,
"grad_norm": 0.21745403228236987,
"learning_rate": 2.644230769230769e-06,
"loss": 0.3981,
"mean_token_accuracy": 0.8870003700256348,
"step": 55
},
{
"epoch": 0.023090244371752933,
"grad_norm": 0.23081938415720538,
"learning_rate": 2.884615384615385e-06,
"loss": 0.3841,
"mean_token_accuracy": 0.8902194917201995,
"step": 60
},
{
"epoch": 0.025014431402732344,
"grad_norm": 0.1954993628169396,
"learning_rate": 3.125e-06,
"loss": 0.3801,
"mean_token_accuracy": 0.8912450432777405,
"step": 65
},
{
"epoch": 0.02693861843371176,
"grad_norm": 0.2003159839657926,
"learning_rate": 3.3653846153846154e-06,
"loss": 0.3768,
"mean_token_accuracy": 0.8914297997951508,
"step": 70
},
{
"epoch": 0.02886280546469117,
"grad_norm": 0.19367313769698136,
"learning_rate": 3.6057692307692307e-06,
"loss": 0.3718,
"mean_token_accuracy": 0.892903745174408,
"step": 75
},
{
"epoch": 0.03078699249567058,
"grad_norm": 0.17732974119710987,
"learning_rate": 3.846153846153847e-06,
"loss": 0.3634,
"mean_token_accuracy": 0.8940442979335785,
"step": 80
},
{
"epoch": 0.03271117952664999,
"grad_norm": 0.18202600444863165,
"learning_rate": 4.086538461538462e-06,
"loss": 0.3618,
"mean_token_accuracy": 0.8950167894363403,
"step": 85
},
{
"epoch": 0.0346353665576294,
"grad_norm": 0.1814077955452155,
"learning_rate": 4.326923076923077e-06,
"loss": 0.3608,
"mean_token_accuracy": 0.894530737400055,
"step": 90
},
{
"epoch": 0.036559553588608816,
"grad_norm": 0.17982732704049031,
"learning_rate": 4.567307692307692e-06,
"loss": 0.3546,
"mean_token_accuracy": 0.8964052140712738,
"step": 95
},
{
"epoch": 0.03848374061958822,
"grad_norm": 0.18760114918066836,
"learning_rate": 4.807692307692308e-06,
"loss": 0.3527,
"mean_token_accuracy": 0.896798574924469,
"step": 100
},
{
"epoch": 0.04040792765056764,
"grad_norm": 0.18792678975884775,
"learning_rate": 5.0480769230769235e-06,
"loss": 0.3552,
"mean_token_accuracy": 0.8957571089267731,
"step": 105
},
{
"epoch": 0.042332114681547045,
"grad_norm": 0.18458960773369418,
"learning_rate": 5.288461538461538e-06,
"loss": 0.35,
"mean_token_accuracy": 0.8970022380352021,
"step": 110
},
{
"epoch": 0.04425630171252646,
"grad_norm": 0.20512394864726513,
"learning_rate": 5.528846153846154e-06,
"loss": 0.3494,
"mean_token_accuracy": 0.8968957602977753,
"step": 115
},
{
"epoch": 0.04618048874350587,
"grad_norm": 0.19060088598183494,
"learning_rate": 5.76923076923077e-06,
"loss": 0.3435,
"mean_token_accuracy": 0.8983101010322571,
"step": 120
},
{
"epoch": 0.04810467577448528,
"grad_norm": 0.1837400972185552,
"learning_rate": 6.0096153846153855e-06,
"loss": 0.3423,
"mean_token_accuracy": 0.8987049281597137,
"step": 125
},
{
"epoch": 0.05002886280546469,
"grad_norm": 0.19128226203549084,
"learning_rate": 6.25e-06,
"loss": 0.34,
"mean_token_accuracy": 0.8992797672748566,
"step": 130
},
{
"epoch": 0.0519530498364441,
"grad_norm": 0.2012016913255968,
"learning_rate": 6.490384615384616e-06,
"loss": 0.3415,
"mean_token_accuracy": 0.8987887859344482,
"step": 135
},
{
"epoch": 0.05387723686742352,
"grad_norm": 0.20162833105748967,
"learning_rate": 6.730769230769231e-06,
"loss": 0.339,
"mean_token_accuracy": 0.8988729476928711,
"step": 140
},
{
"epoch": 0.055801423898402924,
"grad_norm": 0.1975004512012202,
"learning_rate": 6.9711538461538465e-06,
"loss": 0.3363,
"mean_token_accuracy": 0.8998090863227844,
"step": 145
},
{
"epoch": 0.05772561092938234,
"grad_norm": 0.19122157725014663,
"learning_rate": 7.211538461538461e-06,
"loss": 0.335,
"mean_token_accuracy": 0.8997473120689392,
"step": 150
},
{
"epoch": 0.059649797960361746,
"grad_norm": 0.19833364273809126,
"learning_rate": 7.451923076923077e-06,
"loss": 0.3358,
"mean_token_accuracy": 0.8996488392353058,
"step": 155
},
{
"epoch": 0.06157398499134116,
"grad_norm": 0.19478486853041554,
"learning_rate": 7.692307692307694e-06,
"loss": 0.338,
"mean_token_accuracy": 0.8991158306598663,
"step": 160
},
{
"epoch": 0.06349817202232057,
"grad_norm": 0.2237604537377637,
"learning_rate": 7.932692307692308e-06,
"loss": 0.3324,
"mean_token_accuracy": 0.9004191577434539,
"step": 165
},
{
"epoch": 0.06542235905329997,
"grad_norm": 0.22412298555295504,
"learning_rate": 8.173076923076923e-06,
"loss": 0.3316,
"mean_token_accuracy": 0.9007892608642578,
"step": 170
},
{
"epoch": 0.0673465460842794,
"grad_norm": 0.22506299457001125,
"learning_rate": 8.41346153846154e-06,
"loss": 0.3331,
"mean_token_accuracy": 0.9003400206565857,
"step": 175
},
{
"epoch": 0.0692707331152588,
"grad_norm": 0.20552697674020787,
"learning_rate": 8.653846153846155e-06,
"loss": 0.3277,
"mean_token_accuracy": 0.9019653260707855,
"step": 180
},
{
"epoch": 0.07119492014623821,
"grad_norm": 0.20325569047208658,
"learning_rate": 8.89423076923077e-06,
"loss": 0.3285,
"mean_token_accuracy": 0.9014288187026978,
"step": 185
},
{
"epoch": 0.07311910717721763,
"grad_norm": 0.20125377934796918,
"learning_rate": 9.134615384615384e-06,
"loss": 0.3308,
"mean_token_accuracy": 0.9006717920303344,
"step": 190
},
{
"epoch": 0.07504329420819704,
"grad_norm": 0.2030873071758768,
"learning_rate": 9.375000000000001e-06,
"loss": 0.3216,
"mean_token_accuracy": 0.9034212172031403,
"step": 195
},
{
"epoch": 0.07696748123917645,
"grad_norm": 0.20006678924701887,
"learning_rate": 9.615384615384616e-06,
"loss": 0.323,
"mean_token_accuracy": 0.9028126895427704,
"step": 200
},
{
"epoch": 0.07889166827015585,
"grad_norm": 0.21243281284856008,
"learning_rate": 9.85576923076923e-06,
"loss": 0.3248,
"mean_token_accuracy": 0.9024546027183533,
"step": 205
},
{
"epoch": 0.08081585530113528,
"grad_norm": 0.21408796709828543,
"learning_rate": 1.0096153846153847e-05,
"loss": 0.3207,
"mean_token_accuracy": 0.903283417224884,
"step": 210
},
{
"epoch": 0.08274004233211468,
"grad_norm": 0.2172138301632152,
"learning_rate": 1.0336538461538462e-05,
"loss": 0.3237,
"mean_token_accuracy": 0.9019180119037629,
"step": 215
},
{
"epoch": 0.08466422936309409,
"grad_norm": 0.20230661485907633,
"learning_rate": 1.0576923076923077e-05,
"loss": 0.3201,
"mean_token_accuracy": 0.9030288457870483,
"step": 220
},
{
"epoch": 0.0865884163940735,
"grad_norm": 0.20311664902409643,
"learning_rate": 1.0817307692307693e-05,
"loss": 0.3182,
"mean_token_accuracy": 0.9034170091152192,
"step": 225
},
{
"epoch": 0.08851260342505292,
"grad_norm": 0.21362908226608346,
"learning_rate": 1.1057692307692308e-05,
"loss": 0.3187,
"mean_token_accuracy": 0.9034193456172943,
"step": 230
},
{
"epoch": 0.09043679045603233,
"grad_norm": 0.2169502521795548,
"learning_rate": 1.1298076923076923e-05,
"loss": 0.3154,
"mean_token_accuracy": 0.9046095728874206,
"step": 235
},
{
"epoch": 0.09236097748701173,
"grad_norm": 0.20379925548365357,
"learning_rate": 1.153846153846154e-05,
"loss": 0.3164,
"mean_token_accuracy": 0.9037660837173462,
"step": 240
},
{
"epoch": 0.09428516451799115,
"grad_norm": 0.21148715632559753,
"learning_rate": 1.1778846153846154e-05,
"loss": 0.3131,
"mean_token_accuracy": 0.9047864377498627,
"step": 245
},
{
"epoch": 0.09620935154897056,
"grad_norm": 0.21702583427227537,
"learning_rate": 1.2019230769230771e-05,
"loss": 0.312,
"mean_token_accuracy": 0.9051642954349518,
"step": 250
},
{
"epoch": 0.09813353857994997,
"grad_norm": 0.22069801101779785,
"learning_rate": 1.2259615384615384e-05,
"loss": 0.3134,
"mean_token_accuracy": 0.9046817421913147,
"step": 255
},
{
"epoch": 0.10005772561092938,
"grad_norm": 0.22529773024079458,
"learning_rate": 1.25e-05,
"loss": 0.3102,
"mean_token_accuracy": 0.9054987668991089,
"step": 260
},
{
"epoch": 0.1019819126419088,
"grad_norm": 0.23939421584415665,
"learning_rate": 1.2740384615384615e-05,
"loss": 0.3129,
"mean_token_accuracy": 0.9048707842826843,
"step": 265
},
{
"epoch": 0.1039060996728882,
"grad_norm": 0.2163800175850561,
"learning_rate": 1.2980769230769232e-05,
"loss": 0.309,
"mean_token_accuracy": 0.905825936794281,
"step": 270
},
{
"epoch": 0.10583028670386761,
"grad_norm": 0.20997253552830777,
"learning_rate": 1.3221153846153847e-05,
"loss": 0.306,
"mean_token_accuracy": 0.9066311120986938,
"step": 275
},
{
"epoch": 0.10775447373484703,
"grad_norm": 0.21975272965592113,
"learning_rate": 1.3461538461538462e-05,
"loss": 0.3059,
"mean_token_accuracy": 0.9064570367336273,
"step": 280
},
{
"epoch": 0.10967866076582644,
"grad_norm": 0.2268669947079344,
"learning_rate": 1.3701923076923078e-05,
"loss": 0.3027,
"mean_token_accuracy": 0.9075237393379212,
"step": 285
},
{
"epoch": 0.11160284779680585,
"grad_norm": 0.2253035741790312,
"learning_rate": 1.3942307692307693e-05,
"loss": 0.3041,
"mean_token_accuracy": 0.9066643595695496,
"step": 290
},
{
"epoch": 0.11352703482778526,
"grad_norm": 0.22634929034520535,
"learning_rate": 1.4182692307692308e-05,
"loss": 0.3019,
"mean_token_accuracy": 0.9074173867702484,
"step": 295
},
{
"epoch": 0.11545122185876468,
"grad_norm": 0.22715884246265147,
"learning_rate": 1.4423076923076923e-05,
"loss": 0.301,
"mean_token_accuracy": 0.9073862195014953,
"step": 300
},
{
"epoch": 0.11737540888974408,
"grad_norm": 0.26148249937262613,
"learning_rate": 1.466346153846154e-05,
"loss": 0.2997,
"mean_token_accuracy": 0.9080133736133575,
"step": 305
},
{
"epoch": 0.11929959592072349,
"grad_norm": 0.22681858980447167,
"learning_rate": 1.4903846153846154e-05,
"loss": 0.2975,
"mean_token_accuracy": 0.908364349603653,
"step": 310
},
{
"epoch": 0.1212237829517029,
"grad_norm": 0.22867695868438945,
"learning_rate": 1.5144230769230769e-05,
"loss": 0.2952,
"mean_token_accuracy": 0.90918750166893,
"step": 315
},
{
"epoch": 0.12314796998268232,
"grad_norm": 0.2332682731441986,
"learning_rate": 1.5384615384615387e-05,
"loss": 0.2955,
"mean_token_accuracy": 0.9092335760593414,
"step": 320
},
{
"epoch": 0.12507215701366173,
"grad_norm": 0.2459196384955729,
"learning_rate": 1.5625e-05,
"loss": 0.2939,
"mean_token_accuracy": 0.9092588603496552,
"step": 325
},
{
"epoch": 0.12699634404464114,
"grad_norm": 0.22828931788344325,
"learning_rate": 1.5865384615384617e-05,
"loss": 0.2873,
"mean_token_accuracy": 0.911347258090973,
"step": 330
},
{
"epoch": 0.12892053107562054,
"grad_norm": 0.21946639421360295,
"learning_rate": 1.6105769230769233e-05,
"loss": 0.2866,
"mean_token_accuracy": 0.9112029016017914,
"step": 335
},
{
"epoch": 0.13084471810659995,
"grad_norm": 0.22530996089046024,
"learning_rate": 1.6346153846153847e-05,
"loss": 0.2875,
"mean_token_accuracy": 0.9108709812164306,
"step": 340
},
{
"epoch": 0.13276890513757938,
"grad_norm": 0.2551096047012644,
"learning_rate": 1.6586538461538463e-05,
"loss": 0.2839,
"mean_token_accuracy": 0.9118252098560333,
"step": 345
},
{
"epoch": 0.1346930921685588,
"grad_norm": 0.23107070145783154,
"learning_rate": 1.682692307692308e-05,
"loss": 0.2862,
"mean_token_accuracy": 0.911124873161316,
"step": 350
},
{
"epoch": 0.1366172791995382,
"grad_norm": 0.24198231450428304,
"learning_rate": 1.7067307692307693e-05,
"loss": 0.2795,
"mean_token_accuracy": 0.9127156972885132,
"step": 355
},
{
"epoch": 0.1385414662305176,
"grad_norm": 0.23288959582422472,
"learning_rate": 1.730769230769231e-05,
"loss": 0.2795,
"mean_token_accuracy": 0.9133687376976013,
"step": 360
},
{
"epoch": 0.14046565326149701,
"grad_norm": 0.25473076353716345,
"learning_rate": 1.7548076923076922e-05,
"loss": 0.2833,
"mean_token_accuracy": 0.9122391879558563,
"step": 365
},
{
"epoch": 0.14238984029247642,
"grad_norm": 0.2401352594878419,
"learning_rate": 1.778846153846154e-05,
"loss": 0.2772,
"mean_token_accuracy": 0.9137230932712554,
"step": 370
},
{
"epoch": 0.14431402732345583,
"grad_norm": 0.2277816706017706,
"learning_rate": 1.8028846153846156e-05,
"loss": 0.2748,
"mean_token_accuracy": 0.9141271114349365,
"step": 375
},
{
"epoch": 0.14623821435443526,
"grad_norm": 0.30055025836902316,
"learning_rate": 1.826923076923077e-05,
"loss": 0.2757,
"mean_token_accuracy": 0.9140791058540344,
"step": 380
},
{
"epoch": 0.14816240138541467,
"grad_norm": 0.2978323202801783,
"learning_rate": 1.8509615384615385e-05,
"loss": 0.2712,
"mean_token_accuracy": 0.9153619170188904,
"step": 385
},
{
"epoch": 0.15008658841639408,
"grad_norm": 0.26309054507870117,
"learning_rate": 1.8750000000000002e-05,
"loss": 0.2733,
"mean_token_accuracy": 0.9149505078792572,
"step": 390
},
{
"epoch": 0.1520107754473735,
"grad_norm": 0.23686532210659178,
"learning_rate": 1.8990384615384615e-05,
"loss": 0.273,
"mean_token_accuracy": 0.9151660382747651,
"step": 395
},
{
"epoch": 0.1539349624783529,
"grad_norm": 0.2512431182310415,
"learning_rate": 1.923076923076923e-05,
"loss": 0.267,
"mean_token_accuracy": 0.9165635347366333,
"step": 400
},
{
"epoch": 0.1558591495093323,
"grad_norm": 0.23304477086405798,
"learning_rate": 1.9471153846153848e-05,
"loss": 0.2658,
"mean_token_accuracy": 0.9168694078922272,
"step": 405
},
{
"epoch": 0.1577833365403117,
"grad_norm": 0.2676902091848038,
"learning_rate": 1.971153846153846e-05,
"loss": 0.266,
"mean_token_accuracy": 0.9166438400745391,
"step": 410
},
{
"epoch": 0.15970752357129112,
"grad_norm": 0.26789992728688017,
"learning_rate": 1.9951923076923078e-05,
"loss": 0.2664,
"mean_token_accuracy": 0.916456151008606,
"step": 415
},
{
"epoch": 0.16163171060227055,
"grad_norm": 0.2979302921999793,
"learning_rate": 2.0192307692307694e-05,
"loss": 0.2631,
"mean_token_accuracy": 0.9175773561000824,
"step": 420
},
{
"epoch": 0.16355589763324996,
"grad_norm": 0.2831262323507047,
"learning_rate": 2.0432692307692307e-05,
"loss": 0.262,
"mean_token_accuracy": 0.917683893442154,
"step": 425
},
{
"epoch": 0.16548008466422937,
"grad_norm": 0.28630264276949907,
"learning_rate": 2.0673076923076924e-05,
"loss": 0.2622,
"mean_token_accuracy": 0.9177160143852234,
"step": 430
},
{
"epoch": 0.16740427169520877,
"grad_norm": 0.25697007177807973,
"learning_rate": 2.091346153846154e-05,
"loss": 0.259,
"mean_token_accuracy": 0.9182245790958404,
"step": 435
},
{
"epoch": 0.16932845872618818,
"grad_norm": 0.23508881366270015,
"learning_rate": 2.1153846153846154e-05,
"loss": 0.2564,
"mean_token_accuracy": 0.9188485264778137,
"step": 440
},
{
"epoch": 0.1712526457571676,
"grad_norm": 0.25717377003632436,
"learning_rate": 2.139423076923077e-05,
"loss": 0.2543,
"mean_token_accuracy": 0.9198683559894562,
"step": 445
},
{
"epoch": 0.173176832788147,
"grad_norm": 0.24779708644039147,
"learning_rate": 2.1634615384615387e-05,
"loss": 0.2497,
"mean_token_accuracy": 0.9209720194339752,
"step": 450
},
{
"epoch": 0.17510101981912643,
"grad_norm": 0.25956326734408053,
"learning_rate": 2.1875e-05,
"loss": 0.2559,
"mean_token_accuracy": 0.9193503677845001,
"step": 455
},
{
"epoch": 0.17702520685010584,
"grad_norm": 0.24756279829978345,
"learning_rate": 2.2115384615384616e-05,
"loss": 0.2542,
"mean_token_accuracy": 0.9198038160800934,
"step": 460
},
{
"epoch": 0.17894939388108524,
"grad_norm": 0.29512233740298904,
"learning_rate": 2.2355769230769233e-05,
"loss": 0.25,
"mean_token_accuracy": 0.9208518505096436,
"step": 465
},
{
"epoch": 0.18087358091206465,
"grad_norm": 0.25467507539452205,
"learning_rate": 2.2596153846153846e-05,
"loss": 0.2471,
"mean_token_accuracy": 0.9220957338809967,
"step": 470
},
{
"epoch": 0.18279776794304406,
"grad_norm": 0.24226828627780678,
"learning_rate": 2.2836538461538463e-05,
"loss": 0.2473,
"mean_token_accuracy": 0.9218408524990082,
"step": 475
},
{
"epoch": 0.18472195497402347,
"grad_norm": 0.2613108563091361,
"learning_rate": 2.307692307692308e-05,
"loss": 0.2472,
"mean_token_accuracy": 0.9217159509658813,
"step": 480
},
{
"epoch": 0.18664614200500287,
"grad_norm": 0.23328958429498128,
"learning_rate": 2.3317307692307692e-05,
"loss": 0.2486,
"mean_token_accuracy": 0.9216979265213012,
"step": 485
},
{
"epoch": 0.1885703290359823,
"grad_norm": 0.23884925516205582,
"learning_rate": 2.355769230769231e-05,
"loss": 0.2452,
"mean_token_accuracy": 0.9223480701446534,
"step": 490
},
{
"epoch": 0.19049451606696172,
"grad_norm": 0.23034686602700213,
"learning_rate": 2.3798076923076922e-05,
"loss": 0.245,
"mean_token_accuracy": 0.9224259674549102,
"step": 495
},
{
"epoch": 0.19241870309794112,
"grad_norm": 0.23791074060824063,
"learning_rate": 2.4038461538461542e-05,
"loss": 0.2447,
"mean_token_accuracy": 0.9224281191825867,
"step": 500
},
{
"epoch": 0.19434289012892053,
"grad_norm": 0.23743899436610488,
"learning_rate": 2.4278846153846155e-05,
"loss": 0.2386,
"mean_token_accuracy": 0.9246494829654693,
"step": 505
},
{
"epoch": 0.19626707715989994,
"grad_norm": 0.25251125640157557,
"learning_rate": 2.4519230769230768e-05,
"loss": 0.2414,
"mean_token_accuracy": 0.9234608709812164,
"step": 510
},
{
"epoch": 0.19819126419087935,
"grad_norm": 0.24599247169778568,
"learning_rate": 2.4759615384615388e-05,
"loss": 0.2374,
"mean_token_accuracy": 0.9246467411518097,
"step": 515
},
{
"epoch": 0.20011545122185875,
"grad_norm": 0.2618092608426384,
"learning_rate": 2.5e-05,
"loss": 0.2382,
"mean_token_accuracy": 0.92443066239357,
"step": 520
},
{
"epoch": 0.2020396382528382,
"grad_norm": 0.22703499557895365,
"learning_rate": 2.5240384615384614e-05,
"loss": 0.2368,
"mean_token_accuracy": 0.924897426366806,
"step": 525
},
{
"epoch": 0.2039638252838176,
"grad_norm": 0.2607950217469307,
"learning_rate": 2.548076923076923e-05,
"loss": 0.2367,
"mean_token_accuracy": 0.9249145805835723,
"step": 530
},
{
"epoch": 0.205888012314797,
"grad_norm": 0.24761827539633852,
"learning_rate": 2.5721153846153844e-05,
"loss": 0.2359,
"mean_token_accuracy": 0.924891984462738,
"step": 535
},
{
"epoch": 0.2078121993457764,
"grad_norm": 0.23623344107823752,
"learning_rate": 2.5961538461538464e-05,
"loss": 0.2335,
"mean_token_accuracy": 0.9254171848297119,
"step": 540
},
{
"epoch": 0.20973638637675582,
"grad_norm": 0.24983941011324626,
"learning_rate": 2.620192307692308e-05,
"loss": 0.2343,
"mean_token_accuracy": 0.9255397915840149,
"step": 545
},
{
"epoch": 0.21166057340773523,
"grad_norm": 0.2292795574738372,
"learning_rate": 2.6442307692307694e-05,
"loss": 0.2345,
"mean_token_accuracy": 0.9254584074020386,
"step": 550
},
{
"epoch": 0.21358476043871463,
"grad_norm": 0.2333800602809108,
"learning_rate": 2.668269230769231e-05,
"loss": 0.2292,
"mean_token_accuracy": 0.9271303296089173,
"step": 555
},
{
"epoch": 0.21550894746969407,
"grad_norm": 0.26776777652632827,
"learning_rate": 2.6923076923076923e-05,
"loss": 0.228,
"mean_token_accuracy": 0.9271810591220856,
"step": 560
},
{
"epoch": 0.21743313450067348,
"grad_norm": 0.22449836229224876,
"learning_rate": 2.7163461538461536e-05,
"loss": 0.2309,
"mean_token_accuracy": 0.9264620125293732,
"step": 565
},
{
"epoch": 0.21935732153165288,
"grad_norm": 0.2478440965545411,
"learning_rate": 2.7403846153846156e-05,
"loss": 0.2279,
"mean_token_accuracy": 0.9271829903125763,
"step": 570
},
{
"epoch": 0.2212815085626323,
"grad_norm": 0.2515161851481189,
"learning_rate": 2.7644230769230773e-05,
"loss": 0.2287,
"mean_token_accuracy": 0.9269922792911529,
"step": 575
},
{
"epoch": 0.2232056955936117,
"grad_norm": 0.2372790976207239,
"learning_rate": 2.7884615384615386e-05,
"loss": 0.2238,
"mean_token_accuracy": 0.9283542573451996,
"step": 580
},
{
"epoch": 0.2251298826245911,
"grad_norm": 0.2308583498822988,
"learning_rate": 2.8125000000000003e-05,
"loss": 0.2245,
"mean_token_accuracy": 0.9282214522361756,
"step": 585
},
{
"epoch": 0.2270540696555705,
"grad_norm": 0.23936964659736945,
"learning_rate": 2.8365384615384616e-05,
"loss": 0.2256,
"mean_token_accuracy": 0.9280098855495453,
"step": 590
},
{
"epoch": 0.22897825668654992,
"grad_norm": 0.23065855672205646,
"learning_rate": 2.860576923076923e-05,
"loss": 0.2264,
"mean_token_accuracy": 0.9275277733802796,
"step": 595
},
{
"epoch": 0.23090244371752935,
"grad_norm": 0.23666991823944167,
"learning_rate": 2.8846153846153845e-05,
"loss": 0.2279,
"mean_token_accuracy": 0.927252185344696,
"step": 600
},
{
"epoch": 0.23282663074850876,
"grad_norm": 0.2362324188657187,
"learning_rate": 2.9086538461538465e-05,
"loss": 0.2213,
"mean_token_accuracy": 0.9293029963970184,
"step": 605
},
{
"epoch": 0.23475081777948817,
"grad_norm": 0.23060183587573338,
"learning_rate": 2.932692307692308e-05,
"loss": 0.2242,
"mean_token_accuracy": 0.9282744646072387,
"step": 610
},
{
"epoch": 0.23667500481046758,
"grad_norm": 0.21245991724386762,
"learning_rate": 2.9567307692307695e-05,
"loss": 0.2233,
"mean_token_accuracy": 0.9289264142513275,
"step": 615
},
{
"epoch": 0.23859919184144698,
"grad_norm": 0.23324459763738273,
"learning_rate": 2.9807692307692308e-05,
"loss": 0.2206,
"mean_token_accuracy": 0.9290121555328369,
"step": 620
},
{
"epoch": 0.2405233788724264,
"grad_norm": 0.22128479385792282,
"learning_rate": 3.0048076923076925e-05,
"loss": 0.2208,
"mean_token_accuracy": 0.9296835780143737,
"step": 625
},
{
"epoch": 0.2424475659034058,
"grad_norm": 0.22415876113873653,
"learning_rate": 3.0288461538461538e-05,
"loss": 0.219,
"mean_token_accuracy": 0.929944384098053,
"step": 630
},
{
"epoch": 0.24437175293438523,
"grad_norm": 0.23998099576004747,
"learning_rate": 3.052884615384616e-05,
"loss": 0.2186,
"mean_token_accuracy": 0.9297797441482544,
"step": 635
},
{
"epoch": 0.24629593996536464,
"grad_norm": 0.23903304478335358,
"learning_rate": 3.0769230769230774e-05,
"loss": 0.2191,
"mean_token_accuracy": 0.9300007164478302,
"step": 640
},
{
"epoch": 0.24822012699634405,
"grad_norm": 0.21012421644185675,
"learning_rate": 3.1009615384615384e-05,
"loss": 0.2179,
"mean_token_accuracy": 0.9301288604736329,
"step": 645
},
{
"epoch": 0.25014431402732346,
"grad_norm": 0.21025751764366593,
"learning_rate": 3.125e-05,
"loss": 0.2204,
"mean_token_accuracy": 0.9291583478450776,
"step": 650
},
{
"epoch": 0.25206850105830286,
"grad_norm": 0.20781743259005545,
"learning_rate": 3.149038461538462e-05,
"loss": 0.2122,
"mean_token_accuracy": 0.9316769897937774,
"step": 655
},
{
"epoch": 0.25399268808928227,
"grad_norm": 0.23423569885618364,
"learning_rate": 3.1730769230769234e-05,
"loss": 0.2161,
"mean_token_accuracy": 0.9302766799926758,
"step": 660
},
{
"epoch": 0.2559168751202617,
"grad_norm": 0.2618434429763953,
"learning_rate": 3.1971153846153843e-05,
"loss": 0.2172,
"mean_token_accuracy": 0.9301712214946747,
"step": 665
},
{
"epoch": 0.2578410621512411,
"grad_norm": 0.21344866448076094,
"learning_rate": 3.221153846153847e-05,
"loss": 0.2143,
"mean_token_accuracy": 0.9310815393924713,
"step": 670
},
{
"epoch": 0.2597652491822205,
"grad_norm": 0.19740495346391507,
"learning_rate": 3.2451923076923077e-05,
"loss": 0.2149,
"mean_token_accuracy": 0.930985963344574,
"step": 675
},
{
"epoch": 0.2616894362131999,
"grad_norm": 0.21087424388155931,
"learning_rate": 3.269230769230769e-05,
"loss": 0.2155,
"mean_token_accuracy": 0.9308305561542511,
"step": 680
},
{
"epoch": 0.2636136232441793,
"grad_norm": 0.22783958767619705,
"learning_rate": 3.293269230769231e-05,
"loss": 0.2188,
"mean_token_accuracy": 0.9300263047218322,
"step": 685
},
{
"epoch": 0.26553781027515877,
"grad_norm": 0.20761301453893283,
"learning_rate": 3.3173076923076926e-05,
"loss": 0.2165,
"mean_token_accuracy": 0.9306348443031311,
"step": 690
},
{
"epoch": 0.2674619973061382,
"grad_norm": 0.21496205447404498,
"learning_rate": 3.3413461538461536e-05,
"loss": 0.2129,
"mean_token_accuracy": 0.9316029012203216,
"step": 695
},
{
"epoch": 0.2693861843371176,
"grad_norm": 0.206245562580476,
"learning_rate": 3.365384615384616e-05,
"loss": 0.2134,
"mean_token_accuracy": 0.9316189765930176,
"step": 700
},
{
"epoch": 0.271310371368097,
"grad_norm": 0.20802188731768906,
"learning_rate": 3.3894230769230776e-05,
"loss": 0.2117,
"mean_token_accuracy": 0.9319843292236328,
"step": 705
},
{
"epoch": 0.2732345583990764,
"grad_norm": 0.2009145459195632,
"learning_rate": 3.4134615384615386e-05,
"loss": 0.2139,
"mean_token_accuracy": 0.9317736685276031,
"step": 710
},
{
"epoch": 0.2751587454300558,
"grad_norm": 0.20069344484919666,
"learning_rate": 3.4375e-05,
"loss": 0.2116,
"mean_token_accuracy": 0.9322500884532928,
"step": 715
},
{
"epoch": 0.2770829324610352,
"grad_norm": 0.20952627537217056,
"learning_rate": 3.461538461538462e-05,
"loss": 0.2124,
"mean_token_accuracy": 0.9313783466815948,
"step": 720
},
{
"epoch": 0.2790071194920146,
"grad_norm": 0.20442808485359995,
"learning_rate": 3.485576923076923e-05,
"loss": 0.2092,
"mean_token_accuracy": 0.9327110588550568,
"step": 725
},
{
"epoch": 0.28093130652299403,
"grad_norm": 0.19161824037544672,
"learning_rate": 3.5096153846153845e-05,
"loss": 0.2103,
"mean_token_accuracy": 0.9320014178752899,
"step": 730
},
{
"epoch": 0.28285549355397344,
"grad_norm": 0.20837842647969745,
"learning_rate": 3.533653846153847e-05,
"loss": 0.2117,
"mean_token_accuracy": 0.9322972476482392,
"step": 735
},
{
"epoch": 0.28477968058495284,
"grad_norm": 0.21822156699986803,
"learning_rate": 3.557692307692308e-05,
"loss": 0.2146,
"mean_token_accuracy": 0.9314618885517121,
"step": 740
},
{
"epoch": 0.28670386761593225,
"grad_norm": 0.22156113384687548,
"learning_rate": 3.5817307692307695e-05,
"loss": 0.2101,
"mean_token_accuracy": 0.9324331998825073,
"step": 745
},
{
"epoch": 0.28862805464691166,
"grad_norm": 0.20641479681027783,
"learning_rate": 3.605769230769231e-05,
"loss": 0.2138,
"mean_token_accuracy": 0.9312689363956451,
"step": 750
},
{
"epoch": 0.29055224167789107,
"grad_norm": 0.23309235000114525,
"learning_rate": 3.629807692307692e-05,
"loss": 0.2096,
"mean_token_accuracy": 0.9329419672489166,
"step": 755
},
{
"epoch": 0.29247642870887053,
"grad_norm": 0.21847662656225478,
"learning_rate": 3.653846153846154e-05,
"loss": 0.2131,
"mean_token_accuracy": 0.9315805315971375,
"step": 760
},
{
"epoch": 0.29440061573984994,
"grad_norm": 0.18839750574033276,
"learning_rate": 3.677884615384616e-05,
"loss": 0.2105,
"mean_token_accuracy": 0.9323232233524322,
"step": 765
},
{
"epoch": 0.29632480277082934,
"grad_norm": 0.21122122983313732,
"learning_rate": 3.701923076923077e-05,
"loss": 0.2078,
"mean_token_accuracy": 0.9330488741397858,
"step": 770
},
{
"epoch": 0.29824898980180875,
"grad_norm": 0.21721244243557053,
"learning_rate": 3.725961538461539e-05,
"loss": 0.207,
"mean_token_accuracy": 0.9333645045757294,
"step": 775
},
{
"epoch": 0.30017317683278816,
"grad_norm": 0.20222043253832608,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.2078,
"mean_token_accuracy": 0.9331040740013122,
"step": 780
},
{
"epoch": 0.30209736386376757,
"grad_norm": 0.19877908114652662,
"learning_rate": 3.774038461538461e-05,
"loss": 0.2099,
"mean_token_accuracy": 0.9326650083065033,
"step": 785
},
{
"epoch": 0.304021550894747,
"grad_norm": 0.19472736523917117,
"learning_rate": 3.798076923076923e-05,
"loss": 0.2092,
"mean_token_accuracy": 0.9328258693218231,
"step": 790
},
{
"epoch": 0.3059457379257264,
"grad_norm": 0.19321786695920545,
"learning_rate": 3.8221153846153846e-05,
"loss": 0.206,
"mean_token_accuracy": 0.9335503697395324,
"step": 795
},
{
"epoch": 0.3078699249567058,
"grad_norm": 0.18972433432584948,
"learning_rate": 3.846153846153846e-05,
"loss": 0.2088,
"mean_token_accuracy": 0.9328764796257019,
"step": 800
},
{
"epoch": 0.3097941119876852,
"grad_norm": 0.1923307709214479,
"learning_rate": 3.870192307692308e-05,
"loss": 0.2099,
"mean_token_accuracy": 0.9327602207660675,
"step": 805
},
{
"epoch": 0.3117182990186646,
"grad_norm": 0.18143925889012769,
"learning_rate": 3.8942307692307696e-05,
"loss": 0.2076,
"mean_token_accuracy": 0.9332870185375214,
"step": 810
},
{
"epoch": 0.313642486049644,
"grad_norm": 0.18503423559580173,
"learning_rate": 3.918269230769231e-05,
"loss": 0.2058,
"mean_token_accuracy": 0.9337727010250092,
"step": 815
},
{
"epoch": 0.3155666730806234,
"grad_norm": 0.19520954027014065,
"learning_rate": 3.942307692307692e-05,
"loss": 0.2058,
"mean_token_accuracy": 0.9339201748371124,
"step": 820
},
{
"epoch": 0.3174908601116028,
"grad_norm": 0.19288588146770536,
"learning_rate": 3.966346153846154e-05,
"loss": 0.2045,
"mean_token_accuracy": 0.9341778755187988,
"step": 825
},
{
"epoch": 0.31941504714258223,
"grad_norm": 0.1978389444065355,
"learning_rate": 3.9903846153846155e-05,
"loss": 0.2067,
"mean_token_accuracy": 0.9330973982810974,
"step": 830
},
{
"epoch": 0.3213392341735617,
"grad_norm": 0.19378571981651038,
"learning_rate": 4.014423076923077e-05,
"loss": 0.2063,
"mean_token_accuracy": 0.9334518015384674,
"step": 835
},
{
"epoch": 0.3232634212045411,
"grad_norm": 0.19124108346964624,
"learning_rate": 4.038461538461539e-05,
"loss": 0.2064,
"mean_token_accuracy": 0.9336638748645782,
"step": 840
},
{
"epoch": 0.3251876082355205,
"grad_norm": 0.18622997253186024,
"learning_rate": 4.0625000000000005e-05,
"loss": 0.2052,
"mean_token_accuracy": 0.933771300315857,
"step": 845
},
{
"epoch": 0.3271117952664999,
"grad_norm": 0.18047500059188085,
"learning_rate": 4.0865384615384615e-05,
"loss": 0.2036,
"mean_token_accuracy": 0.9343971490859986,
"step": 850
},
{
"epoch": 0.3290359822974793,
"grad_norm": 0.17877914026319977,
"learning_rate": 4.110576923076923e-05,
"loss": 0.2048,
"mean_token_accuracy": 0.9344187080860138,
"step": 855
},
{
"epoch": 0.33096016932845873,
"grad_norm": 0.18446937805529498,
"learning_rate": 4.134615384615385e-05,
"loss": 0.202,
"mean_token_accuracy": 0.9348318338394165,
"step": 860
},
{
"epoch": 0.33288435635943814,
"grad_norm": 0.1780966137250119,
"learning_rate": 4.1586538461538464e-05,
"loss": 0.2032,
"mean_token_accuracy": 0.9346541225910187,
"step": 865
},
{
"epoch": 0.33480854339041755,
"grad_norm": 0.18714951188569887,
"learning_rate": 4.182692307692308e-05,
"loss": 0.2018,
"mean_token_accuracy": 0.9349911212921143,
"step": 870
},
{
"epoch": 0.33673273042139695,
"grad_norm": 0.18414606574133185,
"learning_rate": 4.20673076923077e-05,
"loss": 0.2068,
"mean_token_accuracy": 0.9334247648715973,
"step": 875
},
{
"epoch": 0.33865691745237636,
"grad_norm": 0.1890699417514011,
"learning_rate": 4.230769230769231e-05,
"loss": 0.2022,
"mean_token_accuracy": 0.9347915649414062,
"step": 880
},
{
"epoch": 0.34058110448335577,
"grad_norm": 0.1952117855015522,
"learning_rate": 4.2548076923076924e-05,
"loss": 0.2044,
"mean_token_accuracy": 0.9339782655239105,
"step": 885
},
{
"epoch": 0.3425052915143352,
"grad_norm": 0.17922877325513092,
"learning_rate": 4.278846153846154e-05,
"loss": 0.2036,
"mean_token_accuracy": 0.93400137424469,
"step": 890
},
{
"epoch": 0.3444294785453146,
"grad_norm": 0.19127546209858898,
"learning_rate": 4.302884615384616e-05,
"loss": 0.2023,
"mean_token_accuracy": 0.9349508583545685,
"step": 895
},
{
"epoch": 0.346353665576294,
"grad_norm": 0.18030007018769648,
"learning_rate": 4.326923076923077e-05,
"loss": 0.2025,
"mean_token_accuracy": 0.9346420764923096,
"step": 900
},
{
"epoch": 0.34827785260727345,
"grad_norm": 0.18714905196078213,
"learning_rate": 4.350961538461539e-05,
"loss": 0.2019,
"mean_token_accuracy": 0.9350101053714752,
"step": 905
},
{
"epoch": 0.35020203963825286,
"grad_norm": 0.18031917297695335,
"learning_rate": 4.375e-05,
"loss": 0.2038,
"mean_token_accuracy": 0.9343986690044404,
"step": 910
},
{
"epoch": 0.35212622666923227,
"grad_norm": 0.17658313934751513,
"learning_rate": 4.3990384615384616e-05,
"loss": 0.2048,
"mean_token_accuracy": 0.934125417470932,
"step": 915
},
{
"epoch": 0.3540504137002117,
"grad_norm": 0.1853217964006783,
"learning_rate": 4.423076923076923e-05,
"loss": 0.1997,
"mean_token_accuracy": 0.9355320036411285,
"step": 920
},
{
"epoch": 0.3559746007311911,
"grad_norm": 0.18337184641002502,
"learning_rate": 4.447115384615384e-05,
"loss": 0.2011,
"mean_token_accuracy": 0.9351063668727875,
"step": 925
},
{
"epoch": 0.3578987877621705,
"grad_norm": 0.18647822815016663,
"learning_rate": 4.4711538461538466e-05,
"loss": 0.2079,
"mean_token_accuracy": 0.9331667721271515,
"step": 930
},
{
"epoch": 0.3598229747931499,
"grad_norm": 0.1786427996544572,
"learning_rate": 4.495192307692308e-05,
"loss": 0.2063,
"mean_token_accuracy": 0.933509111404419,
"step": 935
},
{
"epoch": 0.3617471618241293,
"grad_norm": 0.17995728376539782,
"learning_rate": 4.519230769230769e-05,
"loss": 0.2056,
"mean_token_accuracy": 0.9337765216827393,
"step": 940
},
{
"epoch": 0.3636713488551087,
"grad_norm": 0.18565188071732694,
"learning_rate": 4.543269230769231e-05,
"loss": 0.2033,
"mean_token_accuracy": 0.9345883190631866,
"step": 945
},
{
"epoch": 0.3655955358860881,
"grad_norm": 0.20682212909883085,
"learning_rate": 4.5673076923076925e-05,
"loss": 0.201,
"mean_token_accuracy": 0.935194593667984,
"step": 950
},
{
"epoch": 0.3675197229170675,
"grad_norm": 0.17087885109099807,
"learning_rate": 4.591346153846154e-05,
"loss": 0.2023,
"mean_token_accuracy": 0.9346142172813415,
"step": 955
},
{
"epoch": 0.36944390994804693,
"grad_norm": 0.17283361779419767,
"learning_rate": 4.615384615384616e-05,
"loss": 0.2014,
"mean_token_accuracy": 0.934814327955246,
"step": 960
},
{
"epoch": 0.37136809697902634,
"grad_norm": 0.17663705053214648,
"learning_rate": 4.6394230769230775e-05,
"loss": 0.2004,
"mean_token_accuracy": 0.9352030277252197,
"step": 965
},
{
"epoch": 0.37329228401000575,
"grad_norm": 0.17188964913509067,
"learning_rate": 4.6634615384615384e-05,
"loss": 0.2035,
"mean_token_accuracy": 0.9344177782535553,
"step": 970
},
{
"epoch": 0.3752164710409852,
"grad_norm": 0.17299384159996276,
"learning_rate": 4.6875e-05,
"loss": 0.1993,
"mean_token_accuracy": 0.935628992319107,
"step": 975
},
{
"epoch": 0.3771406580719646,
"grad_norm": 0.1787796731368517,
"learning_rate": 4.711538461538462e-05,
"loss": 0.1996,
"mean_token_accuracy": 0.9355967044830322,
"step": 980
},
{
"epoch": 0.379064845102944,
"grad_norm": 0.17902949352868802,
"learning_rate": 4.7355769230769234e-05,
"loss": 0.1998,
"mean_token_accuracy": 0.9354398012161255,
"step": 985
},
{
"epoch": 0.38098903213392343,
"grad_norm": 0.18172066960666447,
"learning_rate": 4.7596153846153844e-05,
"loss": 0.2011,
"mean_token_accuracy": 0.9351277530193329,
"step": 990
},
{
"epoch": 0.38291321916490284,
"grad_norm": 0.18052681549380922,
"learning_rate": 4.783653846153847e-05,
"loss": 0.1991,
"mean_token_accuracy": 0.9355564057826996,
"step": 995
},
{
"epoch": 0.38483740619588225,
"grad_norm": 0.17224435083974163,
"learning_rate": 4.8076923076923084e-05,
"loss": 0.2009,
"mean_token_accuracy": 0.9353624939918518,
"step": 1000
},
{
"epoch": 0.38676159322686166,
"grad_norm": 0.18548492282198598,
"learning_rate": 4.8317307692307693e-05,
"loss": 0.2008,
"mean_token_accuracy": 0.9351412355899811,
"step": 1005
},
{
"epoch": 0.38868578025784106,
"grad_norm": 0.17377556409194556,
"learning_rate": 4.855769230769231e-05,
"loss": 0.2007,
"mean_token_accuracy": 0.9352908134460449,
"step": 1010
},
{
"epoch": 0.39060996728882047,
"grad_norm": 0.1773864121321377,
"learning_rate": 4.8798076923076926e-05,
"loss": 0.2005,
"mean_token_accuracy": 0.9353309094905853,
"step": 1015
},
{
"epoch": 0.3925341543197999,
"grad_norm": 0.1731017926113393,
"learning_rate": 4.9038461538461536e-05,
"loss": 0.2025,
"mean_token_accuracy": 0.934537136554718,
"step": 1020
},
{
"epoch": 0.3944583413507793,
"grad_norm": 0.16644220626271233,
"learning_rate": 4.927884615384616e-05,
"loss": 0.1976,
"mean_token_accuracy": 0.9359455347061157,
"step": 1025
},
{
"epoch": 0.3963825283817587,
"grad_norm": 0.17009248714497768,
"learning_rate": 4.9519230769230776e-05,
"loss": 0.2023,
"mean_token_accuracy": 0.934699285030365,
"step": 1030
},
{
"epoch": 0.3983067154127381,
"grad_norm": 0.17340750114798312,
"learning_rate": 4.9759615384615386e-05,
"loss": 0.1972,
"mean_token_accuracy": 0.9362557351589202,
"step": 1035
},
{
"epoch": 0.4002309024437175,
"grad_norm": 0.17122966366041767,
"learning_rate": 5e-05,
"loss": 0.1985,
"mean_token_accuracy": 0.935871708393097,
"step": 1040
},
{
"epoch": 0.4021550894746969,
"grad_norm": 0.17659027782667022,
"learning_rate": 4.999996826173528e-05,
"loss": 0.1995,
"mean_token_accuracy": 0.9355424761772155,
"step": 1045
},
{
"epoch": 0.4040792765056764,
"grad_norm": 0.18105261227658284,
"learning_rate": 4.999987304703068e-05,
"loss": 0.2029,
"mean_token_accuracy": 0.9345368921756745,
"step": 1050
},
{
"epoch": 0.4060034635366558,
"grad_norm": 0.17884616958670743,
"learning_rate": 4.999971435615479e-05,
"loss": 0.2037,
"mean_token_accuracy": 0.9342390179634095,
"step": 1055
},
{
"epoch": 0.4079276505676352,
"grad_norm": 0.17233022523598526,
"learning_rate": 4.999949218955533e-05,
"loss": 0.2001,
"mean_token_accuracy": 0.9354979634284973,
"step": 1060
},
{
"epoch": 0.4098518375986146,
"grad_norm": 0.16780066324475737,
"learning_rate": 4.999920654785905e-05,
"loss": 0.1982,
"mean_token_accuracy": 0.9359723389148712,
"step": 1065
},
{
"epoch": 0.411776024629594,
"grad_norm": 0.16408063918538596,
"learning_rate": 4.999885743187181e-05,
"loss": 0.1969,
"mean_token_accuracy": 0.9364181160926819,
"step": 1070
},
{
"epoch": 0.4137002116605734,
"grad_norm": 0.16436131857729716,
"learning_rate": 4.9998444842578536e-05,
"loss": 0.197,
"mean_token_accuracy": 0.9364462256431579,
"step": 1075
},
{
"epoch": 0.4156243986915528,
"grad_norm": 0.1649527103012933,
"learning_rate": 4.999796878114321e-05,
"loss": 0.2003,
"mean_token_accuracy": 0.9352196276187896,
"step": 1080
},
{
"epoch": 0.41754858572253223,
"grad_norm": 0.1753592212006608,
"learning_rate": 4.9997429248908874e-05,
"loss": 0.2027,
"mean_token_accuracy": 0.9347833514213562,
"step": 1085
},
{
"epoch": 0.41947277275351164,
"grad_norm": 0.1721618475449869,
"learning_rate": 4.999682624739765e-05,
"loss": 0.1957,
"mean_token_accuracy": 0.9364085972309113,
"step": 1090
},
{
"epoch": 0.42139695978449104,
"grad_norm": 0.15782787279679403,
"learning_rate": 4.9996159778310734e-05,
"loss": 0.2024,
"mean_token_accuracy": 0.9344262480735779,
"step": 1095
},
{
"epoch": 0.42332114681547045,
"grad_norm": 0.16736454601981463,
"learning_rate": 4.9995429843528316e-05,
"loss": 0.2025,
"mean_token_accuracy": 0.9348289132118225,
"step": 1100
},
{
"epoch": 0.42524533384644986,
"grad_norm": 0.16222474872691323,
"learning_rate": 4.999463644510971e-05,
"loss": 0.2023,
"mean_token_accuracy": 0.9346409559249877,
"step": 1105
},
{
"epoch": 0.42716952087742927,
"grad_norm": 0.15986614324171702,
"learning_rate": 4.999377958529322e-05,
"loss": 0.2,
"mean_token_accuracy": 0.9356332302093506,
"step": 1110
},
{
"epoch": 0.4290937079084087,
"grad_norm": 0.16333128122988397,
"learning_rate": 4.99928592664962e-05,
"loss": 0.1979,
"mean_token_accuracy": 0.9361936330795289,
"step": 1115
},
{
"epoch": 0.43101789493938814,
"grad_norm": 0.16227308161461754,
"learning_rate": 4.9991875491315034e-05,
"loss": 0.1972,
"mean_token_accuracy": 0.9363140523433685,
"step": 1120
},
{
"epoch": 0.43294208197036754,
"grad_norm": 0.1713591306571476,
"learning_rate": 4.999082826252513e-05,
"loss": 0.197,
"mean_token_accuracy": 0.9363386750221252,
"step": 1125
},
{
"epoch": 0.43486626900134695,
"grad_norm": 0.15990954791736506,
"learning_rate": 4.9989717583080906e-05,
"loss": 0.1972,
"mean_token_accuracy": 0.9362068951129914,
"step": 1130
},
{
"epoch": 0.43679045603232636,
"grad_norm": 0.1737279224848079,
"learning_rate": 4.998854345611579e-05,
"loss": 0.1956,
"mean_token_accuracy": 0.9366493225097656,
"step": 1135
},
{
"epoch": 0.43871464306330576,
"grad_norm": 0.1562327893766701,
"learning_rate": 4.998730588494221e-05,
"loss": 0.1971,
"mean_token_accuracy": 0.9364685654640198,
"step": 1140
},
{
"epoch": 0.44063883009428517,
"grad_norm": 0.1588473310407234,
"learning_rate": 4.998600487305156e-05,
"loss": 0.1948,
"mean_token_accuracy": 0.9368900418281555,
"step": 1145
},
{
"epoch": 0.4425630171252646,
"grad_norm": 0.17995519422150538,
"learning_rate": 4.998464042411424e-05,
"loss": 0.1991,
"mean_token_accuracy": 0.9357859253883362,
"step": 1150
},
{
"epoch": 0.444487204156244,
"grad_norm": 0.16926915405066503,
"learning_rate": 4.9983212541979594e-05,
"loss": 0.1971,
"mean_token_accuracy": 0.9361749947071075,
"step": 1155
},
{
"epoch": 0.4464113911872234,
"grad_norm": 0.16163021262186142,
"learning_rate": 4.998172123067595e-05,
"loss": 0.1956,
"mean_token_accuracy": 0.936591511964798,
"step": 1160
},
{
"epoch": 0.4483355782182028,
"grad_norm": 0.15952963787166335,
"learning_rate": 4.9980166494410556e-05,
"loss": 0.1929,
"mean_token_accuracy": 0.937582665681839,
"step": 1165
},
{
"epoch": 0.4502597652491822,
"grad_norm": 0.16284192484187074,
"learning_rate": 4.99785483375696e-05,
"loss": 0.1971,
"mean_token_accuracy": 0.9360791742801666,
"step": 1170
},
{
"epoch": 0.4521839522801616,
"grad_norm": 0.15441856261654946,
"learning_rate": 4.99768667647182e-05,
"loss": 0.1922,
"mean_token_accuracy": 0.9375222563743592,
"step": 1175
},
{
"epoch": 0.454108139311141,
"grad_norm": 0.16405561020180287,
"learning_rate": 4.9975121780600356e-05,
"loss": 0.1965,
"mean_token_accuracy": 0.9364106237888337,
"step": 1180
},
{
"epoch": 0.45603232634212043,
"grad_norm": 0.14887241164649884,
"learning_rate": 4.9973313390138985e-05,
"loss": 0.1947,
"mean_token_accuracy": 0.9369799256324768,
"step": 1185
},
{
"epoch": 0.45795651337309984,
"grad_norm": 0.15910575747869182,
"learning_rate": 4.9971441598435905e-05,
"loss": 0.1997,
"mean_token_accuracy": 0.9359139621257782,
"step": 1190
},
{
"epoch": 0.4598807004040793,
"grad_norm": 0.1596570148547143,
"learning_rate": 4.996950641077174e-05,
"loss": 0.1938,
"mean_token_accuracy": 0.9371317565441132,
"step": 1195
},
{
"epoch": 0.4618048874350587,
"grad_norm": 0.1542919586208301,
"learning_rate": 4.996750783260602e-05,
"loss": 0.1972,
"mean_token_accuracy": 0.9365085899829865,
"step": 1200
},
{
"epoch": 0.4637290744660381,
"grad_norm": 0.1554002350742973,
"learning_rate": 4.9965445869577106e-05,
"loss": 0.193,
"mean_token_accuracy": 0.9373429834842681,
"step": 1205
},
{
"epoch": 0.4656532614970175,
"grad_norm": 0.15006275457030324,
"learning_rate": 4.996332052750214e-05,
"loss": 0.1931,
"mean_token_accuracy": 0.9375267207622529,
"step": 1210
},
{
"epoch": 0.46757744852799693,
"grad_norm": 0.15502590460532487,
"learning_rate": 4.99611318123771e-05,
"loss": 0.1941,
"mean_token_accuracy": 0.9373384654521942,
"step": 1215
},
{
"epoch": 0.46950163555897634,
"grad_norm": 0.14854596494688704,
"learning_rate": 4.9958879730376754e-05,
"loss": 0.1926,
"mean_token_accuracy": 0.9373452723026275,
"step": 1220
},
{
"epoch": 0.47142582258995575,
"grad_norm": 0.14988651130185893,
"learning_rate": 4.995656428785461e-05,
"loss": 0.198,
"mean_token_accuracy": 0.936055588722229,
"step": 1225
},
{
"epoch": 0.47335000962093515,
"grad_norm": 0.15044586479128982,
"learning_rate": 4.995418549134296e-05,
"loss": 0.194,
"mean_token_accuracy": 0.937198007106781,
"step": 1230
},
{
"epoch": 0.47527419665191456,
"grad_norm": 0.14304018627938955,
"learning_rate": 4.995174334755281e-05,
"loss": 0.1913,
"mean_token_accuracy": 0.9380493998527527,
"step": 1235
},
{
"epoch": 0.47719838368289397,
"grad_norm": 0.15847240603024573,
"learning_rate": 4.994923786337389e-05,
"loss": 0.1941,
"mean_token_accuracy": 0.9369276583194732,
"step": 1240
},
{
"epoch": 0.4791225707138734,
"grad_norm": 0.15050361963809947,
"learning_rate": 4.9946669045874616e-05,
"loss": 0.1965,
"mean_token_accuracy": 0.9365364372730255,
"step": 1245
},
{
"epoch": 0.4810467577448528,
"grad_norm": 0.15143997168073883,
"learning_rate": 4.994403690230208e-05,
"loss": 0.1952,
"mean_token_accuracy": 0.9370022714138031,
"step": 1250
},
{
"epoch": 0.4829709447758322,
"grad_norm": 0.14721494174882732,
"learning_rate": 4.994134144008203e-05,
"loss": 0.1924,
"mean_token_accuracy": 0.9377739369869232,
"step": 1255
},
{
"epoch": 0.4848951318068116,
"grad_norm": 0.15299254663295053,
"learning_rate": 4.993858266681885e-05,
"loss": 0.194,
"mean_token_accuracy": 0.9371655225753784,
"step": 1260
},
{
"epoch": 0.48681931883779106,
"grad_norm": 0.15807599202373432,
"learning_rate": 4.9935760590295534e-05,
"loss": 0.1923,
"mean_token_accuracy": 0.9380616784095764,
"step": 1265
},
{
"epoch": 0.48874350586877047,
"grad_norm": 0.14790047243010532,
"learning_rate": 4.9932875218473666e-05,
"loss": 0.1921,
"mean_token_accuracy": 0.9376394093036652,
"step": 1270
},
{
"epoch": 0.4906676928997499,
"grad_norm": 0.15177558035884425,
"learning_rate": 4.992992655949339e-05,
"loss": 0.193,
"mean_token_accuracy": 0.9373527705669403,
"step": 1275
},
{
"epoch": 0.4925918799307293,
"grad_norm": 0.15030494230476282,
"learning_rate": 4.992691462167342e-05,
"loss": 0.1952,
"mean_token_accuracy": 0.93678218126297,
"step": 1280
},
{
"epoch": 0.4945160669617087,
"grad_norm": 0.14934065421375978,
"learning_rate": 4.992383941351094e-05,
"loss": 0.1924,
"mean_token_accuracy": 0.9374918937683105,
"step": 1285
},
{
"epoch": 0.4964402539926881,
"grad_norm": 0.14795547731798583,
"learning_rate": 4.9920700943681695e-05,
"loss": 0.1918,
"mean_token_accuracy": 0.937732708454132,
"step": 1290
},
{
"epoch": 0.4983644410236675,
"grad_norm": 0.14706674754198268,
"learning_rate": 4.991749922103984e-05,
"loss": 0.1937,
"mean_token_accuracy": 0.9371985256671905,
"step": 1295
},
{
"epoch": 0.5002886280546469,
"grad_norm": 0.18209623021883456,
"learning_rate": 4.991423425461804e-05,
"loss": 0.1925,
"mean_token_accuracy": 0.9376034200191498,
"step": 1300
},
{
"epoch": 0.5022128150856263,
"grad_norm": 0.14998754818873625,
"learning_rate": 4.991090605362733e-05,
"loss": 0.1952,
"mean_token_accuracy": 0.9369374454021454,
"step": 1305
},
{
"epoch": 0.5041370021166057,
"grad_norm": 0.14623146138818685,
"learning_rate": 4.990751462745717e-05,
"loss": 0.1925,
"mean_token_accuracy": 0.9375016152858734,
"step": 1310
},
{
"epoch": 0.5060611891475851,
"grad_norm": 0.14877961023165806,
"learning_rate": 4.990405998567537e-05,
"loss": 0.1942,
"mean_token_accuracy": 0.9372852742671967,
"step": 1315
},
{
"epoch": 0.5079853761785645,
"grad_norm": 0.14508285936489035,
"learning_rate": 4.99005421380281e-05,
"loss": 0.1942,
"mean_token_accuracy": 0.9370539069175721,
"step": 1320
},
{
"epoch": 0.509909563209544,
"grad_norm": 0.14621668211521224,
"learning_rate": 4.9896961094439844e-05,
"loss": 0.1921,
"mean_token_accuracy": 0.9378675162792206,
"step": 1325
},
{
"epoch": 0.5118337502405234,
"grad_norm": 0.14679732692209072,
"learning_rate": 4.989331686501335e-05,
"loss": 0.1878,
"mean_token_accuracy": 0.9390625476837158,
"step": 1330
},
{
"epoch": 0.5137579372715028,
"grad_norm": 0.14745685811150153,
"learning_rate": 4.9889609460029654e-05,
"loss": 0.1905,
"mean_token_accuracy": 0.9380582571029663,
"step": 1335
},
{
"epoch": 0.5156821243024822,
"grad_norm": 0.14611296052225958,
"learning_rate": 4.988583888994802e-05,
"loss": 0.1899,
"mean_token_accuracy": 0.938438081741333,
"step": 1340
},
{
"epoch": 0.5176063113334616,
"grad_norm": 0.15273461896563223,
"learning_rate": 4.9882005165405885e-05,
"loss": 0.1932,
"mean_token_accuracy": 0.9374414086341858,
"step": 1345
},
{
"epoch": 0.519530498364441,
"grad_norm": 0.1440988164125214,
"learning_rate": 4.987810829721887e-05,
"loss": 0.192,
"mean_token_accuracy": 0.9377279877662659,
"step": 1350
},
{
"epoch": 0.5214546853954204,
"grad_norm": 0.14678789036749104,
"learning_rate": 4.9874148296380754e-05,
"loss": 0.1936,
"mean_token_accuracy": 0.9375370383262634,
"step": 1355
},
{
"epoch": 0.5233788724263998,
"grad_norm": 0.14293580047102425,
"learning_rate": 4.9870125174063384e-05,
"loss": 0.1901,
"mean_token_accuracy": 0.9380953311920166,
"step": 1360
},
{
"epoch": 0.5253030594573792,
"grad_norm": 0.14829079991813543,
"learning_rate": 4.9866038941616736e-05,
"loss": 0.1891,
"mean_token_accuracy": 0.9382195234298706,
"step": 1365
},
{
"epoch": 0.5272272464883586,
"grad_norm": 0.1454512033844362,
"learning_rate": 4.986188961056879e-05,
"loss": 0.1937,
"mean_token_accuracy": 0.9372387111186982,
"step": 1370
},
{
"epoch": 0.5291514335193381,
"grad_norm": 0.138895169464844,
"learning_rate": 4.9857677192625564e-05,
"loss": 0.1906,
"mean_token_accuracy": 0.9381425142288208,
"step": 1375
},
{
"epoch": 0.5310756205503175,
"grad_norm": 0.14740272656815195,
"learning_rate": 4.9853401699671016e-05,
"loss": 0.1896,
"mean_token_accuracy": 0.9385837614536285,
"step": 1380
},
{
"epoch": 0.532999807581297,
"grad_norm": 0.14804617939261644,
"learning_rate": 4.98490631437671e-05,
"loss": 0.1924,
"mean_token_accuracy": 0.9376095533370972,
"step": 1385
},
{
"epoch": 0.5349239946122764,
"grad_norm": 0.13962868701210818,
"learning_rate": 4.9844661537153656e-05,
"loss": 0.1894,
"mean_token_accuracy": 0.9385484099388123,
"step": 1390
},
{
"epoch": 0.5368481816432558,
"grad_norm": 0.13562255479313326,
"learning_rate": 4.98401968922484e-05,
"loss": 0.1924,
"mean_token_accuracy": 0.9374591946601868,
"step": 1395
},
{
"epoch": 0.5387723686742352,
"grad_norm": 0.13681927313682837,
"learning_rate": 4.9835669221646896e-05,
"loss": 0.1907,
"mean_token_accuracy": 0.9382172763347626,
"step": 1400
},
{
"epoch": 0.5406965557052146,
"grad_norm": 0.1416451055009483,
"learning_rate": 4.983107853812252e-05,
"loss": 0.1899,
"mean_token_accuracy": 0.9381133437156677,
"step": 1405
},
{
"epoch": 0.542620742736194,
"grad_norm": 0.1398213343702262,
"learning_rate": 4.98264248546264e-05,
"loss": 0.19,
"mean_token_accuracy": 0.9381514072418213,
"step": 1410
},
{
"epoch": 0.5445449297671734,
"grad_norm": 0.14336297674820447,
"learning_rate": 4.982170818428742e-05,
"loss": 0.1924,
"mean_token_accuracy": 0.9378505229949952,
"step": 1415
},
{
"epoch": 0.5464691167981528,
"grad_norm": 0.1378930710651164,
"learning_rate": 4.981692854041215e-05,
"loss": 0.1878,
"mean_token_accuracy": 0.938957291841507,
"step": 1420
},
{
"epoch": 0.5483933038291322,
"grad_norm": 0.1543001875922398,
"learning_rate": 4.981208593648482e-05,
"loss": 0.1922,
"mean_token_accuracy": 0.937881326675415,
"step": 1425
},
{
"epoch": 0.5503174908601116,
"grad_norm": 0.14381566153232886,
"learning_rate": 4.980718038616728e-05,
"loss": 0.1897,
"mean_token_accuracy": 0.9386523187160491,
"step": 1430
},
{
"epoch": 0.552241677891091,
"grad_norm": 0.1414593617076599,
"learning_rate": 4.980221190329898e-05,
"loss": 0.1897,
"mean_token_accuracy": 0.938573569059372,
"step": 1435
},
{
"epoch": 0.5541658649220704,
"grad_norm": 0.13802418684524773,
"learning_rate": 4.979718050189688e-05,
"loss": 0.1891,
"mean_token_accuracy": 0.9385579884052276,
"step": 1440
},
{
"epoch": 0.5560900519530498,
"grad_norm": 0.13125752086195527,
"learning_rate": 4.979208619615547e-05,
"loss": 0.191,
"mean_token_accuracy": 0.9383366286754609,
"step": 1445
},
{
"epoch": 0.5580142389840292,
"grad_norm": 0.14209482273294116,
"learning_rate": 4.978692900044671e-05,
"loss": 0.1882,
"mean_token_accuracy": 0.9388597071170807,
"step": 1450
},
{
"epoch": 0.5599384260150087,
"grad_norm": 0.13255400542137394,
"learning_rate": 4.978170892931996e-05,
"loss": 0.1882,
"mean_token_accuracy": 0.9389723241329193,
"step": 1455
},
{
"epoch": 0.5618626130459881,
"grad_norm": 0.13459691303653742,
"learning_rate": 4.977642599750198e-05,
"loss": 0.1883,
"mean_token_accuracy": 0.9389943897724151,
"step": 1460
},
{
"epoch": 0.5637868000769675,
"grad_norm": 0.13445586406317245,
"learning_rate": 4.9771080219896875e-05,
"loss": 0.1897,
"mean_token_accuracy": 0.9383779525756836,
"step": 1465
},
{
"epoch": 0.5657109871079469,
"grad_norm": 0.14319192722592644,
"learning_rate": 4.976567161158603e-05,
"loss": 0.1877,
"mean_token_accuracy": 0.9391680121421814,
"step": 1470
},
{
"epoch": 0.5676351741389263,
"grad_norm": 0.14311727857762083,
"learning_rate": 4.9760200187828104e-05,
"loss": 0.19,
"mean_token_accuracy": 0.9383435606956482,
"step": 1475
},
{
"epoch": 0.5695593611699057,
"grad_norm": 0.1433601280447413,
"learning_rate": 4.9754665964058956e-05,
"loss": 0.1917,
"mean_token_accuracy": 0.9377669870853425,
"step": 1480
},
{
"epoch": 0.5714835482008851,
"grad_norm": 0.1372126840203153,
"learning_rate": 4.974906895589162e-05,
"loss": 0.1897,
"mean_token_accuracy": 0.9382838129997253,
"step": 1485
},
{
"epoch": 0.5734077352318645,
"grad_norm": 0.14135144914463524,
"learning_rate": 4.974340917911628e-05,
"loss": 0.1889,
"mean_token_accuracy": 0.9386630952358246,
"step": 1490
},
{
"epoch": 0.5753319222628439,
"grad_norm": 0.1431861008729524,
"learning_rate": 4.9737686649700154e-05,
"loss": 0.1886,
"mean_token_accuracy": 0.9389110207557678,
"step": 1495
},
{
"epoch": 0.5772561092938233,
"grad_norm": 0.134683506562364,
"learning_rate": 4.973190138378754e-05,
"loss": 0.19,
"mean_token_accuracy": 0.9384437501430511,
"step": 1500
},
{
"epoch": 0.5791802963248027,
"grad_norm": 0.13612786893218118,
"learning_rate": 4.97260533976997e-05,
"loss": 0.1915,
"mean_token_accuracy": 0.9379990577697754,
"step": 1505
},
{
"epoch": 0.5811044833557821,
"grad_norm": 0.14289619967069558,
"learning_rate": 4.972014270793485e-05,
"loss": 0.1897,
"mean_token_accuracy": 0.9383374452590942,
"step": 1510
},
{
"epoch": 0.5830286703867615,
"grad_norm": 0.13842544154199743,
"learning_rate": 4.9714169331168104e-05,
"loss": 0.1923,
"mean_token_accuracy": 0.9376774847507476,
"step": 1515
},
{
"epoch": 0.5849528574177411,
"grad_norm": 0.13006096264789718,
"learning_rate": 4.970813328425143e-05,
"loss": 0.1871,
"mean_token_accuracy": 0.9393136322498321,
"step": 1520
},
{
"epoch": 0.5868770444487205,
"grad_norm": 0.13174047857207602,
"learning_rate": 4.9702034584213605e-05,
"loss": 0.1905,
"mean_token_accuracy": 0.9378688871860504,
"step": 1525
},
{
"epoch": 0.5888012314796999,
"grad_norm": 0.1300374585549842,
"learning_rate": 4.9695873248260145e-05,
"loss": 0.1879,
"mean_token_accuracy": 0.9391686737537384,
"step": 1530
},
{
"epoch": 0.5907254185106793,
"grad_norm": 0.13007064298516266,
"learning_rate": 4.968964929377328e-05,
"loss": 0.1885,
"mean_token_accuracy": 0.9387658298015594,
"step": 1535
},
{
"epoch": 0.5926496055416587,
"grad_norm": 0.13360649826667276,
"learning_rate": 4.9683362738311913e-05,
"loss": 0.1875,
"mean_token_accuracy": 0.9391666889190674,
"step": 1540
},
{
"epoch": 0.5945737925726381,
"grad_norm": 0.13060804250585295,
"learning_rate": 4.967701359961152e-05,
"loss": 0.1889,
"mean_token_accuracy": 0.9385679483413696,
"step": 1545
},
{
"epoch": 0.5964979796036175,
"grad_norm": 0.14032541180172345,
"learning_rate": 4.9670601895584186e-05,
"loss": 0.1842,
"mean_token_accuracy": 0.9397392094135284,
"step": 1550
},
{
"epoch": 0.5984221666345969,
"grad_norm": 0.13607270266800195,
"learning_rate": 4.966412764431845e-05,
"loss": 0.1892,
"mean_token_accuracy": 0.9384610235691071,
"step": 1555
},
{
"epoch": 0.6003463536655763,
"grad_norm": 0.1312707683845405,
"learning_rate": 4.965759086407936e-05,
"loss": 0.1882,
"mean_token_accuracy": 0.9386238515377044,
"step": 1560
},
{
"epoch": 0.6022705406965557,
"grad_norm": 0.13457563564564892,
"learning_rate": 4.965099157330832e-05,
"loss": 0.185,
"mean_token_accuracy": 0.939483368396759,
"step": 1565
},
{
"epoch": 0.6041947277275351,
"grad_norm": 0.13439942039177172,
"learning_rate": 4.964432979062313e-05,
"loss": 0.1842,
"mean_token_accuracy": 0.9405307114124298,
"step": 1570
},
{
"epoch": 0.6061189147585145,
"grad_norm": 0.1277496982429766,
"learning_rate": 4.963760553481786e-05,
"loss": 0.1899,
"mean_token_accuracy": 0.9384656131267548,
"step": 1575
},
{
"epoch": 0.608043101789494,
"grad_norm": 0.1330158934788981,
"learning_rate": 4.963081882486284e-05,
"loss": 0.1834,
"mean_token_accuracy": 0.9401716351509094,
"step": 1580
},
{
"epoch": 0.6099672888204734,
"grad_norm": 0.13754542628006322,
"learning_rate": 4.96239696799046e-05,
"loss": 0.1919,
"mean_token_accuracy": 0.9377372145652771,
"step": 1585
},
{
"epoch": 0.6118914758514528,
"grad_norm": 0.14098492971953913,
"learning_rate": 4.9617058119265805e-05,
"loss": 0.1894,
"mean_token_accuracy": 0.9386878430843353,
"step": 1590
},
{
"epoch": 0.6138156628824322,
"grad_norm": 0.12664632216663757,
"learning_rate": 4.961008416244519e-05,
"loss": 0.185,
"mean_token_accuracy": 0.9396873474121094,
"step": 1595
},
{
"epoch": 0.6157398499134116,
"grad_norm": 0.13549624226878912,
"learning_rate": 4.960304782911756e-05,
"loss": 0.1863,
"mean_token_accuracy": 0.9393212020397186,
"step": 1600
},
{
"epoch": 0.617664036944391,
"grad_norm": 0.1276823140599733,
"learning_rate": 4.959594913913366e-05,
"loss": 0.1868,
"mean_token_accuracy": 0.9388953983783722,
"step": 1605
},
{
"epoch": 0.6195882239753704,
"grad_norm": 0.1336813264156418,
"learning_rate": 4.9588788112520164e-05,
"loss": 0.1845,
"mean_token_accuracy": 0.9399426996707916,
"step": 1610
},
{
"epoch": 0.6215124110063498,
"grad_norm": 0.13481862710535897,
"learning_rate": 4.958156476947961e-05,
"loss": 0.1831,
"mean_token_accuracy": 0.940336000919342,
"step": 1615
},
{
"epoch": 0.6234365980373292,
"grad_norm": 0.13324160235017124,
"learning_rate": 4.957427913039034e-05,
"loss": 0.1893,
"mean_token_accuracy": 0.9386727988719941,
"step": 1620
},
{
"epoch": 0.6253607850683086,
"grad_norm": 0.13030229625983306,
"learning_rate": 4.9566931215806464e-05,
"loss": 0.1883,
"mean_token_accuracy": 0.938975191116333,
"step": 1625
},
{
"epoch": 0.627284972099288,
"grad_norm": 0.12493763582679271,
"learning_rate": 4.955952104645775e-05,
"loss": 0.1846,
"mean_token_accuracy": 0.9397555112838745,
"step": 1630
},
{
"epoch": 0.6292091591302674,
"grad_norm": 0.12723126191253217,
"learning_rate": 4.955204864324961e-05,
"loss": 0.183,
"mean_token_accuracy": 0.9404920816421509,
"step": 1635
},
{
"epoch": 0.6311333461612468,
"grad_norm": 0.1293781315765133,
"learning_rate": 4.9544514027263034e-05,
"loss": 0.1879,
"mean_token_accuracy": 0.9391253471374512,
"step": 1640
},
{
"epoch": 0.6330575331922262,
"grad_norm": 0.13138978342222302,
"learning_rate": 4.953691721975453e-05,
"loss": 0.1867,
"mean_token_accuracy": 0.9391219913959503,
"step": 1645
},
{
"epoch": 0.6349817202232056,
"grad_norm": 0.12270187391728252,
"learning_rate": 4.9529258242156046e-05,
"loss": 0.1851,
"mean_token_accuracy": 0.9395655930042267,
"step": 1650
},
{
"epoch": 0.636905907254185,
"grad_norm": 0.12918156512326553,
"learning_rate": 4.9521537116074926e-05,
"loss": 0.1857,
"mean_token_accuracy": 0.9395742118358612,
"step": 1655
},
{
"epoch": 0.6388300942851645,
"grad_norm": 0.13972950651489827,
"learning_rate": 4.951375386329387e-05,
"loss": 0.1859,
"mean_token_accuracy": 0.9398290574550628,
"step": 1660
},
{
"epoch": 0.640754281316144,
"grad_norm": 0.12809225460490406,
"learning_rate": 4.95059085057708e-05,
"loss": 0.1873,
"mean_token_accuracy": 0.9390866160392761,
"step": 1665
},
{
"epoch": 0.6426784683471234,
"grad_norm": 0.13667531914360745,
"learning_rate": 4.949800106563889e-05,
"loss": 0.1849,
"mean_token_accuracy": 0.9396887421607971,
"step": 1670
},
{
"epoch": 0.6446026553781028,
"grad_norm": 0.13602807879743783,
"learning_rate": 4.9490031565206445e-05,
"loss": 0.1863,
"mean_token_accuracy": 0.939508056640625,
"step": 1675
},
{
"epoch": 0.6465268424090822,
"grad_norm": 0.13096626380887844,
"learning_rate": 4.948200002695685e-05,
"loss": 0.1842,
"mean_token_accuracy": 0.9400151014328003,
"step": 1680
},
{
"epoch": 0.6484510294400616,
"grad_norm": 0.12291861404114918,
"learning_rate": 4.947390647354851e-05,
"loss": 0.1873,
"mean_token_accuracy": 0.9389609873294831,
"step": 1685
},
{
"epoch": 0.650375216471041,
"grad_norm": 0.13409913540340435,
"learning_rate": 4.94657509278148e-05,
"loss": 0.1859,
"mean_token_accuracy": 0.9396482467651367,
"step": 1690
},
{
"epoch": 0.6522994035020204,
"grad_norm": 0.12281251011094463,
"learning_rate": 4.945753341276395e-05,
"loss": 0.1848,
"mean_token_accuracy": 0.939803171157837,
"step": 1695
},
{
"epoch": 0.6542235905329998,
"grad_norm": 0.1228749169492863,
"learning_rate": 4.944925395157907e-05,
"loss": 0.1875,
"mean_token_accuracy": 0.9391197860240936,
"step": 1700
},
{
"epoch": 0.6561477775639792,
"grad_norm": 0.12664527861803945,
"learning_rate": 4.944091256761798e-05,
"loss": 0.1892,
"mean_token_accuracy": 0.9383731603622436,
"step": 1705
},
{
"epoch": 0.6580719645949586,
"grad_norm": 0.12403803816854818,
"learning_rate": 4.943250928441324e-05,
"loss": 0.1836,
"mean_token_accuracy": 0.9401000320911408,
"step": 1710
},
{
"epoch": 0.659996151625938,
"grad_norm": 0.1342330704907052,
"learning_rate": 4.942404412567201e-05,
"loss": 0.1887,
"mean_token_accuracy": 0.9387575447559356,
"step": 1715
},
{
"epoch": 0.6619203386569175,
"grad_norm": 0.12352416672457492,
"learning_rate": 4.941551711527601e-05,
"loss": 0.1873,
"mean_token_accuracy": 0.9390809714794159,
"step": 1720
},
{
"epoch": 0.6638445256878969,
"grad_norm": 0.12952860042778544,
"learning_rate": 4.940692827728146e-05,
"loss": 0.1868,
"mean_token_accuracy": 0.9390901625156403,
"step": 1725
},
{
"epoch": 0.6657687127188763,
"grad_norm": 0.12949620744531398,
"learning_rate": 4.939827763591902e-05,
"loss": 0.1873,
"mean_token_accuracy": 0.9391276597976684,
"step": 1730
},
{
"epoch": 0.6676928997498557,
"grad_norm": 0.1326361157966624,
"learning_rate": 4.93895652155937e-05,
"loss": 0.1873,
"mean_token_accuracy": 0.9392795920372009,
"step": 1735
},
{
"epoch": 0.6696170867808351,
"grad_norm": 0.1259436738522025,
"learning_rate": 4.93807910408848e-05,
"loss": 0.1869,
"mean_token_accuracy": 0.9394049286842346,
"step": 1740
},
{
"epoch": 0.6715412738118145,
"grad_norm": 0.12833385292087515,
"learning_rate": 4.937195513654582e-05,
"loss": 0.1847,
"mean_token_accuracy": 0.9400054693222046,
"step": 1745
},
{
"epoch": 0.6734654608427939,
"grad_norm": 0.1305902666993928,
"learning_rate": 4.936305752750445e-05,
"loss": 0.1862,
"mean_token_accuracy": 0.9397811412811279,
"step": 1750
},
{
"epoch": 0.6753896478737733,
"grad_norm": 0.12503772393114956,
"learning_rate": 4.9354098238862434e-05,
"loss": 0.1866,
"mean_token_accuracy": 0.9391416549682617,
"step": 1755
},
{
"epoch": 0.6773138349047527,
"grad_norm": 0.12907005180260395,
"learning_rate": 4.934507729589552e-05,
"loss": 0.1837,
"mean_token_accuracy": 0.9401533901691437,
"step": 1760
},
{
"epoch": 0.6792380219357321,
"grad_norm": 0.12952056395716516,
"learning_rate": 4.93359947240534e-05,
"loss": 0.1857,
"mean_token_accuracy": 0.9394142091274261,
"step": 1765
},
{
"epoch": 0.6811622089667115,
"grad_norm": 0.12964032848148677,
"learning_rate": 4.9326850548959655e-05,
"loss": 0.1872,
"mean_token_accuracy": 0.9389999568462372,
"step": 1770
},
{
"epoch": 0.6830863959976909,
"grad_norm": 0.12307908590279568,
"learning_rate": 4.9317644796411626e-05,
"loss": 0.1869,
"mean_token_accuracy": 0.9391985893249511,
"step": 1775
},
{
"epoch": 0.6850105830286704,
"grad_norm": 0.12409231912708199,
"learning_rate": 4.9308377492380395e-05,
"loss": 0.1859,
"mean_token_accuracy": 0.939410537481308,
"step": 1780
},
{
"epoch": 0.6869347700596498,
"grad_norm": 0.1284220476802047,
"learning_rate": 4.929904866301069e-05,
"loss": 0.1853,
"mean_token_accuracy": 0.9397732496261597,
"step": 1785
},
{
"epoch": 0.6888589570906292,
"grad_norm": 0.1283258198594263,
"learning_rate": 4.9289658334620795e-05,
"loss": 0.187,
"mean_token_accuracy": 0.939195990562439,
"step": 1790
},
{
"epoch": 0.6907831441216086,
"grad_norm": 0.12635478080816434,
"learning_rate": 4.928020653370253e-05,
"loss": 0.1874,
"mean_token_accuracy": 0.9393328726291656,
"step": 1795
},
{
"epoch": 0.692707331152588,
"grad_norm": 0.12484761872443023,
"learning_rate": 4.92706932869211e-05,
"loss": 0.1856,
"mean_token_accuracy": 0.9395947933197022,
"step": 1800
},
{
"epoch": 0.6946315181835675,
"grad_norm": 0.13346046560029717,
"learning_rate": 4.9261118621115075e-05,
"loss": 0.1857,
"mean_token_accuracy": 0.9395689606666565,
"step": 1805
},
{
"epoch": 0.6965557052145469,
"grad_norm": 0.12461615072971878,
"learning_rate": 4.925148256329632e-05,
"loss": 0.1873,
"mean_token_accuracy": 0.938995772600174,
"step": 1810
},
{
"epoch": 0.6984798922455263,
"grad_norm": 0.1320523043841543,
"learning_rate": 4.9241785140649874e-05,
"loss": 0.1851,
"mean_token_accuracy": 0.9397156715393067,
"step": 1815
},
{
"epoch": 0.7004040792765057,
"grad_norm": 0.12176312501257469,
"learning_rate": 4.92320263805339e-05,
"loss": 0.1853,
"mean_token_accuracy": 0.9398873805999756,
"step": 1820
},
{
"epoch": 0.7023282663074851,
"grad_norm": 0.12276787751903055,
"learning_rate": 4.922220631047959e-05,
"loss": 0.1838,
"mean_token_accuracy": 0.9400618195533752,
"step": 1825
},
{
"epoch": 0.7042524533384645,
"grad_norm": 0.12945726528749396,
"learning_rate": 4.921232495819115e-05,
"loss": 0.1883,
"mean_token_accuracy": 0.9388338088989258,
"step": 1830
},
{
"epoch": 0.7061766403694439,
"grad_norm": 0.13077148163199273,
"learning_rate": 4.9202382351545635e-05,
"loss": 0.182,
"mean_token_accuracy": 0.9405941784381866,
"step": 1835
},
{
"epoch": 0.7081008274004233,
"grad_norm": 0.13399997817922676,
"learning_rate": 4.91923785185929e-05,
"loss": 0.1858,
"mean_token_accuracy": 0.939314740896225,
"step": 1840
},
{
"epoch": 0.7100250144314028,
"grad_norm": 0.13410360541184926,
"learning_rate": 4.918231348755558e-05,
"loss": 0.1868,
"mean_token_accuracy": 0.9394115328788757,
"step": 1845
},
{
"epoch": 0.7119492014623822,
"grad_norm": 0.12784687831405284,
"learning_rate": 4.917218728682891e-05,
"loss": 0.1832,
"mean_token_accuracy": 0.9403221547603607,
"step": 1850
},
{
"epoch": 0.7138733884933616,
"grad_norm": 0.12638458550832857,
"learning_rate": 4.916199994498073e-05,
"loss": 0.186,
"mean_token_accuracy": 0.9395315229892731,
"step": 1855
},
{
"epoch": 0.715797575524341,
"grad_norm": 0.13064255797982968,
"learning_rate": 4.915175149075134e-05,
"loss": 0.1876,
"mean_token_accuracy": 0.9386875092983246,
"step": 1860
},
{
"epoch": 0.7177217625553204,
"grad_norm": 0.13292186717008103,
"learning_rate": 4.914144195305346e-05,
"loss": 0.185,
"mean_token_accuracy": 0.9393829703330994,
"step": 1865
},
{
"epoch": 0.7196459495862998,
"grad_norm": 0.12405323655496656,
"learning_rate": 4.9131071360972166e-05,
"loss": 0.1851,
"mean_token_accuracy": 0.939610755443573,
"step": 1870
},
{
"epoch": 0.7215701366172792,
"grad_norm": 0.11979493582212139,
"learning_rate": 4.912063974376475e-05,
"loss": 0.1821,
"mean_token_accuracy": 0.9403452455997467,
"step": 1875
},
{
"epoch": 0.7234943236482586,
"grad_norm": 0.12773120330873297,
"learning_rate": 4.9110147130860645e-05,
"loss": 0.1872,
"mean_token_accuracy": 0.9392232894897461,
"step": 1880
},
{
"epoch": 0.725418510679238,
"grad_norm": 0.1274813879213902,
"learning_rate": 4.909959355186143e-05,
"loss": 0.1859,
"mean_token_accuracy": 0.9395413219928741,
"step": 1885
},
{
"epoch": 0.7273426977102174,
"grad_norm": 0.11820515206801051,
"learning_rate": 4.908897903654061e-05,
"loss": 0.1857,
"mean_token_accuracy": 0.9395869731903076,
"step": 1890
},
{
"epoch": 0.7292668847411968,
"grad_norm": 0.13102076278056418,
"learning_rate": 4.907830361484365e-05,
"loss": 0.183,
"mean_token_accuracy": 0.9400752663612366,
"step": 1895
},
{
"epoch": 0.7311910717721762,
"grad_norm": 0.1278752224588257,
"learning_rate": 4.9067567316887827e-05,
"loss": 0.1833,
"mean_token_accuracy": 0.9402811288833618,
"step": 1900
},
{
"epoch": 0.7331152588031556,
"grad_norm": 0.12646029457624683,
"learning_rate": 4.905677017296215e-05,
"loss": 0.1842,
"mean_token_accuracy": 0.9400483667850494,
"step": 1905
},
{
"epoch": 0.735039445834135,
"grad_norm": 0.12189597684353602,
"learning_rate": 4.90459122135273e-05,
"loss": 0.1853,
"mean_token_accuracy": 0.939655190706253,
"step": 1910
},
{
"epoch": 0.7369636328651145,
"grad_norm": 0.11912995970116359,
"learning_rate": 4.9034993469215536e-05,
"loss": 0.1855,
"mean_token_accuracy": 0.9393987059593201,
"step": 1915
},
{
"epoch": 0.7388878198960939,
"grad_norm": 0.12473059168347295,
"learning_rate": 4.902401397083057e-05,
"loss": 0.1827,
"mean_token_accuracy": 0.9405106902122498,
"step": 1920
},
{
"epoch": 0.7408120069270733,
"grad_norm": 0.12178119702571959,
"learning_rate": 4.901297374934756e-05,
"loss": 0.182,
"mean_token_accuracy": 0.940519267320633,
"step": 1925
},
{
"epoch": 0.7427361939580527,
"grad_norm": 0.12298998058807746,
"learning_rate": 4.900187283591292e-05,
"loss": 0.1855,
"mean_token_accuracy": 0.9396823763847351,
"step": 1930
},
{
"epoch": 0.7446603809890321,
"grad_norm": 0.1649138029052641,
"learning_rate": 4.899071126184433e-05,
"loss": 0.1869,
"mean_token_accuracy": 0.9390595555305481,
"step": 1935
},
{
"epoch": 0.7465845680200115,
"grad_norm": 0.12302266948272966,
"learning_rate": 4.897948905863059e-05,
"loss": 0.1865,
"mean_token_accuracy": 0.939179515838623,
"step": 1940
},
{
"epoch": 0.7485087550509909,
"grad_norm": 0.13051190459136533,
"learning_rate": 4.896820625793154e-05,
"loss": 0.1833,
"mean_token_accuracy": 0.9401630163192749,
"step": 1945
},
{
"epoch": 0.7504329420819704,
"grad_norm": 0.12589310940355214,
"learning_rate": 4.8956862891577985e-05,
"loss": 0.1838,
"mean_token_accuracy": 0.9398818373680115,
"step": 1950
},
{
"epoch": 0.7523571291129498,
"grad_norm": 0.13251423292021208,
"learning_rate": 4.89454589915716e-05,
"loss": 0.1846,
"mean_token_accuracy": 0.9395718336105346,
"step": 1955
},
{
"epoch": 0.7542813161439292,
"grad_norm": 0.12609386176353532,
"learning_rate": 4.893399459008481e-05,
"loss": 0.188,
"mean_token_accuracy": 0.9388734877109528,
"step": 1960
},
{
"epoch": 0.7562055031749086,
"grad_norm": 0.13969598978136358,
"learning_rate": 4.892246971946075e-05,
"loss": 0.1867,
"mean_token_accuracy": 0.9394755423069,
"step": 1965
},
{
"epoch": 0.758129690205888,
"grad_norm": 0.11916269700290026,
"learning_rate": 4.891088441221316e-05,
"loss": 0.1834,
"mean_token_accuracy": 0.9400242328643799,
"step": 1970
},
{
"epoch": 0.7600538772368675,
"grad_norm": 0.11842170357576598,
"learning_rate": 4.889923870102625e-05,
"loss": 0.1828,
"mean_token_accuracy": 0.9405191838741302,
"step": 1975
},
{
"epoch": 0.7619780642678469,
"grad_norm": 0.12122423863343659,
"learning_rate": 4.888753261875467e-05,
"loss": 0.1847,
"mean_token_accuracy": 0.9396079897880554,
"step": 1980
},
{
"epoch": 0.7639022512988263,
"grad_norm": 0.1319218553876708,
"learning_rate": 4.887576619842336e-05,
"loss": 0.1851,
"mean_token_accuracy": 0.9395697891712189,
"step": 1985
},
{
"epoch": 0.7658264383298057,
"grad_norm": 0.12946605027962763,
"learning_rate": 4.886393947322751e-05,
"loss": 0.1836,
"mean_token_accuracy": 0.9401982545852661,
"step": 1990
},
{
"epoch": 0.7677506253607851,
"grad_norm": 0.12078596356425471,
"learning_rate": 4.885205247653242e-05,
"loss": 0.1838,
"mean_token_accuracy": 0.9403159558773041,
"step": 1995
},
{
"epoch": 0.7696748123917645,
"grad_norm": 0.12064298731271933,
"learning_rate": 4.884010524187345e-05,
"loss": 0.1815,
"mean_token_accuracy": 0.9408186912536621,
"step": 2000
},
{
"epoch": 0.7715989994227439,
"grad_norm": 0.12556269641903606,
"learning_rate": 4.882809780295587e-05,
"loss": 0.1815,
"mean_token_accuracy": 0.9409188389778137,
"step": 2005
},
{
"epoch": 0.7735231864537233,
"grad_norm": 0.13254601751365308,
"learning_rate": 4.8816030193654836e-05,
"loss": 0.1858,
"mean_token_accuracy": 0.9394202411174775,
"step": 2010
},
{
"epoch": 0.7754473734847027,
"grad_norm": 0.11416378254386586,
"learning_rate": 4.880390244801523e-05,
"loss": 0.1835,
"mean_token_accuracy": 0.9398796260356903,
"step": 2015
},
{
"epoch": 0.7773715605156821,
"grad_norm": 0.12378280260912523,
"learning_rate": 4.879171460025157e-05,
"loss": 0.1859,
"mean_token_accuracy": 0.9396220564842224,
"step": 2020
},
{
"epoch": 0.7792957475466615,
"grad_norm": 0.11753365831717989,
"learning_rate": 4.8779466684748004e-05,
"loss": 0.1869,
"mean_token_accuracy": 0.9389756441116333,
"step": 2025
},
{
"epoch": 0.7812199345776409,
"grad_norm": 0.11907092470176393,
"learning_rate": 4.8767158736058046e-05,
"loss": 0.1842,
"mean_token_accuracy": 0.9402213454246521,
"step": 2030
},
{
"epoch": 0.7831441216086203,
"grad_norm": 0.1201609963783468,
"learning_rate": 4.8754790788904656e-05,
"loss": 0.1846,
"mean_token_accuracy": 0.9401138067245484,
"step": 2035
},
{
"epoch": 0.7850683086395998,
"grad_norm": 0.12241701349986983,
"learning_rate": 4.874236287818002e-05,
"loss": 0.185,
"mean_token_accuracy": 0.9396168053150177,
"step": 2040
},
{
"epoch": 0.7869924956705792,
"grad_norm": 0.11890858606428757,
"learning_rate": 4.872987503894549e-05,
"loss": 0.1824,
"mean_token_accuracy": 0.9403604686260223,
"step": 2045
},
{
"epoch": 0.7889166827015586,
"grad_norm": 0.12085338845765432,
"learning_rate": 4.8717327306431505e-05,
"loss": 0.1857,
"mean_token_accuracy": 0.9393939733505249,
"step": 2050
},
{
"epoch": 0.790840869732538,
"grad_norm": 0.12386143683480247,
"learning_rate": 4.870471971603746e-05,
"loss": 0.1841,
"mean_token_accuracy": 0.9399244070053101,
"step": 2055
},
{
"epoch": 0.7927650567635174,
"grad_norm": 0.11380417602513576,
"learning_rate": 4.8692052303331636e-05,
"loss": 0.1838,
"mean_token_accuracy": 0.9398261904716492,
"step": 2060
},
{
"epoch": 0.7946892437944968,
"grad_norm": 0.12935731088043464,
"learning_rate": 4.8679325104051074e-05,
"loss": 0.1837,
"mean_token_accuracy": 0.9402774631977081,
"step": 2065
},
{
"epoch": 0.7966134308254762,
"grad_norm": 0.1250137586530349,
"learning_rate": 4.866653815410146e-05,
"loss": 0.1838,
"mean_token_accuracy": 0.9400751292705536,
"step": 2070
},
{
"epoch": 0.7985376178564556,
"grad_norm": 0.11926647844819718,
"learning_rate": 4.86536914895571e-05,
"loss": 0.1852,
"mean_token_accuracy": 0.9398459553718567,
"step": 2075
},
{
"epoch": 0.800461804887435,
"grad_norm": 0.11842733754417097,
"learning_rate": 4.86407851466607e-05,
"loss": 0.1814,
"mean_token_accuracy": 0.9405053138732911,
"step": 2080
},
{
"epoch": 0.8023859919184144,
"grad_norm": 0.11684244820743363,
"learning_rate": 4.86278191618234e-05,
"loss": 0.1851,
"mean_token_accuracy": 0.9398811280727386,
"step": 2085
},
{
"epoch": 0.8043101789493938,
"grad_norm": 0.11974151405660274,
"learning_rate": 4.861479357162455e-05,
"loss": 0.1843,
"mean_token_accuracy": 0.9397909820079804,
"step": 2090
},
{
"epoch": 0.8062343659803733,
"grad_norm": 0.11691695951953154,
"learning_rate": 4.8601708412811666e-05,
"loss": 0.1841,
"mean_token_accuracy": 0.9401267170906067,
"step": 2095
},
{
"epoch": 0.8081585530113528,
"grad_norm": 0.1230162234683747,
"learning_rate": 4.8588563722300335e-05,
"loss": 0.1811,
"mean_token_accuracy": 0.9409735321998596,
"step": 2100
},
{
"epoch": 0.8100827400423322,
"grad_norm": 0.12199443457355415,
"learning_rate": 4.857535953717408e-05,
"loss": 0.1836,
"mean_token_accuracy": 0.940091347694397,
"step": 2105
},
{
"epoch": 0.8120069270733116,
"grad_norm": 0.12805212639654023,
"learning_rate": 4.856209589468427e-05,
"loss": 0.1818,
"mean_token_accuracy": 0.940629106760025,
"step": 2110
},
{
"epoch": 0.813931114104291,
"grad_norm": 0.12938937862607275,
"learning_rate": 4.8548772832250015e-05,
"loss": 0.182,
"mean_token_accuracy": 0.9406083405017853,
"step": 2115
},
{
"epoch": 0.8158553011352704,
"grad_norm": 0.12552376482220515,
"learning_rate": 4.8535390387458066e-05,
"loss": 0.1872,
"mean_token_accuracy": 0.9390264034271241,
"step": 2120
},
{
"epoch": 0.8177794881662498,
"grad_norm": 0.1170373209408773,
"learning_rate": 4.852194859806269e-05,
"loss": 0.1808,
"mean_token_accuracy": 0.9408780634403229,
"step": 2125
},
{
"epoch": 0.8197036751972292,
"grad_norm": 0.11615734062516449,
"learning_rate": 4.8508447501985585e-05,
"loss": 0.1873,
"mean_token_accuracy": 0.9388710737228394,
"step": 2130
},
{
"epoch": 0.8216278622282086,
"grad_norm": 0.11616601365701719,
"learning_rate": 4.849488713731576e-05,
"loss": 0.1824,
"mean_token_accuracy": 0.9404361367225647,
"step": 2135
},
{
"epoch": 0.823552049259188,
"grad_norm": 0.1200473574748539,
"learning_rate": 4.8481267542309425e-05,
"loss": 0.1814,
"mean_token_accuracy": 0.9407158613204956,
"step": 2140
},
{
"epoch": 0.8254762362901674,
"grad_norm": 0.11416470941265607,
"learning_rate": 4.8467588755389915e-05,
"loss": 0.1827,
"mean_token_accuracy": 0.940354073047638,
"step": 2145
},
{
"epoch": 0.8274004233211468,
"grad_norm": 0.1165530375543256,
"learning_rate": 4.845385081514752e-05,
"loss": 0.1805,
"mean_token_accuracy": 0.9412187516689301,
"step": 2150
},
{
"epoch": 0.8293246103521262,
"grad_norm": 0.11754202343664585,
"learning_rate": 4.8440053760339446e-05,
"loss": 0.1834,
"mean_token_accuracy": 0.9401270806789398,
"step": 2155
},
{
"epoch": 0.8312487973831056,
"grad_norm": 0.1176517652797786,
"learning_rate": 4.842619762988963e-05,
"loss": 0.1826,
"mean_token_accuracy": 0.9402092635631562,
"step": 2160
},
{
"epoch": 0.833172984414085,
"grad_norm": 0.12133212991641226,
"learning_rate": 4.841228246288873e-05,
"loss": 0.1815,
"mean_token_accuracy": 0.9408377289772034,
"step": 2165
},
{
"epoch": 0.8350971714450645,
"grad_norm": 0.12044128214619812,
"learning_rate": 4.83983082985939e-05,
"loss": 0.1836,
"mean_token_accuracy": 0.940137755870819,
"step": 2170
},
{
"epoch": 0.8370213584760439,
"grad_norm": 0.11885648796006264,
"learning_rate": 4.838427517642877e-05,
"loss": 0.1801,
"mean_token_accuracy": 0.9411330699920655,
"step": 2175
},
{
"epoch": 0.8389455455070233,
"grad_norm": 0.1106034905535314,
"learning_rate": 4.837018313598328e-05,
"loss": 0.1845,
"mean_token_accuracy": 0.9397697389125824,
"step": 2180
},
{
"epoch": 0.8408697325380027,
"grad_norm": 0.11987613929957971,
"learning_rate": 4.835603221701362e-05,
"loss": 0.1803,
"mean_token_accuracy": 0.9408568203449249,
"step": 2185
},
{
"epoch": 0.8427939195689821,
"grad_norm": 0.11966460776022199,
"learning_rate": 4.834182245944205e-05,
"loss": 0.1785,
"mean_token_accuracy": 0.9416488707065582,
"step": 2190
},
{
"epoch": 0.8447181065999615,
"grad_norm": 0.1182565560206525,
"learning_rate": 4.8327553903356836e-05,
"loss": 0.1824,
"mean_token_accuracy": 0.9405614793300628,
"step": 2195
},
{
"epoch": 0.8466422936309409,
"grad_norm": 0.11535883667619559,
"learning_rate": 4.831322658901215e-05,
"loss": 0.1817,
"mean_token_accuracy": 0.9406129837036132,
"step": 2200
},
{
"epoch": 0.8485664806619203,
"grad_norm": 0.11788009293521055,
"learning_rate": 4.82988405568279e-05,
"loss": 0.1836,
"mean_token_accuracy": 0.9399451076984405,
"step": 2205
},
{
"epoch": 0.8504906676928997,
"grad_norm": 0.11917536508755569,
"learning_rate": 4.828439584738966e-05,
"loss": 0.184,
"mean_token_accuracy": 0.9401583254337311,
"step": 2210
},
{
"epoch": 0.8524148547238791,
"grad_norm": 0.1146235510311429,
"learning_rate": 4.826989250144854e-05,
"loss": 0.1819,
"mean_token_accuracy": 0.9403225839138031,
"step": 2215
},
{
"epoch": 0.8543390417548585,
"grad_norm": 0.12469548548238404,
"learning_rate": 4.8255330559921074e-05,
"loss": 0.1827,
"mean_token_accuracy": 0.9404394030570984,
"step": 2220
},
{
"epoch": 0.8562632287858379,
"grad_norm": 0.11613789081146532,
"learning_rate": 4.824071006388912e-05,
"loss": 0.1829,
"mean_token_accuracy": 0.9401649594306946,
"step": 2225
},
{
"epoch": 0.8581874158168173,
"grad_norm": 0.12267984426574025,
"learning_rate": 4.82260310545997e-05,
"loss": 0.1825,
"mean_token_accuracy": 0.940345722436905,
"step": 2230
},
{
"epoch": 0.8601116028477968,
"grad_norm": 0.12100207158363169,
"learning_rate": 4.8211293573464946e-05,
"loss": 0.1842,
"mean_token_accuracy": 0.9399094760417939,
"step": 2235
},
{
"epoch": 0.8620357898787763,
"grad_norm": 0.10970579702066553,
"learning_rate": 4.8196497662061914e-05,
"loss": 0.1838,
"mean_token_accuracy": 0.9400064706802368,
"step": 2240
},
{
"epoch": 0.8639599769097557,
"grad_norm": 0.11453745047830523,
"learning_rate": 4.818164336213252e-05,
"loss": 0.1855,
"mean_token_accuracy": 0.9396546185016632,
"step": 2245
},
{
"epoch": 0.8658841639407351,
"grad_norm": 0.11513182404489364,
"learning_rate": 4.8166730715583425e-05,
"loss": 0.1818,
"mean_token_accuracy": 0.940559697151184,
"step": 2250
},
{
"epoch": 0.8678083509717145,
"grad_norm": 0.1476666402315563,
"learning_rate": 4.8151759764485856e-05,
"loss": 0.1824,
"mean_token_accuracy": 0.9404184460639954,
"step": 2255
},
{
"epoch": 0.8697325380026939,
"grad_norm": 0.11948879120992552,
"learning_rate": 4.813673055107555e-05,
"loss": 0.1819,
"mean_token_accuracy": 0.9407192707061768,
"step": 2260
},
{
"epoch": 0.8716567250336733,
"grad_norm": 0.1262081227430968,
"learning_rate": 4.812164311775261e-05,
"loss": 0.1821,
"mean_token_accuracy": 0.9405526936054229,
"step": 2265
},
{
"epoch": 0.8735809120646527,
"grad_norm": 0.11634634554542392,
"learning_rate": 4.810649750708139e-05,
"loss": 0.1831,
"mean_token_accuracy": 0.9402897179126739,
"step": 2270
},
{
"epoch": 0.8755050990956321,
"grad_norm": 0.11942845642892773,
"learning_rate": 4.8091293761790376e-05,
"loss": 0.1842,
"mean_token_accuracy": 0.9398006618022918,
"step": 2275
},
{
"epoch": 0.8774292861266115,
"grad_norm": 0.12249680244184764,
"learning_rate": 4.807603192477204e-05,
"loss": 0.1815,
"mean_token_accuracy": 0.9406807065010071,
"step": 2280
},
{
"epoch": 0.8793534731575909,
"grad_norm": 0.11271491646058761,
"learning_rate": 4.8060712039082776e-05,
"loss": 0.181,
"mean_token_accuracy": 0.9405488193035125,
"step": 2285
},
{
"epoch": 0.8812776601885703,
"grad_norm": 0.11533467431113814,
"learning_rate": 4.804533414794272e-05,
"loss": 0.1853,
"mean_token_accuracy": 0.9396409273147583,
"step": 2290
},
{
"epoch": 0.8832018472195498,
"grad_norm": 0.1285187047296205,
"learning_rate": 4.8029898294735645e-05,
"loss": 0.182,
"mean_token_accuracy": 0.9406699240207672,
"step": 2295
},
{
"epoch": 0.8851260342505292,
"grad_norm": 0.1411300877607505,
"learning_rate": 4.801440452300886e-05,
"loss": 0.1848,
"mean_token_accuracy": 0.9397783398628234,
"step": 2300
},
{
"epoch": 0.8870502212815086,
"grad_norm": 0.12830296379755649,
"learning_rate": 4.799885287647308e-05,
"loss": 0.1836,
"mean_token_accuracy": 0.9400253415107727,
"step": 2305
},
{
"epoch": 0.888974408312488,
"grad_norm": 0.12269949468159491,
"learning_rate": 4.798324339900228e-05,
"loss": 0.1832,
"mean_token_accuracy": 0.9401925921440124,
"step": 2310
},
{
"epoch": 0.8908985953434674,
"grad_norm": 0.11756618845811823,
"learning_rate": 4.7967576134633596e-05,
"loss": 0.1804,
"mean_token_accuracy": 0.9412013351917267,
"step": 2315
},
{
"epoch": 0.8928227823744468,
"grad_norm": 0.16572054412065182,
"learning_rate": 4.7951851127567184e-05,
"loss": 0.1807,
"mean_token_accuracy": 0.9412114441394805,
"step": 2320
},
{
"epoch": 0.8947469694054262,
"grad_norm": 0.12116911140290487,
"learning_rate": 4.793606842216609e-05,
"loss": 0.1795,
"mean_token_accuracy": 0.941188383102417,
"step": 2325
},
{
"epoch": 0.8966711564364056,
"grad_norm": 0.1260181768965728,
"learning_rate": 4.792022806295618e-05,
"loss": 0.1854,
"mean_token_accuracy": 0.9395481050014496,
"step": 2330
},
{
"epoch": 0.898595343467385,
"grad_norm": 0.12955287274675212,
"learning_rate": 4.790433009462592e-05,
"loss": 0.1813,
"mean_token_accuracy": 0.9407733678817749,
"step": 2335
},
{
"epoch": 0.9005195304983644,
"grad_norm": 0.21585557593260274,
"learning_rate": 4.788837456202634e-05,
"loss": 0.1912,
"mean_token_accuracy": 0.9396725356578827,
"step": 2340
},
{
"epoch": 0.9024437175293438,
"grad_norm": 0.7997740442778281,
"learning_rate": 4.787236151017085e-05,
"loss": 0.3293,
"mean_token_accuracy": 0.9140214204788208,
"step": 2345
},
{
"epoch": 0.9043679045603232,
"grad_norm": 0.2971714473559009,
"learning_rate": 4.785629098423513e-05,
"loss": 0.2142,
"mean_token_accuracy": 0.931775027513504,
"step": 2350
},
{
"epoch": 0.9062920915913026,
"grad_norm": 0.27663796986222866,
"learning_rate": 4.7840163029557034e-05,
"loss": 0.1949,
"mean_token_accuracy": 0.9374295115470886,
"step": 2355
},
{
"epoch": 0.908216278622282,
"grad_norm": 0.14943846174583816,
"learning_rate": 4.782397769163638e-05,
"loss": 0.1926,
"mean_token_accuracy": 0.9374601423740387,
"step": 2360
},
{
"epoch": 0.9101404656532615,
"grad_norm": 0.2191158094100316,
"learning_rate": 4.780773501613493e-05,
"loss": 0.1902,
"mean_token_accuracy": 0.9381971001625061,
"step": 2365
},
{
"epoch": 0.9120646526842409,
"grad_norm": 0.1368075879535361,
"learning_rate": 4.7791435048876166e-05,
"loss": 0.1883,
"mean_token_accuracy": 0.9388419568538666,
"step": 2370
},
{
"epoch": 0.9139888397152203,
"grad_norm": 0.14956559770526698,
"learning_rate": 4.777507783584522e-05,
"loss": 0.1877,
"mean_token_accuracy": 0.9390267848968505,
"step": 2375
},
{
"epoch": 0.9159130267461997,
"grad_norm": 0.1320819312114568,
"learning_rate": 4.775866342318871e-05,
"loss": 0.1857,
"mean_token_accuracy": 0.9396218538284302,
"step": 2380
},
{
"epoch": 0.9178372137771792,
"grad_norm": 0.12840423074414092,
"learning_rate": 4.774219185721466e-05,
"loss": 0.1868,
"mean_token_accuracy": 0.9395570278167724,
"step": 2385
},
{
"epoch": 0.9197614008081586,
"grad_norm": 0.12215985631548372,
"learning_rate": 4.7725663184392284e-05,
"loss": 0.1845,
"mean_token_accuracy": 0.9402369081974029,
"step": 2390
},
{
"epoch": 0.921685587839138,
"grad_norm": 0.23556308545025137,
"learning_rate": 4.770907745135194e-05,
"loss": 0.1814,
"mean_token_accuracy": 0.9406795680522919,
"step": 2395
},
{
"epoch": 0.9236097748701174,
"grad_norm": 0.13504443301156466,
"learning_rate": 4.769243470488493e-05,
"loss": 0.1848,
"mean_token_accuracy": 0.9396166443824768,
"step": 2400
},
{
"epoch": 0.9255339619010968,
"grad_norm": 0.25044759571321284,
"learning_rate": 4.767573499194344e-05,
"loss": 0.1869,
"mean_token_accuracy": 0.939172875881195,
"step": 2405
},
{
"epoch": 0.9274581489320762,
"grad_norm": 0.12385091018646273,
"learning_rate": 4.765897835964035e-05,
"loss": 0.1852,
"mean_token_accuracy": 0.9395855963230133,
"step": 2410
},
{
"epoch": 0.9293823359630556,
"grad_norm": 0.11439544922055143,
"learning_rate": 4.7642164855249124e-05,
"loss": 0.1852,
"mean_token_accuracy": 0.9394470155239105,
"step": 2415
},
{
"epoch": 0.931306522994035,
"grad_norm": 0.1235397306154651,
"learning_rate": 4.7625294526203657e-05,
"loss": 0.1847,
"mean_token_accuracy": 0.9398016691207886,
"step": 2420
},
{
"epoch": 0.9332307100250145,
"grad_norm": 1.6728295055753946,
"learning_rate": 4.760836742009818e-05,
"loss": 0.1851,
"mean_token_accuracy": 0.9397288858890533,
"step": 2425
},
{
"epoch": 0.9351548970559939,
"grad_norm": 0.12316305160239477,
"learning_rate": 4.759138358468709e-05,
"loss": 0.1835,
"mean_token_accuracy": 0.9399518132209778,
"step": 2430
},
{
"epoch": 0.9370790840869733,
"grad_norm": 0.13793514276935762,
"learning_rate": 4.757434306788482e-05,
"loss": 0.1861,
"mean_token_accuracy": 0.9393800020217895,
"step": 2435
},
{
"epoch": 0.9390032711179527,
"grad_norm": 0.12149826036690749,
"learning_rate": 4.755724591776572e-05,
"loss": 0.1846,
"mean_token_accuracy": 0.9398538827896118,
"step": 2440
},
{
"epoch": 0.9409274581489321,
"grad_norm": 0.11764487668178286,
"learning_rate": 4.754009218256392e-05,
"loss": 0.1828,
"mean_token_accuracy": 0.9401457965373993,
"step": 2445
},
{
"epoch": 0.9428516451799115,
"grad_norm": 0.11557655011443249,
"learning_rate": 4.752288191067317e-05,
"loss": 0.1833,
"mean_token_accuracy": 0.9400545418262481,
"step": 2450
},
{
"epoch": 0.9447758322108909,
"grad_norm": 0.11671398919818406,
"learning_rate": 4.7505615150646737e-05,
"loss": 0.1815,
"mean_token_accuracy": 0.9409000337123871,
"step": 2455
},
{
"epoch": 0.9467000192418703,
"grad_norm": 0.11369930035927728,
"learning_rate": 4.748829195119724e-05,
"loss": 0.1811,
"mean_token_accuracy": 0.9409979999065399,
"step": 2460
},
{
"epoch": 0.9486242062728497,
"grad_norm": 0.14297427532144838,
"learning_rate": 4.747091236119653e-05,
"loss": 0.1848,
"mean_token_accuracy": 0.9399132430553436,
"step": 2465
},
{
"epoch": 0.9505483933038291,
"grad_norm": 0.11870533079661626,
"learning_rate": 4.7453476429675545e-05,
"loss": 0.1824,
"mean_token_accuracy": 0.9405813694000245,
"step": 2470
},
{
"epoch": 0.9524725803348085,
"grad_norm": 0.11984706759405574,
"learning_rate": 4.7435984205824155e-05,
"loss": 0.185,
"mean_token_accuracy": 0.9398678004741668,
"step": 2475
},
{
"epoch": 0.9543967673657879,
"grad_norm": 0.12387680072322939,
"learning_rate": 4.741843573899107e-05,
"loss": 0.1835,
"mean_token_accuracy": 0.939969539642334,
"step": 2480
},
{
"epoch": 0.9563209543967673,
"grad_norm": 0.12033084141979045,
"learning_rate": 4.7400831078683655e-05,
"loss": 0.1861,
"mean_token_accuracy": 0.9396663069725036,
"step": 2485
},
{
"epoch": 0.9582451414277467,
"grad_norm": 0.12538542915432893,
"learning_rate": 4.738317027456782e-05,
"loss": 0.1854,
"mean_token_accuracy": 0.9394169688224793,
"step": 2490
},
{
"epoch": 0.9601693284587262,
"grad_norm": 0.11316308781679958,
"learning_rate": 4.7365453376467836e-05,
"loss": 0.1824,
"mean_token_accuracy": 0.940411388874054,
"step": 2495
},
{
"epoch": 0.9620935154897056,
"grad_norm": 0.12556823145271817,
"learning_rate": 4.734768043436625e-05,
"loss": 0.1819,
"mean_token_accuracy": 0.9407488465309143,
"step": 2500
},
{
"epoch": 0.964017702520685,
"grad_norm": 0.12484667142634479,
"learning_rate": 4.732985149840373e-05,
"loss": 0.183,
"mean_token_accuracy": 0.9400827348232269,
"step": 2505
},
{
"epoch": 0.9659418895516644,
"grad_norm": 0.11562585712771245,
"learning_rate": 4.7311966618878874e-05,
"loss": 0.1816,
"mean_token_accuracy": 0.940721720457077,
"step": 2510
},
{
"epoch": 0.9678660765826438,
"grad_norm": 0.11245306612355185,
"learning_rate": 4.729402584624815e-05,
"loss": 0.1803,
"mean_token_accuracy": 0.9411805689334869,
"step": 2515
},
{
"epoch": 0.9697902636136232,
"grad_norm": 0.11271273272429061,
"learning_rate": 4.727602923112568e-05,
"loss": 0.1829,
"mean_token_accuracy": 0.9400192379951477,
"step": 2520
},
{
"epoch": 0.9717144506446026,
"grad_norm": 0.11323226408965702,
"learning_rate": 4.725797682428314e-05,
"loss": 0.1831,
"mean_token_accuracy": 0.9404727399349213,
"step": 2525
},
{
"epoch": 0.9736386376755821,
"grad_norm": 0.11663415399743987,
"learning_rate": 4.72398686766496e-05,
"loss": 0.1809,
"mean_token_accuracy": 0.9406877100467682,
"step": 2530
},
{
"epoch": 0.9755628247065615,
"grad_norm": 0.11252169021412911,
"learning_rate": 4.72217048393114e-05,
"loss": 0.1802,
"mean_token_accuracy": 0.9408616960048676,
"step": 2535
},
{
"epoch": 0.9774870117375409,
"grad_norm": 0.11696017091375428,
"learning_rate": 4.720348536351197e-05,
"loss": 0.1836,
"mean_token_accuracy": 0.9399718284606934,
"step": 2540
},
{
"epoch": 0.9794111987685203,
"grad_norm": 0.10962144925513925,
"learning_rate": 4.718521030065171e-05,
"loss": 0.1789,
"mean_token_accuracy": 0.9413145005702972,
"step": 2545
},
{
"epoch": 0.9813353857994997,
"grad_norm": 0.10656545904669233,
"learning_rate": 4.7166879702287844e-05,
"loss": 0.1793,
"mean_token_accuracy": 0.9414481461048126,
"step": 2550
},
{
"epoch": 0.9832595728304792,
"grad_norm": 0.1365718930459726,
"learning_rate": 4.714849362013428e-05,
"loss": 0.1802,
"mean_token_accuracy": 0.9410699248313904,
"step": 2555
},
{
"epoch": 0.9851837598614586,
"grad_norm": 0.10982982510596055,
"learning_rate": 4.7130052106061454e-05,
"loss": 0.1808,
"mean_token_accuracy": 0.9409809350967407,
"step": 2560
},
{
"epoch": 0.987107946892438,
"grad_norm": 0.11421754495270237,
"learning_rate": 4.711155521209616e-05,
"loss": 0.1809,
"mean_token_accuracy": 0.9407776176929474,
"step": 2565
},
{
"epoch": 0.9890321339234174,
"grad_norm": 0.1274611305378617,
"learning_rate": 4.7093002990421466e-05,
"loss": 0.1777,
"mean_token_accuracy": 0.9420558035373687,
"step": 2570
},
{
"epoch": 0.9909563209543968,
"grad_norm": 0.11024541409362704,
"learning_rate": 4.70743954933765e-05,
"loss": 0.1787,
"mean_token_accuracy": 0.941502434015274,
"step": 2575
},
{
"epoch": 0.9928805079853762,
"grad_norm": 0.16449134699837414,
"learning_rate": 4.705573277345635e-05,
"loss": 0.181,
"mean_token_accuracy": 0.9409343540668488,
"step": 2580
},
{
"epoch": 0.9948046950163556,
"grad_norm": 0.11524065323328825,
"learning_rate": 4.70370148833119e-05,
"loss": 0.1807,
"mean_token_accuracy": 0.9407903850078583,
"step": 2585
},
{
"epoch": 0.996728882047335,
"grad_norm": 0.10713501522499569,
"learning_rate": 4.701824187574965e-05,
"loss": 0.1791,
"mean_token_accuracy": 0.9414562463760376,
"step": 2590
},
{
"epoch": 0.9986530690783144,
"grad_norm": 0.11504187540918072,
"learning_rate": 4.699941380373163e-05,
"loss": 0.1802,
"mean_token_accuracy": 0.940996652841568,
"step": 2595
},
{
"epoch": 1.0003848374061959,
"grad_norm": 0.17475084050053288,
"learning_rate": 4.69805307203752e-05,
"loss": 0.1764,
"mean_token_accuracy": 0.9420868423249986,
"step": 2600
},
{
"epoch": 1.0023090244371753,
"grad_norm": 0.11291991760873382,
"learning_rate": 4.696159267895291e-05,
"loss": 0.169,
"mean_token_accuracy": 0.943702882528305,
"step": 2605
},
{
"epoch": 1.0042332114681547,
"grad_norm": 0.11662597339258252,
"learning_rate": 4.694259973289239e-05,
"loss": 0.1667,
"mean_token_accuracy": 0.9445142686367035,
"step": 2610
},
{
"epoch": 1.006157398499134,
"grad_norm": 0.11242395335687048,
"learning_rate": 4.692355193577612e-05,
"loss": 0.1636,
"mean_token_accuracy": 0.9452146112918853,
"step": 2615
},
{
"epoch": 1.0080815855301135,
"grad_norm": 0.11836124984243517,
"learning_rate": 4.690444934134136e-05,
"loss": 0.1671,
"mean_token_accuracy": 0.9443571090698242,
"step": 2620
},
{
"epoch": 1.010005772561093,
"grad_norm": 0.11951371179098023,
"learning_rate": 4.6885292003479945e-05,
"loss": 0.1646,
"mean_token_accuracy": 0.9452442586421966,
"step": 2625
},
{
"epoch": 1.0119299595920723,
"grad_norm": 0.14562315224241446,
"learning_rate": 4.686607997623816e-05,
"loss": 0.1641,
"mean_token_accuracy": 0.9449779331684113,
"step": 2630
},
{
"epoch": 1.0138541466230517,
"grad_norm": 0.1209319195563948,
"learning_rate": 4.6846813313816555e-05,
"loss": 0.1634,
"mean_token_accuracy": 0.9453990161418915,
"step": 2635
},
{
"epoch": 1.0157783336540311,
"grad_norm": 0.12074689017528342,
"learning_rate": 4.682749207056986e-05,
"loss": 0.1643,
"mean_token_accuracy": 0.9453830003738404,
"step": 2640
},
{
"epoch": 1.0177025206850105,
"grad_norm": 0.11368217084841102,
"learning_rate": 4.680811630100675e-05,
"loss": 0.1649,
"mean_token_accuracy": 0.9448183536529541,
"step": 2645
},
{
"epoch": 1.01962670771599,
"grad_norm": 0.11343676694591881,
"learning_rate": 4.678868605978975e-05,
"loss": 0.1605,
"mean_token_accuracy": 0.9464600622653961,
"step": 2650
},
{
"epoch": 1.0215508947469694,
"grad_norm": 0.11741034993722078,
"learning_rate": 4.676920140173504e-05,
"loss": 0.1658,
"mean_token_accuracy": 0.9449496328830719,
"step": 2655
},
{
"epoch": 1.0234750817779488,
"grad_norm": 0.11961210804780995,
"learning_rate": 4.674966238181234e-05,
"loss": 0.1642,
"mean_token_accuracy": 0.9452983498573303,
"step": 2660
},
{
"epoch": 1.0253992688089282,
"grad_norm": 0.7073890270630259,
"learning_rate": 4.67300690551447e-05,
"loss": 0.1662,
"mean_token_accuracy": 0.9446015000343323,
"step": 2665
},
{
"epoch": 1.0273234558399076,
"grad_norm": 0.11899414259953112,
"learning_rate": 4.671042147700844e-05,
"loss": 0.1668,
"mean_token_accuracy": 0.9445567905902863,
"step": 2670
},
{
"epoch": 1.029247642870887,
"grad_norm": 0.11154402650406485,
"learning_rate": 4.669071970283287e-05,
"loss": 0.1648,
"mean_token_accuracy": 0.9451187491416931,
"step": 2675
},
{
"epoch": 1.0311718299018664,
"grad_norm": 0.11674403117276437,
"learning_rate": 4.667096378820023e-05,
"loss": 0.1658,
"mean_token_accuracy": 0.9446981310844421,
"step": 2680
},
{
"epoch": 1.0330960169328458,
"grad_norm": 0.11233220894814873,
"learning_rate": 4.665115378884549e-05,
"loss": 0.1641,
"mean_token_accuracy": 0.9454202771186828,
"step": 2685
},
{
"epoch": 1.0350202039638252,
"grad_norm": 0.11490664025644873,
"learning_rate": 4.663128976065622e-05,
"loss": 0.1646,
"mean_token_accuracy": 0.9450455904006958,
"step": 2690
},
{
"epoch": 1.0369443909948046,
"grad_norm": 0.11352501568114064,
"learning_rate": 4.661137175967239e-05,
"loss": 0.1673,
"mean_token_accuracy": 0.9439740359783173,
"step": 2695
},
{
"epoch": 1.038868578025784,
"grad_norm": 0.11140206907733896,
"learning_rate": 4.659139984208624e-05,
"loss": 0.1633,
"mean_token_accuracy": 0.945688658952713,
"step": 2700
},
{
"epoch": 1.0407927650567634,
"grad_norm": 0.11939481459237158,
"learning_rate": 4.657137406424214e-05,
"loss": 0.1658,
"mean_token_accuracy": 0.9446808695793152,
"step": 2705
},
{
"epoch": 1.0427169520877428,
"grad_norm": 0.1135153559788723,
"learning_rate": 4.655129448263639e-05,
"loss": 0.1666,
"mean_token_accuracy": 0.944467556476593,
"step": 2710
},
{
"epoch": 1.0446411391187223,
"grad_norm": 0.11014146292873342,
"learning_rate": 4.6531161153917094e-05,
"loss": 0.1654,
"mean_token_accuracy": 0.944754683971405,
"step": 2715
},
{
"epoch": 1.0465653261497017,
"grad_norm": 0.1159482186501732,
"learning_rate": 4.6510974134883964e-05,
"loss": 0.165,
"mean_token_accuracy": 0.9449528694152832,
"step": 2720
},
{
"epoch": 1.048489513180681,
"grad_norm": 0.11779456258476576,
"learning_rate": 4.649073348248821e-05,
"loss": 0.1642,
"mean_token_accuracy": 0.9451721370220184,
"step": 2725
},
{
"epoch": 1.0504137002116605,
"grad_norm": 0.11851875590633976,
"learning_rate": 4.6470439253832316e-05,
"loss": 0.1629,
"mean_token_accuracy": 0.9451947033405304,
"step": 2730
},
{
"epoch": 1.0523378872426399,
"grad_norm": 0.11416263688350742,
"learning_rate": 4.645009150616995e-05,
"loss": 0.164,
"mean_token_accuracy": 0.9453210353851318,
"step": 2735
},
{
"epoch": 1.0542620742736193,
"grad_norm": 0.11153611396218852,
"learning_rate": 4.6429690296905756e-05,
"loss": 0.1652,
"mean_token_accuracy": 0.9449005126953125,
"step": 2740
},
{
"epoch": 1.0561862613045987,
"grad_norm": 0.11896875234146256,
"learning_rate": 4.64092356835952e-05,
"loss": 0.1659,
"mean_token_accuracy": 0.9447975337505341,
"step": 2745
},
{
"epoch": 1.0581104483355783,
"grad_norm": 0.11935152821930041,
"learning_rate": 4.6388727723944395e-05,
"loss": 0.1659,
"mean_token_accuracy": 0.9447598576545715,
"step": 2750
},
{
"epoch": 1.0600346353665577,
"grad_norm": 0.1163982177154289,
"learning_rate": 4.636816647580998e-05,
"loss": 0.1601,
"mean_token_accuracy": 0.9461501777172089,
"step": 2755
},
{
"epoch": 1.0619588223975371,
"grad_norm": 0.21829881993472672,
"learning_rate": 4.6347551997198915e-05,
"loss": 0.1661,
"mean_token_accuracy": 0.9448219954967498,
"step": 2760
},
{
"epoch": 1.0638830094285165,
"grad_norm": 0.11191678688827768,
"learning_rate": 4.632688434626833e-05,
"loss": 0.1682,
"mean_token_accuracy": 0.9440916359424592,
"step": 2765
},
{
"epoch": 1.065807196459496,
"grad_norm": 0.10912851202680238,
"learning_rate": 4.630616358132538e-05,
"loss": 0.1671,
"mean_token_accuracy": 0.944388085603714,
"step": 2770
},
{
"epoch": 1.0677313834904754,
"grad_norm": 0.11325247994043755,
"learning_rate": 4.6285389760827035e-05,
"loss": 0.1603,
"mean_token_accuracy": 0.9464464247226715,
"step": 2775
},
{
"epoch": 1.0696555705214548,
"grad_norm": 0.10949410815060585,
"learning_rate": 4.626456294337999e-05,
"loss": 0.1606,
"mean_token_accuracy": 0.9462958097457885,
"step": 2780
},
{
"epoch": 1.0715797575524342,
"grad_norm": 0.11071407586025536,
"learning_rate": 4.6243683187740414e-05,
"loss": 0.1617,
"mean_token_accuracy": 0.9457931101322175,
"step": 2785
},
{
"epoch": 1.0735039445834136,
"grad_norm": 0.1158591760502722,
"learning_rate": 4.6222750552813834e-05,
"loss": 0.1663,
"mean_token_accuracy": 0.944324654340744,
"step": 2790
},
{
"epoch": 1.075428131614393,
"grad_norm": 0.10880174773529794,
"learning_rate": 4.620176509765496e-05,
"loss": 0.1618,
"mean_token_accuracy": 0.9458514332771302,
"step": 2795
},
{
"epoch": 1.0773523186453724,
"grad_norm": 0.11470522185752449,
"learning_rate": 4.618072688146752e-05,
"loss": 0.1667,
"mean_token_accuracy": 0.9443574488162995,
"step": 2800
},
{
"epoch": 1.0792765056763518,
"grad_norm": 0.1169552366814147,
"learning_rate": 4.615963596360411e-05,
"loss": 0.1656,
"mean_token_accuracy": 0.9445386469364166,
"step": 2805
},
{
"epoch": 1.0812006927073312,
"grad_norm": 0.1146473820257585,
"learning_rate": 4.613849240356595e-05,
"loss": 0.1632,
"mean_token_accuracy": 0.9454758405685425,
"step": 2810
},
{
"epoch": 1.0831248797383106,
"grad_norm": 0.1180821382249639,
"learning_rate": 4.611729626100284e-05,
"loss": 0.1624,
"mean_token_accuracy": 0.9456037998199462,
"step": 2815
},
{
"epoch": 1.08504906676929,
"grad_norm": 0.10806023645742613,
"learning_rate": 4.6096047595712874e-05,
"loss": 0.1664,
"mean_token_accuracy": 0.9445085465908051,
"step": 2820
},
{
"epoch": 1.0869732538002694,
"grad_norm": 0.10893231386169591,
"learning_rate": 4.607474646764236e-05,
"loss": 0.1661,
"mean_token_accuracy": 0.9448854207992554,
"step": 2825
},
{
"epoch": 1.0888974408312488,
"grad_norm": 0.12081177073304383,
"learning_rate": 4.605339293688558e-05,
"loss": 0.1648,
"mean_token_accuracy": 0.944972711801529,
"step": 2830
},
{
"epoch": 1.0908216278622282,
"grad_norm": 0.10938210180068166,
"learning_rate": 4.603198706368468e-05,
"loss": 0.1616,
"mean_token_accuracy": 0.946021032333374,
"step": 2835
},
{
"epoch": 1.0927458148932077,
"grad_norm": 0.10519629631556646,
"learning_rate": 4.6010528908429445e-05,
"loss": 0.1675,
"mean_token_accuracy": 0.9442543506622314,
"step": 2840
},
{
"epoch": 1.094670001924187,
"grad_norm": 0.10324109258642244,
"learning_rate": 4.598901853165719e-05,
"loss": 0.1658,
"mean_token_accuracy": 0.9447111248970032,
"step": 2845
},
{
"epoch": 1.0965941889551665,
"grad_norm": 0.10728210589019434,
"learning_rate": 4.596745599405254e-05,
"loss": 0.1662,
"mean_token_accuracy": 0.9445396065711975,
"step": 2850
},
{
"epoch": 1.0985183759861459,
"grad_norm": 0.11493544596778879,
"learning_rate": 4.5945841356447255e-05,
"loss": 0.1636,
"mean_token_accuracy": 0.9453412830829621,
"step": 2855
},
{
"epoch": 1.1004425630171253,
"grad_norm": 0.10825846359143984,
"learning_rate": 4.5924174679820124e-05,
"loss": 0.1619,
"mean_token_accuracy": 0.945780211687088,
"step": 2860
},
{
"epoch": 1.1023667500481047,
"grad_norm": 0.11159733375533923,
"learning_rate": 4.5902456025296716e-05,
"loss": 0.1658,
"mean_token_accuracy": 0.94489386677742,
"step": 2865
},
{
"epoch": 1.104290937079084,
"grad_norm": 0.12082066111535134,
"learning_rate": 4.588068545414924e-05,
"loss": 0.1695,
"mean_token_accuracy": 0.9434131681919098,
"step": 2870
},
{
"epoch": 1.1062151241100635,
"grad_norm": 0.10837959440293671,
"learning_rate": 4.585886302779637e-05,
"loss": 0.1623,
"mean_token_accuracy": 0.9456695020198822,
"step": 2875
},
{
"epoch": 1.108139311141043,
"grad_norm": 0.10937977096751192,
"learning_rate": 4.5836988807803086e-05,
"loss": 0.1665,
"mean_token_accuracy": 0.9442308902740478,
"step": 2880
},
{
"epoch": 1.1100634981720223,
"grad_norm": 0.1091698135418322,
"learning_rate": 4.581506285588049e-05,
"loss": 0.1636,
"mean_token_accuracy": 0.9454576671123505,
"step": 2885
},
{
"epoch": 1.1119876852030017,
"grad_norm": 0.10725811081270654,
"learning_rate": 4.579308523388559e-05,
"loss": 0.1639,
"mean_token_accuracy": 0.9453332245349884,
"step": 2890
},
{
"epoch": 1.1139118722339811,
"grad_norm": 0.11351347695275504,
"learning_rate": 4.577105600382122e-05,
"loss": 0.1636,
"mean_token_accuracy": 0.9453843176364899,
"step": 2895
},
{
"epoch": 1.1158360592649605,
"grad_norm": 0.1138607424511027,
"learning_rate": 4.574897522783578e-05,
"loss": 0.167,
"mean_token_accuracy": 0.9444127380847931,
"step": 2900
},
{
"epoch": 1.11776024629594,
"grad_norm": 0.1106852790197566,
"learning_rate": 4.572684296822308e-05,
"loss": 0.1691,
"mean_token_accuracy": 0.9439824044704437,
"step": 2905
},
{
"epoch": 1.1196844333269194,
"grad_norm": 0.10667739581669644,
"learning_rate": 4.5704659287422203e-05,
"loss": 0.1664,
"mean_token_accuracy": 0.944707703590393,
"step": 2910
},
{
"epoch": 1.1216086203578988,
"grad_norm": 0.11007892960055661,
"learning_rate": 4.568242424801727e-05,
"loss": 0.1625,
"mean_token_accuracy": 0.9453161656856537,
"step": 2915
},
{
"epoch": 1.1235328073888782,
"grad_norm": 0.1195397398056014,
"learning_rate": 4.566013791273733e-05,
"loss": 0.1648,
"mean_token_accuracy": 0.9449052453041077,
"step": 2920
},
{
"epoch": 1.1254569944198576,
"grad_norm": 0.11078554270766434,
"learning_rate": 4.56378003444561e-05,
"loss": 0.1662,
"mean_token_accuracy": 0.9444914698600769,
"step": 2925
},
{
"epoch": 1.127381181450837,
"grad_norm": 0.1152580424421967,
"learning_rate": 4.561541160619188e-05,
"loss": 0.1636,
"mean_token_accuracy": 0.9451714098453522,
"step": 2930
},
{
"epoch": 1.1293053684818164,
"grad_norm": 0.11601480600361394,
"learning_rate": 4.5592971761107305e-05,
"loss": 0.1688,
"mean_token_accuracy": 0.9436035096645355,
"step": 2935
},
{
"epoch": 1.1312295555127958,
"grad_norm": 0.11043130745236737,
"learning_rate": 4.557048087250919e-05,
"loss": 0.1705,
"mean_token_accuracy": 0.943270879983902,
"step": 2940
},
{
"epoch": 1.1331537425437752,
"grad_norm": 0.11410012718160904,
"learning_rate": 4.5547939003848374e-05,
"loss": 0.1622,
"mean_token_accuracy": 0.9457004249095917,
"step": 2945
},
{
"epoch": 1.1350779295747546,
"grad_norm": 0.14452950245071855,
"learning_rate": 4.5525346218719494e-05,
"loss": 0.1682,
"mean_token_accuracy": 0.944180291891098,
"step": 2950
},
{
"epoch": 1.137002116605734,
"grad_norm": 0.11593464274125287,
"learning_rate": 4.550270258086085e-05,
"loss": 0.1654,
"mean_token_accuracy": 0.9445393085479736,
"step": 2955
},
{
"epoch": 1.1389263036367134,
"grad_norm": 0.10914297851497087,
"learning_rate": 4.548000815415419e-05,
"loss": 0.1655,
"mean_token_accuracy": 0.9448778092861175,
"step": 2960
},
{
"epoch": 1.1408504906676928,
"grad_norm": 0.11799163038725619,
"learning_rate": 4.5457263002624564e-05,
"loss": 0.1652,
"mean_token_accuracy": 0.9448582708835602,
"step": 2965
},
{
"epoch": 1.1427746776986722,
"grad_norm": 0.10917437430822485,
"learning_rate": 4.543446719044011e-05,
"loss": 0.1676,
"mean_token_accuracy": 0.9441717028617859,
"step": 2970
},
{
"epoch": 1.1446988647296517,
"grad_norm": 0.10999293274127507,
"learning_rate": 4.541162078191191e-05,
"loss": 0.1662,
"mean_token_accuracy": 0.9447192370891571,
"step": 2975
},
{
"epoch": 1.146623051760631,
"grad_norm": 0.1213298439573089,
"learning_rate": 4.5388723841493756e-05,
"loss": 0.1642,
"mean_token_accuracy": 0.9453912734985351,
"step": 2980
},
{
"epoch": 1.1485472387916105,
"grad_norm": 0.1120433593367859,
"learning_rate": 4.536577643378203e-05,
"loss": 0.1658,
"mean_token_accuracy": 0.9446643471717835,
"step": 2985
},
{
"epoch": 1.1504714258225899,
"grad_norm": 0.1195181116215837,
"learning_rate": 4.534277862351548e-05,
"loss": 0.1689,
"mean_token_accuracy": 0.9437766313552857,
"step": 2990
},
{
"epoch": 1.1523956128535693,
"grad_norm": 0.10566122117237882,
"learning_rate": 4.531973047557504e-05,
"loss": 0.1651,
"mean_token_accuracy": 0.9449531376361847,
"step": 2995
},
{
"epoch": 1.1543197998845487,
"grad_norm": 0.11045165779517872,
"learning_rate": 4.529663205498367e-05,
"loss": 0.1646,
"mean_token_accuracy": 0.9451044261455536,
"step": 3000
},
{
"epoch": 1.156243986915528,
"grad_norm": 0.10897028680389657,
"learning_rate": 4.5273483426906136e-05,
"loss": 0.1636,
"mean_token_accuracy": 0.9457724034786225,
"step": 3005
},
{
"epoch": 1.1581681739465075,
"grad_norm": 0.11229669750924309,
"learning_rate": 4.525028465664888e-05,
"loss": 0.1667,
"mean_token_accuracy": 0.9444893836975098,
"step": 3010
},
{
"epoch": 1.1600923609774871,
"grad_norm": 0.11598130083724927,
"learning_rate": 4.522703580965979e-05,
"loss": 0.1649,
"mean_token_accuracy": 0.9450352728366852,
"step": 3015
},
{
"epoch": 1.1620165480084665,
"grad_norm": 0.11264247770065546,
"learning_rate": 4.5203736951528015e-05,
"loss": 0.1643,
"mean_token_accuracy": 0.945190292596817,
"step": 3020
},
{
"epoch": 1.163940735039446,
"grad_norm": 0.11105533940678633,
"learning_rate": 4.5180388147983804e-05,
"loss": 0.1667,
"mean_token_accuracy": 0.9444464981555939,
"step": 3025
},
{
"epoch": 1.1658649220704254,
"grad_norm": 0.10819037808707227,
"learning_rate": 4.515698946489833e-05,
"loss": 0.1617,
"mean_token_accuracy": 0.9460033416748047,
"step": 3030
},
{
"epoch": 1.1677891091014048,
"grad_norm": 0.106476221373719,
"learning_rate": 4.513354096828345e-05,
"loss": 0.1642,
"mean_token_accuracy": 0.9452900588512421,
"step": 3035
},
{
"epoch": 1.1697132961323842,
"grad_norm": 0.10565816473934767,
"learning_rate": 4.511004272429158e-05,
"loss": 0.1685,
"mean_token_accuracy": 0.9438742876052857,
"step": 3040
},
{
"epoch": 1.1716374831633636,
"grad_norm": 0.10537825559045179,
"learning_rate": 4.508649479921547e-05,
"loss": 0.1663,
"mean_token_accuracy": 0.9447853684425354,
"step": 3045
},
{
"epoch": 1.173561670194343,
"grad_norm": 0.1458321817709821,
"learning_rate": 4.506289725948805e-05,
"loss": 0.165,
"mean_token_accuracy": 0.945037055015564,
"step": 3050
},
{
"epoch": 1.1754858572253224,
"grad_norm": 0.11225857644127477,
"learning_rate": 4.503925017168219e-05,
"loss": 0.1649,
"mean_token_accuracy": 0.9449495792388916,
"step": 3055
},
{
"epoch": 1.1774100442563018,
"grad_norm": 0.15497296316426867,
"learning_rate": 4.501555360251056e-05,
"loss": 0.1648,
"mean_token_accuracy": 0.9451259493827819,
"step": 3060
},
{
"epoch": 1.1793342312872812,
"grad_norm": 0.10779990682132905,
"learning_rate": 4.499180761882543e-05,
"loss": 0.1644,
"mean_token_accuracy": 0.9450840950012207,
"step": 3065
},
{
"epoch": 1.1812584183182606,
"grad_norm": 0.11208595212873654,
"learning_rate": 4.4968012287618474e-05,
"loss": 0.1672,
"mean_token_accuracy": 0.9442859888076782,
"step": 3070
},
{
"epoch": 1.18318260534924,
"grad_norm": 0.10790877379409972,
"learning_rate": 4.494416767602058e-05,
"loss": 0.1667,
"mean_token_accuracy": 0.9444129526615143,
"step": 3075
},
{
"epoch": 1.1851067923802194,
"grad_norm": 0.13362309151311375,
"learning_rate": 4.492027385130166e-05,
"loss": 0.1654,
"mean_token_accuracy": 0.9450437903404236,
"step": 3080
},
{
"epoch": 1.1870309794111988,
"grad_norm": 0.10548405243633255,
"learning_rate": 4.489633088087049e-05,
"loss": 0.1642,
"mean_token_accuracy": 0.9451897084712982,
"step": 3085
},
{
"epoch": 1.1889551664421782,
"grad_norm": 0.1094571190971919,
"learning_rate": 4.487233883227446e-05,
"loss": 0.1654,
"mean_token_accuracy": 0.9450724065303803,
"step": 3090
},
{
"epoch": 1.1908793534731577,
"grad_norm": 0.11368729549868266,
"learning_rate": 4.4848297773199444e-05,
"loss": 0.1648,
"mean_token_accuracy": 0.9449832141399384,
"step": 3095
},
{
"epoch": 1.192803540504137,
"grad_norm": 0.10633275002345796,
"learning_rate": 4.482420777146958e-05,
"loss": 0.166,
"mean_token_accuracy": 0.9445993602275848,
"step": 3100
},
{
"epoch": 1.1947277275351165,
"grad_norm": 0.10890863944556516,
"learning_rate": 4.480006889504707e-05,
"loss": 0.1652,
"mean_token_accuracy": 0.9447604954242707,
"step": 3105
},
{
"epoch": 1.1966519145660959,
"grad_norm": 0.10605220946366134,
"learning_rate": 4.477588121203201e-05,
"loss": 0.166,
"mean_token_accuracy": 0.9447374463081359,
"step": 3110
},
{
"epoch": 1.1985761015970753,
"grad_norm": 0.1091304337833763,
"learning_rate": 4.475164479066218e-05,
"loss": 0.1655,
"mean_token_accuracy": 0.9448967158794404,
"step": 3115
},
{
"epoch": 1.2005002886280547,
"grad_norm": 0.10964839732163159,
"learning_rate": 4.472735969931287e-05,
"loss": 0.1615,
"mean_token_accuracy": 0.9460000276565552,
"step": 3120
},
{
"epoch": 1.202424475659034,
"grad_norm": 0.1078550524683339,
"learning_rate": 4.470302600649667e-05,
"loss": 0.1632,
"mean_token_accuracy": 0.9453963935375214,
"step": 3125
},
{
"epoch": 1.2043486626900135,
"grad_norm": 0.11060898474209213,
"learning_rate": 4.467864378086329e-05,
"loss": 0.1657,
"mean_token_accuracy": 0.944653457403183,
"step": 3130
},
{
"epoch": 1.206272849720993,
"grad_norm": 0.11874980904115744,
"learning_rate": 4.4654213091199345e-05,
"loss": 0.1663,
"mean_token_accuracy": 0.94460169672966,
"step": 3135
},
{
"epoch": 1.2081970367519723,
"grad_norm": 0.11565952367311678,
"learning_rate": 4.46297340064282e-05,
"loss": 0.1642,
"mean_token_accuracy": 0.9452078878879547,
"step": 3140
},
{
"epoch": 1.2101212237829517,
"grad_norm": 0.1113501248031809,
"learning_rate": 4.460520659560973e-05,
"loss": 0.1652,
"mean_token_accuracy": 0.9448917925357818,
"step": 3145
},
{
"epoch": 1.2120454108139311,
"grad_norm": 0.11584440119707458,
"learning_rate": 4.4580630927940145e-05,
"loss": 0.1618,
"mean_token_accuracy": 0.9459242165088654,
"step": 3150
},
{
"epoch": 1.2139695978449105,
"grad_norm": 0.10654586713838474,
"learning_rate": 4.455600707275181e-05,
"loss": 0.163,
"mean_token_accuracy": 0.9455973863601684,
"step": 3155
},
{
"epoch": 1.21589378487589,
"grad_norm": 0.11309187543030096,
"learning_rate": 4.453133509951304e-05,
"loss": 0.1656,
"mean_token_accuracy": 0.9448260486125946,
"step": 3160
},
{
"epoch": 1.2178179719068694,
"grad_norm": 0.10795367642944186,
"learning_rate": 4.450661507782788e-05,
"loss": 0.1667,
"mean_token_accuracy": 0.9442971289157868,
"step": 3165
},
{
"epoch": 1.2197421589378488,
"grad_norm": 0.11325608172844308,
"learning_rate": 4.448184707743594e-05,
"loss": 0.1641,
"mean_token_accuracy": 0.9452082395553589,
"step": 3170
},
{
"epoch": 1.2216663459688282,
"grad_norm": 0.10770156568934536,
"learning_rate": 4.4457031168212195e-05,
"loss": 0.1665,
"mean_token_accuracy": 0.9443559110164642,
"step": 3175
},
{
"epoch": 1.2235905329998076,
"grad_norm": 0.10345448099059522,
"learning_rate": 4.443216742016675e-05,
"loss": 0.1643,
"mean_token_accuracy": 0.9450966417789459,
"step": 3180
},
{
"epoch": 1.225514720030787,
"grad_norm": 0.10362673775355276,
"learning_rate": 4.440725590344469e-05,
"loss": 0.1655,
"mean_token_accuracy": 0.9448781192302704,
"step": 3185
},
{
"epoch": 1.2274389070617664,
"grad_norm": 0.10715949067340574,
"learning_rate": 4.4382296688325894e-05,
"loss": 0.1628,
"mean_token_accuracy": 0.9453873097896576,
"step": 3190
},
{
"epoch": 1.2293630940927458,
"grad_norm": 0.11161907208207356,
"learning_rate": 4.4357289845224755e-05,
"loss": 0.1625,
"mean_token_accuracy": 0.9458705008029937,
"step": 3195
},
{
"epoch": 1.2312872811237252,
"grad_norm": 0.10746793786362824,
"learning_rate": 4.433223544469006e-05,
"loss": 0.1643,
"mean_token_accuracy": 0.9446219742298126,
"step": 3200
},
{
"epoch": 1.2332114681547046,
"grad_norm": 0.2789190819561189,
"learning_rate": 4.4307133557404754e-05,
"loss": 0.1635,
"mean_token_accuracy": 0.9454685270786285,
"step": 3205
},
{
"epoch": 1.235135655185684,
"grad_norm": 0.11125241310201558,
"learning_rate": 4.428198425418576e-05,
"loss": 0.1638,
"mean_token_accuracy": 0.9451810359954834,
"step": 3210
},
{
"epoch": 1.2370598422166634,
"grad_norm": 0.10590925043833456,
"learning_rate": 4.425678760598377e-05,
"loss": 0.1658,
"mean_token_accuracy": 0.9448261022567749,
"step": 3215
},
{
"epoch": 1.2389840292476428,
"grad_norm": 0.10703980730393392,
"learning_rate": 4.423154368388304e-05,
"loss": 0.1654,
"mean_token_accuracy": 0.9447467207908631,
"step": 3220
},
{
"epoch": 1.2409082162786222,
"grad_norm": 0.11244571794665767,
"learning_rate": 4.42062525591012e-05,
"loss": 0.1651,
"mean_token_accuracy": 0.9450976312160492,
"step": 3225
},
{
"epoch": 1.2428324033096017,
"grad_norm": 0.11057302698238773,
"learning_rate": 4.418091430298903e-05,
"loss": 0.1667,
"mean_token_accuracy": 0.944273978471756,
"step": 3230
},
{
"epoch": 1.244756590340581,
"grad_norm": 0.109162505178306,
"learning_rate": 4.41555289870303e-05,
"loss": 0.1671,
"mean_token_accuracy": 0.9443233251571655,
"step": 3235
},
{
"epoch": 1.2466807773715605,
"grad_norm": 0.11127896226989155,
"learning_rate": 4.413009668284153e-05,
"loss": 0.1643,
"mean_token_accuracy": 0.945149028301239,
"step": 3240
},
{
"epoch": 1.2486049644025399,
"grad_norm": 0.10557729092136117,
"learning_rate": 4.410461746217179e-05,
"loss": 0.1622,
"mean_token_accuracy": 0.9456705510616302,
"step": 3245
},
{
"epoch": 1.2505291514335193,
"grad_norm": 0.11207584287790179,
"learning_rate": 4.407909139690255e-05,
"loss": 0.1656,
"mean_token_accuracy": 0.9445265352725982,
"step": 3250
},
{
"epoch": 1.2524533384644987,
"grad_norm": 0.10876050990252382,
"learning_rate": 4.405351855904739e-05,
"loss": 0.1667,
"mean_token_accuracy": 0.9442014992237091,
"step": 3255
},
{
"epoch": 1.254377525495478,
"grad_norm": 0.1058929093267954,
"learning_rate": 4.402789902075187e-05,
"loss": 0.1668,
"mean_token_accuracy": 0.9443489611148834,
"step": 3260
},
{
"epoch": 1.2563017125264575,
"grad_norm": 0.12172411780115582,
"learning_rate": 4.4002232854293305e-05,
"loss": 0.1661,
"mean_token_accuracy": 0.9444850265979767,
"step": 3265
},
{
"epoch": 1.258225899557437,
"grad_norm": 0.1117116436027642,
"learning_rate": 4.397652013208054e-05,
"loss": 0.1674,
"mean_token_accuracy": 0.9442491412162781,
"step": 3270
},
{
"epoch": 1.2601500865884163,
"grad_norm": 0.11305742675214413,
"learning_rate": 4.395076092665377e-05,
"loss": 0.1663,
"mean_token_accuracy": 0.9447296023368835,
"step": 3275
},
{
"epoch": 1.2620742736193957,
"grad_norm": 0.10524076252473272,
"learning_rate": 4.392495531068433e-05,
"loss": 0.1629,
"mean_token_accuracy": 0.9456575155258179,
"step": 3280
},
{
"epoch": 1.2639984606503751,
"grad_norm": 0.10565547204693512,
"learning_rate": 4.389910335697447e-05,
"loss": 0.1684,
"mean_token_accuracy": 0.9437812447547913,
"step": 3285
},
{
"epoch": 1.2659226476813545,
"grad_norm": 0.5886669314222224,
"learning_rate": 4.3873205138457204e-05,
"loss": 0.1662,
"mean_token_accuracy": 0.9450515389442444,
"step": 3290
},
{
"epoch": 1.267846834712334,
"grad_norm": 0.11366376959578357,
"learning_rate": 4.384726072819602e-05,
"loss": 0.1668,
"mean_token_accuracy": 0.9443962574005127,
"step": 3295
},
{
"epoch": 1.2697710217433134,
"grad_norm": 0.1127714483239427,
"learning_rate": 4.382127019938477e-05,
"loss": 0.1653,
"mean_token_accuracy": 0.9446232855319977,
"step": 3300
},
{
"epoch": 1.2716952087742928,
"grad_norm": 0.10977940299951658,
"learning_rate": 4.379523362534736e-05,
"loss": 0.1638,
"mean_token_accuracy": 0.9452247381210327,
"step": 3305
},
{
"epoch": 1.2736193958052722,
"grad_norm": 0.1043758131726259,
"learning_rate": 4.376915107953767e-05,
"loss": 0.1658,
"mean_token_accuracy": 0.9448330402374268,
"step": 3310
},
{
"epoch": 1.2755435828362516,
"grad_norm": 0.11335117990899052,
"learning_rate": 4.37430226355392e-05,
"loss": 0.1661,
"mean_token_accuracy": 0.9443521559238434,
"step": 3315
},
{
"epoch": 1.277467769867231,
"grad_norm": 0.11887980255705852,
"learning_rate": 4.371684836706497e-05,
"loss": 0.1693,
"mean_token_accuracy": 0.9434103488922119,
"step": 3320
},
{
"epoch": 1.2793919568982104,
"grad_norm": 0.10264830047952361,
"learning_rate": 4.3690628347957294e-05,
"loss": 0.1632,
"mean_token_accuracy": 0.9455609321594238,
"step": 3325
},
{
"epoch": 1.2813161439291898,
"grad_norm": 0.10656476625453781,
"learning_rate": 4.3664362652187544e-05,
"loss": 0.1642,
"mean_token_accuracy": 0.9452636063098907,
"step": 3330
},
{
"epoch": 1.2832403309601692,
"grad_norm": 0.11373328764330298,
"learning_rate": 4.363805135385593e-05,
"loss": 0.1685,
"mean_token_accuracy": 0.9438173949718476,
"step": 3335
},
{
"epoch": 1.2851645179911486,
"grad_norm": 0.1110834865814892,
"learning_rate": 4.361169452719136e-05,
"loss": 0.1661,
"mean_token_accuracy": 0.9445045590400696,
"step": 3340
},
{
"epoch": 1.287088705022128,
"grad_norm": 0.10499389790518351,
"learning_rate": 4.358529224655115e-05,
"loss": 0.1668,
"mean_token_accuracy": 0.9443840861320496,
"step": 3345
},
{
"epoch": 1.2890128920531074,
"grad_norm": 0.10943161523408348,
"learning_rate": 4.355884458642085e-05,
"loss": 0.1657,
"mean_token_accuracy": 0.9448311269283295,
"step": 3350
},
{
"epoch": 1.2909370790840868,
"grad_norm": 0.11241992154282385,
"learning_rate": 4.3532351621414076e-05,
"loss": 0.1656,
"mean_token_accuracy": 0.944890421628952,
"step": 3355
},
{
"epoch": 1.2928612661150665,
"grad_norm": 0.12204214677028828,
"learning_rate": 4.3505813426272206e-05,
"loss": 0.1644,
"mean_token_accuracy": 0.9451867461204528,
"step": 3360
},
{
"epoch": 1.2947854531460459,
"grad_norm": 0.11051458573708776,
"learning_rate": 4.347923007586424e-05,
"loss": 0.1604,
"mean_token_accuracy": 0.9464590072631835,
"step": 3365
},
{
"epoch": 1.2967096401770253,
"grad_norm": 0.10588729835633466,
"learning_rate": 4.3452601645186576e-05,
"loss": 0.1617,
"mean_token_accuracy": 0.9458398222923279,
"step": 3370
},
{
"epoch": 1.2986338272080047,
"grad_norm": 0.1129626094027341,
"learning_rate": 4.3425928209362784e-05,
"loss": 0.1676,
"mean_token_accuracy": 0.9443058788776397,
"step": 3375
},
{
"epoch": 1.300558014238984,
"grad_norm": 0.11026036715010902,
"learning_rate": 4.339920984364341e-05,
"loss": 0.1656,
"mean_token_accuracy": 0.9446721136569977,
"step": 3380
},
{
"epoch": 1.3024822012699635,
"grad_norm": 0.10431970395231525,
"learning_rate": 4.337244662340574e-05,
"loss": 0.1671,
"mean_token_accuracy": 0.9446083605289459,
"step": 3385
},
{
"epoch": 1.304406388300943,
"grad_norm": 0.10863163610480198,
"learning_rate": 4.334563862415361e-05,
"loss": 0.1644,
"mean_token_accuracy": 0.9451094448566437,
"step": 3390
},
{
"epoch": 1.3063305753319223,
"grad_norm": 0.10344587307267852,
"learning_rate": 4.33187859215172e-05,
"loss": 0.1616,
"mean_token_accuracy": 0.9459316253662109,
"step": 3395
},
{
"epoch": 1.3082547623629017,
"grad_norm": 0.10329769224219561,
"learning_rate": 4.3291888591252774e-05,
"loss": 0.1681,
"mean_token_accuracy": 0.9443010270595551,
"step": 3400
},
{
"epoch": 1.3101789493938811,
"grad_norm": 0.10573120313146973,
"learning_rate": 4.326494670924254e-05,
"loss": 0.1622,
"mean_token_accuracy": 0.9456990361213684,
"step": 3405
},
{
"epoch": 1.3121031364248605,
"grad_norm": 0.10999181599566926,
"learning_rate": 4.323796035149435e-05,
"loss": 0.1634,
"mean_token_accuracy": 0.9456778645515442,
"step": 3410
},
{
"epoch": 1.31402732345584,
"grad_norm": 0.11732350200322827,
"learning_rate": 4.321092959414157e-05,
"loss": 0.1664,
"mean_token_accuracy": 0.9445131242275238,
"step": 3415
},
{
"epoch": 1.3159515104868194,
"grad_norm": 0.11232867292678717,
"learning_rate": 4.318385451344278e-05,
"loss": 0.164,
"mean_token_accuracy": 0.945280921459198,
"step": 3420
},
{
"epoch": 1.3178756975177988,
"grad_norm": 0.10617374846493216,
"learning_rate": 4.315673518578167e-05,
"loss": 0.1615,
"mean_token_accuracy": 0.9459102272987365,
"step": 3425
},
{
"epoch": 1.3197998845487782,
"grad_norm": 0.10896252285505933,
"learning_rate": 4.312957168766669e-05,
"loss": 0.1621,
"mean_token_accuracy": 0.9459147036075592,
"step": 3430
},
{
"epoch": 1.3217240715797576,
"grad_norm": 0.11137213099006038,
"learning_rate": 4.310236409573095e-05,
"loss": 0.1653,
"mean_token_accuracy": 0.9451059341430664,
"step": 3435
},
{
"epoch": 1.323648258610737,
"grad_norm": 0.10832084844435444,
"learning_rate": 4.307511248673193e-05,
"loss": 0.1661,
"mean_token_accuracy": 0.9447243928909301,
"step": 3440
},
{
"epoch": 1.3255724456417164,
"grad_norm": 0.11469692641777185,
"learning_rate": 4.30478169375513e-05,
"loss": 0.1681,
"mean_token_accuracy": 0.9439410865306854,
"step": 3445
},
{
"epoch": 1.3274966326726958,
"grad_norm": 0.10691215801933739,
"learning_rate": 4.30204775251947e-05,
"loss": 0.165,
"mean_token_accuracy": 0.9449956476688385,
"step": 3450
},
{
"epoch": 1.3294208197036752,
"grad_norm": 0.11825050155634374,
"learning_rate": 4.2993094326791495e-05,
"loss": 0.1691,
"mean_token_accuracy": 0.9436154067516327,
"step": 3455
},
{
"epoch": 1.3313450067346546,
"grad_norm": 0.09976872054833154,
"learning_rate": 4.296566741959461e-05,
"loss": 0.1637,
"mean_token_accuracy": 0.9454048812389374,
"step": 3460
},
{
"epoch": 1.333269193765634,
"grad_norm": 0.10977843829733427,
"learning_rate": 4.293819688098024e-05,
"loss": 0.163,
"mean_token_accuracy": 0.94544877409935,
"step": 3465
},
{
"epoch": 1.3351933807966134,
"grad_norm": 0.10388477951978371,
"learning_rate": 4.291068278844771e-05,
"loss": 0.1654,
"mean_token_accuracy": 0.9446780025959015,
"step": 3470
},
{
"epoch": 1.3371175678275928,
"grad_norm": 0.11193340370564675,
"learning_rate": 4.288312521961919e-05,
"loss": 0.1656,
"mean_token_accuracy": 0.9452359676361084,
"step": 3475
},
{
"epoch": 1.3390417548585722,
"grad_norm": 0.10573875704379705,
"learning_rate": 4.285552425223955e-05,
"loss": 0.1639,
"mean_token_accuracy": 0.9454302430152893,
"step": 3480
},
{
"epoch": 1.3409659418895516,
"grad_norm": 0.10442435507308796,
"learning_rate": 4.282787996417601e-05,
"loss": 0.1703,
"mean_token_accuracy": 0.9433972775936127,
"step": 3485
},
{
"epoch": 1.342890128920531,
"grad_norm": 0.10178493746278647,
"learning_rate": 4.2800192433418094e-05,
"loss": 0.1632,
"mean_token_accuracy": 0.9455658197402954,
"step": 3490
},
{
"epoch": 1.3448143159515105,
"grad_norm": 0.10658339710742912,
"learning_rate": 4.2772461738077274e-05,
"loss": 0.166,
"mean_token_accuracy": 0.9446894705295563,
"step": 3495
},
{
"epoch": 1.3467385029824899,
"grad_norm": 0.11283900491670172,
"learning_rate": 4.274468795638681e-05,
"loss": 0.1633,
"mean_token_accuracy": 0.9456519961357117,
"step": 3500
},
{
"epoch": 1.3486626900134693,
"grad_norm": 0.10214896741095081,
"learning_rate": 4.271687116670151e-05,
"loss": 0.1651,
"mean_token_accuracy": 0.9451316177845002,
"step": 3505
},
{
"epoch": 1.3505868770444487,
"grad_norm": 0.10559833937537223,
"learning_rate": 4.268901144749753e-05,
"loss": 0.165,
"mean_token_accuracy": 0.9450106203556061,
"step": 3510
},
{
"epoch": 1.352511064075428,
"grad_norm": 0.10894574541369677,
"learning_rate": 4.26611088773721e-05,
"loss": 0.1664,
"mean_token_accuracy": 0.9443780064582825,
"step": 3515
},
{
"epoch": 1.3544352511064075,
"grad_norm": 0.1152130055560103,
"learning_rate": 4.263316353504341e-05,
"loss": 0.1647,
"mean_token_accuracy": 0.9447641968727112,
"step": 3520
},
{
"epoch": 1.356359438137387,
"grad_norm": 0.11157843122921446,
"learning_rate": 4.260517549935024e-05,
"loss": 0.1656,
"mean_token_accuracy": 0.9448266983032226,
"step": 3525
},
{
"epoch": 1.3582836251683663,
"grad_norm": 0.10694050714172938,
"learning_rate": 4.257714484925185e-05,
"loss": 0.1672,
"mean_token_accuracy": 0.9442776262760162,
"step": 3530
},
{
"epoch": 1.3602078121993457,
"grad_norm": 0.10508952012523275,
"learning_rate": 4.254907166382775e-05,
"loss": 0.1658,
"mean_token_accuracy": 0.9448453724384308,
"step": 3535
},
{
"epoch": 1.3621319992303251,
"grad_norm": 0.10804026722792796,
"learning_rate": 4.2520956022277415e-05,
"loss": 0.166,
"mean_token_accuracy": 0.9450315535068512,
"step": 3540
},
{
"epoch": 1.3640561862613045,
"grad_norm": 0.10584159448655887,
"learning_rate": 4.249279800392009e-05,
"loss": 0.1617,
"mean_token_accuracy": 0.946070384979248,
"step": 3545
},
{
"epoch": 1.365980373292284,
"grad_norm": 0.10247054811830368,
"learning_rate": 4.24645976881946e-05,
"loss": 0.1632,
"mean_token_accuracy": 0.9454725086688995,
"step": 3550
},
{
"epoch": 1.3679045603232634,
"grad_norm": 0.10303603825443779,
"learning_rate": 4.2436355154659085e-05,
"loss": 0.1628,
"mean_token_accuracy": 0.9457809746265411,
"step": 3555
},
{
"epoch": 1.3698287473542428,
"grad_norm": 0.10231812479939655,
"learning_rate": 4.240807048299079e-05,
"loss": 0.166,
"mean_token_accuracy": 0.9446156203746796,
"step": 3560
},
{
"epoch": 1.3717529343852222,
"grad_norm": 0.09923793110107657,
"learning_rate": 4.237974375298584e-05,
"loss": 0.1643,
"mean_token_accuracy": 0.94522123336792,
"step": 3565
},
{
"epoch": 1.3736771214162016,
"grad_norm": 0.10193644008290698,
"learning_rate": 4.2351375044558996e-05,
"loss": 0.1645,
"mean_token_accuracy": 0.9449735343456268,
"step": 3570
},
{
"epoch": 1.3756013084471812,
"grad_norm": 0.10993227566346095,
"learning_rate": 4.232296443774349e-05,
"loss": 0.1659,
"mean_token_accuracy": 0.9448629319667816,
"step": 3575
},
{
"epoch": 1.3775254954781606,
"grad_norm": 0.10305085529258905,
"learning_rate": 4.229451201269072e-05,
"loss": 0.1641,
"mean_token_accuracy": 0.9453402578830719,
"step": 3580
},
{
"epoch": 1.37944968250914,
"grad_norm": 0.10694140393689279,
"learning_rate": 4.226601784967008e-05,
"loss": 0.1656,
"mean_token_accuracy": 0.944640851020813,
"step": 3585
},
{
"epoch": 1.3813738695401194,
"grad_norm": 0.10315714341865034,
"learning_rate": 4.223748202906869e-05,
"loss": 0.1644,
"mean_token_accuracy": 0.9451101958751679,
"step": 3590
},
{
"epoch": 1.3832980565710988,
"grad_norm": 0.10596272263919713,
"learning_rate": 4.220890463139122e-05,
"loss": 0.1647,
"mean_token_accuracy": 0.9449066698551178,
"step": 3595
},
{
"epoch": 1.3852222436020782,
"grad_norm": 0.11068312840643807,
"learning_rate": 4.218028573725963e-05,
"loss": 0.1643,
"mean_token_accuracy": 0.9452288806438446,
"step": 3600
},
{
"epoch": 1.3871464306330576,
"grad_norm": 0.11399898827725882,
"learning_rate": 4.215162542741295e-05,
"loss": 0.1629,
"mean_token_accuracy": 0.9458402037620545,
"step": 3605
},
{
"epoch": 1.389070617664037,
"grad_norm": 0.10629897967961681,
"learning_rate": 4.2122923782707035e-05,
"loss": 0.1647,
"mean_token_accuracy": 0.9450836300849914,
"step": 3610
},
{
"epoch": 1.3909948046950165,
"grad_norm": 0.11877802521782936,
"learning_rate": 4.2094180884114375e-05,
"loss": 0.1654,
"mean_token_accuracy": 0.9447278261184693,
"step": 3615
},
{
"epoch": 1.3929189917259959,
"grad_norm": 0.10567173738423105,
"learning_rate": 4.206539681272382e-05,
"loss": 0.165,
"mean_token_accuracy": 0.9448735773563385,
"step": 3620
},
{
"epoch": 1.3948431787569753,
"grad_norm": 0.10352666517341502,
"learning_rate": 4.2036571649740404e-05,
"loss": 0.1648,
"mean_token_accuracy": 0.9451052606105804,
"step": 3625
},
{
"epoch": 1.3967673657879547,
"grad_norm": 0.11357510290183502,
"learning_rate": 4.2007705476485064e-05,
"loss": 0.1654,
"mean_token_accuracy": 0.9447233974933624,
"step": 3630
},
{
"epoch": 1.398691552818934,
"grad_norm": 0.11195536430362973,
"learning_rate": 4.197879837439446e-05,
"loss": 0.162,
"mean_token_accuracy": 0.9459154546260834,
"step": 3635
},
{
"epoch": 1.4006157398499135,
"grad_norm": 0.10415607753076003,
"learning_rate": 4.194985042502069e-05,
"loss": 0.1625,
"mean_token_accuracy": 0.9454761385917664,
"step": 3640
},
{
"epoch": 1.402539926880893,
"grad_norm": 0.10766512506284352,
"learning_rate": 4.1920861710031094e-05,
"loss": 0.1638,
"mean_token_accuracy": 0.9452585756778717,
"step": 3645
},
{
"epoch": 1.4044641139118723,
"grad_norm": 0.10856833595067805,
"learning_rate": 4.1891832311208055e-05,
"loss": 0.1681,
"mean_token_accuracy": 0.9444329977035523,
"step": 3650
},
{
"epoch": 1.4063883009428517,
"grad_norm": 0.10904305554437421,
"learning_rate": 4.1862762310448686e-05,
"loss": 0.1657,
"mean_token_accuracy": 0.9446165263652802,
"step": 3655
},
{
"epoch": 1.4083124879738311,
"grad_norm": 0.10282925284209865,
"learning_rate": 4.1833651789764675e-05,
"loss": 0.1631,
"mean_token_accuracy": 0.9455777108669281,
"step": 3660
},
{
"epoch": 1.4102366750048105,
"grad_norm": 0.10235844426693394,
"learning_rate": 4.1804500831282006e-05,
"loss": 0.1605,
"mean_token_accuracy": 0.9464917063713074,
"step": 3665
},
{
"epoch": 1.41216086203579,
"grad_norm": 0.10763382432644855,
"learning_rate": 4.177530951724076e-05,
"loss": 0.1637,
"mean_token_accuracy": 0.9452625811100006,
"step": 3670
},
{
"epoch": 1.4140850490667694,
"grad_norm": 0.10239862565132175,
"learning_rate": 4.1746077929994865e-05,
"loss": 0.1644,
"mean_token_accuracy": 0.9451430201530456,
"step": 3675
},
{
"epoch": 1.4160092360977488,
"grad_norm": 0.1019056980175764,
"learning_rate": 4.171680615201185e-05,
"loss": 0.1645,
"mean_token_accuracy": 0.9452406644821167,
"step": 3680
},
{
"epoch": 1.4179334231287282,
"grad_norm": 0.10649424349147088,
"learning_rate": 4.168749426587265e-05,
"loss": 0.1636,
"mean_token_accuracy": 0.9451565325260163,
"step": 3685
},
{
"epoch": 1.4198576101597076,
"grad_norm": 0.10680893725233677,
"learning_rate": 4.165814235427135e-05,
"loss": 0.1604,
"mean_token_accuracy": 0.9464034974575043,
"step": 3690
},
{
"epoch": 1.421781797190687,
"grad_norm": 0.10238376427445661,
"learning_rate": 4.1628750500014947e-05,
"loss": 0.1636,
"mean_token_accuracy": 0.94556645154953,
"step": 3695
},
{
"epoch": 1.4237059842216664,
"grad_norm": 0.10632577681067389,
"learning_rate": 4.159931878602312e-05,
"loss": 0.1602,
"mean_token_accuracy": 0.9463468253612518,
"step": 3700
},
{
"epoch": 1.4256301712526458,
"grad_norm": 0.1066470110654814,
"learning_rate": 4.1569847295328e-05,
"loss": 0.1623,
"mean_token_accuracy": 0.9456010282039642,
"step": 3705
},
{
"epoch": 1.4275543582836252,
"grad_norm": 0.10638056425969808,
"learning_rate": 4.1540336111073956e-05,
"loss": 0.1628,
"mean_token_accuracy": 0.9455960631370545,
"step": 3710
},
{
"epoch": 1.4294785453146046,
"grad_norm": 0.10984777816492315,
"learning_rate": 4.151078531651733e-05,
"loss": 0.1647,
"mean_token_accuracy": 0.9452995538711548,
"step": 3715
},
{
"epoch": 1.431402732345584,
"grad_norm": 0.10653537135718445,
"learning_rate": 4.148119499502617e-05,
"loss": 0.1657,
"mean_token_accuracy": 0.9448102474212646,
"step": 3720
},
{
"epoch": 1.4333269193765634,
"grad_norm": 0.10140762916952242,
"learning_rate": 4.1451565230080114e-05,
"loss": 0.1641,
"mean_token_accuracy": 0.9449801802635193,
"step": 3725
},
{
"epoch": 1.4352511064075428,
"grad_norm": 0.10708663553306287,
"learning_rate": 4.142189610527e-05,
"loss": 0.1632,
"mean_token_accuracy": 0.9456826210021972,
"step": 3730
},
{
"epoch": 1.4371752934385222,
"grad_norm": 0.10793078077394648,
"learning_rate": 4.139218770429776e-05,
"loss": 0.1667,
"mean_token_accuracy": 0.9444868803024292,
"step": 3735
},
{
"epoch": 1.4390994804695016,
"grad_norm": 0.10309565815294483,
"learning_rate": 4.136244011097612e-05,
"loss": 0.1626,
"mean_token_accuracy": 0.9456857860088348,
"step": 3740
},
{
"epoch": 1.441023667500481,
"grad_norm": 0.10410978514797928,
"learning_rate": 4.133265340922836e-05,
"loss": 0.1612,
"mean_token_accuracy": 0.9461513102054596,
"step": 3745
},
{
"epoch": 1.4429478545314605,
"grad_norm": 0.12668018886073118,
"learning_rate": 4.130282768308809e-05,
"loss": 0.1662,
"mean_token_accuracy": 0.9444233894348144,
"step": 3750
},
{
"epoch": 1.4448720415624399,
"grad_norm": 0.1052856451043635,
"learning_rate": 4.127296301669903e-05,
"loss": 0.1645,
"mean_token_accuracy": 0.9449902892112731,
"step": 3755
},
{
"epoch": 1.4467962285934193,
"grad_norm": 0.10299103372921697,
"learning_rate": 4.124305949431477e-05,
"loss": 0.1643,
"mean_token_accuracy": 0.9454239845275879,
"step": 3760
},
{
"epoch": 1.4487204156243987,
"grad_norm": 0.10222015678259523,
"learning_rate": 4.121311720029848e-05,
"loss": 0.1629,
"mean_token_accuracy": 0.9456360638141632,
"step": 3765
},
{
"epoch": 1.450644602655378,
"grad_norm": 0.10718052740518302,
"learning_rate": 4.118313621912275e-05,
"loss": 0.165,
"mean_token_accuracy": 0.9451362669467926,
"step": 3770
},
{
"epoch": 1.4525687896863575,
"grad_norm": 0.10677535074707435,
"learning_rate": 4.115311663536928e-05,
"loss": 0.1648,
"mean_token_accuracy": 0.9452996253967285,
"step": 3775
},
{
"epoch": 1.454492976717337,
"grad_norm": 0.11089059502709091,
"learning_rate": 4.112305853372871e-05,
"loss": 0.1658,
"mean_token_accuracy": 0.9444291234016419,
"step": 3780
},
{
"epoch": 1.4564171637483163,
"grad_norm": 0.11249418555034482,
"learning_rate": 4.109296199900031e-05,
"loss": 0.163,
"mean_token_accuracy": 0.9453545689582825,
"step": 3785
},
{
"epoch": 1.4583413507792957,
"grad_norm": 0.10298839043242691,
"learning_rate": 4.1062827116091805e-05,
"loss": 0.1638,
"mean_token_accuracy": 0.9453439474105835,
"step": 3790
},
{
"epoch": 1.4602655378102751,
"grad_norm": 0.10368991422762244,
"learning_rate": 4.1032653970019105e-05,
"loss": 0.1645,
"mean_token_accuracy": 0.9450232863426209,
"step": 3795
},
{
"epoch": 1.4621897248412545,
"grad_norm": 0.1013222632737639,
"learning_rate": 4.100244264590604e-05,
"loss": 0.1629,
"mean_token_accuracy": 0.945792269706726,
"step": 3800
},
{
"epoch": 1.464113911872234,
"grad_norm": 0.11295443317373963,
"learning_rate": 4.097219322898417e-05,
"loss": 0.1605,
"mean_token_accuracy": 0.9462812006473541,
"step": 3805
},
{
"epoch": 1.4660380989032133,
"grad_norm": 0.10843179532916145,
"learning_rate": 4.0941905804592526e-05,
"loss": 0.1645,
"mean_token_accuracy": 0.9450453817844391,
"step": 3810
},
{
"epoch": 1.4679622859341928,
"grad_norm": 0.10545222962017423,
"learning_rate": 4.091158045817735e-05,
"loss": 0.1654,
"mean_token_accuracy": 0.9447169959545135,
"step": 3815
},
{
"epoch": 1.4698864729651722,
"grad_norm": 0.10689442875428007,
"learning_rate": 4.088121727529188e-05,
"loss": 0.1631,
"mean_token_accuracy": 0.9454691410064697,
"step": 3820
},
{
"epoch": 1.4718106599961516,
"grad_norm": 0.10976362275608818,
"learning_rate": 4.0850816341596084e-05,
"loss": 0.165,
"mean_token_accuracy": 0.9450465559959411,
"step": 3825
},
{
"epoch": 1.473734847027131,
"grad_norm": 0.10317032616560282,
"learning_rate": 4.082037774285645e-05,
"loss": 0.1609,
"mean_token_accuracy": 0.9461302757263184,
"step": 3830
},
{
"epoch": 1.4756590340581104,
"grad_norm": 0.10571152503983529,
"learning_rate": 4.0789901564945704e-05,
"loss": 0.1635,
"mean_token_accuracy": 0.945213770866394,
"step": 3835
},
{
"epoch": 1.4775832210890898,
"grad_norm": 0.10519775368401885,
"learning_rate": 4.075938789384262e-05,
"loss": 0.1649,
"mean_token_accuracy": 0.9450217008590698,
"step": 3840
},
{
"epoch": 1.4795074081200692,
"grad_norm": 0.10585444591489851,
"learning_rate": 4.072883681563171e-05,
"loss": 0.1624,
"mean_token_accuracy": 0.9457007706165313,
"step": 3845
},
{
"epoch": 1.4814315951510486,
"grad_norm": 0.11925588297419709,
"learning_rate": 4.069824841650304e-05,
"loss": 0.1645,
"mean_token_accuracy": 0.9450950980186462,
"step": 3850
},
{
"epoch": 1.483355782182028,
"grad_norm": 0.10791072692381821,
"learning_rate": 4.0667622782751986e-05,
"loss": 0.1631,
"mean_token_accuracy": 0.9458039939403534,
"step": 3855
},
{
"epoch": 1.4852799692130074,
"grad_norm": 0.1032554985834212,
"learning_rate": 4.0636960000778906e-05,
"loss": 0.162,
"mean_token_accuracy": 0.945751404762268,
"step": 3860
},
{
"epoch": 1.4872041562439868,
"grad_norm": 0.10170527823886044,
"learning_rate": 4.060626015708903e-05,
"loss": 0.1659,
"mean_token_accuracy": 0.9446871399879455,
"step": 3865
},
{
"epoch": 1.4891283432749662,
"grad_norm": 0.10902366153161942,
"learning_rate": 4.057552333829211e-05,
"loss": 0.1641,
"mean_token_accuracy": 0.945387089252472,
"step": 3870
},
{
"epoch": 1.4910525303059456,
"grad_norm": 0.10938961670814928,
"learning_rate": 4.0544749631102205e-05,
"loss": 0.1653,
"mean_token_accuracy": 0.944713294506073,
"step": 3875
},
{
"epoch": 1.492976717336925,
"grad_norm": 0.10089328025560834,
"learning_rate": 4.0513939122337455e-05,
"loss": 0.1631,
"mean_token_accuracy": 0.9452538788318634,
"step": 3880
},
{
"epoch": 1.4949009043679045,
"grad_norm": 0.10384666680800574,
"learning_rate": 4.048309189891984e-05,
"loss": 0.1626,
"mean_token_accuracy": 0.9458374917507172,
"step": 3885
},
{
"epoch": 1.4968250913988839,
"grad_norm": 0.10536014803015964,
"learning_rate": 4.045220804787487e-05,
"loss": 0.1616,
"mean_token_accuracy": 0.9458865523338318,
"step": 3890
},
{
"epoch": 1.4987492784298633,
"grad_norm": 0.1086882198235937,
"learning_rate": 4.042128765633146e-05,
"loss": 0.1657,
"mean_token_accuracy": 0.9448476135730743,
"step": 3895
},
{
"epoch": 1.5006734654608427,
"grad_norm": 0.10352972906796397,
"learning_rate": 4.0390330811521546e-05,
"loss": 0.1627,
"mean_token_accuracy": 0.9454161047935485,
"step": 3900
},
{
"epoch": 1.502597652491822,
"grad_norm": 0.10540351615679729,
"learning_rate": 4.035933760077992e-05,
"loss": 0.1659,
"mean_token_accuracy": 0.944770210981369,
"step": 3905
},
{
"epoch": 1.5045218395228015,
"grad_norm": 0.10305963532964829,
"learning_rate": 4.0328308111544014e-05,
"loss": 0.1623,
"mean_token_accuracy": 0.9457070767879486,
"step": 3910
},
{
"epoch": 1.506446026553781,
"grad_norm": 0.11035442755861583,
"learning_rate": 4.029724243135355e-05,
"loss": 0.1647,
"mean_token_accuracy": 0.9449433743953705,
"step": 3915
},
{
"epoch": 1.5083702135847603,
"grad_norm": 0.11050723665714139,
"learning_rate": 4.026614064785038e-05,
"loss": 0.1633,
"mean_token_accuracy": 0.9456405460834503,
"step": 3920
},
{
"epoch": 1.5102944006157397,
"grad_norm": 0.11786552783526551,
"learning_rate": 4.023500284877822e-05,
"loss": 0.1622,
"mean_token_accuracy": 0.9459376811981202,
"step": 3925
},
{
"epoch": 1.5122185876467191,
"grad_norm": 0.10870625983906573,
"learning_rate": 4.020382912198235e-05,
"loss": 0.1619,
"mean_token_accuracy": 0.945826131105423,
"step": 3930
},
{
"epoch": 1.5141427746776985,
"grad_norm": 0.10948135264308353,
"learning_rate": 4.017261955540945e-05,
"loss": 0.1599,
"mean_token_accuracy": 0.9464793860912323,
"step": 3935
},
{
"epoch": 1.516066961708678,
"grad_norm": 0.10162679882740341,
"learning_rate": 4.01413742371073e-05,
"loss": 0.1634,
"mean_token_accuracy": 0.945406460762024,
"step": 3940
},
{
"epoch": 1.5179911487396573,
"grad_norm": 0.10521806512733063,
"learning_rate": 4.0110093255224534e-05,
"loss": 0.1646,
"mean_token_accuracy": 0.9450480759143829,
"step": 3945
},
{
"epoch": 1.5199153357706368,
"grad_norm": 0.10833725994973822,
"learning_rate": 4.00787766980104e-05,
"loss": 0.1644,
"mean_token_accuracy": 0.9450052797794342,
"step": 3950
},
{
"epoch": 1.5218395228016162,
"grad_norm": 0.109814598470446,
"learning_rate": 4.004742465381454e-05,
"loss": 0.1594,
"mean_token_accuracy": 0.9465232491493225,
"step": 3955
},
{
"epoch": 1.5237637098325956,
"grad_norm": 0.10694664255943216,
"learning_rate": 4.001603721108665e-05,
"loss": 0.1602,
"mean_token_accuracy": 0.9463098287582398,
"step": 3960
},
{
"epoch": 1.525687896863575,
"grad_norm": 0.10263512740022389,
"learning_rate": 3.998461445837634e-05,
"loss": 0.1649,
"mean_token_accuracy": 0.9454042196273804,
"step": 3965
},
{
"epoch": 1.5276120838945544,
"grad_norm": 0.10032118708257971,
"learning_rate": 3.995315648433283e-05,
"loss": 0.1622,
"mean_token_accuracy": 0.9457547128200531,
"step": 3970
},
{
"epoch": 1.529536270925534,
"grad_norm": 0.10523384809000864,
"learning_rate": 3.992166337770469e-05,
"loss": 0.166,
"mean_token_accuracy": 0.9447933554649353,
"step": 3975
},
{
"epoch": 1.5314604579565134,
"grad_norm": 0.10296416037816417,
"learning_rate": 3.989013522733961e-05,
"loss": 0.1622,
"mean_token_accuracy": 0.9456773638725281,
"step": 3980
},
{
"epoch": 1.5333846449874928,
"grad_norm": 0.10534602947819298,
"learning_rate": 3.9858572122184165e-05,
"loss": 0.1633,
"mean_token_accuracy": 0.945252388715744,
"step": 3985
},
{
"epoch": 1.5353088320184722,
"grad_norm": 0.10465920572337489,
"learning_rate": 3.982697415128352e-05,
"loss": 0.1635,
"mean_token_accuracy": 0.945354574918747,
"step": 3990
},
{
"epoch": 1.5372330190494516,
"grad_norm": 0.098211960325333,
"learning_rate": 3.97953414037812e-05,
"loss": 0.1619,
"mean_token_accuracy": 0.9457767367362976,
"step": 3995
},
{
"epoch": 1.539157206080431,
"grad_norm": 0.10124489861736853,
"learning_rate": 3.976367396891887e-05,
"loss": 0.1616,
"mean_token_accuracy": 0.9456769168376923,
"step": 4000
},
{
"epoch": 1.5410813931114105,
"grad_norm": 0.10314687402688161,
"learning_rate": 3.9731971936036004e-05,
"loss": 0.1614,
"mean_token_accuracy": 0.9460195481777192,
"step": 4005
},
{
"epoch": 1.5430055801423899,
"grad_norm": 0.10177769628276215,
"learning_rate": 3.970023539456974e-05,
"loss": 0.1623,
"mean_token_accuracy": 0.9456782817840577,
"step": 4010
},
{
"epoch": 1.5449297671733693,
"grad_norm": 0.09796056102911006,
"learning_rate": 3.966846443405455e-05,
"loss": 0.1656,
"mean_token_accuracy": 0.9447887420654297,
"step": 4015
},
{
"epoch": 1.5468539542043487,
"grad_norm": 0.10469377145655043,
"learning_rate": 3.963665914412197e-05,
"loss": 0.1585,
"mean_token_accuracy": 0.9467072010040283,
"step": 4020
},
{
"epoch": 1.548778141235328,
"grad_norm": 0.1006826335320941,
"learning_rate": 3.960481961450045e-05,
"loss": 0.1612,
"mean_token_accuracy": 0.9459984958171844,
"step": 4025
},
{
"epoch": 1.5507023282663075,
"grad_norm": 0.10467059451473193,
"learning_rate": 3.9572945935014996e-05,
"loss": 0.1635,
"mean_token_accuracy": 0.9454215824604034,
"step": 4030
},
{
"epoch": 1.552626515297287,
"grad_norm": 0.10041987447318289,
"learning_rate": 3.954103819558697e-05,
"loss": 0.1624,
"mean_token_accuracy": 0.9457914710044861,
"step": 4035
},
{
"epoch": 1.5545507023282663,
"grad_norm": 0.10089989409325976,
"learning_rate": 3.95090964862338e-05,
"loss": 0.1624,
"mean_token_accuracy": 0.9456892788410187,
"step": 4040
},
{
"epoch": 1.5564748893592457,
"grad_norm": 0.10159003859464948,
"learning_rate": 3.947712089706879e-05,
"loss": 0.1634,
"mean_token_accuracy": 0.9456854522228241,
"step": 4045
},
{
"epoch": 1.5583990763902251,
"grad_norm": 0.10090677861345802,
"learning_rate": 3.9445111518300805e-05,
"loss": 0.1634,
"mean_token_accuracy": 0.9454401433467865,
"step": 4050
},
{
"epoch": 1.5603232634212045,
"grad_norm": 0.09959007302395402,
"learning_rate": 3.941306844023402e-05,
"loss": 0.1617,
"mean_token_accuracy": 0.9458894848823547,
"step": 4055
},
{
"epoch": 1.562247450452184,
"grad_norm": 0.10522278792323339,
"learning_rate": 3.9380991753267704e-05,
"loss": 0.1611,
"mean_token_accuracy": 0.9460979759693146,
"step": 4060
},
{
"epoch": 1.5641716374831633,
"grad_norm": 0.10308921430670417,
"learning_rate": 3.934888154789593e-05,
"loss": 0.1644,
"mean_token_accuracy": 0.9448984205722809,
"step": 4065
},
{
"epoch": 1.5660958245141428,
"grad_norm": 0.0990674715690409,
"learning_rate": 3.931673791470734e-05,
"loss": 0.1627,
"mean_token_accuracy": 0.9455141544342041,
"step": 4070
},
{
"epoch": 1.5680200115451222,
"grad_norm": 0.099198857552859,
"learning_rate": 3.928456094438489e-05,
"loss": 0.1635,
"mean_token_accuracy": 0.9452341854572296,
"step": 4075
},
{
"epoch": 1.5699441985761016,
"grad_norm": 0.10447787722426305,
"learning_rate": 3.9252350727705555e-05,
"loss": 0.1629,
"mean_token_accuracy": 0.9459849536418915,
"step": 4080
},
{
"epoch": 1.571868385607081,
"grad_norm": 0.10013162065383374,
"learning_rate": 3.922010735554014e-05,
"loss": 0.1601,
"mean_token_accuracy": 0.9463956356048584,
"step": 4085
},
{
"epoch": 1.5737925726380604,
"grad_norm": 0.10170511979504664,
"learning_rate": 3.918783091885297e-05,
"loss": 0.1656,
"mean_token_accuracy": 0.944677346944809,
"step": 4090
},
{
"epoch": 1.5757167596690398,
"grad_norm": 0.10131083936715862,
"learning_rate": 3.915552150870166e-05,
"loss": 0.1618,
"mean_token_accuracy": 0.9461105525493622,
"step": 4095
},
{
"epoch": 1.5776409467000192,
"grad_norm": 0.1083208948877487,
"learning_rate": 3.9123179216236826e-05,
"loss": 0.1581,
"mean_token_accuracy": 0.946831899881363,
"step": 4100
},
{
"epoch": 1.5795651337309986,
"grad_norm": 0.10235631166571117,
"learning_rate": 3.9090804132701887e-05,
"loss": 0.1599,
"mean_token_accuracy": 0.9463982284069061,
"step": 4105
},
{
"epoch": 1.581489320761978,
"grad_norm": 0.09790623152102618,
"learning_rate": 3.905839634943273e-05,
"loss": 0.16,
"mean_token_accuracy": 0.9465533256530761,
"step": 4110
},
{
"epoch": 1.5834135077929576,
"grad_norm": 0.1028509160004831,
"learning_rate": 3.9025955957857524e-05,
"loss": 0.1617,
"mean_token_accuracy": 0.94595587849617,
"step": 4115
},
{
"epoch": 1.585337694823937,
"grad_norm": 0.10797185942056255,
"learning_rate": 3.899348304949642e-05,
"loss": 0.1636,
"mean_token_accuracy": 0.9453164756298065,
"step": 4120
},
{
"epoch": 1.5872618818549165,
"grad_norm": 0.09812477671946834,
"learning_rate": 3.896097771596133e-05,
"loss": 0.1626,
"mean_token_accuracy": 0.9459312915802002,
"step": 4125
},
{
"epoch": 1.5891860688858959,
"grad_norm": 0.10397428273694205,
"learning_rate": 3.892844004895559e-05,
"loss": 0.1621,
"mean_token_accuracy": 0.9457064151763916,
"step": 4130
},
{
"epoch": 1.5911102559168753,
"grad_norm": 0.10265428157169595,
"learning_rate": 3.889587014027381e-05,
"loss": 0.1625,
"mean_token_accuracy": 0.9455591917037964,
"step": 4135
},
{
"epoch": 1.5930344429478547,
"grad_norm": 0.09954216473225751,
"learning_rate": 3.886326808180152e-05,
"loss": 0.1599,
"mean_token_accuracy": 0.9466226279735566,
"step": 4140
},
{
"epoch": 1.594958629978834,
"grad_norm": 0.0948999453129512,
"learning_rate": 3.8830633965514965e-05,
"loss": 0.1597,
"mean_token_accuracy": 0.9466159641742706,
"step": 4145
},
{
"epoch": 1.5968828170098135,
"grad_norm": 0.10150749898077574,
"learning_rate": 3.879796788348084e-05,
"loss": 0.1668,
"mean_token_accuracy": 0.9443914890289307,
"step": 4150
},
{
"epoch": 1.598807004040793,
"grad_norm": 0.10002527883833515,
"learning_rate": 3.876526992785602e-05,
"loss": 0.1615,
"mean_token_accuracy": 0.9461474359035492,
"step": 4155
},
{
"epoch": 1.6007311910717723,
"grad_norm": 0.1049235476768015,
"learning_rate": 3.873254019088727e-05,
"loss": 0.1648,
"mean_token_accuracy": 0.945132714509964,
"step": 4160
},
{
"epoch": 1.6026553781027517,
"grad_norm": 0.09716463237940702,
"learning_rate": 3.869977876491105e-05,
"loss": 0.1621,
"mean_token_accuracy": 0.9458979725837707,
"step": 4165
},
{
"epoch": 1.6045795651337311,
"grad_norm": 0.09527653419455659,
"learning_rate": 3.8666985742353214e-05,
"loss": 0.1621,
"mean_token_accuracy": 0.9457536578178406,
"step": 4170
},
{
"epoch": 1.6065037521647105,
"grad_norm": 0.09974264857216013,
"learning_rate": 3.863416121572875e-05,
"loss": 0.1619,
"mean_token_accuracy": 0.945774495601654,
"step": 4175
},
{
"epoch": 1.60842793919569,
"grad_norm": 0.1016351993204537,
"learning_rate": 3.860130527764153e-05,
"loss": 0.1608,
"mean_token_accuracy": 0.9462750256061554,
"step": 4180
},
{
"epoch": 1.6103521262266693,
"grad_norm": 0.09782946120729334,
"learning_rate": 3.856841802078403e-05,
"loss": 0.1652,
"mean_token_accuracy": 0.9449570298194885,
"step": 4185
},
{
"epoch": 1.6122763132576488,
"grad_norm": 0.09900853294736975,
"learning_rate": 3.85354995379371e-05,
"loss": 0.1597,
"mean_token_accuracy": 0.9465481758117675,
"step": 4190
},
{
"epoch": 1.6142005002886282,
"grad_norm": 0.09657776840856795,
"learning_rate": 3.850254992196967e-05,
"loss": 0.1625,
"mean_token_accuracy": 0.9456781804561615,
"step": 4195
},
{
"epoch": 1.6161246873196076,
"grad_norm": 0.10028799032204168,
"learning_rate": 3.84695692658385e-05,
"loss": 0.1596,
"mean_token_accuracy": 0.9467596590518952,
"step": 4200
},
{
"epoch": 1.618048874350587,
"grad_norm": 0.09996049503329667,
"learning_rate": 3.8436557662587945e-05,
"loss": 0.1652,
"mean_token_accuracy": 0.944662743806839,
"step": 4205
},
{
"epoch": 1.6199730613815664,
"grad_norm": 0.09724019229821691,
"learning_rate": 3.840351520534964e-05,
"loss": 0.1615,
"mean_token_accuracy": 0.9463082849979401,
"step": 4210
},
{
"epoch": 1.6218972484125458,
"grad_norm": 0.09820808813755214,
"learning_rate": 3.8370441987342274e-05,
"loss": 0.1612,
"mean_token_accuracy": 0.9463602125644683,
"step": 4215
},
{
"epoch": 1.6238214354435252,
"grad_norm": 0.09953918320915445,
"learning_rate": 3.833733810187131e-05,
"loss": 0.165,
"mean_token_accuracy": 0.9448364675045013,
"step": 4220
},
{
"epoch": 1.6257456224745046,
"grad_norm": 0.10152002216245509,
"learning_rate": 3.830420364232876e-05,
"loss": 0.1619,
"mean_token_accuracy": 0.9457645535469055,
"step": 4225
},
{
"epoch": 1.627669809505484,
"grad_norm": 0.10335735116416461,
"learning_rate": 3.827103870219285e-05,
"loss": 0.1629,
"mean_token_accuracy": 0.9453236222267151,
"step": 4230
},
{
"epoch": 1.6295939965364634,
"grad_norm": 0.10024442307017409,
"learning_rate": 3.823784337502782e-05,
"loss": 0.1629,
"mean_token_accuracy": 0.9455093562602996,
"step": 4235
},
{
"epoch": 1.6315181835674428,
"grad_norm": 0.10809315246772307,
"learning_rate": 3.820461775448364e-05,
"loss": 0.1644,
"mean_token_accuracy": 0.9452807545661926,
"step": 4240
},
{
"epoch": 1.6334423705984222,
"grad_norm": 0.10929760682984689,
"learning_rate": 3.817136193429571e-05,
"loss": 0.1643,
"mean_token_accuracy": 0.9450959205627442,
"step": 4245
},
{
"epoch": 1.6353665576294016,
"grad_norm": 0.09951760251430594,
"learning_rate": 3.813807600828468e-05,
"loss": 0.1602,
"mean_token_accuracy": 0.9465416312217713,
"step": 4250
},
{
"epoch": 1.637290744660381,
"grad_norm": 0.09218652520491599,
"learning_rate": 3.810476007035611e-05,
"loss": 0.1585,
"mean_token_accuracy": 0.9469627916812897,
"step": 4255
},
{
"epoch": 1.6392149316913605,
"grad_norm": 0.10040311322983182,
"learning_rate": 3.807141421450021e-05,
"loss": 0.1594,
"mean_token_accuracy": 0.9466736435890197,
"step": 4260
},
{
"epoch": 1.6411391187223399,
"grad_norm": 0.10402262287219591,
"learning_rate": 3.803803853479163e-05,
"loss": 0.1634,
"mean_token_accuracy": 0.945269650220871,
"step": 4265
},
{
"epoch": 1.6430633057533193,
"grad_norm": 0.09907131803218525,
"learning_rate": 3.8004633125389115e-05,
"loss": 0.1583,
"mean_token_accuracy": 0.9469216406345368,
"step": 4270
},
{
"epoch": 1.6449874927842987,
"grad_norm": 0.10596990721340989,
"learning_rate": 3.797119808053533e-05,
"loss": 0.1619,
"mean_token_accuracy": 0.9460356116294861,
"step": 4275
},
{
"epoch": 1.646911679815278,
"grad_norm": 0.09841311054889437,
"learning_rate": 3.793773349455652e-05,
"loss": 0.1614,
"mean_token_accuracy": 0.9460277736186982,
"step": 4280
},
{
"epoch": 1.6488358668462575,
"grad_norm": 0.09806735698422851,
"learning_rate": 3.790423946186226e-05,
"loss": 0.1629,
"mean_token_accuracy": 0.9456752181053162,
"step": 4285
},
{
"epoch": 1.650760053877237,
"grad_norm": 0.10126137877354573,
"learning_rate": 3.787071607694523e-05,
"loss": 0.1637,
"mean_token_accuracy": 0.9453158974647522,
"step": 4290
},
{
"epoch": 1.6526842409082163,
"grad_norm": 0.09767249513375194,
"learning_rate": 3.783716343438091e-05,
"loss": 0.1601,
"mean_token_accuracy": 0.9464418232440949,
"step": 4295
},
{
"epoch": 1.6546084279391957,
"grad_norm": 0.1002960076707505,
"learning_rate": 3.7803581628827285e-05,
"loss": 0.1592,
"mean_token_accuracy": 0.9464677512645722,
"step": 4300
},
{
"epoch": 1.6565326149701751,
"grad_norm": 0.1083689472471439,
"learning_rate": 3.776997075502466e-05,
"loss": 0.1597,
"mean_token_accuracy": 0.9464883804321289,
"step": 4305
},
{
"epoch": 1.6584568020011545,
"grad_norm": 0.09991261064628595,
"learning_rate": 3.773633090779534e-05,
"loss": 0.1598,
"mean_token_accuracy": 0.9466773569583893,
"step": 4310
},
{
"epoch": 1.660380989032134,
"grad_norm": 0.09813598977680858,
"learning_rate": 3.770266218204334e-05,
"loss": 0.1635,
"mean_token_accuracy": 0.9454030215740203,
"step": 4315
},
{
"epoch": 1.6623051760631133,
"grad_norm": 0.09964594545739816,
"learning_rate": 3.766896467275417e-05,
"loss": 0.1632,
"mean_token_accuracy": 0.9455722093582153,
"step": 4320
},
{
"epoch": 1.6642293630940928,
"grad_norm": 0.10360473288926222,
"learning_rate": 3.763523847499454e-05,
"loss": 0.1643,
"mean_token_accuracy": 0.945340096950531,
"step": 4325
},
{
"epoch": 1.6661535501250722,
"grad_norm": 0.09450293752635669,
"learning_rate": 3.76014836839121e-05,
"loss": 0.1616,
"mean_token_accuracy": 0.9461982131004334,
"step": 4330
},
{
"epoch": 1.6680777371560516,
"grad_norm": 0.10472592004396242,
"learning_rate": 3.7567700394735144e-05,
"loss": 0.1633,
"mean_token_accuracy": 0.9452694237232209,
"step": 4335
},
{
"epoch": 1.670001924187031,
"grad_norm": 0.10278164728725776,
"learning_rate": 3.75338887027724e-05,
"loss": 0.1632,
"mean_token_accuracy": 0.9455021142959594,
"step": 4340
},
{
"epoch": 1.6719261112180104,
"grad_norm": 0.09960828921624268,
"learning_rate": 3.750004870341269e-05,
"loss": 0.1621,
"mean_token_accuracy": 0.9457294404506683,
"step": 4345
},
{
"epoch": 1.6738502982489898,
"grad_norm": 0.09986603053296246,
"learning_rate": 3.746618049212473e-05,
"loss": 0.1588,
"mean_token_accuracy": 0.9466879844665528,
"step": 4350
},
{
"epoch": 1.6757744852799692,
"grad_norm": 0.10259521262476153,
"learning_rate": 3.7432284164456793e-05,
"loss": 0.1619,
"mean_token_accuracy": 0.945849597454071,
"step": 4355
},
{
"epoch": 1.6776986723109486,
"grad_norm": 0.10462616055841983,
"learning_rate": 3.73983598160365e-05,
"loss": 0.1604,
"mean_token_accuracy": 0.946190345287323,
"step": 4360
},
{
"epoch": 1.679622859341928,
"grad_norm": 0.11107610378497039,
"learning_rate": 3.736440754257051e-05,
"loss": 0.158,
"mean_token_accuracy": 0.9471249282360077,
"step": 4365
},
{
"epoch": 1.6815470463729074,
"grad_norm": 0.09987899542924915,
"learning_rate": 3.733042743984425e-05,
"loss": 0.1621,
"mean_token_accuracy": 0.9459578812122345,
"step": 4370
},
{
"epoch": 1.6834712334038868,
"grad_norm": 0.10117763930369007,
"learning_rate": 3.7296419603721706e-05,
"loss": 0.1654,
"mean_token_accuracy": 0.9449024319648742,
"step": 4375
},
{
"epoch": 1.6853954204348662,
"grad_norm": 0.09923596483325249,
"learning_rate": 3.7262384130145054e-05,
"loss": 0.1651,
"mean_token_accuracy": 0.9448891043663025,
"step": 4380
},
{
"epoch": 1.6873196074658456,
"grad_norm": 0.10668055665280804,
"learning_rate": 3.722832111513447e-05,
"loss": 0.1629,
"mean_token_accuracy": 0.9455032289028168,
"step": 4385
},
{
"epoch": 1.689243794496825,
"grad_norm": 0.10363929813286556,
"learning_rate": 3.719423065478782e-05,
"loss": 0.1646,
"mean_token_accuracy": 0.9447316706180573,
"step": 4390
},
{
"epoch": 1.6911679815278045,
"grad_norm": 0.10047421943729479,
"learning_rate": 3.7160112845280385e-05,
"loss": 0.1631,
"mean_token_accuracy": 0.9456027746200562,
"step": 4395
},
{
"epoch": 1.6930921685587839,
"grad_norm": 0.09742967063763901,
"learning_rate": 3.7125967782864624e-05,
"loss": 0.1618,
"mean_token_accuracy": 0.9459131598472595,
"step": 4400
},
{
"epoch": 1.6950163555897633,
"grad_norm": 0.10819151467635159,
"learning_rate": 3.7091795563869876e-05,
"loss": 0.1642,
"mean_token_accuracy": 0.945240992307663,
"step": 4405
},
{
"epoch": 1.6969405426207427,
"grad_norm": 0.10434588415019977,
"learning_rate": 3.705759628470208e-05,
"loss": 0.1641,
"mean_token_accuracy": 0.9450987458229065,
"step": 4410
},
{
"epoch": 1.698864729651722,
"grad_norm": 0.10145849516304649,
"learning_rate": 3.702337004184354e-05,
"loss": 0.1606,
"mean_token_accuracy": 0.9462997257709503,
"step": 4415
},
{
"epoch": 1.7007889166827015,
"grad_norm": 0.10005157393993798,
"learning_rate": 3.6989116931852616e-05,
"loss": 0.1609,
"mean_token_accuracy": 0.946276193857193,
"step": 4420
},
{
"epoch": 1.702713103713681,
"grad_norm": 0.11138468161786759,
"learning_rate": 3.695483705136345e-05,
"loss": 0.1651,
"mean_token_accuracy": 0.9449223577976227,
"step": 4425
},
{
"epoch": 1.7046372907446603,
"grad_norm": 0.1004360772894267,
"learning_rate": 3.692053049708574e-05,
"loss": 0.1602,
"mean_token_accuracy": 0.9464481592178344,
"step": 4430
},
{
"epoch": 1.7065614777756397,
"grad_norm": 0.09901623918063937,
"learning_rate": 3.688619736580441e-05,
"loss": 0.1611,
"mean_token_accuracy": 0.9458293974399566,
"step": 4435
},
{
"epoch": 1.7084856648066191,
"grad_norm": 0.09797231983138321,
"learning_rate": 3.685183775437938e-05,
"loss": 0.1627,
"mean_token_accuracy": 0.9456776857376099,
"step": 4440
},
{
"epoch": 1.7104098518375985,
"grad_norm": 0.1004592463191462,
"learning_rate": 3.681745175974525e-05,
"loss": 0.1599,
"mean_token_accuracy": 0.9465253472328186,
"step": 4445
},
{
"epoch": 1.712334038868578,
"grad_norm": 0.09635015865058442,
"learning_rate": 3.6783039478911104e-05,
"loss": 0.1605,
"mean_token_accuracy": 0.9462304711341858,
"step": 4450
},
{
"epoch": 1.7142582258995573,
"grad_norm": 0.10150560455863913,
"learning_rate": 3.674860100896011e-05,
"loss": 0.1609,
"mean_token_accuracy": 0.9462295591831207,
"step": 4455
},
{
"epoch": 1.7161824129305367,
"grad_norm": 0.09964573607398473,
"learning_rate": 3.671413644704938e-05,
"loss": 0.1603,
"mean_token_accuracy": 0.9462384462356568,
"step": 4460
},
{
"epoch": 1.7181065999615162,
"grad_norm": 0.0989426450473668,
"learning_rate": 3.667964589040961e-05,
"loss": 0.1602,
"mean_token_accuracy": 0.9463894844055176,
"step": 4465
},
{
"epoch": 1.7200307869924956,
"grad_norm": 0.0947431826482723,
"learning_rate": 3.664512943634485e-05,
"loss": 0.1589,
"mean_token_accuracy": 0.9468457162380218,
"step": 4470
},
{
"epoch": 1.721954974023475,
"grad_norm": 0.10292528415128935,
"learning_rate": 3.661058718223216e-05,
"loss": 0.1585,
"mean_token_accuracy": 0.946750795841217,
"step": 4475
},
{
"epoch": 1.7238791610544544,
"grad_norm": 0.09812618533590778,
"learning_rate": 3.6576019225521474e-05,
"loss": 0.1608,
"mean_token_accuracy": 0.9462311148643494,
"step": 4480
},
{
"epoch": 1.7258033480854338,
"grad_norm": 0.09992020088492978,
"learning_rate": 3.654142566373516e-05,
"loss": 0.1614,
"mean_token_accuracy": 0.9459415197372436,
"step": 4485
},
{
"epoch": 1.7277275351164132,
"grad_norm": 0.10029244391981966,
"learning_rate": 3.6506806594467845e-05,
"loss": 0.1607,
"mean_token_accuracy": 0.9462564051151275,
"step": 4490
},
{
"epoch": 1.7296517221473926,
"grad_norm": 0.10382887562209585,
"learning_rate": 3.647216211538615e-05,
"loss": 0.1607,
"mean_token_accuracy": 0.9461699724197388,
"step": 4495
},
{
"epoch": 1.731575909178372,
"grad_norm": 0.10869185316485384,
"learning_rate": 3.643749232422833e-05,
"loss": 0.162,
"mean_token_accuracy": 0.9458631217479706,
"step": 4500
},
{
"epoch": 1.7335000962093514,
"grad_norm": 0.10790114677302129,
"learning_rate": 3.64027973188041e-05,
"loss": 0.1592,
"mean_token_accuracy": 0.9466256439685822,
"step": 4505
},
{
"epoch": 1.7354242832403308,
"grad_norm": 0.10633086609839314,
"learning_rate": 3.6368077196994255e-05,
"loss": 0.1622,
"mean_token_accuracy": 0.9459817409515381,
"step": 4510
},
{
"epoch": 1.7373484702713102,
"grad_norm": 0.10885187496202672,
"learning_rate": 3.633333205675049e-05,
"loss": 0.1629,
"mean_token_accuracy": 0.9459581017494202,
"step": 4515
},
{
"epoch": 1.7392726573022896,
"grad_norm": 0.10176958875864693,
"learning_rate": 3.629856199609507e-05,
"loss": 0.1611,
"mean_token_accuracy": 0.9459785044193267,
"step": 4520
},
{
"epoch": 1.741196844333269,
"grad_norm": 0.09916618375083491,
"learning_rate": 3.626376711312056e-05,
"loss": 0.1592,
"mean_token_accuracy": 0.9466202616691589,
"step": 4525
},
{
"epoch": 1.7431210313642485,
"grad_norm": 0.10060420158449908,
"learning_rate": 3.622894750598956e-05,
"loss": 0.1605,
"mean_token_accuracy": 0.9465162098407746,
"step": 4530
},
{
"epoch": 1.7450452183952279,
"grad_norm": 0.09887169050050851,
"learning_rate": 3.6194103272934407e-05,
"loss": 0.1623,
"mean_token_accuracy": 0.9455604314804077,
"step": 4535
},
{
"epoch": 1.7469694054262073,
"grad_norm": 0.10871757318822675,
"learning_rate": 3.615923451225694e-05,
"loss": 0.1612,
"mean_token_accuracy": 0.9459405720233918,
"step": 4540
},
{
"epoch": 1.7488935924571867,
"grad_norm": 0.09759522454802382,
"learning_rate": 3.6124341322328164e-05,
"loss": 0.1601,
"mean_token_accuracy": 0.9464699804782868,
"step": 4545
},
{
"epoch": 1.7508177794881663,
"grad_norm": 0.10405359692751093,
"learning_rate": 3.608942380158802e-05,
"loss": 0.1597,
"mean_token_accuracy": 0.9467595756053925,
"step": 4550
},
{
"epoch": 1.7527419665191457,
"grad_norm": 0.10730189765746177,
"learning_rate": 3.605448204854508e-05,
"loss": 0.1634,
"mean_token_accuracy": 0.9453517079353333,
"step": 4555
},
{
"epoch": 1.7546661535501251,
"grad_norm": 0.10031455080369805,
"learning_rate": 3.60195161617763e-05,
"loss": 0.1624,
"mean_token_accuracy": 0.9458699703216553,
"step": 4560
},
{
"epoch": 1.7565903405811045,
"grad_norm": 0.10551108903780186,
"learning_rate": 3.59845262399267e-05,
"loss": 0.1609,
"mean_token_accuracy": 0.9461313605308532,
"step": 4565
},
{
"epoch": 1.758514527612084,
"grad_norm": 0.10398560325781105,
"learning_rate": 3.594951238170912e-05,
"loss": 0.1619,
"mean_token_accuracy": 0.9458823204040527,
"step": 4570
},
{
"epoch": 1.7604387146430633,
"grad_norm": 0.1007130950595616,
"learning_rate": 3.591447468590392e-05,
"loss": 0.1629,
"mean_token_accuracy": 0.9455589532852173,
"step": 4575
},
{
"epoch": 1.7623629016740427,
"grad_norm": 0.0955238435134818,
"learning_rate": 3.5879413251358724e-05,
"loss": 0.1583,
"mean_token_accuracy": 0.9468055486679077,
"step": 4580
},
{
"epoch": 1.7642870887050222,
"grad_norm": 0.094174218073043,
"learning_rate": 3.5844328176988105e-05,
"loss": 0.1584,
"mean_token_accuracy": 0.9468781888484955,
"step": 4585
},
{
"epoch": 1.7662112757360016,
"grad_norm": 0.0997841174583319,
"learning_rate": 3.5809219561773346e-05,
"loss": 0.1611,
"mean_token_accuracy": 0.9462185323238372,
"step": 4590
},
{
"epoch": 1.768135462766981,
"grad_norm": 0.09884549747941763,
"learning_rate": 3.5774087504762144e-05,
"loss": 0.1586,
"mean_token_accuracy": 0.9469363689422607,
"step": 4595
},
{
"epoch": 1.7700596497979604,
"grad_norm": 0.09899189814693576,
"learning_rate": 3.573893210506832e-05,
"loss": 0.1586,
"mean_token_accuracy": 0.9469284415245056,
"step": 4600
},
{
"epoch": 1.7719838368289398,
"grad_norm": 0.0993621353004576,
"learning_rate": 3.570375346187155e-05,
"loss": 0.1605,
"mean_token_accuracy": 0.9463739633560181,
"step": 4605
},
{
"epoch": 1.7739080238599192,
"grad_norm": 0.10547214561067843,
"learning_rate": 3.5668551674417084e-05,
"loss": 0.1618,
"mean_token_accuracy": 0.945833295583725,
"step": 4610
},
{
"epoch": 1.7758322108908986,
"grad_norm": 0.09370909630735182,
"learning_rate": 3.563332684201548e-05,
"loss": 0.1619,
"mean_token_accuracy": 0.9458651065826416,
"step": 4615
},
{
"epoch": 1.777756397921878,
"grad_norm": 0.09452531437014097,
"learning_rate": 3.559807906404228e-05,
"loss": 0.1606,
"mean_token_accuracy": 0.9462214589118958,
"step": 4620
},
{
"epoch": 1.7796805849528574,
"grad_norm": 0.09829866705995945,
"learning_rate": 3.556280843993779e-05,
"loss": 0.1586,
"mean_token_accuracy": 0.9468035578727723,
"step": 4625
},
{
"epoch": 1.7816047719838368,
"grad_norm": 0.09935570637840965,
"learning_rate": 3.552751506920676e-05,
"loss": 0.1601,
"mean_token_accuracy": 0.9466434597969056,
"step": 4630
},
{
"epoch": 1.7835289590148162,
"grad_norm": 0.09686737707137055,
"learning_rate": 3.54921990514181e-05,
"loss": 0.1584,
"mean_token_accuracy": 0.946895694732666,
"step": 4635
},
{
"epoch": 1.7854531460457956,
"grad_norm": 0.1036327803378326,
"learning_rate": 3.5456860486204637e-05,
"loss": 0.1605,
"mean_token_accuracy": 0.9463905453681946,
"step": 4640
},
{
"epoch": 1.787377333076775,
"grad_norm": 0.11585854547485858,
"learning_rate": 3.5421499473262776e-05,
"loss": 0.161,
"mean_token_accuracy": 0.9461309313774109,
"step": 4645
},
{
"epoch": 1.7893015201077545,
"grad_norm": 0.09913408037481335,
"learning_rate": 3.538611611235226e-05,
"loss": 0.1605,
"mean_token_accuracy": 0.9463733434677124,
"step": 4650
},
{
"epoch": 1.7912257071387339,
"grad_norm": 0.09816208195520347,
"learning_rate": 3.535071050329591e-05,
"loss": 0.16,
"mean_token_accuracy": 0.9465721905231476,
"step": 4655
},
{
"epoch": 1.7931498941697133,
"grad_norm": 0.0954450905445559,
"learning_rate": 3.5315282745979275e-05,
"loss": 0.1611,
"mean_token_accuracy": 0.9462323606014251,
"step": 4660
},
{
"epoch": 1.7950740812006927,
"grad_norm": 0.09685202748056504,
"learning_rate": 3.527983294035041e-05,
"loss": 0.1604,
"mean_token_accuracy": 0.9465530514717102,
"step": 4665
},
{
"epoch": 1.796998268231672,
"grad_norm": 0.09595396274966222,
"learning_rate": 3.524436118641956e-05,
"loss": 0.1613,
"mean_token_accuracy": 0.9458921074867248,
"step": 4670
},
{
"epoch": 1.7989224552626515,
"grad_norm": 0.1004200825903867,
"learning_rate": 3.52088675842589e-05,
"loss": 0.1591,
"mean_token_accuracy": 0.946587735414505,
"step": 4675
},
{
"epoch": 1.800846642293631,
"grad_norm": 0.09727212472084619,
"learning_rate": 3.517335223400223e-05,
"loss": 0.1633,
"mean_token_accuracy": 0.9454812586307526,
"step": 4680
},
{
"epoch": 1.8027708293246103,
"grad_norm": 0.10781734060683949,
"learning_rate": 3.513781523584473e-05,
"loss": 0.1608,
"mean_token_accuracy": 0.946499890089035,
"step": 4685
},
{
"epoch": 1.8046950163555897,
"grad_norm": 0.09765609473673183,
"learning_rate": 3.510225669004262e-05,
"loss": 0.1602,
"mean_token_accuracy": 0.9462354481220245,
"step": 4690
},
{
"epoch": 1.8066192033865693,
"grad_norm": 0.09593958412025588,
"learning_rate": 3.506667669691292e-05,
"loss": 0.1625,
"mean_token_accuracy": 0.9458974361419678,
"step": 4695
},
{
"epoch": 1.8085433904175487,
"grad_norm": 0.09690005881713605,
"learning_rate": 3.5031075356833184e-05,
"loss": 0.1615,
"mean_token_accuracy": 0.9459888219833374,
"step": 4700
},
{
"epoch": 1.8104675774485282,
"grad_norm": 0.10049690019327372,
"learning_rate": 3.4995452770241146e-05,
"loss": 0.1587,
"mean_token_accuracy": 0.946663624048233,
"step": 4705
},
{
"epoch": 1.8123917644795076,
"grad_norm": 0.10080261659131069,
"learning_rate": 3.495980903763453e-05,
"loss": 0.1616,
"mean_token_accuracy": 0.9462504684925079,
"step": 4710
},
{
"epoch": 1.814315951510487,
"grad_norm": 0.097535424302901,
"learning_rate": 3.4924144259570665e-05,
"loss": 0.1589,
"mean_token_accuracy": 0.946724146604538,
"step": 4715
},
{
"epoch": 1.8162401385414664,
"grad_norm": 0.10097992438397164,
"learning_rate": 3.488845853666628e-05,
"loss": 0.1606,
"mean_token_accuracy": 0.9463124930858612,
"step": 4720
},
{
"epoch": 1.8181643255724458,
"grad_norm": 0.09896929588763823,
"learning_rate": 3.485275196959719e-05,
"loss": 0.159,
"mean_token_accuracy": 0.9467292606830597,
"step": 4725
},
{
"epoch": 1.8200885126034252,
"grad_norm": 0.10281217785566646,
"learning_rate": 3.481702465909803e-05,
"loss": 0.1588,
"mean_token_accuracy": 0.94689239859581,
"step": 4730
},
{
"epoch": 1.8220126996344046,
"grad_norm": 0.09840610317271821,
"learning_rate": 3.478127670596193e-05,
"loss": 0.1615,
"mean_token_accuracy": 0.9461028575897217,
"step": 4735
},
{
"epoch": 1.823936886665384,
"grad_norm": 0.09885915505162275,
"learning_rate": 3.474550821104026e-05,
"loss": 0.1615,
"mean_token_accuracy": 0.9459745645523071,
"step": 4740
},
{
"epoch": 1.8258610736963634,
"grad_norm": 0.10031987785936006,
"learning_rate": 3.470971927524236e-05,
"loss": 0.1604,
"mean_token_accuracy": 0.9460942864418029,
"step": 4745
},
{
"epoch": 1.8277852607273428,
"grad_norm": 0.10261317824465392,
"learning_rate": 3.467390999953524e-05,
"loss": 0.1625,
"mean_token_accuracy": 0.9455275893211365,
"step": 4750
},
{
"epoch": 1.8297094477583222,
"grad_norm": 0.09844431473152106,
"learning_rate": 3.463808048494325e-05,
"loss": 0.1616,
"mean_token_accuracy": 0.9458800673484802,
"step": 4755
},
{
"epoch": 1.8316336347893016,
"grad_norm": 0.096300292573579,
"learning_rate": 3.4602230832547885e-05,
"loss": 0.1605,
"mean_token_accuracy": 0.9462751030921936,
"step": 4760
},
{
"epoch": 1.833557821820281,
"grad_norm": 0.09643759538772625,
"learning_rate": 3.456636114348744e-05,
"loss": 0.1593,
"mean_token_accuracy": 0.9467679262161255,
"step": 4765
},
{
"epoch": 1.8354820088512604,
"grad_norm": 0.09947537690648842,
"learning_rate": 3.4530471518956715e-05,
"loss": 0.16,
"mean_token_accuracy": 0.9465085625648498,
"step": 4770
},
{
"epoch": 1.8374061958822399,
"grad_norm": 0.0961494745039196,
"learning_rate": 3.449456206020677e-05,
"loss": 0.1616,
"mean_token_accuracy": 0.9457358002662659,
"step": 4775
},
{
"epoch": 1.8393303829132193,
"grad_norm": 0.09980075650621577,
"learning_rate": 3.4458632868544647e-05,
"loss": 0.1607,
"mean_token_accuracy": 0.9465402901172638,
"step": 4780
},
{
"epoch": 1.8412545699441987,
"grad_norm": 0.10689945107107311,
"learning_rate": 3.4422684045332994e-05,
"loss": 0.1609,
"mean_token_accuracy": 0.9462505578994751,
"step": 4785
},
{
"epoch": 1.843178756975178,
"grad_norm": 0.09496817919075264,
"learning_rate": 3.43867156919899e-05,
"loss": 0.1601,
"mean_token_accuracy": 0.9463814616203308,
"step": 4790
},
{
"epoch": 1.8451029440061575,
"grad_norm": 0.1018698324932633,
"learning_rate": 3.435072790998852e-05,
"loss": 0.1633,
"mean_token_accuracy": 0.9452820897102356,
"step": 4795
},
{
"epoch": 1.847027131037137,
"grad_norm": 0.10232179591168951,
"learning_rate": 3.431472080085684e-05,
"loss": 0.1632,
"mean_token_accuracy": 0.9454320549964905,
"step": 4800
},
{
"epoch": 1.8489513180681163,
"grad_norm": 0.10072763548455216,
"learning_rate": 3.427869446617736e-05,
"loss": 0.1587,
"mean_token_accuracy": 0.9469704747200012,
"step": 4805
},
{
"epoch": 1.8508755050990957,
"grad_norm": 0.09896154806898143,
"learning_rate": 3.424264900758682e-05,
"loss": 0.1621,
"mean_token_accuracy": 0.9457802414894104,
"step": 4810
},
{
"epoch": 1.8527996921300751,
"grad_norm": 0.09851390977832954,
"learning_rate": 3.420658452677592e-05,
"loss": 0.1616,
"mean_token_accuracy": 0.946274584531784,
"step": 4815
},
{
"epoch": 1.8547238791610545,
"grad_norm": 0.10141452729511008,
"learning_rate": 3.4170501125489005e-05,
"loss": 0.1625,
"mean_token_accuracy": 0.94554203748703,
"step": 4820
},
{
"epoch": 1.856648066192034,
"grad_norm": 0.09465743780992764,
"learning_rate": 3.413439890552384e-05,
"loss": 0.159,
"mean_token_accuracy": 0.9467312037944794,
"step": 4825
},
{
"epoch": 1.8585722532230133,
"grad_norm": 0.09693986948890045,
"learning_rate": 3.409827796873122e-05,
"loss": 0.1602,
"mean_token_accuracy": 0.9463321626186371,
"step": 4830
},
{
"epoch": 1.8604964402539927,
"grad_norm": 0.09809270772137343,
"learning_rate": 3.40621384170148e-05,
"loss": 0.1611,
"mean_token_accuracy": 0.9462391793727875,
"step": 4835
},
{
"epoch": 1.8624206272849722,
"grad_norm": 0.09646790690485375,
"learning_rate": 3.402598035233072e-05,
"loss": 0.162,
"mean_token_accuracy": 0.9458190202713013,
"step": 4840
},
{
"epoch": 1.8643448143159516,
"grad_norm": 0.3675774609735443,
"learning_rate": 3.398980387668735e-05,
"loss": 0.1621,
"mean_token_accuracy": 0.9459438502788544,
"step": 4845
},
{
"epoch": 1.866269001346931,
"grad_norm": 0.10581255129356738,
"learning_rate": 3.395360909214502e-05,
"loss": 0.1591,
"mean_token_accuracy": 0.9468293070793152,
"step": 4850
},
{
"epoch": 1.8681931883779104,
"grad_norm": 0.09716761917946114,
"learning_rate": 3.391739610081568e-05,
"loss": 0.1611,
"mean_token_accuracy": 0.946347177028656,
"step": 4855
},
{
"epoch": 1.8701173754088898,
"grad_norm": 0.10643801155332201,
"learning_rate": 3.388116500486268e-05,
"loss": 0.1594,
"mean_token_accuracy": 0.9463170647621155,
"step": 4860
},
{
"epoch": 1.8720415624398692,
"grad_norm": 0.09878880152730199,
"learning_rate": 3.3844915906500426e-05,
"loss": 0.1588,
"mean_token_accuracy": 0.9468757033348083,
"step": 4865
},
{
"epoch": 1.8739657494708486,
"grad_norm": 0.10644131887426493,
"learning_rate": 3.380864890799411e-05,
"loss": 0.1605,
"mean_token_accuracy": 0.9463419139385223,
"step": 4870
},
{
"epoch": 1.875889936501828,
"grad_norm": 0.10976722626462512,
"learning_rate": 3.3772364111659444e-05,
"loss": 0.1581,
"mean_token_accuracy": 0.9467686593532563,
"step": 4875
},
{
"epoch": 1.8778141235328074,
"grad_norm": 0.09598048657221817,
"learning_rate": 3.373606161986231e-05,
"loss": 0.159,
"mean_token_accuracy": 0.9466426372528076,
"step": 4880
},
{
"epoch": 1.8797383105637868,
"grad_norm": 0.09662806316650543,
"learning_rate": 3.369974153501857e-05,
"loss": 0.1608,
"mean_token_accuracy": 0.9462577164173126,
"step": 4885
},
{
"epoch": 1.8816624975947662,
"grad_norm": 0.09303829159925717,
"learning_rate": 3.3663403959593673e-05,
"loss": 0.1593,
"mean_token_accuracy": 0.9464956879615783,
"step": 4890
},
{
"epoch": 1.8835866846257456,
"grad_norm": 0.10133191319579814,
"learning_rate": 3.362704899610242e-05,
"loss": 0.1578,
"mean_token_accuracy": 0.9470635533332825,
"step": 4895
},
{
"epoch": 1.885510871656725,
"grad_norm": 0.10158166791887556,
"learning_rate": 3.3590676747108685e-05,
"loss": 0.1602,
"mean_token_accuracy": 0.9463416457176208,
"step": 4900
},
{
"epoch": 1.8874350586877044,
"grad_norm": 0.0977662785830903,
"learning_rate": 3.355428731522509e-05,
"loss": 0.1625,
"mean_token_accuracy": 0.9458343744277954,
"step": 4905
},
{
"epoch": 1.8893592457186839,
"grad_norm": 0.09788446984575432,
"learning_rate": 3.351788080311275e-05,
"loss": 0.1607,
"mean_token_accuracy": 0.9464067101478577,
"step": 4910
},
{
"epoch": 1.8912834327496633,
"grad_norm": 0.10214906792155744,
"learning_rate": 3.3481457313480934e-05,
"loss": 0.1604,
"mean_token_accuracy": 0.9464616537094116,
"step": 4915
},
{
"epoch": 1.8932076197806427,
"grad_norm": 0.10060544611170799,
"learning_rate": 3.344501694908686e-05,
"loss": 0.1596,
"mean_token_accuracy": 0.9465152502059937,
"step": 4920
},
{
"epoch": 1.895131806811622,
"grad_norm": 0.2794861014508182,
"learning_rate": 3.340855981273528e-05,
"loss": 0.1621,
"mean_token_accuracy": 0.9458632647991181,
"step": 4925
},
{
"epoch": 1.8970559938426015,
"grad_norm": 0.10406982278890324,
"learning_rate": 3.3372086007278344e-05,
"loss": 0.1603,
"mean_token_accuracy": 0.9462267875671386,
"step": 4930
},
{
"epoch": 1.898980180873581,
"grad_norm": 0.09761129864564562,
"learning_rate": 3.333559563561517e-05,
"loss": 0.1594,
"mean_token_accuracy": 0.9467118740081787,
"step": 4935
},
{
"epoch": 1.9009043679045603,
"grad_norm": 0.10085599183569154,
"learning_rate": 3.329908880069163e-05,
"loss": 0.1616,
"mean_token_accuracy": 0.9460000395774841,
"step": 4940
},
{
"epoch": 1.9028285549355397,
"grad_norm": 0.09683653725025748,
"learning_rate": 3.326256560550006e-05,
"loss": 0.1589,
"mean_token_accuracy": 0.9466755270957947,
"step": 4945
},
{
"epoch": 1.9047527419665191,
"grad_norm": 0.09342326037959806,
"learning_rate": 3.322602615307891e-05,
"loss": 0.1593,
"mean_token_accuracy": 0.9468986928462982,
"step": 4950
},
{
"epoch": 1.9066769289974985,
"grad_norm": 0.10555195157867583,
"learning_rate": 3.318947054651254e-05,
"loss": 0.1581,
"mean_token_accuracy": 0.9468642890453338,
"step": 4955
},
{
"epoch": 1.908601116028478,
"grad_norm": 0.09511827856569342,
"learning_rate": 3.315289888893085e-05,
"loss": 0.1607,
"mean_token_accuracy": 0.9460481464862823,
"step": 4960
},
{
"epoch": 1.9105253030594573,
"grad_norm": 0.0961044811313041,
"learning_rate": 3.3116311283509046e-05,
"loss": 0.1615,
"mean_token_accuracy": 0.945965725183487,
"step": 4965
},
{
"epoch": 1.9124494900904367,
"grad_norm": 0.09474214908651583,
"learning_rate": 3.30797078334673e-05,
"loss": 0.1602,
"mean_token_accuracy": 0.9464991927146912,
"step": 4970
},
{
"epoch": 1.9143736771214162,
"grad_norm": 0.09640519175928383,
"learning_rate": 3.304308864207052e-05,
"loss": 0.158,
"mean_token_accuracy": 0.9469054996967315,
"step": 4975
},
{
"epoch": 1.9162978641523956,
"grad_norm": 0.09736860178647218,
"learning_rate": 3.300645381262798e-05,
"loss": 0.1595,
"mean_token_accuracy": 0.9467026233673096,
"step": 4980
},
{
"epoch": 1.918222051183375,
"grad_norm": 0.10031238266022684,
"learning_rate": 3.2969803448493116e-05,
"loss": 0.1627,
"mean_token_accuracy": 0.9458101212978363,
"step": 4985
},
{
"epoch": 1.9201462382143544,
"grad_norm": 0.09654538040286698,
"learning_rate": 3.2933137653063154e-05,
"loss": 0.1602,
"mean_token_accuracy": 0.9466418564319611,
"step": 4990
},
{
"epoch": 1.9220704252453338,
"grad_norm": 0.09793397852312749,
"learning_rate": 3.289645652977888e-05,
"loss": 0.1592,
"mean_token_accuracy": 0.9470449328422547,
"step": 4995
},
{
"epoch": 1.9239946122763132,
"grad_norm": 0.09742505251785624,
"learning_rate": 3.285976018212429e-05,
"loss": 0.1587,
"mean_token_accuracy": 0.9468475997447967,
"step": 5000
},
{
"epoch": 1.9259187993072926,
"grad_norm": 0.10083861652857352,
"learning_rate": 3.2823048713626395e-05,
"loss": 0.161,
"mean_token_accuracy": 0.945935195684433,
"step": 5005
},
{
"epoch": 1.927842986338272,
"grad_norm": 0.09563911338044254,
"learning_rate": 3.278632222785478e-05,
"loss": 0.1598,
"mean_token_accuracy": 0.9465391278266907,
"step": 5010
},
{
"epoch": 1.9297671733692514,
"grad_norm": 0.09947135052042433,
"learning_rate": 3.274958082842145e-05,
"loss": 0.1594,
"mean_token_accuracy": 0.9466078162193299,
"step": 5015
},
{
"epoch": 1.9316913604002308,
"grad_norm": 0.09939040105234527,
"learning_rate": 3.271282461898049e-05,
"loss": 0.158,
"mean_token_accuracy": 0.9472302615642547,
"step": 5020
},
{
"epoch": 1.9336155474312102,
"grad_norm": 0.10048369508357508,
"learning_rate": 3.267605370322773e-05,
"loss": 0.161,
"mean_token_accuracy": 0.9461906492710114,
"step": 5025
},
{
"epoch": 1.9355397344621896,
"grad_norm": 0.09790767485398483,
"learning_rate": 3.2639268184900506e-05,
"loss": 0.1574,
"mean_token_accuracy": 0.9472224354743958,
"step": 5030
},
{
"epoch": 1.937463921493169,
"grad_norm": 0.09849370603158782,
"learning_rate": 3.260246816777737e-05,
"loss": 0.1563,
"mean_token_accuracy": 0.9475186169147491,
"step": 5035
},
{
"epoch": 1.9393881085241484,
"grad_norm": 0.0980768828044851,
"learning_rate": 3.256565375567776e-05,
"loss": 0.1584,
"mean_token_accuracy": 0.9468068718910218,
"step": 5040
},
{
"epoch": 1.9413122955551279,
"grad_norm": 0.10480453762445419,
"learning_rate": 3.252882505246171e-05,
"loss": 0.1624,
"mean_token_accuracy": 0.945637184381485,
"step": 5045
},
{
"epoch": 1.9432364825861073,
"grad_norm": 0.10616535211196816,
"learning_rate": 3.24919821620296e-05,
"loss": 0.1605,
"mean_token_accuracy": 0.9462700366973877,
"step": 5050
},
{
"epoch": 1.9451606696170867,
"grad_norm": 0.09894956259060722,
"learning_rate": 3.2455125188321806e-05,
"loss": 0.1599,
"mean_token_accuracy": 0.9463880658149719,
"step": 5055
},
{
"epoch": 1.947084856648066,
"grad_norm": 0.09224677772865261,
"learning_rate": 3.2418254235318474e-05,
"loss": 0.1578,
"mean_token_accuracy": 0.947035801410675,
"step": 5060
},
{
"epoch": 1.9490090436790455,
"grad_norm": 0.09838850162687025,
"learning_rate": 3.238136940703915e-05,
"loss": 0.1578,
"mean_token_accuracy": 0.9472411751747132,
"step": 5065
},
{
"epoch": 1.950933230710025,
"grad_norm": 0.09555416511064461,
"learning_rate": 3.234447080754255e-05,
"loss": 0.1574,
"mean_token_accuracy": 0.9474334597587586,
"step": 5070
},
{
"epoch": 1.9528574177410043,
"grad_norm": 0.09801635064811252,
"learning_rate": 3.230755854092622e-05,
"loss": 0.1597,
"mean_token_accuracy": 0.9464401781558991,
"step": 5075
},
{
"epoch": 1.9547816047719837,
"grad_norm": 0.09373235655008649,
"learning_rate": 3.2270632711326285e-05,
"loss": 0.1619,
"mean_token_accuracy": 0.9457164108753204,
"step": 5080
},
{
"epoch": 1.9567057918029631,
"grad_norm": 0.09348989261398567,
"learning_rate": 3.223369342291711e-05,
"loss": 0.1572,
"mean_token_accuracy": 0.9472990691661834,
"step": 5085
},
{
"epoch": 1.9586299788339425,
"grad_norm": 0.10418420549716748,
"learning_rate": 3.2196740779911054e-05,
"loss": 0.1596,
"mean_token_accuracy": 0.9464613854885101,
"step": 5090
},
{
"epoch": 1.960554165864922,
"grad_norm": 0.09745844772641116,
"learning_rate": 3.215977488655814e-05,
"loss": 0.1581,
"mean_token_accuracy": 0.9471061944961547,
"step": 5095
},
{
"epoch": 1.9624783528959013,
"grad_norm": 0.0992049012870913,
"learning_rate": 3.212279584714577e-05,
"loss": 0.1618,
"mean_token_accuracy": 0.9458459138870239,
"step": 5100
},
{
"epoch": 1.9644025399268807,
"grad_norm": 0.105459848856873,
"learning_rate": 3.2085803765998435e-05,
"loss": 0.1593,
"mean_token_accuracy": 0.9468812644481659,
"step": 5105
},
{
"epoch": 1.9663267269578601,
"grad_norm": 0.09767862155081931,
"learning_rate": 3.204879874747743e-05,
"loss": 0.1618,
"mean_token_accuracy": 0.9461267232894898,
"step": 5110
},
{
"epoch": 1.9682509139888396,
"grad_norm": 0.09507934172125736,
"learning_rate": 3.201178089598053e-05,
"loss": 0.1617,
"mean_token_accuracy": 0.9458625495433808,
"step": 5115
},
{
"epoch": 1.970175101019819,
"grad_norm": 0.0974695929044688,
"learning_rate": 3.1974750315941725e-05,
"loss": 0.1552,
"mean_token_accuracy": 0.9478743553161622,
"step": 5120
},
{
"epoch": 1.9720992880507984,
"grad_norm": 0.10126002440399436,
"learning_rate": 3.193770711183092e-05,
"loss": 0.1598,
"mean_token_accuracy": 0.9465251445770264,
"step": 5125
},
{
"epoch": 1.974023475081778,
"grad_norm": 0.09827018057592911,
"learning_rate": 3.1900651388153604e-05,
"loss": 0.1602,
"mean_token_accuracy": 0.9464419603347778,
"step": 5130
},
{
"epoch": 1.9759476621127574,
"grad_norm": 0.09528586355201614,
"learning_rate": 3.1863583249450645e-05,
"loss": 0.161,
"mean_token_accuracy": 0.9459393203258515,
"step": 5135
},
{
"epoch": 1.9778718491437368,
"grad_norm": 0.09611110064857509,
"learning_rate": 3.182650280029788e-05,
"loss": 0.1613,
"mean_token_accuracy": 0.9462891340255737,
"step": 5140
},
{
"epoch": 1.9797960361747162,
"grad_norm": 0.09725057007008812,
"learning_rate": 3.17894101453059e-05,
"loss": 0.1607,
"mean_token_accuracy": 0.9462293803691864,
"step": 5145
},
{
"epoch": 1.9817202232056956,
"grad_norm": 0.09214072277376484,
"learning_rate": 3.175230538911972e-05,
"loss": 0.1599,
"mean_token_accuracy": 0.946392560005188,
"step": 5150
},
{
"epoch": 1.983644410236675,
"grad_norm": 0.09804047929466903,
"learning_rate": 3.171518863641852e-05,
"loss": 0.16,
"mean_token_accuracy": 0.9464940249919891,
"step": 5155
},
{
"epoch": 1.9855685972676544,
"grad_norm": 0.09701178740625536,
"learning_rate": 3.167805999191528e-05,
"loss": 0.1586,
"mean_token_accuracy": 0.9466553747653961,
"step": 5160
},
{
"epoch": 1.9874927842986339,
"grad_norm": 0.09811194237130026,
"learning_rate": 3.164091956035659e-05,
"loss": 0.16,
"mean_token_accuracy": 0.9464266657829284,
"step": 5165
},
{
"epoch": 1.9894169713296133,
"grad_norm": 0.09316358619416497,
"learning_rate": 3.1603767446522234e-05,
"loss": 0.156,
"mean_token_accuracy": 0.9476395964622497,
"step": 5170
},
{
"epoch": 1.9913411583605927,
"grad_norm": 0.10022414149683,
"learning_rate": 3.1566603755224976e-05,
"loss": 0.1593,
"mean_token_accuracy": 0.9465124249458313,
"step": 5175
},
{
"epoch": 1.993265345391572,
"grad_norm": 0.09865479422492948,
"learning_rate": 3.152942859131026e-05,
"loss": 0.1599,
"mean_token_accuracy": 0.9463797211647034,
"step": 5180
},
{
"epoch": 1.9951895324225515,
"grad_norm": 0.10300294525826155,
"learning_rate": 3.149224205965587e-05,
"loss": 0.161,
"mean_token_accuracy": 0.9461157321929932,
"step": 5185
},
{
"epoch": 1.9971137194535309,
"grad_norm": 0.09814417031680028,
"learning_rate": 3.145504426517168e-05,
"loss": 0.1586,
"mean_token_accuracy": 0.9469424188137054,
"step": 5190
},
{
"epoch": 1.9990379064845103,
"grad_norm": 0.09283170373290625,
"learning_rate": 3.141783531279932e-05,
"loss": 0.1589,
"mean_token_accuracy": 0.9467655956745148,
"step": 5195
},
{
"epoch": 2.0007696748123918,
"grad_norm": 0.0979217349593047,
"learning_rate": 3.138061530751193e-05,
"loss": 0.1461,
"mean_token_accuracy": 0.950280037191179,
"step": 5200
},
{
"epoch": 2.002693861843371,
"grad_norm": 0.10634331349231106,
"learning_rate": 3.134338435431377e-05,
"loss": 0.1306,
"mean_token_accuracy": 0.954954308271408,
"step": 5205
},
{
"epoch": 2.0046180488743506,
"grad_norm": 0.10679220723686542,
"learning_rate": 3.130614255824006e-05,
"loss": 0.1318,
"mean_token_accuracy": 0.9545697510242462,
"step": 5210
},
{
"epoch": 2.00654223590533,
"grad_norm": 0.1056994101202361,
"learning_rate": 3.1268890024356575e-05,
"loss": 0.1316,
"mean_token_accuracy": 0.9546972990036011,
"step": 5215
},
{
"epoch": 2.0084664229363094,
"grad_norm": 0.1074679315159629,
"learning_rate": 3.1231626857759365e-05,
"loss": 0.1279,
"mean_token_accuracy": 0.9559260845184326,
"step": 5220
},
{
"epoch": 2.010390609967289,
"grad_norm": 0.3393926179404455,
"learning_rate": 3.119435316357451e-05,
"loss": 0.1315,
"mean_token_accuracy": 0.9549328625202179,
"step": 5225
},
{
"epoch": 2.012314796998268,
"grad_norm": 0.10947564562909612,
"learning_rate": 3.115706904695778e-05,
"loss": 0.1298,
"mean_token_accuracy": 0.955171936750412,
"step": 5230
},
{
"epoch": 2.0142389840292476,
"grad_norm": 0.10517939325961964,
"learning_rate": 3.1119774613094335e-05,
"loss": 0.1314,
"mean_token_accuracy": 0.9554017126560211,
"step": 5235
},
{
"epoch": 2.016163171060227,
"grad_norm": 0.116200055131795,
"learning_rate": 3.1082469967198457e-05,
"loss": 0.1261,
"mean_token_accuracy": 0.9565369427204132,
"step": 5240
},
{
"epoch": 2.0180873580912064,
"grad_norm": 0.10526597846445619,
"learning_rate": 3.104515521451323e-05,
"loss": 0.1295,
"mean_token_accuracy": 0.955550628900528,
"step": 5245
},
{
"epoch": 2.020011545122186,
"grad_norm": 0.11016870301187903,
"learning_rate": 3.1007830460310264e-05,
"loss": 0.1301,
"mean_token_accuracy": 0.9552641570568084,
"step": 5250
},
{
"epoch": 2.0219357321531652,
"grad_norm": 0.10511615738632486,
"learning_rate": 3.097049580988935e-05,
"loss": 0.129,
"mean_token_accuracy": 0.9556998193264008,
"step": 5255
},
{
"epoch": 2.0238599191841447,
"grad_norm": 0.10691090435399411,
"learning_rate": 3.093315136857825e-05,
"loss": 0.1291,
"mean_token_accuracy": 0.9555999100208282,
"step": 5260
},
{
"epoch": 2.025784106215124,
"grad_norm": 0.15292956488636977,
"learning_rate": 3.089579724173231e-05,
"loss": 0.1279,
"mean_token_accuracy": 0.9560263872146606,
"step": 5265
},
{
"epoch": 2.0277082932461035,
"grad_norm": 0.10684680779913877,
"learning_rate": 3.085843353473421e-05,
"loss": 0.1298,
"mean_token_accuracy": 0.9552083313465118,
"step": 5270
},
{
"epoch": 2.029632480277083,
"grad_norm": 0.58474037537279,
"learning_rate": 3.082106035299366e-05,
"loss": 0.1292,
"mean_token_accuracy": 0.9555357456207275,
"step": 5275
},
{
"epoch": 2.0315566673080623,
"grad_norm": 0.1065973145069723,
"learning_rate": 3.07836778019471e-05,
"loss": 0.1318,
"mean_token_accuracy": 0.9546404302120208,
"step": 5280
},
{
"epoch": 2.0334808543390417,
"grad_norm": 0.10442064704720966,
"learning_rate": 3.07462859870574e-05,
"loss": 0.1315,
"mean_token_accuracy": 0.9547493577003479,
"step": 5285
},
{
"epoch": 2.035405041370021,
"grad_norm": 0.10873826596804126,
"learning_rate": 3.070888501381357e-05,
"loss": 0.1298,
"mean_token_accuracy": 0.9552703380584717,
"step": 5290
},
{
"epoch": 2.0373292284010005,
"grad_norm": 0.10357965628977037,
"learning_rate": 3.067147498773045e-05,
"loss": 0.1304,
"mean_token_accuracy": 0.9551702678203583,
"step": 5295
},
{
"epoch": 2.03925341543198,
"grad_norm": 0.1074588597649682,
"learning_rate": 3.063405601434841e-05,
"loss": 0.1274,
"mean_token_accuracy": 0.9560691356658936,
"step": 5300
},
{
"epoch": 2.0411776024629593,
"grad_norm": 0.10911825659535652,
"learning_rate": 3.059662819923311e-05,
"loss": 0.1283,
"mean_token_accuracy": 0.9555826544761657,
"step": 5305
},
{
"epoch": 2.0431017894939387,
"grad_norm": 0.10108948948238519,
"learning_rate": 3.0559191647975074e-05,
"loss": 0.1313,
"mean_token_accuracy": 0.9549266755580902,
"step": 5310
},
{
"epoch": 2.045025976524918,
"grad_norm": 0.1071244864350153,
"learning_rate": 3.052174646618956e-05,
"loss": 0.1306,
"mean_token_accuracy": 0.9551376879215241,
"step": 5315
},
{
"epoch": 2.0469501635558975,
"grad_norm": 0.11039730911750427,
"learning_rate": 3.0484292759516104e-05,
"loss": 0.1309,
"mean_token_accuracy": 0.954924464225769,
"step": 5320
},
{
"epoch": 2.048874350586877,
"grad_norm": 0.10763586909319262,
"learning_rate": 3.0446830633618334e-05,
"loss": 0.1293,
"mean_token_accuracy": 0.9557511568069458,
"step": 5325
},
{
"epoch": 2.0507985376178564,
"grad_norm": 0.11038708625650738,
"learning_rate": 3.0409360194183605e-05,
"loss": 0.1325,
"mean_token_accuracy": 0.9542287409305572,
"step": 5330
},
{
"epoch": 2.0527227246488358,
"grad_norm": 0.10981688615939526,
"learning_rate": 3.0371881546922748e-05,
"loss": 0.1309,
"mean_token_accuracy": 0.9547199666500091,
"step": 5335
},
{
"epoch": 2.054646911679815,
"grad_norm": 0.10796682056039919,
"learning_rate": 3.0334394797569725e-05,
"loss": 0.1316,
"mean_token_accuracy": 0.954637098312378,
"step": 5340
},
{
"epoch": 2.0565710987107946,
"grad_norm": 0.11915701134432091,
"learning_rate": 3.029690005188139e-05,
"loss": 0.1296,
"mean_token_accuracy": 0.9551791489124298,
"step": 5345
},
{
"epoch": 2.058495285741774,
"grad_norm": 0.11569250091945521,
"learning_rate": 3.0259397415637114e-05,
"loss": 0.128,
"mean_token_accuracy": 0.9559932589530945,
"step": 5350
},
{
"epoch": 2.0604194727727534,
"grad_norm": 0.10436449803415335,
"learning_rate": 3.0221886994638567e-05,
"loss": 0.1279,
"mean_token_accuracy": 0.9558859884738922,
"step": 5355
},
{
"epoch": 2.062343659803733,
"grad_norm": 0.10517908575815679,
"learning_rate": 3.0184368894709343e-05,
"loss": 0.1314,
"mean_token_accuracy": 0.9548024773597718,
"step": 5360
},
{
"epoch": 2.064267846834712,
"grad_norm": 0.13334566474512122,
"learning_rate": 3.014684322169474e-05,
"loss": 0.13,
"mean_token_accuracy": 0.9551829278469086,
"step": 5365
},
{
"epoch": 2.0661920338656916,
"grad_norm": 0.291710679915826,
"learning_rate": 3.0109310081461405e-05,
"loss": 0.1302,
"mean_token_accuracy": 0.9552090466022491,
"step": 5370
},
{
"epoch": 2.068116220896671,
"grad_norm": 0.11518704261619796,
"learning_rate": 3.007176957989703e-05,
"loss": 0.1281,
"mean_token_accuracy": 0.9560563921928406,
"step": 5375
},
{
"epoch": 2.0700404079276504,
"grad_norm": 0.10943743689817875,
"learning_rate": 3.0034221822910108e-05,
"loss": 0.1296,
"mean_token_accuracy": 0.9554878771305084,
"step": 5380
},
{
"epoch": 2.07196459495863,
"grad_norm": 0.10570008629104946,
"learning_rate": 2.9996666916429578e-05,
"loss": 0.1318,
"mean_token_accuracy": 0.954410445690155,
"step": 5385
},
{
"epoch": 2.0738887819896092,
"grad_norm": 0.11138483865068619,
"learning_rate": 2.9959104966404562e-05,
"loss": 0.1285,
"mean_token_accuracy": 0.9557713150978089,
"step": 5390
},
{
"epoch": 2.0758129690205886,
"grad_norm": 0.11771707950321599,
"learning_rate": 2.9921536078804042e-05,
"loss": 0.1267,
"mean_token_accuracy": 0.9561442255973815,
"step": 5395
},
{
"epoch": 2.077737156051568,
"grad_norm": 0.12294598274945921,
"learning_rate": 2.9883960359616587e-05,
"loss": 0.1301,
"mean_token_accuracy": 0.9550639927387238,
"step": 5400
},
{
"epoch": 2.0796613430825475,
"grad_norm": 0.11316747482591458,
"learning_rate": 2.984637791485001e-05,
"loss": 0.1308,
"mean_token_accuracy": 0.9548614978790283,
"step": 5405
},
{
"epoch": 2.081585530113527,
"grad_norm": 0.10864043023159166,
"learning_rate": 2.9808788850531145e-05,
"loss": 0.1318,
"mean_token_accuracy": 0.9546567440032959,
"step": 5410
},
{
"epoch": 2.0835097171445063,
"grad_norm": 0.10886707239628111,
"learning_rate": 2.9771193272705454e-05,
"loss": 0.1308,
"mean_token_accuracy": 0.9549291670322418,
"step": 5415
},
{
"epoch": 2.0854339041754857,
"grad_norm": 0.10824866644582333,
"learning_rate": 2.9733591287436807e-05,
"loss": 0.1286,
"mean_token_accuracy": 0.9558004319667817,
"step": 5420
},
{
"epoch": 2.087358091206465,
"grad_norm": 0.10670212406836654,
"learning_rate": 2.9695983000807133e-05,
"loss": 0.1281,
"mean_token_accuracy": 0.9559083044528961,
"step": 5425
},
{
"epoch": 2.0892822782374445,
"grad_norm": 0.10772997897188348,
"learning_rate": 2.965836851891614e-05,
"loss": 0.1316,
"mean_token_accuracy": 0.9547219812870026,
"step": 5430
},
{
"epoch": 2.091206465268424,
"grad_norm": 0.10726583715210378,
"learning_rate": 2.9620747947881016e-05,
"loss": 0.1302,
"mean_token_accuracy": 0.9550222814083099,
"step": 5435
},
{
"epoch": 2.0931306522994033,
"grad_norm": 0.10923784314271137,
"learning_rate": 2.958312139383615e-05,
"loss": 0.1305,
"mean_token_accuracy": 0.9548932433128356,
"step": 5440
},
{
"epoch": 2.0950548393303827,
"grad_norm": 0.1149543715362021,
"learning_rate": 2.9545488962932764e-05,
"loss": 0.1299,
"mean_token_accuracy": 0.9552010416984558,
"step": 5445
},
{
"epoch": 2.096979026361362,
"grad_norm": 0.10556163043636207,
"learning_rate": 2.9507850761338694e-05,
"loss": 0.1285,
"mean_token_accuracy": 0.9556468963623047,
"step": 5450
},
{
"epoch": 2.0989032133923415,
"grad_norm": 0.1105578089091931,
"learning_rate": 2.947020689523806e-05,
"loss": 0.1304,
"mean_token_accuracy": 0.9551441192626953,
"step": 5455
},
{
"epoch": 2.100827400423321,
"grad_norm": 0.10875601486403272,
"learning_rate": 2.943255747083093e-05,
"loss": 0.1311,
"mean_token_accuracy": 0.9547959864139557,
"step": 5460
},
{
"epoch": 2.1027515874543004,
"grad_norm": 0.11107473151673773,
"learning_rate": 2.939490259433308e-05,
"loss": 0.1295,
"mean_token_accuracy": 0.955434012413025,
"step": 5465
},
{
"epoch": 2.1046757744852798,
"grad_norm": 0.10500369274811092,
"learning_rate": 2.9357242371975663e-05,
"loss": 0.1289,
"mean_token_accuracy": 0.9555002927780152,
"step": 5470
},
{
"epoch": 2.106599961516259,
"grad_norm": 0.11034845372006379,
"learning_rate": 2.9319576910004908e-05,
"loss": 0.1297,
"mean_token_accuracy": 0.955095624923706,
"step": 5475
},
{
"epoch": 2.1085241485472386,
"grad_norm": 0.1062967363159288,
"learning_rate": 2.9281906314681828e-05,
"loss": 0.1282,
"mean_token_accuracy": 0.9557537794113159,
"step": 5480
},
{
"epoch": 2.1104483355782184,
"grad_norm": 0.10818146503538062,
"learning_rate": 2.9244230692281928e-05,
"loss": 0.1315,
"mean_token_accuracy": 0.9547770500183106,
"step": 5485
},
{
"epoch": 2.1123725226091974,
"grad_norm": 0.15693716675982816,
"learning_rate": 2.920655014909487e-05,
"loss": 0.1281,
"mean_token_accuracy": 0.9557695984840393,
"step": 5490
},
{
"epoch": 2.1142967096401772,
"grad_norm": 0.10530890895897269,
"learning_rate": 2.916886479142422e-05,
"loss": 0.1299,
"mean_token_accuracy": 0.9554035186767578,
"step": 5495
},
{
"epoch": 2.1162208966711566,
"grad_norm": 0.11209369119736379,
"learning_rate": 2.9131174725587134e-05,
"loss": 0.1304,
"mean_token_accuracy": 0.9551724195480347,
"step": 5500
},
{
"epoch": 2.118145083702136,
"grad_norm": 0.10735945610865963,
"learning_rate": 2.9093480057914018e-05,
"loss": 0.1287,
"mean_token_accuracy": 0.9553767502307892,
"step": 5505
},
{
"epoch": 2.1200692707331155,
"grad_norm": 0.10427220007610355,
"learning_rate": 2.9055780894748284e-05,
"loss": 0.1297,
"mean_token_accuracy": 0.9552501380443573,
"step": 5510
},
{
"epoch": 2.121993457764095,
"grad_norm": 0.10587862731328973,
"learning_rate": 2.9018077342446042e-05,
"loss": 0.1296,
"mean_token_accuracy": 0.9554009735584259,
"step": 5515
},
{
"epoch": 2.1239176447950743,
"grad_norm": 0.09896574916048452,
"learning_rate": 2.8980369507375744e-05,
"loss": 0.1292,
"mean_token_accuracy": 0.9556243360042572,
"step": 5520
},
{
"epoch": 2.1258418318260537,
"grad_norm": 0.11066609082111911,
"learning_rate": 2.8942657495917945e-05,
"loss": 0.1301,
"mean_token_accuracy": 0.9550888121128083,
"step": 5525
},
{
"epoch": 2.127766018857033,
"grad_norm": 0.10443162080898229,
"learning_rate": 2.8904941414465002e-05,
"loss": 0.1303,
"mean_token_accuracy": 0.9551991522312164,
"step": 5530
},
{
"epoch": 2.1296902058880125,
"grad_norm": 0.10637238952626588,
"learning_rate": 2.8867221369420722e-05,
"loss": 0.1313,
"mean_token_accuracy": 0.9547699511051178,
"step": 5535
},
{
"epoch": 2.131614392918992,
"grad_norm": 0.10864780408653604,
"learning_rate": 2.8829497467200105e-05,
"loss": 0.1332,
"mean_token_accuracy": 0.9541658878326416,
"step": 5540
},
{
"epoch": 2.1335385799499713,
"grad_norm": 0.10480536707422429,
"learning_rate": 2.879176981422904e-05,
"loss": 0.1315,
"mean_token_accuracy": 0.9548263072967529,
"step": 5545
},
{
"epoch": 2.1354627669809507,
"grad_norm": 0.10840316140924505,
"learning_rate": 2.8754038516943988e-05,
"loss": 0.1299,
"mean_token_accuracy": 0.9551911413669586,
"step": 5550
},
{
"epoch": 2.13738695401193,
"grad_norm": 0.10511882631547212,
"learning_rate": 2.87163036817917e-05,
"loss": 0.1307,
"mean_token_accuracy": 0.9550678730010986,
"step": 5555
},
{
"epoch": 2.1393111410429095,
"grad_norm": 0.11303406026455419,
"learning_rate": 2.8678565415228915e-05,
"loss": 0.1295,
"mean_token_accuracy": 0.9555539965629578,
"step": 5560
},
{
"epoch": 2.141235328073889,
"grad_norm": 0.10462909368723101,
"learning_rate": 2.8640823823722013e-05,
"loss": 0.1317,
"mean_token_accuracy": 0.9547511637210846,
"step": 5565
},
{
"epoch": 2.1431595151048684,
"grad_norm": 0.10309809452782778,
"learning_rate": 2.8603079013746802e-05,
"loss": 0.1287,
"mean_token_accuracy": 0.9556194484233856,
"step": 5570
},
{
"epoch": 2.1450837021358478,
"grad_norm": 0.10589336408568385,
"learning_rate": 2.856533109178815e-05,
"loss": 0.1305,
"mean_token_accuracy": 0.9550436675548554,
"step": 5575
},
{
"epoch": 2.147007889166827,
"grad_norm": 0.10548353857019672,
"learning_rate": 2.8527580164339706e-05,
"loss": 0.1321,
"mean_token_accuracy": 0.9544054627418518,
"step": 5580
},
{
"epoch": 2.1489320761978066,
"grad_norm": 0.10686359661962178,
"learning_rate": 2.8489826337903585e-05,
"loss": 0.1313,
"mean_token_accuracy": 0.9546919703483582,
"step": 5585
},
{
"epoch": 2.150856263228786,
"grad_norm": 0.10848484445107079,
"learning_rate": 2.845206971899011e-05,
"loss": 0.1307,
"mean_token_accuracy": 0.9550032496452332,
"step": 5590
},
{
"epoch": 2.1527804502597654,
"grad_norm": 0.11059986768054421,
"learning_rate": 2.841431041411745e-05,
"loss": 0.1302,
"mean_token_accuracy": 0.9552673876285553,
"step": 5595
},
{
"epoch": 2.154704637290745,
"grad_norm": 0.1153169713302373,
"learning_rate": 2.8376548529811377e-05,
"loss": 0.1306,
"mean_token_accuracy": 0.9550434827804566,
"step": 5600
},
{
"epoch": 2.156628824321724,
"grad_norm": 0.10497406788532454,
"learning_rate": 2.8338784172604915e-05,
"loss": 0.1324,
"mean_token_accuracy": 0.9546684205532074,
"step": 5605
},
{
"epoch": 2.1585530113527036,
"grad_norm": 0.1105251857493249,
"learning_rate": 2.8301017449038087e-05,
"loss": 0.1327,
"mean_token_accuracy": 0.9545048475265503,
"step": 5610
},
{
"epoch": 2.160477198383683,
"grad_norm": 0.1080109687561676,
"learning_rate": 2.8263248465657595e-05,
"loss": 0.1286,
"mean_token_accuracy": 0.9556637406349182,
"step": 5615
},
{
"epoch": 2.1624013854146624,
"grad_norm": 0.10582265706415724,
"learning_rate": 2.8225477329016487e-05,
"loss": 0.1336,
"mean_token_accuracy": 0.9541392922401428,
"step": 5620
},
{
"epoch": 2.164325572445642,
"grad_norm": 0.10822890534554394,
"learning_rate": 2.8187704145673914e-05,
"loss": 0.1327,
"mean_token_accuracy": 0.9543117165565491,
"step": 5625
},
{
"epoch": 2.1662497594766212,
"grad_norm": 0.1062661018079552,
"learning_rate": 2.8149929022194783e-05,
"loss": 0.1294,
"mean_token_accuracy": 0.9552195489406585,
"step": 5630
},
{
"epoch": 2.1681739465076006,
"grad_norm": 0.10273705135295484,
"learning_rate": 2.81121520651495e-05,
"loss": 0.1303,
"mean_token_accuracy": 0.9550776362419129,
"step": 5635
},
{
"epoch": 2.17009813353858,
"grad_norm": 0.10648913106036861,
"learning_rate": 2.80743733811136e-05,
"loss": 0.1318,
"mean_token_accuracy": 0.9548194766044616,
"step": 5640
},
{
"epoch": 2.1720223205695595,
"grad_norm": 0.10344426917331391,
"learning_rate": 2.8036593076667533e-05,
"loss": 0.1295,
"mean_token_accuracy": 0.9556067049503326,
"step": 5645
},
{
"epoch": 2.173946507600539,
"grad_norm": 0.10956564255431846,
"learning_rate": 2.79988112583963e-05,
"loss": 0.1298,
"mean_token_accuracy": 0.9553233087062836,
"step": 5650
},
{
"epoch": 2.1758706946315183,
"grad_norm": 0.10202792432984834,
"learning_rate": 2.796102803288918e-05,
"loss": 0.1311,
"mean_token_accuracy": 0.9548551559448242,
"step": 5655
},
{
"epoch": 2.1777948816624977,
"grad_norm": 0.10763154625024812,
"learning_rate": 2.792324350673941e-05,
"loss": 0.1296,
"mean_token_accuracy": 0.9553223311901092,
"step": 5660
},
{
"epoch": 2.179719068693477,
"grad_norm": 0.10845788005478905,
"learning_rate": 2.7885457786543924e-05,
"loss": 0.1291,
"mean_token_accuracy": 0.9553823292255401,
"step": 5665
},
{
"epoch": 2.1816432557244565,
"grad_norm": 0.10586949851671838,
"learning_rate": 2.784767097890298e-05,
"loss": 0.1297,
"mean_token_accuracy": 0.955373477935791,
"step": 5670
},
{
"epoch": 2.183567442755436,
"grad_norm": 0.1072073809147426,
"learning_rate": 2.7809883190419945e-05,
"loss": 0.1315,
"mean_token_accuracy": 0.9549127817153931,
"step": 5675
},
{
"epoch": 2.1854916297864153,
"grad_norm": 0.10760418876402711,
"learning_rate": 2.777209452770093e-05,
"loss": 0.1323,
"mean_token_accuracy": 0.9545275568962097,
"step": 5680
},
{
"epoch": 2.1874158168173947,
"grad_norm": 0.12750165015325696,
"learning_rate": 2.7734305097354528e-05,
"loss": 0.1313,
"mean_token_accuracy": 0.9550306141376496,
"step": 5685
},
{
"epoch": 2.189340003848374,
"grad_norm": 0.10549599237818093,
"learning_rate": 2.7696515005991487e-05,
"loss": 0.1287,
"mean_token_accuracy": 0.9555880784988403,
"step": 5690
},
{
"epoch": 2.1912641908793535,
"grad_norm": 0.10416146043638136,
"learning_rate": 2.765872436022442e-05,
"loss": 0.1281,
"mean_token_accuracy": 0.9558050215244294,
"step": 5695
},
{
"epoch": 2.193188377910333,
"grad_norm": 0.11222982332531294,
"learning_rate": 2.7620933266667502e-05,
"loss": 0.1305,
"mean_token_accuracy": 0.954997855424881,
"step": 5700
},
{
"epoch": 2.1951125649413123,
"grad_norm": 0.10838434196926705,
"learning_rate": 2.7583141831936187e-05,
"loss": 0.1296,
"mean_token_accuracy": 0.9555193424224854,
"step": 5705
},
{
"epoch": 2.1970367519722918,
"grad_norm": 0.10364017631139727,
"learning_rate": 2.7545350162646882e-05,
"loss": 0.1274,
"mean_token_accuracy": 0.9559944987297058,
"step": 5710
},
{
"epoch": 2.198960939003271,
"grad_norm": 0.11070015056421295,
"learning_rate": 2.750755836541663e-05,
"loss": 0.1311,
"mean_token_accuracy": 0.9547731518745423,
"step": 5715
},
{
"epoch": 2.2008851260342506,
"grad_norm": 0.10907380677894396,
"learning_rate": 2.7469766546862878e-05,
"loss": 0.1305,
"mean_token_accuracy": 0.9551173925399781,
"step": 5720
},
{
"epoch": 2.20280931306523,
"grad_norm": 0.10698591041494747,
"learning_rate": 2.7431974813603128e-05,
"loss": 0.1276,
"mean_token_accuracy": 0.9559794843196869,
"step": 5725
},
{
"epoch": 2.2047335000962094,
"grad_norm": 0.10973257299820491,
"learning_rate": 2.7394183272254603e-05,
"loss": 0.1297,
"mean_token_accuracy": 0.9552995502948761,
"step": 5730
},
{
"epoch": 2.206657687127189,
"grad_norm": 0.10609738612776014,
"learning_rate": 2.7356392029434008e-05,
"loss": 0.1269,
"mean_token_accuracy": 0.9562586367130279,
"step": 5735
},
{
"epoch": 2.208581874158168,
"grad_norm": 0.10594911969855815,
"learning_rate": 2.7318601191757227e-05,
"loss": 0.1301,
"mean_token_accuracy": 0.9548856914043427,
"step": 5740
},
{
"epoch": 2.2105060611891476,
"grad_norm": 0.11002601673144265,
"learning_rate": 2.7280810865838968e-05,
"loss": 0.1313,
"mean_token_accuracy": 0.9547413468360901,
"step": 5745
},
{
"epoch": 2.212430248220127,
"grad_norm": 0.10815764932630612,
"learning_rate": 2.7243021158292508e-05,
"loss": 0.1315,
"mean_token_accuracy": 0.9547680258750916,
"step": 5750
},
{
"epoch": 2.2143544352511064,
"grad_norm": 0.10325292557809732,
"learning_rate": 2.7205232175729385e-05,
"loss": 0.1285,
"mean_token_accuracy": 0.9557075679302216,
"step": 5755
},
{
"epoch": 2.216278622282086,
"grad_norm": 0.10368790762039573,
"learning_rate": 2.7167444024759072e-05,
"loss": 0.1274,
"mean_token_accuracy": 0.9563073456287384,
"step": 5760
},
{
"epoch": 2.2182028093130652,
"grad_norm": 0.10585655474534729,
"learning_rate": 2.712965681198873e-05,
"loss": 0.1295,
"mean_token_accuracy": 0.9553906381130218,
"step": 5765
},
{
"epoch": 2.2201269963440446,
"grad_norm": 0.10760885515979707,
"learning_rate": 2.7091870644022848e-05,
"loss": 0.1313,
"mean_token_accuracy": 0.954918497800827,
"step": 5770
},
{
"epoch": 2.222051183375024,
"grad_norm": 0.10509001085051531,
"learning_rate": 2.7054085627462967e-05,
"loss": 0.1313,
"mean_token_accuracy": 0.954748409986496,
"step": 5775
},
{
"epoch": 2.2239753704060035,
"grad_norm": 0.10709947966640206,
"learning_rate": 2.701630186890738e-05,
"loss": 0.1302,
"mean_token_accuracy": 0.9552448928356171,
"step": 5780
},
{
"epoch": 2.225899557436983,
"grad_norm": 0.11355608152711724,
"learning_rate": 2.6978519474950853e-05,
"loss": 0.13,
"mean_token_accuracy": 0.9551321148872376,
"step": 5785
},
{
"epoch": 2.2278237444679623,
"grad_norm": 0.10715051929012634,
"learning_rate": 2.694073855218428e-05,
"loss": 0.1297,
"mean_token_accuracy": 0.9552355110645294,
"step": 5790
},
{
"epoch": 2.2297479314989417,
"grad_norm": 0.10240754149347893,
"learning_rate": 2.6902959207194394e-05,
"loss": 0.1306,
"mean_token_accuracy": 0.9551454603672027,
"step": 5795
},
{
"epoch": 2.231672118529921,
"grad_norm": 0.10684337470859812,
"learning_rate": 2.6865181546563516e-05,
"loss": 0.1323,
"mean_token_accuracy": 0.9545264899730682,
"step": 5800
},
{
"epoch": 2.2335963055609005,
"grad_norm": 0.11288874992338749,
"learning_rate": 2.682740567686918e-05,
"loss": 0.1291,
"mean_token_accuracy": 0.9554365873336792,
"step": 5805
},
{
"epoch": 2.23552049259188,
"grad_norm": 0.10965823833426662,
"learning_rate": 2.6789631704683887e-05,
"loss": 0.1277,
"mean_token_accuracy": 0.9561128377914428,
"step": 5810
},
{
"epoch": 2.2374446796228593,
"grad_norm": 0.10668910390319478,
"learning_rate": 2.6751859736574764e-05,
"loss": 0.1307,
"mean_token_accuracy": 0.9552412092685699,
"step": 5815
},
{
"epoch": 2.2393688666538387,
"grad_norm": 0.10517025682012639,
"learning_rate": 2.6714089879103304e-05,
"loss": 0.1315,
"mean_token_accuracy": 0.9549421489238739,
"step": 5820
},
{
"epoch": 2.241293053684818,
"grad_norm": 0.1086958023252,
"learning_rate": 2.667632223882504e-05,
"loss": 0.1314,
"mean_token_accuracy": 0.9544699192047119,
"step": 5825
},
{
"epoch": 2.2432172407157975,
"grad_norm": 0.10850082865160174,
"learning_rate": 2.6638556922289266e-05,
"loss": 0.1278,
"mean_token_accuracy": 0.9557575345039367,
"step": 5830
},
{
"epoch": 2.245141427746777,
"grad_norm": 0.11397149685095452,
"learning_rate": 2.660079403603867e-05,
"loss": 0.1298,
"mean_token_accuracy": 0.9550568044185639,
"step": 5835
},
{
"epoch": 2.2470656147777563,
"grad_norm": 0.10692811819225884,
"learning_rate": 2.6563033686609135e-05,
"loss": 0.1288,
"mean_token_accuracy": 0.95555659532547,
"step": 5840
},
{
"epoch": 2.2489898018087358,
"grad_norm": 0.10865015750464559,
"learning_rate": 2.6525275980529375e-05,
"loss": 0.1294,
"mean_token_accuracy": 0.9553133845329285,
"step": 5845
},
{
"epoch": 2.250913988839715,
"grad_norm": 0.10811587288138687,
"learning_rate": 2.648752102432062e-05,
"loss": 0.1308,
"mean_token_accuracy": 0.9549175322055816,
"step": 5850
},
{
"epoch": 2.2528381758706946,
"grad_norm": 0.11089053169125833,
"learning_rate": 2.6449768924496392e-05,
"loss": 0.1315,
"mean_token_accuracy": 0.9547737777233124,
"step": 5855
},
{
"epoch": 2.254762362901674,
"grad_norm": 0.11025011291342199,
"learning_rate": 2.6412019787562103e-05,
"loss": 0.1259,
"mean_token_accuracy": 0.9564937889575958,
"step": 5860
},
{
"epoch": 2.2566865499326534,
"grad_norm": 0.10938026792029695,
"learning_rate": 2.6374273720014836e-05,
"loss": 0.1322,
"mean_token_accuracy": 0.9544373095035553,
"step": 5865
},
{
"epoch": 2.258610736963633,
"grad_norm": 0.10214032067674882,
"learning_rate": 2.6336530828343e-05,
"loss": 0.1288,
"mean_token_accuracy": 0.9556487739086151,
"step": 5870
},
{
"epoch": 2.260534923994612,
"grad_norm": 0.10768313363292305,
"learning_rate": 2.629879121902607e-05,
"loss": 0.1291,
"mean_token_accuracy": 0.9554717481136322,
"step": 5875
},
{
"epoch": 2.2624591110255916,
"grad_norm": 0.1064398396196536,
"learning_rate": 2.6261054998534225e-05,
"loss": 0.1295,
"mean_token_accuracy": 0.9554791688919068,
"step": 5880
},
{
"epoch": 2.264383298056571,
"grad_norm": 0.10476910163839293,
"learning_rate": 2.62233222733281e-05,
"loss": 0.1295,
"mean_token_accuracy": 0.9552315890789032,
"step": 5885
},
{
"epoch": 2.2663074850875504,
"grad_norm": 0.10681177262629739,
"learning_rate": 2.6185593149858485e-05,
"loss": 0.1312,
"mean_token_accuracy": 0.9546931982040405,
"step": 5890
},
{
"epoch": 2.26823167211853,
"grad_norm": 0.11073979536465368,
"learning_rate": 2.614786773456599e-05,
"loss": 0.1279,
"mean_token_accuracy": 0.9559510767459869,
"step": 5895
},
{
"epoch": 2.2701558591495092,
"grad_norm": 0.10405240104670332,
"learning_rate": 2.611014613388075e-05,
"loss": 0.1288,
"mean_token_accuracy": 0.9558224618434906,
"step": 5900
},
{
"epoch": 2.2720800461804886,
"grad_norm": 0.1063203663396225,
"learning_rate": 2.6072428454222175e-05,
"loss": 0.1301,
"mean_token_accuracy": 0.9550588607788086,
"step": 5905
},
{
"epoch": 2.274004233211468,
"grad_norm": 0.10756387316528157,
"learning_rate": 2.603471480199859e-05,
"loss": 0.1313,
"mean_token_accuracy": 0.954735153913498,
"step": 5910
},
{
"epoch": 2.2759284202424475,
"grad_norm": 0.1036959392954559,
"learning_rate": 2.599700528360697e-05,
"loss": 0.13,
"mean_token_accuracy": 0.9552125930786133,
"step": 5915
},
{
"epoch": 2.277852607273427,
"grad_norm": 0.10582831579947122,
"learning_rate": 2.5959300005432596e-05,
"loss": 0.1316,
"mean_token_accuracy": 0.9549217462539673,
"step": 5920
},
{
"epoch": 2.2797767943044063,
"grad_norm": 0.10638865399029085,
"learning_rate": 2.5921599073848828e-05,
"loss": 0.129,
"mean_token_accuracy": 0.9555549681186676,
"step": 5925
},
{
"epoch": 2.2817009813353857,
"grad_norm": 0.1063432249936701,
"learning_rate": 2.5883902595216737e-05,
"loss": 0.1299,
"mean_token_accuracy": 0.9553889214992524,
"step": 5930
},
{
"epoch": 2.283625168366365,
"grad_norm": 0.11415249587430622,
"learning_rate": 2.584621067588486e-05,
"loss": 0.13,
"mean_token_accuracy": 0.9552380859851837,
"step": 5935
},
{
"epoch": 2.2855493553973445,
"grad_norm": 0.10559109236104772,
"learning_rate": 2.580852342218883e-05,
"loss": 0.1312,
"mean_token_accuracy": 0.9549641907215118,
"step": 5940
},
{
"epoch": 2.287473542428324,
"grad_norm": 0.11142315713399763,
"learning_rate": 2.5770840940451134e-05,
"loss": 0.1274,
"mean_token_accuracy": 0.9562214732170105,
"step": 5945
},
{
"epoch": 2.2893977294593033,
"grad_norm": 0.10625800042077924,
"learning_rate": 2.5733163336980825e-05,
"loss": 0.1282,
"mean_token_accuracy": 0.9559169769287109,
"step": 5950
},
{
"epoch": 2.2913219164902827,
"grad_norm": 0.10950350463698343,
"learning_rate": 2.5695490718073158e-05,
"loss": 0.1341,
"mean_token_accuracy": 0.9538970530033112,
"step": 5955
},
{
"epoch": 2.293246103521262,
"grad_norm": 0.10484188965016204,
"learning_rate": 2.5657823190009338e-05,
"loss": 0.1309,
"mean_token_accuracy": 0.9549141764640808,
"step": 5960
},
{
"epoch": 2.2951702905522415,
"grad_norm": 0.11126703936208204,
"learning_rate": 2.5620160859056204e-05,
"loss": 0.1296,
"mean_token_accuracy": 0.955399614572525,
"step": 5965
},
{
"epoch": 2.297094477583221,
"grad_norm": 0.11122131191415677,
"learning_rate": 2.5582503831465938e-05,
"loss": 0.1321,
"mean_token_accuracy": 0.9543533623218536,
"step": 5970
},
{
"epoch": 2.2990186646142003,
"grad_norm": 0.1049382676038808,
"learning_rate": 2.554485221347575e-05,
"loss": 0.1325,
"mean_token_accuracy": 0.9545806467533111,
"step": 5975
},
{
"epoch": 2.3009428516451798,
"grad_norm": 0.10943907755115642,
"learning_rate": 2.5507206111307626e-05,
"loss": 0.1307,
"mean_token_accuracy": 0.9549081921577454,
"step": 5980
},
{
"epoch": 2.302867038676159,
"grad_norm": 0.10775764576510022,
"learning_rate": 2.5469565631167934e-05,
"loss": 0.1296,
"mean_token_accuracy": 0.9553714513778686,
"step": 5985
},
{
"epoch": 2.3047912257071386,
"grad_norm": 0.11461333255761326,
"learning_rate": 2.5431930879247218e-05,
"loss": 0.1301,
"mean_token_accuracy": 0.9552597403526306,
"step": 5990
},
{
"epoch": 2.306715412738118,
"grad_norm": 0.11188764073802483,
"learning_rate": 2.5394301961719858e-05,
"loss": 0.1293,
"mean_token_accuracy": 0.9555404365062714,
"step": 5995
},
{
"epoch": 2.3086395997690974,
"grad_norm": 0.10376185181583203,
"learning_rate": 2.535667898474377e-05,
"loss": 0.1277,
"mean_token_accuracy": 0.955972284078598,
"step": 6000
},
{
"epoch": 2.310563786800077,
"grad_norm": 0.10982484930848939,
"learning_rate": 2.531906205446009e-05,
"loss": 0.1319,
"mean_token_accuracy": 0.9546591579914093,
"step": 6005
},
{
"epoch": 2.312487973831056,
"grad_norm": 0.10442132804146616,
"learning_rate": 2.528145127699294e-05,
"loss": 0.1279,
"mean_token_accuracy": 0.9557735204696656,
"step": 6010
},
{
"epoch": 2.3144121608620356,
"grad_norm": 0.10481317479938844,
"learning_rate": 2.5243846758449042e-05,
"loss": 0.1285,
"mean_token_accuracy": 0.9558473527431488,
"step": 6015
},
{
"epoch": 2.316336347893015,
"grad_norm": 0.10672119468012388,
"learning_rate": 2.520624860491748e-05,
"loss": 0.1286,
"mean_token_accuracy": 0.9557987153530121,
"step": 6020
},
{
"epoch": 2.3182605349239944,
"grad_norm": 0.10977936393100184,
"learning_rate": 2.5168656922469398e-05,
"loss": 0.1301,
"mean_token_accuracy": 0.9552335500717163,
"step": 6025
},
{
"epoch": 2.3201847219549743,
"grad_norm": 0.10657246279734564,
"learning_rate": 2.5131071817157636e-05,
"loss": 0.1294,
"mean_token_accuracy": 0.9553155183792115,
"step": 6030
},
{
"epoch": 2.3221089089859532,
"grad_norm": 0.11667544810089918,
"learning_rate": 2.509349339501652e-05,
"loss": 0.128,
"mean_token_accuracy": 0.9557375490665436,
"step": 6035
},
{
"epoch": 2.324033096016933,
"grad_norm": 0.10684484275625068,
"learning_rate": 2.505592176206151e-05,
"loss": 0.1307,
"mean_token_accuracy": 0.9550171077251435,
"step": 6040
},
{
"epoch": 2.325957283047912,
"grad_norm": 0.11365053047555458,
"learning_rate": 2.5018357024288917e-05,
"loss": 0.128,
"mean_token_accuracy": 0.9558144569396972,
"step": 6045
},
{
"epoch": 2.327881470078892,
"grad_norm": 0.1057871075763028,
"learning_rate": 2.4980799287675578e-05,
"loss": 0.1281,
"mean_token_accuracy": 0.9556811392307282,
"step": 6050
},
{
"epoch": 2.329805657109871,
"grad_norm": 0.11063592984299654,
"learning_rate": 2.4943248658178603e-05,
"loss": 0.1304,
"mean_token_accuracy": 0.9552521586418152,
"step": 6055
},
{
"epoch": 2.3317298441408507,
"grad_norm": 0.11130860025823555,
"learning_rate": 2.4905705241735032e-05,
"loss": 0.1312,
"mean_token_accuracy": 0.9547226846218109,
"step": 6060
},
{
"epoch": 2.3336540311718297,
"grad_norm": 0.10343549418767663,
"learning_rate": 2.4868169144261595e-05,
"loss": 0.1276,
"mean_token_accuracy": 0.9562317252159118,
"step": 6065
},
{
"epoch": 2.3355782182028095,
"grad_norm": 0.11017111425706975,
"learning_rate": 2.4830640471654317e-05,
"loss": 0.1279,
"mean_token_accuracy": 0.9559598863124847,
"step": 6070
},
{
"epoch": 2.3375024052337885,
"grad_norm": 0.10762762532836595,
"learning_rate": 2.4793119329788307e-05,
"loss": 0.129,
"mean_token_accuracy": 0.9555034756660461,
"step": 6075
},
{
"epoch": 2.3394265922647683,
"grad_norm": 0.1047860968850429,
"learning_rate": 2.475560582451743e-05,
"loss": 0.1323,
"mean_token_accuracy": 0.9545365333557129,
"step": 6080
},
{
"epoch": 2.3413507792957473,
"grad_norm": 0.10233494138880919,
"learning_rate": 2.471810006167401e-05,
"loss": 0.1296,
"mean_token_accuracy": 0.9555085003376007,
"step": 6085
},
{
"epoch": 2.343274966326727,
"grad_norm": 0.10617127970482486,
"learning_rate": 2.4680602147068526e-05,
"loss": 0.1289,
"mean_token_accuracy": 0.9554856240749359,
"step": 6090
},
{
"epoch": 2.345199153357706,
"grad_norm": 0.10203977332496438,
"learning_rate": 2.464311218648928e-05,
"loss": 0.1295,
"mean_token_accuracy": 0.9553018271923065,
"step": 6095
},
{
"epoch": 2.347123340388686,
"grad_norm": 0.10411939683124619,
"learning_rate": 2.4605630285702196e-05,
"loss": 0.1302,
"mean_token_accuracy": 0.9553432047367096,
"step": 6100
},
{
"epoch": 2.3490475274196654,
"grad_norm": 0.11351176055257131,
"learning_rate": 2.456815655045041e-05,
"loss": 0.1324,
"mean_token_accuracy": 0.9544381856918335,
"step": 6105
},
{
"epoch": 2.350971714450645,
"grad_norm": 0.11034240481943162,
"learning_rate": 2.4530691086454055e-05,
"loss": 0.1301,
"mean_token_accuracy": 0.9551987648010254,
"step": 6110
},
{
"epoch": 2.352895901481624,
"grad_norm": 0.11259140096829676,
"learning_rate": 2.4493233999409904e-05,
"loss": 0.1325,
"mean_token_accuracy": 0.9543862223625184,
"step": 6115
},
{
"epoch": 2.3548200885126036,
"grad_norm": 0.10479962693609542,
"learning_rate": 2.4455785394991104e-05,
"loss": 0.1303,
"mean_token_accuracy": 0.9551300168037414,
"step": 6120
},
{
"epoch": 2.356744275543583,
"grad_norm": 0.10724118026152689,
"learning_rate": 2.441834537884688e-05,
"loss": 0.1302,
"mean_token_accuracy": 0.9552154541015625,
"step": 6125
},
{
"epoch": 2.3586684625745624,
"grad_norm": 0.11255024870502185,
"learning_rate": 2.438091405660224e-05,
"loss": 0.1292,
"mean_token_accuracy": 0.955460250377655,
"step": 6130
},
{
"epoch": 2.360592649605542,
"grad_norm": 0.10969541941331791,
"learning_rate": 2.4343491533857616e-05,
"loss": 0.1292,
"mean_token_accuracy": 0.9556892514228821,
"step": 6135
},
{
"epoch": 2.3625168366365212,
"grad_norm": 0.111322111822571,
"learning_rate": 2.4306077916188662e-05,
"loss": 0.134,
"mean_token_accuracy": 0.953878390789032,
"step": 6140
},
{
"epoch": 2.3644410236675006,
"grad_norm": 0.10714069318727834,
"learning_rate": 2.4268673309145894e-05,
"loss": 0.1304,
"mean_token_accuracy": 0.9550508975982666,
"step": 6145
},
{
"epoch": 2.36636521069848,
"grad_norm": 0.1053211683904264,
"learning_rate": 2.423127781825441e-05,
"loss": 0.1299,
"mean_token_accuracy": 0.9550895631313324,
"step": 6150
},
{
"epoch": 2.3682893977294595,
"grad_norm": 0.10556315886193816,
"learning_rate": 2.419389154901358e-05,
"loss": 0.1276,
"mean_token_accuracy": 0.9559677720069886,
"step": 6155
},
{
"epoch": 2.370213584760439,
"grad_norm": 0.10627216599036601,
"learning_rate": 2.415651460689677e-05,
"loss": 0.1285,
"mean_token_accuracy": 0.955669516324997,
"step": 6160
},
{
"epoch": 2.3721377717914183,
"grad_norm": 0.10979734924510835,
"learning_rate": 2.4119147097351014e-05,
"loss": 0.1275,
"mean_token_accuracy": 0.9557546257972718,
"step": 6165
},
{
"epoch": 2.3740619588223977,
"grad_norm": 0.10970260677661284,
"learning_rate": 2.4081789125796766e-05,
"loss": 0.1294,
"mean_token_accuracy": 0.9555782079696655,
"step": 6170
},
{
"epoch": 2.375986145853377,
"grad_norm": 0.10998606034881979,
"learning_rate": 2.404444079762756e-05,
"loss": 0.1311,
"mean_token_accuracy": 0.9547999441623688,
"step": 6175
},
{
"epoch": 2.3779103328843565,
"grad_norm": 0.10907461607280992,
"learning_rate": 2.400710221820969e-05,
"loss": 0.1308,
"mean_token_accuracy": 0.9551190435886383,
"step": 6180
},
{
"epoch": 2.379834519915336,
"grad_norm": 0.11295246820088488,
"learning_rate": 2.3969773492881992e-05,
"loss": 0.1273,
"mean_token_accuracy": 0.9561882793903351,
"step": 6185
},
{
"epoch": 2.3817587069463153,
"grad_norm": 0.10798424508112144,
"learning_rate": 2.393245472695549e-05,
"loss": 0.1297,
"mean_token_accuracy": 0.9552112817764282,
"step": 6190
},
{
"epoch": 2.3836828939772947,
"grad_norm": 0.10748192867953561,
"learning_rate": 2.38951460257131e-05,
"loss": 0.1306,
"mean_token_accuracy": 0.9548419535160064,
"step": 6195
},
{
"epoch": 2.385607081008274,
"grad_norm": 0.10813811798830832,
"learning_rate": 2.3857847494409346e-05,
"loss": 0.1296,
"mean_token_accuracy": 0.9553369998931884,
"step": 6200
},
{
"epoch": 2.3875312680392535,
"grad_norm": 0.10923338235212277,
"learning_rate": 2.3820559238270075e-05,
"loss": 0.1283,
"mean_token_accuracy": 0.9558035612106324,
"step": 6205
},
{
"epoch": 2.389455455070233,
"grad_norm": 0.10504288528798039,
"learning_rate": 2.378328136249212e-05,
"loss": 0.1276,
"mean_token_accuracy": 0.9560654640197754,
"step": 6210
},
{
"epoch": 2.3913796421012123,
"grad_norm": 0.10898082105141034,
"learning_rate": 2.3746013972243063e-05,
"loss": 0.1316,
"mean_token_accuracy": 0.9543659031391144,
"step": 6215
},
{
"epoch": 2.3933038291321918,
"grad_norm": 0.1106138395232921,
"learning_rate": 2.370875717266087e-05,
"loss": 0.1308,
"mean_token_accuracy": 0.9551527023315429,
"step": 6220
},
{
"epoch": 2.395228016163171,
"grad_norm": 0.11050211677827432,
"learning_rate": 2.3671511068853654e-05,
"loss": 0.1321,
"mean_token_accuracy": 0.9545704066753388,
"step": 6225
},
{
"epoch": 2.3971522031941506,
"grad_norm": 0.10582372989514112,
"learning_rate": 2.3634275765899334e-05,
"loss": 0.1311,
"mean_token_accuracy": 0.9553862631320953,
"step": 6230
},
{
"epoch": 2.39907639022513,
"grad_norm": 0.10502224579162833,
"learning_rate": 2.3597051368845387e-05,
"loss": 0.1283,
"mean_token_accuracy": 0.955961138010025,
"step": 6235
},
{
"epoch": 2.4010005772561094,
"grad_norm": 0.10486880517915072,
"learning_rate": 2.355983798270848e-05,
"loss": 0.1257,
"mean_token_accuracy": 0.9567702889442444,
"step": 6240
},
{
"epoch": 2.402924764287089,
"grad_norm": 0.10885534350453244,
"learning_rate": 2.3522635712474255e-05,
"loss": 0.1299,
"mean_token_accuracy": 0.9552649140357972,
"step": 6245
},
{
"epoch": 2.404848951318068,
"grad_norm": 0.10640605673443858,
"learning_rate": 2.348544466309698e-05,
"loss": 0.1268,
"mean_token_accuracy": 0.9563517928123474,
"step": 6250
},
{
"epoch": 2.4067731383490476,
"grad_norm": 0.10772345727706822,
"learning_rate": 2.3448264939499254e-05,
"loss": 0.1317,
"mean_token_accuracy": 0.9546447098255157,
"step": 6255
},
{
"epoch": 2.408697325380027,
"grad_norm": 0.10469289530051222,
"learning_rate": 2.341109664657175e-05,
"loss": 0.13,
"mean_token_accuracy": 0.9551982462406159,
"step": 6260
},
{
"epoch": 2.4106215124110064,
"grad_norm": 0.10016045196030172,
"learning_rate": 2.337393988917287e-05,
"loss": 0.1278,
"mean_token_accuracy": 0.9561413705348969,
"step": 6265
},
{
"epoch": 2.412545699441986,
"grad_norm": 0.10461285052219418,
"learning_rate": 2.3336794772128472e-05,
"loss": 0.1287,
"mean_token_accuracy": 0.9557947039604187,
"step": 6270
},
{
"epoch": 2.4144698864729652,
"grad_norm": 0.10948702841750811,
"learning_rate": 2.3299661400231592e-05,
"loss": 0.128,
"mean_token_accuracy": 0.9558991074562073,
"step": 6275
},
{
"epoch": 2.4163940735039446,
"grad_norm": 0.1056211764656455,
"learning_rate": 2.326253987824214e-05,
"loss": 0.1289,
"mean_token_accuracy": 0.9556284606456756,
"step": 6280
},
{
"epoch": 2.418318260534924,
"grad_norm": 0.10727635952931647,
"learning_rate": 2.322543031088655e-05,
"loss": 0.1273,
"mean_token_accuracy": 0.9564097642898559,
"step": 6285
},
{
"epoch": 2.4202424475659035,
"grad_norm": 0.10332341197029843,
"learning_rate": 2.3188332802857564e-05,
"loss": 0.1289,
"mean_token_accuracy": 0.9555360496044158,
"step": 6290
},
{
"epoch": 2.422166634596883,
"grad_norm": 0.10742295084290715,
"learning_rate": 2.3151247458813907e-05,
"loss": 0.1307,
"mean_token_accuracy": 0.954986971616745,
"step": 6295
},
{
"epoch": 2.4240908216278623,
"grad_norm": 0.10873199498010518,
"learning_rate": 2.3114174383379972e-05,
"loss": 0.1255,
"mean_token_accuracy": 0.9566889405250549,
"step": 6300
},
{
"epoch": 2.4260150086588417,
"grad_norm": 0.11212883852617865,
"learning_rate": 2.3077113681145534e-05,
"loss": 0.1301,
"mean_token_accuracy": 0.9550743222236633,
"step": 6305
},
{
"epoch": 2.427939195689821,
"grad_norm": 0.10950238580710399,
"learning_rate": 2.304006545666548e-05,
"loss": 0.1299,
"mean_token_accuracy": 0.9552758276462555,
"step": 6310
},
{
"epoch": 2.4298633827208005,
"grad_norm": 0.10426204545899302,
"learning_rate": 2.300302981445948e-05,
"loss": 0.1291,
"mean_token_accuracy": 0.9555193781852722,
"step": 6315
},
{
"epoch": 2.43178756975178,
"grad_norm": 0.11268725403596633,
"learning_rate": 2.296600685901173e-05,
"loss": 0.1286,
"mean_token_accuracy": 0.9557891607284545,
"step": 6320
},
{
"epoch": 2.4337117567827593,
"grad_norm": 0.10299250442381977,
"learning_rate": 2.2928996694770595e-05,
"loss": 0.1297,
"mean_token_accuracy": 0.955392187833786,
"step": 6325
},
{
"epoch": 2.4356359438137387,
"grad_norm": 0.11412695538997933,
"learning_rate": 2.2891999426148386e-05,
"loss": 0.1285,
"mean_token_accuracy": 0.9556662082672119,
"step": 6330
},
{
"epoch": 2.437560130844718,
"grad_norm": 0.1072278675480645,
"learning_rate": 2.285501515752102e-05,
"loss": 0.1294,
"mean_token_accuracy": 0.9552786529064179,
"step": 6335
},
{
"epoch": 2.4394843178756975,
"grad_norm": 0.10597779313195058,
"learning_rate": 2.281804399322775e-05,
"loss": 0.1288,
"mean_token_accuracy": 0.9556537508964539,
"step": 6340
},
{
"epoch": 2.441408504906677,
"grad_norm": 0.10529809189878536,
"learning_rate": 2.2781086037570857e-05,
"loss": 0.1286,
"mean_token_accuracy": 0.9557879388332366,
"step": 6345
},
{
"epoch": 2.4433326919376563,
"grad_norm": 0.10633758212276818,
"learning_rate": 2.274414139481533e-05,
"loss": 0.129,
"mean_token_accuracy": 0.955594515800476,
"step": 6350
},
{
"epoch": 2.4452568789686357,
"grad_norm": 0.10585477214924503,
"learning_rate": 2.2707210169188644e-05,
"loss": 0.1282,
"mean_token_accuracy": 0.9559841394424439,
"step": 6355
},
{
"epoch": 2.447181065999615,
"grad_norm": 0.10316097711805097,
"learning_rate": 2.2670292464880383e-05,
"loss": 0.1263,
"mean_token_accuracy": 0.956607323884964,
"step": 6360
},
{
"epoch": 2.4491052530305946,
"grad_norm": 0.1073703759899156,
"learning_rate": 2.263338838604202e-05,
"loss": 0.1253,
"mean_token_accuracy": 0.956674462556839,
"step": 6365
},
{
"epoch": 2.451029440061574,
"grad_norm": 0.10776694295962279,
"learning_rate": 2.259649803678656e-05,
"loss": 0.1304,
"mean_token_accuracy": 0.9550556719303132,
"step": 6370
},
{
"epoch": 2.4529536270925534,
"grad_norm": 0.10483844103984719,
"learning_rate": 2.2559621521188277e-05,
"loss": 0.1279,
"mean_token_accuracy": 0.9559533834457398,
"step": 6375
},
{
"epoch": 2.454877814123533,
"grad_norm": 0.1060013694021598,
"learning_rate": 2.2522758943282442e-05,
"loss": 0.1289,
"mean_token_accuracy": 0.9556844234466553,
"step": 6380
},
{
"epoch": 2.456802001154512,
"grad_norm": 0.10954260145685858,
"learning_rate": 2.2485910407064985e-05,
"loss": 0.1295,
"mean_token_accuracy": 0.9554052412509918,
"step": 6385
},
{
"epoch": 2.4587261881854916,
"grad_norm": 0.10584320212310534,
"learning_rate": 2.2449076016492222e-05,
"loss": 0.1291,
"mean_token_accuracy": 0.9558072209358215,
"step": 6390
},
{
"epoch": 2.460650375216471,
"grad_norm": 0.11050640193114457,
"learning_rate": 2.2412255875480558e-05,
"loss": 0.1287,
"mean_token_accuracy": 0.9557153820991516,
"step": 6395
},
{
"epoch": 2.4625745622474504,
"grad_norm": 0.10813729433867338,
"learning_rate": 2.2375450087906214e-05,
"loss": 0.128,
"mean_token_accuracy": 0.9559723854064941,
"step": 6400
},
{
"epoch": 2.46449874927843,
"grad_norm": 0.10682656562324673,
"learning_rate": 2.2338658757604908e-05,
"loss": 0.129,
"mean_token_accuracy": 0.9554776012897491,
"step": 6405
},
{
"epoch": 2.4664229363094092,
"grad_norm": 0.10658960516187169,
"learning_rate": 2.2301881988371574e-05,
"loss": 0.1273,
"mean_token_accuracy": 0.9560759603977204,
"step": 6410
},
{
"epoch": 2.4683471233403886,
"grad_norm": 0.10616178994876622,
"learning_rate": 2.226511988396006e-05,
"loss": 0.13,
"mean_token_accuracy": 0.9551859855651855,
"step": 6415
},
{
"epoch": 2.470271310371368,
"grad_norm": 0.1172111283269971,
"learning_rate": 2.2228372548082842e-05,
"loss": 0.1298,
"mean_token_accuracy": 0.9552642643451691,
"step": 6420
},
{
"epoch": 2.4721954974023475,
"grad_norm": 0.10774671208770267,
"learning_rate": 2.2191640084410735e-05,
"loss": 0.1271,
"mean_token_accuracy": 0.9561912536621093,
"step": 6425
},
{
"epoch": 2.474119684433327,
"grad_norm": 0.10511961603577678,
"learning_rate": 2.215492259657262e-05,
"loss": 0.128,
"mean_token_accuracy": 0.9561578452587127,
"step": 6430
},
{
"epoch": 2.4760438714643063,
"grad_norm": 0.10594521909254916,
"learning_rate": 2.2118220188155077e-05,
"loss": 0.1286,
"mean_token_accuracy": 0.9555766880512238,
"step": 6435
},
{
"epoch": 2.4779680584952857,
"grad_norm": 0.11163384256017442,
"learning_rate": 2.2081532962702177e-05,
"loss": 0.1283,
"mean_token_accuracy": 0.9559310555458069,
"step": 6440
},
{
"epoch": 2.479892245526265,
"grad_norm": 0.1045737292888624,
"learning_rate": 2.2044861023715174e-05,
"loss": 0.1255,
"mean_token_accuracy": 0.9565778374671936,
"step": 6445
},
{
"epoch": 2.4818164325572445,
"grad_norm": 0.1060088421688071,
"learning_rate": 2.2008204474652162e-05,
"loss": 0.1252,
"mean_token_accuracy": 0.9568614602088928,
"step": 6450
},
{
"epoch": 2.483740619588224,
"grad_norm": 0.10825864694038108,
"learning_rate": 2.1971563418927822e-05,
"loss": 0.1295,
"mean_token_accuracy": 0.9552516937255859,
"step": 6455
},
{
"epoch": 2.4856648066192033,
"grad_norm": 0.1070769288062764,
"learning_rate": 2.1934937959913142e-05,
"loss": 0.1293,
"mean_token_accuracy": 0.9554482281208039,
"step": 6460
},
{
"epoch": 2.4875889936501827,
"grad_norm": 0.10621601500927427,
"learning_rate": 2.1898328200935097e-05,
"loss": 0.13,
"mean_token_accuracy": 0.9551669120788574,
"step": 6465
},
{
"epoch": 2.489513180681162,
"grad_norm": 0.10455563675305558,
"learning_rate": 2.186173424527639e-05,
"loss": 0.1284,
"mean_token_accuracy": 0.9556362867355347,
"step": 6470
},
{
"epoch": 2.4914373677121415,
"grad_norm": 0.10818889779174191,
"learning_rate": 2.1825156196175106e-05,
"loss": 0.1291,
"mean_token_accuracy": 0.9556038022041321,
"step": 6475
},
{
"epoch": 2.493361554743121,
"grad_norm": 0.1054164214678568,
"learning_rate": 2.178859415682447e-05,
"loss": 0.1307,
"mean_token_accuracy": 0.9552402257919311,
"step": 6480
},
{
"epoch": 2.4952857417741003,
"grad_norm": 0.10836527087770427,
"learning_rate": 2.175204823037255e-05,
"loss": 0.1285,
"mean_token_accuracy": 0.9558619618415832,
"step": 6485
},
{
"epoch": 2.4972099288050797,
"grad_norm": 0.10985437809136969,
"learning_rate": 2.1715518519921957e-05,
"loss": 0.1297,
"mean_token_accuracy": 0.9555224239826202,
"step": 6490
},
{
"epoch": 2.499134115836059,
"grad_norm": 0.10726465444554566,
"learning_rate": 2.1679005128529546e-05,
"loss": 0.1302,
"mean_token_accuracy": 0.9549250960350036,
"step": 6495
},
{
"epoch": 2.5010583028670386,
"grad_norm": 0.10841250150364372,
"learning_rate": 2.164250815920611e-05,
"loss": 0.1275,
"mean_token_accuracy": 0.9560544431209564,
"step": 6500
},
{
"epoch": 2.502982489898018,
"grad_norm": 0.10814091803405554,
"learning_rate": 2.1606027714916157e-05,
"loss": 0.1284,
"mean_token_accuracy": 0.9557877242565155,
"step": 6505
},
{
"epoch": 2.5049066769289974,
"grad_norm": 0.10796065674626645,
"learning_rate": 2.1569563898577545e-05,
"loss": 0.1283,
"mean_token_accuracy": 0.9558773934841156,
"step": 6510
},
{
"epoch": 2.506830863959977,
"grad_norm": 0.10792165801758698,
"learning_rate": 2.1533116813061237e-05,
"loss": 0.1286,
"mean_token_accuracy": 0.955658346414566,
"step": 6515
},
{
"epoch": 2.508755050990956,
"grad_norm": 0.10946774449000102,
"learning_rate": 2.149668656119099e-05,
"loss": 0.1309,
"mean_token_accuracy": 0.9549214005470276,
"step": 6520
},
{
"epoch": 2.5106792380219356,
"grad_norm": 0.10364437855149361,
"learning_rate": 2.1460273245743047e-05,
"loss": 0.1295,
"mean_token_accuracy": 0.9554930865764618,
"step": 6525
},
{
"epoch": 2.512603425052915,
"grad_norm": 0.1026199533452469,
"learning_rate": 2.142387696944591e-05,
"loss": 0.128,
"mean_token_accuracy": 0.9559862554073334,
"step": 6530
},
{
"epoch": 2.5145276120838944,
"grad_norm": 0.1069750821084188,
"learning_rate": 2.1387497834979997e-05,
"loss": 0.1301,
"mean_token_accuracy": 0.9553844213485718,
"step": 6535
},
{
"epoch": 2.516451799114874,
"grad_norm": 0.10262110699029317,
"learning_rate": 2.1351135944977358e-05,
"loss": 0.1277,
"mean_token_accuracy": 0.956086528301239,
"step": 6540
},
{
"epoch": 2.5183759861458532,
"grad_norm": 0.10776952420084977,
"learning_rate": 2.131479140202138e-05,
"loss": 0.1277,
"mean_token_accuracy": 0.9561761379241943,
"step": 6545
},
{
"epoch": 2.5203001731768326,
"grad_norm": 0.10971225695518153,
"learning_rate": 2.1278464308646552e-05,
"loss": 0.1302,
"mean_token_accuracy": 0.9550739526748657,
"step": 6550
},
{
"epoch": 2.522224360207812,
"grad_norm": 0.11199782190688244,
"learning_rate": 2.1242154767338096e-05,
"loss": 0.1286,
"mean_token_accuracy": 0.9557594180107116,
"step": 6555
},
{
"epoch": 2.5241485472387915,
"grad_norm": 0.10862608502738286,
"learning_rate": 2.120586288053173e-05,
"loss": 0.1265,
"mean_token_accuracy": 0.9562830090522766,
"step": 6560
},
{
"epoch": 2.526072734269771,
"grad_norm": 0.10796891648445665,
"learning_rate": 2.1169588750613373e-05,
"loss": 0.1283,
"mean_token_accuracy": 0.9559341192245483,
"step": 6565
},
{
"epoch": 2.5279969213007503,
"grad_norm": 0.10522758943486674,
"learning_rate": 2.1133332479918833e-05,
"loss": 0.1269,
"mean_token_accuracy": 0.9564804494380951,
"step": 6570
},
{
"epoch": 2.52992110833173,
"grad_norm": 0.10780286762109235,
"learning_rate": 2.109709417073355e-05,
"loss": 0.1278,
"mean_token_accuracy": 0.9557962656021118,
"step": 6575
},
{
"epoch": 2.531845295362709,
"grad_norm": 0.10552125099465876,
"learning_rate": 2.1060873925292286e-05,
"loss": 0.1277,
"mean_token_accuracy": 0.9557866632938385,
"step": 6580
},
{
"epoch": 2.533769482393689,
"grad_norm": 0.10706134126626986,
"learning_rate": 2.1024671845778826e-05,
"loss": 0.1273,
"mean_token_accuracy": 0.9560283482074737,
"step": 6585
},
{
"epoch": 2.535693669424668,
"grad_norm": 0.10633516501224893,
"learning_rate": 2.098848803432573e-05,
"loss": 0.1285,
"mean_token_accuracy": 0.9558331072330475,
"step": 6590
},
{
"epoch": 2.5376178564556477,
"grad_norm": 0.1113097029767163,
"learning_rate": 2.0952322593014017e-05,
"loss": 0.1267,
"mean_token_accuracy": 0.9564849495887756,
"step": 6595
},
{
"epoch": 2.5395420434866267,
"grad_norm": 0.10324399717862949,
"learning_rate": 2.0916175623872867e-05,
"loss": 0.1278,
"mean_token_accuracy": 0.9562363088130951,
"step": 6600
},
{
"epoch": 2.5414662305176066,
"grad_norm": 0.11009476615467657,
"learning_rate": 2.088004722887934e-05,
"loss": 0.1295,
"mean_token_accuracy": 0.9554395854473114,
"step": 6605
},
{
"epoch": 2.5433904175485855,
"grad_norm": 0.10771448514039726,
"learning_rate": 2.0843937509958134e-05,
"loss": 0.1306,
"mean_token_accuracy": 0.955112874507904,
"step": 6610
},
{
"epoch": 2.5453146045795654,
"grad_norm": 0.10636329319110463,
"learning_rate": 2.0807846568981203e-05,
"loss": 0.1275,
"mean_token_accuracy": 0.9561453282833099,
"step": 6615
},
{
"epoch": 2.5472387916105443,
"grad_norm": 0.10508686260356949,
"learning_rate": 2.0771774507767587e-05,
"loss": 0.1297,
"mean_token_accuracy": 0.9554025113582612,
"step": 6620
},
{
"epoch": 2.549162978641524,
"grad_norm": 0.10820991587473601,
"learning_rate": 2.0735721428083017e-05,
"loss": 0.1271,
"mean_token_accuracy": 0.9562888860702514,
"step": 6625
},
{
"epoch": 2.551087165672503,
"grad_norm": 0.10943406239997874,
"learning_rate": 2.069968743163967e-05,
"loss": 0.1292,
"mean_token_accuracy": 0.955457329750061,
"step": 6630
},
{
"epoch": 2.553011352703483,
"grad_norm": 0.10546854728372138,
"learning_rate": 2.066367262009592e-05,
"loss": 0.1273,
"mean_token_accuracy": 0.9560437321662902,
"step": 6635
},
{
"epoch": 2.554935539734462,
"grad_norm": 0.10297408910326183,
"learning_rate": 2.0627677095056015e-05,
"loss": 0.13,
"mean_token_accuracy": 0.9552169620990754,
"step": 6640
},
{
"epoch": 2.556859726765442,
"grad_norm": 0.10714476408363253,
"learning_rate": 2.059170095806975e-05,
"loss": 0.1289,
"mean_token_accuracy": 0.9556257486343384,
"step": 6645
},
{
"epoch": 2.558783913796421,
"grad_norm": 0.10349217261873211,
"learning_rate": 2.0555744310632258e-05,
"loss": 0.1279,
"mean_token_accuracy": 0.9557923078536987,
"step": 6650
},
{
"epoch": 2.5607081008274006,
"grad_norm": 0.11009263883011676,
"learning_rate": 2.0519807254183687e-05,
"loss": 0.1294,
"mean_token_accuracy": 0.955490505695343,
"step": 6655
},
{
"epoch": 2.5626322878583796,
"grad_norm": 0.10526751847837303,
"learning_rate": 2.0483889890108898e-05,
"loss": 0.1296,
"mean_token_accuracy": 0.9555670380592346,
"step": 6660
},
{
"epoch": 2.5645564748893594,
"grad_norm": 0.10815576645410302,
"learning_rate": 2.044799231973723e-05,
"loss": 0.1283,
"mean_token_accuracy": 0.9559342682361602,
"step": 6665
},
{
"epoch": 2.5664806619203384,
"grad_norm": 0.10041254582476256,
"learning_rate": 2.041211464434214e-05,
"loss": 0.1285,
"mean_token_accuracy": 0.955743670463562,
"step": 6670
},
{
"epoch": 2.5684048489513183,
"grad_norm": 0.11004558434797768,
"learning_rate": 2.037625696514097e-05,
"loss": 0.1274,
"mean_token_accuracy": 0.9560822963714599,
"step": 6675
},
{
"epoch": 2.5703290359822972,
"grad_norm": 0.10609185866996279,
"learning_rate": 2.034041938329466e-05,
"loss": 0.1285,
"mean_token_accuracy": 0.9558017492294312,
"step": 6680
},
{
"epoch": 2.572253223013277,
"grad_norm": 0.1053426050483543,
"learning_rate": 2.0304601999907468e-05,
"loss": 0.1253,
"mean_token_accuracy": 0.9568973779678345,
"step": 6685
},
{
"epoch": 2.574177410044256,
"grad_norm": 0.1106416909992716,
"learning_rate": 2.026880491602662e-05,
"loss": 0.1268,
"mean_token_accuracy": 0.9561525583267212,
"step": 6690
},
{
"epoch": 2.576101597075236,
"grad_norm": 0.1066555488483838,
"learning_rate": 2.0233028232642103e-05,
"loss": 0.1274,
"mean_token_accuracy": 0.955971074104309,
"step": 6695
},
{
"epoch": 2.578025784106215,
"grad_norm": 0.10551400270459926,
"learning_rate": 2.019727205068636e-05,
"loss": 0.1269,
"mean_token_accuracy": 0.9562014818191529,
"step": 6700
},
{
"epoch": 2.5799499711371947,
"grad_norm": 0.10119569323207261,
"learning_rate": 2.016153647103398e-05,
"loss": 0.1257,
"mean_token_accuracy": 0.9566797375679016,
"step": 6705
},
{
"epoch": 2.5818741581681737,
"grad_norm": 0.10273703318326284,
"learning_rate": 2.0125821594501425e-05,
"loss": 0.1274,
"mean_token_accuracy": 0.9561544001102448,
"step": 6710
},
{
"epoch": 2.5837983451991535,
"grad_norm": 0.11088570545697085,
"learning_rate": 2.0090127521846763e-05,
"loss": 0.1283,
"mean_token_accuracy": 0.9558983266353607,
"step": 6715
},
{
"epoch": 2.585722532230133,
"grad_norm": 0.10191231115830651,
"learning_rate": 2.0054454353769365e-05,
"loss": 0.1249,
"mean_token_accuracy": 0.9568327248096467,
"step": 6720
},
{
"epoch": 2.5876467192611123,
"grad_norm": 0.10667919841751378,
"learning_rate": 2.001880219090963e-05,
"loss": 0.1283,
"mean_token_accuracy": 0.9559061825275421,
"step": 6725
},
{
"epoch": 2.5895709062920917,
"grad_norm": 0.11740208475938707,
"learning_rate": 1.9983171133848695e-05,
"loss": 0.1271,
"mean_token_accuracy": 0.9562270224094391,
"step": 6730
},
{
"epoch": 2.591495093323071,
"grad_norm": 0.10727495132744076,
"learning_rate": 1.994756128310814e-05,
"loss": 0.1253,
"mean_token_accuracy": 0.9566376507282257,
"step": 6735
},
{
"epoch": 2.5934192803540506,
"grad_norm": 0.1042013004686997,
"learning_rate": 1.9911972739149744e-05,
"loss": 0.1268,
"mean_token_accuracy": 0.9562085866928101,
"step": 6740
},
{
"epoch": 2.59534346738503,
"grad_norm": 0.10940813111634506,
"learning_rate": 1.9876405602375163e-05,
"loss": 0.1284,
"mean_token_accuracy": 0.9557159006595611,
"step": 6745
},
{
"epoch": 2.5972676544160094,
"grad_norm": 0.10570441947989995,
"learning_rate": 1.984085997312566e-05,
"loss": 0.1278,
"mean_token_accuracy": 0.9561220288276673,
"step": 6750
},
{
"epoch": 2.599191841446989,
"grad_norm": 0.10513289877971002,
"learning_rate": 1.980533595168181e-05,
"loss": 0.1267,
"mean_token_accuracy": 0.9563930928707123,
"step": 6755
},
{
"epoch": 2.601116028477968,
"grad_norm": 0.1053488036160959,
"learning_rate": 1.9769833638263248e-05,
"loss": 0.127,
"mean_token_accuracy": 0.9562098443508148,
"step": 6760
},
{
"epoch": 2.6030402155089476,
"grad_norm": 0.10389067066050878,
"learning_rate": 1.973435313302835e-05,
"loss": 0.1273,
"mean_token_accuracy": 0.9562519192695618,
"step": 6765
},
{
"epoch": 2.604964402539927,
"grad_norm": 0.10461893885646582,
"learning_rate": 1.9698894536073992e-05,
"loss": 0.1259,
"mean_token_accuracy": 0.9565521538257599,
"step": 6770
},
{
"epoch": 2.6068885895709064,
"grad_norm": 0.10336641235090573,
"learning_rate": 1.966345794743521e-05,
"loss": 0.1278,
"mean_token_accuracy": 0.9559028029441834,
"step": 6775
},
{
"epoch": 2.608812776601886,
"grad_norm": 0.1101183754720948,
"learning_rate": 1.9628043467084972e-05,
"loss": 0.1275,
"mean_token_accuracy": 0.9560283601284028,
"step": 6780
},
{
"epoch": 2.6107369636328652,
"grad_norm": 0.1112827673424888,
"learning_rate": 1.9592651194933864e-05,
"loss": 0.1263,
"mean_token_accuracy": 0.9564993560314179,
"step": 6785
},
{
"epoch": 2.6126611506638446,
"grad_norm": 0.11076839280052687,
"learning_rate": 1.9557281230829842e-05,
"loss": 0.1283,
"mean_token_accuracy": 0.9558199286460877,
"step": 6790
},
{
"epoch": 2.614585337694824,
"grad_norm": 0.10513416421018357,
"learning_rate": 1.952193367455789e-05,
"loss": 0.1267,
"mean_token_accuracy": 0.9565446972846985,
"step": 6795
},
{
"epoch": 2.6165095247258034,
"grad_norm": 0.10570491472709956,
"learning_rate": 1.9486608625839796e-05,
"loss": 0.1255,
"mean_token_accuracy": 0.9567613065242767,
"step": 6800
},
{
"epoch": 2.618433711756783,
"grad_norm": 0.10932952586639395,
"learning_rate": 1.9451306184333866e-05,
"loss": 0.1268,
"mean_token_accuracy": 0.9563915014266968,
"step": 6805
},
{
"epoch": 2.6203578987877623,
"grad_norm": 0.10423882458759645,
"learning_rate": 1.941602644963459e-05,
"loss": 0.1241,
"mean_token_accuracy": 0.9574026703834534,
"step": 6810
},
{
"epoch": 2.6222820858187417,
"grad_norm": 0.11217501853304582,
"learning_rate": 1.938076952127243e-05,
"loss": 0.1257,
"mean_token_accuracy": 0.9566045343875885,
"step": 6815
},
{
"epoch": 2.624206272849721,
"grad_norm": 0.10654173637076583,
"learning_rate": 1.934553549871349e-05,
"loss": 0.1264,
"mean_token_accuracy": 0.9563957929611206,
"step": 6820
},
{
"epoch": 2.6261304598807005,
"grad_norm": 0.10847404127449174,
"learning_rate": 1.931032448135925e-05,
"loss": 0.1256,
"mean_token_accuracy": 0.9567358374595643,
"step": 6825
},
{
"epoch": 2.62805464691168,
"grad_norm": 0.10767105156625693,
"learning_rate": 1.9275136568546308e-05,
"loss": 0.1269,
"mean_token_accuracy": 0.9563683986663818,
"step": 6830
},
{
"epoch": 2.6299788339426593,
"grad_norm": 0.11103186074935593,
"learning_rate": 1.923997185954607e-05,
"loss": 0.1305,
"mean_token_accuracy": 0.9552824318408966,
"step": 6835
},
{
"epoch": 2.6319030209736387,
"grad_norm": 0.11115348877053942,
"learning_rate": 1.920483045356446e-05,
"loss": 0.1256,
"mean_token_accuracy": 0.9568055391311645,
"step": 6840
},
{
"epoch": 2.633827208004618,
"grad_norm": 0.11276780856069071,
"learning_rate": 1.9169712449741688e-05,
"loss": 0.1279,
"mean_token_accuracy": 0.9558255076408386,
"step": 6845
},
{
"epoch": 2.6357513950355975,
"grad_norm": 0.11060901419193486,
"learning_rate": 1.9134617947151938e-05,
"loss": 0.1276,
"mean_token_accuracy": 0.9561234533786773,
"step": 6850
},
{
"epoch": 2.637675582066577,
"grad_norm": 0.10861450011475303,
"learning_rate": 1.9099547044803084e-05,
"loss": 0.1275,
"mean_token_accuracy": 0.9562245905399323,
"step": 6855
},
{
"epoch": 2.6395997690975563,
"grad_norm": 0.10678156519802066,
"learning_rate": 1.9064499841636413e-05,
"loss": 0.1272,
"mean_token_accuracy": 0.9562100887298584,
"step": 6860
},
{
"epoch": 2.6415239561285357,
"grad_norm": 0.10875833039075991,
"learning_rate": 1.9029476436526374e-05,
"loss": 0.1261,
"mean_token_accuracy": 0.956655991077423,
"step": 6865
},
{
"epoch": 2.643448143159515,
"grad_norm": 0.11511155356237994,
"learning_rate": 1.8994476928280246e-05,
"loss": 0.1272,
"mean_token_accuracy": 0.9563781440258026,
"step": 6870
},
{
"epoch": 2.6453723301904946,
"grad_norm": 0.10591073491980244,
"learning_rate": 1.8959501415637935e-05,
"loss": 0.1267,
"mean_token_accuracy": 0.9563282132148743,
"step": 6875
},
{
"epoch": 2.647296517221474,
"grad_norm": 0.10860092720760628,
"learning_rate": 1.8924549997271614e-05,
"loss": 0.1289,
"mean_token_accuracy": 0.9557060241699219,
"step": 6880
},
{
"epoch": 2.6492207042524534,
"grad_norm": 0.10503352307327395,
"learning_rate": 1.888962277178548e-05,
"loss": 0.1271,
"mean_token_accuracy": 0.9563852548599243,
"step": 6885
},
{
"epoch": 2.651144891283433,
"grad_norm": 0.11013776759663174,
"learning_rate": 1.8854719837715513e-05,
"loss": 0.1266,
"mean_token_accuracy": 0.9563451290130616,
"step": 6890
},
{
"epoch": 2.653069078314412,
"grad_norm": 0.10670392442749238,
"learning_rate": 1.8819841293529135e-05,
"loss": 0.1293,
"mean_token_accuracy": 0.9553812265396118,
"step": 6895
},
{
"epoch": 2.6549932653453916,
"grad_norm": 0.10979631553488628,
"learning_rate": 1.8784987237624958e-05,
"loss": 0.13,
"mean_token_accuracy": 0.9554021894931793,
"step": 6900
},
{
"epoch": 2.656917452376371,
"grad_norm": 0.10650971195174591,
"learning_rate": 1.8750157768332515e-05,
"loss": 0.1255,
"mean_token_accuracy": 0.9567661046981811,
"step": 6905
},
{
"epoch": 2.6588416394073504,
"grad_norm": 0.10569859297963535,
"learning_rate": 1.8715352983911987e-05,
"loss": 0.1279,
"mean_token_accuracy": 0.9560502111911774,
"step": 6910
},
{
"epoch": 2.66076582643833,
"grad_norm": 0.10535601083717823,
"learning_rate": 1.868057298255389e-05,
"loss": 0.1269,
"mean_token_accuracy": 0.9561728775501251,
"step": 6915
},
{
"epoch": 2.6626900134693092,
"grad_norm": 0.10643100279970104,
"learning_rate": 1.8645817862378857e-05,
"loss": 0.1264,
"mean_token_accuracy": 0.9563452661037445,
"step": 6920
},
{
"epoch": 2.6646142005002886,
"grad_norm": 0.10962944725677765,
"learning_rate": 1.8611087721437287e-05,
"loss": 0.1282,
"mean_token_accuracy": 0.9559251666069031,
"step": 6925
},
{
"epoch": 2.666538387531268,
"grad_norm": 0.10546649694289567,
"learning_rate": 1.8576382657709128e-05,
"loss": 0.1248,
"mean_token_accuracy": 0.9570260524749756,
"step": 6930
},
{
"epoch": 2.6684625745622474,
"grad_norm": 0.10294778802052634,
"learning_rate": 1.8541702769103586e-05,
"loss": 0.1284,
"mean_token_accuracy": 0.9561637580394745,
"step": 6935
},
{
"epoch": 2.670386761593227,
"grad_norm": 0.10709037535879702,
"learning_rate": 1.8507048153458852e-05,
"loss": 0.1284,
"mean_token_accuracy": 0.9558945298194885,
"step": 6940
},
{
"epoch": 2.6723109486242063,
"grad_norm": 0.11244918165012138,
"learning_rate": 1.8472418908541778e-05,
"loss": 0.125,
"mean_token_accuracy": 0.9568565487861633,
"step": 6945
},
{
"epoch": 2.6742351356551857,
"grad_norm": 0.10630468922918597,
"learning_rate": 1.843781513204767e-05,
"loss": 0.127,
"mean_token_accuracy": 0.956307715177536,
"step": 6950
},
{
"epoch": 2.676159322686165,
"grad_norm": 0.10419951897148877,
"learning_rate": 1.8403236921599987e-05,
"loss": 0.1293,
"mean_token_accuracy": 0.9554457426071167,
"step": 6955
},
{
"epoch": 2.6780835097171445,
"grad_norm": 0.11218125498578436,
"learning_rate": 1.836868437475006e-05,
"loss": 0.1277,
"mean_token_accuracy": 0.9557630300521851,
"step": 6960
},
{
"epoch": 2.680007696748124,
"grad_norm": 0.10948606028667217,
"learning_rate": 1.8334157588976784e-05,
"loss": 0.1266,
"mean_token_accuracy": 0.9562645971775054,
"step": 6965
},
{
"epoch": 2.6819318837791033,
"grad_norm": 0.10772262444167868,
"learning_rate": 1.8299656661686438e-05,
"loss": 0.1248,
"mean_token_accuracy": 0.9570554375648499,
"step": 6970
},
{
"epoch": 2.6838560708100827,
"grad_norm": 0.10960968242808095,
"learning_rate": 1.8265181690212292e-05,
"loss": 0.1267,
"mean_token_accuracy": 0.9564591765403747,
"step": 6975
},
{
"epoch": 2.685780257841062,
"grad_norm": 0.1030152420845438,
"learning_rate": 1.8230732771814425e-05,
"loss": 0.1293,
"mean_token_accuracy": 0.9554629743099212,
"step": 6980
},
{
"epoch": 2.6877044448720415,
"grad_norm": 0.1147631862622607,
"learning_rate": 1.819631000367941e-05,
"loss": 0.1289,
"mean_token_accuracy": 0.9556432604789734,
"step": 6985
},
{
"epoch": 2.689628631903021,
"grad_norm": 0.10355133931423409,
"learning_rate": 1.8161913482920028e-05,
"loss": 0.1248,
"mean_token_accuracy": 0.9568826794624329,
"step": 6990
},
{
"epoch": 2.6915528189340003,
"grad_norm": 0.10616830597210221,
"learning_rate": 1.8127543306575034e-05,
"loss": 0.1282,
"mean_token_accuracy": 0.9558695256710052,
"step": 6995
},
{
"epoch": 2.6934770059649797,
"grad_norm": 0.11136236222785284,
"learning_rate": 1.8093199571608836e-05,
"loss": 0.1261,
"mean_token_accuracy": 0.9567158997058869,
"step": 7000
},
{
"epoch": 2.695401192995959,
"grad_norm": 0.11542457168063223,
"learning_rate": 1.805888237491129e-05,
"loss": 0.1255,
"mean_token_accuracy": 0.9568669140338898,
"step": 7005
},
{
"epoch": 2.6973253800269386,
"grad_norm": 0.10964388423684117,
"learning_rate": 1.802459181329732e-05,
"loss": 0.1232,
"mean_token_accuracy": 0.9577533662319183,
"step": 7010
},
{
"epoch": 2.699249567057918,
"grad_norm": 0.11697708785883097,
"learning_rate": 1.799032798350676e-05,
"loss": 0.1252,
"mean_token_accuracy": 0.9567424178123474,
"step": 7015
},
{
"epoch": 2.7011737540888974,
"grad_norm": 0.10669513491757858,
"learning_rate": 1.7956090982204015e-05,
"loss": 0.1266,
"mean_token_accuracy": 0.95643470287323,
"step": 7020
},
{
"epoch": 2.703097941119877,
"grad_norm": 0.10777973722640331,
"learning_rate": 1.7921880905977788e-05,
"loss": 0.1271,
"mean_token_accuracy": 0.9562375128269196,
"step": 7025
},
{
"epoch": 2.705022128150856,
"grad_norm": 0.10701533180688344,
"learning_rate": 1.7887697851340825e-05,
"loss": 0.1285,
"mean_token_accuracy": 0.9558044135570526,
"step": 7030
},
{
"epoch": 2.7069463151818356,
"grad_norm": 0.10331099074883404,
"learning_rate": 1.785354191472965e-05,
"loss": 0.1272,
"mean_token_accuracy": 0.956113600730896,
"step": 7035
},
{
"epoch": 2.708870502212815,
"grad_norm": 0.11193788287349637,
"learning_rate": 1.7819413192504276e-05,
"loss": 0.1255,
"mean_token_accuracy": 0.9568765163421631,
"step": 7040
},
{
"epoch": 2.7107946892437944,
"grad_norm": 0.10723666773916213,
"learning_rate": 1.778531178094795e-05,
"loss": 0.1255,
"mean_token_accuracy": 0.9566305696964263,
"step": 7045
},
{
"epoch": 2.712718876274774,
"grad_norm": 0.10400150310051162,
"learning_rate": 1.775123777626685e-05,
"loss": 0.1271,
"mean_token_accuracy": 0.9561739981174469,
"step": 7050
},
{
"epoch": 2.7146430633057532,
"grad_norm": 0.10733826918036667,
"learning_rate": 1.771719127458985e-05,
"loss": 0.1271,
"mean_token_accuracy": 0.9562620878219604,
"step": 7055
},
{
"epoch": 2.7165672503367326,
"grad_norm": 0.12644344138008767,
"learning_rate": 1.7683172371968227e-05,
"loss": 0.1279,
"mean_token_accuracy": 0.9559725463390351,
"step": 7060
},
{
"epoch": 2.718491437367712,
"grad_norm": 0.10536033765819473,
"learning_rate": 1.7649181164375395e-05,
"loss": 0.1272,
"mean_token_accuracy": 0.9562171399593353,
"step": 7065
},
{
"epoch": 2.7204156243986914,
"grad_norm": 0.10884864080619998,
"learning_rate": 1.761521774770665e-05,
"loss": 0.1294,
"mean_token_accuracy": 0.955652940273285,
"step": 7070
},
{
"epoch": 2.722339811429671,
"grad_norm": 0.10723787813049393,
"learning_rate": 1.758128221777885e-05,
"loss": 0.1241,
"mean_token_accuracy": 0.9573346912860871,
"step": 7075
},
{
"epoch": 2.7242639984606503,
"grad_norm": 0.10767449085985573,
"learning_rate": 1.754737467033023e-05,
"loss": 0.1292,
"mean_token_accuracy": 0.9555230498313904,
"step": 7080
},
{
"epoch": 2.7261881854916297,
"grad_norm": 0.10895307917278084,
"learning_rate": 1.751349520102003e-05,
"loss": 0.1249,
"mean_token_accuracy": 0.9571972370147706,
"step": 7085
},
{
"epoch": 2.728112372522609,
"grad_norm": 0.10650045123221602,
"learning_rate": 1.7479643905428316e-05,
"loss": 0.1253,
"mean_token_accuracy": 0.9568059027194977,
"step": 7090
},
{
"epoch": 2.7300365595535885,
"grad_norm": 0.10928759118436902,
"learning_rate": 1.7445820879055647e-05,
"loss": 0.128,
"mean_token_accuracy": 0.955977874994278,
"step": 7095
},
{
"epoch": 2.731960746584568,
"grad_norm": 0.1073383269536675,
"learning_rate": 1.7412026217322836e-05,
"loss": 0.1259,
"mean_token_accuracy": 0.9569141745567322,
"step": 7100
},
{
"epoch": 2.7338849336155473,
"grad_norm": 0.10966952979678361,
"learning_rate": 1.737826001557068e-05,
"loss": 0.1271,
"mean_token_accuracy": 0.9563021123409271,
"step": 7105
},
{
"epoch": 2.7358091206465267,
"grad_norm": 0.10526562498309563,
"learning_rate": 1.7344522369059692e-05,
"loss": 0.1265,
"mean_token_accuracy": 0.9564592182636261,
"step": 7110
},
{
"epoch": 2.737733307677506,
"grad_norm": 0.11031935892748677,
"learning_rate": 1.73108133729698e-05,
"loss": 0.1262,
"mean_token_accuracy": 0.956552791595459,
"step": 7115
},
{
"epoch": 2.7396574947084855,
"grad_norm": 0.10418193840163821,
"learning_rate": 1.7277133122400114e-05,
"loss": 0.1277,
"mean_token_accuracy": 0.9562981486320495,
"step": 7120
},
{
"epoch": 2.741581681739465,
"grad_norm": 0.11099596444843954,
"learning_rate": 1.724348171236866e-05,
"loss": 0.1268,
"mean_token_accuracy": 0.9564831733703614,
"step": 7125
},
{
"epoch": 2.7435058687704443,
"grad_norm": 0.10204613720474413,
"learning_rate": 1.7209859237812097e-05,
"loss": 0.1267,
"mean_token_accuracy": 0.9565585315227508,
"step": 7130
},
{
"epoch": 2.7454300558014237,
"grad_norm": 0.10686056931951905,
"learning_rate": 1.717626579358545e-05,
"loss": 0.1277,
"mean_token_accuracy": 0.9559927165508271,
"step": 7135
},
{
"epoch": 2.747354242832403,
"grad_norm": 0.10665318681343526,
"learning_rate": 1.7142701474461826e-05,
"loss": 0.1269,
"mean_token_accuracy": 0.9562374651432037,
"step": 7140
},
{
"epoch": 2.7492784298633826,
"grad_norm": 0.10557591127652506,
"learning_rate": 1.7109166375132196e-05,
"loss": 0.1272,
"mean_token_accuracy": 0.956173038482666,
"step": 7145
},
{
"epoch": 2.7512026168943624,
"grad_norm": 0.11052732032634138,
"learning_rate": 1.7075660590205067e-05,
"loss": 0.1274,
"mean_token_accuracy": 0.9560621798038482,
"step": 7150
},
{
"epoch": 2.7531268039253414,
"grad_norm": 0.1087942108070962,
"learning_rate": 1.704218421420627e-05,
"loss": 0.1272,
"mean_token_accuracy": 0.9559900820255279,
"step": 7155
},
{
"epoch": 2.755050990956321,
"grad_norm": 0.10447864519398838,
"learning_rate": 1.7008737341578646e-05,
"loss": 0.1263,
"mean_token_accuracy": 0.9565044999122619,
"step": 7160
},
{
"epoch": 2.7569751779873,
"grad_norm": 0.10552092323762612,
"learning_rate": 1.697532006668182e-05,
"loss": 0.1275,
"mean_token_accuracy": 0.9559858977794647,
"step": 7165
},
{
"epoch": 2.75889936501828,
"grad_norm": 0.11015301455853907,
"learning_rate": 1.6941932483791913e-05,
"loss": 0.1274,
"mean_token_accuracy": 0.9562235772609711,
"step": 7170
},
{
"epoch": 2.760823552049259,
"grad_norm": 0.10941521089776884,
"learning_rate": 1.6908574687101273e-05,
"loss": 0.1227,
"mean_token_accuracy": 0.9576989531517028,
"step": 7175
},
{
"epoch": 2.762747739080239,
"grad_norm": 0.10782441194041115,
"learning_rate": 1.6875246770718202e-05,
"loss": 0.1256,
"mean_token_accuracy": 0.9567393243312836,
"step": 7180
},
{
"epoch": 2.764671926111218,
"grad_norm": 0.10542593214023584,
"learning_rate": 1.6841948828666742e-05,
"loss": 0.1233,
"mean_token_accuracy": 0.9576320827007294,
"step": 7185
},
{
"epoch": 2.7665961131421977,
"grad_norm": 0.10659113484500546,
"learning_rate": 1.680868095488634e-05,
"loss": 0.1258,
"mean_token_accuracy": 0.9567969083786011,
"step": 7190
},
{
"epoch": 2.7685203001731766,
"grad_norm": 0.11071007338136814,
"learning_rate": 1.6775443243231636e-05,
"loss": 0.1274,
"mean_token_accuracy": 0.9560856699943543,
"step": 7195
},
{
"epoch": 2.7704444872041565,
"grad_norm": 0.10800226795369741,
"learning_rate": 1.6742235787472148e-05,
"loss": 0.1259,
"mean_token_accuracy": 0.9565568685531616,
"step": 7200
},
{
"epoch": 2.7723686742351354,
"grad_norm": 0.10574854506306638,
"learning_rate": 1.670905868129208e-05,
"loss": 0.1247,
"mean_token_accuracy": 0.9571392416954041,
"step": 7205
},
{
"epoch": 2.7742928612661153,
"grad_norm": 0.10691098685310134,
"learning_rate": 1.667591201828997e-05,
"loss": 0.1268,
"mean_token_accuracy": 0.95638587474823,
"step": 7210
},
{
"epoch": 2.7762170482970943,
"grad_norm": 0.10540967489714154,
"learning_rate": 1.6642795891978496e-05,
"loss": 0.126,
"mean_token_accuracy": 0.9566452860832214,
"step": 7215
},
{
"epoch": 2.778141235328074,
"grad_norm": 0.10674759101707822,
"learning_rate": 1.6609710395784193e-05,
"loss": 0.1257,
"mean_token_accuracy": 0.9566307127475738,
"step": 7220
},
{
"epoch": 2.780065422359053,
"grad_norm": 0.11002060431089043,
"learning_rate": 1.657665562304715e-05,
"loss": 0.123,
"mean_token_accuracy": 0.9575728833675384,
"step": 7225
},
{
"epoch": 2.781989609390033,
"grad_norm": 0.10576961272518313,
"learning_rate": 1.654363166702082e-05,
"loss": 0.1246,
"mean_token_accuracy": 0.9570701956748963,
"step": 7230
},
{
"epoch": 2.783913796421012,
"grad_norm": 0.11262475939757777,
"learning_rate": 1.6510638620871682e-05,
"loss": 0.1275,
"mean_token_accuracy": 0.9560605108737945,
"step": 7235
},
{
"epoch": 2.7858379834519917,
"grad_norm": 0.11127630384550656,
"learning_rate": 1.6477676577679042e-05,
"loss": 0.1262,
"mean_token_accuracy": 0.956292325258255,
"step": 7240
},
{
"epoch": 2.7877621704829707,
"grad_norm": 0.10658040995481745,
"learning_rate": 1.6444745630434705e-05,
"loss": 0.1254,
"mean_token_accuracy": 0.9569096446037293,
"step": 7245
},
{
"epoch": 2.7896863575139506,
"grad_norm": 0.11552542983328662,
"learning_rate": 1.6411845872042792e-05,
"loss": 0.1254,
"mean_token_accuracy": 0.9569571316242218,
"step": 7250
},
{
"epoch": 2.7916105445449295,
"grad_norm": 0.10708675669840825,
"learning_rate": 1.63789773953194e-05,
"loss": 0.1247,
"mean_token_accuracy": 0.957468980550766,
"step": 7255
},
{
"epoch": 2.7935347315759094,
"grad_norm": 0.10449931858438566,
"learning_rate": 1.6346140292992405e-05,
"loss": 0.1272,
"mean_token_accuracy": 0.9563921928405762,
"step": 7260
},
{
"epoch": 2.7954589186068883,
"grad_norm": 0.10809986665414069,
"learning_rate": 1.6313334657701146e-05,
"loss": 0.1285,
"mean_token_accuracy": 0.9558048605918884,
"step": 7265
},
{
"epoch": 2.797383105637868,
"grad_norm": 0.10739087950506812,
"learning_rate": 1.628056058199618e-05,
"loss": 0.125,
"mean_token_accuracy": 0.9568708956241607,
"step": 7270
},
{
"epoch": 2.799307292668847,
"grad_norm": 0.10402472359624435,
"learning_rate": 1.6247818158339053e-05,
"loss": 0.1263,
"mean_token_accuracy": 0.9565973937511444,
"step": 7275
},
{
"epoch": 2.801231479699827,
"grad_norm": 0.10522619304351352,
"learning_rate": 1.621510747910202e-05,
"loss": 0.1256,
"mean_token_accuracy": 0.9567424237728119,
"step": 7280
},
{
"epoch": 2.803155666730806,
"grad_norm": 0.1031627194210059,
"learning_rate": 1.6182428636567746e-05,
"loss": 0.126,
"mean_token_accuracy": 0.9567280828952789,
"step": 7285
},
{
"epoch": 2.805079853761786,
"grad_norm": 0.10885829619538895,
"learning_rate": 1.6149781722929112e-05,
"loss": 0.1272,
"mean_token_accuracy": 0.9562729597091675,
"step": 7290
},
{
"epoch": 2.807004040792765,
"grad_norm": 0.11357013093667029,
"learning_rate": 1.6117166830288894e-05,
"loss": 0.1238,
"mean_token_accuracy": 0.9574274599552155,
"step": 7295
},
{
"epoch": 2.8089282278237446,
"grad_norm": 0.10447156358180851,
"learning_rate": 1.608458405065955e-05,
"loss": 0.1259,
"mean_token_accuracy": 0.9568834602832794,
"step": 7300
},
{
"epoch": 2.810852414854724,
"grad_norm": 0.102388962981238,
"learning_rate": 1.6052033475962953e-05,
"loss": 0.1273,
"mean_token_accuracy": 0.9563150763511657,
"step": 7305
},
{
"epoch": 2.8127766018857034,
"grad_norm": 0.10645925025513615,
"learning_rate": 1.6019515198030078e-05,
"loss": 0.1268,
"mean_token_accuracy": 0.9563553273677826,
"step": 7310
},
{
"epoch": 2.814700788916683,
"grad_norm": 0.11653208782578775,
"learning_rate": 1.5987029308600822e-05,
"loss": 0.1268,
"mean_token_accuracy": 0.956266438961029,
"step": 7315
},
{
"epoch": 2.8166249759476623,
"grad_norm": 0.10715577285113691,
"learning_rate": 1.5954575899323703e-05,
"loss": 0.1249,
"mean_token_accuracy": 0.9570776879787445,
"step": 7320
},
{
"epoch": 2.8185491629786417,
"grad_norm": 0.10736007734155249,
"learning_rate": 1.5922155061755602e-05,
"loss": 0.1283,
"mean_token_accuracy": 0.9558275938034058,
"step": 7325
},
{
"epoch": 2.820473350009621,
"grad_norm": 0.11065163771448285,
"learning_rate": 1.5889766887361485e-05,
"loss": 0.1266,
"mean_token_accuracy": 0.9565343916416168,
"step": 7330
},
{
"epoch": 2.8223975370406005,
"grad_norm": 0.11471891290397622,
"learning_rate": 1.585741146751421e-05,
"loss": 0.1232,
"mean_token_accuracy": 0.9576056897640228,
"step": 7335
},
{
"epoch": 2.82432172407158,
"grad_norm": 0.11104147050631989,
"learning_rate": 1.5825088893494208e-05,
"loss": 0.1275,
"mean_token_accuracy": 0.9562883615493775,
"step": 7340
},
{
"epoch": 2.8262459111025593,
"grad_norm": 0.11255149427553987,
"learning_rate": 1.579279925648926e-05,
"loss": 0.1203,
"mean_token_accuracy": 0.9585840284824372,
"step": 7345
},
{
"epoch": 2.8281700981335387,
"grad_norm": 0.10481594755119765,
"learning_rate": 1.5760542647594192e-05,
"loss": 0.1246,
"mean_token_accuracy": 0.9568839311599732,
"step": 7350
},
{
"epoch": 2.830094285164518,
"grad_norm": 0.11130912788500391,
"learning_rate": 1.5728319157810693e-05,
"loss": 0.1251,
"mean_token_accuracy": 0.9572184145450592,
"step": 7355
},
{
"epoch": 2.8320184721954975,
"grad_norm": 0.109805966713199,
"learning_rate": 1.569612887804699e-05,
"loss": 0.126,
"mean_token_accuracy": 0.9567272245883942,
"step": 7360
},
{
"epoch": 2.833942659226477,
"grad_norm": 0.10635917964214982,
"learning_rate": 1.5663971899117635e-05,
"loss": 0.1249,
"mean_token_accuracy": 0.9571217775344849,
"step": 7365
},
{
"epoch": 2.8358668462574563,
"grad_norm": 0.10700521407852201,
"learning_rate": 1.563184831174321e-05,
"loss": 0.124,
"mean_token_accuracy": 0.9574501037597656,
"step": 7370
},
{
"epoch": 2.8377910332884357,
"grad_norm": 0.10854668460509331,
"learning_rate": 1.5599758206550114e-05,
"loss": 0.125,
"mean_token_accuracy": 0.9568875133991241,
"step": 7375
},
{
"epoch": 2.839715220319415,
"grad_norm": 0.10566738417028762,
"learning_rate": 1.5567701674070293e-05,
"loss": 0.126,
"mean_token_accuracy": 0.956588226556778,
"step": 7380
},
{
"epoch": 2.8416394073503946,
"grad_norm": 0.10208895046477207,
"learning_rate": 1.553567880474095e-05,
"loss": 0.124,
"mean_token_accuracy": 0.9574152231216431,
"step": 7385
},
{
"epoch": 2.843563594381374,
"grad_norm": 0.10397090537389687,
"learning_rate": 1.5503689688904343e-05,
"loss": 0.1227,
"mean_token_accuracy": 0.9577645778656005,
"step": 7390
},
{
"epoch": 2.8454877814123534,
"grad_norm": 0.10873053971174768,
"learning_rate": 1.5471734416807487e-05,
"loss": 0.1243,
"mean_token_accuracy": 0.957385802268982,
"step": 7395
},
{
"epoch": 2.8474119684433328,
"grad_norm": 0.10472998947209755,
"learning_rate": 1.5439813078601933e-05,
"loss": 0.1259,
"mean_token_accuracy": 0.9567414045333862,
"step": 7400
},
{
"epoch": 2.849336155474312,
"grad_norm": 0.10409222392640442,
"learning_rate": 1.5407925764343494e-05,
"loss": 0.1259,
"mean_token_accuracy": 0.9568287134170532,
"step": 7405
},
{
"epoch": 2.8512603425052916,
"grad_norm": 0.10556065666725871,
"learning_rate": 1.5376072563992006e-05,
"loss": 0.1271,
"mean_token_accuracy": 0.9562435626983643,
"step": 7410
},
{
"epoch": 2.853184529536271,
"grad_norm": 0.10452247547749552,
"learning_rate": 1.5344253567411033e-05,
"loss": 0.1248,
"mean_token_accuracy": 0.9570737600326538,
"step": 7415
},
{
"epoch": 2.8551087165672504,
"grad_norm": 0.1087680963204854,
"learning_rate": 1.5312468864367668e-05,
"loss": 0.1256,
"mean_token_accuracy": 0.9568704783916473,
"step": 7420
},
{
"epoch": 2.85703290359823,
"grad_norm": 0.10578472470541685,
"learning_rate": 1.5280718544532247e-05,
"loss": 0.1242,
"mean_token_accuracy": 0.9572508156299591,
"step": 7425
},
{
"epoch": 2.858957090629209,
"grad_norm": 0.1058352653218143,
"learning_rate": 1.5249002697478121e-05,
"loss": 0.1242,
"mean_token_accuracy": 0.9571494102478028,
"step": 7430
},
{
"epoch": 2.8608812776601886,
"grad_norm": 0.11115611407235973,
"learning_rate": 1.5217321412681357e-05,
"loss": 0.1239,
"mean_token_accuracy": 0.9573991417884826,
"step": 7435
},
{
"epoch": 2.862805464691168,
"grad_norm": 0.1029632486773793,
"learning_rate": 1.5185674779520554e-05,
"loss": 0.1243,
"mean_token_accuracy": 0.9571222126483917,
"step": 7440
},
{
"epoch": 2.8647296517221474,
"grad_norm": 0.10800954899956418,
"learning_rate": 1.515406288727651e-05,
"loss": 0.1271,
"mean_token_accuracy": 0.9563605070114136,
"step": 7445
},
{
"epoch": 2.866653838753127,
"grad_norm": 0.10476947230916332,
"learning_rate": 1.5122485825132043e-05,
"loss": 0.1251,
"mean_token_accuracy": 0.9568899631500244,
"step": 7450
},
{
"epoch": 2.8685780257841063,
"grad_norm": 0.11056280552061581,
"learning_rate": 1.509094368217171e-05,
"loss": 0.1244,
"mean_token_accuracy": 0.9571887671947479,
"step": 7455
},
{
"epoch": 2.8705022128150857,
"grad_norm": 0.10978622017949022,
"learning_rate": 1.5059436547381527e-05,
"loss": 0.1252,
"mean_token_accuracy": 0.9570884883403779,
"step": 7460
},
{
"epoch": 2.872426399846065,
"grad_norm": 0.11231449620491728,
"learning_rate": 1.5027964509648776e-05,
"loss": 0.1264,
"mean_token_accuracy": 0.9565370500087738,
"step": 7465
},
{
"epoch": 2.8743505868770445,
"grad_norm": 0.10649846229397224,
"learning_rate": 1.4996527657761719e-05,
"loss": 0.1266,
"mean_token_accuracy": 0.9563987612724304,
"step": 7470
},
{
"epoch": 2.876274773908024,
"grad_norm": 0.10580010864087648,
"learning_rate": 1.496512608040933e-05,
"loss": 0.1242,
"mean_token_accuracy": 0.9571030080318451,
"step": 7475
},
{
"epoch": 2.8781989609390033,
"grad_norm": 0.10399350282844987,
"learning_rate": 1.4933759866181085e-05,
"loss": 0.1228,
"mean_token_accuracy": 0.9577360272407531,
"step": 7480
},
{
"epoch": 2.8801231479699827,
"grad_norm": 0.10649576662983058,
"learning_rate": 1.4902429103566687e-05,
"loss": 0.1257,
"mean_token_accuracy": 0.9567151606082916,
"step": 7485
},
{
"epoch": 2.882047335000962,
"grad_norm": 0.10651968169816911,
"learning_rate": 1.487113388095584e-05,
"loss": 0.1255,
"mean_token_accuracy": 0.9568205177783966,
"step": 7490
},
{
"epoch": 2.8839715220319415,
"grad_norm": 0.10638941530991032,
"learning_rate": 1.4839874286637973e-05,
"loss": 0.1241,
"mean_token_accuracy": 0.9573801636695862,
"step": 7495
},
{
"epoch": 2.885895709062921,
"grad_norm": 0.1077741344290852,
"learning_rate": 1.4808650408801983e-05,
"loss": 0.1253,
"mean_token_accuracy": 0.957089239358902,
"step": 7500
},
{
"epoch": 2.8878198960939003,
"grad_norm": 0.10345254233037338,
"learning_rate": 1.4777462335536044e-05,
"loss": 0.1217,
"mean_token_accuracy": 0.9581014931201934,
"step": 7505
},
{
"epoch": 2.8897440831248797,
"grad_norm": 0.10837312897375773,
"learning_rate": 1.4746310154827269e-05,
"loss": 0.1256,
"mean_token_accuracy": 0.956810462474823,
"step": 7510
},
{
"epoch": 2.891668270155859,
"grad_norm": 0.10776077164463539,
"learning_rate": 1.4715193954561568e-05,
"loss": 0.1281,
"mean_token_accuracy": 0.956069827079773,
"step": 7515
},
{
"epoch": 2.8935924571868386,
"grad_norm": 0.1035556713444127,
"learning_rate": 1.4684113822523291e-05,
"loss": 0.1271,
"mean_token_accuracy": 0.9563699722290039,
"step": 7520
},
{
"epoch": 2.895516644217818,
"grad_norm": 0.11074344644349653,
"learning_rate": 1.4653069846395063e-05,
"loss": 0.1267,
"mean_token_accuracy": 0.956612741947174,
"step": 7525
},
{
"epoch": 2.8974408312487974,
"grad_norm": 0.10477033499360329,
"learning_rate": 1.4622062113757509e-05,
"loss": 0.1245,
"mean_token_accuracy": 0.957367604970932,
"step": 7530
},
{
"epoch": 2.8993650182797768,
"grad_norm": 0.10958436618782283,
"learning_rate": 1.4591090712088983e-05,
"loss": 0.1248,
"mean_token_accuracy": 0.9569049835205078,
"step": 7535
},
{
"epoch": 2.901289205310756,
"grad_norm": 0.10499998417791684,
"learning_rate": 1.4560155728765367e-05,
"loss": 0.1216,
"mean_token_accuracy": 0.9581144154071808,
"step": 7540
},
{
"epoch": 2.9032133923417356,
"grad_norm": 0.10655053344073778,
"learning_rate": 1.452925725105978e-05,
"loss": 0.1256,
"mean_token_accuracy": 0.9569212675094605,
"step": 7545
},
{
"epoch": 2.905137579372715,
"grad_norm": 0.10977747494446383,
"learning_rate": 1.4498395366142361e-05,
"loss": 0.1231,
"mean_token_accuracy": 0.9578100383281708,
"step": 7550
},
{
"epoch": 2.9070617664036944,
"grad_norm": 0.10651052736899481,
"learning_rate": 1.4467570161080018e-05,
"loss": 0.124,
"mean_token_accuracy": 0.9572772800922393,
"step": 7555
},
{
"epoch": 2.908985953434674,
"grad_norm": 0.10587084876989453,
"learning_rate": 1.4436781722836191e-05,
"loss": 0.1268,
"mean_token_accuracy": 0.9564314723014832,
"step": 7560
},
{
"epoch": 2.910910140465653,
"grad_norm": 0.10787590169557064,
"learning_rate": 1.4406030138270554e-05,
"loss": 0.1253,
"mean_token_accuracy": 0.9569768607616425,
"step": 7565
},
{
"epoch": 2.9128343274966326,
"grad_norm": 0.11193040860381288,
"learning_rate": 1.4375315494138835e-05,
"loss": 0.1248,
"mean_token_accuracy": 0.957205992937088,
"step": 7570
},
{
"epoch": 2.914758514527612,
"grad_norm": 0.10614147252868583,
"learning_rate": 1.4344637877092554e-05,
"loss": 0.1219,
"mean_token_accuracy": 0.9580237150192261,
"step": 7575
},
{
"epoch": 2.9166827015585914,
"grad_norm": 0.12226018340910985,
"learning_rate": 1.431399737367877e-05,
"loss": 0.1257,
"mean_token_accuracy": 0.9568642377853394,
"step": 7580
},
{
"epoch": 2.918606888589571,
"grad_norm": 0.1101874805916549,
"learning_rate": 1.4283394070339811e-05,
"loss": 0.1205,
"mean_token_accuracy": 0.9583792507648468,
"step": 7585
},
{
"epoch": 2.9205310756205503,
"grad_norm": 0.1058993738730307,
"learning_rate": 1.4252828053413098e-05,
"loss": 0.1237,
"mean_token_accuracy": 0.9573873460292817,
"step": 7590
},
{
"epoch": 2.9224552626515297,
"grad_norm": 0.11056817436133677,
"learning_rate": 1.4222299409130822e-05,
"loss": 0.1229,
"mean_token_accuracy": 0.9575633466243744,
"step": 7595
},
{
"epoch": 2.924379449682509,
"grad_norm": 0.10362373496270215,
"learning_rate": 1.4191808223619768e-05,
"loss": 0.1253,
"mean_token_accuracy": 0.9568525791168213,
"step": 7600
},
{
"epoch": 2.9263036367134885,
"grad_norm": 0.10916660290464916,
"learning_rate": 1.4161354582901015e-05,
"loss": 0.1249,
"mean_token_accuracy": 0.9571748375892639,
"step": 7605
},
{
"epoch": 2.928227823744468,
"grad_norm": 0.10764565709057493,
"learning_rate": 1.4130938572889746e-05,
"loss": 0.1265,
"mean_token_accuracy": 0.9566021263599396,
"step": 7610
},
{
"epoch": 2.9301520107754473,
"grad_norm": 0.1059320053361084,
"learning_rate": 1.4100560279394975e-05,
"loss": 0.1228,
"mean_token_accuracy": 0.9579496622085572,
"step": 7615
},
{
"epoch": 2.9320761978064267,
"grad_norm": 0.11360266392266798,
"learning_rate": 1.4070219788119315e-05,
"loss": 0.1223,
"mean_token_accuracy": 0.957982987165451,
"step": 7620
},
{
"epoch": 2.934000384837406,
"grad_norm": 0.10610798650914667,
"learning_rate": 1.4039917184658718e-05,
"loss": 0.1257,
"mean_token_accuracy": 0.9566937685012817,
"step": 7625
},
{
"epoch": 2.9359245718683855,
"grad_norm": 0.11071740328007927,
"learning_rate": 1.4009652554502243e-05,
"loss": 0.1232,
"mean_token_accuracy": 0.9576710939407349,
"step": 7630
},
{
"epoch": 2.937848758899365,
"grad_norm": 0.1072534333814872,
"learning_rate": 1.3979425983031841e-05,
"loss": 0.1247,
"mean_token_accuracy": 0.9570759296417236,
"step": 7635
},
{
"epoch": 2.9397729459303443,
"grad_norm": 0.10321128452437721,
"learning_rate": 1.3949237555522091e-05,
"loss": 0.1214,
"mean_token_accuracy": 0.9581948697566987,
"step": 7640
},
{
"epoch": 2.9416971329613237,
"grad_norm": 0.10854912685893148,
"learning_rate": 1.3919087357139954e-05,
"loss": 0.1259,
"mean_token_accuracy": 0.9564633071422577,
"step": 7645
},
{
"epoch": 2.943621319992303,
"grad_norm": 0.11080075613686713,
"learning_rate": 1.388897547294452e-05,
"loss": 0.1263,
"mean_token_accuracy": 0.956570053100586,
"step": 7650
},
{
"epoch": 2.9455455070232825,
"grad_norm": 0.11283077280520182,
"learning_rate": 1.385890198788683e-05,
"loss": 0.123,
"mean_token_accuracy": 0.9576871275901795,
"step": 7655
},
{
"epoch": 2.947469694054262,
"grad_norm": 0.1055597743138611,
"learning_rate": 1.3828866986809553e-05,
"loss": 0.1256,
"mean_token_accuracy": 0.9570915341377259,
"step": 7660
},
{
"epoch": 2.9493938810852414,
"grad_norm": 0.11716483869134532,
"learning_rate": 1.3798870554446819e-05,
"loss": 0.1238,
"mean_token_accuracy": 0.9574430525302887,
"step": 7665
},
{
"epoch": 2.9513180681162208,
"grad_norm": 0.11219262966435255,
"learning_rate": 1.3768912775423922e-05,
"loss": 0.1242,
"mean_token_accuracy": 0.9571912288665771,
"step": 7670
},
{
"epoch": 2.9532422551472,
"grad_norm": 0.10756629871932641,
"learning_rate": 1.3738993734257133e-05,
"loss": 0.1255,
"mean_token_accuracy": 0.956557160615921,
"step": 7675
},
{
"epoch": 2.9551664421781796,
"grad_norm": 0.10819322200524713,
"learning_rate": 1.3709113515353428e-05,
"loss": 0.1241,
"mean_token_accuracy": 0.9571969687938691,
"step": 7680
},
{
"epoch": 2.957090629209159,
"grad_norm": 0.10522933962660358,
"learning_rate": 1.367927220301025e-05,
"loss": 0.1244,
"mean_token_accuracy": 0.9574010789394378,
"step": 7685
},
{
"epoch": 2.9590148162401384,
"grad_norm": 0.10942898684989587,
"learning_rate": 1.3649469881415276e-05,
"loss": 0.1254,
"mean_token_accuracy": 0.9567907691001892,
"step": 7690
},
{
"epoch": 2.960939003271118,
"grad_norm": 0.10772067005720781,
"learning_rate": 1.3619706634646201e-05,
"loss": 0.1256,
"mean_token_accuracy": 0.9567744076251984,
"step": 7695
},
{
"epoch": 2.962863190302097,
"grad_norm": 0.1087985352173851,
"learning_rate": 1.3589982546670474e-05,
"loss": 0.1231,
"mean_token_accuracy": 0.9575774610042572,
"step": 7700
},
{
"epoch": 2.9647873773330766,
"grad_norm": 0.1067799950600145,
"learning_rate": 1.356029770134507e-05,
"loss": 0.1218,
"mean_token_accuracy": 0.9579530179500579,
"step": 7705
},
{
"epoch": 2.966711564364056,
"grad_norm": 0.10819695429679835,
"learning_rate": 1.3530652182416264e-05,
"loss": 0.1233,
"mean_token_accuracy": 0.9575179398059845,
"step": 7710
},
{
"epoch": 2.9686357513950354,
"grad_norm": 0.10453719331335365,
"learning_rate": 1.3501046073519364e-05,
"loss": 0.124,
"mean_token_accuracy": 0.9573317110538483,
"step": 7715
},
{
"epoch": 2.970559938426015,
"grad_norm": 0.10607372196854979,
"learning_rate": 1.3471479458178499e-05,
"loss": 0.1228,
"mean_token_accuracy": 0.9579567670822143,
"step": 7720
},
{
"epoch": 2.9724841254569947,
"grad_norm": 0.10440932629580602,
"learning_rate": 1.3441952419806391e-05,
"loss": 0.1242,
"mean_token_accuracy": 0.9573928415775299,
"step": 7725
},
{
"epoch": 2.9744083124879737,
"grad_norm": 0.11011335803374803,
"learning_rate": 1.3412465041704114e-05,
"loss": 0.1252,
"mean_token_accuracy": 0.9569807887077332,
"step": 7730
},
{
"epoch": 2.9763324995189535,
"grad_norm": 0.10803320079657108,
"learning_rate": 1.3383017407060824e-05,
"loss": 0.121,
"mean_token_accuracy": 0.9582719445228577,
"step": 7735
},
{
"epoch": 2.9782566865499325,
"grad_norm": 0.11326957663750642,
"learning_rate": 1.3353609598953587e-05,
"loss": 0.1231,
"mean_token_accuracy": 0.9574480533599854,
"step": 7740
},
{
"epoch": 2.9801808735809123,
"grad_norm": 0.1060972691963728,
"learning_rate": 1.3324241700347084e-05,
"loss": 0.1256,
"mean_token_accuracy": 0.956799840927124,
"step": 7745
},
{
"epoch": 2.9821050606118913,
"grad_norm": 0.10793607800115752,
"learning_rate": 1.3294913794093433e-05,
"loss": 0.1226,
"mean_token_accuracy": 0.9577231049537659,
"step": 7750
},
{
"epoch": 2.984029247642871,
"grad_norm": 0.10803527524344377,
"learning_rate": 1.326562596293189e-05,
"loss": 0.1236,
"mean_token_accuracy": 0.9576082110404969,
"step": 7755
},
{
"epoch": 2.98595343467385,
"grad_norm": 0.11256389464645765,
"learning_rate": 1.3236378289488682e-05,
"loss": 0.1245,
"mean_token_accuracy": 0.9571884274482727,
"step": 7760
},
{
"epoch": 2.98787762170483,
"grad_norm": 0.10757240512768781,
"learning_rate": 1.3207170856276736e-05,
"loss": 0.126,
"mean_token_accuracy": 0.9567049860954284,
"step": 7765
},
{
"epoch": 2.989801808735809,
"grad_norm": 0.11149173015234722,
"learning_rate": 1.317800374569545e-05,
"loss": 0.123,
"mean_token_accuracy": 0.9575415909290313,
"step": 7770
},
{
"epoch": 2.9917259957667888,
"grad_norm": 0.10859179660363494,
"learning_rate": 1.3148877040030466e-05,
"loss": 0.1255,
"mean_token_accuracy": 0.9568469166755676,
"step": 7775
},
{
"epoch": 2.9936501827977677,
"grad_norm": 0.10845196418753182,
"learning_rate": 1.3119790821453432e-05,
"loss": 0.1242,
"mean_token_accuracy": 0.9572749853134155,
"step": 7780
},
{
"epoch": 2.9955743698287476,
"grad_norm": 0.1043788293341144,
"learning_rate": 1.309074517202178e-05,
"loss": 0.1222,
"mean_token_accuracy": 0.957735562324524,
"step": 7785
},
{
"epoch": 2.9974985568597265,
"grad_norm": 0.1068433999214701,
"learning_rate": 1.3061740173678492e-05,
"loss": 0.1235,
"mean_token_accuracy": 0.9574783861637115,
"step": 7790
},
{
"epoch": 2.9994227438907064,
"grad_norm": 0.11024594434820396,
"learning_rate": 1.303277590825187e-05,
"loss": 0.1222,
"mean_token_accuracy": 0.958063292503357,
"step": 7795
},
{
"epoch": 3.0011545122185876,
"grad_norm": 0.10806090102622518,
"learning_rate": 1.3003852457455288e-05,
"loss": 0.1021,
"mean_token_accuracy": 0.9649597538842095,
"step": 7800
},
{
"epoch": 3.003078699249567,
"grad_norm": 0.1278029168675794,
"learning_rate": 1.297496990288697e-05,
"loss": 0.0891,
"mean_token_accuracy": 0.9692692935466767,
"step": 7805
},
{
"epoch": 3.0050028862805465,
"grad_norm": 0.12892837513477004,
"learning_rate": 1.2946128326029786e-05,
"loss": 0.0874,
"mean_token_accuracy": 0.9698507249355316,
"step": 7810
},
{
"epoch": 3.006927073311526,
"grad_norm": 0.12022972716434455,
"learning_rate": 1.2917327808250993e-05,
"loss": 0.0884,
"mean_token_accuracy": 0.9694180428981781,
"step": 7815
},
{
"epoch": 3.0088512603425053,
"grad_norm": 0.12531548139344598,
"learning_rate": 1.2888568430801995e-05,
"loss": 0.0869,
"mean_token_accuracy": 0.9698590219020844,
"step": 7820
},
{
"epoch": 3.0107754473734847,
"grad_norm": 0.11762844071368292,
"learning_rate": 1.2859850274818158e-05,
"loss": 0.0865,
"mean_token_accuracy": 0.9699797809123993,
"step": 7825
},
{
"epoch": 3.012699634404464,
"grad_norm": 0.11506289173555091,
"learning_rate": 1.2831173421318548e-05,
"loss": 0.0872,
"mean_token_accuracy": 0.9697744309902191,
"step": 7830
},
{
"epoch": 3.0146238214354435,
"grad_norm": 0.12532324926552826,
"learning_rate": 1.2802537951205695e-05,
"loss": 0.0876,
"mean_token_accuracy": 0.9696701884269714,
"step": 7835
},
{
"epoch": 3.016548008466423,
"grad_norm": 0.12198962421807008,
"learning_rate": 1.2773943945265382e-05,
"loss": 0.0875,
"mean_token_accuracy": 0.9696745097637176,
"step": 7840
},
{
"epoch": 3.0184721954974023,
"grad_norm": 0.12526480360585637,
"learning_rate": 1.2745391484166427e-05,
"loss": 0.0885,
"mean_token_accuracy": 0.9692001581192017,
"step": 7845
},
{
"epoch": 3.0203963825283817,
"grad_norm": 0.12298338848117586,
"learning_rate": 1.271688064846044e-05,
"loss": 0.0874,
"mean_token_accuracy": 0.9697564482688904,
"step": 7850
},
{
"epoch": 3.022320569559361,
"grad_norm": 0.12363669211046695,
"learning_rate": 1.2688411518581589e-05,
"loss": 0.0862,
"mean_token_accuracy": 0.9699727296829224,
"step": 7855
},
{
"epoch": 3.0242447565903405,
"grad_norm": 0.12592966137030076,
"learning_rate": 1.2659984174846403e-05,
"loss": 0.0886,
"mean_token_accuracy": 0.9691447734832763,
"step": 7860
},
{
"epoch": 3.02616894362132,
"grad_norm": 0.12315689968587146,
"learning_rate": 1.2631598697453496e-05,
"loss": 0.0869,
"mean_token_accuracy": 0.9698756873607636,
"step": 7865
},
{
"epoch": 3.0280931306522993,
"grad_norm": 0.12179550081200788,
"learning_rate": 1.2603255166483374e-05,
"loss": 0.0857,
"mean_token_accuracy": 0.9701310276985169,
"step": 7870
},
{
"epoch": 3.0300173176832788,
"grad_norm": 0.12307347849136704,
"learning_rate": 1.2574953661898222e-05,
"loss": 0.0895,
"mean_token_accuracy": 0.968977826833725,
"step": 7875
},
{
"epoch": 3.031941504714258,
"grad_norm": 0.12251435917553875,
"learning_rate": 1.2546694263541667e-05,
"loss": 0.0861,
"mean_token_accuracy": 0.9701171934604644,
"step": 7880
},
{
"epoch": 3.0338656917452376,
"grad_norm": 0.1188143957854138,
"learning_rate": 1.251847705113851e-05,
"loss": 0.0843,
"mean_token_accuracy": 0.9706710934638977,
"step": 7885
},
{
"epoch": 3.035789878776217,
"grad_norm": 0.12475703694562157,
"learning_rate": 1.2490302104294586e-05,
"loss": 0.0866,
"mean_token_accuracy": 0.9699966847896576,
"step": 7890
},
{
"epoch": 3.0377140658071964,
"grad_norm": 0.12093351966846219,
"learning_rate": 1.2462169502496435e-05,
"loss": 0.0866,
"mean_token_accuracy": 0.9701157450675965,
"step": 7895
},
{
"epoch": 3.039638252838176,
"grad_norm": 0.12017604333104909,
"learning_rate": 1.2434079325111192e-05,
"loss": 0.0869,
"mean_token_accuracy": 0.9696642577648162,
"step": 7900
},
{
"epoch": 3.041562439869155,
"grad_norm": 0.12085886994243633,
"learning_rate": 1.240603165138626e-05,
"loss": 0.0869,
"mean_token_accuracy": 0.9699452579021454,
"step": 7905
},
{
"epoch": 3.0434866269001346,
"grad_norm": 0.12387593946204745,
"learning_rate": 1.2378026560449155e-05,
"loss": 0.0867,
"mean_token_accuracy": 0.9697232723236084,
"step": 7910
},
{
"epoch": 3.045410813931114,
"grad_norm": 0.12983147755354021,
"learning_rate": 1.2350064131307253e-05,
"loss": 0.0858,
"mean_token_accuracy": 0.9702217698097229,
"step": 7915
},
{
"epoch": 3.0473350009620934,
"grad_norm": 0.124590059629808,
"learning_rate": 1.2322144442847587e-05,
"loss": 0.0848,
"mean_token_accuracy": 0.9704817235469818,
"step": 7920
},
{
"epoch": 3.049259187993073,
"grad_norm": 0.12318032429282658,
"learning_rate": 1.2294267573836587e-05,
"loss": 0.0893,
"mean_token_accuracy": 0.9690053462982178,
"step": 7925
},
{
"epoch": 3.0511833750240522,
"grad_norm": 0.12644916618282354,
"learning_rate": 1.2266433602919883e-05,
"loss": 0.0864,
"mean_token_accuracy": 0.969806456565857,
"step": 7930
},
{
"epoch": 3.0531075620550316,
"grad_norm": 0.12502760515043446,
"learning_rate": 1.2238642608622105e-05,
"loss": 0.0872,
"mean_token_accuracy": 0.9696021556854248,
"step": 7935
},
{
"epoch": 3.055031749086011,
"grad_norm": 0.12383987780357592,
"learning_rate": 1.2210894669346623e-05,
"loss": 0.0876,
"mean_token_accuracy": 0.9695601046085358,
"step": 7940
},
{
"epoch": 3.0569559361169905,
"grad_norm": 0.12804783151969265,
"learning_rate": 1.2183189863375347e-05,
"loss": 0.0869,
"mean_token_accuracy": 0.9699361026287079,
"step": 7945
},
{
"epoch": 3.05888012314797,
"grad_norm": 0.11848934621495531,
"learning_rate": 1.2155528268868492e-05,
"loss": 0.0856,
"mean_token_accuracy": 0.9703084409236908,
"step": 7950
},
{
"epoch": 3.0608043101789493,
"grad_norm": 0.12423792043438635,
"learning_rate": 1.212790996386436e-05,
"loss": 0.0866,
"mean_token_accuracy": 0.9698608636856079,
"step": 7955
},
{
"epoch": 3.0627284972099287,
"grad_norm": 0.1296803573215681,
"learning_rate": 1.2100335026279145e-05,
"loss": 0.0879,
"mean_token_accuracy": 0.9694078743457795,
"step": 7960
},
{
"epoch": 3.064652684240908,
"grad_norm": 0.12414203646830227,
"learning_rate": 1.2072803533906694e-05,
"loss": 0.0863,
"mean_token_accuracy": 0.9698703289031982,
"step": 7965
},
{
"epoch": 3.0665768712718875,
"grad_norm": 0.12216839952104394,
"learning_rate": 1.2045315564418255e-05,
"loss": 0.0852,
"mean_token_accuracy": 0.9703336238861084,
"step": 7970
},
{
"epoch": 3.068501058302867,
"grad_norm": 0.12430462906305857,
"learning_rate": 1.201787119536233e-05,
"loss": 0.0869,
"mean_token_accuracy": 0.969856345653534,
"step": 7975
},
{
"epoch": 3.0704252453338463,
"grad_norm": 0.1182551093883623,
"learning_rate": 1.1990470504164394e-05,
"loss": 0.0867,
"mean_token_accuracy": 0.9697697043418885,
"step": 7980
},
{
"epoch": 3.0723494323648257,
"grad_norm": 0.12572587657675405,
"learning_rate": 1.1963113568126708e-05,
"loss": 0.0866,
"mean_token_accuracy": 0.9698267221450806,
"step": 7985
},
{
"epoch": 3.074273619395805,
"grad_norm": 0.127462469416906,
"learning_rate": 1.1935800464428079e-05,
"loss": 0.0872,
"mean_token_accuracy": 0.9696932435035706,
"step": 7990
},
{
"epoch": 3.0761978064267845,
"grad_norm": 0.13134160058581362,
"learning_rate": 1.1908531270123665e-05,
"loss": 0.0878,
"mean_token_accuracy": 0.9693050265312195,
"step": 7995
},
{
"epoch": 3.078121993457764,
"grad_norm": 0.1238281796372475,
"learning_rate": 1.188130606214475e-05,
"loss": 0.0863,
"mean_token_accuracy": 0.970119446516037,
"step": 8000
},
{
"epoch": 3.0800461804887433,
"grad_norm": 0.13433419867660185,
"learning_rate": 1.185412491729853e-05,
"loss": 0.0842,
"mean_token_accuracy": 0.9708240568637848,
"step": 8005
},
{
"epoch": 3.0819703675197228,
"grad_norm": 0.12396118665861203,
"learning_rate": 1.1826987912267864e-05,
"loss": 0.0887,
"mean_token_accuracy": 0.9691758751869202,
"step": 8010
},
{
"epoch": 3.083894554550702,
"grad_norm": 0.11889626164987209,
"learning_rate": 1.1799895123611125e-05,
"loss": 0.087,
"mean_token_accuracy": 0.9697548627853394,
"step": 8015
},
{
"epoch": 3.0858187415816816,
"grad_norm": 0.131945297023811,
"learning_rate": 1.1772846627761899e-05,
"loss": 0.0879,
"mean_token_accuracy": 0.96954505443573,
"step": 8020
},
{
"epoch": 3.087742928612661,
"grad_norm": 0.12209211115052003,
"learning_rate": 1.1745842501028847e-05,
"loss": 0.0857,
"mean_token_accuracy": 0.9703159630298615,
"step": 8025
},
{
"epoch": 3.0896671156436404,
"grad_norm": 0.12460601320367481,
"learning_rate": 1.1718882819595454e-05,
"loss": 0.0861,
"mean_token_accuracy": 0.970074987411499,
"step": 8030
},
{
"epoch": 3.09159130267462,
"grad_norm": 0.11829881911452493,
"learning_rate": 1.1691967659519796e-05,
"loss": 0.086,
"mean_token_accuracy": 0.9700419008731842,
"step": 8035
},
{
"epoch": 3.093515489705599,
"grad_norm": 0.13005744915575315,
"learning_rate": 1.1665097096734372e-05,
"loss": 0.0875,
"mean_token_accuracy": 0.96951624751091,
"step": 8040
},
{
"epoch": 3.0954396767365786,
"grad_norm": 0.13567902609403767,
"learning_rate": 1.1638271207045841e-05,
"loss": 0.0852,
"mean_token_accuracy": 0.9704005897045136,
"step": 8045
},
{
"epoch": 3.097363863767558,
"grad_norm": 0.12804817918149772,
"learning_rate": 1.1611490066134853e-05,
"loss": 0.0861,
"mean_token_accuracy": 0.9701052308082581,
"step": 8050
},
{
"epoch": 3.0992880507985374,
"grad_norm": 0.11891623802480417,
"learning_rate": 1.1584753749555789e-05,
"loss": 0.0864,
"mean_token_accuracy": 0.96992067694664,
"step": 8055
},
{
"epoch": 3.101212237829517,
"grad_norm": 0.13000095226206448,
"learning_rate": 1.1558062332736595e-05,
"loss": 0.0854,
"mean_token_accuracy": 0.9702975928783417,
"step": 8060
},
{
"epoch": 3.1031364248604962,
"grad_norm": 0.12310517172219068,
"learning_rate": 1.1531415890978535e-05,
"loss": 0.09,
"mean_token_accuracy": 0.9687209069728852,
"step": 8065
},
{
"epoch": 3.1050606118914756,
"grad_norm": 0.12478784300253068,
"learning_rate": 1.1504814499456003e-05,
"loss": 0.0845,
"mean_token_accuracy": 0.970802241563797,
"step": 8070
},
{
"epoch": 3.106984798922455,
"grad_norm": 0.12995901203556426,
"learning_rate": 1.147825823321628e-05,
"loss": 0.087,
"mean_token_accuracy": 0.9696940183639526,
"step": 8075
},
{
"epoch": 3.108908985953435,
"grad_norm": 0.12465076291615731,
"learning_rate": 1.1451747167179341e-05,
"loss": 0.0865,
"mean_token_accuracy": 0.9700176894664765,
"step": 8080
},
{
"epoch": 3.110833172984414,
"grad_norm": 0.1250070740769649,
"learning_rate": 1.1425281376137659e-05,
"loss": 0.0855,
"mean_token_accuracy": 0.9701965272426605,
"step": 8085
},
{
"epoch": 3.1127573600153937,
"grad_norm": 0.12747187171365093,
"learning_rate": 1.139886093475597e-05,
"loss": 0.0867,
"mean_token_accuracy": 0.969889110326767,
"step": 8090
},
{
"epoch": 3.114681547046373,
"grad_norm": 0.12403764057755394,
"learning_rate": 1.137248591757108e-05,
"loss": 0.0877,
"mean_token_accuracy": 0.9695557832717896,
"step": 8095
},
{
"epoch": 3.1166057340773525,
"grad_norm": 0.12776305701838625,
"learning_rate": 1.1346156398991623e-05,
"loss": 0.0846,
"mean_token_accuracy": 0.9706474542617798,
"step": 8100
},
{
"epoch": 3.118529921108332,
"grad_norm": 0.12188474517126945,
"learning_rate": 1.1319872453297888e-05,
"loss": 0.0861,
"mean_token_accuracy": 0.9699768602848053,
"step": 8105
},
{
"epoch": 3.1204541081393113,
"grad_norm": 0.1255535944742883,
"learning_rate": 1.1293634154641593e-05,
"loss": 0.0875,
"mean_token_accuracy": 0.9696572244167327,
"step": 8110
},
{
"epoch": 3.1223782951702908,
"grad_norm": 0.12576509472395267,
"learning_rate": 1.1267441577045685e-05,
"loss": 0.085,
"mean_token_accuracy": 0.9705277800559997,
"step": 8115
},
{
"epoch": 3.12430248220127,
"grad_norm": 0.12282131609490717,
"learning_rate": 1.1241294794404102e-05,
"loss": 0.0883,
"mean_token_accuracy": 0.9693425118923187,
"step": 8120
},
{
"epoch": 3.1262266692322496,
"grad_norm": 0.12559358366060971,
"learning_rate": 1.121519388048161e-05,
"loss": 0.0865,
"mean_token_accuracy": 0.9699931085109711,
"step": 8125
},
{
"epoch": 3.128150856263229,
"grad_norm": 0.12785731056478644,
"learning_rate": 1.1189138908913568e-05,
"loss": 0.0864,
"mean_token_accuracy": 0.9700387597084046,
"step": 8130
},
{
"epoch": 3.1300750432942084,
"grad_norm": 0.1263134280990649,
"learning_rate": 1.116312995320571e-05,
"loss": 0.0862,
"mean_token_accuracy": 0.9700113117694855,
"step": 8135
},
{
"epoch": 3.131999230325188,
"grad_norm": 0.12950712286712593,
"learning_rate": 1.1137167086733948e-05,
"loss": 0.0867,
"mean_token_accuracy": 0.9699228644371033,
"step": 8140
},
{
"epoch": 3.133923417356167,
"grad_norm": 0.12609052574189428,
"learning_rate": 1.1111250382744187e-05,
"loss": 0.087,
"mean_token_accuracy": 0.9696399331092834,
"step": 8145
},
{
"epoch": 3.1358476043871466,
"grad_norm": 0.1243791146405868,
"learning_rate": 1.1085379914352093e-05,
"loss": 0.0868,
"mean_token_accuracy": 0.9699157655239106,
"step": 8150
},
{
"epoch": 3.137771791418126,
"grad_norm": 0.12478666533557523,
"learning_rate": 1.105955575454289e-05,
"loss": 0.0855,
"mean_token_accuracy": 0.9703081667423248,
"step": 8155
},
{
"epoch": 3.1396959784491054,
"grad_norm": 0.12723248786466004,
"learning_rate": 1.1033777976171153e-05,
"loss": 0.0864,
"mean_token_accuracy": 0.9698303639888763,
"step": 8160
},
{
"epoch": 3.141620165480085,
"grad_norm": 0.1224982823706387,
"learning_rate": 1.1008046651960615e-05,
"loss": 0.0847,
"mean_token_accuracy": 0.9705148994922638,
"step": 8165
},
{
"epoch": 3.1435443525110642,
"grad_norm": 0.13209837606796643,
"learning_rate": 1.0982361854503944e-05,
"loss": 0.0874,
"mean_token_accuracy": 0.969810402393341,
"step": 8170
},
{
"epoch": 3.1454685395420436,
"grad_norm": 0.12487860197240644,
"learning_rate": 1.095672365626256e-05,
"loss": 0.086,
"mean_token_accuracy": 0.9701269149780274,
"step": 8175
},
{
"epoch": 3.147392726573023,
"grad_norm": 0.1287032592687375,
"learning_rate": 1.0931132129566412e-05,
"loss": 0.0845,
"mean_token_accuracy": 0.9705935418605804,
"step": 8180
},
{
"epoch": 3.1493169136040025,
"grad_norm": 0.12612765501750817,
"learning_rate": 1.0905587346613772e-05,
"loss": 0.0879,
"mean_token_accuracy": 0.9693886756896972,
"step": 8185
},
{
"epoch": 3.151241100634982,
"grad_norm": 0.12710517193889465,
"learning_rate": 1.0880089379471056e-05,
"loss": 0.0869,
"mean_token_accuracy": 0.9695225954055786,
"step": 8190
},
{
"epoch": 3.1531652876659613,
"grad_norm": 0.1277696516634007,
"learning_rate": 1.0854638300072589e-05,
"loss": 0.0853,
"mean_token_accuracy": 0.9702028095722198,
"step": 8195
},
{
"epoch": 3.1550894746969407,
"grad_norm": 0.12538158801161664,
"learning_rate": 1.0829234180220433e-05,
"loss": 0.0883,
"mean_token_accuracy": 0.9693264961242676,
"step": 8200
},
{
"epoch": 3.15701366172792,
"grad_norm": 0.12482280763356315,
"learning_rate": 1.0803877091584147e-05,
"loss": 0.0865,
"mean_token_accuracy": 0.9699821174144745,
"step": 8205
},
{
"epoch": 3.1589378487588995,
"grad_norm": 0.12184134340839646,
"learning_rate": 1.0778567105700627e-05,
"loss": 0.086,
"mean_token_accuracy": 0.9702258408069611,
"step": 8210
},
{
"epoch": 3.160862035789879,
"grad_norm": 0.12622632430749628,
"learning_rate": 1.075330429397387e-05,
"loss": 0.0864,
"mean_token_accuracy": 0.969819289445877,
"step": 8215
},
{
"epoch": 3.1627862228208583,
"grad_norm": 0.12128537119264614,
"learning_rate": 1.072808872767481e-05,
"loss": 0.0876,
"mean_token_accuracy": 0.9694657444953918,
"step": 8220
},
{
"epoch": 3.1647104098518377,
"grad_norm": 0.1341024695293841,
"learning_rate": 1.0702920477941055e-05,
"loss": 0.0879,
"mean_token_accuracy": 0.9694705188274384,
"step": 8225
},
{
"epoch": 3.166634596882817,
"grad_norm": 0.12446827777112705,
"learning_rate": 1.067779961577675e-05,
"loss": 0.0866,
"mean_token_accuracy": 0.9698066115379333,
"step": 8230
},
{
"epoch": 3.1685587839137965,
"grad_norm": 0.12635166599806483,
"learning_rate": 1.0652726212052344e-05,
"loss": 0.087,
"mean_token_accuracy": 0.969752311706543,
"step": 8235
},
{
"epoch": 3.170482970944776,
"grad_norm": 0.12415362694650094,
"learning_rate": 1.0627700337504411e-05,
"loss": 0.088,
"mean_token_accuracy": 0.9694055020809174,
"step": 8240
},
{
"epoch": 3.1724071579757553,
"grad_norm": 0.12308317166189457,
"learning_rate": 1.0602722062735412e-05,
"loss": 0.0839,
"mean_token_accuracy": 0.9707213938236237,
"step": 8245
},
{
"epoch": 3.1743313450067348,
"grad_norm": 0.12975850218368418,
"learning_rate": 1.0577791458213543e-05,
"loss": 0.087,
"mean_token_accuracy": 0.9694936692714691,
"step": 8250
},
{
"epoch": 3.176255532037714,
"grad_norm": 0.1342180776755028,
"learning_rate": 1.0552908594272492e-05,
"loss": 0.0871,
"mean_token_accuracy": 0.9696606636047364,
"step": 8255
},
{
"epoch": 3.1781797190686936,
"grad_norm": 0.12421484751480402,
"learning_rate": 1.0528073541111284e-05,
"loss": 0.0878,
"mean_token_accuracy": 0.9692697167396546,
"step": 8260
},
{
"epoch": 3.180103906099673,
"grad_norm": 0.12604153150022193,
"learning_rate": 1.0503286368794054e-05,
"loss": 0.0859,
"mean_token_accuracy": 0.9701263308525085,
"step": 8265
},
{
"epoch": 3.1820280931306524,
"grad_norm": 0.12564266714889988,
"learning_rate": 1.0478547147249842e-05,
"loss": 0.0857,
"mean_token_accuracy": 0.9703177154064179,
"step": 8270
},
{
"epoch": 3.183952280161632,
"grad_norm": 0.1257561651173637,
"learning_rate": 1.045385594627243e-05,
"loss": 0.0878,
"mean_token_accuracy": 0.9695186614990234,
"step": 8275
},
{
"epoch": 3.185876467192611,
"grad_norm": 0.12159795962859275,
"learning_rate": 1.0429212835520127e-05,
"loss": 0.0864,
"mean_token_accuracy": 0.97016139626503,
"step": 8280
},
{
"epoch": 3.1878006542235906,
"grad_norm": 0.1315051252009715,
"learning_rate": 1.0404617884515546e-05,
"loss": 0.0871,
"mean_token_accuracy": 0.9696720123291016,
"step": 8285
},
{
"epoch": 3.18972484125457,
"grad_norm": 0.12045718399538607,
"learning_rate": 1.0380071162645454e-05,
"loss": 0.086,
"mean_token_accuracy": 0.969960606098175,
"step": 8290
},
{
"epoch": 3.1916490282855494,
"grad_norm": 0.1263931601493202,
"learning_rate": 1.0355572739160548e-05,
"loss": 0.0854,
"mean_token_accuracy": 0.9703317403793335,
"step": 8295
},
{
"epoch": 3.193573215316529,
"grad_norm": 0.12819071683709377,
"learning_rate": 1.0331122683175271e-05,
"loss": 0.0874,
"mean_token_accuracy": 0.969643896818161,
"step": 8300
},
{
"epoch": 3.1954974023475082,
"grad_norm": 0.12759630896085855,
"learning_rate": 1.0306721063667613e-05,
"loss": 0.0876,
"mean_token_accuracy": 0.969584834575653,
"step": 8305
},
{
"epoch": 3.1974215893784876,
"grad_norm": 0.13322178423454648,
"learning_rate": 1.0282367949478899e-05,
"loss": 0.0881,
"mean_token_accuracy": 0.9695081174373626,
"step": 8310
},
{
"epoch": 3.199345776409467,
"grad_norm": 0.12499356791319982,
"learning_rate": 1.0258063409313648e-05,
"loss": 0.0861,
"mean_token_accuracy": 0.969955325126648,
"step": 8315
},
{
"epoch": 3.2012699634404465,
"grad_norm": 0.12456975891769033,
"learning_rate": 1.0233807511739293e-05,
"loss": 0.0872,
"mean_token_accuracy": 0.9698077619075776,
"step": 8320
},
{
"epoch": 3.203194150471426,
"grad_norm": 0.1294086133597835,
"learning_rate": 1.020960032518609e-05,
"loss": 0.0888,
"mean_token_accuracy": 0.9690967381000519,
"step": 8325
},
{
"epoch": 3.2051183375024053,
"grad_norm": 0.12508669985528187,
"learning_rate": 1.0185441917946831e-05,
"loss": 0.0869,
"mean_token_accuracy": 0.9697443544864655,
"step": 8330
},
{
"epoch": 3.2070425245333847,
"grad_norm": 0.12710477111832502,
"learning_rate": 1.0161332358176713e-05,
"loss": 0.0872,
"mean_token_accuracy": 0.9696725368499756,
"step": 8335
},
{
"epoch": 3.208966711564364,
"grad_norm": 0.1343357489271712,
"learning_rate": 1.0137271713893128e-05,
"loss": 0.0889,
"mean_token_accuracy": 0.9690210878849029,
"step": 8340
},
{
"epoch": 3.2108908985953435,
"grad_norm": 0.12538716805540998,
"learning_rate": 1.0113260052975452e-05,
"loss": 0.0876,
"mean_token_accuracy": 0.9695513010025024,
"step": 8345
},
{
"epoch": 3.212815085626323,
"grad_norm": 0.1288275528367617,
"learning_rate": 1.0089297443164894e-05,
"loss": 0.0868,
"mean_token_accuracy": 0.9696984708309173,
"step": 8350
},
{
"epoch": 3.2147392726573023,
"grad_norm": 0.1269432876964765,
"learning_rate": 1.0065383952064254e-05,
"loss": 0.0856,
"mean_token_accuracy": 0.9701885461807251,
"step": 8355
},
{
"epoch": 3.2166634596882817,
"grad_norm": 0.12251520794355583,
"learning_rate": 1.0041519647137779e-05,
"loss": 0.0861,
"mean_token_accuracy": 0.9700977683067322,
"step": 8360
},
{
"epoch": 3.218587646719261,
"grad_norm": 0.1265888644195425,
"learning_rate": 1.0017704595710947e-05,
"loss": 0.0865,
"mean_token_accuracy": 0.9698656022548675,
"step": 8365
},
{
"epoch": 3.2205118337502405,
"grad_norm": 0.12734965041860674,
"learning_rate": 9.993938864970288e-06,
"loss": 0.0883,
"mean_token_accuracy": 0.9695118367671967,
"step": 8370
},
{
"epoch": 3.22243602078122,
"grad_norm": 0.12474460914321397,
"learning_rate": 9.97022252196318e-06,
"loss": 0.0862,
"mean_token_accuracy": 0.9699595808982849,
"step": 8375
},
{
"epoch": 3.2243602078121993,
"grad_norm": 0.1306528695087571,
"learning_rate": 9.946555633597666e-06,
"loss": 0.0875,
"mean_token_accuracy": 0.9696216762065888,
"step": 8380
},
{
"epoch": 3.2262843948431787,
"grad_norm": 0.1266524631503943,
"learning_rate": 9.922938266642284e-06,
"loss": 0.0865,
"mean_token_accuracy": 0.9699699997901916,
"step": 8385
},
{
"epoch": 3.228208581874158,
"grad_norm": 0.1259564595531912,
"learning_rate": 9.899370487725866e-06,
"loss": 0.0875,
"mean_token_accuracy": 0.9694744288921356,
"step": 8390
},
{
"epoch": 3.2301327689051376,
"grad_norm": 0.12096085083215267,
"learning_rate": 9.875852363337315e-06,
"loss": 0.0841,
"mean_token_accuracy": 0.9708819687366486,
"step": 8395
},
{
"epoch": 3.232056955936117,
"grad_norm": 0.1331944475195223,
"learning_rate": 9.852383959825492e-06,
"loss": 0.0837,
"mean_token_accuracy": 0.9708536863327026,
"step": 8400
},
{
"epoch": 3.2339811429670964,
"grad_norm": 0.12740850552589505,
"learning_rate": 9.828965343398952e-06,
"loss": 0.0869,
"mean_token_accuracy": 0.969858956336975,
"step": 8405
},
{
"epoch": 3.235905329998076,
"grad_norm": 0.1267014162237489,
"learning_rate": 9.805596580125809e-06,
"loss": 0.0861,
"mean_token_accuracy": 0.9700336575508117,
"step": 8410
},
{
"epoch": 3.237829517029055,
"grad_norm": 0.12934145585469362,
"learning_rate": 9.78227773593354e-06,
"loss": 0.0862,
"mean_token_accuracy": 0.9699920177459717,
"step": 8415
},
{
"epoch": 3.2397537040600346,
"grad_norm": 0.124341432322928,
"learning_rate": 9.759008876608766e-06,
"loss": 0.0875,
"mean_token_accuracy": 0.9696942687034606,
"step": 8420
},
{
"epoch": 3.241677891091014,
"grad_norm": 0.1305683160263325,
"learning_rate": 9.73579006779711e-06,
"loss": 0.0864,
"mean_token_accuracy": 0.9699869990348816,
"step": 8425
},
{
"epoch": 3.2436020781219934,
"grad_norm": 0.12006655217324837,
"learning_rate": 9.712621375002999e-06,
"loss": 0.0846,
"mean_token_accuracy": 0.9707320332527161,
"step": 8430
},
{
"epoch": 3.245526265152973,
"grad_norm": 0.12418308091527765,
"learning_rate": 9.689502863589458e-06,
"loss": 0.0875,
"mean_token_accuracy": 0.9693997085094452,
"step": 8435
},
{
"epoch": 3.2474504521839522,
"grad_norm": 0.1273627357393596,
"learning_rate": 9.666434598777944e-06,
"loss": 0.0844,
"mean_token_accuracy": 0.9706789910793304,
"step": 8440
},
{
"epoch": 3.2493746392149316,
"grad_norm": 0.12610851458563718,
"learning_rate": 9.643416645648162e-06,
"loss": 0.0854,
"mean_token_accuracy": 0.9701983332633972,
"step": 8445
},
{
"epoch": 3.251298826245911,
"grad_norm": 0.12462572164326881,
"learning_rate": 9.620449069137891e-06,
"loss": 0.0852,
"mean_token_accuracy": 0.970365023612976,
"step": 8450
},
{
"epoch": 3.2532230132768905,
"grad_norm": 0.12695478875600175,
"learning_rate": 9.597531934042773e-06,
"loss": 0.0853,
"mean_token_accuracy": 0.9702978789806366,
"step": 8455
},
{
"epoch": 3.25514720030787,
"grad_norm": 0.12596431800857663,
"learning_rate": 9.574665305016148e-06,
"loss": 0.0874,
"mean_token_accuracy": 0.9696771502494812,
"step": 8460
},
{
"epoch": 3.2570713873388493,
"grad_norm": 0.1263908940111466,
"learning_rate": 9.551849246568866e-06,
"loss": 0.0849,
"mean_token_accuracy": 0.9705479264259338,
"step": 8465
},
{
"epoch": 3.2589955743698287,
"grad_norm": 0.12796106826662196,
"learning_rate": 9.529083823069123e-06,
"loss": 0.0859,
"mean_token_accuracy": 0.9702809691429138,
"step": 8470
},
{
"epoch": 3.260919761400808,
"grad_norm": 0.12584104905433993,
"learning_rate": 9.506369098742257e-06,
"loss": 0.0838,
"mean_token_accuracy": 0.9710902273654938,
"step": 8475
},
{
"epoch": 3.2628439484317875,
"grad_norm": 0.128886611194648,
"learning_rate": 9.483705137670563e-06,
"loss": 0.0841,
"mean_token_accuracy": 0.9707912802696228,
"step": 8480
},
{
"epoch": 3.264768135462767,
"grad_norm": 0.124633168632203,
"learning_rate": 9.46109200379314e-06,
"loss": 0.0862,
"mean_token_accuracy": 0.9700573444366455,
"step": 8485
},
{
"epoch": 3.2666923224937463,
"grad_norm": 0.13217726055423035,
"learning_rate": 9.438529760905694e-06,
"loss": 0.0862,
"mean_token_accuracy": 0.9699700951576233,
"step": 8490
},
{
"epoch": 3.2686165095247257,
"grad_norm": 0.1252762176406864,
"learning_rate": 9.41601847266034e-06,
"loss": 0.087,
"mean_token_accuracy": 0.9699073255062103,
"step": 8495
},
{
"epoch": 3.270540696555705,
"grad_norm": 0.1332115069886139,
"learning_rate": 9.393558202565469e-06,
"loss": 0.0869,
"mean_token_accuracy": 0.9696744382381439,
"step": 8500
},
{
"epoch": 3.2724648835866845,
"grad_norm": 0.13767971258879022,
"learning_rate": 9.37114901398551e-06,
"loss": 0.0867,
"mean_token_accuracy": 0.9698637843132019,
"step": 8505
},
{
"epoch": 3.274389070617664,
"grad_norm": 0.12634734486679827,
"learning_rate": 9.348790970140803e-06,
"loss": 0.0877,
"mean_token_accuracy": 0.9696608185768127,
"step": 8510
},
{
"epoch": 3.2763132576486433,
"grad_norm": 0.12463310795171538,
"learning_rate": 9.326484134107397e-06,
"loss": 0.0856,
"mean_token_accuracy": 0.9704243242740631,
"step": 8515
},
{
"epoch": 3.2782374446796227,
"grad_norm": 0.13231686223559833,
"learning_rate": 9.304228568816873e-06,
"loss": 0.0843,
"mean_token_accuracy": 0.9707625210285187,
"step": 8520
},
{
"epoch": 3.280161631710602,
"grad_norm": 0.12724142915743591,
"learning_rate": 9.282024337056164e-06,
"loss": 0.0852,
"mean_token_accuracy": 0.9705350875854493,
"step": 8525
},
{
"epoch": 3.2820858187415816,
"grad_norm": 0.12371333151127234,
"learning_rate": 9.259871501467374e-06,
"loss": 0.0846,
"mean_token_accuracy": 0.9705982744693756,
"step": 8530
},
{
"epoch": 3.284010005772561,
"grad_norm": 0.12490000522762193,
"learning_rate": 9.23777012454763e-06,
"loss": 0.0853,
"mean_token_accuracy": 0.9704672932624817,
"step": 8535
},
{
"epoch": 3.2859341928035404,
"grad_norm": 0.1289941739689611,
"learning_rate": 9.215720268648878e-06,
"loss": 0.0857,
"mean_token_accuracy": 0.970098227262497,
"step": 8540
},
{
"epoch": 3.28785837983452,
"grad_norm": 0.12967348920086358,
"learning_rate": 9.193721995977698e-06,
"loss": 0.0861,
"mean_token_accuracy": 0.9699119985103607,
"step": 8545
},
{
"epoch": 3.289782566865499,
"grad_norm": 0.12851709397206285,
"learning_rate": 9.17177536859517e-06,
"loss": 0.0864,
"mean_token_accuracy": 0.9700691282749176,
"step": 8550
},
{
"epoch": 3.2917067538964786,
"grad_norm": 0.1231932806947136,
"learning_rate": 9.149880448416648e-06,
"loss": 0.0857,
"mean_token_accuracy": 0.9702551066875458,
"step": 8555
},
{
"epoch": 3.293630940927458,
"grad_norm": 0.12495074882295382,
"learning_rate": 9.128037297211634e-06,
"loss": 0.0876,
"mean_token_accuracy": 0.9694799661636353,
"step": 8560
},
{
"epoch": 3.2955551279584374,
"grad_norm": 0.1288804330900242,
"learning_rate": 9.10624597660356e-06,
"loss": 0.0864,
"mean_token_accuracy": 0.9698720872402191,
"step": 8565
},
{
"epoch": 3.297479314989417,
"grad_norm": 0.12501816949094877,
"learning_rate": 9.084506548069645e-06,
"loss": 0.0858,
"mean_token_accuracy": 0.9702178537845612,
"step": 8570
},
{
"epoch": 3.2994035020203962,
"grad_norm": 0.12837424725335872,
"learning_rate": 9.062819072940715e-06,
"loss": 0.086,
"mean_token_accuracy": 0.9699204087257385,
"step": 8575
},
{
"epoch": 3.3013276890513756,
"grad_norm": 0.12569337071515446,
"learning_rate": 9.041183612401025e-06,
"loss": 0.0844,
"mean_token_accuracy": 0.9705540895462036,
"step": 8580
},
{
"epoch": 3.303251876082355,
"grad_norm": 0.1295374658358602,
"learning_rate": 9.019600227488077e-06,
"loss": 0.0851,
"mean_token_accuracy": 0.9705880224704743,
"step": 8585
},
{
"epoch": 3.3051760631133345,
"grad_norm": 0.12715742158265253,
"learning_rate": 8.998068979092458e-06,
"loss": 0.0853,
"mean_token_accuracy": 0.9702557623386383,
"step": 8590
},
{
"epoch": 3.307100250144314,
"grad_norm": 0.13623753283325926,
"learning_rate": 8.976589927957687e-06,
"loss": 0.0863,
"mean_token_accuracy": 0.9700308620929718,
"step": 8595
},
{
"epoch": 3.3090244371752933,
"grad_norm": 0.1281690334105656,
"learning_rate": 8.955163134680011e-06,
"loss": 0.0862,
"mean_token_accuracy": 0.9699817836284638,
"step": 8600
},
{
"epoch": 3.3109486242062727,
"grad_norm": 0.13038861977362848,
"learning_rate": 8.933788659708256e-06,
"loss": 0.0845,
"mean_token_accuracy": 0.9706872582435608,
"step": 8605
},
{
"epoch": 3.312872811237252,
"grad_norm": 0.12433591222697461,
"learning_rate": 8.912466563343638e-06,
"loss": 0.084,
"mean_token_accuracy": 0.9707286357879639,
"step": 8610
},
{
"epoch": 3.3147969982682315,
"grad_norm": 0.12254842004963368,
"learning_rate": 8.891196905739604e-06,
"loss": 0.0841,
"mean_token_accuracy": 0.9708254933357239,
"step": 8615
},
{
"epoch": 3.316721185299211,
"grad_norm": 0.12548382355964321,
"learning_rate": 8.869979746901677e-06,
"loss": 0.0865,
"mean_token_accuracy": 0.9700875878334045,
"step": 8620
},
{
"epoch": 3.3186453723301903,
"grad_norm": 0.13264124283049286,
"learning_rate": 8.848815146687257e-06,
"loss": 0.0842,
"mean_token_accuracy": 0.970658129453659,
"step": 8625
},
{
"epoch": 3.3205695593611697,
"grad_norm": 0.12652569781174539,
"learning_rate": 8.827703164805471e-06,
"loss": 0.0862,
"mean_token_accuracy": 0.9701962292194366,
"step": 8630
},
{
"epoch": 3.3224937463921496,
"grad_norm": 0.12819513526318943,
"learning_rate": 8.806643860816998e-06,
"loss": 0.0847,
"mean_token_accuracy": 0.9706246614456177,
"step": 8635
},
{
"epoch": 3.3244179334231285,
"grad_norm": 0.1298989340360717,
"learning_rate": 8.78563729413392e-06,
"loss": 0.0872,
"mean_token_accuracy": 0.9696630299091339,
"step": 8640
},
{
"epoch": 3.3263421204541084,
"grad_norm": 0.1182656253664839,
"learning_rate": 8.764683524019512e-06,
"loss": 0.082,
"mean_token_accuracy": 0.9715340971946717,
"step": 8645
},
{
"epoch": 3.3282663074850873,
"grad_norm": 0.12708389586633778,
"learning_rate": 8.743782609588108e-06,
"loss": 0.0846,
"mean_token_accuracy": 0.9707981884479523,
"step": 8650
},
{
"epoch": 3.330190494516067,
"grad_norm": 0.1285361679643166,
"learning_rate": 8.722934609804937e-06,
"loss": 0.0856,
"mean_token_accuracy": 0.9702385127544403,
"step": 8655
},
{
"epoch": 3.332114681547046,
"grad_norm": 0.12981013898874733,
"learning_rate": 8.702139583485938e-06,
"loss": 0.0849,
"mean_token_accuracy": 0.9703791618347168,
"step": 8660
},
{
"epoch": 3.334038868578026,
"grad_norm": 0.13135679062104932,
"learning_rate": 8.681397589297604e-06,
"loss": 0.0854,
"mean_token_accuracy": 0.97044517993927,
"step": 8665
},
{
"epoch": 3.335963055609005,
"grad_norm": 0.12858230101996082,
"learning_rate": 8.660708685756826e-06,
"loss": 0.0868,
"mean_token_accuracy": 0.9698947310447693,
"step": 8670
},
{
"epoch": 3.337887242639985,
"grad_norm": 0.13140167849466097,
"learning_rate": 8.640072931230696e-06,
"loss": 0.0864,
"mean_token_accuracy": 0.9699412226676941,
"step": 8675
},
{
"epoch": 3.339811429670964,
"grad_norm": 0.13994509560367993,
"learning_rate": 8.619490383936366e-06,
"loss": 0.0861,
"mean_token_accuracy": 0.9701304972171784,
"step": 8680
},
{
"epoch": 3.3417356167019436,
"grad_norm": 0.1313381474793011,
"learning_rate": 8.598961101940898e-06,
"loss": 0.0868,
"mean_token_accuracy": 0.9697572290897369,
"step": 8685
},
{
"epoch": 3.3436598037329226,
"grad_norm": 0.12773458439501786,
"learning_rate": 8.578485143161077e-06,
"loss": 0.0869,
"mean_token_accuracy": 0.9697576522827148,
"step": 8690
},
{
"epoch": 3.3455839907639024,
"grad_norm": 0.12908337054513808,
"learning_rate": 8.558062565363236e-06,
"loss": 0.0843,
"mean_token_accuracy": 0.9707521021366119,
"step": 8695
},
{
"epoch": 3.347508177794882,
"grad_norm": 0.14050298858857763,
"learning_rate": 8.537693426163137e-06,
"loss": 0.0871,
"mean_token_accuracy": 0.9696966946125031,
"step": 8700
},
{
"epoch": 3.3494323648258613,
"grad_norm": 0.13014009624854814,
"learning_rate": 8.517377783025762e-06,
"loss": 0.0882,
"mean_token_accuracy": 0.9692828714847564,
"step": 8705
},
{
"epoch": 3.3513565518568407,
"grad_norm": 0.13106790772771806,
"learning_rate": 8.497115693265184e-06,
"loss": 0.0846,
"mean_token_accuracy": 0.9706206440925598,
"step": 8710
},
{
"epoch": 3.35328073888782,
"grad_norm": 0.12709539158034155,
"learning_rate": 8.476907214044378e-06,
"loss": 0.0862,
"mean_token_accuracy": 0.9701223254203797,
"step": 8715
},
{
"epoch": 3.3552049259187995,
"grad_norm": 0.12594987993398446,
"learning_rate": 8.45675240237509e-06,
"loss": 0.0836,
"mean_token_accuracy": 0.971149480342865,
"step": 8720
},
{
"epoch": 3.357129112949779,
"grad_norm": 0.12738524473365023,
"learning_rate": 8.436651315117652e-06,
"loss": 0.0843,
"mean_token_accuracy": 0.9705275237560272,
"step": 8725
},
{
"epoch": 3.3590532999807583,
"grad_norm": 0.1282987237539586,
"learning_rate": 8.416604008980836e-06,
"loss": 0.0863,
"mean_token_accuracy": 0.9699268102645874,
"step": 8730
},
{
"epoch": 3.3609774870117377,
"grad_norm": 0.1260384071569397,
"learning_rate": 8.396610540521679e-06,
"loss": 0.0849,
"mean_token_accuracy": 0.9706168353557587,
"step": 8735
},
{
"epoch": 3.362901674042717,
"grad_norm": 0.13272296976352874,
"learning_rate": 8.376670966145328e-06,
"loss": 0.0848,
"mean_token_accuracy": 0.9704327166080475,
"step": 8740
},
{
"epoch": 3.3648258610736965,
"grad_norm": 0.13256551590203566,
"learning_rate": 8.3567853421049e-06,
"loss": 0.0866,
"mean_token_accuracy": 0.9698093652725219,
"step": 8745
},
{
"epoch": 3.366750048104676,
"grad_norm": 0.1296375896640046,
"learning_rate": 8.336953724501301e-06,
"loss": 0.0847,
"mean_token_accuracy": 0.9705244064331054,
"step": 8750
},
{
"epoch": 3.3686742351356553,
"grad_norm": 0.12871414388401403,
"learning_rate": 8.317176169283084e-06,
"loss": 0.0869,
"mean_token_accuracy": 0.9697363972663879,
"step": 8755
},
{
"epoch": 3.3705984221666347,
"grad_norm": 0.12867392118207444,
"learning_rate": 8.297452732246263e-06,
"loss": 0.0854,
"mean_token_accuracy": 0.9703603863716126,
"step": 8760
},
{
"epoch": 3.372522609197614,
"grad_norm": 0.13251967572536488,
"learning_rate": 8.277783469034189e-06,
"loss": 0.0853,
"mean_token_accuracy": 0.9703691661357879,
"step": 8765
},
{
"epoch": 3.3744467962285936,
"grad_norm": 0.12963231003298034,
"learning_rate": 8.258168435137373e-06,
"loss": 0.0841,
"mean_token_accuracy": 0.9707355380058289,
"step": 8770
},
{
"epoch": 3.376370983259573,
"grad_norm": 0.13040555109562293,
"learning_rate": 8.238607685893345e-06,
"loss": 0.0844,
"mean_token_accuracy": 0.9706344664096832,
"step": 8775
},
{
"epoch": 3.3782951702905524,
"grad_norm": 0.1269499257113466,
"learning_rate": 8.219101276486479e-06,
"loss": 0.0839,
"mean_token_accuracy": 0.9708466231822968,
"step": 8780
},
{
"epoch": 3.380219357321532,
"grad_norm": 0.12505828556642629,
"learning_rate": 8.199649261947847e-06,
"loss": 0.0852,
"mean_token_accuracy": 0.9703607201576233,
"step": 8785
},
{
"epoch": 3.382143544352511,
"grad_norm": 0.12435969457559475,
"learning_rate": 8.180251697155073e-06,
"loss": 0.0864,
"mean_token_accuracy": 0.9701003551483154,
"step": 8790
},
{
"epoch": 3.3840677313834906,
"grad_norm": 0.1313400660491146,
"learning_rate": 8.160908636832154e-06,
"loss": 0.0862,
"mean_token_accuracy": 0.9699647605419159,
"step": 8795
},
{
"epoch": 3.38599191841447,
"grad_norm": 0.12681342418139757,
"learning_rate": 8.141620135549327e-06,
"loss": 0.0864,
"mean_token_accuracy": 0.97001833319664,
"step": 8800
},
{
"epoch": 3.3879161054454494,
"grad_norm": 0.1267393721622519,
"learning_rate": 8.122386247722907e-06,
"loss": 0.0842,
"mean_token_accuracy": 0.9708464503288269,
"step": 8805
},
{
"epoch": 3.389840292476429,
"grad_norm": 0.13209641998243066,
"learning_rate": 8.103207027615142e-06,
"loss": 0.0877,
"mean_token_accuracy": 0.9694723188877106,
"step": 8810
},
{
"epoch": 3.3917644795074082,
"grad_norm": 0.12532336727924384,
"learning_rate": 8.084082529334043e-06,
"loss": 0.0848,
"mean_token_accuracy": 0.9706017315387726,
"step": 8815
},
{
"epoch": 3.3936886665383876,
"grad_norm": 0.12709702174008902,
"learning_rate": 8.065012806833249e-06,
"loss": 0.0849,
"mean_token_accuracy": 0.9706872940063477,
"step": 8820
},
{
"epoch": 3.395612853569367,
"grad_norm": 0.12864795259469106,
"learning_rate": 8.045997913911862e-06,
"loss": 0.0872,
"mean_token_accuracy": 0.9698288381099701,
"step": 8825
},
{
"epoch": 3.3975370406003464,
"grad_norm": 0.12646239900475942,
"learning_rate": 8.027037904214292e-06,
"loss": 0.0854,
"mean_token_accuracy": 0.9703909814357757,
"step": 8830
},
{
"epoch": 3.399461227631326,
"grad_norm": 0.12565513142414456,
"learning_rate": 8.008132831230126e-06,
"loss": 0.0842,
"mean_token_accuracy": 0.9707223236560821,
"step": 8835
},
{
"epoch": 3.4013854146623053,
"grad_norm": 0.12760894959438268,
"learning_rate": 7.989282748293967e-06,
"loss": 0.0858,
"mean_token_accuracy": 0.9702325403690338,
"step": 8840
},
{
"epoch": 3.4033096016932847,
"grad_norm": 0.12872157096414913,
"learning_rate": 7.970487708585263e-06,
"loss": 0.0849,
"mean_token_accuracy": 0.9705387651920319,
"step": 8845
},
{
"epoch": 3.405233788724264,
"grad_norm": 0.13074364076123604,
"learning_rate": 7.951747765128198e-06,
"loss": 0.0852,
"mean_token_accuracy": 0.9703906655311585,
"step": 8850
},
{
"epoch": 3.4071579757552435,
"grad_norm": 0.12860828576759153,
"learning_rate": 7.933062970791503e-06,
"loss": 0.0825,
"mean_token_accuracy": 0.9713438928127289,
"step": 8855
},
{
"epoch": 3.409082162786223,
"grad_norm": 0.12596652807591652,
"learning_rate": 7.91443337828833e-06,
"loss": 0.0851,
"mean_token_accuracy": 0.970458322763443,
"step": 8860
},
{
"epoch": 3.4110063498172023,
"grad_norm": 0.12706147071281007,
"learning_rate": 7.895859040176094e-06,
"loss": 0.0854,
"mean_token_accuracy": 0.9701674222946167,
"step": 8865
},
{
"epoch": 3.4129305368481817,
"grad_norm": 0.12809440511811315,
"learning_rate": 7.877340008856327e-06,
"loss": 0.086,
"mean_token_accuracy": 0.9701098024845123,
"step": 8870
},
{
"epoch": 3.414854723879161,
"grad_norm": 0.1246733969536944,
"learning_rate": 7.858876336574538e-06,
"loss": 0.0858,
"mean_token_accuracy": 0.9701932430267334,
"step": 8875
},
{
"epoch": 3.4167789109101405,
"grad_norm": 0.13569869596719736,
"learning_rate": 7.840468075420057e-06,
"loss": 0.084,
"mean_token_accuracy": 0.9707090973854064,
"step": 8880
},
{
"epoch": 3.41870309794112,
"grad_norm": 0.12870972415192447,
"learning_rate": 7.822115277325882e-06,
"loss": 0.0864,
"mean_token_accuracy": 0.9700087785720826,
"step": 8885
},
{
"epoch": 3.4206272849720993,
"grad_norm": 0.1233483285767941,
"learning_rate": 7.803817994068534e-06,
"loss": 0.0864,
"mean_token_accuracy": 0.9699972093105316,
"step": 8890
},
{
"epoch": 3.4225514720030787,
"grad_norm": 0.1248853456521094,
"learning_rate": 7.785576277267934e-06,
"loss": 0.0855,
"mean_token_accuracy": 0.9701894283294678,
"step": 8895
},
{
"epoch": 3.424475659034058,
"grad_norm": 0.13033547431633313,
"learning_rate": 7.767390178387231e-06,
"loss": 0.087,
"mean_token_accuracy": 0.9697039365768433,
"step": 8900
},
{
"epoch": 3.4263998460650376,
"grad_norm": 0.1282445236894112,
"learning_rate": 7.749259748732671e-06,
"loss": 0.0865,
"mean_token_accuracy": 0.9699876427650451,
"step": 8905
},
{
"epoch": 3.428324033096017,
"grad_norm": 0.13185925812837074,
"learning_rate": 7.731185039453438e-06,
"loss": 0.0852,
"mean_token_accuracy": 0.9702568829059601,
"step": 8910
},
{
"epoch": 3.4302482201269964,
"grad_norm": 0.12881109452347636,
"learning_rate": 7.713166101541522e-06,
"loss": 0.0842,
"mean_token_accuracy": 0.970571768283844,
"step": 8915
},
{
"epoch": 3.432172407157976,
"grad_norm": 0.12574808543435886,
"learning_rate": 7.695202985831577e-06,
"loss": 0.0844,
"mean_token_accuracy": 0.9705647766590119,
"step": 8920
},
{
"epoch": 3.434096594188955,
"grad_norm": 0.12738407701511564,
"learning_rate": 7.677295743000772e-06,
"loss": 0.0829,
"mean_token_accuracy": 0.9711636543273926,
"step": 8925
},
{
"epoch": 3.4360207812199346,
"grad_norm": 0.12629537602643848,
"learning_rate": 7.659444423568638e-06,
"loss": 0.0864,
"mean_token_accuracy": 0.97016361951828,
"step": 8930
},
{
"epoch": 3.437944968250914,
"grad_norm": 0.13288314080760535,
"learning_rate": 7.641649077896947e-06,
"loss": 0.0849,
"mean_token_accuracy": 0.9705747485160827,
"step": 8935
},
{
"epoch": 3.4398691552818934,
"grad_norm": 0.12750592143154055,
"learning_rate": 7.623909756189565e-06,
"loss": 0.0848,
"mean_token_accuracy": 0.970649367570877,
"step": 8940
},
{
"epoch": 3.441793342312873,
"grad_norm": 0.1261252863356084,
"learning_rate": 7.606226508492286e-06,
"loss": 0.0838,
"mean_token_accuracy": 0.9709120571613312,
"step": 8945
},
{
"epoch": 3.4437175293438522,
"grad_norm": 0.12844346524328273,
"learning_rate": 7.588599384692719e-06,
"loss": 0.0874,
"mean_token_accuracy": 0.9696136832237243,
"step": 8950
},
{
"epoch": 3.4456417163748316,
"grad_norm": 0.128895448462382,
"learning_rate": 7.571028434520136e-06,
"loss": 0.0856,
"mean_token_accuracy": 0.9703655481338501,
"step": 8955
},
{
"epoch": 3.447565903405811,
"grad_norm": 0.12565867696603822,
"learning_rate": 7.553513707545339e-06,
"loss": 0.0837,
"mean_token_accuracy": 0.971028083562851,
"step": 8960
},
{
"epoch": 3.4494900904367904,
"grad_norm": 0.1294464609680728,
"learning_rate": 7.536055253180511e-06,
"loss": 0.084,
"mean_token_accuracy": 0.9708369553089142,
"step": 8965
},
{
"epoch": 3.45141427746777,
"grad_norm": 0.12779041889602735,
"learning_rate": 7.518653120679074e-06,
"loss": 0.085,
"mean_token_accuracy": 0.9704262435436248,
"step": 8970
},
{
"epoch": 3.4533384644987493,
"grad_norm": 0.12443532467597591,
"learning_rate": 7.501307359135556e-06,
"loss": 0.0822,
"mean_token_accuracy": 0.9715291023254394,
"step": 8975
},
{
"epoch": 3.4552626515297287,
"grad_norm": 0.12187677311471684,
"learning_rate": 7.484018017485463e-06,
"loss": 0.0826,
"mean_token_accuracy": 0.9712438225746155,
"step": 8980
},
{
"epoch": 3.457186838560708,
"grad_norm": 0.12822302730085042,
"learning_rate": 7.466785144505123e-06,
"loss": 0.0859,
"mean_token_accuracy": 0.9702449262142181,
"step": 8985
},
{
"epoch": 3.4591110255916875,
"grad_norm": 0.13088413276881763,
"learning_rate": 7.44960878881156e-06,
"loss": 0.0835,
"mean_token_accuracy": 0.9710516095161438,
"step": 8990
},
{
"epoch": 3.461035212622667,
"grad_norm": 0.1225224566435389,
"learning_rate": 7.432488998862341e-06,
"loss": 0.0849,
"mean_token_accuracy": 0.9705376029014587,
"step": 8995
},
{
"epoch": 3.4629593996536463,
"grad_norm": 0.1384362152617989,
"learning_rate": 7.415425822955471e-06,
"loss": 0.0868,
"mean_token_accuracy": 0.9698244988918304,
"step": 9000
},
{
"epoch": 3.4648835866846257,
"grad_norm": 0.14183049964477246,
"learning_rate": 7.398419309229211e-06,
"loss": 0.086,
"mean_token_accuracy": 0.9700653553009033,
"step": 9005
},
{
"epoch": 3.466807773715605,
"grad_norm": 0.12297008925890439,
"learning_rate": 7.3814695056619946e-06,
"loss": 0.0853,
"mean_token_accuracy": 0.9703586101531982,
"step": 9010
},
{
"epoch": 3.4687319607465845,
"grad_norm": 0.12722759866701883,
"learning_rate": 7.364576460072245e-06,
"loss": 0.0859,
"mean_token_accuracy": 0.9700540244579315,
"step": 9015
},
{
"epoch": 3.470656147777564,
"grad_norm": 0.1332776354167734,
"learning_rate": 7.347740220118271e-06,
"loss": 0.0831,
"mean_token_accuracy": 0.9711086690425873,
"step": 9020
},
{
"epoch": 3.4725803348085433,
"grad_norm": 0.13357779167266728,
"learning_rate": 7.330960833298123e-06,
"loss": 0.0848,
"mean_token_accuracy": 0.9706657350063324,
"step": 9025
},
{
"epoch": 3.4745045218395227,
"grad_norm": 0.21084750779345962,
"learning_rate": 7.314238346949456e-06,
"loss": 0.0852,
"mean_token_accuracy": 0.970378315448761,
"step": 9030
},
{
"epoch": 3.476428708870502,
"grad_norm": 0.12639746006093333,
"learning_rate": 7.297572808249399e-06,
"loss": 0.0841,
"mean_token_accuracy": 0.9707970798015595,
"step": 9035
},
{
"epoch": 3.4783528959014816,
"grad_norm": 0.12444375526245546,
"learning_rate": 7.280964264214416e-06,
"loss": 0.0839,
"mean_token_accuracy": 0.9709853947162628,
"step": 9040
},
{
"epoch": 3.480277082932461,
"grad_norm": 0.12516347291350133,
"learning_rate": 7.264412761700186e-06,
"loss": 0.0839,
"mean_token_accuracy": 0.9707713544368743,
"step": 9045
},
{
"epoch": 3.4822012699634404,
"grad_norm": 0.1275743992676517,
"learning_rate": 7.247918347401464e-06,
"loss": 0.0842,
"mean_token_accuracy": 0.9707460880279541,
"step": 9050
},
{
"epoch": 3.48412545699442,
"grad_norm": 0.1264298342167565,
"learning_rate": 7.23148106785195e-06,
"loss": 0.0839,
"mean_token_accuracy": 0.9708225727081299,
"step": 9055
},
{
"epoch": 3.486049644025399,
"grad_norm": 0.1396312586335439,
"learning_rate": 7.21510096942415e-06,
"loss": 0.0853,
"mean_token_accuracy": 0.9702761232852936,
"step": 9060
},
{
"epoch": 3.4879738310563786,
"grad_norm": 0.12600088149391286,
"learning_rate": 7.1987780983292506e-06,
"loss": 0.0835,
"mean_token_accuracy": 0.9709597945213317,
"step": 9065
},
{
"epoch": 3.489898018087358,
"grad_norm": 0.12802814534644144,
"learning_rate": 7.1825125006169986e-06,
"loss": 0.0856,
"mean_token_accuracy": 0.9703998446464539,
"step": 9070
},
{
"epoch": 3.4918222051183374,
"grad_norm": 0.1280563216277743,
"learning_rate": 7.16630422217556e-06,
"loss": 0.0868,
"mean_token_accuracy": 0.9698512375354766,
"step": 9075
},
{
"epoch": 3.493746392149317,
"grad_norm": 0.13260569969650457,
"learning_rate": 7.150153308731388e-06,
"loss": 0.0835,
"mean_token_accuracy": 0.9711039125919342,
"step": 9080
},
{
"epoch": 3.4956705791802962,
"grad_norm": 0.1261891301050121,
"learning_rate": 7.1340598058490995e-06,
"loss": 0.0834,
"mean_token_accuracy": 0.9710277199745179,
"step": 9085
},
{
"epoch": 3.4975947662112756,
"grad_norm": 0.12707472253510566,
"learning_rate": 7.118023758931357e-06,
"loss": 0.0865,
"mean_token_accuracy": 0.9699563682079315,
"step": 9090
},
{
"epoch": 3.499518953242255,
"grad_norm": 0.12499224818434655,
"learning_rate": 7.102045213218714e-06,
"loss": 0.0822,
"mean_token_accuracy": 0.9716532945632934,
"step": 9095
},
{
"epoch": 3.5014431402732344,
"grad_norm": 0.131033648481688,
"learning_rate": 7.086124213789506e-06,
"loss": 0.0866,
"mean_token_accuracy": 0.9699096500873565,
"step": 9100
},
{
"epoch": 3.503367327304214,
"grad_norm": 0.13685556027766135,
"learning_rate": 7.070260805559728e-06,
"loss": 0.0856,
"mean_token_accuracy": 0.9701919317245483,
"step": 9105
},
{
"epoch": 3.5052915143351933,
"grad_norm": 0.12763545572226542,
"learning_rate": 7.054455033282899e-06,
"loss": 0.0805,
"mean_token_accuracy": 0.9719826638698578,
"step": 9110
},
{
"epoch": 3.5072157013661727,
"grad_norm": 0.12934854751913,
"learning_rate": 7.03870694154993e-06,
"loss": 0.0832,
"mean_token_accuracy": 0.9709598600864411,
"step": 9115
},
{
"epoch": 3.509139888397152,
"grad_norm": 0.13067551146740328,
"learning_rate": 7.023016574789008e-06,
"loss": 0.0817,
"mean_token_accuracy": 0.9717234134674072,
"step": 9120
},
{
"epoch": 3.5110640754281315,
"grad_norm": 0.126533600046491,
"learning_rate": 7.007383977265465e-06,
"loss": 0.083,
"mean_token_accuracy": 0.9712643921375275,
"step": 9125
},
{
"epoch": 3.512988262459111,
"grad_norm": 0.1266039435538283,
"learning_rate": 6.991809193081661e-06,
"loss": 0.0867,
"mean_token_accuracy": 0.969794648885727,
"step": 9130
},
{
"epoch": 3.5149124494900903,
"grad_norm": 0.12863707875363184,
"learning_rate": 6.976292266176848e-06,
"loss": 0.0836,
"mean_token_accuracy": 0.9711201667785645,
"step": 9135
},
{
"epoch": 3.5168366365210697,
"grad_norm": 0.1284053461949965,
"learning_rate": 6.9608332403270655e-06,
"loss": 0.0828,
"mean_token_accuracy": 0.9714163899421692,
"step": 9140
},
{
"epoch": 3.518760823552049,
"grad_norm": 0.12531953200720355,
"learning_rate": 6.945432159144982e-06,
"loss": 0.0858,
"mean_token_accuracy": 0.9702307283878326,
"step": 9145
},
{
"epoch": 3.5206850105830285,
"grad_norm": 0.12688678414941892,
"learning_rate": 6.930089066079816e-06,
"loss": 0.0871,
"mean_token_accuracy": 0.9698631703853607,
"step": 9150
},
{
"epoch": 3.522609197614008,
"grad_norm": 0.13018679852235768,
"learning_rate": 6.9148040044171705e-06,
"loss": 0.0836,
"mean_token_accuracy": 0.9709725737571716,
"step": 9155
},
{
"epoch": 3.5245333846449873,
"grad_norm": 0.1391861731141968,
"learning_rate": 6.899577017278952e-06,
"loss": 0.085,
"mean_token_accuracy": 0.9702840864658355,
"step": 9160
},
{
"epoch": 3.5264575716759667,
"grad_norm": 0.12922872655559622,
"learning_rate": 6.8844081476232076e-06,
"loss": 0.0845,
"mean_token_accuracy": 0.9706377267837525,
"step": 9165
},
{
"epoch": 3.5283817587069466,
"grad_norm": 0.13085069031984367,
"learning_rate": 6.869297438244039e-06,
"loss": 0.0835,
"mean_token_accuracy": 0.9710263013839722,
"step": 9170
},
{
"epoch": 3.5303059457379256,
"grad_norm": 0.15929457260546562,
"learning_rate": 6.854244931771467e-06,
"loss": 0.086,
"mean_token_accuracy": 0.970440661907196,
"step": 9175
},
{
"epoch": 3.5322301327689054,
"grad_norm": 0.13113039818982128,
"learning_rate": 6.839250670671308e-06,
"loss": 0.0842,
"mean_token_accuracy": 0.9708048164844513,
"step": 9180
},
{
"epoch": 3.5341543197998844,
"grad_norm": 0.12947898341430028,
"learning_rate": 6.824314697245057e-06,
"loss": 0.0834,
"mean_token_accuracy": 0.9711683690547943,
"step": 9185
},
{
"epoch": 3.536078506830864,
"grad_norm": 0.12805154763824103,
"learning_rate": 6.8094370536297665e-06,
"loss": 0.0853,
"mean_token_accuracy": 0.970403516292572,
"step": 9190
},
{
"epoch": 3.538002693861843,
"grad_norm": 0.12863808995173834,
"learning_rate": 6.794617781797934e-06,
"loss": 0.084,
"mean_token_accuracy": 0.970603746175766,
"step": 9195
},
{
"epoch": 3.539926880892823,
"grad_norm": 0.126568470390018,
"learning_rate": 6.779856923557385e-06,
"loss": 0.086,
"mean_token_accuracy": 0.9701663970947265,
"step": 9200
},
{
"epoch": 3.541851067923802,
"grad_norm": 0.12205274777553857,
"learning_rate": 6.765154520551134e-06,
"loss": 0.0848,
"mean_token_accuracy": 0.9704317271709442,
"step": 9205
},
{
"epoch": 3.543775254954782,
"grad_norm": 0.13594787640050665,
"learning_rate": 6.750510614257306e-06,
"loss": 0.0833,
"mean_token_accuracy": 0.9710303544998169,
"step": 9210
},
{
"epoch": 3.545699441985761,
"grad_norm": 0.125655355880411,
"learning_rate": 6.735925245988972e-06,
"loss": 0.0845,
"mean_token_accuracy": 0.970561021566391,
"step": 9215
},
{
"epoch": 3.5476236290167407,
"grad_norm": 0.12923608243530355,
"learning_rate": 6.72139845689407e-06,
"loss": 0.0829,
"mean_token_accuracy": 0.9711350262165069,
"step": 9220
},
{
"epoch": 3.5495478160477196,
"grad_norm": 0.13742684679218364,
"learning_rate": 6.706930287955278e-06,
"loss": 0.0866,
"mean_token_accuracy": 0.9699180245399475,
"step": 9225
},
{
"epoch": 3.5514720030786995,
"grad_norm": 0.12627623911720126,
"learning_rate": 6.692520779989888e-06,
"loss": 0.0852,
"mean_token_accuracy": 0.9704681694507599,
"step": 9230
},
{
"epoch": 3.5533961901096784,
"grad_norm": 0.12790034643230236,
"learning_rate": 6.678169973649703e-06,
"loss": 0.0869,
"mean_token_accuracy": 0.9698730170726776,
"step": 9235
},
{
"epoch": 3.5553203771406583,
"grad_norm": 0.12746343526254894,
"learning_rate": 6.663877909420924e-06,
"loss": 0.0849,
"mean_token_accuracy": 0.9704817056655883,
"step": 9240
},
{
"epoch": 3.5572445641716373,
"grad_norm": 0.12942624345380135,
"learning_rate": 6.649644627624014e-06,
"loss": 0.0827,
"mean_token_accuracy": 0.9712142050266266,
"step": 9245
},
{
"epoch": 3.559168751202617,
"grad_norm": 0.1294626315055323,
"learning_rate": 6.635470168413616e-06,
"loss": 0.0859,
"mean_token_accuracy": 0.970136284828186,
"step": 9250
},
{
"epoch": 3.561092938233596,
"grad_norm": 0.1287659587086771,
"learning_rate": 6.62135457177841e-06,
"loss": 0.0835,
"mean_token_accuracy": 0.97107794880867,
"step": 9255
},
{
"epoch": 3.563017125264576,
"grad_norm": 0.13124629790306566,
"learning_rate": 6.60729787754103e-06,
"loss": 0.0855,
"mean_token_accuracy": 0.9702930569648742,
"step": 9260
},
{
"epoch": 3.564941312295555,
"grad_norm": 0.13150538121829,
"learning_rate": 6.593300125357932e-06,
"loss": 0.0833,
"mean_token_accuracy": 0.9710995376110076,
"step": 9265
},
{
"epoch": 3.5668654993265347,
"grad_norm": 0.12911652795640852,
"learning_rate": 6.579361354719271e-06,
"loss": 0.0853,
"mean_token_accuracy": 0.9702031135559082,
"step": 9270
},
{
"epoch": 3.5687896863575137,
"grad_norm": 0.1261550636731243,
"learning_rate": 6.565481604948817e-06,
"loss": 0.0823,
"mean_token_accuracy": 0.9713754057884216,
"step": 9275
},
{
"epoch": 3.5707138733884936,
"grad_norm": 0.13217408438340994,
"learning_rate": 6.551660915203834e-06,
"loss": 0.0844,
"mean_token_accuracy": 0.9706252694129944,
"step": 9280
},
{
"epoch": 3.5726380604194725,
"grad_norm": 0.12628151081566255,
"learning_rate": 6.5378993244749655e-06,
"loss": 0.085,
"mean_token_accuracy": 0.9705230236053467,
"step": 9285
},
{
"epoch": 3.5745622474504524,
"grad_norm": 0.12658169932670083,
"learning_rate": 6.524196871586113e-06,
"loss": 0.0849,
"mean_token_accuracy": 0.9705257952213288,
"step": 9290
},
{
"epoch": 3.5764864344814313,
"grad_norm": 0.12600332071058087,
"learning_rate": 6.510553595194359e-06,
"loss": 0.0823,
"mean_token_accuracy": 0.971400386095047,
"step": 9295
},
{
"epoch": 3.578410621512411,
"grad_norm": 0.12864461247839032,
"learning_rate": 6.496969533789829e-06,
"loss": 0.085,
"mean_token_accuracy": 0.9705588459968567,
"step": 9300
},
{
"epoch": 3.58033480854339,
"grad_norm": 0.1288969574385522,
"learning_rate": 6.483444725695587e-06,
"loss": 0.0851,
"mean_token_accuracy": 0.9704531967639923,
"step": 9305
},
{
"epoch": 3.58225899557437,
"grad_norm": 0.1258000907397356,
"learning_rate": 6.469979209067545e-06,
"loss": 0.0848,
"mean_token_accuracy": 0.9705624639987945,
"step": 9310
},
{
"epoch": 3.5841831826053494,
"grad_norm": 0.12880633256077986,
"learning_rate": 6.456573021894331e-06,
"loss": 0.0819,
"mean_token_accuracy": 0.97150416970253,
"step": 9315
},
{
"epoch": 3.586107369636329,
"grad_norm": 0.1289644166825434,
"learning_rate": 6.443226201997204e-06,
"loss": 0.0827,
"mean_token_accuracy": 0.9712906897068023,
"step": 9320
},
{
"epoch": 3.588031556667308,
"grad_norm": 0.12788660079345335,
"learning_rate": 6.4299387870299335e-06,
"loss": 0.0831,
"mean_token_accuracy": 0.9710439383983612,
"step": 9325
},
{
"epoch": 3.5899557436982876,
"grad_norm": 0.12775612967950467,
"learning_rate": 6.416710814478696e-06,
"loss": 0.0832,
"mean_token_accuracy": 0.9710244417190552,
"step": 9330
},
{
"epoch": 3.591879930729267,
"grad_norm": 0.12825385865965377,
"learning_rate": 6.403542321661972e-06,
"loss": 0.0847,
"mean_token_accuracy": 0.9705589354038239,
"step": 9335
},
{
"epoch": 3.5938041177602464,
"grad_norm": 0.12991215195891673,
"learning_rate": 6.390433345730433e-06,
"loss": 0.0851,
"mean_token_accuracy": 0.9702601253986358,
"step": 9340
},
{
"epoch": 3.595728304791226,
"grad_norm": 0.1267840160300084,
"learning_rate": 6.377383923666852e-06,
"loss": 0.0848,
"mean_token_accuracy": 0.9707222580909729,
"step": 9345
},
{
"epoch": 3.5976524918222053,
"grad_norm": 0.12276379955317407,
"learning_rate": 6.3643940922859886e-06,
"loss": 0.0823,
"mean_token_accuracy": 0.9711625814437866,
"step": 9350
},
{
"epoch": 3.5995766788531847,
"grad_norm": 0.13249444718998268,
"learning_rate": 6.351463888234477e-06,
"loss": 0.086,
"mean_token_accuracy": 0.9701505243778229,
"step": 9355
},
{
"epoch": 3.601500865884164,
"grad_norm": 0.13043491753878234,
"learning_rate": 6.338593347990742e-06,
"loss": 0.0841,
"mean_token_accuracy": 0.9708169162273407,
"step": 9360
},
{
"epoch": 3.6034250529151435,
"grad_norm": 0.12976735823843522,
"learning_rate": 6.325782507864881e-06,
"loss": 0.0836,
"mean_token_accuracy": 0.9709677278995514,
"step": 9365
},
{
"epoch": 3.605349239946123,
"grad_norm": 0.12815702589482678,
"learning_rate": 6.313031403998566e-06,
"loss": 0.0834,
"mean_token_accuracy": 0.9709154903888703,
"step": 9370
},
{
"epoch": 3.6072734269771023,
"grad_norm": 0.12979207167662427,
"learning_rate": 6.300340072364952e-06,
"loss": 0.0849,
"mean_token_accuracy": 0.9704569756984711,
"step": 9375
},
{
"epoch": 3.6091976140080817,
"grad_norm": 0.126402904194075,
"learning_rate": 6.287708548768552e-06,
"loss": 0.0834,
"mean_token_accuracy": 0.9711087584495545,
"step": 9380
},
{
"epoch": 3.611121801039061,
"grad_norm": 0.12843425985952908,
"learning_rate": 6.275136868845155e-06,
"loss": 0.0823,
"mean_token_accuracy": 0.9714385747909546,
"step": 9385
},
{
"epoch": 3.6130459880700405,
"grad_norm": 0.13333106158008526,
"learning_rate": 6.26262506806173e-06,
"loss": 0.0829,
"mean_token_accuracy": 0.9713011264801026,
"step": 9390
},
{
"epoch": 3.61497017510102,
"grad_norm": 0.14663434827906371,
"learning_rate": 6.250173181716304e-06,
"loss": 0.0835,
"mean_token_accuracy": 0.9708237528800965,
"step": 9395
},
{
"epoch": 3.6168943621319993,
"grad_norm": 0.12692314981959749,
"learning_rate": 6.23778124493787e-06,
"loss": 0.0843,
"mean_token_accuracy": 0.9708507835865021,
"step": 9400
},
{
"epoch": 3.6188185491629787,
"grad_norm": 0.12203236079210081,
"learning_rate": 6.2254492926863095e-06,
"loss": 0.0845,
"mean_token_accuracy": 0.9706545650959015,
"step": 9405
},
{
"epoch": 3.620742736193958,
"grad_norm": 0.12760412027520115,
"learning_rate": 6.213177359752266e-06,
"loss": 0.0842,
"mean_token_accuracy": 0.9708652913570404,
"step": 9410
},
{
"epoch": 3.6226669232249376,
"grad_norm": 0.13813682121731513,
"learning_rate": 6.200965480757063e-06,
"loss": 0.085,
"mean_token_accuracy": 0.9703601837158203,
"step": 9415
},
{
"epoch": 3.624591110255917,
"grad_norm": 0.12734847847373157,
"learning_rate": 6.188813690152597e-06,
"loss": 0.0847,
"mean_token_accuracy": 0.9707621932029724,
"step": 9420
},
{
"epoch": 3.6265152972868964,
"grad_norm": 0.14059425449796248,
"learning_rate": 6.176722022221239e-06,
"loss": 0.082,
"mean_token_accuracy": 0.9715953230857849,
"step": 9425
},
{
"epoch": 3.6284394843178758,
"grad_norm": 0.12913843992546312,
"learning_rate": 6.164690511075756e-06,
"loss": 0.0837,
"mean_token_accuracy": 0.9708456873893738,
"step": 9430
},
{
"epoch": 3.630363671348855,
"grad_norm": 0.12870779437650948,
"learning_rate": 6.152719190659195e-06,
"loss": 0.0847,
"mean_token_accuracy": 0.9706524848937989,
"step": 9435
},
{
"epoch": 3.6322878583798346,
"grad_norm": 0.13145419132449554,
"learning_rate": 6.1408080947447915e-06,
"loss": 0.0852,
"mean_token_accuracy": 0.9703650951385498,
"step": 9440
},
{
"epoch": 3.634212045410814,
"grad_norm": 0.13406184413384944,
"learning_rate": 6.128957256935885e-06,
"loss": 0.0839,
"mean_token_accuracy": 0.9707401990890503,
"step": 9445
},
{
"epoch": 3.6361362324417934,
"grad_norm": 0.12749027604201754,
"learning_rate": 6.117166710665809e-06,
"loss": 0.0842,
"mean_token_accuracy": 0.9707375228404999,
"step": 9450
},
{
"epoch": 3.638060419472773,
"grad_norm": 0.12517043425750166,
"learning_rate": 6.105436489197804e-06,
"loss": 0.0858,
"mean_token_accuracy": 0.9704214990139007,
"step": 9455
},
{
"epoch": 3.639984606503752,
"grad_norm": 0.13340903779008972,
"learning_rate": 6.093766625624931e-06,
"loss": 0.0826,
"mean_token_accuracy": 0.9715105950832367,
"step": 9460
},
{
"epoch": 3.6419087935347316,
"grad_norm": 0.1264258374381348,
"learning_rate": 6.082157152869959e-06,
"loss": 0.083,
"mean_token_accuracy": 0.97146155834198,
"step": 9465
},
{
"epoch": 3.643832980565711,
"grad_norm": 0.12776191850725374,
"learning_rate": 6.070608103685293e-06,
"loss": 0.0847,
"mean_token_accuracy": 0.9705254197120666,
"step": 9470
},
{
"epoch": 3.6457571675966904,
"grad_norm": 0.12786128161907748,
"learning_rate": 6.059119510652865e-06,
"loss": 0.0851,
"mean_token_accuracy": 0.9703995406627655,
"step": 9475
},
{
"epoch": 3.64768135462767,
"grad_norm": 0.1354404763943873,
"learning_rate": 6.047691406184063e-06,
"loss": 0.0818,
"mean_token_accuracy": 0.9714069902896881,
"step": 9480
},
{
"epoch": 3.6496055416586493,
"grad_norm": 0.12580053609381317,
"learning_rate": 6.036323822519609e-06,
"loss": 0.0853,
"mean_token_accuracy": 0.9704692304134369,
"step": 9485
},
{
"epoch": 3.6515297286896287,
"grad_norm": 0.13263506232122327,
"learning_rate": 6.0250167917294906e-06,
"loss": 0.0859,
"mean_token_accuracy": 0.9703659892082215,
"step": 9490
},
{
"epoch": 3.653453915720608,
"grad_norm": 0.12324365526429366,
"learning_rate": 6.013770345712869e-06,
"loss": 0.0839,
"mean_token_accuracy": 0.9709503173828125,
"step": 9495
},
{
"epoch": 3.6553781027515875,
"grad_norm": 0.12844079414476534,
"learning_rate": 6.0025845161979856e-06,
"loss": 0.0846,
"mean_token_accuracy": 0.9706700563430786,
"step": 9500
},
{
"epoch": 3.657302289782567,
"grad_norm": 0.12068984137989774,
"learning_rate": 5.991459334742063e-06,
"loss": 0.0824,
"mean_token_accuracy": 0.9714205920696258,
"step": 9505
},
{
"epoch": 3.6592264768135463,
"grad_norm": 0.12813199601373393,
"learning_rate": 5.980394832731235e-06,
"loss": 0.0845,
"mean_token_accuracy": 0.9706978559494018,
"step": 9510
},
{
"epoch": 3.6611506638445257,
"grad_norm": 0.13301041854928702,
"learning_rate": 5.96939104138044e-06,
"loss": 0.0848,
"mean_token_accuracy": 0.9705958902835846,
"step": 9515
},
{
"epoch": 3.663074850875505,
"grad_norm": 0.12989984590258896,
"learning_rate": 5.958447991733349e-06,
"loss": 0.0849,
"mean_token_accuracy": 0.9704363226890564,
"step": 9520
},
{
"epoch": 3.6649990379064845,
"grad_norm": 0.13131257145653005,
"learning_rate": 5.947565714662257e-06,
"loss": 0.084,
"mean_token_accuracy": 0.9708589434623718,
"step": 9525
},
{
"epoch": 3.666923224937464,
"grad_norm": 0.12632135625413002,
"learning_rate": 5.936744240868021e-06,
"loss": 0.0846,
"mean_token_accuracy": 0.9707492649555206,
"step": 9530
},
{
"epoch": 3.6688474119684433,
"grad_norm": 0.12895466794249552,
"learning_rate": 5.9259836008799574e-06,
"loss": 0.0849,
"mean_token_accuracy": 0.9705276906490325,
"step": 9535
},
{
"epoch": 3.6707715989994227,
"grad_norm": 0.13088433706746005,
"learning_rate": 5.91528382505576e-06,
"loss": 0.0849,
"mean_token_accuracy": 0.9706717550754547,
"step": 9540
},
{
"epoch": 3.672695786030402,
"grad_norm": 0.12617769608353183,
"learning_rate": 5.9046449435814105e-06,
"loss": 0.0837,
"mean_token_accuracy": 0.9710495352745057,
"step": 9545
},
{
"epoch": 3.6746199730613816,
"grad_norm": 0.12695856992703744,
"learning_rate": 5.894066986471097e-06,
"loss": 0.0815,
"mean_token_accuracy": 0.9717949390411377,
"step": 9550
},
{
"epoch": 3.676544160092361,
"grad_norm": 0.1446820744341158,
"learning_rate": 5.883549983567131e-06,
"loss": 0.0849,
"mean_token_accuracy": 0.9703502655029297,
"step": 9555
},
{
"epoch": 3.6784683471233404,
"grad_norm": 0.13044982400324565,
"learning_rate": 5.8730939645398635e-06,
"loss": 0.0861,
"mean_token_accuracy": 0.9701001346111298,
"step": 9560
},
{
"epoch": 3.6803925341543198,
"grad_norm": 0.12958478222913503,
"learning_rate": 5.862698958887599e-06,
"loss": 0.0846,
"mean_token_accuracy": 0.9706281781196594,
"step": 9565
},
{
"epoch": 3.682316721185299,
"grad_norm": 0.12732133750741922,
"learning_rate": 5.852364995936504e-06,
"loss": 0.0845,
"mean_token_accuracy": 0.9705536603927613,
"step": 9570
},
{
"epoch": 3.6842409082162786,
"grad_norm": 0.13493229587392408,
"learning_rate": 5.842092104840541e-06,
"loss": 0.0841,
"mean_token_accuracy": 0.970888888835907,
"step": 9575
},
{
"epoch": 3.686165095247258,
"grad_norm": 0.1280209800426124,
"learning_rate": 5.831880314581377e-06,
"loss": 0.0834,
"mean_token_accuracy": 0.9710353314876556,
"step": 9580
},
{
"epoch": 3.6880892822782374,
"grad_norm": 0.13102973089755615,
"learning_rate": 5.821729653968301e-06,
"loss": 0.0828,
"mean_token_accuracy": 0.9712984323501587,
"step": 9585
},
{
"epoch": 3.690013469309217,
"grad_norm": 0.1324189617244035,
"learning_rate": 5.811640151638141e-06,
"loss": 0.0832,
"mean_token_accuracy": 0.9712521076202393,
"step": 9590
},
{
"epoch": 3.691937656340196,
"grad_norm": 0.12926347222955453,
"learning_rate": 5.8016118360551925e-06,
"loss": 0.0839,
"mean_token_accuracy": 0.970823872089386,
"step": 9595
},
{
"epoch": 3.6938618433711756,
"grad_norm": 0.1266507464981616,
"learning_rate": 5.7916447355111335e-06,
"loss": 0.0833,
"mean_token_accuracy": 0.9709834456443787,
"step": 9600
},
{
"epoch": 3.695786030402155,
"grad_norm": 0.12877563320490412,
"learning_rate": 5.781738878124935e-06,
"loss": 0.0827,
"mean_token_accuracy": 0.9714086413383484,
"step": 9605
},
{
"epoch": 3.6977102174331344,
"grad_norm": 0.12918938997673993,
"learning_rate": 5.771894291842795e-06,
"loss": 0.0849,
"mean_token_accuracy": 0.9705465793609619,
"step": 9610
},
{
"epoch": 3.699634404464114,
"grad_norm": 0.12659622004044696,
"learning_rate": 5.762111004438051e-06,
"loss": 0.0824,
"mean_token_accuracy": 0.9713815689086914,
"step": 9615
},
{
"epoch": 3.7015585914950933,
"grad_norm": 0.1340281643074414,
"learning_rate": 5.752389043511115e-06,
"loss": 0.0852,
"mean_token_accuracy": 0.9705156087875366,
"step": 9620
},
{
"epoch": 3.7034827785260727,
"grad_norm": 0.13195473895165047,
"learning_rate": 5.74272843648938e-06,
"loss": 0.0843,
"mean_token_accuracy": 0.9707110404968262,
"step": 9625
},
{
"epoch": 3.705406965557052,
"grad_norm": 0.13339870846954704,
"learning_rate": 5.733129210627147e-06,
"loss": 0.0829,
"mean_token_accuracy": 0.971345865726471,
"step": 9630
},
{
"epoch": 3.7073311525880315,
"grad_norm": 0.1303095363345547,
"learning_rate": 5.723591393005545e-06,
"loss": 0.0836,
"mean_token_accuracy": 0.9709326565265656,
"step": 9635
},
{
"epoch": 3.709255339619011,
"grad_norm": 0.13122225618803973,
"learning_rate": 5.714115010532475e-06,
"loss": 0.0829,
"mean_token_accuracy": 0.9712490439414978,
"step": 9640
},
{
"epoch": 3.7111795266499903,
"grad_norm": 0.13520625638718706,
"learning_rate": 5.704700089942502e-06,
"loss": 0.0847,
"mean_token_accuracy": 0.9707024335861206,
"step": 9645
},
{
"epoch": 3.7131037136809697,
"grad_norm": 0.1234826292184828,
"learning_rate": 5.695346657796808e-06,
"loss": 0.0837,
"mean_token_accuracy": 0.9709530830383301,
"step": 9650
},
{
"epoch": 3.715027900711949,
"grad_norm": 0.12783892191660337,
"learning_rate": 5.686054740483098e-06,
"loss": 0.0856,
"mean_token_accuracy": 0.970134836435318,
"step": 9655
},
{
"epoch": 3.7169520877429285,
"grad_norm": 0.13140494485561072,
"learning_rate": 5.6768243642155355e-06,
"loss": 0.0846,
"mean_token_accuracy": 0.9708115994930268,
"step": 9660
},
{
"epoch": 3.718876274773908,
"grad_norm": 0.1279226683859733,
"learning_rate": 5.667655555034663e-06,
"loss": 0.0843,
"mean_token_accuracy": 0.9708460986614227,
"step": 9665
},
{
"epoch": 3.7208004618048873,
"grad_norm": 0.1280860588688012,
"learning_rate": 5.6585483388073375e-06,
"loss": 0.0847,
"mean_token_accuracy": 0.9706709027290344,
"step": 9670
},
{
"epoch": 3.7227246488358667,
"grad_norm": 0.12544249073631844,
"learning_rate": 5.649502741226642e-06,
"loss": 0.0835,
"mean_token_accuracy": 0.9709541261196136,
"step": 9675
},
{
"epoch": 3.724648835866846,
"grad_norm": 0.13036270589362559,
"learning_rate": 5.640518787811829e-06,
"loss": 0.083,
"mean_token_accuracy": 0.9713950634002686,
"step": 9680
},
{
"epoch": 3.7265730228978255,
"grad_norm": 0.12191015362164194,
"learning_rate": 5.631596503908238e-06,
"loss": 0.0837,
"mean_token_accuracy": 0.9709760129451752,
"step": 9685
},
{
"epoch": 3.728497209928805,
"grad_norm": 0.13167359609413348,
"learning_rate": 5.622735914687238e-06,
"loss": 0.0842,
"mean_token_accuracy": 0.9707652926445007,
"step": 9690
},
{
"epoch": 3.7304213969597844,
"grad_norm": 0.14381840216378108,
"learning_rate": 5.613937045146129e-06,
"loss": 0.0849,
"mean_token_accuracy": 0.9705403208732605,
"step": 9695
},
{
"epoch": 3.7323455839907638,
"grad_norm": 0.128441979982536,
"learning_rate": 5.605199920108101e-06,
"loss": 0.0828,
"mean_token_accuracy": 0.9714809536933899,
"step": 9700
},
{
"epoch": 3.734269771021743,
"grad_norm": 0.1293392279553799,
"learning_rate": 5.596524564222146e-06,
"loss": 0.0839,
"mean_token_accuracy": 0.9709720969200134,
"step": 9705
},
{
"epoch": 3.7361939580527226,
"grad_norm": 0.12555112644927274,
"learning_rate": 5.587911001963e-06,
"loss": 0.0832,
"mean_token_accuracy": 0.9711412131786347,
"step": 9710
},
{
"epoch": 3.738118145083702,
"grad_norm": 0.12577695886277074,
"learning_rate": 5.579359257631066e-06,
"loss": 0.0829,
"mean_token_accuracy": 0.9712973713874817,
"step": 9715
},
{
"epoch": 3.7400423321146814,
"grad_norm": 0.1257272476723164,
"learning_rate": 5.570869355352341e-06,
"loss": 0.0828,
"mean_token_accuracy": 0.971248596906662,
"step": 9720
},
{
"epoch": 3.741966519145661,
"grad_norm": 0.1273361782268896,
"learning_rate": 5.562441319078364e-06,
"loss": 0.0823,
"mean_token_accuracy": 0.9714790344238281,
"step": 9725
},
{
"epoch": 3.74389070617664,
"grad_norm": 0.1315909980513443,
"learning_rate": 5.554075172586131e-06,
"loss": 0.0832,
"mean_token_accuracy": 0.9712474584579468,
"step": 9730
},
{
"epoch": 3.7458148932076196,
"grad_norm": 0.1338115526857461,
"learning_rate": 5.545770939478045e-06,
"loss": 0.0859,
"mean_token_accuracy": 0.9701749801635742,
"step": 9735
},
{
"epoch": 3.747739080238599,
"grad_norm": 0.1254777546079142,
"learning_rate": 5.537528643181829e-06,
"loss": 0.0841,
"mean_token_accuracy": 0.9708792209625244,
"step": 9740
},
{
"epoch": 3.7496632672695784,
"grad_norm": 0.13495019415206566,
"learning_rate": 5.5293483069504805e-06,
"loss": 0.0834,
"mean_token_accuracy": 0.9711350321769714,
"step": 9745
},
{
"epoch": 3.751587454300558,
"grad_norm": 0.1325753258034756,
"learning_rate": 5.5212299538621935e-06,
"loss": 0.0824,
"mean_token_accuracy": 0.9715189516544342,
"step": 9750
},
{
"epoch": 3.7535116413315377,
"grad_norm": 0.12845638484747274,
"learning_rate": 5.513173606820293e-06,
"loss": 0.0837,
"mean_token_accuracy": 0.9710196614265442,
"step": 9755
},
{
"epoch": 3.7554358283625167,
"grad_norm": 0.13421034634217963,
"learning_rate": 5.505179288553175e-06,
"loss": 0.0859,
"mean_token_accuracy": 0.9702002882957459,
"step": 9760
},
{
"epoch": 3.7573600153934965,
"grad_norm": 0.12980935458034462,
"learning_rate": 5.497247021614248e-06,
"loss": 0.0816,
"mean_token_accuracy": 0.9716308295726777,
"step": 9765
},
{
"epoch": 3.7592842024244755,
"grad_norm": 0.1292440837300365,
"learning_rate": 5.489376828381857e-06,
"loss": 0.0826,
"mean_token_accuracy": 0.9712635397911071,
"step": 9770
},
{
"epoch": 3.7612083894554553,
"grad_norm": 0.13267514667464766,
"learning_rate": 5.481568731059224e-06,
"loss": 0.0851,
"mean_token_accuracy": 0.970600175857544,
"step": 9775
},
{
"epoch": 3.7631325764864343,
"grad_norm": 0.1328581360184632,
"learning_rate": 5.473822751674394e-06,
"loss": 0.0829,
"mean_token_accuracy": 0.9713373780250549,
"step": 9780
},
{
"epoch": 3.765056763517414,
"grad_norm": 0.12474286120297946,
"learning_rate": 5.466138912080157e-06,
"loss": 0.0846,
"mean_token_accuracy": 0.970719039440155,
"step": 9785
},
{
"epoch": 3.766980950548393,
"grad_norm": 0.12321370554616629,
"learning_rate": 5.458517233954e-06,
"loss": 0.0808,
"mean_token_accuracy": 0.9720763206481934,
"step": 9790
},
{
"epoch": 3.768905137579373,
"grad_norm": 0.12586128975028832,
"learning_rate": 5.450957738798047e-06,
"loss": 0.0828,
"mean_token_accuracy": 0.9712636828422546,
"step": 9795
},
{
"epoch": 3.770829324610352,
"grad_norm": 0.13099616747028167,
"learning_rate": 5.443460447938987e-06,
"loss": 0.0826,
"mean_token_accuracy": 0.9713502109050751,
"step": 9800
},
{
"epoch": 3.7727535116413318,
"grad_norm": 0.13315282807199783,
"learning_rate": 5.436025382528017e-06,
"loss": 0.0847,
"mean_token_accuracy": 0.9707408905029297,
"step": 9805
},
{
"epoch": 3.7746776986723107,
"grad_norm": 0.13012296461998368,
"learning_rate": 5.42865256354079e-06,
"loss": 0.086,
"mean_token_accuracy": 0.9702576816082,
"step": 9810
},
{
"epoch": 3.7766018857032906,
"grad_norm": 0.1301751974531095,
"learning_rate": 5.421342011777347e-06,
"loss": 0.0834,
"mean_token_accuracy": 0.9711966514587402,
"step": 9815
},
{
"epoch": 3.7785260727342695,
"grad_norm": 0.130077690349645,
"learning_rate": 5.414093747862066e-06,
"loss": 0.0831,
"mean_token_accuracy": 0.9712831854820252,
"step": 9820
},
{
"epoch": 3.7804502597652494,
"grad_norm": 0.1267481640529036,
"learning_rate": 5.406907792243597e-06,
"loss": 0.0845,
"mean_token_accuracy": 0.9705725729465484,
"step": 9825
},
{
"epoch": 3.7823744467962284,
"grad_norm": 0.1253012354373941,
"learning_rate": 5.3997841651948045e-06,
"loss": 0.0829,
"mean_token_accuracy": 0.9713540017604828,
"step": 9830
},
{
"epoch": 3.784298633827208,
"grad_norm": 0.132347676217188,
"learning_rate": 5.392722886812721e-06,
"loss": 0.0844,
"mean_token_accuracy": 0.9705925464630127,
"step": 9835
},
{
"epoch": 3.786222820858187,
"grad_norm": 0.1296479996379853,
"learning_rate": 5.3857239770184755e-06,
"loss": 0.0817,
"mean_token_accuracy": 0.9715558767318726,
"step": 9840
},
{
"epoch": 3.788147007889167,
"grad_norm": 0.1260459335050519,
"learning_rate": 5.378787455557247e-06,
"loss": 0.0835,
"mean_token_accuracy": 0.9711081922054291,
"step": 9845
},
{
"epoch": 3.790071194920146,
"grad_norm": 0.12971339976913693,
"learning_rate": 5.3719133419982e-06,
"loss": 0.0842,
"mean_token_accuracy": 0.9707649409770965,
"step": 9850
},
{
"epoch": 3.791995381951126,
"grad_norm": 0.13658938924903805,
"learning_rate": 5.365101655734444e-06,
"loss": 0.0838,
"mean_token_accuracy": 0.970706433057785,
"step": 9855
},
{
"epoch": 3.793919568982105,
"grad_norm": 0.12935536651610716,
"learning_rate": 5.358352415982966e-06,
"loss": 0.0839,
"mean_token_accuracy": 0.9708448588848114,
"step": 9860
},
{
"epoch": 3.7958437560130847,
"grad_norm": 0.12471233681763558,
"learning_rate": 5.351665641784581e-06,
"loss": 0.0833,
"mean_token_accuracy": 0.9710375130176544,
"step": 9865
},
{
"epoch": 3.7977679430440636,
"grad_norm": 0.1254634815523459,
"learning_rate": 5.345041352003874e-06,
"loss": 0.0844,
"mean_token_accuracy": 0.9707024693489075,
"step": 9870
},
{
"epoch": 3.7996921300750435,
"grad_norm": 0.1331609128556762,
"learning_rate": 5.338479565329152e-06,
"loss": 0.0836,
"mean_token_accuracy": 0.9710483849048615,
"step": 9875
},
{
"epoch": 3.8016163171060224,
"grad_norm": 0.12741349365771215,
"learning_rate": 5.331980300272393e-06,
"loss": 0.0831,
"mean_token_accuracy": 0.9710205376148224,
"step": 9880
},
{
"epoch": 3.8035405041370023,
"grad_norm": 0.13611099575339342,
"learning_rate": 5.3255435751691875e-06,
"loss": 0.0831,
"mean_token_accuracy": 0.9711984157562256,
"step": 9885
},
{
"epoch": 3.8054646911679813,
"grad_norm": 0.12980319201233825,
"learning_rate": 5.3191694081786865e-06,
"loss": 0.0841,
"mean_token_accuracy": 0.9707724273204803,
"step": 9890
},
{
"epoch": 3.807388878198961,
"grad_norm": 0.1426123333389128,
"learning_rate": 5.31285781728356e-06,
"loss": 0.0821,
"mean_token_accuracy": 0.9714478254318237,
"step": 9895
},
{
"epoch": 3.8093130652299405,
"grad_norm": 0.1282591232708518,
"learning_rate": 5.306608820289936e-06,
"loss": 0.0829,
"mean_token_accuracy": 0.9712033033370971,
"step": 9900
},
{
"epoch": 3.81123725226092,
"grad_norm": 0.12957055752538663,
"learning_rate": 5.300422434827353e-06,
"loss": 0.0837,
"mean_token_accuracy": 0.9710727035999298,
"step": 9905
},
{
"epoch": 3.8131614392918993,
"grad_norm": 0.1291094611425077,
"learning_rate": 5.2942986783487115e-06,
"loss": 0.0843,
"mean_token_accuracy": 0.9708347141742706,
"step": 9910
},
{
"epoch": 3.8150856263228787,
"grad_norm": 0.126529513057066,
"learning_rate": 5.288237568130227e-06,
"loss": 0.0813,
"mean_token_accuracy": 0.9718260645866394,
"step": 9915
},
{
"epoch": 3.817009813353858,
"grad_norm": 0.13568601561666901,
"learning_rate": 5.282239121271376e-06,
"loss": 0.0833,
"mean_token_accuracy": 0.9709510028362274,
"step": 9920
},
{
"epoch": 3.8189340003848375,
"grad_norm": 0.132814773260242,
"learning_rate": 5.2763033546948515e-06,
"loss": 0.0838,
"mean_token_accuracy": 0.9708267450332642,
"step": 9925
},
{
"epoch": 3.820858187415817,
"grad_norm": 0.13132014977440104,
"learning_rate": 5.270430285146514e-06,
"loss": 0.085,
"mean_token_accuracy": 0.970295125246048,
"step": 9930
},
{
"epoch": 3.8227823744467964,
"grad_norm": 0.133290690508834,
"learning_rate": 5.264619929195344e-06,
"loss": 0.0834,
"mean_token_accuracy": 0.971205323934555,
"step": 9935
},
{
"epoch": 3.8247065614777758,
"grad_norm": 0.1314362482054655,
"learning_rate": 5.258872303233397e-06,
"loss": 0.0843,
"mean_token_accuracy": 0.9707643210887908,
"step": 9940
},
{
"epoch": 3.826630748508755,
"grad_norm": 0.12533084082719861,
"learning_rate": 5.253187423475754e-06,
"loss": 0.0838,
"mean_token_accuracy": 0.9710265934467316,
"step": 9945
},
{
"epoch": 3.8285549355397346,
"grad_norm": 0.12911160533507593,
"learning_rate": 5.247565305960483e-06,
"loss": 0.0839,
"mean_token_accuracy": 0.9709489643573761,
"step": 9950
},
{
"epoch": 3.830479122570714,
"grad_norm": 0.12233415417364961,
"learning_rate": 5.242005966548577e-06,
"loss": 0.0832,
"mean_token_accuracy": 0.9711369574069977,
"step": 9955
},
{
"epoch": 3.8324033096016934,
"grad_norm": 0.13001856988459168,
"learning_rate": 5.236509420923935e-06,
"loss": 0.0842,
"mean_token_accuracy": 0.9707648396492005,
"step": 9960
},
{
"epoch": 3.834327496632673,
"grad_norm": 0.1289324823675169,
"learning_rate": 5.23107568459329e-06,
"loss": 0.0864,
"mean_token_accuracy": 0.9699880659580231,
"step": 9965
},
{
"epoch": 3.836251683663652,
"grad_norm": 0.12593253880586072,
"learning_rate": 5.225704772886192e-06,
"loss": 0.0826,
"mean_token_accuracy": 0.9715697944164277,
"step": 9970
},
{
"epoch": 3.8381758706946316,
"grad_norm": 0.13267526238451943,
"learning_rate": 5.220396700954941e-06,
"loss": 0.0837,
"mean_token_accuracy": 0.9709756135940552,
"step": 9975
},
{
"epoch": 3.840100057725611,
"grad_norm": 0.12821941199142406,
"learning_rate": 5.215151483774559e-06,
"loss": 0.0854,
"mean_token_accuracy": 0.9702714800834655,
"step": 9980
},
{
"epoch": 3.8420242447565904,
"grad_norm": 0.13010407450206193,
"learning_rate": 5.209969136142742e-06,
"loss": 0.0841,
"mean_token_accuracy": 0.9708271741867065,
"step": 9985
},
{
"epoch": 3.84394843178757,
"grad_norm": 0.12906298041128167,
"learning_rate": 5.204849672679825e-06,
"loss": 0.0834,
"mean_token_accuracy": 0.9711850464344025,
"step": 9990
},
{
"epoch": 3.8458726188185492,
"grad_norm": 0.1357119125248045,
"learning_rate": 5.199793107828726e-06,
"loss": 0.0822,
"mean_token_accuracy": 0.9714503288269043,
"step": 9995
},
{
"epoch": 3.8477968058495287,
"grad_norm": 0.12898214151698506,
"learning_rate": 5.194799455854923e-06,
"loss": 0.0843,
"mean_token_accuracy": 0.9707751035690307,
"step": 10000
},
{
"epoch": 3.849720992880508,
"grad_norm": 0.12936479325355857,
"learning_rate": 5.189868730846402e-06,
"loss": 0.0835,
"mean_token_accuracy": 0.971122008562088,
"step": 10005
},
{
"epoch": 3.8516451799114875,
"grad_norm": 0.12638559348310224,
"learning_rate": 5.185000946713621e-06,
"loss": 0.0821,
"mean_token_accuracy": 0.9716223776340485,
"step": 10010
},
{
"epoch": 3.853569366942467,
"grad_norm": 0.1274584727626304,
"learning_rate": 5.180196117189471e-06,
"loss": 0.083,
"mean_token_accuracy": 0.9712328910827637,
"step": 10015
},
{
"epoch": 3.8554935539734463,
"grad_norm": 0.12934902608342166,
"learning_rate": 5.175454255829237e-06,
"loss": 0.0822,
"mean_token_accuracy": 0.9715910375118255,
"step": 10020
},
{
"epoch": 3.8574177410044257,
"grad_norm": 0.12649848202572228,
"learning_rate": 5.170775376010558e-06,
"loss": 0.0825,
"mean_token_accuracy": 0.9712820768356323,
"step": 10025
},
{
"epoch": 3.859341928035405,
"grad_norm": 0.12602422849900147,
"learning_rate": 5.166159490933391e-06,
"loss": 0.0824,
"mean_token_accuracy": 0.9714111924171448,
"step": 10030
},
{
"epoch": 3.8612661150663845,
"grad_norm": 0.12771046568125663,
"learning_rate": 5.161606613619979e-06,
"loss": 0.0853,
"mean_token_accuracy": 0.9702588737010955,
"step": 10035
},
{
"epoch": 3.863190302097364,
"grad_norm": 0.12819251323149022,
"learning_rate": 5.157116756914799e-06,
"loss": 0.0855,
"mean_token_accuracy": 0.9703081130981446,
"step": 10040
},
{
"epoch": 3.8651144891283433,
"grad_norm": 0.12439719751765296,
"learning_rate": 5.152689933484543e-06,
"loss": 0.0829,
"mean_token_accuracy": 0.9711783528327942,
"step": 10045
},
{
"epoch": 3.8670386761593227,
"grad_norm": 0.1314106629943747,
"learning_rate": 5.148326155818074e-06,
"loss": 0.0834,
"mean_token_accuracy": 0.9711025774478912,
"step": 10050
},
{
"epoch": 3.868962863190302,
"grad_norm": 0.1250518592899649,
"learning_rate": 5.144025436226387e-06,
"loss": 0.0827,
"mean_token_accuracy": 0.9713931560516358,
"step": 10055
},
{
"epoch": 3.8708870502212815,
"grad_norm": 0.13078642303132598,
"learning_rate": 5.139787786842584e-06,
"loss": 0.0864,
"mean_token_accuracy": 0.9698314964771271,
"step": 10060
},
{
"epoch": 3.872811237252261,
"grad_norm": 0.12923350352929674,
"learning_rate": 5.135613219621834e-06,
"loss": 0.0836,
"mean_token_accuracy": 0.9711137771606445,
"step": 10065
},
{
"epoch": 3.8747354242832404,
"grad_norm": 0.12563235433177386,
"learning_rate": 5.131501746341337e-06,
"loss": 0.0826,
"mean_token_accuracy": 0.971451610326767,
"step": 10070
},
{
"epoch": 3.8766596113142198,
"grad_norm": 0.13197299075249083,
"learning_rate": 5.127453378600299e-06,
"loss": 0.0851,
"mean_token_accuracy": 0.9704861402511596,
"step": 10075
},
{
"epoch": 3.878583798345199,
"grad_norm": 0.12846446070815468,
"learning_rate": 5.123468127819885e-06,
"loss": 0.0839,
"mean_token_accuracy": 0.970909696817398,
"step": 10080
},
{
"epoch": 3.8805079853761786,
"grad_norm": 0.13823440456106123,
"learning_rate": 5.1195460052432016e-06,
"loss": 0.0837,
"mean_token_accuracy": 0.970853990316391,
"step": 10085
},
{
"epoch": 3.882432172407158,
"grad_norm": 0.12688446312090473,
"learning_rate": 5.1156870219352635e-06,
"loss": 0.0861,
"mean_token_accuracy": 0.9702435910701752,
"step": 10090
},
{
"epoch": 3.8843563594381374,
"grad_norm": 0.12591304427956893,
"learning_rate": 5.111891188782951e-06,
"loss": 0.0821,
"mean_token_accuracy": 0.9713928639888764,
"step": 10095
},
{
"epoch": 3.886280546469117,
"grad_norm": 0.13381926417216353,
"learning_rate": 5.108158516494989e-06,
"loss": 0.0844,
"mean_token_accuracy": 0.970725280046463,
"step": 10100
},
{
"epoch": 3.888204733500096,
"grad_norm": 0.13227264592543717,
"learning_rate": 5.104489015601915e-06,
"loss": 0.0819,
"mean_token_accuracy": 0.9715412676334381,
"step": 10105
},
{
"epoch": 3.8901289205310756,
"grad_norm": 0.1332023257435153,
"learning_rate": 5.100882696456047e-06,
"loss": 0.0835,
"mean_token_accuracy": 0.9710934042930603,
"step": 10110
},
{
"epoch": 3.892053107562055,
"grad_norm": 0.1328629268332128,
"learning_rate": 5.09733956923146e-06,
"loss": 0.0832,
"mean_token_accuracy": 0.9711413741111755,
"step": 10115
},
{
"epoch": 3.8939772945930344,
"grad_norm": 0.12670490122460273,
"learning_rate": 5.093859643923948e-06,
"loss": 0.0824,
"mean_token_accuracy": 0.9714822411537171,
"step": 10120
},
{
"epoch": 3.895901481624014,
"grad_norm": 0.12499373344855456,
"learning_rate": 5.090442930351005e-06,
"loss": 0.0844,
"mean_token_accuracy": 0.9706906914710999,
"step": 10125
},
{
"epoch": 3.8978256686549932,
"grad_norm": 0.1245399094578281,
"learning_rate": 5.087089438151788e-06,
"loss": 0.0831,
"mean_token_accuracy": 0.9712965786457062,
"step": 10130
},
{
"epoch": 3.8997498556859727,
"grad_norm": 0.1301572958641481,
"learning_rate": 5.083799176787104e-06,
"loss": 0.0844,
"mean_token_accuracy": 0.9707218647003174,
"step": 10135
},
{
"epoch": 3.901674042716952,
"grad_norm": 0.13022481008797088,
"learning_rate": 5.080572155539369e-06,
"loss": 0.0862,
"mean_token_accuracy": 0.9702335953712463,
"step": 10140
},
{
"epoch": 3.9035982297479315,
"grad_norm": 0.12837698874608652,
"learning_rate": 5.077408383512584e-06,
"loss": 0.0805,
"mean_token_accuracy": 0.9720855534076691,
"step": 10145
},
{
"epoch": 3.905522416778911,
"grad_norm": 0.12753857050900713,
"learning_rate": 5.074307869632321e-06,
"loss": 0.0834,
"mean_token_accuracy": 0.9710967183113098,
"step": 10150
},
{
"epoch": 3.9074466038098903,
"grad_norm": 0.12859493835044503,
"learning_rate": 5.0712706226456835e-06,
"loss": 0.0824,
"mean_token_accuracy": 0.971541553735733,
"step": 10155
},
{
"epoch": 3.9093707908408697,
"grad_norm": 0.12584170646455595,
"learning_rate": 5.068296651121286e-06,
"loss": 0.082,
"mean_token_accuracy": 0.9715514481067657,
"step": 10160
},
{
"epoch": 3.911294977871849,
"grad_norm": 0.1269583447627004,
"learning_rate": 5.06538596344924e-06,
"loss": 0.0821,
"mean_token_accuracy": 0.9715434312820435,
"step": 10165
},
{
"epoch": 3.9132191649028285,
"grad_norm": 0.13090408555934774,
"learning_rate": 5.062538567841114e-06,
"loss": 0.0814,
"mean_token_accuracy": 0.9717414677143097,
"step": 10170
},
{
"epoch": 3.915143351933808,
"grad_norm": 0.1301397066057007,
"learning_rate": 5.059754472329919e-06,
"loss": 0.0822,
"mean_token_accuracy": 0.9714883327484131,
"step": 10175
},
{
"epoch": 3.9170675389647873,
"grad_norm": 0.1265616885928072,
"learning_rate": 5.0570336847700875e-06,
"loss": 0.0836,
"mean_token_accuracy": 0.9710824906826019,
"step": 10180
},
{
"epoch": 3.9189917259957667,
"grad_norm": 0.1292654952944248,
"learning_rate": 5.054376212837453e-06,
"loss": 0.0799,
"mean_token_accuracy": 0.9723989367485046,
"step": 10185
},
{
"epoch": 3.920915913026746,
"grad_norm": 0.12506948699138926,
"learning_rate": 5.051782064029214e-06,
"loss": 0.0843,
"mean_token_accuracy": 0.9707196116447449,
"step": 10190
},
{
"epoch": 3.9228401000577255,
"grad_norm": 0.12625564064244693,
"learning_rate": 5.0492512456639325e-06,
"loss": 0.0839,
"mean_token_accuracy": 0.9709169268608093,
"step": 10195
},
{
"epoch": 3.924764287088705,
"grad_norm": 0.13116148538043676,
"learning_rate": 5.046783764881503e-06,
"loss": 0.0838,
"mean_token_accuracy": 0.9707896530628204,
"step": 10200
},
{
"epoch": 3.9266884741196844,
"grad_norm": 0.1284618138441698,
"learning_rate": 5.044379628643123e-06,
"loss": 0.0821,
"mean_token_accuracy": 0.9715276062488556,
"step": 10205
},
{
"epoch": 3.9286126611506638,
"grad_norm": 0.13214132106477244,
"learning_rate": 5.0420388437312975e-06,
"loss": 0.0851,
"mean_token_accuracy": 0.9706056773662567,
"step": 10210
},
{
"epoch": 3.930536848181643,
"grad_norm": 0.1348864793969741,
"learning_rate": 5.039761416749797e-06,
"loss": 0.0829,
"mean_token_accuracy": 0.9712868213653565,
"step": 10215
},
{
"epoch": 3.9324610352126226,
"grad_norm": 0.1289711702781738,
"learning_rate": 5.037547354123652e-06,
"loss": 0.0787,
"mean_token_accuracy": 0.9727490663528442,
"step": 10220
},
{
"epoch": 3.934385222243602,
"grad_norm": 0.1271638377917533,
"learning_rate": 5.035396662099127e-06,
"loss": 0.0819,
"mean_token_accuracy": 0.9715701401233673,
"step": 10225
},
{
"epoch": 3.9363094092745814,
"grad_norm": 0.13920813700768522,
"learning_rate": 5.033309346743716e-06,
"loss": 0.0814,
"mean_token_accuracy": 0.9715500593185424,
"step": 10230
},
{
"epoch": 3.938233596305561,
"grad_norm": 0.1270314032330905,
"learning_rate": 5.031285413946101e-06,
"loss": 0.083,
"mean_token_accuracy": 0.9712082087993622,
"step": 10235
},
{
"epoch": 3.94015778333654,
"grad_norm": 0.13403626861564544,
"learning_rate": 5.0293248694161665e-06,
"loss": 0.0856,
"mean_token_accuracy": 0.9704956352710724,
"step": 10240
},
{
"epoch": 3.9420819703675196,
"grad_norm": 0.1309227117524348,
"learning_rate": 5.027427718684955e-06,
"loss": 0.0833,
"mean_token_accuracy": 0.9710226118564605,
"step": 10245
},
{
"epoch": 3.944006157398499,
"grad_norm": 0.13231010952518213,
"learning_rate": 5.02559396710467e-06,
"loss": 0.081,
"mean_token_accuracy": 0.9719463467597962,
"step": 10250
},
{
"epoch": 3.9459303444294784,
"grad_norm": 0.13128683536722258,
"learning_rate": 5.023823619848651e-06,
"loss": 0.0828,
"mean_token_accuracy": 0.9711349666118622,
"step": 10255
},
{
"epoch": 3.947854531460458,
"grad_norm": 0.12653554607775047,
"learning_rate": 5.02211668191137e-06,
"loss": 0.0823,
"mean_token_accuracy": 0.9714332520961761,
"step": 10260
},
{
"epoch": 3.9497787184914372,
"grad_norm": 0.13474648427718314,
"learning_rate": 5.020473158108398e-06,
"loss": 0.0829,
"mean_token_accuracy": 0.9714989304542542,
"step": 10265
},
{
"epoch": 3.9517029055224167,
"grad_norm": 0.13044370419525142,
"learning_rate": 5.0188930530764155e-06,
"loss": 0.0824,
"mean_token_accuracy": 0.9714354455471039,
"step": 10270
},
{
"epoch": 3.953627092553396,
"grad_norm": 0.12661398006963123,
"learning_rate": 5.017376371273177e-06,
"loss": 0.0834,
"mean_token_accuracy": 0.9711092829704284,
"step": 10275
},
{
"epoch": 3.9555512795843755,
"grad_norm": 0.1387673625001325,
"learning_rate": 5.015923116977517e-06,
"loss": 0.0818,
"mean_token_accuracy": 0.9715732455253601,
"step": 10280
},
{
"epoch": 3.957475466615355,
"grad_norm": 0.12749658417248053,
"learning_rate": 5.014533294289326e-06,
"loss": 0.0822,
"mean_token_accuracy": 0.9713805615901947,
"step": 10285
},
{
"epoch": 3.9593996536463343,
"grad_norm": 0.14480554531550074,
"learning_rate": 5.013206907129543e-06,
"loss": 0.0839,
"mean_token_accuracy": 0.9709558188915253,
"step": 10290
},
{
"epoch": 3.9613238406773137,
"grad_norm": 0.1340795746063367,
"learning_rate": 5.011943959240139e-06,
"loss": 0.0814,
"mean_token_accuracy": 0.9717664599418641,
"step": 10295
},
{
"epoch": 3.963248027708293,
"grad_norm": 0.13577146914254368,
"learning_rate": 5.010744454184122e-06,
"loss": 0.0808,
"mean_token_accuracy": 0.9720539331436158,
"step": 10300
},
{
"epoch": 3.9651722147392725,
"grad_norm": 0.13278406905491066,
"learning_rate": 5.009608395345504e-06,
"loss": 0.0817,
"mean_token_accuracy": 0.9717506885528564,
"step": 10305
},
{
"epoch": 3.967096401770252,
"grad_norm": 0.12880093192507883,
"learning_rate": 5.008535785929318e-06,
"loss": 0.0834,
"mean_token_accuracy": 0.9709747076034546,
"step": 10310
},
{
"epoch": 3.9690205888012313,
"grad_norm": 0.12894048623160223,
"learning_rate": 5.007526628961585e-06,
"loss": 0.0823,
"mean_token_accuracy": 0.9715463817119598,
"step": 10315
},
{
"epoch": 3.9709447758322107,
"grad_norm": 0.13374200890172133,
"learning_rate": 5.0065809272893155e-06,
"loss": 0.0841,
"mean_token_accuracy": 0.970946192741394,
"step": 10320
},
{
"epoch": 3.97286896286319,
"grad_norm": 0.1347420834773057,
"learning_rate": 5.0056986835805045e-06,
"loss": 0.0831,
"mean_token_accuracy": 0.9711974918842315,
"step": 10325
},
{
"epoch": 3.97479314989417,
"grad_norm": 0.129123768389722,
"learning_rate": 5.004879900324117e-06,
"loss": 0.083,
"mean_token_accuracy": 0.9712783515453338,
"step": 10330
},
{
"epoch": 3.976717336925149,
"grad_norm": 0.12496839255961688,
"learning_rate": 5.004124579830091e-06,
"loss": 0.0828,
"mean_token_accuracy": 0.9713409304618835,
"step": 10335
},
{
"epoch": 3.978641523956129,
"grad_norm": 0.12993674670818847,
"learning_rate": 5.003432724229319e-06,
"loss": 0.0842,
"mean_token_accuracy": 0.970902556180954,
"step": 10340
},
{
"epoch": 3.9805657109871078,
"grad_norm": 0.1328995851373331,
"learning_rate": 5.002804335473649e-06,
"loss": 0.0855,
"mean_token_accuracy": 0.9702074944972991,
"step": 10345
},
{
"epoch": 3.9824898980180876,
"grad_norm": 0.1283922514424663,
"learning_rate": 5.0022394153358796e-06,
"loss": 0.0838,
"mean_token_accuracy": 0.9709455788135528,
"step": 10350
},
{
"epoch": 3.9844140850490666,
"grad_norm": 0.12695338540041073,
"learning_rate": 5.001737965409753e-06,
"loss": 0.0812,
"mean_token_accuracy": 0.9720363140106201,
"step": 10355
},
{
"epoch": 3.9863382720800464,
"grad_norm": 0.12901800004736622,
"learning_rate": 5.001299987109945e-06,
"loss": 0.0819,
"mean_token_accuracy": 0.9715746283531189,
"step": 10360
},
{
"epoch": 3.9882624591110254,
"grad_norm": 0.13476223421856068,
"learning_rate": 5.0009254816720735e-06,
"loss": 0.0824,
"mean_token_accuracy": 0.9714550971984863,
"step": 10365
},
{
"epoch": 3.9901866461420052,
"grad_norm": 0.13027831801951967,
"learning_rate": 5.000614450152687e-06,
"loss": 0.0833,
"mean_token_accuracy": 0.9711871147155762,
"step": 10370
},
{
"epoch": 3.992110833172984,
"grad_norm": 0.13494495469316733,
"learning_rate": 5.000366893429256e-06,
"loss": 0.0819,
"mean_token_accuracy": 0.971628975868225,
"step": 10375
},
{
"epoch": 3.994035020203964,
"grad_norm": 0.12309697899654617,
"learning_rate": 5.000182812200186e-06,
"loss": 0.084,
"mean_token_accuracy": 0.9710172712802887,
"step": 10380
},
{
"epoch": 3.995959207234943,
"grad_norm": 0.13140302440475887,
"learning_rate": 5.000062206984804e-06,
"loss": 0.0838,
"mean_token_accuracy": 0.9709397912025451,
"step": 10385
},
{
"epoch": 3.997883394265923,
"grad_norm": 0.13152660776564254,
"learning_rate": 5.000005078123357e-06,
"loss": 0.0796,
"mean_token_accuracy": 0.972462397813797,
"step": 10390
},
{
"epoch": 3.9986530690783146,
"mean_token_accuracy": 0.9705722630023956,
"step": 10392,
"total_flos": 5409408689373184.0,
"train_loss": 0.14862096939577196,
"train_runtime": 59690.9668,
"train_samples_per_second": 2.786,
"train_steps_per_second": 0.174
}
],
"logging_steps": 5,
"max_steps": 10392,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5409408689373184.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}