SmolLM-3B-Science-FR / trainer_state.json
toroe's picture
Upload folder using huggingface_hub
5ea622b verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 3036,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004941321803582459,
"grad_norm": 3.079270839691162,
"learning_rate": 1.092896174863388e-06,
"loss": 1.004,
"mean_token_accuracy": 0.7358035773038865,
"num_tokens": 10405489.0,
"step": 5
},
{
"epoch": 0.009882643607164917,
"grad_norm": 1.1023986339569092,
"learning_rate": 2.459016393442623e-06,
"loss": 0.967,
"mean_token_accuracy": 0.7401202037930489,
"num_tokens": 20812735.0,
"step": 10
},
{
"epoch": 0.014823965410747375,
"grad_norm": 1.2933658361434937,
"learning_rate": 3.825136612021858e-06,
"loss": 0.9421,
"mean_token_accuracy": 0.7410167530179024,
"num_tokens": 31206554.0,
"step": 15
},
{
"epoch": 0.019765287214329835,
"grad_norm": 1.6906228065490723,
"learning_rate": 5.191256830601094e-06,
"loss": 0.9049,
"mean_token_accuracy": 0.7487473994493484,
"num_tokens": 41618401.0,
"step": 20
},
{
"epoch": 0.02470660901791229,
"grad_norm": 0.48920488357543945,
"learning_rate": 6.557377049180328e-06,
"loss": 0.8892,
"mean_token_accuracy": 0.7520575806498527,
"num_tokens": 52016212.0,
"step": 25
},
{
"epoch": 0.02964793082149475,
"grad_norm": 0.4231764078140259,
"learning_rate": 7.923497267759564e-06,
"loss": 0.9592,
"mean_token_accuracy": 0.7517324000597,
"num_tokens": 62414110.0,
"step": 30
},
{
"epoch": 0.034589252625077206,
"grad_norm": 0.3153907358646393,
"learning_rate": 9.2896174863388e-06,
"loss": 0.8669,
"mean_token_accuracy": 0.7558081388473511,
"num_tokens": 72804943.0,
"step": 35
},
{
"epoch": 0.03953057442865967,
"grad_norm": 0.26243749260902405,
"learning_rate": 1.0655737704918032e-05,
"loss": 0.8518,
"mean_token_accuracy": 0.7586903437972069,
"num_tokens": 83189221.0,
"step": 40
},
{
"epoch": 0.044471896232242125,
"grad_norm": 0.23591159284114838,
"learning_rate": 1.2021857923497268e-05,
"loss": 0.8462,
"mean_token_accuracy": 0.759694167971611,
"num_tokens": 93605213.0,
"step": 45
},
{
"epoch": 0.04941321803582458,
"grad_norm": 0.22021469473838806,
"learning_rate": 1.3387978142076505e-05,
"loss": 0.8408,
"mean_token_accuracy": 0.7603029757738113,
"num_tokens": 104010903.0,
"step": 50
},
{
"epoch": 0.054354539839407044,
"grad_norm": 0.20398813486099243,
"learning_rate": 1.4754098360655739e-05,
"loss": 0.8297,
"mean_token_accuracy": 0.7627619341015816,
"num_tokens": 114411921.0,
"step": 55
},
{
"epoch": 0.0592958616429895,
"grad_norm": 0.21533672511577606,
"learning_rate": 1.6120218579234975e-05,
"loss": 0.8144,
"mean_token_accuracy": 0.7663307622075081,
"num_tokens": 124801309.0,
"step": 60
},
{
"epoch": 0.06423718344657196,
"grad_norm": 0.21680276095867157,
"learning_rate": 1.7486338797814207e-05,
"loss": 0.8258,
"mean_token_accuracy": 0.7630844265222549,
"num_tokens": 135229970.0,
"step": 65
},
{
"epoch": 0.06917850525015441,
"grad_norm": 0.19620661437511444,
"learning_rate": 1.8852459016393442e-05,
"loss": 0.8129,
"mean_token_accuracy": 0.7654996693134308,
"num_tokens": 145598239.0,
"step": 70
},
{
"epoch": 0.07411982705373688,
"grad_norm": 0.2198481261730194,
"learning_rate": 2.0218579234972678e-05,
"loss": 0.8055,
"mean_token_accuracy": 0.7672501325607299,
"num_tokens": 155991018.0,
"step": 75
},
{
"epoch": 0.07906114885731934,
"grad_norm": 0.25267091393470764,
"learning_rate": 2.1584699453551914e-05,
"loss": 0.8007,
"mean_token_accuracy": 0.7685669183731079,
"num_tokens": 166394949.0,
"step": 80
},
{
"epoch": 0.08400247066090179,
"grad_norm": 0.3027536869049072,
"learning_rate": 2.295081967213115e-05,
"loss": 0.8084,
"mean_token_accuracy": 0.7663731932640075,
"num_tokens": 176796419.0,
"step": 85
},
{
"epoch": 0.08894379246448425,
"grad_norm": 0.37499961256980896,
"learning_rate": 2.431693989071038e-05,
"loss": 0.8014,
"mean_token_accuracy": 0.7681452915072441,
"num_tokens": 187215188.0,
"step": 90
},
{
"epoch": 0.09388511426806671,
"grad_norm": 0.49170613288879395,
"learning_rate": 2.568306010928962e-05,
"loss": 0.8062,
"mean_token_accuracy": 0.7694577813148499,
"num_tokens": 197599168.0,
"step": 95
},
{
"epoch": 0.09882643607164916,
"grad_norm": 7.697641372680664,
"learning_rate": 2.7049180327868856e-05,
"loss": 0.8025,
"mean_token_accuracy": 0.7678478330373764,
"num_tokens": 208023285.0,
"step": 100
},
{
"epoch": 0.10376775787523163,
"grad_norm": 0.9521784782409668,
"learning_rate": 2.841530054644809e-05,
"loss": 0.7994,
"mean_token_accuracy": 0.7675640687346459,
"num_tokens": 218443542.0,
"step": 105
},
{
"epoch": 0.10870907967881409,
"grad_norm": 0.48149237036705017,
"learning_rate": 2.9781420765027324e-05,
"loss": 0.8044,
"mean_token_accuracy": 0.7670532912015915,
"num_tokens": 228855088.0,
"step": 110
},
{
"epoch": 0.11365040148239654,
"grad_norm": 0.5225210189819336,
"learning_rate": 3.114754098360656e-05,
"loss": 0.7831,
"mean_token_accuracy": 0.7721872299909591,
"num_tokens": 239245255.0,
"step": 115
},
{
"epoch": 0.118591723285979,
"grad_norm": 0.6328762769699097,
"learning_rate": 3.251366120218579e-05,
"loss": 0.792,
"mean_token_accuracy": 0.7695287197828293,
"num_tokens": 249641901.0,
"step": 120
},
{
"epoch": 0.12353304508956146,
"grad_norm": 0.47159990668296814,
"learning_rate": 3.387978142076503e-05,
"loss": 0.7973,
"mean_token_accuracy": 0.76757872402668,
"num_tokens": 260014931.0,
"step": 125
},
{
"epoch": 0.12847436689314393,
"grad_norm": 0.26565825939178467,
"learning_rate": 3.524590163934427e-05,
"loss": 0.7784,
"mean_token_accuracy": 0.7727497264742851,
"num_tokens": 270399134.0,
"step": 130
},
{
"epoch": 0.13341568869672638,
"grad_norm": 0.34533897042274475,
"learning_rate": 3.66120218579235e-05,
"loss": 0.7761,
"mean_token_accuracy": 0.7732908591628075,
"num_tokens": 280819722.0,
"step": 135
},
{
"epoch": 0.13835701050030882,
"grad_norm": 0.32269155979156494,
"learning_rate": 3.797814207650273e-05,
"loss": 0.7743,
"mean_token_accuracy": 0.7737134978175163,
"num_tokens": 291234416.0,
"step": 140
},
{
"epoch": 0.1432983323038913,
"grad_norm": 0.418888658285141,
"learning_rate": 3.934426229508197e-05,
"loss": 0.7832,
"mean_token_accuracy": 0.7711038008332253,
"num_tokens": 301649374.0,
"step": 145
},
{
"epoch": 0.14823965410747375,
"grad_norm": 2.6427414417266846,
"learning_rate": 4.07103825136612e-05,
"loss": 0.825,
"mean_token_accuracy": 0.766799908876419,
"num_tokens": 312034521.0,
"step": 150
},
{
"epoch": 0.1531809759110562,
"grad_norm": 2.3590986728668213,
"learning_rate": 4.207650273224044e-05,
"loss": 0.8059,
"mean_token_accuracy": 0.7664751559495926,
"num_tokens": 322426237.0,
"step": 155
},
{
"epoch": 0.15812229771463868,
"grad_norm": 2.972717523574829,
"learning_rate": 4.3442622950819674e-05,
"loss": 0.7977,
"mean_token_accuracy": 0.7695314347743988,
"num_tokens": 332818660.0,
"step": 160
},
{
"epoch": 0.16306361951822113,
"grad_norm": 0.5393229722976685,
"learning_rate": 4.4808743169398906e-05,
"loss": 0.7903,
"mean_token_accuracy": 0.7694852471351623,
"num_tokens": 343223659.0,
"step": 165
},
{
"epoch": 0.16800494132180357,
"grad_norm": 0.5978872776031494,
"learning_rate": 4.6174863387978145e-05,
"loss": 0.7696,
"mean_token_accuracy": 0.7744763985276222,
"num_tokens": 353601938.0,
"step": 170
},
{
"epoch": 0.17294626312538605,
"grad_norm": 0.36113736033439636,
"learning_rate": 4.754098360655738e-05,
"loss": 0.7702,
"mean_token_accuracy": 0.773854723572731,
"num_tokens": 363963995.0,
"step": 175
},
{
"epoch": 0.1778875849289685,
"grad_norm": 0.296596497297287,
"learning_rate": 4.890710382513661e-05,
"loss": 0.7635,
"mean_token_accuracy": 0.7756986305117607,
"num_tokens": 374367241.0,
"step": 180
},
{
"epoch": 0.18282890673255095,
"grad_norm": 0.35513490438461304,
"learning_rate": 4.9999984843247074e-05,
"loss": 0.7694,
"mean_token_accuracy": 0.7740960389375686,
"num_tokens": 384787359.0,
"step": 185
},
{
"epoch": 0.18777022853613343,
"grad_norm": 0.2819320261478424,
"learning_rate": 4.999945435882428e-05,
"loss": 0.759,
"mean_token_accuracy": 0.7767567992210388,
"num_tokens": 395178532.0,
"step": 190
},
{
"epoch": 0.19271155033971588,
"grad_norm": 0.25831785798072815,
"learning_rate": 4.9998166055133136e-05,
"loss": 0.7679,
"mean_token_accuracy": 0.7745850294828415,
"num_tokens": 405586629.0,
"step": 195
},
{
"epoch": 0.19765287214329832,
"grad_norm": 0.2429356724023819,
"learning_rate": 4.9996119971226544e-05,
"loss": 0.7596,
"mean_token_accuracy": 0.776530908048153,
"num_tokens": 415992685.0,
"step": 200
},
{
"epoch": 0.2025941939468808,
"grad_norm": 0.29596009850502014,
"learning_rate": 4.9993316169128334e-05,
"loss": 0.7434,
"mean_token_accuracy": 0.7808976873755455,
"num_tokens": 426380223.0,
"step": 205
},
{
"epoch": 0.20753551575046325,
"grad_norm": 0.32271862030029297,
"learning_rate": 4.9989754733831366e-05,
"loss": 0.7709,
"mean_token_accuracy": 0.7732070550322533,
"num_tokens": 436791731.0,
"step": 210
},
{
"epoch": 0.2124768375540457,
"grad_norm": 0.4768483340740204,
"learning_rate": 4.9985435773294975e-05,
"loss": 0.7609,
"mean_token_accuracy": 0.7759308516979218,
"num_tokens": 447209022.0,
"step": 215
},
{
"epoch": 0.21741815935762818,
"grad_norm": 0.38842251896858215,
"learning_rate": 4.998035941844167e-05,
"loss": 0.7515,
"mean_token_accuracy": 0.7781018510460853,
"num_tokens": 457588195.0,
"step": 220
},
{
"epoch": 0.22235948116121063,
"grad_norm": 0.3555106818675995,
"learning_rate": 4.9974525823153194e-05,
"loss": 0.7632,
"mean_token_accuracy": 0.7750346094369889,
"num_tokens": 467968939.0,
"step": 225
},
{
"epoch": 0.22730080296479308,
"grad_norm": 0.34921368956565857,
"learning_rate": 4.9967935164265854e-05,
"loss": 0.7542,
"mean_token_accuracy": 0.7772198930382729,
"num_tokens": 478327236.0,
"step": 230
},
{
"epoch": 0.23224212476837555,
"grad_norm": 0.8633297681808472,
"learning_rate": 4.9960587641565125e-05,
"loss": 0.7599,
"mean_token_accuracy": 0.7760145485401153,
"num_tokens": 488716484.0,
"step": 235
},
{
"epoch": 0.237183446571958,
"grad_norm": 0.6466217041015625,
"learning_rate": 4.9952483477779654e-05,
"loss": 0.7544,
"mean_token_accuracy": 0.7774667114019393,
"num_tokens": 499127155.0,
"step": 240
},
{
"epoch": 0.24212476837554045,
"grad_norm": 0.6501172780990601,
"learning_rate": 4.994362291857445e-05,
"loss": 0.7658,
"mean_token_accuracy": 0.7741294875741005,
"num_tokens": 509553916.0,
"step": 245
},
{
"epoch": 0.24706609017912293,
"grad_norm": 0.9864006638526917,
"learning_rate": 4.993400623254347e-05,
"loss": 0.7816,
"mean_token_accuracy": 0.7745596036314965,
"num_tokens": 519972291.0,
"step": 250
},
{
"epoch": 0.2520074119827054,
"grad_norm": 0.7591823935508728,
"learning_rate": 4.99236337112015e-05,
"loss": 0.7593,
"mean_token_accuracy": 0.7757471099495887,
"num_tokens": 530382104.0,
"step": 255
},
{
"epoch": 0.25694873378628785,
"grad_norm": 0.8461949825286865,
"learning_rate": 4.9912505668975245e-05,
"loss": 0.7775,
"mean_token_accuracy": 0.7719829902052879,
"num_tokens": 540772176.0,
"step": 260
},
{
"epoch": 0.2618900555898703,
"grad_norm": 0.5740395188331604,
"learning_rate": 4.990062244319387e-05,
"loss": 0.7519,
"mean_token_accuracy": 0.7776331752538681,
"num_tokens": 551147852.0,
"step": 265
},
{
"epoch": 0.26683137739345275,
"grad_norm": 0.3479936420917511,
"learning_rate": 4.988798439407872e-05,
"loss": 0.7569,
"mean_token_accuracy": 0.7760330036282539,
"num_tokens": 561569474.0,
"step": 270
},
{
"epoch": 0.27177269919703523,
"grad_norm": 0.29772821068763733,
"learning_rate": 4.9874591904732446e-05,
"loss": 0.756,
"mean_token_accuracy": 0.7763150066137314,
"num_tokens": 571935576.0,
"step": 275
},
{
"epoch": 0.27671402100061765,
"grad_norm": 0.5030266046524048,
"learning_rate": 4.9860445381127385e-05,
"loss": 0.7792,
"mean_token_accuracy": 0.7774720788002014,
"num_tokens": 582349510.0,
"step": 280
},
{
"epoch": 0.2816553428042001,
"grad_norm": 0.3081815242767334,
"learning_rate": 4.984554525209321e-05,
"loss": 0.7452,
"mean_token_accuracy": 0.7791904672980309,
"num_tokens": 592706344.0,
"step": 285
},
{
"epoch": 0.2865966646077826,
"grad_norm": 0.29467764496803284,
"learning_rate": 4.9829891969303973e-05,
"loss": 0.7573,
"mean_token_accuracy": 0.7754112169146538,
"num_tokens": 603106180.0,
"step": 290
},
{
"epoch": 0.291537986411365,
"grad_norm": 0.2687968313694,
"learning_rate": 4.981348600726441e-05,
"loss": 0.7427,
"mean_token_accuracy": 0.7795338571071625,
"num_tokens": 613491192.0,
"step": 295
},
{
"epoch": 0.2964793082149475,
"grad_norm": 0.3151031732559204,
"learning_rate": 4.9796327863295536e-05,
"loss": 0.7648,
"mean_token_accuracy": 0.7766805797815323,
"num_tokens": 623915573.0,
"step": 300
},
{
"epoch": 0.30142063001853,
"grad_norm": 0.23440544307231903,
"learning_rate": 4.9778418057519595e-05,
"loss": 0.7398,
"mean_token_accuracy": 0.7802766650915146,
"num_tokens": 634319400.0,
"step": 305
},
{
"epoch": 0.3063619518221124,
"grad_norm": 0.24154525995254517,
"learning_rate": 4.9759757132844256e-05,
"loss": 0.7369,
"mean_token_accuracy": 0.7812311604619027,
"num_tokens": 644726414.0,
"step": 310
},
{
"epoch": 0.3113032736256949,
"grad_norm": 0.22309941053390503,
"learning_rate": 4.974034565494621e-05,
"loss": 0.7365,
"mean_token_accuracy": 0.7811038464307785,
"num_tokens": 655151755.0,
"step": 315
},
{
"epoch": 0.31624459542927735,
"grad_norm": 0.22005286812782288,
"learning_rate": 4.972018421225397e-05,
"loss": 0.7449,
"mean_token_accuracy": 0.7791362345218659,
"num_tokens": 665495983.0,
"step": 320
},
{
"epoch": 0.3211859172328598,
"grad_norm": 0.3202398121356964,
"learning_rate": 4.969927341593008e-05,
"loss": 0.7485,
"mean_token_accuracy": 0.7779804199934006,
"num_tokens": 675879481.0,
"step": 325
},
{
"epoch": 0.32612723903644225,
"grad_norm": 0.25242334604263306,
"learning_rate": 4.9677613899852535e-05,
"loss": 0.7419,
"mean_token_accuracy": 0.7794451892375946,
"num_tokens": 686284515.0,
"step": 330
},
{
"epoch": 0.33106856084002473,
"grad_norm": 0.3325527012348175,
"learning_rate": 4.965520632059562e-05,
"loss": 0.7633,
"mean_token_accuracy": 0.7767202571034432,
"num_tokens": 696699232.0,
"step": 335
},
{
"epoch": 0.33600988264360715,
"grad_norm": 0.39707615971565247,
"learning_rate": 4.963205135740997e-05,
"loss": 0.7493,
"mean_token_accuracy": 0.7775179639458656,
"num_tokens": 707076497.0,
"step": 340
},
{
"epoch": 0.3409512044471896,
"grad_norm": 0.346334308385849,
"learning_rate": 4.960814971220199e-05,
"loss": 0.7417,
"mean_token_accuracy": 0.7795804455876351,
"num_tokens": 717474591.0,
"step": 345
},
{
"epoch": 0.3458925262507721,
"grad_norm": 4.010451316833496,
"learning_rate": 4.958350210951259e-05,
"loss": 0.7478,
"mean_token_accuracy": 0.778372198343277,
"num_tokens": 727885602.0,
"step": 350
},
{
"epoch": 0.3508338480543545,
"grad_norm": 0.3823387026786804,
"learning_rate": 4.95581092964952e-05,
"loss": 0.7487,
"mean_token_accuracy": 0.7780078485608101,
"num_tokens": 738268731.0,
"step": 355
},
{
"epoch": 0.355775169857937,
"grad_norm": 0.33636268973350525,
"learning_rate": 4.953197204289315e-05,
"loss": 0.7465,
"mean_token_accuracy": 0.7781498372554779,
"num_tokens": 748674599.0,
"step": 360
},
{
"epoch": 0.3607164916615195,
"grad_norm": 0.34202906489372253,
"learning_rate": 4.9505091141016305e-05,
"loss": 0.7494,
"mean_token_accuracy": 0.7772357612848282,
"num_tokens": 759049314.0,
"step": 365
},
{
"epoch": 0.3656578134651019,
"grad_norm": 0.4149486720561981,
"learning_rate": 4.947746740571706e-05,
"loss": 0.7333,
"mean_token_accuracy": 0.7816486582159996,
"num_tokens": 769428740.0,
"step": 370
},
{
"epoch": 0.3705991352686844,
"grad_norm": 1.6017032861709595,
"learning_rate": 4.9449101674365643e-05,
"loss": 0.7588,
"mean_token_accuracy": 0.7753262847661972,
"num_tokens": 779800245.0,
"step": 375
},
{
"epoch": 0.37554045707226685,
"grad_norm": 0.9440131783485413,
"learning_rate": 4.941999480682474e-05,
"loss": 0.7398,
"mean_token_accuracy": 0.7795335426926613,
"num_tokens": 790196388.0,
"step": 380
},
{
"epoch": 0.3804817788758493,
"grad_norm": 0.8744800090789795,
"learning_rate": 4.939014768542342e-05,
"loss": 0.7344,
"mean_token_accuracy": 0.7812343299388885,
"num_tokens": 800569083.0,
"step": 385
},
{
"epoch": 0.38542310067943175,
"grad_norm": 0.825480043888092,
"learning_rate": 4.935956121493036e-05,
"loss": 0.7355,
"mean_token_accuracy": 0.7809210374951363,
"num_tokens": 810992288.0,
"step": 390
},
{
"epoch": 0.39036442248301423,
"grad_norm": 0.3783267140388489,
"learning_rate": 4.9328236322526475e-05,
"loss": 0.741,
"mean_token_accuracy": 0.7795790642499923,
"num_tokens": 821390810.0,
"step": 395
},
{
"epoch": 0.39530574428659665,
"grad_norm": 18.137908935546875,
"learning_rate": 4.9296173957776776e-05,
"loss": 0.7699,
"mean_token_accuracy": 0.7768698945641518,
"num_tokens": 831805263.0,
"step": 400
},
{
"epoch": 0.4002470660901791,
"grad_norm": 1.4189810752868652,
"learning_rate": 4.926337509260157e-05,
"loss": 0.7553,
"mean_token_accuracy": 0.7770886451005936,
"num_tokens": 842216817.0,
"step": 405
},
{
"epoch": 0.4051883878937616,
"grad_norm": 2.034182071685791,
"learning_rate": 4.9229840721247054e-05,
"loss": 0.7499,
"mean_token_accuracy": 0.7768227070569992,
"num_tokens": 852576765.0,
"step": 410
},
{
"epoch": 0.410129709697344,
"grad_norm": 0.6896814703941345,
"learning_rate": 4.919557186025512e-05,
"loss": 0.7449,
"mean_token_accuracy": 0.7783482626080513,
"num_tokens": 862986504.0,
"step": 415
},
{
"epoch": 0.4150710315009265,
"grad_norm": 0.46286579966545105,
"learning_rate": 4.9160569548432556e-05,
"loss": 0.7408,
"mean_token_accuracy": 0.7793940395116806,
"num_tokens": 873406106.0,
"step": 420
},
{
"epoch": 0.420012353304509,
"grad_norm": 0.41379502415657043,
"learning_rate": 4.912483484681959e-05,
"loss": 0.7559,
"mean_token_accuracy": 0.7800678476691246,
"num_tokens": 883753362.0,
"step": 425
},
{
"epoch": 0.4249536751080914,
"grad_norm": 0.5154562592506409,
"learning_rate": 4.908836883865768e-05,
"loss": 0.7387,
"mean_token_accuracy": 0.7794730111956596,
"num_tokens": 894178144.0,
"step": 430
},
{
"epoch": 0.4298949969116739,
"grad_norm": 0.38851067423820496,
"learning_rate": 4.905117262935669e-05,
"loss": 0.7407,
"mean_token_accuracy": 0.7789692118763923,
"num_tokens": 904574122.0,
"step": 435
},
{
"epoch": 0.43483631871525635,
"grad_norm": 2.381132125854492,
"learning_rate": 4.901324734646139e-05,
"loss": 0.7344,
"mean_token_accuracy": 0.7810276612639427,
"num_tokens": 914977007.0,
"step": 440
},
{
"epoch": 0.4397776405188388,
"grad_norm": 0.43396854400634766,
"learning_rate": 4.897459413961729e-05,
"loss": 0.7296,
"mean_token_accuracy": 0.7823416009545326,
"num_tokens": 925387385.0,
"step": 445
},
{
"epoch": 0.44471896232242125,
"grad_norm": 0.38089531660079956,
"learning_rate": 4.893521418053575e-05,
"loss": 0.7695,
"mean_token_accuracy": 0.7785934925079345,
"num_tokens": 935815137.0,
"step": 450
},
{
"epoch": 0.44966028412600373,
"grad_norm": 0.3441498875617981,
"learning_rate": 4.88951086629585e-05,
"loss": 0.7305,
"mean_token_accuracy": 0.7820696160197258,
"num_tokens": 946244183.0,
"step": 455
},
{
"epoch": 0.45460160592958615,
"grad_norm": 0.29439103603363037,
"learning_rate": 4.885427880262144e-05,
"loss": 0.7453,
"mean_token_accuracy": 0.7779595762491226,
"num_tokens": 956604208.0,
"step": 460
},
{
"epoch": 0.4595429277331686,
"grad_norm": 5.913498878479004,
"learning_rate": 4.881272583721776e-05,
"loss": 0.7628,
"mean_token_accuracy": 0.777838508784771,
"num_tokens": 967017106.0,
"step": 465
},
{
"epoch": 0.4644842495367511,
"grad_norm": 0.28903448581695557,
"learning_rate": 4.8770451026360495e-05,
"loss": 0.7393,
"mean_token_accuracy": 0.7796890079975128,
"num_tokens": 977403787.0,
"step": 470
},
{
"epoch": 0.4694255713403335,
"grad_norm": 0.24736757576465607,
"learning_rate": 4.872745565154424e-05,
"loss": 0.7302,
"mean_token_accuracy": 0.7817555531859398,
"num_tokens": 987801619.0,
"step": 475
},
{
"epoch": 0.474366893143916,
"grad_norm": 0.260588139295578,
"learning_rate": 4.868374101610638e-05,
"loss": 0.7282,
"mean_token_accuracy": 0.7825865045189857,
"num_tokens": 998222682.0,
"step": 480
},
{
"epoch": 0.4793082149474985,
"grad_norm": 0.589485764503479,
"learning_rate": 4.863930844518757e-05,
"loss": 0.7409,
"mean_token_accuracy": 0.7823499292135239,
"num_tokens": 1008614437.0,
"step": 485
},
{
"epoch": 0.4842495367510809,
"grad_norm": 1.0684598684310913,
"learning_rate": 4.8594159285691546e-05,
"loss": 0.7398,
"mean_token_accuracy": 0.779399824142456,
"num_tokens": 1019023581.0,
"step": 490
},
{
"epoch": 0.4891908585546634,
"grad_norm": 0.2995496094226837,
"learning_rate": 4.8548294906244285e-05,
"loss": 0.7348,
"mean_token_accuracy": 0.7809380605816841,
"num_tokens": 1029439322.0,
"step": 495
},
{
"epoch": 0.49413218035824585,
"grad_norm": 0.2681543231010437,
"learning_rate": 4.8501716697152555e-05,
"loss": 0.7218,
"mean_token_accuracy": 0.7838469684123993,
"num_tokens": 1039851383.0,
"step": 500
},
{
"epoch": 0.4990735021618283,
"grad_norm": 0.30591872334480286,
"learning_rate": 4.845442607036176e-05,
"loss": 0.7391,
"mean_token_accuracy": 0.7794045254588127,
"num_tokens": 1050273096.0,
"step": 505
},
{
"epoch": 0.5040148239654108,
"grad_norm": 0.43375658988952637,
"learning_rate": 4.840642445941309e-05,
"loss": 0.7443,
"mean_token_accuracy": 0.778987355530262,
"num_tokens": 1060667850.0,
"step": 510
},
{
"epoch": 0.5089561457689932,
"grad_norm": 0.35767972469329834,
"learning_rate": 4.8357713319400155e-05,
"loss": 0.7319,
"mean_token_accuracy": 0.7822049915790558,
"num_tokens": 1071070190.0,
"step": 515
},
{
"epoch": 0.5138974675725757,
"grad_norm": 0.44550326466560364,
"learning_rate": 4.8308294126924794e-05,
"loss": 0.7213,
"mean_token_accuracy": 0.7840958744287491,
"num_tokens": 1081476164.0,
"step": 520
},
{
"epoch": 0.5188387893761581,
"grad_norm": 0.28565794229507446,
"learning_rate": 4.825816838005235e-05,
"loss": 0.7312,
"mean_token_accuracy": 0.7816817224025726,
"num_tokens": 1091865310.0,
"step": 525
},
{
"epoch": 0.5237801111797405,
"grad_norm": 0.2593757212162018,
"learning_rate": 4.820733759826626e-05,
"loss": 0.7295,
"mean_token_accuracy": 0.7820295438170433,
"num_tokens": 1102247757.0,
"step": 530
},
{
"epoch": 0.5287214329833231,
"grad_norm": 0.33316972851753235,
"learning_rate": 4.815580332242199e-05,
"loss": 0.7149,
"mean_token_accuracy": 0.7861398920416832,
"num_tokens": 1112634196.0,
"step": 535
},
{
"epoch": 0.5336627547869055,
"grad_norm": 0.2702305316925049,
"learning_rate": 4.810356711470033e-05,
"loss": 0.7209,
"mean_token_accuracy": 0.7841956496238709,
"num_tokens": 1123039113.0,
"step": 540
},
{
"epoch": 0.5386040765904879,
"grad_norm": 3.4884896278381348,
"learning_rate": 4.8050630558560026e-05,
"loss": 0.7353,
"mean_token_accuracy": 0.7829997181892395,
"num_tokens": 1133440814.0,
"step": 545
},
{
"epoch": 0.5435453983940705,
"grad_norm": 0.724455714225769,
"learning_rate": 4.799699525868979e-05,
"loss": 0.7273,
"mean_token_accuracy": 0.782489824295044,
"num_tokens": 1143853971.0,
"step": 550
},
{
"epoch": 0.5484867201976529,
"grad_norm": 0.3589349091053009,
"learning_rate": 4.7942662840959654e-05,
"loss": 0.7334,
"mean_token_accuracy": 0.7808837026357651,
"num_tokens": 1154257665.0,
"step": 555
},
{
"epoch": 0.5534280420012353,
"grad_norm": 0.27754878997802734,
"learning_rate": 4.7887634952371684e-05,
"loss": 0.7214,
"mean_token_accuracy": 0.7839814886450768,
"num_tokens": 1164628485.0,
"step": 560
},
{
"epoch": 0.5583693638048178,
"grad_norm": 0.4044415354728699,
"learning_rate": 4.7831913261010066e-05,
"loss": 0.7541,
"mean_token_accuracy": 0.7808719158172608,
"num_tokens": 1174996101.0,
"step": 565
},
{
"epoch": 0.5633106856084003,
"grad_norm": 0.3834975063800812,
"learning_rate": 4.777549945599051e-05,
"loss": 0.7407,
"mean_token_accuracy": 0.7786987662315369,
"num_tokens": 1185426572.0,
"step": 570
},
{
"epoch": 0.5682520074119827,
"grad_norm": 0.24947196245193481,
"learning_rate": 4.7718395247409095e-05,
"loss": 0.7264,
"mean_token_accuracy": 0.7827806279063225,
"num_tokens": 1195853750.0,
"step": 575
},
{
"epoch": 0.5731933292155652,
"grad_norm": 0.24807173013687134,
"learning_rate": 4.766060236629037e-05,
"loss": 0.7369,
"mean_token_accuracy": 0.7798179477453232,
"num_tokens": 1206269807.0,
"step": 580
},
{
"epoch": 0.5781346510191476,
"grad_norm": 0.21170632541179657,
"learning_rate": 4.760212256453493e-05,
"loss": 0.733,
"mean_token_accuracy": 0.7807521566748619,
"num_tokens": 1216679925.0,
"step": 585
},
{
"epoch": 0.58307597282273,
"grad_norm": 0.1931750476360321,
"learning_rate": 4.7542957614866296e-05,
"loss": 0.7288,
"mean_token_accuracy": 0.7816975250840187,
"num_tokens": 1227085555.0,
"step": 590
},
{
"epoch": 0.5880172946263126,
"grad_norm": 0.19650745391845703,
"learning_rate": 4.7483109310777165e-05,
"loss": 0.7196,
"mean_token_accuracy": 0.784217968583107,
"num_tokens": 1237498455.0,
"step": 595
},
{
"epoch": 0.592958616429895,
"grad_norm": 0.20376770198345184,
"learning_rate": 4.7422579466475035e-05,
"loss": 0.7173,
"mean_token_accuracy": 0.7853407070040703,
"num_tokens": 1247922816.0,
"step": 600
},
{
"epoch": 0.5978999382334774,
"grad_norm": 0.2214794009923935,
"learning_rate": 4.736136991682727e-05,
"loss": 0.7219,
"mean_token_accuracy": 0.7833896622061729,
"num_tokens": 1258329220.0,
"step": 605
},
{
"epoch": 0.60284126003706,
"grad_norm": 0.19553132355213165,
"learning_rate": 4.7299482517305404e-05,
"loss": 0.7157,
"mean_token_accuracy": 0.7851592242717743,
"num_tokens": 1268717321.0,
"step": 610
},
{
"epoch": 0.6077825818406424,
"grad_norm": 0.3079369068145752,
"learning_rate": 4.723691914392893e-05,
"loss": 0.7296,
"mean_token_accuracy": 0.7822741761803627,
"num_tokens": 1279092136.0,
"step": 615
},
{
"epoch": 0.6127239036442248,
"grad_norm": 0.46084290742874146,
"learning_rate": 4.7173681693208444e-05,
"loss": 0.753,
"mean_token_accuracy": 0.7799653187394142,
"num_tokens": 1289484661.0,
"step": 620
},
{
"epoch": 0.6176652254478073,
"grad_norm": 11.295252799987793,
"learning_rate": 4.710977208208812e-05,
"loss": 0.7715,
"mean_token_accuracy": 0.778912840783596,
"num_tokens": 1299870309.0,
"step": 625
},
{
"epoch": 0.6226065472513898,
"grad_norm": 0.36179018020629883,
"learning_rate": 4.7045192247887634e-05,
"loss": 0.7139,
"mean_token_accuracy": 0.7855943024158478,
"num_tokens": 1310266867.0,
"step": 630
},
{
"epoch": 0.6275478690549722,
"grad_norm": 0.3986701965332031,
"learning_rate": 4.697994414824343e-05,
"loss": 0.7224,
"mean_token_accuracy": 0.7834179788827896,
"num_tokens": 1320695348.0,
"step": 635
},
{
"epoch": 0.6324891908585547,
"grad_norm": 0.28618842363357544,
"learning_rate": 4.6914029761049357e-05,
"loss": 0.7199,
"mean_token_accuracy": 0.7842305526137352,
"num_tokens": 1331087810.0,
"step": 640
},
{
"epoch": 0.6374305126621371,
"grad_norm": 0.3706783652305603,
"learning_rate": 4.6847451084396724e-05,
"loss": 0.7151,
"mean_token_accuracy": 0.7855269074440002,
"num_tokens": 1341477976.0,
"step": 645
},
{
"epoch": 0.6423718344657195,
"grad_norm": 0.2638409435749054,
"learning_rate": 4.678021013651375e-05,
"loss": 0.7239,
"mean_token_accuracy": 0.7827935367822647,
"num_tokens": 1351900891.0,
"step": 650
},
{
"epoch": 0.6473131562693021,
"grad_norm": 0.20985962450504303,
"learning_rate": 4.6712308955704346e-05,
"loss": 0.7088,
"mean_token_accuracy": 0.7867402359843254,
"num_tokens": 1362281947.0,
"step": 655
},
{
"epoch": 0.6522544780728845,
"grad_norm": 0.2701743245124817,
"learning_rate": 4.664374960028638e-05,
"loss": 0.7351,
"mean_token_accuracy": 0.7816146582365036,
"num_tokens": 1372687416.0,
"step": 660
},
{
"epoch": 0.6571957998764669,
"grad_norm": 0.3363015651702881,
"learning_rate": 4.6574534148529225e-05,
"loss": 0.7438,
"mean_token_accuracy": 0.7821772992610931,
"num_tokens": 1383065396.0,
"step": 665
},
{
"epoch": 0.6621371216800495,
"grad_norm": 0.49438920617103577,
"learning_rate": 4.650466469859079e-05,
"loss": 0.7798,
"mean_token_accuracy": 0.7778581961989403,
"num_tokens": 1393456689.0,
"step": 670
},
{
"epoch": 0.6670784434836319,
"grad_norm": 0.27729520201683044,
"learning_rate": 4.643414336845394e-05,
"loss": 0.7271,
"mean_token_accuracy": 0.7821262225508689,
"num_tokens": 1403871987.0,
"step": 675
},
{
"epoch": 0.6720197652872143,
"grad_norm": 0.23920652270317078,
"learning_rate": 4.6362972295862225e-05,
"loss": 0.7128,
"mean_token_accuracy": 0.7859095022082329,
"num_tokens": 1414264057.0,
"step": 680
},
{
"epoch": 0.6769610870907968,
"grad_norm": 0.22164705395698547,
"learning_rate": 4.629115363825514e-05,
"loss": 0.721,
"mean_token_accuracy": 0.7835411220788956,
"num_tokens": 1424682063.0,
"step": 685
},
{
"epoch": 0.6819024088943793,
"grad_norm": 0.19232025742530823,
"learning_rate": 4.6218689572702715e-05,
"loss": 0.7364,
"mean_token_accuracy": 0.7843156576156616,
"num_tokens": 1435058723.0,
"step": 690
},
{
"epoch": 0.6868437306979617,
"grad_norm": 0.21504122018814087,
"learning_rate": 4.614558229583948e-05,
"loss": 0.7234,
"mean_token_accuracy": 0.7829699948430061,
"num_tokens": 1445469769.0,
"step": 695
},
{
"epoch": 0.6917850525015442,
"grad_norm": 0.21127116680145264,
"learning_rate": 4.607183402379794e-05,
"loss": 0.7055,
"mean_token_accuracy": 0.7875325232744217,
"num_tokens": 1455878611.0,
"step": 700
},
{
"epoch": 0.6967263743051266,
"grad_norm": 0.221888467669487,
"learning_rate": 4.599744699214136e-05,
"loss": 0.7163,
"mean_token_accuracy": 0.7848389104008675,
"num_tokens": 1466265360.0,
"step": 705
},
{
"epoch": 0.701667696108709,
"grad_norm": 11.019208908081055,
"learning_rate": 4.5922423455795966e-05,
"loss": 0.7403,
"mean_token_accuracy": 0.7839446976780892,
"num_tokens": 1476653925.0,
"step": 710
},
{
"epoch": 0.7066090179122916,
"grad_norm": 0.2727574408054352,
"learning_rate": 4.584676568898267e-05,
"loss": 0.7104,
"mean_token_accuracy": 0.7866337120532989,
"num_tokens": 1487058650.0,
"step": 715
},
{
"epoch": 0.711550339715874,
"grad_norm": 1.0687726736068726,
"learning_rate": 4.5770475985148056e-05,
"loss": 0.7324,
"mean_token_accuracy": 0.7805073946714401,
"num_tokens": 1497461614.0,
"step": 720
},
{
"epoch": 0.7164916615194564,
"grad_norm": 5.212482452392578,
"learning_rate": 4.5693556656894907e-05,
"loss": 0.7587,
"mean_token_accuracy": 0.7814112335443497,
"num_tokens": 1507862236.0,
"step": 725
},
{
"epoch": 0.721432983323039,
"grad_norm": 0.4863557815551758,
"learning_rate": 4.561601003591208e-05,
"loss": 0.7297,
"mean_token_accuracy": 0.7825336411595345,
"num_tokens": 1518272002.0,
"step": 730
},
{
"epoch": 0.7263743051266214,
"grad_norm": 0.4574221074581146,
"learning_rate": 4.5537838472903814e-05,
"loss": 0.7252,
"mean_token_accuracy": 0.7830310598015785,
"num_tokens": 1528707308.0,
"step": 735
},
{
"epoch": 0.7313156269302038,
"grad_norm": 0.34816044569015503,
"learning_rate": 4.54590443375185e-05,
"loss": 0.7118,
"mean_token_accuracy": 0.7860510513186455,
"num_tokens": 1539091834.0,
"step": 740
},
{
"epoch": 0.7362569487337863,
"grad_norm": 0.2924805283546448,
"learning_rate": 4.5379630018276834e-05,
"loss": 0.7141,
"mean_token_accuracy": 0.7856177687644958,
"num_tokens": 1549484512.0,
"step": 745
},
{
"epoch": 0.7411982705373688,
"grad_norm": 0.21858909726142883,
"learning_rate": 4.5299597922499396e-05,
"loss": 0.7144,
"mean_token_accuracy": 0.7855332553386688,
"num_tokens": 1559908828.0,
"step": 750
},
{
"epoch": 0.7461395923409512,
"grad_norm": 0.21600526571273804,
"learning_rate": 4.521895047623372e-05,
"loss": 0.7139,
"mean_token_accuracy": 0.7855884119868278,
"num_tokens": 1570311228.0,
"step": 755
},
{
"epoch": 0.7510809141445337,
"grad_norm": 2.1125078201293945,
"learning_rate": 4.513769012418071e-05,
"loss": 0.7108,
"mean_token_accuracy": 0.7866385370492935,
"num_tokens": 1580708990.0,
"step": 760
},
{
"epoch": 0.7560222359481161,
"grad_norm": 0.2704427242279053,
"learning_rate": 4.505581932962054e-05,
"loss": 0.7064,
"mean_token_accuracy": 0.7872796177864074,
"num_tokens": 1591096704.0,
"step": 765
},
{
"epoch": 0.7609635577516985,
"grad_norm": 0.6448246240615845,
"learning_rate": 4.4973340574338016e-05,
"loss": 0.7259,
"mean_token_accuracy": 0.7822867006063461,
"num_tokens": 1601474574.0,
"step": 770
},
{
"epoch": 0.7659048795552811,
"grad_norm": 0.28744909167289734,
"learning_rate": 4.4890256358547304e-05,
"loss": 0.7156,
"mean_token_accuracy": 0.7849863648414612,
"num_tokens": 1611883617.0,
"step": 775
},
{
"epoch": 0.7708462013588635,
"grad_norm": 1.0210628509521484,
"learning_rate": 4.480656920081615e-05,
"loss": 0.7458,
"mean_token_accuracy": 0.7787564381957054,
"num_tokens": 1622289561.0,
"step": 780
},
{
"epoch": 0.7757875231624459,
"grad_norm": 0.6114581227302551,
"learning_rate": 4.472228163798956e-05,
"loss": 0.7247,
"mean_token_accuracy": 0.7828380897641182,
"num_tokens": 1632700884.0,
"step": 785
},
{
"epoch": 0.7807288449660285,
"grad_norm": 0.3882652223110199,
"learning_rate": 4.4637396225112846e-05,
"loss": 0.7328,
"mean_token_accuracy": 0.7821167722344399,
"num_tokens": 1643078204.0,
"step": 790
},
{
"epoch": 0.7856701667696109,
"grad_norm": 1.0247176885604858,
"learning_rate": 4.4551915535354256e-05,
"loss": 0.7134,
"mean_token_accuracy": 0.7853814095258713,
"num_tokens": 1653491014.0,
"step": 795
},
{
"epoch": 0.7906114885731933,
"grad_norm": 27.41879653930664,
"learning_rate": 4.446584215992687e-05,
"loss": 5.9935,
"mean_token_accuracy": 0.32389395159407286,
"num_tokens": 1663888090.0,
"step": 800
},
{
"epoch": 0.7955528103767758,
"grad_norm": 64.84490203857422,
"learning_rate": 4.437917870801015e-05,
"loss": 7.1083,
"mean_token_accuracy": 0.08303709244355559,
"num_tokens": 1674313567.0,
"step": 805
},
{
"epoch": 0.8004941321803583,
"grad_norm": 11.887207984924316,
"learning_rate": 4.429192780667077e-05,
"loss": 2.6923,
"mean_token_accuracy": 0.5082879170775414,
"num_tokens": 1684731527.0,
"step": 810
},
{
"epoch": 0.8054354539839407,
"grad_norm": 2.350527763366699,
"learning_rate": 4.4204092100783033e-05,
"loss": 0.8951,
"mean_token_accuracy": 0.7543378055095673,
"num_tokens": 1695152743.0,
"step": 815
},
{
"epoch": 0.8103767757875232,
"grad_norm": 5.608152866363525,
"learning_rate": 4.411567425294867e-05,
"loss": 0.8052,
"mean_token_accuracy": 0.7724114000797272,
"num_tokens": 1705562199.0,
"step": 820
},
{
"epoch": 0.8153180975911056,
"grad_norm": 0.6283033490180969,
"learning_rate": 4.402667694341611e-05,
"loss": 0.7615,
"mean_token_accuracy": 0.7743734747171402,
"num_tokens": 1715941127.0,
"step": 825
},
{
"epoch": 0.820259419394688,
"grad_norm": 2.0349133014678955,
"learning_rate": 4.393710286999929e-05,
"loss": 0.7722,
"mean_token_accuracy": 0.7739002510905266,
"num_tokens": 1726358797.0,
"step": 830
},
{
"epoch": 0.8252007411982706,
"grad_norm": 1.4279297590255737,
"learning_rate": 4.3846954747995825e-05,
"loss": 0.7408,
"mean_token_accuracy": 0.778858907520771,
"num_tokens": 1736768844.0,
"step": 835
},
{
"epoch": 0.830142063001853,
"grad_norm": 0.9458789825439453,
"learning_rate": 4.375623531010471e-05,
"loss": 0.7382,
"mean_token_accuracy": 0.7807383254170418,
"num_tokens": 1747158903.0,
"step": 840
},
{
"epoch": 0.8350833848054354,
"grad_norm": 0.6119948625564575,
"learning_rate": 4.366494730634348e-05,
"loss": 0.7256,
"mean_token_accuracy": 0.7840852901339531,
"num_tokens": 1757542631.0,
"step": 845
},
{
"epoch": 0.840024706609018,
"grad_norm": 0.30982956290245056,
"learning_rate": 4.357309350396488e-05,
"loss": 0.719,
"mean_token_accuracy": 0.7841277673840523,
"num_tokens": 1767957528.0,
"step": 850
},
{
"epoch": 0.8449660284126004,
"grad_norm": 0.28105172514915466,
"learning_rate": 4.3480676687372915e-05,
"loss": 0.7186,
"mean_token_accuracy": 0.7848882034420968,
"num_tokens": 1778374482.0,
"step": 855
},
{
"epoch": 0.8499073502161828,
"grad_norm": 0.23508192598819733,
"learning_rate": 4.3387699658038506e-05,
"loss": 0.737,
"mean_token_accuracy": 0.7816414266824723,
"num_tokens": 1788794176.0,
"step": 860
},
{
"epoch": 0.8548486720197653,
"grad_norm": 0.263019323348999,
"learning_rate": 4.329416523441454e-05,
"loss": 0.7297,
"mean_token_accuracy": 0.7812557741999626,
"num_tokens": 1799211612.0,
"step": 865
},
{
"epoch": 0.8597899938233478,
"grad_norm": 0.2726346254348755,
"learning_rate": 4.3200076251850455e-05,
"loss": 0.7165,
"mean_token_accuracy": 0.7844896405935288,
"num_tokens": 1809623863.0,
"step": 870
},
{
"epoch": 0.8647313156269302,
"grad_norm": 0.22352498769760132,
"learning_rate": 4.310543556250624e-05,
"loss": 0.7166,
"mean_token_accuracy": 0.7844405427575112,
"num_tokens": 1820035541.0,
"step": 875
},
{
"epoch": 0.8696726374305127,
"grad_norm": 0.21182860434055328,
"learning_rate": 4.301024603526603e-05,
"loss": 0.7179,
"mean_token_accuracy": 0.78359464854002,
"num_tokens": 1830385910.0,
"step": 880
},
{
"epoch": 0.8746139592340951,
"grad_norm": 0.1831175982952118,
"learning_rate": 4.291451055565113e-05,
"loss": 0.6931,
"mean_token_accuracy": 0.792209367454052,
"num_tokens": 1840794040.0,
"step": 885
},
{
"epoch": 0.8795552810376775,
"grad_norm": 0.17403781414031982,
"learning_rate": 4.281823202573252e-05,
"loss": 0.7116,
"mean_token_accuracy": 0.7860970973968506,
"num_tokens": 1851206662.0,
"step": 890
},
{
"epoch": 0.8844966028412601,
"grad_norm": 0.21906058490276337,
"learning_rate": 4.272141336404289e-05,
"loss": 0.7008,
"mean_token_accuracy": 0.7892033144831657,
"num_tokens": 1861585024.0,
"step": 895
},
{
"epoch": 0.8894379246448425,
"grad_norm": 0.20309416949748993,
"learning_rate": 4.2624057505488216e-05,
"loss": 0.7186,
"mean_token_accuracy": 0.7835696578025818,
"num_tokens": 1872004992.0,
"step": 900
},
{
"epoch": 0.8943792464484249,
"grad_norm": 0.18816368281841278,
"learning_rate": 4.252616740125871e-05,
"loss": 0.6973,
"mean_token_accuracy": 0.7894984766840935,
"num_tokens": 1882403165.0,
"step": 905
},
{
"epoch": 0.8993205682520075,
"grad_norm": 0.1928325891494751,
"learning_rate": 4.242774601873943e-05,
"loss": 0.7156,
"mean_token_accuracy": 0.784919947385788,
"num_tokens": 1892805219.0,
"step": 910
},
{
"epoch": 0.9042618900555899,
"grad_norm": 0.18554538488388062,
"learning_rate": 4.23287963414203e-05,
"loss": 0.7102,
"mean_token_accuracy": 0.7861207097768783,
"num_tokens": 1903158706.0,
"step": 915
},
{
"epoch": 0.9092032118591723,
"grad_norm": 0.1917434185743332,
"learning_rate": 4.222932136880566e-05,
"loss": 0.7098,
"mean_token_accuracy": 0.7864427953958512,
"num_tokens": 1913578297.0,
"step": 920
},
{
"epoch": 0.9141445336627548,
"grad_norm": 0.18963773548603058,
"learning_rate": 4.212932411632336e-05,
"loss": 0.7249,
"mean_token_accuracy": 0.7831475377082825,
"num_tokens": 1923979178.0,
"step": 925
},
{
"epoch": 0.9190858554663373,
"grad_norm": 0.18606819212436676,
"learning_rate": 4.202880761523337e-05,
"loss": 0.7001,
"mean_token_accuracy": 0.7890260085463524,
"num_tokens": 1934367962.0,
"step": 930
},
{
"epoch": 0.9240271772699197,
"grad_norm": 0.19036024808883667,
"learning_rate": 4.1927774912535825e-05,
"loss": 0.71,
"mean_token_accuracy": 0.7863813400268554,
"num_tokens": 1944753938.0,
"step": 935
},
{
"epoch": 0.9289684990735022,
"grad_norm": 0.20415684580802917,
"learning_rate": 4.1826229070878716e-05,
"loss": 0.7295,
"mean_token_accuracy": 0.7834419712424279,
"num_tokens": 1955175313.0,
"step": 940
},
{
"epoch": 0.9339098208770846,
"grad_norm": 0.327197402715683,
"learning_rate": 4.1724173168465064e-05,
"loss": 0.7156,
"mean_token_accuracy": 0.78517996519804,
"num_tokens": 1965584016.0,
"step": 945
},
{
"epoch": 0.938851142680667,
"grad_norm": 0.4138229787349701,
"learning_rate": 4.1621610298959556e-05,
"loss": 0.7201,
"mean_token_accuracy": 0.7838375464081764,
"num_tokens": 1975979152.0,
"step": 950
},
{
"epoch": 0.9437924644842496,
"grad_norm": 0.36716070771217346,
"learning_rate": 4.1518543571394796e-05,
"loss": 0.7119,
"mean_token_accuracy": 0.7862480774521827,
"num_tokens": 1986388324.0,
"step": 955
},
{
"epoch": 0.948733786287832,
"grad_norm": 0.2444203794002533,
"learning_rate": 4.141497611007705e-05,
"loss": 0.7307,
"mean_token_accuracy": 0.7824576959013939,
"num_tokens": 1996795931.0,
"step": 960
},
{
"epoch": 0.9536751080914144,
"grad_norm": 0.20585046708583832,
"learning_rate": 4.131091105449156e-05,
"loss": 0.7152,
"mean_token_accuracy": 0.7847880542278289,
"num_tokens": 2007173184.0,
"step": 965
},
{
"epoch": 0.958616429894997,
"grad_norm": 0.19417141377925873,
"learning_rate": 4.120635155920735e-05,
"loss": 0.7073,
"mean_token_accuracy": 0.7870074450969696,
"num_tokens": 2017580798.0,
"step": 970
},
{
"epoch": 0.9635577516985794,
"grad_norm": 0.19215287268161774,
"learning_rate": 4.110130079378159e-05,
"loss": 0.7111,
"mean_token_accuracy": 0.7853602975606918,
"num_tokens": 2027947222.0,
"step": 975
},
{
"epoch": 0.9684990735021618,
"grad_norm": 0.1740170270204544,
"learning_rate": 4.099576194266357e-05,
"loss": 0.7136,
"mean_token_accuracy": 0.7852953299880028,
"num_tokens": 2038360867.0,
"step": 980
},
{
"epoch": 0.9734403953057443,
"grad_norm": 0.5138667225837708,
"learning_rate": 4.0889738205098105e-05,
"loss": 0.7247,
"mean_token_accuracy": 0.7842022761702537,
"num_tokens": 2048762177.0,
"step": 985
},
{
"epoch": 0.9783817171093268,
"grad_norm": 2.790609121322632,
"learning_rate": 4.078323279502858e-05,
"loss": 0.7154,
"mean_token_accuracy": 0.7841217175126076,
"num_tokens": 2059178406.0,
"step": 990
},
{
"epoch": 0.9833230389129092,
"grad_norm": 0.30142104625701904,
"learning_rate": 4.067624894099956e-05,
"loss": 0.7053,
"mean_token_accuracy": 0.787280821800232,
"num_tokens": 2069556358.0,
"step": 995
},
{
"epoch": 0.9882643607164917,
"grad_norm": 0.29179099202156067,
"learning_rate": 4.056878988605884e-05,
"loss": 0.7086,
"mean_token_accuracy": 0.7862989202141761,
"num_tokens": 2079942474.0,
"step": 1000
},
{
"epoch": 0.9932056825200741,
"grad_norm": 0.2190045565366745,
"learning_rate": 4.0460858887659225e-05,
"loss": 0.7083,
"mean_token_accuracy": 0.786419989168644,
"num_tokens": 2090327369.0,
"step": 1005
},
{
"epoch": 0.9981470043236566,
"grad_norm": 0.18355406820774078,
"learning_rate": 4.0352459217559747e-05,
"loss": 0.7114,
"mean_token_accuracy": 0.7855605989694595,
"num_tokens": 2100728668.0,
"step": 1010
},
{
"epoch": 1.0029647930821495,
"grad_norm": 0.2185661494731903,
"learning_rate": 4.024359416172644e-05,
"loss": 0.6858,
"mean_token_accuracy": 0.7917264806918609,
"num_tokens": 2110869886.0,
"step": 1015
},
{
"epoch": 1.007906114885732,
"grad_norm": 0.18818652629852295,
"learning_rate": 4.013426702023284e-05,
"loss": 0.6754,
"mean_token_accuracy": 0.7937879115343094,
"num_tokens": 2121248219.0,
"step": 1020
},
{
"epoch": 1.0128474366893143,
"grad_norm": 0.1852264106273651,
"learning_rate": 4.0024481107159836e-05,
"loss": 0.6724,
"mean_token_accuracy": 0.7944735899567604,
"num_tokens": 2131667569.0,
"step": 1025
},
{
"epoch": 1.0177887584928969,
"grad_norm": 0.24998919665813446,
"learning_rate": 3.991423975049527e-05,
"loss": 0.7023,
"mean_token_accuracy": 0.7886683195829391,
"num_tokens": 2142075128.0,
"step": 1030
},
{
"epoch": 1.0227300802964794,
"grad_norm": 0.19213631749153137,
"learning_rate": 3.980354629203307e-05,
"loss": 0.6727,
"mean_token_accuracy": 0.7945450767874718,
"num_tokens": 2152492146.0,
"step": 1035
},
{
"epoch": 1.0276714021000617,
"grad_norm": 0.1905544400215149,
"learning_rate": 3.9692404087271896e-05,
"loss": 0.6783,
"mean_token_accuracy": 0.7932624593377113,
"num_tokens": 2162896460.0,
"step": 1040
},
{
"epoch": 1.0326127239036442,
"grad_norm": 3.515601396560669,
"learning_rate": 3.958081650531343e-05,
"loss": 0.7021,
"mean_token_accuracy": 0.7893586799502372,
"num_tokens": 2173297613.0,
"step": 1045
},
{
"epoch": 1.0375540457072268,
"grad_norm": 0.19767682254314423,
"learning_rate": 3.9468786928760316e-05,
"loss": 0.6863,
"mean_token_accuracy": 0.7907787173986435,
"num_tokens": 2183711345.0,
"step": 1050
},
{
"epoch": 1.042495367510809,
"grad_norm": 0.23874838650226593,
"learning_rate": 3.9356318753613525e-05,
"loss": 0.6753,
"mean_token_accuracy": 0.7938481122255325,
"num_tokens": 2194098086.0,
"step": 1055
},
{
"epoch": 1.0474366893143916,
"grad_norm": 0.208620086312294,
"learning_rate": 3.924341538916948e-05,
"loss": 0.6895,
"mean_token_accuracy": 0.7899346083402634,
"num_tokens": 2204491785.0,
"step": 1060
},
{
"epoch": 1.0523780111179741,
"grad_norm": 1.1455881595611572,
"learning_rate": 3.913008025791669e-05,
"loss": 0.6783,
"mean_token_accuracy": 0.7934534177184105,
"num_tokens": 2214880711.0,
"step": 1065
},
{
"epoch": 1.0573193329215564,
"grad_norm": 0.19406479597091675,
"learning_rate": 3.901631679543198e-05,
"loss": 0.677,
"mean_token_accuracy": 0.793326124548912,
"num_tokens": 2225265608.0,
"step": 1070
},
{
"epoch": 1.062260654725139,
"grad_norm": 0.18829670548439026,
"learning_rate": 3.890212845027637e-05,
"loss": 0.6706,
"mean_token_accuracy": 0.7950180992484093,
"num_tokens": 2235613832.0,
"step": 1075
},
{
"epoch": 1.0672019765287215,
"grad_norm": 0.19113752245903015,
"learning_rate": 3.8787518683890536e-05,
"loss": 0.6791,
"mean_token_accuracy": 0.7926081731915474,
"num_tokens": 2246002617.0,
"step": 1080
},
{
"epoch": 1.0721432983323038,
"grad_norm": 0.19079595804214478,
"learning_rate": 3.867249097048989e-05,
"loss": 0.6823,
"mean_token_accuracy": 0.7948415905237198,
"num_tokens": 2256415299.0,
"step": 1085
},
{
"epoch": 1.0770846201358864,
"grad_norm": 0.31613099575042725,
"learning_rate": 3.855704879695923e-05,
"loss": 0.6923,
"mean_token_accuracy": 0.7909860432147979,
"num_tokens": 2266807650.0,
"step": 1090
},
{
"epoch": 1.0820259419394689,
"grad_norm": 0.2967878580093384,
"learning_rate": 3.844119566274707e-05,
"loss": 0.7076,
"mean_token_accuracy": 0.7886807918548584,
"num_tokens": 2277220430.0,
"step": 1095
},
{
"epoch": 1.0869672637430512,
"grad_norm": 0.4129527807235718,
"learning_rate": 3.8324935079759555e-05,
"loss": 0.6778,
"mean_token_accuracy": 0.7931554391980171,
"num_tokens": 2287644164.0,
"step": 1100
},
{
"epoch": 1.0919085855466337,
"grad_norm": 0.39800935983657837,
"learning_rate": 3.820827057225401e-05,
"loss": 0.6771,
"mean_token_accuracy": 0.7932255193591118,
"num_tokens": 2298058744.0,
"step": 1105
},
{
"epoch": 1.0968499073502163,
"grad_norm": 0.23592214286327362,
"learning_rate": 3.809120567673209e-05,
"loss": 0.6731,
"mean_token_accuracy": 0.7948127672076225,
"num_tokens": 2308457445.0,
"step": 1110
},
{
"epoch": 1.1017912291537986,
"grad_norm": 0.3244028091430664,
"learning_rate": 3.797374394183257e-05,
"loss": 0.6903,
"mean_token_accuracy": 0.790345923602581,
"num_tokens": 2318859279.0,
"step": 1115
},
{
"epoch": 1.106732550957381,
"grad_norm": 0.24913185834884644,
"learning_rate": 3.785588892822383e-05,
"loss": 0.683,
"mean_token_accuracy": 0.7915908902883529,
"num_tokens": 2329278923.0,
"step": 1120
},
{
"epoch": 1.1116738727609636,
"grad_norm": 0.21519848704338074,
"learning_rate": 3.7737644208495835e-05,
"loss": 0.678,
"mean_token_accuracy": 0.7930225506424904,
"num_tokens": 2339685854.0,
"step": 1125
},
{
"epoch": 1.116615194564546,
"grad_norm": 0.24836380779743195,
"learning_rate": 3.76190133670519e-05,
"loss": 0.6884,
"mean_token_accuracy": 0.7918792888522148,
"num_tokens": 2350067715.0,
"step": 1130
},
{
"epoch": 1.1215565163681285,
"grad_norm": 0.31333428621292114,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.681,
"mean_token_accuracy": 0.7937313199043274,
"num_tokens": 2360468748.0,
"step": 1135
},
{
"epoch": 1.126497838171711,
"grad_norm": 3.056119918823242,
"learning_rate": 3.73806077150438e-05,
"loss": 0.7114,
"mean_token_accuracy": 0.7900054767727852,
"num_tokens": 2370874127.0,
"step": 1140
},
{
"epoch": 1.1314391599752933,
"grad_norm": 0.37440919876098633,
"learning_rate": 3.7260840131373255e-05,
"loss": 0.6803,
"mean_token_accuracy": 0.7923753753304481,
"num_tokens": 2381252025.0,
"step": 1145
},
{
"epoch": 1.1363804817788759,
"grad_norm": 0.24887527525424957,
"learning_rate": 3.714070087955489e-05,
"loss": 0.6899,
"mean_token_accuracy": 0.791276590526104,
"num_tokens": 2391657101.0,
"step": 1150
},
{
"epoch": 1.1413218035824584,
"grad_norm": 0.2685694098472595,
"learning_rate": 3.702019360142181e-05,
"loss": 0.6717,
"mean_token_accuracy": 0.7948125019669533,
"num_tokens": 2402071516.0,
"step": 1155
},
{
"epoch": 1.1462631253860407,
"grad_norm": 0.20285147428512573,
"learning_rate": 3.689932194996322e-05,
"loss": 0.689,
"mean_token_accuracy": 0.7898005157709121,
"num_tokens": 2412437076.0,
"step": 1160
},
{
"epoch": 1.1512044471896232,
"grad_norm": 0.6126241683959961,
"learning_rate": 3.677808958921375e-05,
"loss": 0.6818,
"mean_token_accuracy": 0.793350088596344,
"num_tokens": 2422854087.0,
"step": 1165
},
{
"epoch": 1.1561457689932058,
"grad_norm": 0.5088633894920349,
"learning_rate": 3.665650019414239e-05,
"loss": 0.6814,
"mean_token_accuracy": 0.7923238545656204,
"num_tokens": 2433241676.0,
"step": 1170
},
{
"epoch": 1.161087090796788,
"grad_norm": 0.34475672245025635,
"learning_rate": 3.653455745054101e-05,
"loss": 0.6835,
"mean_token_accuracy": 0.7914653971791268,
"num_tokens": 2443645430.0,
"step": 1175
},
{
"epoch": 1.1660284126003706,
"grad_norm": 0.39755386114120483,
"learning_rate": 3.641226505491273e-05,
"loss": 0.7204,
"mean_token_accuracy": 0.7898602604866027,
"num_tokens": 2454036623.0,
"step": 1180
},
{
"epoch": 1.1709697344039531,
"grad_norm": 0.35569798946380615,
"learning_rate": 3.6289626714359815e-05,
"loss": 0.6794,
"mean_token_accuracy": 0.7927926614880562,
"num_tokens": 2464459884.0,
"step": 1185
},
{
"epoch": 1.1759110562075354,
"grad_norm": 0.36636069416999817,
"learning_rate": 3.616664614647129e-05,
"loss": 0.6917,
"mean_token_accuracy": 0.7891320005059242,
"num_tokens": 2474874175.0,
"step": 1190
},
{
"epoch": 1.180852378011118,
"grad_norm": 0.2763560116291046,
"learning_rate": 3.60433270792103e-05,
"loss": 0.6904,
"mean_token_accuracy": 0.791066774725914,
"num_tokens": 2485269212.0,
"step": 1195
},
{
"epoch": 1.1857936998147005,
"grad_norm": 0.3080281615257263,
"learning_rate": 3.591967325080104e-05,
"loss": 0.6788,
"mean_token_accuracy": 0.7932477965950966,
"num_tokens": 2495685298.0,
"step": 1200
},
{
"epoch": 1.1907350216182828,
"grad_norm": 0.3292888104915619,
"learning_rate": 3.5795688409615464e-05,
"loss": 0.6702,
"mean_token_accuracy": 0.7956741228699684,
"num_tokens": 2506084349.0,
"step": 1205
},
{
"epoch": 1.1956763434218654,
"grad_norm": 0.2559851109981537,
"learning_rate": 3.567137631405967e-05,
"loss": 0.6843,
"mean_token_accuracy": 0.7913395538926125,
"num_tokens": 2516495078.0,
"step": 1210
},
{
"epoch": 1.2006176652254479,
"grad_norm": 0.22338271141052246,
"learning_rate": 3.554674073245996e-05,
"loss": 0.6747,
"mean_token_accuracy": 0.7942400932312011,
"num_tokens": 2526877043.0,
"step": 1215
},
{
"epoch": 1.2055589870290302,
"grad_norm": 0.22159652411937714,
"learning_rate": 3.542178544294861e-05,
"loss": 0.6806,
"mean_token_accuracy": 0.7920667245984078,
"num_tokens": 2537271309.0,
"step": 1220
},
{
"epoch": 1.2105003088326127,
"grad_norm": 0.18364663422107697,
"learning_rate": 3.529651423334932e-05,
"loss": 0.6792,
"mean_token_accuracy": 0.7927497401833534,
"num_tokens": 2547657770.0,
"step": 1225
},
{
"epoch": 1.2154416306361953,
"grad_norm": 0.17027516663074493,
"learning_rate": 3.5170930901062436e-05,
"loss": 0.6792,
"mean_token_accuracy": 0.7929888218641281,
"num_tokens": 2558054962.0,
"step": 1230
},
{
"epoch": 1.2203829524397776,
"grad_norm": 0.2822136878967285,
"learning_rate": 3.5045039252949795e-05,
"loss": 0.7004,
"mean_token_accuracy": 0.7900258719921112,
"num_tokens": 2568444606.0,
"step": 1235
},
{
"epoch": 1.22532427424336,
"grad_norm": 0.3928567171096802,
"learning_rate": 3.491884310521938e-05,
"loss": 0.7061,
"mean_token_accuracy": 0.7906710639595985,
"num_tokens": 2578834015.0,
"step": 1240
},
{
"epoch": 1.2302655960469426,
"grad_norm": 0.2886963188648224,
"learning_rate": 3.479234628330955e-05,
"loss": 0.6705,
"mean_token_accuracy": 0.7952982589602471,
"num_tokens": 2589221980.0,
"step": 1245
},
{
"epoch": 1.235206917850525,
"grad_norm": 0.2760990858078003,
"learning_rate": 3.4665552621773165e-05,
"loss": 0.6821,
"mean_token_accuracy": 0.7916675806045532,
"num_tokens": 2599648424.0,
"step": 1250
},
{
"epoch": 1.2401482396541075,
"grad_norm": 0.22312766313552856,
"learning_rate": 3.453846596416131e-05,
"loss": 0.6707,
"mean_token_accuracy": 0.7951520264148713,
"num_tokens": 2610054239.0,
"step": 1255
},
{
"epoch": 1.24508956145769,
"grad_norm": 0.20437319576740265,
"learning_rate": 3.441109016290679e-05,
"loss": 0.6859,
"mean_token_accuracy": 0.791742579638958,
"num_tokens": 2620401721.0,
"step": 1260
},
{
"epoch": 1.2500308832612723,
"grad_norm": 0.19419367611408234,
"learning_rate": 3.428342907920732e-05,
"loss": 0.6855,
"mean_token_accuracy": 0.7908361554145813,
"num_tokens": 2630806140.0,
"step": 1265
},
{
"epoch": 1.2549722050648549,
"grad_norm": 0.1733561009168625,
"learning_rate": 3.4155486582908535e-05,
"loss": 0.6649,
"mean_token_accuracy": 0.7967527687549592,
"num_tokens": 2641221965.0,
"step": 1270
},
{
"epoch": 1.2599135268684374,
"grad_norm": 0.18513615429401398,
"learning_rate": 3.402726655238665e-05,
"loss": 0.6762,
"mean_token_accuracy": 0.7932528495788574,
"num_tokens": 2651649647.0,
"step": 1275
},
{
"epoch": 1.2648548486720197,
"grad_norm": 0.2018139809370041,
"learning_rate": 3.389877287443086e-05,
"loss": 0.6746,
"mean_token_accuracy": 0.7940797954797745,
"num_tokens": 2662073524.0,
"step": 1280
},
{
"epoch": 1.2697961704756022,
"grad_norm": 1.0830366611480713,
"learning_rate": 3.37700094441256e-05,
"loss": 0.6822,
"mean_token_accuracy": 0.7922612771391868,
"num_tokens": 2672493582.0,
"step": 1285
},
{
"epoch": 1.2747374922791848,
"grad_norm": 0.2617737948894501,
"learning_rate": 3.3640980164732395e-05,
"loss": 0.687,
"mean_token_accuracy": 0.7908132255077363,
"num_tokens": 2682914502.0,
"step": 1290
},
{
"epoch": 1.279678814082767,
"grad_norm": 0.21003659069538116,
"learning_rate": 3.351168894757157e-05,
"loss": 0.6911,
"mean_token_accuracy": 0.7897532656788826,
"num_tokens": 2693336417.0,
"step": 1295
},
{
"epoch": 1.2846201358863496,
"grad_norm": 0.21736383438110352,
"learning_rate": 3.33821397119037e-05,
"loss": 0.6827,
"mean_token_accuracy": 0.7918270066380501,
"num_tokens": 2703706020.0,
"step": 1300
},
{
"epoch": 1.2895614576899321,
"grad_norm": 0.21032196283340454,
"learning_rate": 3.325233638481078e-05,
"loss": 0.681,
"mean_token_accuracy": 0.792155022919178,
"num_tokens": 2714082196.0,
"step": 1305
},
{
"epoch": 1.2945027794935144,
"grad_norm": 0.18472126126289368,
"learning_rate": 3.312228290107717e-05,
"loss": 0.6812,
"mean_token_accuracy": 0.7917923405766487,
"num_tokens": 2724452502.0,
"step": 1310
},
{
"epoch": 1.299444101297097,
"grad_norm": 0.17233245074748993,
"learning_rate": 3.299198320307036e-05,
"loss": 0.6793,
"mean_token_accuracy": 0.7928365305066108,
"num_tokens": 2734827605.0,
"step": 1315
},
{
"epoch": 1.3043854231006795,
"grad_norm": 0.18500040471553802,
"learning_rate": 3.286144124062143e-05,
"loss": 0.681,
"mean_token_accuracy": 0.7920305550098419,
"num_tokens": 2745243883.0,
"step": 1320
},
{
"epoch": 1.3093267449042618,
"grad_norm": 0.2191830277442932,
"learning_rate": 3.27306609709053e-05,
"loss": 0.6651,
"mean_token_accuracy": 0.7963899701833725,
"num_tokens": 2755631702.0,
"step": 1325
},
{
"epoch": 1.3142680667078444,
"grad_norm": 0.19073499739170074,
"learning_rate": 3.2599646358320874e-05,
"loss": 0.6763,
"mean_token_accuracy": 0.7934411048889161,
"num_tokens": 2766039150.0,
"step": 1330
},
{
"epoch": 1.3192093885114269,
"grad_norm": 0.1874350607395172,
"learning_rate": 3.246840137437072e-05,
"loss": 0.6588,
"mean_token_accuracy": 0.7982455238699913,
"num_tokens": 2776460737.0,
"step": 1335
},
{
"epoch": 1.3241507103150092,
"grad_norm": 0.2026117742061615,
"learning_rate": 3.233692999754077e-05,
"loss": 0.6922,
"mean_token_accuracy": 0.7905497863888741,
"num_tokens": 2786856139.0,
"step": 1340
},
{
"epoch": 1.3290920321185917,
"grad_norm": 0.19188478589057922,
"learning_rate": 3.2205236213179734e-05,
"loss": 0.6672,
"mean_token_accuracy": 0.7957891747355461,
"num_tokens": 2797249229.0,
"step": 1345
},
{
"epoch": 1.3340333539221743,
"grad_norm": 0.20345532894134521,
"learning_rate": 3.207332401337823e-05,
"loss": 0.6749,
"mean_token_accuracy": 0.79354357868433,
"num_tokens": 2807623382.0,
"step": 1350
},
{
"epoch": 1.3389746757257566,
"grad_norm": 0.21133118867874146,
"learning_rate": 3.194119739684779e-05,
"loss": 0.6664,
"mean_token_accuracy": 0.7959961429238319,
"num_tokens": 2818024758.0,
"step": 1355
},
{
"epoch": 1.343915997529339,
"grad_norm": 0.19039379060268402,
"learning_rate": 3.1808860368799674e-05,
"loss": 0.6907,
"mean_token_accuracy": 0.7892735093832016,
"num_tokens": 2828425993.0,
"step": 1360
},
{
"epoch": 1.3488573193329216,
"grad_norm": 0.1846419721841812,
"learning_rate": 3.1676316940823426e-05,
"loss": 0.6681,
"mean_token_accuracy": 0.7955585345625877,
"num_tokens": 2838840590.0,
"step": 1365
},
{
"epoch": 1.353798641136504,
"grad_norm": 0.17371538281440735,
"learning_rate": 3.154357113076527e-05,
"loss": 0.6773,
"mean_token_accuracy": 0.7934020891785621,
"num_tokens": 2849258862.0,
"step": 1370
},
{
"epoch": 1.3587399629400865,
"grad_norm": 0.18384847044944763,
"learning_rate": 3.141062696260636e-05,
"loss": 0.6727,
"mean_token_accuracy": 0.7949780642986297,
"num_tokens": 2859638155.0,
"step": 1375
},
{
"epoch": 1.363681284743669,
"grad_norm": 0.18146079778671265,
"learning_rate": 3.1277488466340746e-05,
"loss": 0.6735,
"mean_token_accuracy": 0.7942267686128617,
"num_tokens": 2870014943.0,
"step": 1380
},
{
"epoch": 1.3686226065472513,
"grad_norm": 0.2187763750553131,
"learning_rate": 3.11441596778532e-05,
"loss": 0.6785,
"mean_token_accuracy": 0.793820746243,
"num_tokens": 2880394786.0,
"step": 1385
},
{
"epoch": 1.3735639283508339,
"grad_norm": 0.22467771172523499,
"learning_rate": 3.1010644638796956e-05,
"loss": 0.665,
"mean_token_accuracy": 0.7965216785669327,
"num_tokens": 2890806878.0,
"step": 1390
},
{
"epoch": 1.3785052501544164,
"grad_norm": 0.17766356468200684,
"learning_rate": 3.08769473964711e-05,
"loss": 0.6724,
"mean_token_accuracy": 0.7943286210298538,
"num_tokens": 2901194944.0,
"step": 1395
},
{
"epoch": 1.3834465719579987,
"grad_norm": 0.18762089312076569,
"learning_rate": 3.0743072003697946e-05,
"loss": 0.6681,
"mean_token_accuracy": 0.7956297710537911,
"num_tokens": 2911601255.0,
"step": 1400
},
{
"epoch": 1.3883878937615812,
"grad_norm": 0.2048417329788208,
"learning_rate": 3.060902251870017e-05,
"loss": 0.6734,
"mean_token_accuracy": 0.7950954049825668,
"num_tokens": 2922021556.0,
"step": 1405
},
{
"epoch": 1.3933292155651638,
"grad_norm": 0.2158186286687851,
"learning_rate": 3.0474803004977748e-05,
"loss": 0.6787,
"mean_token_accuracy": 0.7928067833185196,
"num_tokens": 2932410781.0,
"step": 1410
},
{
"epoch": 1.398270537368746,
"grad_norm": 0.25561225414276123,
"learning_rate": 3.0340417531184832e-05,
"loss": 0.687,
"mean_token_accuracy": 0.7903619438409806,
"num_tokens": 2942802115.0,
"step": 1415
},
{
"epoch": 1.4032118591723286,
"grad_norm": 0.21336261928081512,
"learning_rate": 3.02058701710064e-05,
"loss": 0.6757,
"mean_token_accuracy": 0.7946703806519508,
"num_tokens": 2953223325.0,
"step": 1420
},
{
"epoch": 1.4081531809759111,
"grad_norm": 0.2261497527360916,
"learning_rate": 3.007116500303475e-05,
"loss": 0.6659,
"mean_token_accuracy": 0.7963157400488854,
"num_tokens": 2963627568.0,
"step": 1425
},
{
"epoch": 1.4130945027794934,
"grad_norm": 0.20056448876857758,
"learning_rate": 2.9936306110645867e-05,
"loss": 0.68,
"mean_token_accuracy": 0.7920198395848275,
"num_tokens": 2974004296.0,
"step": 1430
},
{
"epoch": 1.418035824583076,
"grad_norm": 4.7345452308654785,
"learning_rate": 2.980129758187567e-05,
"loss": 0.6964,
"mean_token_accuracy": 0.7900218307971955,
"num_tokens": 2984403518.0,
"step": 1435
},
{
"epoch": 1.4229771463866585,
"grad_norm": 0.1929967850446701,
"learning_rate": 2.9666143509296057e-05,
"loss": 0.6775,
"mean_token_accuracy": 0.7929817318916321,
"num_tokens": 2994812742.0,
"step": 1440
},
{
"epoch": 1.4279184681902408,
"grad_norm": 0.20712681114673615,
"learning_rate": 2.9530847989890865e-05,
"loss": 0.6712,
"mean_token_accuracy": 0.7948137015104294,
"num_tokens": 3005236938.0,
"step": 1445
},
{
"epoch": 1.4328597899938234,
"grad_norm": 0.19081586599349976,
"learning_rate": 2.939541512493167e-05,
"loss": 0.6747,
"mean_token_accuracy": 0.7935641020536423,
"num_tokens": 3015651863.0,
"step": 1450
},
{
"epoch": 1.4378011117974059,
"grad_norm": 0.1719520092010498,
"learning_rate": 2.9259849019853458e-05,
"loss": 0.6733,
"mean_token_accuracy": 0.7942060053348541,
"num_tokens": 3026070210.0,
"step": 1455
},
{
"epoch": 1.4427424336009882,
"grad_norm": 0.22839383780956268,
"learning_rate": 2.9124153784130193e-05,
"loss": 0.6839,
"mean_token_accuracy": 0.7911037534475327,
"num_tokens": 3036453223.0,
"step": 1460
},
{
"epoch": 1.4476837554045707,
"grad_norm": 0.16924144327640533,
"learning_rate": 2.898833353115021e-05,
"loss": 0.6688,
"mean_token_accuracy": 0.7952630639076232,
"num_tokens": 3046881199.0,
"step": 1465
},
{
"epoch": 1.4526250772081533,
"grad_norm": 0.1802128255367279,
"learning_rate": 2.8852392378091564e-05,
"loss": 0.6756,
"mean_token_accuracy": 0.7934101119637489,
"num_tokens": 3057283547.0,
"step": 1470
},
{
"epoch": 1.4575663990117356,
"grad_norm": 0.2060791701078415,
"learning_rate": 2.8716334445797195e-05,
"loss": 0.6644,
"mean_token_accuracy": 0.7965093940496445,
"num_tokens": 3067693639.0,
"step": 1475
},
{
"epoch": 1.462507720815318,
"grad_norm": 0.17594724893569946,
"learning_rate": 2.8580163858650038e-05,
"loss": 0.6726,
"mean_token_accuracy": 0.7946979507803917,
"num_tokens": 3078082156.0,
"step": 1480
},
{
"epoch": 1.4674490426189006,
"grad_norm": 0.18219222128391266,
"learning_rate": 2.8443884744447974e-05,
"loss": 0.6704,
"mean_token_accuracy": 0.7950583577156067,
"num_tokens": 3088511458.0,
"step": 1485
},
{
"epoch": 1.472390364422483,
"grad_norm": 0.16935332119464874,
"learning_rate": 2.83075012342787e-05,
"loss": 0.6775,
"mean_token_accuracy": 0.7929304778575897,
"num_tokens": 3098888388.0,
"step": 1490
},
{
"epoch": 1.4773316862260655,
"grad_norm": 6.201488018035889,
"learning_rate": 2.8171017462394546e-05,
"loss": 0.6624,
"mean_token_accuracy": 0.7975944474339485,
"num_tokens": 3109255613.0,
"step": 1495
},
{
"epoch": 1.482273008029648,
"grad_norm": 0.5504547953605652,
"learning_rate": 2.803443756608707e-05,
"loss": 0.6719,
"mean_token_accuracy": 0.7945632874965668,
"num_tokens": 3119631952.0,
"step": 1500
},
{
"epoch": 1.4872143298332303,
"grad_norm": 0.21276314556598663,
"learning_rate": 2.789776568556173e-05,
"loss": 0.6828,
"mean_token_accuracy": 0.791711862385273,
"num_tokens": 3130027225.0,
"step": 1505
},
{
"epoch": 1.4921556516368129,
"grad_norm": 0.23470132052898407,
"learning_rate": 2.7761005963812337e-05,
"loss": 0.6635,
"mean_token_accuracy": 0.7968768313527107,
"num_tokens": 3140453678.0,
"step": 1510
},
{
"epoch": 1.4970969734403954,
"grad_norm": 0.2083427459001541,
"learning_rate": 2.762416254649545e-05,
"loss": 0.6643,
"mean_token_accuracy": 0.7962050586938858,
"num_tokens": 3150827695.0,
"step": 1515
},
{
"epoch": 1.502038295243978,
"grad_norm": 0.2057102769613266,
"learning_rate": 2.7487239581804753e-05,
"loss": 0.6746,
"mean_token_accuracy": 0.7935194700956345,
"num_tokens": 3161227090.0,
"step": 1520
},
{
"epoch": 1.5069796170475602,
"grad_norm": 0.1747300773859024,
"learning_rate": 2.7350241220345274e-05,
"loss": 0.682,
"mean_token_accuracy": 0.7913977935910225,
"num_tokens": 3171627806.0,
"step": 1525
},
{
"epoch": 1.5119209388511425,
"grad_norm": 0.16887561976909637,
"learning_rate": 2.7213171615007566e-05,
"loss": 0.6723,
"mean_token_accuracy": 0.7942707300186157,
"num_tokens": 3182049340.0,
"step": 1530
},
{
"epoch": 1.516862260654725,
"grad_norm": 0.20756720006465912,
"learning_rate": 2.7076034920841836e-05,
"loss": 0.6701,
"mean_token_accuracy": 0.794575659930706,
"num_tokens": 3192485069.0,
"step": 1535
},
{
"epoch": 1.5218035824583076,
"grad_norm": 0.18000604212284088,
"learning_rate": 2.6938835294931996e-05,
"loss": 0.665,
"mean_token_accuracy": 0.7962401360273361,
"num_tokens": 3202862965.0,
"step": 1540
},
{
"epoch": 1.5267449042618901,
"grad_norm": 0.1736893653869629,
"learning_rate": 2.680157689626961e-05,
"loss": 0.6747,
"mean_token_accuracy": 0.7937764227390289,
"num_tokens": 3213276418.0,
"step": 1545
},
{
"epoch": 1.5316862260654727,
"grad_norm": 0.16604070365428925,
"learning_rate": 2.6664263885627865e-05,
"loss": 0.7027,
"mean_token_accuracy": 0.7897267058491707,
"num_tokens": 3223670758.0,
"step": 1550
},
{
"epoch": 1.536627547869055,
"grad_norm": 0.1939777284860611,
"learning_rate": 2.6526900425435425e-05,
"loss": 0.6742,
"mean_token_accuracy": 0.7940390273928642,
"num_tokens": 3234097782.0,
"step": 1555
},
{
"epoch": 1.5415688696726373,
"grad_norm": 0.18386848270893097,
"learning_rate": 2.6389490679650236e-05,
"loss": 0.6782,
"mean_token_accuracy": 0.7937902882695198,
"num_tokens": 3244506053.0,
"step": 1560
},
{
"epoch": 1.5465101914762198,
"grad_norm": 0.20481400191783905,
"learning_rate": 2.625203881363334e-05,
"loss": 0.6826,
"mean_token_accuracy": 0.7955172687768937,
"num_tokens": 3254929952.0,
"step": 1565
},
{
"epoch": 1.5514515132798024,
"grad_norm": 0.17962992191314697,
"learning_rate": 2.6114548994022576e-05,
"loss": 0.6866,
"mean_token_accuracy": 0.7924586087465286,
"num_tokens": 3265334962.0,
"step": 1570
},
{
"epoch": 1.5563928350833849,
"grad_norm": 0.17282456159591675,
"learning_rate": 2.5977025388606286e-05,
"loss": 0.6717,
"mean_token_accuracy": 0.7941754475235939,
"num_tokens": 3275733927.0,
"step": 1575
},
{
"epoch": 1.5613341568869674,
"grad_norm": 0.22649475932121277,
"learning_rate": 2.5839472166196977e-05,
"loss": 0.6776,
"mean_token_accuracy": 0.7933913409709931,
"num_tokens": 3286097045.0,
"step": 1580
},
{
"epoch": 1.5662754786905497,
"grad_norm": 0.17881017923355103,
"learning_rate": 2.5701893496504953e-05,
"loss": 0.6816,
"mean_token_accuracy": 0.7914034590125084,
"num_tokens": 3296524291.0,
"step": 1585
},
{
"epoch": 1.571216800494132,
"grad_norm": 0.16787505149841309,
"learning_rate": 2.5564293550011913e-05,
"loss": 0.6684,
"mean_token_accuracy": 0.7956008955836296,
"num_tokens": 3306916064.0,
"step": 1590
},
{
"epoch": 1.5761581222977146,
"grad_norm": 0.16417014598846436,
"learning_rate": 2.5426676497844515e-05,
"loss": 0.6595,
"mean_token_accuracy": 0.7979877829551697,
"num_tokens": 3317295824.0,
"step": 1595
},
{
"epoch": 1.581099444101297,
"grad_norm": 0.1576405167579651,
"learning_rate": 2.5289046511647972e-05,
"loss": 0.664,
"mean_token_accuracy": 0.7966679364442826,
"num_tokens": 3327702026.0,
"step": 1600
},
{
"epoch": 1.5860407659048796,
"grad_norm": 0.1823538988828659,
"learning_rate": 2.515140776345956e-05,
"loss": 0.6742,
"mean_token_accuracy": 0.7936360001564026,
"num_tokens": 3338102467.0,
"step": 1605
},
{
"epoch": 1.5909820877084622,
"grad_norm": 0.17999356985092163,
"learning_rate": 2.501376442558215e-05,
"loss": 0.6707,
"mean_token_accuracy": 0.7947127357125282,
"num_tokens": 3348517918.0,
"step": 1610
},
{
"epoch": 1.5959234095120445,
"grad_norm": 0.15889112651348114,
"learning_rate": 2.4876120670457754e-05,
"loss": 0.6498,
"mean_token_accuracy": 0.8010415449738503,
"num_tokens": 3358940680.0,
"step": 1615
},
{
"epoch": 1.6008647313156268,
"grad_norm": 0.16011574864387512,
"learning_rate": 2.4738480670541024e-05,
"loss": 0.678,
"mean_token_accuracy": 0.7928107127547264,
"num_tokens": 3369326978.0,
"step": 1620
},
{
"epoch": 1.6058060531192093,
"grad_norm": 0.5964378714561462,
"learning_rate": 2.460084859817281e-05,
"loss": 0.6659,
"mean_token_accuracy": 0.7966329261660576,
"num_tokens": 3379732990.0,
"step": 1625
},
{
"epoch": 1.6107473749227919,
"grad_norm": 0.15916962921619415,
"learning_rate": 2.4463228625453607e-05,
"loss": 0.6696,
"mean_token_accuracy": 0.7948744997382164,
"num_tokens": 3390132716.0,
"step": 1630
},
{
"epoch": 1.6156886967263744,
"grad_norm": 0.17995233833789825,
"learning_rate": 2.4325624924117142e-05,
"loss": 0.6807,
"mean_token_accuracy": 0.7919593781232834,
"num_tokens": 3400548833.0,
"step": 1635
},
{
"epoch": 1.620630018529957,
"grad_norm": 0.17512169480323792,
"learning_rate": 2.4188041665403925e-05,
"loss": 0.6673,
"mean_token_accuracy": 0.7959037974476815,
"num_tokens": 3410963830.0,
"step": 1640
},
{
"epoch": 1.6255713403335392,
"grad_norm": 0.17564837634563446,
"learning_rate": 2.4050483019934737e-05,
"loss": 0.665,
"mean_token_accuracy": 0.7963264405727386,
"num_tokens": 3421337051.0,
"step": 1645
},
{
"epoch": 1.6305126621371215,
"grad_norm": 0.17777878046035767,
"learning_rate": 2.3912953157584304e-05,
"loss": 0.6726,
"mean_token_accuracy": 0.7940599545836449,
"num_tokens": 3431747692.0,
"step": 1650
},
{
"epoch": 1.635453983940704,
"grad_norm": 0.1665956974029541,
"learning_rate": 2.3775456247354765e-05,
"loss": 0.664,
"mean_token_accuracy": 0.7964100062847137,
"num_tokens": 3442105623.0,
"step": 1655
},
{
"epoch": 1.6403953057442866,
"grad_norm": 2.0348362922668457,
"learning_rate": 2.3637996457249434e-05,
"loss": 0.6786,
"mean_token_accuracy": 0.7920338585972786,
"num_tokens": 3452509019.0,
"step": 1660
},
{
"epoch": 1.6453366275478691,
"grad_norm": 0.17581728100776672,
"learning_rate": 2.3500577954146356e-05,
"loss": 0.6664,
"mean_token_accuracy": 0.795609450340271,
"num_tokens": 3462892485.0,
"step": 1665
},
{
"epoch": 1.6502779493514517,
"grad_norm": 0.16040699183940887,
"learning_rate": 2.3363204903672002e-05,
"loss": 0.6852,
"mean_token_accuracy": 0.791227325797081,
"num_tokens": 3473319700.0,
"step": 1670
},
{
"epoch": 1.655219271155034,
"grad_norm": 0.1749061942100525,
"learning_rate": 2.3225881470075075e-05,
"loss": 0.6796,
"mean_token_accuracy": 0.7921550258994102,
"num_tokens": 3483743564.0,
"step": 1675
},
{
"epoch": 1.6601605929586163,
"grad_norm": 0.17449209094047546,
"learning_rate": 2.308861181610017e-05,
"loss": 0.6701,
"mean_token_accuracy": 0.7946789160370826,
"num_tokens": 3494099528.0,
"step": 1680
},
{
"epoch": 1.6651019147621988,
"grad_norm": 0.16798865795135498,
"learning_rate": 2.2951400102861664e-05,
"loss": 0.6674,
"mean_token_accuracy": 0.795718927681446,
"num_tokens": 3504480588.0,
"step": 1685
},
{
"epoch": 1.6700432365657814,
"grad_norm": 0.16419844329357147,
"learning_rate": 2.2814250489717536e-05,
"loss": 0.6771,
"mean_token_accuracy": 0.7932738527655602,
"num_tokens": 3514901417.0,
"step": 1690
},
{
"epoch": 1.6749845583693639,
"grad_norm": 0.16027598083019257,
"learning_rate": 2.267716713414332e-05,
"loss": 0.6781,
"mean_token_accuracy": 0.7923995703458786,
"num_tokens": 3525288452.0,
"step": 1695
},
{
"epoch": 1.6799258801729464,
"grad_norm": 0.16417835652828217,
"learning_rate": 2.2540154191606028e-05,
"loss": 0.671,
"mean_token_accuracy": 0.7948248594999313,
"num_tokens": 3535694493.0,
"step": 1700
},
{
"epoch": 1.6848672019765287,
"grad_norm": 0.1824580430984497,
"learning_rate": 2.240321581543822e-05,
"loss": 0.6724,
"mean_token_accuracy": 0.794626134634018,
"num_tokens": 3546068917.0,
"step": 1705
},
{
"epoch": 1.689808523780111,
"grad_norm": 0.16408054530620575,
"learning_rate": 2.226635615671211e-05,
"loss": 0.6643,
"mean_token_accuracy": 0.7960787147283555,
"num_tokens": 3556447087.0,
"step": 1710
},
{
"epoch": 1.6947498455836936,
"grad_norm": 0.16972069442272186,
"learning_rate": 2.2129579364113692e-05,
"loss": 0.6594,
"mean_token_accuracy": 0.7978889897465706,
"num_tokens": 3566863532.0,
"step": 1715
},
{
"epoch": 1.699691167387276,
"grad_norm": 0.1671704351902008,
"learning_rate": 2.1992889583817023e-05,
"loss": 0.6655,
"mean_token_accuracy": 0.7957222029566765,
"num_tokens": 3577293405.0,
"step": 1720
},
{
"epoch": 1.7046324891908586,
"grad_norm": 0.21393845975399017,
"learning_rate": 2.1856290959358504e-05,
"loss": 0.6873,
"mean_token_accuracy": 0.793942141532898,
"num_tokens": 3587700784.0,
"step": 1725
},
{
"epoch": 1.7095738109944412,
"grad_norm": 0.3951685428619385,
"learning_rate": 2.1719787631511302e-05,
"loss": 0.6751,
"mean_token_accuracy": 0.7932202309370041,
"num_tokens": 3598073941.0,
"step": 1730
},
{
"epoch": 1.7145151327980235,
"grad_norm": 0.1711612492799759,
"learning_rate": 2.1583383738159812e-05,
"loss": 0.6762,
"mean_token_accuracy": 0.7930672898888588,
"num_tokens": 3608444988.0,
"step": 1735
},
{
"epoch": 1.7194564546016058,
"grad_norm": 0.1807597279548645,
"learning_rate": 2.1447083414174212e-05,
"loss": 0.6646,
"mean_token_accuracy": 0.7962919890880584,
"num_tokens": 3618832648.0,
"step": 1740
},
{
"epoch": 1.7243977764051883,
"grad_norm": 0.16536164283752441,
"learning_rate": 2.1310890791285168e-05,
"loss": 0.6767,
"mean_token_accuracy": 0.7927768990397454,
"num_tokens": 3629210358.0,
"step": 1745
},
{
"epoch": 1.7293390982087709,
"grad_norm": 0.17039009928703308,
"learning_rate": 2.117480999795853e-05,
"loss": 0.676,
"mean_token_accuracy": 0.7928067699074746,
"num_tokens": 3639593453.0,
"step": 1750
},
{
"epoch": 1.7342804200123534,
"grad_norm": 0.1626947671175003,
"learning_rate": 2.103884515927023e-05,
"loss": 0.675,
"mean_token_accuracy": 0.7935326501727105,
"num_tokens": 3649995994.0,
"step": 1755
},
{
"epoch": 1.739221741815936,
"grad_norm": 0.15911497175693512,
"learning_rate": 2.090300039678119e-05,
"loss": 0.6669,
"mean_token_accuracy": 0.7955508276820182,
"num_tokens": 3660372590.0,
"step": 1760
},
{
"epoch": 1.7441630636195182,
"grad_norm": 0.16413941979408264,
"learning_rate": 2.0767279828412442e-05,
"loss": 0.6708,
"mean_token_accuracy": 0.7943277865648269,
"num_tokens": 3670778469.0,
"step": 1765
},
{
"epoch": 1.7491043854231005,
"grad_norm": 0.14908325672149658,
"learning_rate": 2.0631687568320258e-05,
"loss": 0.6613,
"mean_token_accuracy": 0.7970643430948258,
"num_tokens": 3681177969.0,
"step": 1770
},
{
"epoch": 1.754045707226683,
"grad_norm": 0.16836871206760406,
"learning_rate": 2.0496227726771415e-05,
"loss": 0.6851,
"mean_token_accuracy": 0.7903355419635772,
"num_tokens": 3691608197.0,
"step": 1775
},
{
"epoch": 1.7589870290302656,
"grad_norm": 0.15650926530361176,
"learning_rate": 2.0360904410018676e-05,
"loss": 0.6669,
"mean_token_accuracy": 0.7956840336322785,
"num_tokens": 3702023632.0,
"step": 1780
},
{
"epoch": 1.7639283508338481,
"grad_norm": 0.16152553260326385,
"learning_rate": 2.0225721720176244e-05,
"loss": 0.6722,
"mean_token_accuracy": 0.7941829591989518,
"num_tokens": 3712430248.0,
"step": 1785
},
{
"epoch": 1.7688696726374307,
"grad_norm": 0.18243561685085297,
"learning_rate": 2.009068375509544e-05,
"loss": 0.675,
"mean_token_accuracy": 0.7927650153636933,
"num_tokens": 3722829050.0,
"step": 1790
},
{
"epoch": 1.773810994441013,
"grad_norm": 0.18079873919487,
"learning_rate": 1.995579460824048e-05,
"loss": 0.6764,
"mean_token_accuracy": 0.7926057055592537,
"num_tokens": 3733231074.0,
"step": 1795
},
{
"epoch": 1.7787523162445953,
"grad_norm": 0.17142578959465027,
"learning_rate": 1.982105836856441e-05,
"loss": 0.669,
"mean_token_accuracy": 0.7976492524147034,
"num_tokens": 3743604338.0,
"step": 1800
},
{
"epoch": 1.7836936380481778,
"grad_norm": 0.17926448583602905,
"learning_rate": 1.9686479120385087e-05,
"loss": 0.6609,
"mean_token_accuracy": 0.7971063464879989,
"num_tokens": 3754021152.0,
"step": 1805
},
{
"epoch": 1.7886349598517604,
"grad_norm": 0.16492047905921936,
"learning_rate": 1.9552060943261456e-05,
"loss": 0.672,
"mean_token_accuracy": 0.7941944986581803,
"num_tokens": 3764394412.0,
"step": 1810
},
{
"epoch": 1.7935762816553429,
"grad_norm": 0.1678888201713562,
"learning_rate": 1.941780791186985e-05,
"loss": 0.6674,
"mean_token_accuracy": 0.7951693952083587,
"num_tokens": 3774799854.0,
"step": 1815
},
{
"epoch": 1.7985176034589254,
"grad_norm": 0.17829807102680206,
"learning_rate": 1.928372409588043e-05,
"loss": 0.6691,
"mean_token_accuracy": 0.7948068961501121,
"num_tokens": 3785193247.0,
"step": 1820
},
{
"epoch": 1.8034589252625077,
"grad_norm": 0.16471394896507263,
"learning_rate": 1.9149813559833897e-05,
"loss": 0.6637,
"mean_token_accuracy": 0.7964715838432312,
"num_tokens": 3795565687.0,
"step": 1825
},
{
"epoch": 1.80840024706609,
"grad_norm": 0.16672302782535553,
"learning_rate": 1.9016080363018214e-05,
"loss": 0.6655,
"mean_token_accuracy": 0.7956433966755867,
"num_tokens": 3805974954.0,
"step": 1830
},
{
"epoch": 1.8133415688696726,
"grad_norm": 0.1635795533657074,
"learning_rate": 1.8882528559345604e-05,
"loss": 0.6752,
"mean_token_accuracy": 0.7931825637817382,
"num_tokens": 3816394445.0,
"step": 1835
},
{
"epoch": 1.818282890673255,
"grad_norm": 0.1563975065946579,
"learning_rate": 1.8749162197229626e-05,
"loss": 0.6736,
"mean_token_accuracy": 0.7934539332985878,
"num_tokens": 3826795756.0,
"step": 1840
},
{
"epoch": 1.8232242124768376,
"grad_norm": 0.1613382250070572,
"learning_rate": 1.8615985319462486e-05,
"loss": 0.6666,
"mean_token_accuracy": 0.7956288605928421,
"num_tokens": 3837209980.0,
"step": 1845
},
{
"epoch": 1.8281655342804202,
"grad_norm": 0.18940430879592896,
"learning_rate": 1.848300196309245e-05,
"loss": 0.675,
"mean_token_accuracy": 0.7961811035871506,
"num_tokens": 3847604542.0,
"step": 1850
},
{
"epoch": 1.8331068560840025,
"grad_norm": 0.1619659811258316,
"learning_rate": 1.8350216159301483e-05,
"loss": 0.6665,
"mean_token_accuracy": 0.7956989362835885,
"num_tokens": 3858008174.0,
"step": 1855
},
{
"epoch": 1.8380481778875848,
"grad_norm": 0.17836914956569672,
"learning_rate": 1.821763193328309e-05,
"loss": 0.6719,
"mean_token_accuracy": 0.7940696641802788,
"num_tokens": 3868405303.0,
"step": 1860
},
{
"epoch": 1.8429894996911673,
"grad_norm": 0.1670689433813095,
"learning_rate": 1.8085253304120213e-05,
"loss": 0.6555,
"mean_token_accuracy": 0.7987230405211448,
"num_tokens": 3878781192.0,
"step": 1865
},
{
"epoch": 1.8479308214947499,
"grad_norm": 0.17044250667095184,
"learning_rate": 1.7953084284663486e-05,
"loss": 0.6698,
"mean_token_accuracy": 0.794747294485569,
"num_tokens": 3889178794.0,
"step": 1870
},
{
"epoch": 1.8528721432983324,
"grad_norm": 0.16052749752998352,
"learning_rate": 1.782112888140952e-05,
"loss": 0.6691,
"mean_token_accuracy": 0.7945168077945709,
"num_tokens": 3899590027.0,
"step": 1875
},
{
"epoch": 1.857813465101915,
"grad_norm": 0.15389318764209747,
"learning_rate": 1.7689391094379534e-05,
"loss": 0.668,
"mean_token_accuracy": 0.7955880552530289,
"num_tokens": 3909974381.0,
"step": 1880
},
{
"epoch": 1.8627547869054972,
"grad_norm": 0.1830136477947235,
"learning_rate": 1.7557874916997996e-05,
"loss": 0.6871,
"mean_token_accuracy": 0.7958386242389679,
"num_tokens": 3920358396.0,
"step": 1885
},
{
"epoch": 1.8676961087090795,
"grad_norm": 0.14622418582439423,
"learning_rate": 1.7426584335971658e-05,
"loss": 0.6667,
"mean_token_accuracy": 0.7957353934645652,
"num_tokens": 3930754434.0,
"step": 1890
},
{
"epoch": 1.872637430512662,
"grad_norm": 0.15185917913913727,
"learning_rate": 1.7295523331168673e-05,
"loss": 0.6626,
"mean_token_accuracy": 0.7965810731053352,
"num_tokens": 3941174197.0,
"step": 1895
},
{
"epoch": 1.8775787523162446,
"grad_norm": 0.1555299609899521,
"learning_rate": 1.7164695875497928e-05,
"loss": 0.6641,
"mean_token_accuracy": 0.7962276056408882,
"num_tokens": 3951578601.0,
"step": 1900
},
{
"epoch": 1.8825200741198271,
"grad_norm": 0.1599041372537613,
"learning_rate": 1.703410593478867e-05,
"loss": 0.6737,
"mean_token_accuracy": 0.793473419547081,
"num_tokens": 3961967546.0,
"step": 1905
},
{
"epoch": 1.8874613959234097,
"grad_norm": 0.15561096370220184,
"learning_rate": 1.6903757467670215e-05,
"loss": 0.6707,
"mean_token_accuracy": 0.7944751814007759,
"num_tokens": 3972373879.0,
"step": 1910
},
{
"epoch": 1.892402717726992,
"grad_norm": 0.14810524880886078,
"learning_rate": 1.6773654425452007e-05,
"loss": 0.6618,
"mean_token_accuracy": 0.7968007609248161,
"num_tokens": 3982774226.0,
"step": 1915
},
{
"epoch": 1.8973440395305743,
"grad_norm": 0.15639038383960724,
"learning_rate": 1.6643800752003824e-05,
"loss": 0.6709,
"mean_token_accuracy": 0.7942842915654182,
"num_tokens": 3993189170.0,
"step": 1920
},
{
"epoch": 1.9022853613341568,
"grad_norm": 0.15391488373279572,
"learning_rate": 1.6514200383636192e-05,
"loss": 0.6662,
"mean_token_accuracy": 0.7959662467241287,
"num_tokens": 4003600898.0,
"step": 1925
},
{
"epoch": 1.9072266831377394,
"grad_norm": 0.15802597999572754,
"learning_rate": 1.638485724898112e-05,
"loss": 0.662,
"mean_token_accuracy": 0.7968731462955475,
"num_tokens": 4013990867.0,
"step": 1930
},
{
"epoch": 1.9121680049413219,
"grad_norm": 0.14575393497943878,
"learning_rate": 1.6255775268872963e-05,
"loss": 0.66,
"mean_token_accuracy": 0.797378021478653,
"num_tokens": 4024405781.0,
"step": 1935
},
{
"epoch": 1.9171093267449044,
"grad_norm": 0.17408694326877594,
"learning_rate": 1.6126958356229604e-05,
"loss": 0.6541,
"mean_token_accuracy": 0.7994588032364845,
"num_tokens": 4034815424.0,
"step": 1940
},
{
"epoch": 1.9220506485484867,
"grad_norm": 0.16843891143798828,
"learning_rate": 1.5998410415933794e-05,
"loss": 0.6704,
"mean_token_accuracy": 0.7947205558419228,
"num_tokens": 4045217255.0,
"step": 1945
},
{
"epoch": 1.926991970352069,
"grad_norm": 0.16394077241420746,
"learning_rate": 1.587013534471485e-05,
"loss": 0.6638,
"mean_token_accuracy": 0.7960691630840302,
"num_tokens": 4055614692.0,
"step": 1950
},
{
"epoch": 1.9319332921556516,
"grad_norm": 1.020789623260498,
"learning_rate": 1.5742137031030436e-05,
"loss": 0.6735,
"mean_token_accuracy": 0.7944559305906296,
"num_tokens": 4066027413.0,
"step": 1955
},
{
"epoch": 1.936874613959234,
"grad_norm": 0.22389452159404755,
"learning_rate": 1.5614419354948783e-05,
"loss": 0.6778,
"mean_token_accuracy": 0.7925828203558922,
"num_tokens": 4076425169.0,
"step": 1960
},
{
"epoch": 1.9418159357628166,
"grad_norm": 0.17207863926887512,
"learning_rate": 1.548698618803104e-05,
"loss": 0.6627,
"mean_token_accuracy": 0.7969025999307633,
"num_tokens": 4086832157.0,
"step": 1965
},
{
"epoch": 1.9467572575663992,
"grad_norm": 0.16408152878284454,
"learning_rate": 1.535984139321386e-05,
"loss": 0.6611,
"mean_token_accuracy": 0.7972674682736397,
"num_tokens": 4097246660.0,
"step": 1970
},
{
"epoch": 1.9516985793699815,
"grad_norm": 0.1568727344274521,
"learning_rate": 1.5232988824692406e-05,
"loss": 0.6543,
"mean_token_accuracy": 0.7987544611096382,
"num_tokens": 4107628644.0,
"step": 1975
},
{
"epoch": 1.9566399011735638,
"grad_norm": 0.14633381366729736,
"learning_rate": 1.5106432327803417e-05,
"loss": 0.6626,
"mean_token_accuracy": 0.7965556159615517,
"num_tokens": 4118038279.0,
"step": 1980
},
{
"epoch": 1.9615812229771463,
"grad_norm": 0.17039474844932556,
"learning_rate": 1.4980175738908711e-05,
"loss": 0.6698,
"mean_token_accuracy": 0.79447071403265,
"num_tokens": 4128440012.0,
"step": 1985
},
{
"epoch": 1.9665225447807289,
"grad_norm": 0.17251941561698914,
"learning_rate": 1.4854222885278842e-05,
"loss": 0.7007,
"mean_token_accuracy": 0.7944662183523178,
"num_tokens": 4138816871.0,
"step": 1990
},
{
"epoch": 1.9714638665843114,
"grad_norm": 0.17853756248950958,
"learning_rate": 1.4728577584977118e-05,
"loss": 0.673,
"mean_token_accuracy": 0.793654565513134,
"num_tokens": 4149219587.0,
"step": 1995
},
{
"epoch": 1.976405188387894,
"grad_norm": 0.15176187455654144,
"learning_rate": 1.4603243646743859e-05,
"loss": 0.6587,
"mean_token_accuracy": 0.797548696398735,
"num_tokens": 4159615480.0,
"step": 2000
},
{
"epoch": 1.9813465101914762,
"grad_norm": 0.14701396226882935,
"learning_rate": 1.4478224869880908e-05,
"loss": 0.6653,
"mean_token_accuracy": 0.7960563406348229,
"num_tokens": 4170006610.0,
"step": 2005
},
{
"epoch": 1.9862878319950585,
"grad_norm": 0.1599283367395401,
"learning_rate": 1.4353525044136514e-05,
"loss": 0.6673,
"mean_token_accuracy": 0.7951024606823921,
"num_tokens": 4180405937.0,
"step": 2010
},
{
"epoch": 1.991229153798641,
"grad_norm": 0.16183249652385712,
"learning_rate": 1.4229147949590393e-05,
"loss": 0.6651,
"mean_token_accuracy": 0.7957586273550987,
"num_tokens": 4190804735.0,
"step": 2015
},
{
"epoch": 1.9961704756022236,
"grad_norm": 0.1604955494403839,
"learning_rate": 1.4105097356539203e-05,
"loss": 0.6761,
"mean_token_accuracy": 0.794134946167469,
"num_tokens": 4201192556.0,
"step": 2020
},
{
"epoch": 2.0009882643607164,
"grad_norm": 0.2795203924179077,
"learning_rate": 1.3981377025382186e-05,
"loss": 0.665,
"mean_token_accuracy": 0.7965775315578167,
"num_tokens": 4211344649.0,
"step": 2025
},
{
"epoch": 2.005929586164299,
"grad_norm": 0.19513043761253357,
"learning_rate": 1.385799070650724e-05,
"loss": 0.6361,
"mean_token_accuracy": 0.8029673993587494,
"num_tokens": 4221754280.0,
"step": 2030
},
{
"epoch": 2.0108709079678815,
"grad_norm": 0.18438509106636047,
"learning_rate": 1.3734942140177201e-05,
"loss": 0.6424,
"mean_token_accuracy": 0.8008751258254051,
"num_tokens": 4232151391.0,
"step": 2035
},
{
"epoch": 2.015812229771464,
"grad_norm": 0.18968532979488373,
"learning_rate": 1.3612235056416442e-05,
"loss": 0.6534,
"mean_token_accuracy": 0.8006501421332359,
"num_tokens": 4242553642.0,
"step": 2040
},
{
"epoch": 2.020753551575046,
"grad_norm": 0.1811179518699646,
"learning_rate": 1.3489873174897862e-05,
"loss": 0.647,
"mean_token_accuracy": 0.7995884954929352,
"num_tokens": 4252982344.0,
"step": 2045
},
{
"epoch": 2.0256948733786286,
"grad_norm": 0.17677195370197296,
"learning_rate": 1.3367860204830063e-05,
"loss": 0.6405,
"mean_token_accuracy": 0.803388424217701,
"num_tokens": 4263377952.0,
"step": 2050
},
{
"epoch": 2.030636195182211,
"grad_norm": 0.1655615121126175,
"learning_rate": 1.3246199844844964e-05,
"loss": 0.6291,
"mean_token_accuracy": 0.8047855436801911,
"num_tokens": 4273790892.0,
"step": 2055
},
{
"epoch": 2.0355775169857937,
"grad_norm": 0.16419780254364014,
"learning_rate": 1.3124895782885668e-05,
"loss": 0.6324,
"mean_token_accuracy": 0.8037776455283165,
"num_tokens": 4284212585.0,
"step": 2060
},
{
"epoch": 2.0405188387893762,
"grad_norm": 0.18495163321495056,
"learning_rate": 1.300395169609463e-05,
"loss": 0.6434,
"mean_token_accuracy": 0.8021784752607346,
"num_tokens": 4294596888.0,
"step": 2065
},
{
"epoch": 2.0454601605929588,
"grad_norm": 0.19491487741470337,
"learning_rate": 1.2883371250702264e-05,
"loss": 0.6454,
"mean_token_accuracy": 0.802114674448967,
"num_tokens": 4305018504.0,
"step": 2070
},
{
"epoch": 2.0504014823965413,
"grad_norm": 0.1656065136194229,
"learning_rate": 1.2763158101915718e-05,
"loss": 0.6697,
"mean_token_accuracy": 0.7992842480540275,
"num_tokens": 4315416503.0,
"step": 2075
},
{
"epoch": 2.0553428042001234,
"grad_norm": 0.17609845101833344,
"learning_rate": 1.2643315893808172e-05,
"loss": 0.631,
"mean_token_accuracy": 0.8045080795884132,
"num_tokens": 4325842711.0,
"step": 2080
},
{
"epoch": 2.060284126003706,
"grad_norm": 0.16532674431800842,
"learning_rate": 1.252384825920827e-05,
"loss": 0.63,
"mean_token_accuracy": 0.8041578352451324,
"num_tokens": 4336237520.0,
"step": 2085
},
{
"epoch": 2.0652254478072885,
"grad_norm": 0.16881072521209717,
"learning_rate": 1.240475881959008e-05,
"loss": 0.6342,
"mean_token_accuracy": 0.8036715492606163,
"num_tokens": 4346647661.0,
"step": 2090
},
{
"epoch": 2.070166769610871,
"grad_norm": 0.1476944535970688,
"learning_rate": 1.2286051184963273e-05,
"loss": 0.6305,
"mean_token_accuracy": 0.8041099295020103,
"num_tokens": 4357023546.0,
"step": 2095
},
{
"epoch": 2.0751080914144535,
"grad_norm": 0.15514177083969116,
"learning_rate": 1.2167728953763714e-05,
"loss": 0.6311,
"mean_token_accuracy": 0.8037099212408065,
"num_tokens": 4367426888.0,
"step": 2100
},
{
"epoch": 2.0800494132180356,
"grad_norm": 0.16117146611213684,
"learning_rate": 1.2049795712744336e-05,
"loss": 0.6312,
"mean_token_accuracy": 0.8039769634604454,
"num_tokens": 4377836621.0,
"step": 2105
},
{
"epoch": 2.084990735021618,
"grad_norm": 0.15188126266002655,
"learning_rate": 1.1932255036866458e-05,
"loss": 0.6323,
"mean_token_accuracy": 0.8038640111684799,
"num_tokens": 4388192404.0,
"step": 2110
},
{
"epoch": 2.0899320568252007,
"grad_norm": 0.15676775574684143,
"learning_rate": 1.181511048919141e-05,
"loss": 0.6466,
"mean_token_accuracy": 0.799469843506813,
"num_tokens": 4398610857.0,
"step": 2115
},
{
"epoch": 2.094873378628783,
"grad_norm": 0.2041247934103012,
"learning_rate": 1.1698365620772523e-05,
"loss": 0.6429,
"mean_token_accuracy": 0.800723274052143,
"num_tokens": 4409031770.0,
"step": 2120
},
{
"epoch": 2.0998147004323657,
"grad_norm": 0.1508302092552185,
"learning_rate": 1.1582023970547464e-05,
"loss": 0.6307,
"mean_token_accuracy": 0.8041250929236412,
"num_tokens": 4419434453.0,
"step": 2125
},
{
"epoch": 2.1047560222359483,
"grad_norm": 0.16123157739639282,
"learning_rate": 1.1466089065230968e-05,
"loss": 0.6328,
"mean_token_accuracy": 0.8038171142339706,
"num_tokens": 4429862339.0,
"step": 2130
},
{
"epoch": 2.109697344039531,
"grad_norm": 0.15946310758590698,
"learning_rate": 1.1350564419207953e-05,
"loss": 0.6439,
"mean_token_accuracy": 0.8009535774588585,
"num_tokens": 4440241343.0,
"step": 2135
},
{
"epoch": 2.114638665843113,
"grad_norm": 0.15749602019786835,
"learning_rate": 1.123545353442696e-05,
"loss": 0.6322,
"mean_token_accuracy": 0.8037322282791137,
"num_tokens": 4450633333.0,
"step": 2140
},
{
"epoch": 2.1195799876466954,
"grad_norm": 1.4616570472717285,
"learning_rate": 1.112075990029398e-05,
"loss": 0.652,
"mean_token_accuracy": 0.8011793598532677,
"num_tokens": 4461024621.0,
"step": 2145
},
{
"epoch": 2.124521309450278,
"grad_norm": 2.4609215259552,
"learning_rate": 1.1006486993566774e-05,
"loss": 0.6475,
"mean_token_accuracy": 0.8001800090074539,
"num_tokens": 4471419630.0,
"step": 2150
},
{
"epoch": 2.1294626312538605,
"grad_norm": 0.15085525810718536,
"learning_rate": 1.089263827824934e-05,
"loss": 0.654,
"mean_token_accuracy": 0.7973979458212852,
"num_tokens": 4481813205.0,
"step": 2155
},
{
"epoch": 2.134403953057443,
"grad_norm": 0.16060177981853485,
"learning_rate": 1.0779217205487025e-05,
"loss": 0.6337,
"mean_token_accuracy": 0.8035554558038711,
"num_tokens": 4492232981.0,
"step": 2160
},
{
"epoch": 2.139345274861025,
"grad_norm": 0.14307548105716705,
"learning_rate": 1.0666227213461827e-05,
"loss": 0.6278,
"mean_token_accuracy": 0.8049931466579437,
"num_tokens": 4502573601.0,
"step": 2165
},
{
"epoch": 2.1442865966646076,
"grad_norm": 0.1611761599779129,
"learning_rate": 1.0553671727288243e-05,
"loss": 0.6347,
"mean_token_accuracy": 0.8029251515865325,
"num_tokens": 4512976405.0,
"step": 2170
},
{
"epoch": 2.14922791846819,
"grad_norm": 0.17778734862804413,
"learning_rate": 1.044155415890937e-05,
"loss": 0.6493,
"mean_token_accuracy": 0.8001389935612678,
"num_tokens": 4523382809.0,
"step": 2175
},
{
"epoch": 2.1541692402717727,
"grad_norm": 0.15306136012077332,
"learning_rate": 1.0329877906993537e-05,
"loss": 0.6389,
"mean_token_accuracy": 0.8018682345747947,
"num_tokens": 4533792085.0,
"step": 2180
},
{
"epoch": 2.1591105620753552,
"grad_norm": 0.15490379929542542,
"learning_rate": 1.0218646356831269e-05,
"loss": 0.638,
"mean_token_accuracy": 0.8019036680459977,
"num_tokens": 4544217327.0,
"step": 2185
},
{
"epoch": 2.1640518838789378,
"grad_norm": 0.16301098465919495,
"learning_rate": 1.0107862880232608e-05,
"loss": 0.636,
"mean_token_accuracy": 0.803830087184906,
"num_tokens": 4554626069.0,
"step": 2190
},
{
"epoch": 2.1689932056825203,
"grad_norm": 0.15061938762664795,
"learning_rate": 9.997530835425e-06,
"loss": 0.6367,
"mean_token_accuracy": 0.8023175925016404,
"num_tokens": 4565012403.0,
"step": 2195
},
{
"epoch": 2.1739345274861024,
"grad_norm": 0.15271735191345215,
"learning_rate": 9.887653566951405e-06,
"loss": 0.6386,
"mean_token_accuracy": 0.8026603817939758,
"num_tokens": 4575390772.0,
"step": 2200
},
{
"epoch": 2.178875849289685,
"grad_norm": 0.16427479684352875,
"learning_rate": 9.778234405568972e-06,
"loss": 0.6448,
"mean_token_accuracy": 0.8001790478825569,
"num_tokens": 4585791161.0,
"step": 2205
},
{
"epoch": 2.1838171710932675,
"grad_norm": 0.14360326528549194,
"learning_rate": 9.669276668148056e-06,
"loss": 0.6203,
"mean_token_accuracy": 0.8070179298520088,
"num_tokens": 4596212247.0,
"step": 2210
},
{
"epoch": 2.18875849289685,
"grad_norm": 0.15808238089084625,
"learning_rate": 9.560783657571642e-06,
"loss": 0.6616,
"mean_token_accuracy": 0.7990046426653862,
"num_tokens": 4606596210.0,
"step": 2215
},
{
"epoch": 2.1936998147004325,
"grad_norm": 0.14959020912647247,
"learning_rate": 9.452758662635283e-06,
"loss": 0.6398,
"mean_token_accuracy": 0.801710894703865,
"num_tokens": 4616980639.0,
"step": 2220
},
{
"epoch": 2.1986411365040146,
"grad_norm": 0.15106073021888733,
"learning_rate": 9.34520495794734e-06,
"loss": 0.6333,
"mean_token_accuracy": 0.803425170481205,
"num_tokens": 4627395951.0,
"step": 2225
},
{
"epoch": 2.203582458307597,
"grad_norm": 0.1476610153913498,
"learning_rate": 9.238125803829775e-06,
"loss": 0.6412,
"mean_token_accuracy": 0.800981068611145,
"num_tokens": 4637804875.0,
"step": 2230
},
{
"epoch": 2.2085237801111797,
"grad_norm": 0.1471738964319229,
"learning_rate": 9.131524446219272e-06,
"loss": 0.6258,
"mean_token_accuracy": 0.8057189077138901,
"num_tokens": 4648226582.0,
"step": 2235
},
{
"epoch": 2.213465101914762,
"grad_norm": 0.14815585315227509,
"learning_rate": 9.025404116568872e-06,
"loss": 0.6284,
"mean_token_accuracy": 0.8047678738832473,
"num_tokens": 4658619856.0,
"step": 2240
},
{
"epoch": 2.2184064237183447,
"grad_norm": 0.15571852028369904,
"learning_rate": 8.919768031750025e-06,
"loss": 0.645,
"mean_token_accuracy": 0.7999783381819725,
"num_tokens": 4669035963.0,
"step": 2245
},
{
"epoch": 2.2233477455219273,
"grad_norm": 0.3053502142429352,
"learning_rate": 8.814619393955023e-06,
"loss": 0.6337,
"mean_token_accuracy": 0.8034562259912491,
"num_tokens": 4679422184.0,
"step": 2250
},
{
"epoch": 2.22828906732551,
"grad_norm": 0.14345481991767883,
"learning_rate": 8.709961390599997e-06,
"loss": 0.6421,
"mean_token_accuracy": 0.8010098516941071,
"num_tokens": 4689831886.0,
"step": 2255
},
{
"epoch": 2.233230389129092,
"grad_norm": 0.14696592092514038,
"learning_rate": 8.605797194228234e-06,
"loss": 0.66,
"mean_token_accuracy": 0.7998679220676422,
"num_tokens": 4700259043.0,
"step": 2260
},
{
"epoch": 2.2381717109326744,
"grad_norm": 0.14800859987735748,
"learning_rate": 8.502129962414068e-06,
"loss": 0.6385,
"mean_token_accuracy": 0.8022643268108368,
"num_tokens": 4710648425.0,
"step": 2265
},
{
"epoch": 2.243113032736257,
"grad_norm": 3.7514922618865967,
"learning_rate": 8.39896283766711e-06,
"loss": 0.6549,
"mean_token_accuracy": 0.8020106881856919,
"num_tokens": 4721011109.0,
"step": 2270
},
{
"epoch": 2.2480543545398395,
"grad_norm": 0.14842398464679718,
"learning_rate": 8.296298947337029e-06,
"loss": 0.6297,
"mean_token_accuracy": 0.8041480585932732,
"num_tokens": 4731421318.0,
"step": 2275
},
{
"epoch": 2.252995676343422,
"grad_norm": 0.14325624704360962,
"learning_rate": 8.194141403518709e-06,
"loss": 0.6421,
"mean_token_accuracy": 0.8011141166090965,
"num_tokens": 4741847592.0,
"step": 2280
},
{
"epoch": 2.257936998147004,
"grad_norm": 0.1525031179189682,
"learning_rate": 8.092493302957935e-06,
"loss": 0.6503,
"mean_token_accuracy": 0.800957977771759,
"num_tokens": 4752234243.0,
"step": 2285
},
{
"epoch": 2.2628783199505866,
"grad_norm": 0.14679549634456635,
"learning_rate": 7.991357726957542e-06,
"loss": 0.6405,
"mean_token_accuracy": 0.8013312935829162,
"num_tokens": 4762670151.0,
"step": 2290
},
{
"epoch": 2.267819641754169,
"grad_norm": 0.14465415477752686,
"learning_rate": 7.890737741283952e-06,
"loss": 0.6334,
"mean_token_accuracy": 0.8034225985407829,
"num_tokens": 4773051063.0,
"step": 2295
},
{
"epoch": 2.2727609635577517,
"grad_norm": 0.15005916357040405,
"learning_rate": 7.790636396074308e-06,
"loss": 0.6363,
"mean_token_accuracy": 0.8025058448314667,
"num_tokens": 4783445371.0,
"step": 2300
},
{
"epoch": 2.2777022853613342,
"grad_norm": 0.14713406562805176,
"learning_rate": 7.691056725743958e-06,
"loss": 0.6377,
"mean_token_accuracy": 0.8021243140101433,
"num_tokens": 4793870222.0,
"step": 2305
},
{
"epoch": 2.2826436071649168,
"grad_norm": 1.410751461982727,
"learning_rate": 7.5920017488945145e-06,
"loss": 0.6391,
"mean_token_accuracy": 0.8034495621919632,
"num_tokens": 4804277953.0,
"step": 2310
},
{
"epoch": 2.2875849289684993,
"grad_norm": 0.14937101304531097,
"learning_rate": 7.4934744682223085e-06,
"loss": 0.6341,
"mean_token_accuracy": 0.8032669469714164,
"num_tokens": 4814689141.0,
"step": 2315
},
{
"epoch": 2.2925262507720814,
"grad_norm": 0.1451476365327835,
"learning_rate": 7.395477870427387e-06,
"loss": 0.6317,
"mean_token_accuracy": 0.8038819208741188,
"num_tokens": 4825078906.0,
"step": 2320
},
{
"epoch": 2.297467572575664,
"grad_norm": 0.14649330079555511,
"learning_rate": 7.2980149261229955e-06,
"loss": 0.6317,
"mean_token_accuracy": 0.8046122491359711,
"num_tokens": 4835469304.0,
"step": 2325
},
{
"epoch": 2.3024088943792465,
"grad_norm": 0.14319194853305817,
"learning_rate": 7.201088589745503e-06,
"loss": 0.6398,
"mean_token_accuracy": 0.8018277272582054,
"num_tokens": 4845886048.0,
"step": 2330
},
{
"epoch": 2.307350216182829,
"grad_norm": 0.15091899037361145,
"learning_rate": 7.104701799464855e-06,
"loss": 0.638,
"mean_token_accuracy": 0.8017465516924858,
"num_tokens": 4856217841.0,
"step": 2335
},
{
"epoch": 2.3122915379864115,
"grad_norm": 0.14482976496219635,
"learning_rate": 7.0088574770954874e-06,
"loss": 0.636,
"mean_token_accuracy": 0.8024977222084999,
"num_tokens": 4866616897.0,
"step": 2340
},
{
"epoch": 2.3172328597899936,
"grad_norm": 0.15366806089878082,
"learning_rate": 6.913558528007791e-06,
"loss": 0.6251,
"mean_token_accuracy": 0.8055261209607124,
"num_tokens": 4876985578.0,
"step": 2345
},
{
"epoch": 2.322174181593576,
"grad_norm": 0.16261132061481476,
"learning_rate": 6.818807841040001e-06,
"loss": 0.6347,
"mean_token_accuracy": 0.802975694835186,
"num_tokens": 4887408783.0,
"step": 2350
},
{
"epoch": 2.3271155033971587,
"grad_norm": 0.141385018825531,
"learning_rate": 6.724608288410661e-06,
"loss": 0.6283,
"mean_token_accuracy": 0.8047755777835846,
"num_tokens": 4897810849.0,
"step": 2355
},
{
"epoch": 2.332056825200741,
"grad_norm": 0.14989116787910461,
"learning_rate": 6.630962725631543e-06,
"loss": 0.6246,
"mean_token_accuracy": 0.8062204629182815,
"num_tokens": 4908237163.0,
"step": 2360
},
{
"epoch": 2.3369981470043237,
"grad_norm": 0.1412249654531479,
"learning_rate": 6.537873991421068e-06,
"loss": 0.6255,
"mean_token_accuracy": 0.8058675542473793,
"num_tokens": 4918628428.0,
"step": 2365
},
{
"epoch": 2.3419394688079063,
"grad_norm": 0.14880429208278656,
"learning_rate": 6.4453449076182946e-06,
"loss": 0.6306,
"mean_token_accuracy": 0.8041801512241363,
"num_tokens": 4929027480.0,
"step": 2370
},
{
"epoch": 2.346880790611489,
"grad_norm": 0.41716793179512024,
"learning_rate": 6.35337827909733e-06,
"loss": 0.6426,
"mean_token_accuracy": 0.8006162449717522,
"num_tokens": 4939428046.0,
"step": 2375
},
{
"epoch": 2.351822112415071,
"grad_norm": 0.14712265133857727,
"learning_rate": 6.2619768936823616e-06,
"loss": 0.6429,
"mean_token_accuracy": 0.8013240069150924,
"num_tokens": 4949815912.0,
"step": 2380
},
{
"epoch": 2.3567634342186534,
"grad_norm": 0.17049653828144073,
"learning_rate": 6.171143522063089e-06,
"loss": 0.6254,
"mean_token_accuracy": 0.8054996937513351,
"num_tokens": 4960242136.0,
"step": 2385
},
{
"epoch": 2.361704756022236,
"grad_norm": 0.14510297775268555,
"learning_rate": 6.08088091771078e-06,
"loss": 0.6348,
"mean_token_accuracy": 0.803258067369461,
"num_tokens": 4970639720.0,
"step": 2390
},
{
"epoch": 2.3666460778258185,
"grad_norm": 2.2291486263275146,
"learning_rate": 5.991191816794794e-06,
"loss": 0.6436,
"mean_token_accuracy": 0.8042269602417946,
"num_tokens": 4981043766.0,
"step": 2395
},
{
"epoch": 2.371587399629401,
"grad_norm": 4.8146796226501465,
"learning_rate": 5.902078938099611e-06,
"loss": 0.6346,
"mean_token_accuracy": 0.8034033760428428,
"num_tokens": 4991455153.0,
"step": 2400
},
{
"epoch": 2.376528721432983,
"grad_norm": 0.14338742196559906,
"learning_rate": 5.813544982942465e-06,
"loss": 0.6359,
"mean_token_accuracy": 0.8027326971292496,
"num_tokens": 5001859050.0,
"step": 2405
},
{
"epoch": 2.3814700432365656,
"grad_norm": 0.13393332064151764,
"learning_rate": 5.725592635091398e-06,
"loss": 0.6301,
"mean_token_accuracy": 0.8042582124471664,
"num_tokens": 5012262599.0,
"step": 2410
},
{
"epoch": 2.386411365040148,
"grad_norm": 0.14737042784690857,
"learning_rate": 5.638224560683966e-06,
"loss": 0.64,
"mean_token_accuracy": 0.8013565048575402,
"num_tokens": 5022644887.0,
"step": 2415
},
{
"epoch": 2.3913526868437307,
"grad_norm": 0.14444060623645782,
"learning_rate": 5.5514434081463815e-06,
"loss": 0.6318,
"mean_token_accuracy": 0.8037166804075241,
"num_tokens": 5033062786.0,
"step": 2420
},
{
"epoch": 2.3962940086473132,
"grad_norm": 0.14431585371494293,
"learning_rate": 5.465251808113247e-06,
"loss": 0.6387,
"mean_token_accuracy": 0.8019110590219498,
"num_tokens": 5043484013.0,
"step": 2425
},
{
"epoch": 2.4012353304508958,
"grad_norm": 0.14653638005256653,
"learning_rate": 5.379652373347793e-06,
"loss": 0.6364,
"mean_token_accuracy": 0.8026723086833953,
"num_tokens": 5053900530.0,
"step": 2430
},
{
"epoch": 2.4061766522544783,
"grad_norm": 0.1418423056602478,
"learning_rate": 5.294647698662686e-06,
"loss": 0.6325,
"mean_token_accuracy": 0.8036406919360161,
"num_tokens": 5064311977.0,
"step": 2435
},
{
"epoch": 2.4111179740580604,
"grad_norm": 0.13966286182403564,
"learning_rate": 5.210240360841392e-06,
"loss": 0.6395,
"mean_token_accuracy": 0.8020109251141548,
"num_tokens": 5074703925.0,
"step": 2440
},
{
"epoch": 2.416059295861643,
"grad_norm": 0.14265727996826172,
"learning_rate": 5.1264329185600285e-06,
"loss": 0.6488,
"mean_token_accuracy": 0.7987211719155312,
"num_tokens": 5085097312.0,
"step": 2445
},
{
"epoch": 2.4210006176652255,
"grad_norm": 0.1417674571275711,
"learning_rate": 5.0432279123098284e-06,
"loss": 0.624,
"mean_token_accuracy": 0.8060004249215126,
"num_tokens": 5095474470.0,
"step": 2450
},
{
"epoch": 2.425941939468808,
"grad_norm": 0.14316117763519287,
"learning_rate": 4.960627864320122e-06,
"loss": 0.6447,
"mean_token_accuracy": 0.8003643557429314,
"num_tokens": 5105827653.0,
"step": 2455
},
{
"epoch": 2.4308832612723905,
"grad_norm": 0.14158298075199127,
"learning_rate": 4.87863527848188e-06,
"loss": 0.6327,
"mean_token_accuracy": 0.8037482813000679,
"num_tokens": 5116224584.0,
"step": 2460
},
{
"epoch": 2.4358245830759726,
"grad_norm": 0.14278945326805115,
"learning_rate": 4.797252640271802e-06,
"loss": 0.6325,
"mean_token_accuracy": 0.8035951778292656,
"num_tokens": 5126633656.0,
"step": 2465
},
{
"epoch": 2.440765904879555,
"grad_norm": 0.1477539986371994,
"learning_rate": 4.7164824166769735e-06,
"loss": 0.6496,
"mean_token_accuracy": 0.7990917310118675,
"num_tokens": 5137002575.0,
"step": 2470
},
{
"epoch": 2.4457072266831377,
"grad_norm": 0.1400897353887558,
"learning_rate": 4.6363270561201185e-06,
"loss": 0.629,
"mean_token_accuracy": 0.8048480406403542,
"num_tokens": 5147386449.0,
"step": 2475
},
{
"epoch": 2.45064854848672,
"grad_norm": 0.14131523668766022,
"learning_rate": 4.556788988385327e-06,
"loss": 0.6477,
"mean_token_accuracy": 0.799259965121746,
"num_tokens": 5157825534.0,
"step": 2480
},
{
"epoch": 2.4555898702903027,
"grad_norm": 0.1450231820344925,
"learning_rate": 4.4778706245444475e-06,
"loss": 0.6351,
"mean_token_accuracy": 0.803033995628357,
"num_tokens": 5168222793.0,
"step": 2485
},
{
"epoch": 2.4605311920938853,
"grad_norm": 0.14175313711166382,
"learning_rate": 4.399574356883946e-06,
"loss": 0.6414,
"mean_token_accuracy": 0.8010082706809044,
"num_tokens": 5178625053.0,
"step": 2490
},
{
"epoch": 2.465472513897468,
"grad_norm": 0.1430099755525589,
"learning_rate": 4.32190255883245e-06,
"loss": 0.6361,
"mean_token_accuracy": 0.8026632949709892,
"num_tokens": 5189044012.0,
"step": 2495
},
{
"epoch": 2.47041383570105,
"grad_norm": 0.14343580603599548,
"learning_rate": 4.244857584888748e-06,
"loss": 0.6301,
"mean_token_accuracy": 0.8043952465057373,
"num_tokens": 5199460528.0,
"step": 2500
},
{
"epoch": 2.4753551575046324,
"grad_norm": 0.14571450650691986,
"learning_rate": 4.168441770550438e-06,
"loss": 0.6353,
"mean_token_accuracy": 0.8028825014829636,
"num_tokens": 5209861437.0,
"step": 2505
},
{
"epoch": 2.480296479308215,
"grad_norm": 0.13535760343074799,
"learning_rate": 4.092657432243144e-06,
"loss": 0.6195,
"mean_token_accuracy": 0.807587580382824,
"num_tokens": 5220231445.0,
"step": 2510
},
{
"epoch": 2.4852378011117975,
"grad_norm": 0.14562451839447021,
"learning_rate": 4.0175068672502784e-06,
"loss": 0.6306,
"mean_token_accuracy": 0.8039803951978683,
"num_tokens": 5230632565.0,
"step": 2515
},
{
"epoch": 2.49017912291538,
"grad_norm": 0.14036355912685394,
"learning_rate": 3.942992353643415e-06,
"loss": 0.6255,
"mean_token_accuracy": 0.8056451350450515,
"num_tokens": 5241008768.0,
"step": 2520
},
{
"epoch": 2.495120444718962,
"grad_norm": 0.14074988663196564,
"learning_rate": 3.869116150213212e-06,
"loss": 0.6373,
"mean_token_accuracy": 0.8024703413248062,
"num_tokens": 5251408473.0,
"step": 2525
},
{
"epoch": 2.5000617665225446,
"grad_norm": 0.1382495015859604,
"learning_rate": 3.7958804964009692e-06,
"loss": 0.6151,
"mean_token_accuracy": 0.8083513364195823,
"num_tokens": 5261777092.0,
"step": 2530
},
{
"epoch": 2.505003088326127,
"grad_norm": 0.1404552012681961,
"learning_rate": 3.7232876122307165e-06,
"loss": 0.6267,
"mean_token_accuracy": 0.8052102610468864,
"num_tokens": 5272171725.0,
"step": 2535
},
{
"epoch": 2.5099444101297097,
"grad_norm": 0.13998018205165863,
"learning_rate": 3.651339698241943e-06,
"loss": 0.6342,
"mean_token_accuracy": 0.8030717715620994,
"num_tokens": 5282583777.0,
"step": 2540
},
{
"epoch": 2.5148857319332922,
"grad_norm": 0.1403067409992218,
"learning_rate": 3.5800389354228748e-06,
"loss": 0.6311,
"mean_token_accuracy": 0.8037037447094917,
"num_tokens": 5292979910.0,
"step": 2545
},
{
"epoch": 2.5198270537368748,
"grad_norm": 0.1378946155309677,
"learning_rate": 3.5093874851443497e-06,
"loss": 0.6392,
"mean_token_accuracy": 0.8019730687141419,
"num_tokens": 5303396667.0,
"step": 2550
},
{
"epoch": 2.5247683755404573,
"grad_norm": 0.13942013680934906,
"learning_rate": 3.4393874890943424e-06,
"loss": 0.6419,
"mean_token_accuracy": 0.8008860290050507,
"num_tokens": 5313787259.0,
"step": 2555
},
{
"epoch": 2.5297096973440394,
"grad_norm": 0.14019039273262024,
"learning_rate": 3.3700410692129815e-06,
"loss": 0.6458,
"mean_token_accuracy": 0.79988262206316,
"num_tokens": 5324192735.0,
"step": 2560
},
{
"epoch": 2.534651019147622,
"grad_norm": 0.14086246490478516,
"learning_rate": 3.3013503276282805e-06,
"loss": 0.621,
"mean_token_accuracy": 0.8067845925688744,
"num_tokens": 5334573931.0,
"step": 2565
},
{
"epoch": 2.5395923409512045,
"grad_norm": 0.1609017550945282,
"learning_rate": 3.233317346592385e-06,
"loss": 0.6284,
"mean_token_accuracy": 0.8044478759169579,
"num_tokens": 5344981148.0,
"step": 2570
},
{
"epoch": 2.544533662754787,
"grad_norm": 0.13504938781261444,
"learning_rate": 3.165944188418474e-06,
"loss": 0.6328,
"mean_token_accuracy": 0.8033879026770592,
"num_tokens": 5355403276.0,
"step": 2575
},
{
"epoch": 2.5494749845583695,
"grad_norm": 0.1396746188402176,
"learning_rate": 3.099232895418211e-06,
"loss": 0.6252,
"mean_token_accuracy": 0.8057782351970673,
"num_tokens": 5365798505.0,
"step": 2580
},
{
"epoch": 2.5544163063619516,
"grad_norm": 0.1381690502166748,
"learning_rate": 3.033185489839857e-06,
"loss": 0.6213,
"mean_token_accuracy": 0.8066289514303208,
"num_tokens": 5376169697.0,
"step": 2585
},
{
"epoch": 2.559357628165534,
"grad_norm": 0.13640715181827545,
"learning_rate": 2.9678039738069845e-06,
"loss": 0.6259,
"mean_token_accuracy": 0.8056413754820824,
"num_tokens": 5386549150.0,
"step": 2590
},
{
"epoch": 2.5642989499691167,
"grad_norm": 0.13234998285770416,
"learning_rate": 2.903090329257746e-06,
"loss": 0.6303,
"mean_token_accuracy": 0.8042984798550605,
"num_tokens": 5396948060.0,
"step": 2595
},
{
"epoch": 2.569240271772699,
"grad_norm": 0.14382146298885345,
"learning_rate": 2.8390465178848304e-06,
"loss": 0.6302,
"mean_token_accuracy": 0.803765270113945,
"num_tokens": 5407352335.0,
"step": 2600
},
{
"epoch": 2.5741815935762817,
"grad_norm": 0.1451369673013687,
"learning_rate": 2.7756744810759823e-06,
"loss": 0.6359,
"mean_token_accuracy": 0.8030797064304351,
"num_tokens": 5417765250.0,
"step": 2605
},
{
"epoch": 2.5791229153798643,
"grad_norm": 0.13900776207447052,
"learning_rate": 2.7129761398551556e-06,
"loss": 0.629,
"mean_token_accuracy": 0.8044662460684776,
"num_tokens": 5428196382.0,
"step": 2610
},
{
"epoch": 2.584064237183447,
"grad_norm": 0.14031672477722168,
"learning_rate": 2.650953394824274e-06,
"loss": 0.644,
"mean_token_accuracy": 0.8001787707209587,
"num_tokens": 5438603443.0,
"step": 2615
},
{
"epoch": 2.589005558987029,
"grad_norm": 0.14090599119663239,
"learning_rate": 2.5896081261056138e-06,
"loss": 0.6373,
"mean_token_accuracy": 0.8019549712538719,
"num_tokens": 5449000992.0,
"step": 2620
},
{
"epoch": 2.5939468807906114,
"grad_norm": 0.13333368301391602,
"learning_rate": 2.5289421932848336e-06,
"loss": 0.6282,
"mean_token_accuracy": 0.8051638424396514,
"num_tokens": 5459421463.0,
"step": 2625
},
{
"epoch": 2.598888202594194,
"grad_norm": 0.14006465673446655,
"learning_rate": 2.468957435354585e-06,
"loss": 0.6206,
"mean_token_accuracy": 0.8069762364029884,
"num_tokens": 5469817660.0,
"step": 2630
},
{
"epoch": 2.6038295243977765,
"grad_norm": 5.3190999031066895,
"learning_rate": 2.4096556706587726e-06,
"loss": 0.6435,
"mean_token_accuracy": 0.8018386244773865,
"num_tokens": 5480213057.0,
"step": 2635
},
{
"epoch": 2.608770846201359,
"grad_norm": 0.1389482617378235,
"learning_rate": 2.351038696837421e-06,
"loss": 0.6406,
"mean_token_accuracy": 0.8012316897511482,
"num_tokens": 5490632229.0,
"step": 2640
},
{
"epoch": 2.613712168004941,
"grad_norm": 0.1365329474210739,
"learning_rate": 2.2931082907722055e-06,
"loss": 0.6383,
"mean_token_accuracy": 0.8020335495471954,
"num_tokens": 5501003433.0,
"step": 2645
},
{
"epoch": 2.6186534898085236,
"grad_norm": 0.13382576406002045,
"learning_rate": 2.2358662085325723e-06,
"loss": 0.6219,
"mean_token_accuracy": 0.8068993985652924,
"num_tokens": 5511428459.0,
"step": 2650
},
{
"epoch": 2.623594811612106,
"grad_norm": 0.13468101620674133,
"learning_rate": 2.1793141853224978e-06,
"loss": 0.6259,
"mean_token_accuracy": 0.8055090084671974,
"num_tokens": 5521834701.0,
"step": 2655
},
{
"epoch": 2.6285361334156887,
"grad_norm": 0.13747040927410126,
"learning_rate": 2.1234539354279214e-06,
"loss": 0.6199,
"mean_token_accuracy": 0.8072109371423721,
"num_tokens": 5532228387.0,
"step": 2660
},
{
"epoch": 2.6334774552192712,
"grad_norm": 0.13710159063339233,
"learning_rate": 2.068287152164747e-06,
"loss": 0.6351,
"mean_token_accuracy": 0.8026650249958038,
"num_tokens": 5542623902.0,
"step": 2665
},
{
"epoch": 2.6384187770228538,
"grad_norm": 0.13862751424312592,
"learning_rate": 2.0138155078275293e-06,
"loss": 0.6341,
"mean_token_accuracy": 0.8030014872550965,
"num_tokens": 5553026750.0,
"step": 2670
},
{
"epoch": 2.6433600988264363,
"grad_norm": 0.13887560367584229,
"learning_rate": 1.96004065363877e-06,
"loss": 0.6402,
"mean_token_accuracy": 0.8015902519226075,
"num_tokens": 5563408728.0,
"step": 2675
},
{
"epoch": 2.6483014206300184,
"grad_norm": 0.13534307479858398,
"learning_rate": 1.9069642196988757e-06,
"loss": 0.6332,
"mean_token_accuracy": 0.8032783582806587,
"num_tokens": 5573792880.0,
"step": 2680
},
{
"epoch": 2.653242742433601,
"grad_norm": 0.13488322496414185,
"learning_rate": 1.8545878149367285e-06,
"loss": 0.6193,
"mean_token_accuracy": 0.80726078748703,
"num_tokens": 5584202613.0,
"step": 2685
},
{
"epoch": 2.6581840642371835,
"grad_norm": 0.13911563158035278,
"learning_rate": 1.80291302706094e-06,
"loss": 0.634,
"mean_token_accuracy": 0.8030529797077179,
"num_tokens": 5594573558.0,
"step": 2690
},
{
"epoch": 2.663125386040766,
"grad_norm": 0.13639000058174133,
"learning_rate": 1.7519414225116937e-06,
"loss": 0.6274,
"mean_token_accuracy": 0.8051569849252701,
"num_tokens": 5604952424.0,
"step": 2695
},
{
"epoch": 2.6680667078443485,
"grad_norm": 0.14200198650360107,
"learning_rate": 1.7016745464132732e-06,
"loss": 0.6306,
"mean_token_accuracy": 0.8043223142623901,
"num_tokens": 5615322036.0,
"step": 2700
},
{
"epoch": 2.6730080296479306,
"grad_norm": 0.13950730860233307,
"learning_rate": 1.6521139225272292e-06,
"loss": 0.638,
"mean_token_accuracy": 0.8019472226500511,
"num_tokens": 5625725163.0,
"step": 2705
},
{
"epoch": 2.677949351451513,
"grad_norm": 0.13591845333576202,
"learning_rate": 1.603261053206176e-06,
"loss": 0.631,
"mean_token_accuracy": 0.8039070650935173,
"num_tokens": 5636090693.0,
"step": 2710
},
{
"epoch": 2.6828906732550957,
"grad_norm": 0.13798172771930695,
"learning_rate": 1.5551174193482677e-06,
"loss": 0.6335,
"mean_token_accuracy": 0.8032683923840522,
"num_tokens": 5646434352.0,
"step": 2715
},
{
"epoch": 2.687831995058678,
"grad_norm": 0.1333172768354416,
"learning_rate": 1.5076844803522922e-06,
"loss": 0.6275,
"mean_token_accuracy": 0.8049399971961975,
"num_tokens": 5656826072.0,
"step": 2720
},
{
"epoch": 2.6927733168622607,
"grad_norm": 0.13734152913093567,
"learning_rate": 1.4609636740734316e-06,
"loss": 0.6315,
"mean_token_accuracy": 0.803749541938305,
"num_tokens": 5667245386.0,
"step": 2725
},
{
"epoch": 2.6977146386658433,
"grad_norm": 0.13476236164569855,
"learning_rate": 1.414956416779692e-06,
"loss": 0.6424,
"mean_token_accuracy": 0.8015404507517815,
"num_tokens": 5677647700.0,
"step": 2730
},
{
"epoch": 2.702655960469426,
"grad_norm": 0.1344069540500641,
"learning_rate": 1.3696641031089501e-06,
"loss": 0.6309,
"mean_token_accuracy": 0.8040262326598168,
"num_tokens": 5688057469.0,
"step": 2735
},
{
"epoch": 2.707597282273008,
"grad_norm": 0.1328783482313156,
"learning_rate": 1.3250881060266952e-06,
"loss": 0.6335,
"mean_token_accuracy": 0.8033808618783951,
"num_tokens": 5698464888.0,
"step": 2740
},
{
"epoch": 2.7125386040765904,
"grad_norm": 0.14123857021331787,
"learning_rate": 1.2812297767843956e-06,
"loss": 0.6319,
"mean_token_accuracy": 0.8036202192306519,
"num_tokens": 5708857391.0,
"step": 2745
},
{
"epoch": 2.717479925880173,
"grad_norm": 0.1333305686712265,
"learning_rate": 1.2380904448785507e-06,
"loss": 0.6284,
"mean_token_accuracy": 0.8048888012766838,
"num_tokens": 5719273038.0,
"step": 2750
},
{
"epoch": 2.7224212476837555,
"grad_norm": 0.13743609189987183,
"learning_rate": 1.19567141801038e-06,
"loss": 0.6367,
"mean_token_accuracy": 0.802096837759018,
"num_tokens": 5729672047.0,
"step": 2755
},
{
"epoch": 2.727362569487338,
"grad_norm": 0.13438722491264343,
"learning_rate": 1.1539739820461804e-06,
"loss": 0.6378,
"mean_token_accuracy": 0.8020210683345794,
"num_tokens": 5740065329.0,
"step": 2760
},
{
"epoch": 2.73230389129092,
"grad_norm": 0.13518624007701874,
"learning_rate": 1.1129994009783624e-06,
"loss": 0.6281,
"mean_token_accuracy": 0.8048671677708625,
"num_tokens": 5750466926.0,
"step": 2765
},
{
"epoch": 2.7372452130945026,
"grad_norm": 0.13526305556297302,
"learning_rate": 1.0727489168871092e-06,
"loss": 0.6384,
"mean_token_accuracy": 0.801820358633995,
"num_tokens": 5760840649.0,
"step": 2770
},
{
"epoch": 2.742186534898085,
"grad_norm": 0.134367436170578,
"learning_rate": 1.0332237499027508e-06,
"loss": 0.6319,
"mean_token_accuracy": 0.8037395715713501,
"num_tokens": 5771252260.0,
"step": 2775
},
{
"epoch": 2.7471278567016677,
"grad_norm": 0.13103972375392914,
"learning_rate": 9.944250981687664e-07,
"loss": 0.6185,
"mean_token_accuracy": 0.8079636707901955,
"num_tokens": 5781659388.0,
"step": 2780
},
{
"epoch": 2.7520691785052502,
"grad_norm": 0.13419899344444275,
"learning_rate": 9.56354137805457e-07,
"loss": 0.639,
"mean_token_accuracy": 0.801702830195427,
"num_tokens": 5792085860.0,
"step": 2785
},
{
"epoch": 2.7570105003088328,
"grad_norm": 0.1617174595594406,
"learning_rate": 9.190120228743049e-07,
"loss": 0.6373,
"mean_token_accuracy": 0.8023781910538673,
"num_tokens": 5802520289.0,
"step": 2790
},
{
"epoch": 2.7619518221124153,
"grad_norm": 0.13774944841861725,
"learning_rate": 8.823998853429799e-07,
"loss": 0.635,
"mean_token_accuracy": 0.803001980483532,
"num_tokens": 5812910560.0,
"step": 2795
},
{
"epoch": 2.7668931439159974,
"grad_norm": 0.1353764832019806,
"learning_rate": 8.465188350510411e-07,
"loss": 0.6489,
"mean_token_accuracy": 0.803003454208374,
"num_tokens": 5823303672.0,
"step": 2800
},
{
"epoch": 2.77183446571958,
"grad_norm": 0.1379646211862564,
"learning_rate": 8.11369959676278e-07,
"loss": 0.6318,
"mean_token_accuracy": 0.803684464097023,
"num_tokens": 5833728533.0,
"step": 2805
},
{
"epoch": 2.7767757875231625,
"grad_norm": 0.1382586508989334,
"learning_rate": 7.769543247017452e-07,
"loss": 0.6409,
"mean_token_accuracy": 0.8013983756303787,
"num_tokens": 5844099051.0,
"step": 2810
},
{
"epoch": 2.781717109326745,
"grad_norm": 0.1347196102142334,
"learning_rate": 7.432729733834631e-07,
"loss": 0.6365,
"mean_token_accuracy": 0.8025758102536201,
"num_tokens": 5854513394.0,
"step": 2815
},
{
"epoch": 2.7866584311303275,
"grad_norm": 0.13394369184970856,
"learning_rate": 7.103269267188045e-07,
"loss": 0.6409,
"mean_token_accuracy": 0.8010724946856499,
"num_tokens": 5864893930.0,
"step": 2820
},
{
"epoch": 2.7915997529339096,
"grad_norm": 0.13620924949645996,
"learning_rate": 6.781171834155164e-07,
"loss": 0.6321,
"mean_token_accuracy": 0.8042621850967407,
"num_tokens": 5875301007.0,
"step": 2825
},
{
"epoch": 2.796541074737492,
"grad_norm": 0.13298700749874115,
"learning_rate": 6.466447198614806e-07,
"loss": 0.6386,
"mean_token_accuracy": 0.8018185943365097,
"num_tokens": 5885713081.0,
"step": 2830
},
{
"epoch": 2.8014823965410747,
"grad_norm": 0.13537032902240753,
"learning_rate": 6.15910490095084e-07,
"loss": 0.6311,
"mean_token_accuracy": 0.80390265583992,
"num_tokens": 5896120892.0,
"step": 2835
},
{
"epoch": 2.806423718344657,
"grad_norm": 0.13865377008914948,
"learning_rate": 5.85915425776326e-07,
"loss": 0.6385,
"mean_token_accuracy": 0.8017645820975303,
"num_tokens": 5906521523.0,
"step": 2840
},
{
"epoch": 2.8113650401482397,
"grad_norm": 0.133405864238739,
"learning_rate": 5.566604361585626e-07,
"loss": 0.631,
"mean_token_accuracy": 0.8040564343333244,
"num_tokens": 5916885991.0,
"step": 2845
},
{
"epoch": 2.8163063619518223,
"grad_norm": 1.976336121559143,
"learning_rate": 5.281464080609338e-07,
"loss": 0.6337,
"mean_token_accuracy": 0.8037149354815483,
"num_tokens": 5927294150.0,
"step": 2850
},
{
"epoch": 2.821247683755405,
"grad_norm": 0.13748565316200256,
"learning_rate": 5.003742058415112e-07,
"loss": 0.6276,
"mean_token_accuracy": 0.8046556517481804,
"num_tokens": 5937696658.0,
"step": 2855
},
{
"epoch": 2.826189005558987,
"grad_norm": 0.13383837044239044,
"learning_rate": 4.7334467137105933e-07,
"loss": 0.6298,
"mean_token_accuracy": 0.8045536518096924,
"num_tokens": 5948099499.0,
"step": 2860
},
{
"epoch": 2.8311303273625694,
"grad_norm": 0.13354896008968353,
"learning_rate": 4.470586240075486e-07,
"loss": 0.6492,
"mean_token_accuracy": 0.7988908976316452,
"num_tokens": 5958486114.0,
"step": 2865
},
{
"epoch": 2.836071649166152,
"grad_norm": 0.13508053123950958,
"learning_rate": 4.2151686057129156e-07,
"loss": 0.6349,
"mean_token_accuracy": 0.8028511360287667,
"num_tokens": 5968885481.0,
"step": 2870
},
{
"epoch": 2.8410129709697345,
"grad_norm": 0.13615567982196808,
"learning_rate": 3.967201553208122e-07,
"loss": 0.6493,
"mean_token_accuracy": 0.7993659228086472,
"num_tokens": 5979261107.0,
"step": 2875
},
{
"epoch": 2.845954292773317,
"grad_norm": 0.13443627953529358,
"learning_rate": 3.726692599293563e-07,
"loss": 0.6226,
"mean_token_accuracy": 0.8065914869308471,
"num_tokens": 5989695075.0,
"step": 2880
},
{
"epoch": 2.850895614576899,
"grad_norm": 0.13318121433258057,
"learning_rate": 3.4936490346210713e-07,
"loss": 0.6277,
"mean_token_accuracy": 0.8050418853759765,
"num_tokens": 6000080540.0,
"step": 2885
},
{
"epoch": 2.8558369363804816,
"grad_norm": 0.13402459025382996,
"learning_rate": 3.268077923541085e-07,
"loss": 0.6278,
"mean_token_accuracy": 0.8049208298325539,
"num_tokens": 6010495720.0,
"step": 2890
},
{
"epoch": 2.860778258184064,
"grad_norm": 0.13390561938285828,
"learning_rate": 3.049986103888125e-07,
"loss": 0.6264,
"mean_token_accuracy": 0.8052194505929947,
"num_tokens": 6020898745.0,
"step": 2895
},
{
"epoch": 2.8657195799876467,
"grad_norm": 0.1322164535522461,
"learning_rate": 2.8393801867738765e-07,
"loss": 0.6317,
"mean_token_accuracy": 0.8038971364498139,
"num_tokens": 6031294064.0,
"step": 2900
},
{
"epoch": 2.8706609017912292,
"grad_norm": 0.13253232836723328,
"learning_rate": 2.636266556386546e-07,
"loss": 0.6375,
"mean_token_accuracy": 0.8022791400551796,
"num_tokens": 6041694003.0,
"step": 2905
},
{
"epoch": 2.8756022235948118,
"grad_norm": 0.13198673725128174,
"learning_rate": 2.440651369797375e-07,
"loss": 0.6345,
"mean_token_accuracy": 0.80296840518713,
"num_tokens": 6052100553.0,
"step": 2910
},
{
"epoch": 2.8805435453983943,
"grad_norm": 0.13583111763000488,
"learning_rate": 2.252540556774152e-07,
"loss": 0.6376,
"mean_token_accuracy": 0.8022321462631226,
"num_tokens": 6062507192.0,
"step": 2915
},
{
"epoch": 2.8854848672019764,
"grad_norm": 0.13603807985782623,
"learning_rate": 2.0719398196012707e-07,
"loss": 0.6416,
"mean_token_accuracy": 0.8010483682155609,
"num_tokens": 6072897644.0,
"step": 2920
},
{
"epoch": 2.890426189005559,
"grad_norm": 0.13156232237815857,
"learning_rate": 1.8988546329069268e-07,
"loss": 0.6341,
"mean_token_accuracy": 0.8029652521014213,
"num_tokens": 6083301245.0,
"step": 2925
},
{
"epoch": 2.8953675108091415,
"grad_norm": 0.13426810503005981,
"learning_rate": 1.733290243497221e-07,
"loss": 0.6384,
"mean_token_accuracy": 0.8017426192760467,
"num_tokens": 6093714163.0,
"step": 2930
},
{
"epoch": 2.900308832612724,
"grad_norm": 0.13337218761444092,
"learning_rate": 1.57525167019712e-07,
"loss": 0.6338,
"mean_token_accuracy": 0.8033397659659386,
"num_tokens": 6104139692.0,
"step": 2935
},
{
"epoch": 2.9052501544163065,
"grad_norm": 0.13580240309238434,
"learning_rate": 1.4247437036981615e-07,
"loss": 0.6393,
"mean_token_accuracy": 0.8013502344489097,
"num_tokens": 6114557025.0,
"step": 2940
},
{
"epoch": 2.9101914762198886,
"grad_norm": 0.13431790471076965,
"learning_rate": 1.281770906413432e-07,
"loss": 0.6332,
"mean_token_accuracy": 0.8035564810037613,
"num_tokens": 6124910557.0,
"step": 2945
},
{
"epoch": 2.915132798023471,
"grad_norm": 0.13268694281578064,
"learning_rate": 1.1463376123391766e-07,
"loss": 0.6379,
"mean_token_accuracy": 0.802278782427311,
"num_tokens": 6135277257.0,
"step": 2950
},
{
"epoch": 2.9200741198270537,
"grad_norm": 0.13012386858463287,
"learning_rate": 1.0184479269233216e-07,
"loss": 0.6264,
"mean_token_accuracy": 0.8052372932434082,
"num_tokens": 6145692699.0,
"step": 2955
},
{
"epoch": 2.925015441630636,
"grad_norm": 0.13214385509490967,
"learning_rate": 8.981057269412674e-08,
"loss": 0.6298,
"mean_token_accuracy": 0.8040620595216751,
"num_tokens": 6156075039.0,
"step": 2960
},
{
"epoch": 2.9299567634342187,
"grad_norm": 0.3513474464416504,
"learning_rate": 7.853146603780947e-08,
"loss": 0.6322,
"mean_token_accuracy": 0.8042141646146774,
"num_tokens": 6166508847.0,
"step": 2965
},
{
"epoch": 2.9348980852378013,
"grad_norm": 0.1361996829509735,
"learning_rate": 6.800781463182082e-08,
"loss": 0.6387,
"mean_token_accuracy": 0.8021485671401024,
"num_tokens": 6176898601.0,
"step": 2970
},
{
"epoch": 2.939839407041384,
"grad_norm": 0.13293121755123138,
"learning_rate": 5.8239937484155794e-08,
"loss": 0.6285,
"mean_token_accuracy": 0.8048635452985764,
"num_tokens": 6187294229.0,
"step": 2975
},
{
"epoch": 2.944780728844966,
"grad_norm": 0.13646461069583893,
"learning_rate": 4.922813069269394e-08,
"loss": 0.6374,
"mean_token_accuracy": 0.802308914065361,
"num_tokens": 6197703524.0,
"step": 2980
},
{
"epoch": 2.9497220506485484,
"grad_norm": 0.13153176009655,
"learning_rate": 4.097266743623151e-08,
"loss": 0.6302,
"mean_token_accuracy": 0.8041860401630402,
"num_tokens": 6208108876.0,
"step": 2985
},
{
"epoch": 2.954663372452131,
"grad_norm": 0.1305973380804062,
"learning_rate": 3.3473797966199204e-08,
"loss": 0.6168,
"mean_token_accuracy": 0.8084435001015663,
"num_tokens": 6218509795.0,
"step": 2990
},
{
"epoch": 2.9596046942557135,
"grad_norm": 0.12855064868927002,
"learning_rate": 2.6731749599065435e-08,
"loss": 0.6366,
"mean_token_accuracy": 0.8023294299840927,
"num_tokens": 6228937505.0,
"step": 2995
},
{
"epoch": 2.964546016059296,
"grad_norm": 0.13430272042751312,
"learning_rate": 2.0746726709461316e-08,
"loss": 0.6258,
"mean_token_accuracy": 0.8053408190608025,
"num_tokens": 6239277445.0,
"step": 3000
},
{
"epoch": 2.969487337862878,
"grad_norm": 0.1332690417766571,
"learning_rate": 1.5518910723966163e-08,
"loss": 0.6336,
"mean_token_accuracy": 0.8029686883091927,
"num_tokens": 6249696839.0,
"step": 3005
},
{
"epoch": 2.9744286596664606,
"grad_norm": 0.1333792507648468,
"learning_rate": 1.1048460115634096e-08,
"loss": 0.6361,
"mean_token_accuracy": 0.8027278065681458,
"num_tokens": 6260105370.0,
"step": 3010
},
{
"epoch": 2.979369981470043,
"grad_norm": 0.13540172576904297,
"learning_rate": 7.335510399161804e-09,
"loss": 0.637,
"mean_token_accuracy": 0.8022717341780663,
"num_tokens": 6270523530.0,
"step": 3015
},
{
"epoch": 2.9843113032736257,
"grad_norm": 0.131788432598114,
"learning_rate": 4.380174126802916e-09,
"loss": 0.6301,
"mean_token_accuracy": 0.8042985036969185,
"num_tokens": 6280889193.0,
"step": 3020
},
{
"epoch": 2.9892526250772082,
"grad_norm": 0.13343243300914764,
"learning_rate": 2.1825408849401873e-09,
"loss": 0.6393,
"mean_token_accuracy": 0.8015558466315269,
"num_tokens": 6291299316.0,
"step": 3025
},
{
"epoch": 2.9941939468807908,
"grad_norm": 0.13485004007816315,
"learning_rate": 7.4267729138211e-10,
"loss": 0.6356,
"mean_token_accuracy": 0.8027144998311997,
"num_tokens": 6301702825.0,
"step": 3030
},
{
"epoch": 2.9991352686843733,
"grad_norm": 0.1348290592432022,
"learning_rate": 6.062699333675425e-11,
"loss": 0.6232,
"mean_token_accuracy": 0.806241363286972,
"num_tokens": 6312077683.0,
"step": 3035
}
],
"logging_steps": 5,
"max_steps": 3036,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 320,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.6636055544172904e+19,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}