SmolLM-3B-Science-ES / trainer_state.json
toroe's picture
Upload folder using huggingface_hub
dce3241 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 2916,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005145356315924878,
"grad_norm": 3.516500949859619,
"learning_rate": 5.47945205479452e-07,
"loss": 1.0549,
"mean_token_accuracy": 0.7244073122739791,
"num_tokens": 10390477.0,
"step": 5
},
{
"epoch": 0.010290712631849755,
"grad_norm": 2.3721697330474854,
"learning_rate": 1.2328767123287673e-06,
"loss": 1.0331,
"mean_token_accuracy": 0.7277479201555253,
"num_tokens": 20784784.0,
"step": 10
},
{
"epoch": 0.015436068947774634,
"grad_norm": 1.1673978567123413,
"learning_rate": 1.9178082191780823e-06,
"loss": 1.0055,
"mean_token_accuracy": 0.7293060600757599,
"num_tokens": 31194057.0,
"step": 15
},
{
"epoch": 0.02058142526369951,
"grad_norm": 1.1831053495407104,
"learning_rate": 2.6027397260273973e-06,
"loss": 0.9849,
"mean_token_accuracy": 0.7313222289085388,
"num_tokens": 41573889.0,
"step": 20
},
{
"epoch": 0.025726781579624387,
"grad_norm": 0.7422630786895752,
"learning_rate": 3.2876712328767123e-06,
"loss": 0.9496,
"mean_token_accuracy": 0.7391268193721772,
"num_tokens": 51955847.0,
"step": 25
},
{
"epoch": 0.030872137895549268,
"grad_norm": 0.4367648959159851,
"learning_rate": 3.972602739726027e-06,
"loss": 0.9444,
"mean_token_accuracy": 0.7393645942211151,
"num_tokens": 62362424.0,
"step": 30
},
{
"epoch": 0.03601749421147415,
"grad_norm": 0.38364696502685547,
"learning_rate": 4.657534246575343e-06,
"loss": 0.9117,
"mean_token_accuracy": 0.7468442142009735,
"num_tokens": 72760156.0,
"step": 35
},
{
"epoch": 0.04116285052739902,
"grad_norm": 0.32832905650138855,
"learning_rate": 5.342465753424658e-06,
"loss": 0.9062,
"mean_token_accuracy": 0.7470937430858612,
"num_tokens": 83160696.0,
"step": 40
},
{
"epoch": 0.0463082068433239,
"grad_norm": 0.254768967628479,
"learning_rate": 6.027397260273973e-06,
"loss": 0.8938,
"mean_token_accuracy": 0.7495229691267014,
"num_tokens": 93541996.0,
"step": 45
},
{
"epoch": 0.051453563159248775,
"grad_norm": 0.23051689565181732,
"learning_rate": 6.712328767123288e-06,
"loss": 0.9012,
"mean_token_accuracy": 0.7474822252988815,
"num_tokens": 103951096.0,
"step": 50
},
{
"epoch": 0.056598919475173655,
"grad_norm": 0.20961317420005798,
"learning_rate": 7.397260273972603e-06,
"loss": 0.8874,
"mean_token_accuracy": 0.75033338367939,
"num_tokens": 114348089.0,
"step": 55
},
{
"epoch": 0.061744275791098535,
"grad_norm": 0.20559196174144745,
"learning_rate": 8.082191780821919e-06,
"loss": 0.8788,
"mean_token_accuracy": 0.7523881673812867,
"num_tokens": 124723656.0,
"step": 60
},
{
"epoch": 0.06688963210702341,
"grad_norm": 0.2094876766204834,
"learning_rate": 8.767123287671233e-06,
"loss": 0.87,
"mean_token_accuracy": 0.7535106301307678,
"num_tokens": 135146677.0,
"step": 65
},
{
"epoch": 0.0720349884229483,
"grad_norm": 0.19069737195968628,
"learning_rate": 9.452054794520548e-06,
"loss": 0.8749,
"mean_token_accuracy": 0.7522967606782913,
"num_tokens": 145548072.0,
"step": 70
},
{
"epoch": 0.07718034473887317,
"grad_norm": 0.19116425514221191,
"learning_rate": 1.0136986301369864e-05,
"loss": 0.8592,
"mean_token_accuracy": 0.7557980835437774,
"num_tokens": 155949133.0,
"step": 75
},
{
"epoch": 0.08232570105479804,
"grad_norm": 0.21261022984981537,
"learning_rate": 1.082191780821918e-05,
"loss": 0.8596,
"mean_token_accuracy": 0.7559153437614441,
"num_tokens": 166313331.0,
"step": 80
},
{
"epoch": 0.08747105737072293,
"grad_norm": 0.19367730617523193,
"learning_rate": 1.1506849315068493e-05,
"loss": 0.8616,
"mean_token_accuracy": 0.7551171153783798,
"num_tokens": 176729555.0,
"step": 85
},
{
"epoch": 0.0926164136866478,
"grad_norm": 0.18082526326179504,
"learning_rate": 1.219178082191781e-05,
"loss": 0.8489,
"mean_token_accuracy": 0.757958498597145,
"num_tokens": 187130427.0,
"step": 90
},
{
"epoch": 0.09776177000257268,
"grad_norm": 0.20260317623615265,
"learning_rate": 1.2876712328767125e-05,
"loss": 0.8508,
"mean_token_accuracy": 0.7575758665800094,
"num_tokens": 197551850.0,
"step": 95
},
{
"epoch": 0.10290712631849755,
"grad_norm": 0.23283614218235016,
"learning_rate": 1.356164383561644e-05,
"loss": 0.8489,
"mean_token_accuracy": 0.7578668266534805,
"num_tokens": 207961400.0,
"step": 100
},
{
"epoch": 0.10805248263442244,
"grad_norm": 0.21379666030406952,
"learning_rate": 1.4246575342465754e-05,
"loss": 0.8339,
"mean_token_accuracy": 0.7610192090272904,
"num_tokens": 218376661.0,
"step": 105
},
{
"epoch": 0.11319783895034731,
"grad_norm": 0.20584498345851898,
"learning_rate": 1.493150684931507e-05,
"loss": 0.8371,
"mean_token_accuracy": 0.7601476907730103,
"num_tokens": 228779024.0,
"step": 110
},
{
"epoch": 0.11834319526627218,
"grad_norm": 0.22660651803016663,
"learning_rate": 1.5616438356164384e-05,
"loss": 0.8379,
"mean_token_accuracy": 0.7598486542701721,
"num_tokens": 239167993.0,
"step": 115
},
{
"epoch": 0.12348855158219707,
"grad_norm": 0.21756519377231598,
"learning_rate": 1.6301369863013702e-05,
"loss": 0.833,
"mean_token_accuracy": 0.7604194134473801,
"num_tokens": 249533588.0,
"step": 120
},
{
"epoch": 0.12863390789812196,
"grad_norm": 0.2544439733028412,
"learning_rate": 1.6986301369863014e-05,
"loss": 0.8267,
"mean_token_accuracy": 0.7623803347349167,
"num_tokens": 259910590.0,
"step": 125
},
{
"epoch": 0.13377926421404682,
"grad_norm": 0.22863225638866425,
"learning_rate": 1.767123287671233e-05,
"loss": 0.8326,
"mean_token_accuracy": 0.7607358664274215,
"num_tokens": 270321780.0,
"step": 130
},
{
"epoch": 0.1389246205299717,
"grad_norm": 0.2177596092224121,
"learning_rate": 1.8356164383561645e-05,
"loss": 0.8181,
"mean_token_accuracy": 0.7643768191337585,
"num_tokens": 280754879.0,
"step": 135
},
{
"epoch": 0.1440699768458966,
"grad_norm": 0.25882163643836975,
"learning_rate": 1.904109589041096e-05,
"loss": 0.83,
"mean_token_accuracy": 0.7610959112644196,
"num_tokens": 291123129.0,
"step": 140
},
{
"epoch": 0.14921533316182145,
"grad_norm": 0.3098185062408447,
"learning_rate": 1.9726027397260276e-05,
"loss": 0.8421,
"mean_token_accuracy": 0.7577149778604507,
"num_tokens": 301526988.0,
"step": 145
},
{
"epoch": 0.15436068947774634,
"grad_norm": 0.3141907751560211,
"learning_rate": 1.999995658762304e-05,
"loss": 0.8239,
"mean_token_accuracy": 0.7624665558338165,
"num_tokens": 311935967.0,
"step": 150
},
{
"epoch": 0.15950604579367123,
"grad_norm": 0.2806973159313202,
"learning_rate": 1.999969129158383e-05,
"loss": 0.8149,
"mean_token_accuracy": 0.7648573398590088,
"num_tokens": 322344265.0,
"step": 155
},
{
"epoch": 0.16465140210959608,
"grad_norm": 0.25377604365348816,
"learning_rate": 1.999918482601347e-05,
"loss": 0.8115,
"mean_token_accuracy": 0.7656003296375274,
"num_tokens": 332734169.0,
"step": 160
},
{
"epoch": 0.16979675842552097,
"grad_norm": 0.2188454419374466,
"learning_rate": 1.9998437207198492e-05,
"loss": 0.8159,
"mean_token_accuracy": 0.7638908416032791,
"num_tokens": 343149437.0,
"step": 165
},
{
"epoch": 0.17494211474144586,
"grad_norm": 0.284574031829834,
"learning_rate": 1.9997448459180285e-05,
"loss": 0.8125,
"mean_token_accuracy": 0.7646788358688354,
"num_tokens": 353540696.0,
"step": 170
},
{
"epoch": 0.18008747105737072,
"grad_norm": 0.2330782562494278,
"learning_rate": 1.999621861375427e-05,
"loss": 0.8015,
"mean_token_accuracy": 0.7678918361663818,
"num_tokens": 363936147.0,
"step": 175
},
{
"epoch": 0.1852328273732956,
"grad_norm": 0.3005109429359436,
"learning_rate": 1.9994747710468907e-05,
"loss": 0.8221,
"mean_token_accuracy": 0.7623626977205277,
"num_tokens": 374341093.0,
"step": 180
},
{
"epoch": 0.19037818368922046,
"grad_norm": 0.2539622485637665,
"learning_rate": 1.9993035796624416e-05,
"loss": 0.8191,
"mean_token_accuracy": 0.7632038950920105,
"num_tokens": 384736305.0,
"step": 185
},
{
"epoch": 0.19552354000514535,
"grad_norm": 0.3602747321128845,
"learning_rate": 1.9991082927271263e-05,
"loss": 0.8072,
"mean_token_accuracy": 0.765991085767746,
"num_tokens": 395134979.0,
"step": 190
},
{
"epoch": 0.20066889632107024,
"grad_norm": 0.273478627204895,
"learning_rate": 1.9988889165208373e-05,
"loss": 0.8078,
"mean_token_accuracy": 0.7652726262807846,
"num_tokens": 405523607.0,
"step": 195
},
{
"epoch": 0.2058142526369951,
"grad_norm": 0.2960168719291687,
"learning_rate": 1.998645458098112e-05,
"loss": 0.813,
"mean_token_accuracy": 0.7640766650438309,
"num_tokens": 415905850.0,
"step": 200
},
{
"epoch": 0.21095960895291999,
"grad_norm": 0.23220957815647125,
"learning_rate": 1.998377925287908e-05,
"loss": 0.8086,
"mean_token_accuracy": 0.7652794599533081,
"num_tokens": 426274540.0,
"step": 205
},
{
"epoch": 0.21610496526884487,
"grad_norm": 0.28992024064064026,
"learning_rate": 1.9980863266933464e-05,
"loss": 0.8179,
"mean_token_accuracy": 0.7626183509826661,
"num_tokens": 436689729.0,
"step": 210
},
{
"epoch": 0.22125032158476973,
"grad_norm": 0.31990498304367065,
"learning_rate": 1.9977706716914402e-05,
"loss": 0.803,
"mean_token_accuracy": 0.7667577922344208,
"num_tokens": 447114386.0,
"step": 215
},
{
"epoch": 0.22639567790069462,
"grad_norm": 0.3343593180179596,
"learning_rate": 1.997430970432789e-05,
"loss": 0.7933,
"mean_token_accuracy": 0.7693847328424454,
"num_tokens": 457532029.0,
"step": 220
},
{
"epoch": 0.2315410342166195,
"grad_norm": 0.24708902835845947,
"learning_rate": 1.9970672338412554e-05,
"loss": 0.7975,
"mean_token_accuracy": 0.768178117275238,
"num_tokens": 467953269.0,
"step": 225
},
{
"epoch": 0.23668639053254437,
"grad_norm": 0.33174219727516174,
"learning_rate": 1.9966794736136114e-05,
"loss": 0.8089,
"mean_token_accuracy": 0.7650195062160492,
"num_tokens": 478360587.0,
"step": 230
},
{
"epoch": 0.24183174684846925,
"grad_norm": 0.2640225291252136,
"learning_rate": 1.9962677022191648e-05,
"loss": 0.7919,
"mean_token_accuracy": 0.7692882120609283,
"num_tokens": 488764842.0,
"step": 235
},
{
"epoch": 0.24697710316439414,
"grad_norm": 0.2795109450817108,
"learning_rate": 1.9958319328993553e-05,
"loss": 0.8134,
"mean_token_accuracy": 0.7633749455213547,
"num_tokens": 499181820.0,
"step": 240
},
{
"epoch": 0.25212245948031903,
"grad_norm": 0.220581516623497,
"learning_rate": 1.99537217966733e-05,
"loss": 0.8017,
"mean_token_accuracy": 0.7669906347990036,
"num_tokens": 509595885.0,
"step": 245
},
{
"epoch": 0.2572678157962439,
"grad_norm": 0.2826674282550812,
"learning_rate": 1.9948884573074948e-05,
"loss": 0.8109,
"mean_token_accuracy": 0.7639420449733734,
"num_tokens": 519993750.0,
"step": 250
},
{
"epoch": 0.26241317211216875,
"grad_norm": 0.24169522523880005,
"learning_rate": 1.9943807813750356e-05,
"loss": 0.7981,
"mean_token_accuracy": 0.7674963772296906,
"num_tokens": 530411077.0,
"step": 255
},
{
"epoch": 0.26755852842809363,
"grad_norm": 0.2998713552951813,
"learning_rate": 1.9938491681954196e-05,
"loss": 0.7992,
"mean_token_accuracy": 0.7669602394104004,
"num_tokens": 540817105.0,
"step": 260
},
{
"epoch": 0.2727038847440185,
"grad_norm": 0.3029504716396332,
"learning_rate": 1.993293634863871e-05,
"loss": 0.7932,
"mean_token_accuracy": 0.7686622679233551,
"num_tokens": 551223833.0,
"step": 265
},
{
"epoch": 0.2778492410599434,
"grad_norm": 0.263815701007843,
"learning_rate": 1.99271419924482e-05,
"loss": 0.7941,
"mean_token_accuracy": 0.7685212969779969,
"num_tokens": 561643697.0,
"step": 270
},
{
"epoch": 0.2829945973758683,
"grad_norm": 0.3639836311340332,
"learning_rate": 1.992110879971329e-05,
"loss": 0.7973,
"mean_token_accuracy": 0.7674791067838669,
"num_tokens": 572054629.0,
"step": 275
},
{
"epoch": 0.2881399536917932,
"grad_norm": 0.2445714920759201,
"learning_rate": 1.9914836964444934e-05,
"loss": 0.804,
"mean_token_accuracy": 0.7654124438762665,
"num_tokens": 582446471.0,
"step": 280
},
{
"epoch": 0.293285310007718,
"grad_norm": 0.23441371321678162,
"learning_rate": 1.990832668832818e-05,
"loss": 0.7832,
"mean_token_accuracy": 0.7708245635032653,
"num_tokens": 592850162.0,
"step": 285
},
{
"epoch": 0.2984306663236429,
"grad_norm": 0.27346476912498474,
"learning_rate": 1.9901578180715674e-05,
"loss": 0.7892,
"mean_token_accuracy": 0.7693257629871368,
"num_tokens": 603254165.0,
"step": 290
},
{
"epoch": 0.3035760226395678,
"grad_norm": 0.2540152966976166,
"learning_rate": 1.989459165862094e-05,
"loss": 0.7873,
"mean_token_accuracy": 0.7698075413703919,
"num_tokens": 613613424.0,
"step": 295
},
{
"epoch": 0.3087213789554927,
"grad_norm": 0.24472405016422272,
"learning_rate": 1.9887367346711387e-05,
"loss": 0.7903,
"mean_token_accuracy": 0.7687265604734421,
"num_tokens": 624004700.0,
"step": 300
},
{
"epoch": 0.31386673527141756,
"grad_norm": 0.2586674094200134,
"learning_rate": 1.987990547730111e-05,
"loss": 0.7843,
"mean_token_accuracy": 0.7710338205099105,
"num_tokens": 634387240.0,
"step": 305
},
{
"epoch": 0.31901209158734245,
"grad_norm": 0.23755744099617004,
"learning_rate": 1.9872206290343384e-05,
"loss": 0.7865,
"mean_token_accuracy": 0.7694753050804138,
"num_tokens": 644778090.0,
"step": 310
},
{
"epoch": 0.3241574479032673,
"grad_norm": 0.3347889482975006,
"learning_rate": 1.9864270033422975e-05,
"loss": 0.7964,
"mean_token_accuracy": 0.7670842260122299,
"num_tokens": 655177170.0,
"step": 315
},
{
"epoch": 0.32930280421919217,
"grad_norm": 0.2553674578666687,
"learning_rate": 1.985609696174817e-05,
"loss": 0.7909,
"mean_token_accuracy": 0.7687036842107773,
"num_tokens": 665585375.0,
"step": 320
},
{
"epoch": 0.33444816053511706,
"grad_norm": 0.22028060257434845,
"learning_rate": 1.984768733814257e-05,
"loss": 0.7835,
"mean_token_accuracy": 0.7702562361955643,
"num_tokens": 675996436.0,
"step": 325
},
{
"epoch": 0.33959351685104194,
"grad_norm": 0.24218259751796722,
"learning_rate": 1.9839041433036636e-05,
"loss": 0.7882,
"mean_token_accuracy": 0.7690007984638214,
"num_tokens": 686407890.0,
"step": 330
},
{
"epoch": 0.34473887316696683,
"grad_norm": 0.2648875117301941,
"learning_rate": 1.9830159524459e-05,
"loss": 0.7914,
"mean_token_accuracy": 0.7680494576692581,
"num_tokens": 696772298.0,
"step": 335
},
{
"epoch": 0.3498842294828917,
"grad_norm": 0.3156125247478485,
"learning_rate": 1.982104189802751e-05,
"loss": 0.7882,
"mean_token_accuracy": 0.7689610362052918,
"num_tokens": 707159995.0,
"step": 340
},
{
"epoch": 0.35502958579881655,
"grad_norm": 0.21199235320091248,
"learning_rate": 1.9811688846940064e-05,
"loss": 0.7973,
"mean_token_accuracy": 0.7667856603860855,
"num_tokens": 717555566.0,
"step": 345
},
{
"epoch": 0.36017494211474144,
"grad_norm": 0.2736167013645172,
"learning_rate": 1.9802100671965167e-05,
"loss": 0.7927,
"mean_token_accuracy": 0.767885434627533,
"num_tokens": 727966808.0,
"step": 350
},
{
"epoch": 0.3653202984306663,
"grad_norm": 0.22648128867149353,
"learning_rate": 1.9792277681432257e-05,
"loss": 0.7907,
"mean_token_accuracy": 0.7685991823673248,
"num_tokens": 738371625.0,
"step": 355
},
{
"epoch": 0.3704656547465912,
"grad_norm": 0.26248160004615784,
"learning_rate": 1.9782220191221818e-05,
"loss": 0.7972,
"mean_token_accuracy": 0.7668163865804672,
"num_tokens": 748771291.0,
"step": 360
},
{
"epoch": 0.3756110110625161,
"grad_norm": 0.23813173174858093,
"learning_rate": 1.9771928524755182e-05,
"loss": 0.7871,
"mean_token_accuracy": 0.7691414833068848,
"num_tokens": 759119097.0,
"step": 365
},
{
"epoch": 0.38075636737844093,
"grad_norm": 0.32292458415031433,
"learning_rate": 1.976140301298416e-05,
"loss": 0.7858,
"mean_token_accuracy": 0.7693786770105362,
"num_tokens": 769547626.0,
"step": 370
},
{
"epoch": 0.3859017236943658,
"grad_norm": 0.23817549645900726,
"learning_rate": 1.9750643994380377e-05,
"loss": 0.7841,
"mean_token_accuracy": 0.7698348790407181,
"num_tokens": 779970748.0,
"step": 375
},
{
"epoch": 0.3910470800102907,
"grad_norm": 0.2122715711593628,
"learning_rate": 1.9739651814924404e-05,
"loss": 0.79,
"mean_token_accuracy": 0.76830395758152,
"num_tokens": 790354749.0,
"step": 380
},
{
"epoch": 0.3961924363262156,
"grad_norm": 0.25851792097091675,
"learning_rate": 1.972842682809463e-05,
"loss": 0.7844,
"mean_token_accuracy": 0.7702566295862198,
"num_tokens": 800785265.0,
"step": 385
},
{
"epoch": 0.4013377926421405,
"grad_norm": 0.24672254920005798,
"learning_rate": 1.9716969394855884e-05,
"loss": 0.7768,
"mean_token_accuracy": 0.7723254442214966,
"num_tokens": 811181165.0,
"step": 390
},
{
"epoch": 0.40648314895806537,
"grad_norm": 0.23226742446422577,
"learning_rate": 1.9705279883647842e-05,
"loss": 0.7809,
"mean_token_accuracy": 0.7711203783750534,
"num_tokens": 821562722.0,
"step": 395
},
{
"epoch": 0.4116285052739902,
"grad_norm": 0.22964943945407867,
"learning_rate": 1.9693358670373162e-05,
"loss": 0.7772,
"mean_token_accuracy": 0.7718722522258759,
"num_tokens": 831975336.0,
"step": 400
},
{
"epoch": 0.4167738615899151,
"grad_norm": 0.2204572707414627,
"learning_rate": 1.9681206138385418e-05,
"loss": 0.7829,
"mean_token_accuracy": 0.7703472405672074,
"num_tokens": 842370514.0,
"step": 405
},
{
"epoch": 0.42191921790583997,
"grad_norm": 0.21690765023231506,
"learning_rate": 1.966882267847675e-05,
"loss": 0.7778,
"mean_token_accuracy": 0.7715538173913956,
"num_tokens": 852764466.0,
"step": 410
},
{
"epoch": 0.42706457422176486,
"grad_norm": 0.2913786768913269,
"learning_rate": 1.9656208688865318e-05,
"loss": 0.7806,
"mean_token_accuracy": 0.7708995878696442,
"num_tokens": 863181464.0,
"step": 415
},
{
"epoch": 0.43220993053768975,
"grad_norm": 0.2708909809589386,
"learning_rate": 1.9643364575182474e-05,
"loss": 0.7853,
"mean_token_accuracy": 0.7692730128765106,
"num_tokens": 873548841.0,
"step": 420
},
{
"epoch": 0.43735528685361463,
"grad_norm": 0.20992125570774078,
"learning_rate": 1.9630290750459733e-05,
"loss": 0.7835,
"mean_token_accuracy": 0.7699774980545044,
"num_tokens": 883941994.0,
"step": 425
},
{
"epoch": 0.44250064316953946,
"grad_norm": 0.21337294578552246,
"learning_rate": 1.9616987635115502e-05,
"loss": 0.7725,
"mean_token_accuracy": 0.7728747427463531,
"num_tokens": 894335136.0,
"step": 430
},
{
"epoch": 0.44764599948546435,
"grad_norm": 0.30405837297439575,
"learning_rate": 1.9603455656941518e-05,
"loss": 0.7813,
"mean_token_accuracy": 0.7705006301403046,
"num_tokens": 904737119.0,
"step": 435
},
{
"epoch": 0.45279135580138924,
"grad_norm": 0.22999157011508942,
"learning_rate": 1.9589695251089154e-05,
"loss": 0.7804,
"mean_token_accuracy": 0.7707367807626724,
"num_tokens": 915122419.0,
"step": 440
},
{
"epoch": 0.4579367121173141,
"grad_norm": 0.37796908617019653,
"learning_rate": 1.9575706860055363e-05,
"loss": 0.7859,
"mean_token_accuracy": 0.7693438589572906,
"num_tokens": 925494224.0,
"step": 445
},
{
"epoch": 0.463082068433239,
"grad_norm": 0.4048900306224823,
"learning_rate": 1.9561490933668492e-05,
"loss": 0.79,
"mean_token_accuracy": 0.7678173094987869,
"num_tokens": 935892275.0,
"step": 450
},
{
"epoch": 0.4682274247491639,
"grad_norm": 0.2899525463581085,
"learning_rate": 1.95470479290738e-05,
"loss": 0.773,
"mean_token_accuracy": 0.7723690897226334,
"num_tokens": 946289615.0,
"step": 455
},
{
"epoch": 0.47337278106508873,
"grad_norm": 0.22180290520191193,
"learning_rate": 1.9532378310718745e-05,
"loss": 0.7721,
"mean_token_accuracy": 0.7729926645755768,
"num_tokens": 956672790.0,
"step": 460
},
{
"epoch": 0.4785181373810136,
"grad_norm": 0.32568320631980896,
"learning_rate": 1.951748255033809e-05,
"loss": 0.7845,
"mean_token_accuracy": 0.7696036785840988,
"num_tokens": 967100741.0,
"step": 465
},
{
"epoch": 0.4836634936969385,
"grad_norm": 0.21786752343177795,
"learning_rate": 1.9502361126938683e-05,
"loss": 0.7769,
"mean_token_accuracy": 0.7711740046739578,
"num_tokens": 977497458.0,
"step": 470
},
{
"epoch": 0.4888088500128634,
"grad_norm": 0.274349570274353,
"learning_rate": 1.9487014526784088e-05,
"loss": 0.7717,
"mean_token_accuracy": 0.772866228222847,
"num_tokens": 987914143.0,
"step": 475
},
{
"epoch": 0.4939542063287883,
"grad_norm": 0.27368083596229553,
"learning_rate": 1.9471443243378934e-05,
"loss": 0.7812,
"mean_token_accuracy": 0.7704960852861404,
"num_tokens": 998321473.0,
"step": 480
},
{
"epoch": 0.49909956264471317,
"grad_norm": 0.2256074994802475,
"learning_rate": 1.9455647777453045e-05,
"loss": 0.7819,
"mean_token_accuracy": 0.7698213875293731,
"num_tokens": 1008748099.0,
"step": 485
},
{
"epoch": 0.5042449189606381,
"grad_norm": 0.2857572138309479,
"learning_rate": 1.9439628636945337e-05,
"loss": 0.7816,
"mean_token_accuracy": 0.7701322674751282,
"num_tokens": 1019143713.0,
"step": 490
},
{
"epoch": 0.5093902752765629,
"grad_norm": 1.1966010332107544,
"learning_rate": 1.9423386336987507e-05,
"loss": 0.7727,
"mean_token_accuracy": 0.7724141061306,
"num_tokens": 1029507845.0,
"step": 495
},
{
"epoch": 0.5145356315924878,
"grad_norm": 0.40811723470687866,
"learning_rate": 1.9406921399887432e-05,
"loss": 0.8462,
"mean_token_accuracy": 0.7615606904029846,
"num_tokens": 1039924539.0,
"step": 500
},
{
"epoch": 0.5196809879084127,
"grad_norm": 0.43425825238227844,
"learning_rate": 1.9390234355112386e-05,
"loss": 0.776,
"mean_token_accuracy": 0.7718711495399475,
"num_tokens": 1050307833.0,
"step": 505
},
{
"epoch": 0.5248263442243375,
"grad_norm": 0.2550142705440521,
"learning_rate": 1.9373325739272035e-05,
"loss": 0.7718,
"mean_token_accuracy": 0.772559967637062,
"num_tokens": 1060725805.0,
"step": 510
},
{
"epoch": 0.5299717005402624,
"grad_norm": 0.2491987943649292,
"learning_rate": 1.9356196096101145e-05,
"loss": 0.7818,
"mean_token_accuracy": 0.7696256130933762,
"num_tokens": 1071091088.0,
"step": 515
},
{
"epoch": 0.5351170568561873,
"grad_norm": 0.2185741513967514,
"learning_rate": 1.9338845976442128e-05,
"loss": 0.7696,
"mean_token_accuracy": 0.7729953050613403,
"num_tokens": 1081507707.0,
"step": 520
},
{
"epoch": 0.5402624131721122,
"grad_norm": 0.22735682129859924,
"learning_rate": 1.9321275938227315e-05,
"loss": 0.7846,
"mean_token_accuracy": 0.7691415637731552,
"num_tokens": 1091915702.0,
"step": 525
},
{
"epoch": 0.545407769488037,
"grad_norm": 0.2260189801454544,
"learning_rate": 1.930348654646101e-05,
"loss": 0.7783,
"mean_token_accuracy": 0.7702914178371429,
"num_tokens": 1102329794.0,
"step": 530
},
{
"epoch": 0.5505531258039619,
"grad_norm": 0.24121206998825073,
"learning_rate": 1.928547837320133e-05,
"loss": 0.7793,
"mean_token_accuracy": 0.7705663651227951,
"num_tokens": 1112744574.0,
"step": 535
},
{
"epoch": 0.5556984821198868,
"grad_norm": 0.21719065308570862,
"learning_rate": 1.92672519975418e-05,
"loss": 0.7709,
"mean_token_accuracy": 0.7725832790136338,
"num_tokens": 1123133981.0,
"step": 540
},
{
"epoch": 0.5608438384358116,
"grad_norm": 0.2318485677242279,
"learning_rate": 1.9248808005592748e-05,
"loss": 0.7593,
"mean_token_accuracy": 0.7762016743421555,
"num_tokens": 1133547201.0,
"step": 545
},
{
"epoch": 0.5659891947517366,
"grad_norm": 0.22322027385234833,
"learning_rate": 1.923014699046244e-05,
"loss": 0.7781,
"mean_token_accuracy": 0.7703386157751083,
"num_tokens": 1143928590.0,
"step": 550
},
{
"epoch": 0.5711345510676614,
"grad_norm": 0.25323280692100525,
"learning_rate": 1.9211269552238006e-05,
"loss": 0.7674,
"mean_token_accuracy": 0.7730859339237213,
"num_tokens": 1154289231.0,
"step": 555
},
{
"epoch": 0.5762799073835864,
"grad_norm": 0.19904294610023499,
"learning_rate": 1.919217629796616e-05,
"loss": 0.7562,
"mean_token_accuracy": 0.7763365268707275,
"num_tokens": 1164696083.0,
"step": 560
},
{
"epoch": 0.5814252636995112,
"grad_norm": 0.20939859747886658,
"learning_rate": 1.917286784163366e-05,
"loss": 0.7619,
"mean_token_accuracy": 0.7748125195503235,
"num_tokens": 1175092473.0,
"step": 565
},
{
"epoch": 0.586570620015436,
"grad_norm": 0.19839182496070862,
"learning_rate": 1.9153344804147583e-05,
"loss": 0.758,
"mean_token_accuracy": 0.7757725417613983,
"num_tokens": 1185494160.0,
"step": 570
},
{
"epoch": 0.591715976331361,
"grad_norm": 0.2394329458475113,
"learning_rate": 1.913360781331535e-05,
"loss": 0.7761,
"mean_token_accuracy": 0.770972666144371,
"num_tokens": 1195885099.0,
"step": 575
},
{
"epoch": 0.5968613326472858,
"grad_norm": 0.22963404655456543,
"learning_rate": 1.9113657503824513e-05,
"loss": 0.7632,
"mean_token_accuracy": 0.7746537119150162,
"num_tokens": 1206298358.0,
"step": 580
},
{
"epoch": 0.6020066889632107,
"grad_norm": 0.2229454070329666,
"learning_rate": 1.9093494517222397e-05,
"loss": 0.7843,
"mean_token_accuracy": 0.7687952756881714,
"num_tokens": 1216694302.0,
"step": 585
},
{
"epoch": 0.6071520452791356,
"grad_norm": 0.24715222418308258,
"learning_rate": 1.907311950189542e-05,
"loss": 0.7782,
"mean_token_accuracy": 0.770742443203926,
"num_tokens": 1227102730.0,
"step": 590
},
{
"epoch": 0.6122974015950604,
"grad_norm": 0.26584967970848083,
"learning_rate": 1.9052533113048274e-05,
"loss": 0.7656,
"mean_token_accuracy": 0.7737416863441468,
"num_tokens": 1237504190.0,
"step": 595
},
{
"epoch": 0.6174427579109854,
"grad_norm": 0.23727792501449585,
"learning_rate": 1.903173601268284e-05,
"loss": 0.767,
"mean_token_accuracy": 0.7734443098306656,
"num_tokens": 1247914802.0,
"step": 600
},
{
"epoch": 0.6225881142269102,
"grad_norm": 0.22145575284957886,
"learning_rate": 1.90107288695769e-05,
"loss": 0.7717,
"mean_token_accuracy": 0.7720071583986282,
"num_tokens": 1258346476.0,
"step": 605
},
{
"epoch": 0.6277334705428351,
"grad_norm": 0.2083977609872818,
"learning_rate": 1.8989512359262643e-05,
"loss": 0.7679,
"mean_token_accuracy": 0.7729957222938537,
"num_tokens": 1268749806.0,
"step": 610
},
{
"epoch": 0.63287882685876,
"grad_norm": 0.21598635613918304,
"learning_rate": 1.8968087164004935e-05,
"loss": 0.7662,
"mean_token_accuracy": 0.7731610238552094,
"num_tokens": 1279117698.0,
"step": 615
},
{
"epoch": 0.6380241831746849,
"grad_norm": 0.19322967529296875,
"learning_rate": 1.894645397277937e-05,
"loss": 0.7502,
"mean_token_accuracy": 0.7780898064374924,
"num_tokens": 1289533458.0,
"step": 620
},
{
"epoch": 0.6431695394906097,
"grad_norm": 0.19693732261657715,
"learning_rate": 1.8924613481250128e-05,
"loss": 0.7727,
"mean_token_accuracy": 0.7718368798494339,
"num_tokens": 1299925979.0,
"step": 625
},
{
"epoch": 0.6483148958065346,
"grad_norm": 0.21000495553016663,
"learning_rate": 1.8902566391747596e-05,
"loss": 0.7734,
"mean_token_accuracy": 0.7717230170965195,
"num_tokens": 1310311084.0,
"step": 630
},
{
"epoch": 0.6534602521224595,
"grad_norm": 0.2045949399471283,
"learning_rate": 1.8880313413245794e-05,
"loss": 0.7717,
"mean_token_accuracy": 0.7715155422687531,
"num_tokens": 1320686452.0,
"step": 635
},
{
"epoch": 0.6586056084383843,
"grad_norm": 0.22864918410778046,
"learning_rate": 1.885785526133956e-05,
"loss": 0.7579,
"mean_token_accuracy": 0.7755870014429093,
"num_tokens": 1331096443.0,
"step": 640
},
{
"epoch": 0.6637509647543093,
"grad_norm": 0.2280014157295227,
"learning_rate": 1.8835192658221545e-05,
"loss": 0.7643,
"mean_token_accuracy": 0.7744101166725159,
"num_tokens": 1341514376.0,
"step": 645
},
{
"epoch": 0.6688963210702341,
"grad_norm": 0.224672332406044,
"learning_rate": 1.8812326332658997e-05,
"loss": 0.7674,
"mean_token_accuracy": 0.773426678776741,
"num_tokens": 1351882404.0,
"step": 650
},
{
"epoch": 0.6740416773861589,
"grad_norm": 0.2680542767047882,
"learning_rate": 1.878925701997032e-05,
"loss": 0.7573,
"mean_token_accuracy": 0.7760124772787094,
"num_tokens": 1362304069.0,
"step": 655
},
{
"epoch": 0.6791870337020839,
"grad_norm": 0.29469063878059387,
"learning_rate": 1.8765985462001424e-05,
"loss": 0.7636,
"mean_token_accuracy": 0.7740887552499771,
"num_tokens": 1372688301.0,
"step": 660
},
{
"epoch": 0.6843323900180087,
"grad_norm": 0.21097056567668915,
"learning_rate": 1.8742512407101875e-05,
"loss": 0.7578,
"mean_token_accuracy": 0.7753616005182267,
"num_tokens": 1383093337.0,
"step": 665
},
{
"epoch": 0.6894777463339337,
"grad_norm": 0.21591758728027344,
"learning_rate": 1.8718838610100832e-05,
"loss": 0.7566,
"mean_token_accuracy": 0.7760109454393387,
"num_tokens": 1393440159.0,
"step": 670
},
{
"epoch": 0.6946231026498585,
"grad_norm": 0.2419527769088745,
"learning_rate": 1.8694964832282764e-05,
"loss": 0.77,
"mean_token_accuracy": 0.7725261867046356,
"num_tokens": 1403836556.0,
"step": 675
},
{
"epoch": 0.6997684589657834,
"grad_norm": 0.27528461813926697,
"learning_rate": 1.8670891841362976e-05,
"loss": 0.7543,
"mean_token_accuracy": 0.7764055013656617,
"num_tokens": 1414213366.0,
"step": 680
},
{
"epoch": 0.7049138152817083,
"grad_norm": 0.2578127980232239,
"learning_rate": 1.8646620411462924e-05,
"loss": 0.7802,
"mean_token_accuracy": 0.7694523215293885,
"num_tokens": 1424633658.0,
"step": 685
},
{
"epoch": 0.7100591715976331,
"grad_norm": 0.23547138273715973,
"learning_rate": 1.8622151323085317e-05,
"loss": 0.7573,
"mean_token_accuracy": 0.7754601895809173,
"num_tokens": 1435040726.0,
"step": 690
},
{
"epoch": 0.715204527913558,
"grad_norm": 0.28163185715675354,
"learning_rate": 1.8597485363089026e-05,
"loss": 0.7574,
"mean_token_accuracy": 0.7757174968719482,
"num_tokens": 1445446395.0,
"step": 695
},
{
"epoch": 0.7203498842294829,
"grad_norm": 0.2508062422275543,
"learning_rate": 1.8572623324663756e-05,
"loss": 0.767,
"mean_token_accuracy": 0.7730412214994431,
"num_tokens": 1455855217.0,
"step": 700
},
{
"epoch": 0.7254952405454078,
"grad_norm": 0.20320753753185272,
"learning_rate": 1.8547566007304577e-05,
"loss": 0.7687,
"mean_token_accuracy": 0.7726406931877137,
"num_tokens": 1466245798.0,
"step": 705
},
{
"epoch": 0.7306405968613326,
"grad_norm": 0.2486068159341812,
"learning_rate": 1.8522314216786186e-05,
"loss": 0.7559,
"mean_token_accuracy": 0.77595334649086,
"num_tokens": 1476629034.0,
"step": 710
},
{
"epoch": 0.7357859531772575,
"grad_norm": 0.20530639588832855,
"learning_rate": 1.8496868765136996e-05,
"loss": 0.758,
"mean_token_accuracy": 0.7758067876100541,
"num_tokens": 1487047420.0,
"step": 715
},
{
"epoch": 0.7409313094931824,
"grad_norm": 0.20521405339241028,
"learning_rate": 1.8471230470613046e-05,
"loss": 0.7661,
"mean_token_accuracy": 0.7729949295520783,
"num_tokens": 1497477237.0,
"step": 720
},
{
"epoch": 0.7460766658091073,
"grad_norm": 0.2346915453672409,
"learning_rate": 1.844540015767167e-05,
"loss": 0.7633,
"mean_token_accuracy": 0.7740152984857559,
"num_tokens": 1507862470.0,
"step": 725
},
{
"epoch": 0.7512220221250322,
"grad_norm": 0.25663813948631287,
"learning_rate": 1.8419378656944983e-05,
"loss": 0.7633,
"mean_token_accuracy": 0.7739923149347305,
"num_tokens": 1518284565.0,
"step": 730
},
{
"epoch": 0.756367378440957,
"grad_norm": 0.19483357667922974,
"learning_rate": 1.8393166805213178e-05,
"loss": 0.7564,
"mean_token_accuracy": 0.7762584149837494,
"num_tokens": 1528673299.0,
"step": 735
},
{
"epoch": 0.7615127347568819,
"grad_norm": 0.298998087644577,
"learning_rate": 1.8366765445377614e-05,
"loss": 0.7634,
"mean_token_accuracy": 0.7737476319074631,
"num_tokens": 1539078416.0,
"step": 740
},
{
"epoch": 0.7666580910728068,
"grad_norm": 0.22814105451107025,
"learning_rate": 1.834017542643372e-05,
"loss": 0.769,
"mean_token_accuracy": 0.7724894404411315,
"num_tokens": 1549479202.0,
"step": 745
},
{
"epoch": 0.7718034473887316,
"grad_norm": 0.20318229496479034,
"learning_rate": 1.8313397603443665e-05,
"loss": 0.7508,
"mean_token_accuracy": 0.7772645950317383,
"num_tokens": 1559862298.0,
"step": 750
},
{
"epoch": 0.7769488037046566,
"grad_norm": 0.28188684582710266,
"learning_rate": 1.828643283750891e-05,
"loss": 0.7518,
"mean_token_accuracy": 0.776808711886406,
"num_tokens": 1570288616.0,
"step": 755
},
{
"epoch": 0.7820941600205814,
"grad_norm": 0.24467986822128296,
"learning_rate": 1.8259281995742467e-05,
"loss": 0.7663,
"mean_token_accuracy": 0.7730655431747436,
"num_tokens": 1580705251.0,
"step": 760
},
{
"epoch": 0.7872395163365064,
"grad_norm": 0.24917523562908173,
"learning_rate": 1.8231945951241043e-05,
"loss": 0.7625,
"mean_token_accuracy": 0.773466631770134,
"num_tokens": 1591106421.0,
"step": 765
},
{
"epoch": 0.7923848726524312,
"grad_norm": 0.18945878744125366,
"learning_rate": 1.8204425583056962e-05,
"loss": 0.7507,
"mean_token_accuracy": 0.7773733377456665,
"num_tokens": 1601520172.0,
"step": 770
},
{
"epoch": 0.797530228968356,
"grad_norm": 0.2917018234729767,
"learning_rate": 1.817672177616989e-05,
"loss": 0.7604,
"mean_token_accuracy": 0.7747350037097931,
"num_tokens": 1611899934.0,
"step": 775
},
{
"epoch": 0.802675585284281,
"grad_norm": 0.24668017029762268,
"learning_rate": 1.8148835421458374e-05,
"loss": 0.7601,
"mean_token_accuracy": 0.7746721476316452,
"num_tokens": 1622304284.0,
"step": 780
},
{
"epoch": 0.8078209416002058,
"grad_norm": 0.3010805547237396,
"learning_rate": 1.8120767415671208e-05,
"loss": 0.7715,
"mean_token_accuracy": 0.7717162489891052,
"num_tokens": 1632665638.0,
"step": 785
},
{
"epoch": 0.8129662979161307,
"grad_norm": 0.2387019842863083,
"learning_rate": 1.809251866139858e-05,
"loss": 0.7631,
"mean_token_accuracy": 0.773768350481987,
"num_tokens": 1643065667.0,
"step": 790
},
{
"epoch": 0.8181116542320556,
"grad_norm": 0.207286074757576,
"learning_rate": 1.8064090067043066e-05,
"loss": 0.7596,
"mean_token_accuracy": 0.7746483981609344,
"num_tokens": 1653489023.0,
"step": 795
},
{
"epoch": 0.8232570105479804,
"grad_norm": 0.20019465684890747,
"learning_rate": 1.8035482546790387e-05,
"loss": 0.7619,
"mean_token_accuracy": 0.7740916252136231,
"num_tokens": 1663916269.0,
"step": 800
},
{
"epoch": 0.8284023668639053,
"grad_norm": 0.24347372353076935,
"learning_rate": 1.8006697020580048e-05,
"loss": 0.7614,
"mean_token_accuracy": 0.7742874711751938,
"num_tokens": 1674339518.0,
"step": 805
},
{
"epoch": 0.8335477231798302,
"grad_norm": 0.2331140786409378,
"learning_rate": 1.7977734414075728e-05,
"loss": 0.7542,
"mean_token_accuracy": 0.7760845631361007,
"num_tokens": 1684753767.0,
"step": 810
},
{
"epoch": 0.8386930794957551,
"grad_norm": 0.26996636390686035,
"learning_rate": 1.7948595658635533e-05,
"loss": 0.7644,
"mean_token_accuracy": 0.7731517612934112,
"num_tokens": 1695167774.0,
"step": 815
},
{
"epoch": 0.8438384358116799,
"grad_norm": 0.23208890855312347,
"learning_rate": 1.791928169128202e-05,
"loss": 0.754,
"mean_token_accuracy": 0.7757177084684372,
"num_tokens": 1705535908.0,
"step": 820
},
{
"epoch": 0.8489837921276049,
"grad_norm": 0.22476951777935028,
"learning_rate": 1.7889793454672104e-05,
"loss": 0.757,
"mean_token_accuracy": 0.7750364452600479,
"num_tokens": 1715945968.0,
"step": 825
},
{
"epoch": 0.8541291484435297,
"grad_norm": 0.2300329953432083,
"learning_rate": 1.7860131897066702e-05,
"loss": 0.7687,
"mean_token_accuracy": 0.7721786022186279,
"num_tokens": 1726366494.0,
"step": 830
},
{
"epoch": 0.8592745047594545,
"grad_norm": 0.21463564038276672,
"learning_rate": 1.7830297972300266e-05,
"loss": 0.7632,
"mean_token_accuracy": 0.773630577325821,
"num_tokens": 1736754760.0,
"step": 835
},
{
"epoch": 0.8644198610753795,
"grad_norm": 0.20610825717449188,
"learning_rate": 1.780029263975011e-05,
"loss": 0.7428,
"mean_token_accuracy": 0.7793777525424957,
"num_tokens": 1747169397.0,
"step": 840
},
{
"epoch": 0.8695652173913043,
"grad_norm": 0.17724700272083282,
"learning_rate": 1.7770116864305543e-05,
"loss": 0.7542,
"mean_token_accuracy": 0.7759394317865371,
"num_tokens": 1757562579.0,
"step": 845
},
{
"epoch": 0.8747105737072293,
"grad_norm": 0.20515885949134827,
"learning_rate": 1.773977161633686e-05,
"loss": 0.7585,
"mean_token_accuracy": 0.774996566772461,
"num_tokens": 1767951452.0,
"step": 850
},
{
"epoch": 0.8798559300231541,
"grad_norm": 0.21575596928596497,
"learning_rate": 1.770925787166412e-05,
"loss": 0.7352,
"mean_token_accuracy": 0.7808972954750061,
"num_tokens": 1778357806.0,
"step": 855
},
{
"epoch": 0.8850012863390789,
"grad_norm": 0.22368930280208588,
"learning_rate": 1.767857661152578e-05,
"loss": 0.7557,
"mean_token_accuracy": 0.775502935051918,
"num_tokens": 1788767633.0,
"step": 860
},
{
"epoch": 0.8901466426550039,
"grad_norm": 0.2021559774875641,
"learning_rate": 1.7647728822547126e-05,
"loss": 0.7609,
"mean_token_accuracy": 0.7740152478218079,
"num_tokens": 1799164623.0,
"step": 865
},
{
"epoch": 0.8952919989709287,
"grad_norm": 0.19145594537258148,
"learning_rate": 1.7616715496708575e-05,
"loss": 0.7562,
"mean_token_accuracy": 0.7754783451557159,
"num_tokens": 1809537031.0,
"step": 870
},
{
"epoch": 0.9004373552868536,
"grad_norm": 0.19838295876979828,
"learning_rate": 1.7585537631313738e-05,
"loss": 0.7554,
"mean_token_accuracy": 0.7755459159612655,
"num_tokens": 1819917273.0,
"step": 875
},
{
"epoch": 0.9055827116027785,
"grad_norm": 0.18983587622642517,
"learning_rate": 1.7554196228957374e-05,
"loss": 0.7629,
"mean_token_accuracy": 0.7733017027378082,
"num_tokens": 1830301370.0,
"step": 880
},
{
"epoch": 0.9107280679187034,
"grad_norm": 0.2382967472076416,
"learning_rate": 1.7522692297493145e-05,
"loss": 0.7545,
"mean_token_accuracy": 0.775650081038475,
"num_tokens": 1840697377.0,
"step": 885
},
{
"epoch": 0.9158734242346283,
"grad_norm": 0.2184479981660843,
"learning_rate": 1.7491026850001195e-05,
"loss": 0.761,
"mean_token_accuracy": 0.7739459365606308,
"num_tokens": 1851095319.0,
"step": 890
},
{
"epoch": 0.9210187805505531,
"grad_norm": 0.20179307460784912,
"learning_rate": 1.745920090475559e-05,
"loss": 0.7654,
"mean_token_accuracy": 0.7729564756155014,
"num_tokens": 1861483986.0,
"step": 895
},
{
"epoch": 0.926164136866478,
"grad_norm": 0.2453368604183197,
"learning_rate": 1.7427215485191567e-05,
"loss": 0.7555,
"mean_token_accuracy": 0.7756044954061508,
"num_tokens": 1871857137.0,
"step": 900
},
{
"epoch": 0.9313094931824029,
"grad_norm": 0.21209710836410522,
"learning_rate": 1.739507161987261e-05,
"loss": 0.7615,
"mean_token_accuracy": 0.7739556908607483,
"num_tokens": 1882273427.0,
"step": 905
},
{
"epoch": 0.9364548494983278,
"grad_norm": 0.18556725978851318,
"learning_rate": 1.736277034245739e-05,
"loss": 0.7557,
"mean_token_accuracy": 0.7750442743301391,
"num_tokens": 1892660604.0,
"step": 910
},
{
"epoch": 0.9416002058142526,
"grad_norm": 0.2182847410440445,
"learning_rate": 1.7330312691666517e-05,
"loss": 0.7592,
"mean_token_accuracy": 0.7745744317770005,
"num_tokens": 1903083350.0,
"step": 915
},
{
"epoch": 0.9467455621301775,
"grad_norm": 0.20762157440185547,
"learning_rate": 1.7297699711249144e-05,
"loss": 0.7481,
"mean_token_accuracy": 0.7772045373916626,
"num_tokens": 1913484635.0,
"step": 920
},
{
"epoch": 0.9518909184461024,
"grad_norm": 0.22431345283985138,
"learning_rate": 1.7264932449949403e-05,
"loss": 0.752,
"mean_token_accuracy": 0.7767430722713471,
"num_tokens": 1923902157.0,
"step": 925
},
{
"epoch": 0.9570362747620272,
"grad_norm": 0.21677188575267792,
"learning_rate": 1.7232011961472666e-05,
"loss": 0.751,
"mean_token_accuracy": 0.776458004117012,
"num_tokens": 1934287093.0,
"step": 930
},
{
"epoch": 0.9621816310779522,
"grad_norm": 0.24291808903217316,
"learning_rate": 1.7198939304451677e-05,
"loss": 0.7621,
"mean_token_accuracy": 0.7737038463354111,
"num_tokens": 1944670753.0,
"step": 935
},
{
"epoch": 0.967326987393877,
"grad_norm": 0.18869860470294952,
"learning_rate": 1.7165715542412505e-05,
"loss": 0.7474,
"mean_token_accuracy": 0.7775454163551331,
"num_tokens": 1955041733.0,
"step": 940
},
{
"epoch": 0.972472343709802,
"grad_norm": 0.2036639004945755,
"learning_rate": 1.7132341743740343e-05,
"loss": 0.7591,
"mean_token_accuracy": 0.7744521021842956,
"num_tokens": 1965413435.0,
"step": 945
},
{
"epoch": 0.9776177000257268,
"grad_norm": 0.19717492163181305,
"learning_rate": 1.709881898164515e-05,
"loss": 0.7626,
"mean_token_accuracy": 0.7734819889068604,
"num_tokens": 1975831716.0,
"step": 950
},
{
"epoch": 0.9827630563416516,
"grad_norm": 0.20819610357284546,
"learning_rate": 1.7065148334127137e-05,
"loss": 0.7622,
"mean_token_accuracy": 0.7739403694868088,
"num_tokens": 1986221267.0,
"step": 955
},
{
"epoch": 0.9879084126575766,
"grad_norm": 0.2304711937904358,
"learning_rate": 1.7031330883942106e-05,
"loss": 0.7604,
"mean_token_accuracy": 0.7743852972984314,
"num_tokens": 1996637419.0,
"step": 960
},
{
"epoch": 0.9930537689735014,
"grad_norm": 0.20585313439369202,
"learning_rate": 1.699736771856664e-05,
"loss": 0.7417,
"mean_token_accuracy": 0.7793391734361649,
"num_tokens": 2007024978.0,
"step": 965
},
{
"epoch": 0.9981991252894263,
"grad_norm": 0.20385591685771942,
"learning_rate": 1.6963259930163104e-05,
"loss": 0.7442,
"mean_token_accuracy": 0.7783006697893142,
"num_tokens": 2017443954.0,
"step": 970
},
{
"epoch": 1.003087213789555,
"grad_norm": 0.24495287239551544,
"learning_rate": 1.692900861554457e-05,
"loss": 0.7389,
"mean_token_accuracy": 0.779910335415288,
"num_tokens": 2027325031.0,
"step": 975
},
{
"epoch": 1.0082325701054797,
"grad_norm": 0.2038143426179886,
"learning_rate": 1.68946148761395e-05,
"loss": 0.7429,
"mean_token_accuracy": 0.7774323493242263,
"num_tokens": 2037733216.0,
"step": 980
},
{
"epoch": 1.0133779264214047,
"grad_norm": 0.19667111337184906,
"learning_rate": 1.6860079817956353e-05,
"loss": 0.7276,
"mean_token_accuracy": 0.7817776888608933,
"num_tokens": 2048163457.0,
"step": 985
},
{
"epoch": 1.0185232827373296,
"grad_norm": 0.2280578911304474,
"learning_rate": 1.682540455154801e-05,
"loss": 0.7156,
"mean_token_accuracy": 0.7847582966089248,
"num_tokens": 2058559958.0,
"step": 990
},
{
"epoch": 1.0236686390532543,
"grad_norm": 0.20857077836990356,
"learning_rate": 1.6790590191976068e-05,
"loss": 0.7335,
"mean_token_accuracy": 0.7803497105836869,
"num_tokens": 2068976029.0,
"step": 995
},
{
"epoch": 1.0288139953691793,
"grad_norm": 0.23610693216323853,
"learning_rate": 1.6755637858774986e-05,
"loss": 0.7416,
"mean_token_accuracy": 0.7778135746717453,
"num_tokens": 2079378100.0,
"step": 1000
},
{
"epoch": 1.0339593516851042,
"grad_norm": 0.22110521793365479,
"learning_rate": 1.6720548675916058e-05,
"loss": 0.7354,
"mean_token_accuracy": 0.7797701776027679,
"num_tokens": 2089780930.0,
"step": 1005
},
{
"epoch": 1.0391047080010292,
"grad_norm": 0.2105809450149536,
"learning_rate": 1.6685323771771306e-05,
"loss": 0.727,
"mean_token_accuracy": 0.7818355768918991,
"num_tokens": 2100188750.0,
"step": 1010
},
{
"epoch": 1.0442500643169539,
"grad_norm": 0.20174016058444977,
"learning_rate": 1.664996427907717e-05,
"loss": 0.7327,
"mean_token_accuracy": 0.7801722586154938,
"num_tokens": 2110572072.0,
"step": 1015
},
{
"epoch": 1.0493954206328788,
"grad_norm": 0.1835939884185791,
"learning_rate": 1.6614471334898086e-05,
"loss": 0.7334,
"mean_token_accuracy": 0.7800804376602173,
"num_tokens": 2120973633.0,
"step": 1020
},
{
"epoch": 1.0545407769488038,
"grad_norm": 0.21300305426120758,
"learning_rate": 1.6578846080589934e-05,
"loss": 0.7299,
"mean_token_accuracy": 0.7812818288803101,
"num_tokens": 2131360621.0,
"step": 1025
},
{
"epoch": 1.0596861332647285,
"grad_norm": 0.1908658891916275,
"learning_rate": 1.6543089661763315e-05,
"loss": 0.7223,
"mean_token_accuracy": 0.7834197998046875,
"num_tokens": 2141793569.0,
"step": 1030
},
{
"epoch": 1.0648314895806534,
"grad_norm": 0.19039925932884216,
"learning_rate": 1.650720322824672e-05,
"loss": 0.7361,
"mean_token_accuracy": 0.779027310013771,
"num_tokens": 2152174814.0,
"step": 1035
},
{
"epoch": 1.0699768458965784,
"grad_norm": 0.2063397467136383,
"learning_rate": 1.6471187934049574e-05,
"loss": 0.7237,
"mean_token_accuracy": 0.7826652109622956,
"num_tokens": 2162584314.0,
"step": 1040
},
{
"epoch": 1.0751222022125033,
"grad_norm": 0.20498026907444,
"learning_rate": 1.643504493732509e-05,
"loss": 0.7346,
"mean_token_accuracy": 0.7796628832817077,
"num_tokens": 2172965246.0,
"step": 1045
},
{
"epoch": 1.080267558528428,
"grad_norm": 0.23134800791740417,
"learning_rate": 1.639877540033305e-05,
"loss": 0.7271,
"mean_token_accuracy": 0.7816483587026596,
"num_tokens": 2183370435.0,
"step": 1050
},
{
"epoch": 1.085412914844353,
"grad_norm": 0.19139111042022705,
"learning_rate": 1.6362380489402433e-05,
"loss": 0.7228,
"mean_token_accuracy": 0.7829385250806808,
"num_tokens": 2193756185.0,
"step": 1055
},
{
"epoch": 1.090558271160278,
"grad_norm": 0.23847441375255585,
"learning_rate": 1.6325861374893885e-05,
"loss": 0.7357,
"mean_token_accuracy": 0.7798227697610856,
"num_tokens": 2204144787.0,
"step": 1060
},
{
"epoch": 1.0957036274762026,
"grad_norm": 0.19361141324043274,
"learning_rate": 1.6289219231162107e-05,
"loss": 0.7323,
"mean_token_accuracy": 0.7803295195102692,
"num_tokens": 2214555301.0,
"step": 1065
},
{
"epoch": 1.1008489837921276,
"grad_norm": 0.2365255057811737,
"learning_rate": 1.6252455236518088e-05,
"loss": 0.7223,
"mean_token_accuracy": 0.7834222823381424,
"num_tokens": 2224964461.0,
"step": 1070
},
{
"epoch": 1.1059943401080525,
"grad_norm": 0.22614383697509766,
"learning_rate": 1.6215570573191203e-05,
"loss": 0.7324,
"mean_token_accuracy": 0.7800393998622894,
"num_tokens": 2235385359.0,
"step": 1075
},
{
"epoch": 1.1111396964239773,
"grad_norm": 0.1952890157699585,
"learning_rate": 1.6178566427291196e-05,
"loss": 0.7361,
"mean_token_accuracy": 0.7795492619276047,
"num_tokens": 2245790049.0,
"step": 1080
},
{
"epoch": 1.1162850527399022,
"grad_norm": 0.26438701152801514,
"learning_rate": 1.614144398877006e-05,
"loss": 0.733,
"mean_token_accuracy": 0.7802164793014527,
"num_tokens": 2256182680.0,
"step": 1085
},
{
"epoch": 1.1214304090558271,
"grad_norm": 0.19735948741436005,
"learning_rate": 1.610420445138373e-05,
"loss": 0.7238,
"mean_token_accuracy": 0.7827324986457824,
"num_tokens": 2266607807.0,
"step": 1090
},
{
"epoch": 1.126575765371752,
"grad_norm": 0.20691247284412384,
"learning_rate": 1.6066849012653745e-05,
"loss": 0.727,
"mean_token_accuracy": 0.7817386299371719,
"num_tokens": 2277008491.0,
"step": 1095
},
{
"epoch": 1.1317211216876768,
"grad_norm": 0.22432605922222137,
"learning_rate": 1.6029378873828695e-05,
"loss": 0.7298,
"mean_token_accuracy": 0.7808958977460861,
"num_tokens": 2287392752.0,
"step": 1100
},
{
"epoch": 1.1368664780036017,
"grad_norm": 0.20151114463806152,
"learning_rate": 1.599179523984562e-05,
"loss": 0.737,
"mean_token_accuracy": 0.7791817605495452,
"num_tokens": 2297803721.0,
"step": 1105
},
{
"epoch": 1.1420118343195267,
"grad_norm": 0.2306589037179947,
"learning_rate": 1.5954099319291256e-05,
"loss": 0.7325,
"mean_token_accuracy": 0.7801610469818115,
"num_tokens": 2308187255.0,
"step": 1110
},
{
"epoch": 1.1471571906354514,
"grad_norm": 0.1993226855993271,
"learning_rate": 1.5916292324363156e-05,
"loss": 0.7251,
"mean_token_accuracy": 0.7821116268634796,
"num_tokens": 2318589605.0,
"step": 1115
},
{
"epoch": 1.1523025469513763,
"grad_norm": 0.22113800048828125,
"learning_rate": 1.5878375470830737e-05,
"loss": 0.743,
"mean_token_accuracy": 0.7773738950490952,
"num_tokens": 2328989752.0,
"step": 1120
},
{
"epoch": 1.1574479032673013,
"grad_norm": 0.2003169059753418,
"learning_rate": 1.584034997799615e-05,
"loss": 0.728,
"mean_token_accuracy": 0.7815322816371918,
"num_tokens": 2339418236.0,
"step": 1125
},
{
"epoch": 1.1625932595832262,
"grad_norm": 0.2361934781074524,
"learning_rate": 1.5802217068655103e-05,
"loss": 0.7198,
"mean_token_accuracy": 0.7839639008045196,
"num_tokens": 2349792828.0,
"step": 1130
},
{
"epoch": 1.167738615899151,
"grad_norm": 0.21388159692287445,
"learning_rate": 1.5763977969057514e-05,
"loss": 0.7327,
"mean_token_accuracy": 0.7798545330762863,
"num_tokens": 2360200634.0,
"step": 1135
},
{
"epoch": 1.172883972215076,
"grad_norm": 0.20029175281524658,
"learning_rate": 1.5725633908868098e-05,
"loss": 0.7338,
"mean_token_accuracy": 0.7797937542200089,
"num_tokens": 2370596817.0,
"step": 1140
},
{
"epoch": 1.1780293285310008,
"grad_norm": 0.19658546149730682,
"learning_rate": 1.568718612112681e-05,
"loss": 0.721,
"mean_token_accuracy": 0.7830342799425125,
"num_tokens": 2380988279.0,
"step": 1145
},
{
"epoch": 1.1831746848469256,
"grad_norm": 0.23174834251403809,
"learning_rate": 1.5648635842209197e-05,
"loss": 0.7311,
"mean_token_accuracy": 0.7808991014957428,
"num_tokens": 2391412429.0,
"step": 1150
},
{
"epoch": 1.1883200411628505,
"grad_norm": 0.21775346994400024,
"learning_rate": 1.5609984311786645e-05,
"loss": 0.729,
"mean_token_accuracy": 0.7810937970876694,
"num_tokens": 2401771777.0,
"step": 1155
},
{
"epoch": 1.1934653974787754,
"grad_norm": 0.21019048988819122,
"learning_rate": 1.5571232772786517e-05,
"loss": 0.7253,
"mean_token_accuracy": 0.7820482671260833,
"num_tokens": 2412159833.0,
"step": 1160
},
{
"epoch": 1.1986107537947004,
"grad_norm": 0.20135393738746643,
"learning_rate": 1.553238247135216e-05,
"loss": 0.7276,
"mean_token_accuracy": 0.7814355909824371,
"num_tokens": 2422558752.0,
"step": 1165
},
{
"epoch": 1.203756110110625,
"grad_norm": 0.18914781510829926,
"learning_rate": 1.549343465680287e-05,
"loss": 0.731,
"mean_token_accuracy": 0.7803467661142349,
"num_tokens": 2432969997.0,
"step": 1170
},
{
"epoch": 1.20890146642655,
"grad_norm": 0.21189342439174652,
"learning_rate": 1.5454390581593687e-05,
"loss": 0.7262,
"mean_token_accuracy": 0.7820929646492004,
"num_tokens": 2443373443.0,
"step": 1175
},
{
"epoch": 1.214046822742475,
"grad_norm": 0.1938139647245407,
"learning_rate": 1.541525150127513e-05,
"loss": 0.721,
"mean_token_accuracy": 0.7835055589675903,
"num_tokens": 2453770770.0,
"step": 1180
},
{
"epoch": 1.2191921790583997,
"grad_norm": 0.21657834947109222,
"learning_rate": 1.537601867445283e-05,
"loss": 0.7319,
"mean_token_accuracy": 0.7800745993852616,
"num_tokens": 2464158108.0,
"step": 1185
},
{
"epoch": 1.2243375353743247,
"grad_norm": 0.20266316831111908,
"learning_rate": 1.5336693362747036e-05,
"loss": 0.7274,
"mean_token_accuracy": 0.7812938541173935,
"num_tokens": 2474574860.0,
"step": 1190
},
{
"epoch": 1.2294828916902496,
"grad_norm": 0.21585378050804138,
"learning_rate": 1.5297276830752074e-05,
"loss": 0.7262,
"mean_token_accuracy": 0.7819346249103546,
"num_tokens": 2484977909.0,
"step": 1195
},
{
"epoch": 1.2346282480061745,
"grad_norm": 0.22400428354740143,
"learning_rate": 1.5257770345995648e-05,
"loss": 0.7325,
"mean_token_accuracy": 0.7801924586296082,
"num_tokens": 2495371556.0,
"step": 1200
},
{
"epoch": 1.2397736043220993,
"grad_norm": 0.2023976743221283,
"learning_rate": 1.5218175178898106e-05,
"loss": 0.7202,
"mean_token_accuracy": 0.7832947343587875,
"num_tokens": 2505752888.0,
"step": 1205
},
{
"epoch": 1.2449189606380242,
"grad_norm": 0.22408372163772583,
"learning_rate": 1.5178492602731581e-05,
"loss": 0.7254,
"mean_token_accuracy": 0.7819278568029404,
"num_tokens": 2516182631.0,
"step": 1210
},
{
"epoch": 1.250064316953949,
"grad_norm": 0.24158449470996857,
"learning_rate": 1.5138723893579028e-05,
"loss": 0.7296,
"mean_token_accuracy": 0.781261432170868,
"num_tokens": 2526605123.0,
"step": 1215
},
{
"epoch": 1.2552096732698739,
"grad_norm": 0.2226988673210144,
"learning_rate": 1.5098870330293218e-05,
"loss": 0.7171,
"mean_token_accuracy": 0.7841993749141694,
"num_tokens": 2537002245.0,
"step": 1220
},
{
"epoch": 1.2603550295857988,
"grad_norm": 0.23769278824329376,
"learning_rate": 1.505893319445559e-05,
"loss": 0.7325,
"mean_token_accuracy": 0.7801701694726944,
"num_tokens": 2547440043.0,
"step": 1225
},
{
"epoch": 1.2655003859017238,
"grad_norm": 0.1922103464603424,
"learning_rate": 1.5018913770335046e-05,
"loss": 0.7274,
"mean_token_accuracy": 0.781620192527771,
"num_tokens": 2557858544.0,
"step": 1230
},
{
"epoch": 1.2706457422176487,
"grad_norm": 0.18663917481899261,
"learning_rate": 1.4978813344846661e-05,
"loss": 0.7391,
"mean_token_accuracy": 0.7782456696033477,
"num_tokens": 2568263241.0,
"step": 1235
},
{
"epoch": 1.2757910985335734,
"grad_norm": 0.19044432044029236,
"learning_rate": 1.4938633207510287e-05,
"loss": 0.7267,
"mean_token_accuracy": 0.7819855302572251,
"num_tokens": 2578685798.0,
"step": 1240
},
{
"epoch": 1.2809364548494984,
"grad_norm": 0.2261897772550583,
"learning_rate": 1.4898374650409094e-05,
"loss": 0.7266,
"mean_token_accuracy": 0.7815109491348267,
"num_tokens": 2589079528.0,
"step": 1245
},
{
"epoch": 1.286081811165423,
"grad_norm": 0.27746737003326416,
"learning_rate": 1.485803896814801e-05,
"loss": 0.7255,
"mean_token_accuracy": 0.7821037322282791,
"num_tokens": 2599488251.0,
"step": 1250
},
{
"epoch": 1.291227167481348,
"grad_norm": 0.21356722712516785,
"learning_rate": 1.4817627457812107e-05,
"loss": 0.7218,
"mean_token_accuracy": 0.7831762701272964,
"num_tokens": 2609884395.0,
"step": 1255
},
{
"epoch": 1.296372523797273,
"grad_norm": 0.1928669810295105,
"learning_rate": 1.4777141418924874e-05,
"loss": 0.7222,
"mean_token_accuracy": 0.7831795990467072,
"num_tokens": 2620262838.0,
"step": 1260
},
{
"epoch": 1.301517880113198,
"grad_norm": 0.20577941834926605,
"learning_rate": 1.4736582153406431e-05,
"loss": 0.7345,
"mean_token_accuracy": 0.7796419382095336,
"num_tokens": 2630615363.0,
"step": 1265
},
{
"epoch": 1.3066632364291229,
"grad_norm": 0.2071027308702469,
"learning_rate": 1.4695950965531679e-05,
"loss": 0.7207,
"mean_token_accuracy": 0.7833283305168152,
"num_tokens": 2641010705.0,
"step": 1270
},
{
"epoch": 1.3118085927450476,
"grad_norm": 0.20452018082141876,
"learning_rate": 1.4655249161888322e-05,
"loss": 0.7219,
"mean_token_accuracy": 0.7826146423816681,
"num_tokens": 2651367538.0,
"step": 1275
},
{
"epoch": 1.3169539490609725,
"grad_norm": 0.19734811782836914,
"learning_rate": 1.46144780513349e-05,
"loss": 0.7293,
"mean_token_accuracy": 0.7808092325925827,
"num_tokens": 2661752746.0,
"step": 1280
},
{
"epoch": 1.3220993053768972,
"grad_norm": 0.2132222205400467,
"learning_rate": 1.4573638944958654e-05,
"loss": 0.7213,
"mean_token_accuracy": 0.783295625448227,
"num_tokens": 2672170022.0,
"step": 1285
},
{
"epoch": 1.3272446616928222,
"grad_norm": 0.20818860828876495,
"learning_rate": 1.4532733156033399e-05,
"loss": 0.7257,
"mean_token_accuracy": 0.7817448288202286,
"num_tokens": 2682534056.0,
"step": 1290
},
{
"epoch": 1.3323900180087471,
"grad_norm": 0.21955865621566772,
"learning_rate": 1.449176199997726e-05,
"loss": 0.7227,
"mean_token_accuracy": 0.7827985614538193,
"num_tokens": 2692942602.0,
"step": 1295
},
{
"epoch": 1.337535374324672,
"grad_norm": 0.2140192687511444,
"learning_rate": 1.4450726794310408e-05,
"loss": 0.7245,
"mean_token_accuracy": 0.7822914987802505,
"num_tokens": 2703334544.0,
"step": 1300
},
{
"epoch": 1.3426807306405968,
"grad_norm": 0.19470317661762238,
"learning_rate": 1.4409628858612665e-05,
"loss": 0.725,
"mean_token_accuracy": 0.7822528421878815,
"num_tokens": 2713729545.0,
"step": 1305
},
{
"epoch": 1.3478260869565217,
"grad_norm": 0.19224713742733002,
"learning_rate": 1.4368469514481083e-05,
"loss": 0.7159,
"mean_token_accuracy": 0.784244379401207,
"num_tokens": 2724133639.0,
"step": 1310
},
{
"epoch": 1.3529714432724467,
"grad_norm": 0.19122512638568878,
"learning_rate": 1.4327250085487435e-05,
"loss": 0.7318,
"mean_token_accuracy": 0.7805237233638763,
"num_tokens": 2734527785.0,
"step": 1315
},
{
"epoch": 1.3581167995883714,
"grad_norm": 0.19252869486808777,
"learning_rate": 1.428597189713566e-05,
"loss": 0.721,
"mean_token_accuracy": 0.7833033174276351,
"num_tokens": 2744932620.0,
"step": 1320
},
{
"epoch": 1.3632621559042963,
"grad_norm": 0.20513495802879333,
"learning_rate": 1.4244636276819247e-05,
"loss": 0.7288,
"mean_token_accuracy": 0.7811249732971192,
"num_tokens": 2755296881.0,
"step": 1325
},
{
"epoch": 1.3684075122202213,
"grad_norm": 0.21072103083133698,
"learning_rate": 1.4203244553778523e-05,
"loss": 0.7267,
"mean_token_accuracy": 0.781619307398796,
"num_tokens": 2765692282.0,
"step": 1330
},
{
"epoch": 1.3735528685361462,
"grad_norm": 0.23436151444911957,
"learning_rate": 1.4161798059057942e-05,
"loss": 0.7221,
"mean_token_accuracy": 0.7828515231609344,
"num_tokens": 2776081454.0,
"step": 1335
},
{
"epoch": 1.378698224852071,
"grad_norm": 0.19753047823905945,
"learning_rate": 1.4120298125463252e-05,
"loss": 0.73,
"mean_token_accuracy": 0.7808351576328277,
"num_tokens": 2786502061.0,
"step": 1340
},
{
"epoch": 1.3838435811679959,
"grad_norm": 0.2203502655029297,
"learning_rate": 1.4078746087518655e-05,
"loss": 0.7337,
"mean_token_accuracy": 0.7793877094984054,
"num_tokens": 2796907592.0,
"step": 1345
},
{
"epoch": 1.3889889374839208,
"grad_norm": 0.19259704649448395,
"learning_rate": 1.4037143281423885e-05,
"loss": 0.7254,
"mean_token_accuracy": 0.7815340638160706,
"num_tokens": 2807315293.0,
"step": 1350
},
{
"epoch": 1.3941342937998455,
"grad_norm": 0.20095007121562958,
"learning_rate": 1.3995491045011243e-05,
"loss": 0.7248,
"mean_token_accuracy": 0.7817043244838715,
"num_tokens": 2817690875.0,
"step": 1355
},
{
"epoch": 1.3992796501157705,
"grad_norm": 0.1883144974708557,
"learning_rate": 1.395379071770257e-05,
"loss": 0.7207,
"mean_token_accuracy": 0.7828416913747788,
"num_tokens": 2828099023.0,
"step": 1360
},
{
"epoch": 1.4044250064316954,
"grad_norm": 0.18824101984500885,
"learning_rate": 1.3912043640466175e-05,
"loss": 0.7194,
"mean_token_accuracy": 0.7835315078496933,
"num_tokens": 2838513670.0,
"step": 1365
},
{
"epoch": 1.4095703627476204,
"grad_norm": 0.19366908073425293,
"learning_rate": 1.387025115577373e-05,
"loss": 0.74,
"mean_token_accuracy": 0.7780154138803482,
"num_tokens": 2848908140.0,
"step": 1370
},
{
"epoch": 1.414715719063545,
"grad_norm": 0.18834272027015686,
"learning_rate": 1.382841460755707e-05,
"loss": 0.7279,
"mean_token_accuracy": 0.7807209342718124,
"num_tokens": 2859277820.0,
"step": 1375
},
{
"epoch": 1.41986107537947,
"grad_norm": 0.20872853696346283,
"learning_rate": 1.378653534116501e-05,
"loss": 0.7174,
"mean_token_accuracy": 0.7837792187929153,
"num_tokens": 2869692721.0,
"step": 1380
},
{
"epoch": 1.425006431695395,
"grad_norm": 0.21112094819545746,
"learning_rate": 1.3744614703320046e-05,
"loss": 0.7229,
"mean_token_accuracy": 0.7826662242412568,
"num_tokens": 2880093189.0,
"step": 1385
},
{
"epoch": 1.4301517880113197,
"grad_norm": 0.18763834238052368,
"learning_rate": 1.3702654042075077e-05,
"loss": 0.7244,
"mean_token_accuracy": 0.7820846647024154,
"num_tokens": 2890505046.0,
"step": 1390
},
{
"epoch": 1.4352971443272446,
"grad_norm": 0.1809283047914505,
"learning_rate": 1.3660654706770045e-05,
"loss": 0.7304,
"mean_token_accuracy": 0.7803399920463562,
"num_tokens": 2900869962.0,
"step": 1395
},
{
"epoch": 1.4404425006431696,
"grad_norm": 0.19696731865406036,
"learning_rate": 1.3618618047988541e-05,
"loss": 0.7229,
"mean_token_accuracy": 0.7826083064079284,
"num_tokens": 2911235205.0,
"step": 1400
},
{
"epoch": 1.4455878569590945,
"grad_norm": 0.21456697583198547,
"learning_rate": 1.3576545417514379e-05,
"loss": 0.7238,
"mean_token_accuracy": 0.7821739882230758,
"num_tokens": 2921630626.0,
"step": 1405
},
{
"epoch": 1.4507332132750193,
"grad_norm": 0.2006523162126541,
"learning_rate": 1.3534438168288122e-05,
"loss": 0.7236,
"mean_token_accuracy": 0.7829265475273133,
"num_tokens": 2931987655.0,
"step": 1410
},
{
"epoch": 1.4558785695909442,
"grad_norm": 0.20633164048194885,
"learning_rate": 1.3492297654363582e-05,
"loss": 0.7303,
"mean_token_accuracy": 0.7806042492389679,
"num_tokens": 2942398851.0,
"step": 1415
},
{
"epoch": 1.461023925906869,
"grad_norm": 0.18342465162277222,
"learning_rate": 1.3450125230864265e-05,
"loss": 0.7221,
"mean_token_accuracy": 0.7832267910242081,
"num_tokens": 2952797589.0,
"step": 1420
},
{
"epoch": 1.4661692822227939,
"grad_norm": 0.18584422767162323,
"learning_rate": 1.3407922253939801e-05,
"loss": 0.7207,
"mean_token_accuracy": 0.7827514231204986,
"num_tokens": 2963142662.0,
"step": 1425
},
{
"epoch": 1.4713146385387188,
"grad_norm": 0.2012760490179062,
"learning_rate": 1.3365690080722349e-05,
"loss": 0.7228,
"mean_token_accuracy": 0.7825741022825241,
"num_tokens": 2973551267.0,
"step": 1430
},
{
"epoch": 1.4764599948546437,
"grad_norm": 0.1869860738515854,
"learning_rate": 1.3323430069282922e-05,
"loss": 0.7123,
"mean_token_accuracy": 0.7854143947362899,
"num_tokens": 2983967802.0,
"step": 1435
},
{
"epoch": 1.4816053511705687,
"grad_norm": 0.18766240775585175,
"learning_rate": 1.3281143578587747e-05,
"loss": 0.7204,
"mean_token_accuracy": 0.7834302335977554,
"num_tokens": 2994361017.0,
"step": 1440
},
{
"epoch": 1.4867507074864934,
"grad_norm": 0.20129527151584625,
"learning_rate": 1.3238831968454547e-05,
"loss": 0.7295,
"mean_token_accuracy": 0.7809592038393021,
"num_tokens": 3004789429.0,
"step": 1445
},
{
"epoch": 1.4918960638024183,
"grad_norm": 0.1840723752975464,
"learning_rate": 1.3196496599508818e-05,
"loss": 0.7164,
"mean_token_accuracy": 0.7845001757144928,
"num_tokens": 3015178802.0,
"step": 1450
},
{
"epoch": 1.497041420118343,
"grad_norm": 0.18060751259326935,
"learning_rate": 1.3154138833140066e-05,
"loss": 0.7194,
"mean_token_accuracy": 0.7834681749343873,
"num_tokens": 3025590677.0,
"step": 1455
},
{
"epoch": 1.502186776434268,
"grad_norm": 0.20038992166519165,
"learning_rate": 1.3111760031458056e-05,
"loss": 0.7234,
"mean_token_accuracy": 0.7822674155235291,
"num_tokens": 3036005732.0,
"step": 1460
},
{
"epoch": 1.507332132750193,
"grad_norm": 0.17819659411907196,
"learning_rate": 1.3069361557248972e-05,
"loss": 0.7255,
"mean_token_accuracy": 0.781733363866806,
"num_tokens": 3046403093.0,
"step": 1465
},
{
"epoch": 1.512477489066118,
"grad_norm": 0.18325041234493256,
"learning_rate": 1.3026944773931623e-05,
"loss": 0.7241,
"mean_token_accuracy": 0.7819523394107819,
"num_tokens": 3056805589.0,
"step": 1470
},
{
"epoch": 1.5176228453820428,
"grad_norm": 0.1812821924686432,
"learning_rate": 1.2984511045513583e-05,
"loss": 0.7336,
"mean_token_accuracy": 0.7796476185321808,
"num_tokens": 3067202755.0,
"step": 1475
},
{
"epoch": 1.5227682016979676,
"grad_norm": 0.1757994294166565,
"learning_rate": 1.2942061736547338e-05,
"loss": 0.7252,
"mean_token_accuracy": 0.7821748554706573,
"num_tokens": 3077623015.0,
"step": 1480
},
{
"epoch": 1.5279135580138925,
"grad_norm": 0.18919821083545685,
"learning_rate": 1.2899598212086407e-05,
"loss": 0.7224,
"mean_token_accuracy": 0.7825713455677032,
"num_tokens": 3088009778.0,
"step": 1485
},
{
"epoch": 1.5330589143298172,
"grad_norm": 0.18530713021755219,
"learning_rate": 1.285712183764142e-05,
"loss": 0.7308,
"mean_token_accuracy": 0.7805260062217713,
"num_tokens": 3098430248.0,
"step": 1490
},
{
"epoch": 1.5382042706457422,
"grad_norm": 0.1960536390542984,
"learning_rate": 1.2814633979136254e-05,
"loss": 0.7224,
"mean_token_accuracy": 0.7830262005329132,
"num_tokens": 3108837811.0,
"step": 1495
},
{
"epoch": 1.543349626961667,
"grad_norm": 0.19076813757419586,
"learning_rate": 1.2772136002864067e-05,
"loss": 0.7221,
"mean_token_accuracy": 0.783091539144516,
"num_tokens": 3119231467.0,
"step": 1500
},
{
"epoch": 1.548494983277592,
"grad_norm": 0.20653417706489563,
"learning_rate": 1.2729629275443373e-05,
"loss": 0.7256,
"mean_token_accuracy": 0.781879261136055,
"num_tokens": 3129640782.0,
"step": 1505
},
{
"epoch": 1.553640339593517,
"grad_norm": 0.19807811081409454,
"learning_rate": 1.268711516377411e-05,
"loss": 0.7156,
"mean_token_accuracy": 0.7845076858997345,
"num_tokens": 3140041502.0,
"step": 1510
},
{
"epoch": 1.5587856959094417,
"grad_norm": 0.18549486994743347,
"learning_rate": 1.2644595034993667e-05,
"loss": 0.7145,
"mean_token_accuracy": 0.7843082189559937,
"num_tokens": 3150443408.0,
"step": 1515
},
{
"epoch": 1.5639310522253667,
"grad_norm": 0.18279866874217987,
"learning_rate": 1.260207025643293e-05,
"loss": 0.7247,
"mean_token_accuracy": 0.7818575918674469,
"num_tokens": 3160858821.0,
"step": 1520
},
{
"epoch": 1.5690764085412914,
"grad_norm": 0.19264079630374908,
"learning_rate": 1.25595421955723e-05,
"loss": 0.7373,
"mean_token_accuracy": 0.7786633670330048,
"num_tokens": 3171243172.0,
"step": 1525
},
{
"epoch": 1.5742217648572163,
"grad_norm": 0.20798954367637634,
"learning_rate": 1.2517012219997743e-05,
"loss": 0.7263,
"mean_token_accuracy": 0.7813588201999664,
"num_tokens": 3181666806.0,
"step": 1530
},
{
"epoch": 1.5793671211731413,
"grad_norm": 0.22132764756679535,
"learning_rate": 1.2474481697356784e-05,
"loss": 0.7158,
"mean_token_accuracy": 0.7841507345438004,
"num_tokens": 3192068095.0,
"step": 1535
},
{
"epoch": 1.5845124774890662,
"grad_norm": 0.1778470277786255,
"learning_rate": 1.2431951995314547e-05,
"loss": 0.7112,
"mean_token_accuracy": 0.7860257804393769,
"num_tokens": 3202473819.0,
"step": 1540
},
{
"epoch": 1.5896578338049911,
"grad_norm": 0.20948489010334015,
"learning_rate": 1.2389424481509766e-05,
"loss": 0.7283,
"mean_token_accuracy": 0.7805652767419815,
"num_tokens": 3212838888.0,
"step": 1545
},
{
"epoch": 1.5948031901209159,
"grad_norm": 0.22574825584888458,
"learning_rate": 1.2346900523510804e-05,
"loss": 0.7246,
"mean_token_accuracy": 0.7815930396318436,
"num_tokens": 3223222863.0,
"step": 1550
},
{
"epoch": 1.5999485464368406,
"grad_norm": 0.19260616600513458,
"learning_rate": 1.2304381488771684e-05,
"loss": 0.7192,
"mean_token_accuracy": 0.7832415938377381,
"num_tokens": 3233636338.0,
"step": 1555
},
{
"epoch": 1.6050939027527655,
"grad_norm": 0.2285853922367096,
"learning_rate": 1.2261868744588108e-05,
"loss": 0.7123,
"mean_token_accuracy": 0.7852210611104965,
"num_tokens": 3244050232.0,
"step": 1560
},
{
"epoch": 1.6102392590686905,
"grad_norm": 0.21371160447597504,
"learning_rate": 1.2219363658053496e-05,
"loss": 0.7236,
"mean_token_accuracy": 0.7823358774185181,
"num_tokens": 3254430770.0,
"step": 1565
},
{
"epoch": 1.6153846153846154,
"grad_norm": 0.20527417957782745,
"learning_rate": 1.217686759601501e-05,
"loss": 0.7147,
"mean_token_accuracy": 0.7844323009252548,
"num_tokens": 3264847891.0,
"step": 1570
},
{
"epoch": 1.6205299717005404,
"grad_norm": 0.18310104310512543,
"learning_rate": 1.2134381925029613e-05,
"loss": 0.725,
"mean_token_accuracy": 0.7815193057060241,
"num_tokens": 3275273883.0,
"step": 1575
},
{
"epoch": 1.6256753280164653,
"grad_norm": 0.18352478742599487,
"learning_rate": 1.209190801132012e-05,
"loss": 0.7174,
"mean_token_accuracy": 0.7840063363313675,
"num_tokens": 3285662067.0,
"step": 1580
},
{
"epoch": 1.63082068433239,
"grad_norm": 0.1829749196767807,
"learning_rate": 1.2049447220731266e-05,
"loss": 0.7256,
"mean_token_accuracy": 0.7813293248414993,
"num_tokens": 3296058473.0,
"step": 1585
},
{
"epoch": 1.6359660406483147,
"grad_norm": 0.18960633873939514,
"learning_rate": 1.2007000918685786e-05,
"loss": 0.7283,
"mean_token_accuracy": 0.780817398428917,
"num_tokens": 3306487200.0,
"step": 1590
},
{
"epoch": 1.6411113969642397,
"grad_norm": 0.18444406986236572,
"learning_rate": 1.196457047014049e-05,
"loss": 0.7166,
"mean_token_accuracy": 0.7841069996356964,
"num_tokens": 3316886128.0,
"step": 1595
},
{
"epoch": 1.6462567532801646,
"grad_norm": 0.18856759369373322,
"learning_rate": 1.1922157239542396e-05,
"loss": 0.7224,
"mean_token_accuracy": 0.7824205875396728,
"num_tokens": 3327276884.0,
"step": 1600
},
{
"epoch": 1.6514021095960896,
"grad_norm": 0.18209627270698547,
"learning_rate": 1.1879762590784832e-05,
"loss": 0.7097,
"mean_token_accuracy": 0.7861823856830596,
"num_tokens": 3337639665.0,
"step": 1605
},
{
"epoch": 1.6565474659120145,
"grad_norm": 0.18820519745349884,
"learning_rate": 1.1837387887163594e-05,
"loss": 0.7186,
"mean_token_accuracy": 0.783584377169609,
"num_tokens": 3348052567.0,
"step": 1610
},
{
"epoch": 1.6616928222279392,
"grad_norm": 0.2538444399833679,
"learning_rate": 1.1795034491333089e-05,
"loss": 0.7258,
"mean_token_accuracy": 0.7812894821166992,
"num_tokens": 3358442738.0,
"step": 1615
},
{
"epoch": 1.6668381785438642,
"grad_norm": 0.22863799333572388,
"learning_rate": 1.175270376526252e-05,
"loss": 0.7285,
"mean_token_accuracy": 0.7804809838533402,
"num_tokens": 3368845776.0,
"step": 1620
},
{
"epoch": 1.671983534859789,
"grad_norm": 0.1943485289812088,
"learning_rate": 1.1710397070192103e-05,
"loss": 0.7131,
"mean_token_accuracy": 0.7848410785198212,
"num_tokens": 3379247751.0,
"step": 1625
},
{
"epoch": 1.6771288911757138,
"grad_norm": 0.25817155838012695,
"learning_rate": 1.1668115766589278e-05,
"loss": 0.7209,
"mean_token_accuracy": 0.7828822433948517,
"num_tokens": 3389594723.0,
"step": 1630
},
{
"epoch": 1.6822742474916388,
"grad_norm": 0.21201983094215393,
"learning_rate": 1.1625861214104967e-05,
"loss": 0.7148,
"mean_token_accuracy": 0.7844018071889878,
"num_tokens": 3399983766.0,
"step": 1635
},
{
"epoch": 1.6874196038075637,
"grad_norm": 0.20084577798843384,
"learning_rate": 1.1583634771529843e-05,
"loss": 0.7167,
"mean_token_accuracy": 0.783909472823143,
"num_tokens": 3410348245.0,
"step": 1640
},
{
"epoch": 1.6925649601234887,
"grad_norm": 0.19142308831214905,
"learning_rate": 1.1541437796750651e-05,
"loss": 0.7216,
"mean_token_accuracy": 0.7828730583190918,
"num_tokens": 3420763568.0,
"step": 1645
},
{
"epoch": 1.6977103164394134,
"grad_norm": 0.21393629908561707,
"learning_rate": 1.1499271646706525e-05,
"loss": 0.7299,
"mean_token_accuracy": 0.7805344760417938,
"num_tokens": 3431161357.0,
"step": 1650
},
{
"epoch": 1.7028556727553383,
"grad_norm": 0.20122776925563812,
"learning_rate": 1.1457137677345362e-05,
"loss": 0.7147,
"mean_token_accuracy": 0.7848177880048752,
"num_tokens": 3441567248.0,
"step": 1655
},
{
"epoch": 1.708001029071263,
"grad_norm": 0.21072812378406525,
"learning_rate": 1.1415037243580219e-05,
"loss": 0.7069,
"mean_token_accuracy": 0.7868409931659699,
"num_tokens": 3451983024.0,
"step": 1660
},
{
"epoch": 1.713146385387188,
"grad_norm": 0.19437959790229797,
"learning_rate": 1.1372971699245732e-05,
"loss": 0.7196,
"mean_token_accuracy": 0.7829753428697586,
"num_tokens": 3462383943.0,
"step": 1665
},
{
"epoch": 1.718291741703113,
"grad_norm": 0.19020071625709534,
"learning_rate": 1.1330942397054599e-05,
"loss": 0.7231,
"mean_token_accuracy": 0.7823878258466721,
"num_tokens": 3472795076.0,
"step": 1670
},
{
"epoch": 1.7234370980190379,
"grad_norm": 0.17885635793209076,
"learning_rate": 1.1288950688554068e-05,
"loss": 0.7307,
"mean_token_accuracy": 0.7804099589586257,
"num_tokens": 3483201272.0,
"step": 1675
},
{
"epoch": 1.7285824543349628,
"grad_norm": 0.1998845338821411,
"learning_rate": 1.1246997924082465e-05,
"loss": 0.7178,
"mean_token_accuracy": 0.7835394382476807,
"num_tokens": 3493592730.0,
"step": 1680
},
{
"epoch": 1.7337278106508875,
"grad_norm": 0.1758834719657898,
"learning_rate": 1.1205085452725796e-05,
"loss": 0.7246,
"mean_token_accuracy": 0.7821025729179383,
"num_tokens": 3504000714.0,
"step": 1685
},
{
"epoch": 1.7388731669668125,
"grad_norm": 0.18512143194675446,
"learning_rate": 1.116321462227435e-05,
"loss": 0.7217,
"mean_token_accuracy": 0.7826696693897247,
"num_tokens": 3514403874.0,
"step": 1690
},
{
"epoch": 1.7440185232827372,
"grad_norm": 0.18202784657478333,
"learning_rate": 1.112138677917935e-05,
"loss": 0.7098,
"mean_token_accuracy": 0.7855264693498611,
"num_tokens": 3524800705.0,
"step": 1695
},
{
"epoch": 1.7491638795986622,
"grad_norm": 0.17856541275978088,
"learning_rate": 1.1079603268509671e-05,
"loss": 0.7223,
"mean_token_accuracy": 0.7820982217788697,
"num_tokens": 3535216074.0,
"step": 1700
},
{
"epoch": 1.754309235914587,
"grad_norm": 0.18560512363910675,
"learning_rate": 1.1037865433908574e-05,
"loss": 0.7163,
"mean_token_accuracy": 0.7839051306247711,
"num_tokens": 3545649484.0,
"step": 1705
},
{
"epoch": 1.759454592230512,
"grad_norm": 0.1768876314163208,
"learning_rate": 1.0996174617550506e-05,
"loss": 0.7147,
"mean_token_accuracy": 0.78475923538208,
"num_tokens": 3556016373.0,
"step": 1710
},
{
"epoch": 1.764599948546437,
"grad_norm": 0.18054403364658356,
"learning_rate": 1.0954532160097937e-05,
"loss": 0.7199,
"mean_token_accuracy": 0.783240556716919,
"num_tokens": 3566403061.0,
"step": 1715
},
{
"epoch": 1.7697453048623617,
"grad_norm": 0.1674446016550064,
"learning_rate": 1.0912939400658243e-05,
"loss": 0.7223,
"mean_token_accuracy": 0.7823190867900849,
"num_tokens": 3576811704.0,
"step": 1720
},
{
"epoch": 1.7748906611782866,
"grad_norm": 0.19289781153202057,
"learning_rate": 1.0871397676740647e-05,
"loss": 0.7268,
"mean_token_accuracy": 0.7810219496488571,
"num_tokens": 3587228256.0,
"step": 1725
},
{
"epoch": 1.7800360174942114,
"grad_norm": 0.19123966991901398,
"learning_rate": 1.0829908324213214e-05,
"loss": 0.7203,
"mean_token_accuracy": 0.7828882426023483,
"num_tokens": 3597638203.0,
"step": 1730
},
{
"epoch": 1.7851813738101363,
"grad_norm": 0.17165765166282654,
"learning_rate": 1.0788472677259888e-05,
"loss": 0.7237,
"mean_token_accuracy": 0.7817003160715104,
"num_tokens": 3608044316.0,
"step": 1735
},
{
"epoch": 1.7903267301260612,
"grad_norm": 0.17944514751434326,
"learning_rate": 1.074709206833759e-05,
"loss": 0.7254,
"mean_token_accuracy": 0.7813136577606201,
"num_tokens": 3618445618.0,
"step": 1740
},
{
"epoch": 1.7954720864419862,
"grad_norm": 0.19152477383613586,
"learning_rate": 1.070576782813336e-05,
"loss": 0.7244,
"mean_token_accuracy": 0.7815535515546799,
"num_tokens": 3628864024.0,
"step": 1745
},
{
"epoch": 1.8006174427579111,
"grad_norm": 0.20791219174861908,
"learning_rate": 1.0664501285521585e-05,
"loss": 0.7185,
"mean_token_accuracy": 0.7832733541727066,
"num_tokens": 3639237896.0,
"step": 1750
},
{
"epoch": 1.8057627990738359,
"grad_norm": 0.17812472581863403,
"learning_rate": 1.0623293767521248e-05,
"loss": 0.7241,
"mean_token_accuracy": 0.7819753557443618,
"num_tokens": 3649639864.0,
"step": 1755
},
{
"epoch": 1.8109081553897606,
"grad_norm": 0.2049945890903473,
"learning_rate": 1.0582146599253271e-05,
"loss": 0.7257,
"mean_token_accuracy": 0.7812327802181244,
"num_tokens": 3660058988.0,
"step": 1760
},
{
"epoch": 1.8160535117056855,
"grad_norm": 0.17638404667377472,
"learning_rate": 1.0541061103897881e-05,
"loss": 0.7224,
"mean_token_accuracy": 0.782220122218132,
"num_tokens": 3670478945.0,
"step": 1765
},
{
"epoch": 1.8211988680216105,
"grad_norm": 0.19884805381298065,
"learning_rate": 1.0500038602652087e-05,
"loss": 0.7231,
"mean_token_accuracy": 0.7823522746562958,
"num_tokens": 3680853435.0,
"step": 1770
},
{
"epoch": 1.8263442243375354,
"grad_norm": 0.2005850225687027,
"learning_rate": 1.0459080414687166e-05,
"loss": 0.7109,
"mean_token_accuracy": 0.7854242146015167,
"num_tokens": 3691255046.0,
"step": 1775
},
{
"epoch": 1.8314895806534603,
"grad_norm": 0.18633082509040833,
"learning_rate": 1.041818785710627e-05,
"loss": 0.7138,
"mean_token_accuracy": 0.7844531148672104,
"num_tokens": 3701666509.0,
"step": 1780
},
{
"epoch": 1.8366349369693853,
"grad_norm": 0.20367996394634247,
"learning_rate": 1.037736224490205e-05,
"loss": 0.7277,
"mean_token_accuracy": 0.7811777710914611,
"num_tokens": 3712030587.0,
"step": 1785
},
{
"epoch": 1.84178029328531,
"grad_norm": 0.19179487228393555,
"learning_rate": 1.033660489091437e-05,
"loss": 0.7184,
"mean_token_accuracy": 0.7832197934389115,
"num_tokens": 3722418473.0,
"step": 1790
},
{
"epoch": 1.8469256496012347,
"grad_norm": 0.1956593543291092,
"learning_rate": 1.0295917105788116e-05,
"loss": 0.7176,
"mean_token_accuracy": 0.7836333483457565,
"num_tokens": 3732817985.0,
"step": 1795
},
{
"epoch": 1.8520710059171597,
"grad_norm": 0.18963828682899475,
"learning_rate": 1.0255300197931008e-05,
"loss": 0.7264,
"mean_token_accuracy": 0.781423020362854,
"num_tokens": 3743226210.0,
"step": 1800
},
{
"epoch": 1.8572163622330846,
"grad_norm": 0.16611480712890625,
"learning_rate": 1.021475547347157e-05,
"loss": 0.7201,
"mean_token_accuracy": 0.7829820960760117,
"num_tokens": 3753595703.0,
"step": 1805
},
{
"epoch": 1.8623617185490096,
"grad_norm": 0.17385919392108917,
"learning_rate": 1.017428423621708e-05,
"loss": 0.7294,
"mean_token_accuracy": 0.7805358350276947,
"num_tokens": 3764011843.0,
"step": 1810
},
{
"epoch": 1.8675070748649345,
"grad_norm": 0.18427050113677979,
"learning_rate": 1.0133887787611691e-05,
"loss": 0.7199,
"mean_token_accuracy": 0.7830179333686829,
"num_tokens": 3774434254.0,
"step": 1815
},
{
"epoch": 1.8726524311808592,
"grad_norm": 0.18085253238677979,
"learning_rate": 1.0093567426694544e-05,
"loss": 0.71,
"mean_token_accuracy": 0.7860257983207702,
"num_tokens": 3784829343.0,
"step": 1820
},
{
"epoch": 1.8777977874967842,
"grad_norm": 0.18546296656131744,
"learning_rate": 1.0053324450058017e-05,
"loss": 0.7121,
"mean_token_accuracy": 0.7851428180932999,
"num_tokens": 3795222093.0,
"step": 1825
},
{
"epoch": 1.8829431438127089,
"grad_norm": 0.1925223022699356,
"learning_rate": 1.0013160151806019e-05,
"loss": 0.7235,
"mean_token_accuracy": 0.7820713192224502,
"num_tokens": 3805638950.0,
"step": 1830
},
{
"epoch": 1.8880885001286338,
"grad_norm": 0.17734932899475098,
"learning_rate": 9.973075823512368e-06,
"loss": 0.7232,
"mean_token_accuracy": 0.78176209628582,
"num_tokens": 3816016677.0,
"step": 1835
},
{
"epoch": 1.8932338564445588,
"grad_norm": 0.1860755831003189,
"learning_rate": 9.933072754179285e-06,
"loss": 0.7142,
"mean_token_accuracy": 0.7846502423286438,
"num_tokens": 3826390292.0,
"step": 1840
},
{
"epoch": 1.8983792127604837,
"grad_norm": 0.17822657525539398,
"learning_rate": 9.893152230195909e-06,
"loss": 0.7158,
"mean_token_accuracy": 0.7840298235416412,
"num_tokens": 3836783019.0,
"step": 1845
},
{
"epoch": 1.9035245690764087,
"grad_norm": 0.19308920204639435,
"learning_rate": 9.85331553529696e-06,
"loss": 0.7089,
"mean_token_accuracy": 0.7863038212060929,
"num_tokens": 3847188499.0,
"step": 1850
},
{
"epoch": 1.9086699253923334,
"grad_norm": 0.18061098456382751,
"learning_rate": 9.813563950521435e-06,
"loss": 0.7116,
"mean_token_accuracy": 0.7850539714097977,
"num_tokens": 3857595213.0,
"step": 1855
},
{
"epoch": 1.9138152817082583,
"grad_norm": 0.18107041716575623,
"learning_rate": 9.773898754171425e-06,
"loss": 0.7219,
"mean_token_accuracy": 0.7823956727981567,
"num_tokens": 3867991091.0,
"step": 1860
},
{
"epoch": 1.918960638024183,
"grad_norm": 0.18848438560962677,
"learning_rate": 9.734321221771003e-06,
"loss": 0.7211,
"mean_token_accuracy": 0.7825707286596298,
"num_tokens": 3878420983.0,
"step": 1865
},
{
"epoch": 1.924105994340108,
"grad_norm": 0.1764371693134308,
"learning_rate": 9.69483262602522e-06,
"loss": 0.7193,
"mean_token_accuracy": 0.7829346388578415,
"num_tokens": 3888799338.0,
"step": 1870
},
{
"epoch": 1.929251350656033,
"grad_norm": 0.21297839283943176,
"learning_rate": 9.655434236779157e-06,
"loss": 0.7255,
"mean_token_accuracy": 0.78101367354393,
"num_tokens": 3899196340.0,
"step": 1875
},
{
"epoch": 1.9343967069719579,
"grad_norm": 0.16781088709831238,
"learning_rate": 9.616127320977103e-06,
"loss": 0.7241,
"mean_token_accuracy": 0.7818098127841949,
"num_tokens": 3909607686.0,
"step": 1880
},
{
"epoch": 1.9395420632878828,
"grad_norm": 0.1716819554567337,
"learning_rate": 9.576913142621814e-06,
"loss": 0.7226,
"mean_token_accuracy": 0.7824906349182129,
"num_tokens": 3920018100.0,
"step": 1885
},
{
"epoch": 1.9446874196038075,
"grad_norm": 0.16348664462566376,
"learning_rate": 9.537792962733865e-06,
"loss": 0.7087,
"mean_token_accuracy": 0.7859474241733551,
"num_tokens": 3930421105.0,
"step": 1890
},
{
"epoch": 1.9498327759197325,
"grad_norm": 0.17886386811733246,
"learning_rate": 9.498768039311091e-06,
"loss": 0.7195,
"mean_token_accuracy": 0.7828368335962296,
"num_tokens": 3940804188.0,
"step": 1895
},
{
"epoch": 1.9549781322356572,
"grad_norm": 0.19559861719608307,
"learning_rate": 9.459839627288149e-06,
"loss": 0.7223,
"mean_token_accuracy": 0.7822702258825303,
"num_tokens": 3951164761.0,
"step": 1900
},
{
"epoch": 1.9601234885515821,
"grad_norm": 0.1999935507774353,
"learning_rate": 9.421008978496147e-06,
"loss": 0.7302,
"mean_token_accuracy": 0.7797028571367264,
"num_tokens": 3961562769.0,
"step": 1905
},
{
"epoch": 1.965268844867507,
"grad_norm": 0.17995114624500275,
"learning_rate": 9.3822773416224e-06,
"loss": 0.7242,
"mean_token_accuracy": 0.7817402511835099,
"num_tokens": 3971972365.0,
"step": 1910
},
{
"epoch": 1.970414201183432,
"grad_norm": 0.19008983671665192,
"learning_rate": 9.343645962170267e-06,
"loss": 0.701,
"mean_token_accuracy": 0.7879698783159256,
"num_tokens": 3982366725.0,
"step": 1915
},
{
"epoch": 1.975559557499357,
"grad_norm": 0.1909506916999817,
"learning_rate": 9.305116082419098e-06,
"loss": 0.7189,
"mean_token_accuracy": 0.7828688323497772,
"num_tokens": 3992755868.0,
"step": 1920
},
{
"epoch": 1.9807049138152817,
"grad_norm": 0.20575623214244843,
"learning_rate": 9.266688941384307e-06,
"loss": 0.7144,
"mean_token_accuracy": 0.7844961941242218,
"num_tokens": 4003176984.0,
"step": 1925
},
{
"epoch": 1.9858502701312066,
"grad_norm": 0.19786439836025238,
"learning_rate": 9.228365774777498e-06,
"loss": 0.7134,
"mean_token_accuracy": 0.7853114068508148,
"num_tokens": 4013594555.0,
"step": 1930
},
{
"epoch": 1.9909956264471314,
"grad_norm": 0.19185397028923035,
"learning_rate": 9.190147814966747e-06,
"loss": 0.717,
"mean_token_accuracy": 0.7838179767131805,
"num_tokens": 4023992898.0,
"step": 1935
},
{
"epoch": 1.9961409827630563,
"grad_norm": 0.187905415892601,
"learning_rate": 9.152036290936966e-06,
"loss": 0.7137,
"mean_token_accuracy": 0.7846971601247787,
"num_tokens": 4034394748.0,
"step": 1940
},
{
"epoch": 2.001029071263185,
"grad_norm": 0.2134164422750473,
"learning_rate": 9.114032428250385e-06,
"loss": 0.7088,
"mean_token_accuracy": 0.7855551619278757,
"num_tokens": 4044275069.0,
"step": 1945
},
{
"epoch": 2.00617442757911,
"grad_norm": 0.19707649946212769,
"learning_rate": 9.07613744900714e-06,
"loss": 0.6946,
"mean_token_accuracy": 0.7890868008136749,
"num_tokens": 4054664720.0,
"step": 1950
},
{
"epoch": 2.0113197838950345,
"grad_norm": 0.1966494768857956,
"learning_rate": 9.038352571805973e-06,
"loss": 0.7024,
"mean_token_accuracy": 0.7869040161371231,
"num_tokens": 4065081909.0,
"step": 1955
},
{
"epoch": 2.0164651402109595,
"grad_norm": 0.21523414552211761,
"learning_rate": 9.000679011705048e-06,
"loss": 0.7,
"mean_token_accuracy": 0.7871535241603851,
"num_tokens": 4075473101.0,
"step": 1960
},
{
"epoch": 2.0216104965268844,
"grad_norm": 0.20164090394973755,
"learning_rate": 8.963117980182871e-06,
"loss": 0.6879,
"mean_token_accuracy": 0.7908169955015183,
"num_tokens": 4085893536.0,
"step": 1965
},
{
"epoch": 2.0267558528428093,
"grad_norm": 0.1790938675403595,
"learning_rate": 8.925670685099344e-06,
"loss": 0.6966,
"mean_token_accuracy": 0.7879111260175705,
"num_tokens": 4096303930.0,
"step": 1970
},
{
"epoch": 2.0319012091587343,
"grad_norm": 0.18033479154109955,
"learning_rate": 8.888338330656909e-06,
"loss": 0.6907,
"mean_token_accuracy": 0.7897056668996811,
"num_tokens": 4106711156.0,
"step": 1975
},
{
"epoch": 2.0370465654746592,
"grad_norm": 0.18739460408687592,
"learning_rate": 8.851122117361845e-06,
"loss": 0.6848,
"mean_token_accuracy": 0.7917119234800338,
"num_tokens": 4117100496.0,
"step": 1980
},
{
"epoch": 2.042191921790584,
"grad_norm": 0.18272534012794495,
"learning_rate": 8.814023241985633e-06,
"loss": 0.7014,
"mean_token_accuracy": 0.7867262125015259,
"num_tokens": 4127509126.0,
"step": 1985
},
{
"epoch": 2.0473372781065087,
"grad_norm": 0.2082863599061966,
"learning_rate": 8.777042897526491e-06,
"loss": 0.6971,
"mean_token_accuracy": 0.7877671688795089,
"num_tokens": 4137900742.0,
"step": 1990
},
{
"epoch": 2.0524826344224336,
"grad_norm": 0.18298238515853882,
"learning_rate": 8.740182273171021e-06,
"loss": 0.6937,
"mean_token_accuracy": 0.789008492231369,
"num_tokens": 4148307231.0,
"step": 1995
},
{
"epoch": 2.0576279907383586,
"grad_norm": 0.20916491746902466,
"learning_rate": 8.703442554255945e-06,
"loss": 0.6971,
"mean_token_accuracy": 0.7878756642341613,
"num_tokens": 4158703927.0,
"step": 2000
},
{
"epoch": 2.0627733470542835,
"grad_norm": 0.17486633360385895,
"learning_rate": 8.666824922229993e-06,
"loss": 0.6997,
"mean_token_accuracy": 0.7873221039772034,
"num_tokens": 4169110558.0,
"step": 2005
},
{
"epoch": 2.0679187033702084,
"grad_norm": 0.18294841051101685,
"learning_rate": 8.630330554615918e-06,
"loss": 0.6957,
"mean_token_accuracy": 0.7886905431747436,
"num_tokens": 4179479610.0,
"step": 2010
},
{
"epoch": 2.0730640596861334,
"grad_norm": 0.17662659287452698,
"learning_rate": 8.593960624972635e-06,
"loss": 0.6896,
"mean_token_accuracy": 0.790263557434082,
"num_tokens": 4189878842.0,
"step": 2015
},
{
"epoch": 2.0782094160020583,
"grad_norm": 0.18194252252578735,
"learning_rate": 8.557716302857469e-06,
"loss": 0.6955,
"mean_token_accuracy": 0.7886851370334625,
"num_tokens": 4200279229.0,
"step": 2020
},
{
"epoch": 2.083354772317983,
"grad_norm": 0.1726061850786209,
"learning_rate": 8.521598753788538e-06,
"loss": 0.6975,
"mean_token_accuracy": 0.7880923539400101,
"num_tokens": 4210680215.0,
"step": 2025
},
{
"epoch": 2.0885001286339078,
"grad_norm": 0.17614272236824036,
"learning_rate": 8.485609139207312e-06,
"loss": 0.6859,
"mean_token_accuracy": 0.7908884584903717,
"num_tokens": 4221086681.0,
"step": 2030
},
{
"epoch": 2.0936454849498327,
"grad_norm": 0.16882190108299255,
"learning_rate": 8.449748616441217e-06,
"loss": 0.6888,
"mean_token_accuracy": 0.7902822762727737,
"num_tokens": 4231490233.0,
"step": 2035
},
{
"epoch": 2.0987908412657577,
"grad_norm": 0.18029114603996277,
"learning_rate": 8.414018338666453e-06,
"loss": 0.6964,
"mean_token_accuracy": 0.7881254225969314,
"num_tokens": 4241880702.0,
"step": 2040
},
{
"epoch": 2.1039361975816826,
"grad_norm": 0.1758222132921219,
"learning_rate": 8.378419454870885e-06,
"loss": 0.6922,
"mean_token_accuracy": 0.7890959054231643,
"num_tokens": 4252273338.0,
"step": 2045
},
{
"epoch": 2.1090815538976075,
"grad_norm": 0.18869197368621826,
"learning_rate": 8.34295310981712e-06,
"loss": 0.6975,
"mean_token_accuracy": 0.7877348899841309,
"num_tokens": 4262684519.0,
"step": 2050
},
{
"epoch": 2.1142269102135325,
"grad_norm": 0.19612114131450653,
"learning_rate": 8.307620444005675e-06,
"loss": 0.6857,
"mean_token_accuracy": 0.7910007119178772,
"num_tokens": 4273105678.0,
"step": 2055
},
{
"epoch": 2.119372266529457,
"grad_norm": 0.18810050189495087,
"learning_rate": 8.272422593638312e-06,
"loss": 0.7012,
"mean_token_accuracy": 0.7865576684474945,
"num_tokens": 4283510594.0,
"step": 2060
},
{
"epoch": 2.124517622845382,
"grad_norm": 0.1900843381881714,
"learning_rate": 8.237360690581494e-06,
"loss": 0.6946,
"mean_token_accuracy": 0.7886899948120117,
"num_tokens": 4293904300.0,
"step": 2065
},
{
"epoch": 2.129662979161307,
"grad_norm": 0.20008240640163422,
"learning_rate": 8.202435862329992e-06,
"loss": 0.6931,
"mean_token_accuracy": 0.7892868250608445,
"num_tokens": 4304318437.0,
"step": 2070
},
{
"epoch": 2.134808335477232,
"grad_norm": 0.19377216696739197,
"learning_rate": 8.167649231970629e-06,
"loss": 0.7087,
"mean_token_accuracy": 0.7848498582839966,
"num_tokens": 4314743524.0,
"step": 2075
},
{
"epoch": 2.1399536917931568,
"grad_norm": 0.18533039093017578,
"learning_rate": 8.13300191814616e-06,
"loss": 0.6985,
"mean_token_accuracy": 0.787962692975998,
"num_tokens": 4325118311.0,
"step": 2080
},
{
"epoch": 2.1450990481090817,
"grad_norm": 0.18524758517742157,
"learning_rate": 8.098495035019307e-06,
"loss": 0.6933,
"mean_token_accuracy": 0.7891044408082962,
"num_tokens": 4335531487.0,
"step": 2085
},
{
"epoch": 2.1502444044250066,
"grad_norm": 0.20954075455665588,
"learning_rate": 8.064129692236914e-06,
"loss": 0.6988,
"mean_token_accuracy": 0.7874211251735688,
"num_tokens": 4345955448.0,
"step": 2090
},
{
"epoch": 2.155389760740931,
"grad_norm": 0.19117045402526855,
"learning_rate": 8.029906994894285e-06,
"loss": 0.6847,
"mean_token_accuracy": 0.7915515124797821,
"num_tokens": 4356359783.0,
"step": 2095
},
{
"epoch": 2.160535117056856,
"grad_norm": 0.21733401715755463,
"learning_rate": 7.995828043499637e-06,
"loss": 0.6933,
"mean_token_accuracy": 0.7889263033866882,
"num_tokens": 4366739707.0,
"step": 2100
},
{
"epoch": 2.165680473372781,
"grad_norm": 0.18222884833812714,
"learning_rate": 7.961893933938707e-06,
"loss": 0.7027,
"mean_token_accuracy": 0.7866089105606079,
"num_tokens": 4377106402.0,
"step": 2105
},
{
"epoch": 2.170825829688706,
"grad_norm": 0.1800689697265625,
"learning_rate": 7.92810575743952e-06,
"loss": 0.6993,
"mean_token_accuracy": 0.7875191777944565,
"num_tokens": 4387518880.0,
"step": 2110
},
{
"epoch": 2.175971186004631,
"grad_norm": 0.1838780641555786,
"learning_rate": 7.89446460053728e-06,
"loss": 0.6941,
"mean_token_accuracy": 0.7888187408447266,
"num_tokens": 4397892379.0,
"step": 2115
},
{
"epoch": 2.181116542320556,
"grad_norm": 0.19722041487693787,
"learning_rate": 7.860971545039466e-06,
"loss": 0.6971,
"mean_token_accuracy": 0.7882630676031113,
"num_tokens": 4408268522.0,
"step": 2120
},
{
"epoch": 2.186261898636481,
"grad_norm": 0.18653474748134613,
"learning_rate": 7.827627667991e-06,
"loss": 0.6955,
"mean_token_accuracy": 0.787960433959961,
"num_tokens": 4418677517.0,
"step": 2125
},
{
"epoch": 2.1914072549524053,
"grad_norm": 0.18152935802936554,
"learning_rate": 7.794434041639651e-06,
"loss": 0.6969,
"mean_token_accuracy": 0.7878915429115295,
"num_tokens": 4429109296.0,
"step": 2130
},
{
"epoch": 2.1965526112683302,
"grad_norm": 0.19016778469085693,
"learning_rate": 7.761391733401523e-06,
"loss": 0.6966,
"mean_token_accuracy": 0.7879950881004334,
"num_tokens": 4439524505.0,
"step": 2135
},
{
"epoch": 2.201697967584255,
"grad_norm": 0.18335750699043274,
"learning_rate": 7.728501805826751e-06,
"loss": 0.7016,
"mean_token_accuracy": 0.786550509929657,
"num_tokens": 4449922294.0,
"step": 2140
},
{
"epoch": 2.20684332390018,
"grad_norm": 0.17287969589233398,
"learning_rate": 7.695765316565326e-06,
"loss": 0.6885,
"mean_token_accuracy": 0.7902193248271943,
"num_tokens": 4460309251.0,
"step": 2145
},
{
"epoch": 2.211988680216105,
"grad_norm": 0.17045357823371887,
"learning_rate": 7.66318331833308e-06,
"loss": 0.6822,
"mean_token_accuracy": 0.7919429570436478,
"num_tokens": 4470722415.0,
"step": 2150
},
{
"epoch": 2.21713403653203,
"grad_norm": 0.1837993860244751,
"learning_rate": 7.630756858877835e-06,
"loss": 0.6917,
"mean_token_accuracy": 0.7894322812557221,
"num_tokens": 4481112230.0,
"step": 2155
},
{
"epoch": 2.2222793928479545,
"grad_norm": 0.1990688294172287,
"learning_rate": 7.598486980945721e-06,
"loss": 0.696,
"mean_token_accuracy": 0.7881556123495101,
"num_tokens": 4491526989.0,
"step": 2160
},
{
"epoch": 2.2274247491638794,
"grad_norm": 0.19104093313217163,
"learning_rate": 7.566374722247625e-06,
"loss": 0.7071,
"mean_token_accuracy": 0.7854187726974488,
"num_tokens": 4501890999.0,
"step": 2165
},
{
"epoch": 2.2325701054798044,
"grad_norm": 0.17589214444160461,
"learning_rate": 7.534421115425832e-06,
"loss": 0.7082,
"mean_token_accuracy": 0.7851406782865524,
"num_tokens": 4512263755.0,
"step": 2170
},
{
"epoch": 2.2377154617957293,
"grad_norm": 0.18910154700279236,
"learning_rate": 7.502627188020828e-06,
"loss": 0.7018,
"mean_token_accuracy": 0.7865214943885803,
"num_tokens": 4522673409.0,
"step": 2175
},
{
"epoch": 2.2428608181116543,
"grad_norm": 0.19419220089912415,
"learning_rate": 7.470993962438233e-06,
"loss": 0.6981,
"mean_token_accuracy": 0.787706145644188,
"num_tokens": 4533050765.0,
"step": 2180
},
{
"epoch": 2.248006174427579,
"grad_norm": 0.19314493238925934,
"learning_rate": 7.439522455915941e-06,
"loss": 0.6921,
"mean_token_accuracy": 0.7892083436250686,
"num_tokens": 4543452165.0,
"step": 2185
},
{
"epoch": 2.253151530743504,
"grad_norm": 0.21107150614261627,
"learning_rate": 7.408213680491409e-06,
"loss": 0.6969,
"mean_token_accuracy": 0.7882888942956925,
"num_tokens": 4553853739.0,
"step": 2190
},
{
"epoch": 2.258296887059429,
"grad_norm": 0.1907646358013153,
"learning_rate": 7.377068642969104e-06,
"loss": 0.6963,
"mean_token_accuracy": 0.7884598582983017,
"num_tokens": 4564274917.0,
"step": 2195
},
{
"epoch": 2.2634422433753536,
"grad_norm": 0.17553763091564178,
"learning_rate": 7.346088344888125e-06,
"loss": 0.6951,
"mean_token_accuracy": 0.788986611366272,
"num_tokens": 4574641137.0,
"step": 2200
},
{
"epoch": 2.2685875996912785,
"grad_norm": 0.18935340642929077,
"learning_rate": 7.315273782490008e-06,
"loss": 0.6994,
"mean_token_accuracy": 0.7872116446495057,
"num_tokens": 4585019689.0,
"step": 2205
},
{
"epoch": 2.2737329560072035,
"grad_norm": 0.19580236077308655,
"learning_rate": 7.284625946686685e-06,
"loss": 0.693,
"mean_token_accuracy": 0.7892390996217727,
"num_tokens": 4595438984.0,
"step": 2210
},
{
"epoch": 2.2788783123231284,
"grad_norm": 0.19986537098884583,
"learning_rate": 7.254145823028617e-06,
"loss": 0.6936,
"mean_token_accuracy": 0.7890995383262634,
"num_tokens": 4605823855.0,
"step": 2215
},
{
"epoch": 2.2840236686390534,
"grad_norm": 0.1742008775472641,
"learning_rate": 7.2238343916730915e-06,
"loss": 0.6993,
"mean_token_accuracy": 0.7871985048055649,
"num_tokens": 4616238871.0,
"step": 2220
},
{
"epoch": 2.2891690249549783,
"grad_norm": 0.19267351925373077,
"learning_rate": 7.193692627352726e-06,
"loss": 0.6872,
"mean_token_accuracy": 0.7908998429775238,
"num_tokens": 4626643459.0,
"step": 2225
},
{
"epoch": 2.294314381270903,
"grad_norm": 0.17010417580604553,
"learning_rate": 7.163721499344107e-06,
"loss": 0.6966,
"mean_token_accuracy": 0.7883331865072251,
"num_tokens": 4637044833.0,
"step": 2230
},
{
"epoch": 2.2994597375868278,
"grad_norm": 0.17460167407989502,
"learning_rate": 7.133921971436622e-06,
"loss": 0.6989,
"mean_token_accuracy": 0.7876080513000489,
"num_tokens": 4647444102.0,
"step": 2235
},
{
"epoch": 2.3046050939027527,
"grad_norm": 0.17576715350151062,
"learning_rate": 7.104295001901473e-06,
"loss": 0.6878,
"mean_token_accuracy": 0.7904452890157699,
"num_tokens": 4657824599.0,
"step": 2240
},
{
"epoch": 2.3097504502186776,
"grad_norm": 0.16763444244861603,
"learning_rate": 7.074841543460853e-06,
"loss": 0.6844,
"mean_token_accuracy": 0.7918480813503266,
"num_tokens": 4668181094.0,
"step": 2245
},
{
"epoch": 2.3148958065346026,
"grad_norm": 0.1823350489139557,
"learning_rate": 7.0455625432573186e-06,
"loss": 0.6932,
"mean_token_accuracy": 0.7892414182424545,
"num_tokens": 4678584523.0,
"step": 2250
},
{
"epoch": 2.3200411628505275,
"grad_norm": 0.17247016727924347,
"learning_rate": 7.016458942823321e-06,
"loss": 0.6869,
"mean_token_accuracy": 0.7909802347421646,
"num_tokens": 4688981096.0,
"step": 2255
},
{
"epoch": 2.3251865191664525,
"grad_norm": 0.17673242092132568,
"learning_rate": 6.987531678050943e-06,
"loss": 0.6802,
"mean_token_accuracy": 0.7923660695552825,
"num_tokens": 4699404625.0,
"step": 2260
},
{
"epoch": 2.330331875482377,
"grad_norm": 0.16854751110076904,
"learning_rate": 6.958781679161788e-06,
"loss": 0.6842,
"mean_token_accuracy": 0.7919197797775268,
"num_tokens": 4709811697.0,
"step": 2265
},
{
"epoch": 2.335477231798302,
"grad_norm": 0.1761576384305954,
"learning_rate": 6.930209870677077e-06,
"loss": 0.685,
"mean_token_accuracy": 0.7914377897977829,
"num_tokens": 4720237781.0,
"step": 2270
},
{
"epoch": 2.340622588114227,
"grad_norm": 0.17306455969810486,
"learning_rate": 6.901817171387917e-06,
"loss": 0.7019,
"mean_token_accuracy": 0.7869494408369064,
"num_tokens": 4730606260.0,
"step": 2275
},
{
"epoch": 2.345767944430152,
"grad_norm": 0.18955180048942566,
"learning_rate": 6.873604494325757e-06,
"loss": 0.6948,
"mean_token_accuracy": 0.7886533975601197,
"num_tokens": 4741014261.0,
"step": 2280
},
{
"epoch": 2.3509133007460767,
"grad_norm": 0.1918182373046875,
"learning_rate": 6.845572746733015e-06,
"loss": 0.6907,
"mean_token_accuracy": 0.7898207098245621,
"num_tokens": 4751422939.0,
"step": 2285
},
{
"epoch": 2.3560586570620017,
"grad_norm": 0.17227977514266968,
"learning_rate": 6.8177228300339186e-06,
"loss": 0.6926,
"mean_token_accuracy": 0.7893718838691711,
"num_tokens": 4761799091.0,
"step": 2290
},
{
"epoch": 2.361204013377926,
"grad_norm": 0.19379264116287231,
"learning_rate": 6.79005563980551e-06,
"loss": 0.6867,
"mean_token_accuracy": 0.7909263670444489,
"num_tokens": 4772203894.0,
"step": 2295
},
{
"epoch": 2.366349369693851,
"grad_norm": 0.16864101588726044,
"learning_rate": 6.7625720657488526e-06,
"loss": 0.6954,
"mean_token_accuracy": 0.7885312736034393,
"num_tokens": 4782600873.0,
"step": 2300
},
{
"epoch": 2.371494726009776,
"grad_norm": 0.17416614294052124,
"learning_rate": 6.735272991660415e-06,
"loss": 0.7108,
"mean_token_accuracy": 0.784038883447647,
"num_tokens": 4793015981.0,
"step": 2305
},
{
"epoch": 2.376640082325701,
"grad_norm": 0.17601124942302704,
"learning_rate": 6.708159295403645e-06,
"loss": 0.6931,
"mean_token_accuracy": 0.7890658885240555,
"num_tokens": 4803428229.0,
"step": 2310
},
{
"epoch": 2.381785438641626,
"grad_norm": 0.17575791478157043,
"learning_rate": 6.681231848880758e-06,
"loss": 0.6931,
"mean_token_accuracy": 0.7891870647668838,
"num_tokens": 4813832709.0,
"step": 2315
},
{
"epoch": 2.386930794957551,
"grad_norm": 0.171165332198143,
"learning_rate": 6.654491518004684e-06,
"loss": 0.6977,
"mean_token_accuracy": 0.7878083676099777,
"num_tokens": 4824217333.0,
"step": 2320
},
{
"epoch": 2.392076151273476,
"grad_norm": 0.174204021692276,
"learning_rate": 6.6279391626712195e-06,
"loss": 0.6767,
"mean_token_accuracy": 0.7939162909984588,
"num_tokens": 4834578249.0,
"step": 2325
},
{
"epoch": 2.397221507589401,
"grad_norm": 0.17255154252052307,
"learning_rate": 6.601575636731393e-06,
"loss": 0.6853,
"mean_token_accuracy": 0.7911572694778443,
"num_tokens": 4845003859.0,
"step": 2330
},
{
"epoch": 2.4023668639053253,
"grad_norm": 0.17027340829372406,
"learning_rate": 6.575401787963991e-06,
"loss": 0.7016,
"mean_token_accuracy": 0.7866772085428237,
"num_tokens": 4855396377.0,
"step": 2335
},
{
"epoch": 2.40751222022125,
"grad_norm": 0.1804320216178894,
"learning_rate": 6.549418458048301e-06,
"loss": 0.6944,
"mean_token_accuracy": 0.7887315809726715,
"num_tokens": 4865807607.0,
"step": 2340
},
{
"epoch": 2.412657576537175,
"grad_norm": 0.17891834676265717,
"learning_rate": 6.523626482537051e-06,
"loss": 0.6924,
"mean_token_accuracy": 0.7891820967197418,
"num_tokens": 4876203853.0,
"step": 2345
},
{
"epoch": 2.4178029328531,
"grad_norm": 0.17687129974365234,
"learning_rate": 6.498026690829529e-06,
"loss": 0.6879,
"mean_token_accuracy": 0.7905671745538712,
"num_tokens": 4886593797.0,
"step": 2350
},
{
"epoch": 2.422948289169025,
"grad_norm": 0.16701921820640564,
"learning_rate": 6.472619906144924e-06,
"loss": 0.7011,
"mean_token_accuracy": 0.7867477118968964,
"num_tokens": 4897020034.0,
"step": 2355
},
{
"epoch": 2.42809364548495,
"grad_norm": 0.18092454969882965,
"learning_rate": 6.447406945495843e-06,
"loss": 0.6846,
"mean_token_accuracy": 0.7916429519653321,
"num_tokens": 4907433880.0,
"step": 2360
},
{
"epoch": 2.4332390018008745,
"grad_norm": 0.17751817405223846,
"learning_rate": 6.422388619662045e-06,
"loss": 0.694,
"mean_token_accuracy": 0.7888200342655182,
"num_tokens": 4917840148.0,
"step": 2365
},
{
"epoch": 2.4383843581167994,
"grad_norm": 0.1927623152732849,
"learning_rate": 6.3975657331643715e-06,
"loss": 0.6959,
"mean_token_accuracy": 0.7883234590291976,
"num_tokens": 4928232237.0,
"step": 2370
},
{
"epoch": 2.4435297144327244,
"grad_norm": 0.17448590695858002,
"learning_rate": 6.3729390842388585e-06,
"loss": 0.7,
"mean_token_accuracy": 0.7875474035739899,
"num_tokens": 4938631938.0,
"step": 2375
},
{
"epoch": 2.4486750707486493,
"grad_norm": 0.1870429664850235,
"learning_rate": 6.348509464811088e-06,
"loss": 0.698,
"mean_token_accuracy": 0.7877880901098251,
"num_tokens": 4949047787.0,
"step": 2380
},
{
"epoch": 2.4538204270645743,
"grad_norm": 0.1780596822500229,
"learning_rate": 6.3242776604707144e-06,
"loss": 0.6918,
"mean_token_accuracy": 0.7893176406621933,
"num_tokens": 4959424736.0,
"step": 2385
},
{
"epoch": 2.458965783380499,
"grad_norm": 0.16919124126434326,
"learning_rate": 6.300244450446195e-06,
"loss": 0.7012,
"mean_token_accuracy": 0.7870047926902771,
"num_tokens": 4969829870.0,
"step": 2390
},
{
"epoch": 2.464111139696424,
"grad_norm": 0.172256737947464,
"learning_rate": 6.27641060757974e-06,
"loss": 0.7041,
"mean_token_accuracy": 0.7858896970748901,
"num_tokens": 4980197224.0,
"step": 2395
},
{
"epoch": 2.469256496012349,
"grad_norm": 0.17399129271507263,
"learning_rate": 6.252776898302453e-06,
"loss": 0.6824,
"mean_token_accuracy": 0.7921358823776246,
"num_tokens": 4990600805.0,
"step": 2400
},
{
"epoch": 2.4744018523282736,
"grad_norm": 0.1758703887462616,
"learning_rate": 6.2293440826097005e-06,
"loss": 0.6961,
"mean_token_accuracy": 0.7880290925502778,
"num_tokens": 5000978207.0,
"step": 2405
},
{
"epoch": 2.4795472086441985,
"grad_norm": 0.1842864602804184,
"learning_rate": 6.206112914036657e-06,
"loss": 0.6965,
"mean_token_accuracy": 0.7884801357984543,
"num_tokens": 5011384736.0,
"step": 2410
},
{
"epoch": 2.4846925649601235,
"grad_norm": 0.1803123950958252,
"learning_rate": 6.1830841396340705e-06,
"loss": 0.6991,
"mean_token_accuracy": 0.7872378647327423,
"num_tokens": 5021771302.0,
"step": 2415
},
{
"epoch": 2.4898379212760484,
"grad_norm": 0.17343245446681976,
"learning_rate": 6.160258499944255e-06,
"loss": 0.6899,
"mean_token_accuracy": 0.7900263160467148,
"num_tokens": 5032158385.0,
"step": 2420
},
{
"epoch": 2.4949832775919734,
"grad_norm": 0.18295209109783173,
"learning_rate": 6.137636728977267e-06,
"loss": 0.6873,
"mean_token_accuracy": 0.7904597342014312,
"num_tokens": 5042580817.0,
"step": 2425
},
{
"epoch": 2.500128633907898,
"grad_norm": 0.1744978278875351,
"learning_rate": 6.115219554187303e-06,
"loss": 0.6944,
"mean_token_accuracy": 0.7883099675178528,
"num_tokens": 5052996785.0,
"step": 2430
},
{
"epoch": 2.505273990223823,
"grad_norm": 0.17044760286808014,
"learning_rate": 6.0930076964493034e-06,
"loss": 0.7044,
"mean_token_accuracy": 0.7858549505472183,
"num_tokens": 5063403777.0,
"step": 2435
},
{
"epoch": 2.5104193465397477,
"grad_norm": 0.1745147705078125,
"learning_rate": 6.07100187003578e-06,
"loss": 0.6946,
"mean_token_accuracy": 0.7888891041278839,
"num_tokens": 5073787624.0,
"step": 2440
},
{
"epoch": 2.5155647028556727,
"grad_norm": 0.17163583636283875,
"learning_rate": 6.049202782593837e-06,
"loss": 0.7091,
"mean_token_accuracy": 0.784814390540123,
"num_tokens": 5084155762.0,
"step": 2445
},
{
"epoch": 2.5207100591715976,
"grad_norm": 0.1630815863609314,
"learning_rate": 6.027611135122423e-06,
"loss": 0.6833,
"mean_token_accuracy": 0.7919480204582214,
"num_tokens": 5094520579.0,
"step": 2450
},
{
"epoch": 2.5258554154875226,
"grad_norm": 0.169570654630661,
"learning_rate": 6.006227621949783e-06,
"loss": 0.6912,
"mean_token_accuracy": 0.7897911489009857,
"num_tokens": 5104935332.0,
"step": 2455
},
{
"epoch": 2.5310007718034475,
"grad_norm": 0.17307248711585999,
"learning_rate": 5.985052930711133e-06,
"loss": 0.686,
"mean_token_accuracy": 0.7910365283489227,
"num_tokens": 5115312123.0,
"step": 2460
},
{
"epoch": 2.5361461281193725,
"grad_norm": 0.1717165857553482,
"learning_rate": 5.964087742326549e-06,
"loss": 0.7048,
"mean_token_accuracy": 0.7863658338785171,
"num_tokens": 5125722883.0,
"step": 2465
},
{
"epoch": 2.5412914844352974,
"grad_norm": 0.16661353409290314,
"learning_rate": 5.943332730979067e-06,
"loss": 0.6982,
"mean_token_accuracy": 0.7878574222326279,
"num_tokens": 5136118397.0,
"step": 2470
},
{
"epoch": 2.546436840751222,
"grad_norm": 0.18003134429454803,
"learning_rate": 5.922788564093009e-06,
"loss": 0.6942,
"mean_token_accuracy": 0.788626492023468,
"num_tokens": 5146490125.0,
"step": 2475
},
{
"epoch": 2.551582197067147,
"grad_norm": 0.17579644918441772,
"learning_rate": 5.902455902312511e-06,
"loss": 0.7027,
"mean_token_accuracy": 0.7862021327018738,
"num_tokens": 5156884568.0,
"step": 2480
},
{
"epoch": 2.556727553383072,
"grad_norm": 0.1727745682001114,
"learning_rate": 5.88233539948029e-06,
"loss": 0.6925,
"mean_token_accuracy": 0.7890074193477631,
"num_tokens": 5167297476.0,
"step": 2485
},
{
"epoch": 2.5618729096989967,
"grad_norm": 0.18698406219482422,
"learning_rate": 5.862427702616605e-06,
"loss": 0.6831,
"mean_token_accuracy": 0.7916372120380402,
"num_tokens": 5177725329.0,
"step": 2490
},
{
"epoch": 2.5670182660149217,
"grad_norm": 0.17920783162117004,
"learning_rate": 5.842733451898467e-06,
"loss": 0.7028,
"mean_token_accuracy": 0.7861807703971863,
"num_tokens": 5188136857.0,
"step": 2495
},
{
"epoch": 2.572163622330846,
"grad_norm": 0.17899206280708313,
"learning_rate": 5.823253280639039e-06,
"loss": 0.6814,
"mean_token_accuracy": 0.7923789769411087,
"num_tokens": 5198537170.0,
"step": 2500
},
{
"epoch": 2.577308978646771,
"grad_norm": 0.17629799246788025,
"learning_rate": 5.803987815267268e-06,
"loss": 0.6979,
"mean_token_accuracy": 0.7875349700450898,
"num_tokens": 5208920419.0,
"step": 2505
},
{
"epoch": 2.582454334962696,
"grad_norm": 0.16480837762355804,
"learning_rate": 5.7849376753077625e-06,
"loss": 0.6856,
"mean_token_accuracy": 0.7911129057407379,
"num_tokens": 5219341909.0,
"step": 2510
},
{
"epoch": 2.587599691278621,
"grad_norm": 0.1704104244709015,
"learning_rate": 5.766103473360842e-06,
"loss": 0.6955,
"mean_token_accuracy": 0.7883382886648178,
"num_tokens": 5229692388.0,
"step": 2515
},
{
"epoch": 2.592745047594546,
"grad_norm": 0.17449352145195007,
"learning_rate": 5.74748581508286e-06,
"loss": 0.6943,
"mean_token_accuracy": 0.788813516497612,
"num_tokens": 5240057257.0,
"step": 2520
},
{
"epoch": 2.597890403910471,
"grad_norm": 0.17114900052547455,
"learning_rate": 5.729085299166713e-06,
"loss": 0.6925,
"mean_token_accuracy": 0.7888531744480133,
"num_tokens": 5250472272.0,
"step": 2525
},
{
"epoch": 2.603035760226396,
"grad_norm": 0.19473806023597717,
"learning_rate": 5.710902517322597e-06,
"loss": 0.7034,
"mean_token_accuracy": 0.7864585638046264,
"num_tokens": 5260858183.0,
"step": 2530
},
{
"epoch": 2.6081811165423208,
"grad_norm": 0.17495019733905792,
"learning_rate": 5.6929380542589764e-06,
"loss": 0.6919,
"mean_token_accuracy": 0.7893568813800812,
"num_tokens": 5271278572.0,
"step": 2535
},
{
"epoch": 2.6133264728582457,
"grad_norm": 0.16885867714881897,
"learning_rate": 5.675192487663777e-06,
"loss": 0.6922,
"mean_token_accuracy": 0.7892817795276642,
"num_tokens": 5281676141.0,
"step": 2540
},
{
"epoch": 2.61847182917417,
"grad_norm": 0.17458729445934296,
"learning_rate": 5.657666388185823e-06,
"loss": 0.6925,
"mean_token_accuracy": 0.7891670197248459,
"num_tokens": 5292081395.0,
"step": 2545
},
{
"epoch": 2.623617185490095,
"grad_norm": 0.17178680002689362,
"learning_rate": 5.640360319416467e-06,
"loss": 0.6888,
"mean_token_accuracy": 0.7902668923139572,
"num_tokens": 5302469310.0,
"step": 2550
},
{
"epoch": 2.62876254180602,
"grad_norm": 0.1776588261127472,
"learning_rate": 5.623274837871483e-06,
"loss": 0.694,
"mean_token_accuracy": 0.7890095263719559,
"num_tokens": 5312857824.0,
"step": 2555
},
{
"epoch": 2.633907898121945,
"grad_norm": 0.17728105187416077,
"learning_rate": 5.606410492973162e-06,
"loss": 0.6885,
"mean_token_accuracy": 0.7901356816291809,
"num_tokens": 5323248156.0,
"step": 2560
},
{
"epoch": 2.63905325443787,
"grad_norm": 0.16960322856903076,
"learning_rate": 5.589767827032649e-06,
"loss": 0.7014,
"mean_token_accuracy": 0.7869833618402481,
"num_tokens": 5333628919.0,
"step": 2565
},
{
"epoch": 2.6441986107537945,
"grad_norm": 0.17245355248451233,
"learning_rate": 5.573347375232493e-06,
"loss": 0.6918,
"mean_token_accuracy": 0.7889738440513611,
"num_tokens": 5344021897.0,
"step": 2570
},
{
"epoch": 2.6493439670697194,
"grad_norm": 0.16664022207260132,
"learning_rate": 5.557149665609455e-06,
"loss": 0.7,
"mean_token_accuracy": 0.7870820313692093,
"num_tokens": 5354420664.0,
"step": 2575
},
{
"epoch": 2.6544893233856444,
"grad_norm": 0.17978526651859283,
"learning_rate": 5.54117521903751e-06,
"loss": 0.6783,
"mean_token_accuracy": 0.793021947145462,
"num_tokens": 5364835797.0,
"step": 2580
},
{
"epoch": 2.6596346797015693,
"grad_norm": 0.17351332306861877,
"learning_rate": 5.525424549211112e-06,
"loss": 0.6964,
"mean_token_accuracy": 0.7881974041461944,
"num_tokens": 5375251218.0,
"step": 2585
},
{
"epoch": 2.6647800360174942,
"grad_norm": 0.1651052087545395,
"learning_rate": 5.509898162628657e-06,
"loss": 0.6956,
"mean_token_accuracy": 0.7883888274431229,
"num_tokens": 5385653462.0,
"step": 2590
},
{
"epoch": 2.669925392333419,
"grad_norm": 0.17517080903053284,
"learning_rate": 5.494596558576215e-06,
"loss": 0.7,
"mean_token_accuracy": 0.7868314325809479,
"num_tokens": 5396055523.0,
"step": 2595
},
{
"epoch": 2.675070748649344,
"grad_norm": 0.18584103882312775,
"learning_rate": 5.4795202291114655e-06,
"loss": 0.6948,
"mean_token_accuracy": 0.7886532038450241,
"num_tokens": 5406483427.0,
"step": 2600
},
{
"epoch": 2.680216104965269,
"grad_norm": 0.1600140780210495,
"learning_rate": 5.464669659047871e-06,
"loss": 0.7105,
"mean_token_accuracy": 0.7844233065843582,
"num_tokens": 5416893350.0,
"step": 2605
},
{
"epoch": 2.6853614612811936,
"grad_norm": 0.16962645947933197,
"learning_rate": 5.450045325939086e-06,
"loss": 0.699,
"mean_token_accuracy": 0.7872962862253189,
"num_tokens": 5427314991.0,
"step": 2610
},
{
"epoch": 2.6905068175971185,
"grad_norm": 0.16842614114284515,
"learning_rate": 5.4356477000636155e-06,
"loss": 0.696,
"mean_token_accuracy": 0.7881788671016693,
"num_tokens": 5437722441.0,
"step": 2615
},
{
"epoch": 2.6956521739130435,
"grad_norm": 0.1736549735069275,
"learning_rate": 5.42147724440967e-06,
"loss": 0.6804,
"mean_token_accuracy": 0.7926200598478317,
"num_tokens": 5448128609.0,
"step": 2620
},
{
"epoch": 2.7007975302289684,
"grad_norm": 0.17829610407352448,
"learning_rate": 5.407534414660296e-06,
"loss": 0.7076,
"mean_token_accuracy": 0.7849996328353882,
"num_tokens": 5458552679.0,
"step": 2625
},
{
"epoch": 2.7059428865448933,
"grad_norm": 0.17205742001533508,
"learning_rate": 5.3938196591787055e-06,
"loss": 0.6886,
"mean_token_accuracy": 0.7903157830238342,
"num_tokens": 5468959670.0,
"step": 2630
},
{
"epoch": 2.711088242860818,
"grad_norm": 0.19865980744361877,
"learning_rate": 5.380333418993874e-06,
"loss": 0.6969,
"mean_token_accuracy": 0.7879400312900543,
"num_tokens": 5479359994.0,
"step": 2635
},
{
"epoch": 2.716233599176743,
"grad_norm": 0.17879824340343475,
"learning_rate": 5.367076127786349e-06,
"loss": 0.6799,
"mean_token_accuracy": 0.7927328020334243,
"num_tokens": 5489776454.0,
"step": 2640
},
{
"epoch": 2.7213789554926677,
"grad_norm": 0.16733145713806152,
"learning_rate": 5.354048211874305e-06,
"loss": 0.7009,
"mean_token_accuracy": 0.7868389576673508,
"num_tokens": 5500195720.0,
"step": 2645
},
{
"epoch": 2.7265243118085927,
"grad_norm": 0.17073538899421692,
"learning_rate": 5.341250090199836e-06,
"loss": 0.689,
"mean_token_accuracy": 0.7902264356613159,
"num_tokens": 5510590496.0,
"step": 2650
},
{
"epoch": 2.7316696681245176,
"grad_norm": 0.16798222064971924,
"learning_rate": 5.328682174315484e-06,
"loss": 0.6997,
"mean_token_accuracy": 0.7871428996324539,
"num_tokens": 5520984194.0,
"step": 2655
},
{
"epoch": 2.7368150244404426,
"grad_norm": 0.17511659860610962,
"learning_rate": 5.316344868370999e-06,
"loss": 0.7027,
"mean_token_accuracy": 0.7862639844417572,
"num_tokens": 5531378820.0,
"step": 2660
},
{
"epoch": 2.7419603807563675,
"grad_norm": 0.1656564474105835,
"learning_rate": 5.304238569100351e-06,
"loss": 0.6919,
"mean_token_accuracy": 0.7892627060413361,
"num_tokens": 5541788819.0,
"step": 2665
},
{
"epoch": 2.7471057370722924,
"grad_norm": 0.1768578141927719,
"learning_rate": 5.2923636658089674e-06,
"loss": 0.6951,
"mean_token_accuracy": 0.7887540727853775,
"num_tokens": 5552136282.0,
"step": 2670
},
{
"epoch": 2.7522510933882174,
"grad_norm": 0.16567444801330566,
"learning_rate": 5.280720540361213e-06,
"loss": 0.6902,
"mean_token_accuracy": 0.7896697282791137,
"num_tokens": 5562532385.0,
"step": 2675
},
{
"epoch": 2.757396449704142,
"grad_norm": 0.16643787920475006,
"learning_rate": 5.2693095671681125e-06,
"loss": 0.6946,
"mean_token_accuracy": 0.7891028523445129,
"num_tokens": 5572953965.0,
"step": 2680
},
{
"epoch": 2.762541806020067,
"grad_norm": 0.1679885983467102,
"learning_rate": 5.258131113175312e-06,
"loss": 0.6928,
"mean_token_accuracy": 0.7889256983995437,
"num_tokens": 5583365916.0,
"step": 2685
},
{
"epoch": 2.7676871623359918,
"grad_norm": 0.17079950869083405,
"learning_rate": 5.247185537851277e-06,
"loss": 0.693,
"mean_token_accuracy": 0.7887627691030502,
"num_tokens": 5593766520.0,
"step": 2690
},
{
"epoch": 2.7728325186519167,
"grad_norm": 0.17430275678634644,
"learning_rate": 5.236473193175727e-06,
"loss": 0.693,
"mean_token_accuracy": 0.7892514944076539,
"num_tokens": 5604146645.0,
"step": 2695
},
{
"epoch": 2.7779778749678417,
"grad_norm": 0.1720867156982422,
"learning_rate": 5.225994423628329e-06,
"loss": 0.6982,
"mean_token_accuracy": 0.7877674490213394,
"num_tokens": 5614554383.0,
"step": 2700
},
{
"epoch": 2.783123231283766,
"grad_norm": 0.17518489062786102,
"learning_rate": 5.215749566177612e-06,
"loss": 0.6908,
"mean_token_accuracy": 0.789122948050499,
"num_tokens": 5624946901.0,
"step": 2705
},
{
"epoch": 2.788268587599691,
"grad_norm": 0.17626863718032837,
"learning_rate": 5.2057389502701315e-06,
"loss": 0.6962,
"mean_token_accuracy": 0.7886440306901932,
"num_tokens": 5635349602.0,
"step": 2710
},
{
"epoch": 2.793413943915616,
"grad_norm": 0.17560714483261108,
"learning_rate": 5.19596289781988e-06,
"loss": 0.6859,
"mean_token_accuracy": 0.7909172236919403,
"num_tokens": 5645756384.0,
"step": 2715
},
{
"epoch": 2.798559300231541,
"grad_norm": 0.16886913776397705,
"learning_rate": 5.186421723197922e-06,
"loss": 0.6868,
"mean_token_accuracy": 0.7907733172178268,
"num_tokens": 5656178271.0,
"step": 2720
},
{
"epoch": 2.803704656547466,
"grad_norm": 0.1693251132965088,
"learning_rate": 5.177115733222307e-06,
"loss": 0.6947,
"mean_token_accuracy": 0.7886239379644394,
"num_tokens": 5666600795.0,
"step": 2725
},
{
"epoch": 2.808850012863391,
"grad_norm": 0.16803273558616638,
"learning_rate": 5.168045227148184e-06,
"loss": 0.6972,
"mean_token_accuracy": 0.787845715880394,
"num_tokens": 5676999671.0,
"step": 2730
},
{
"epoch": 2.813995369179316,
"grad_norm": 0.16651484370231628,
"learning_rate": 5.159210496658182e-06,
"loss": 0.6884,
"mean_token_accuracy": 0.7903055369853973,
"num_tokens": 5687397402.0,
"step": 2735
},
{
"epoch": 2.8191407254952408,
"grad_norm": 0.1660391241312027,
"learning_rate": 5.15061182585304e-06,
"loss": 0.6792,
"mean_token_accuracy": 0.7931360393762589,
"num_tokens": 5697794080.0,
"step": 2740
},
{
"epoch": 2.8242860818111657,
"grad_norm": 0.17520754039287567,
"learning_rate": 5.1422494912424595e-06,
"loss": 0.6986,
"mean_token_accuracy": 0.7876220345497131,
"num_tokens": 5708182662.0,
"step": 2745
},
{
"epoch": 2.82943143812709,
"grad_norm": 0.16872966289520264,
"learning_rate": 5.134123761736216e-06,
"loss": 0.6966,
"mean_token_accuracy": 0.7881220698356628,
"num_tokens": 5718594864.0,
"step": 2750
},
{
"epoch": 2.834576794443015,
"grad_norm": 0.16700060665607452,
"learning_rate": 5.126234898635518e-06,
"loss": 0.6942,
"mean_token_accuracy": 0.788716048002243,
"num_tokens": 5728995814.0,
"step": 2755
},
{
"epoch": 2.83972215075894,
"grad_norm": 0.17934595048427582,
"learning_rate": 5.118583155624593e-06,
"loss": 0.6929,
"mean_token_accuracy": 0.7886988967657089,
"num_tokens": 5739395668.0,
"step": 2760
},
{
"epoch": 2.844867507074865,
"grad_norm": 0.17512430250644684,
"learning_rate": 5.111168778762542e-06,
"loss": 0.6938,
"mean_token_accuracy": 0.7889688044786454,
"num_tokens": 5749807349.0,
"step": 2765
},
{
"epoch": 2.85001286339079,
"grad_norm": 0.1730797290802002,
"learning_rate": 5.103992006475416e-06,
"loss": 0.6951,
"mean_token_accuracy": 0.7884382456541061,
"num_tokens": 5760209986.0,
"step": 2770
},
{
"epoch": 2.8551582197067145,
"grad_norm": 0.17350102961063385,
"learning_rate": 5.097053069548554e-06,
"loss": 0.6985,
"mean_token_accuracy": 0.7875759869813919,
"num_tokens": 5770624277.0,
"step": 2775
},
{
"epoch": 2.8603035760226394,
"grad_norm": 0.16581964492797852,
"learning_rate": 5.090352191119167e-06,
"loss": 0.6968,
"mean_token_accuracy": 0.7881747186183929,
"num_tokens": 5780998524.0,
"step": 2780
},
{
"epoch": 2.8654489323385643,
"grad_norm": 0.1817171722650528,
"learning_rate": 5.083889586669148e-06,
"loss": 0.6957,
"mean_token_accuracy": 0.7882327765226365,
"num_tokens": 5791408666.0,
"step": 2785
},
{
"epoch": 2.8705942886544893,
"grad_norm": 0.18993426859378815,
"learning_rate": 5.077665464018158e-06,
"loss": 0.7035,
"mean_token_accuracy": 0.7860898345708847,
"num_tokens": 5801820214.0,
"step": 2790
},
{
"epoch": 2.8757396449704142,
"grad_norm": 0.21299272775650024,
"learning_rate": 5.071680023316934e-06,
"loss": 0.688,
"mean_token_accuracy": 0.7904294729232788,
"num_tokens": 5812235390.0,
"step": 2795
},
{
"epoch": 2.880885001286339,
"grad_norm": 0.17882496118545532,
"learning_rate": 5.065933457040855e-06,
"loss": 0.6996,
"mean_token_accuracy": 0.7872415781021118,
"num_tokens": 5822639708.0,
"step": 2800
},
{
"epoch": 2.886030357602264,
"grad_norm": 0.16855992376804352,
"learning_rate": 5.060425949983754e-06,
"loss": 0.682,
"mean_token_accuracy": 0.7919277369976043,
"num_tokens": 5833034354.0,
"step": 2805
},
{
"epoch": 2.891175713918189,
"grad_norm": 0.17891888320446014,
"learning_rate": 5.055157679251973e-06,
"loss": 0.6899,
"mean_token_accuracy": 0.7897359609603882,
"num_tokens": 5843403693.0,
"step": 2810
},
{
"epoch": 2.8963210702341136,
"grad_norm": 0.16349950432777405,
"learning_rate": 5.05012881425867e-06,
"loss": 0.6863,
"mean_token_accuracy": 0.7908827304840088,
"num_tokens": 5853803722.0,
"step": 2815
},
{
"epoch": 2.9014664265500385,
"grad_norm": 0.1745273768901825,
"learning_rate": 5.045339516718369e-06,
"loss": 0.6893,
"mean_token_accuracy": 0.7903022348880768,
"num_tokens": 5864184533.0,
"step": 2820
},
{
"epoch": 2.9066117828659634,
"grad_norm": 0.16056472063064575,
"learning_rate": 5.0407899406417626e-06,
"loss": 0.6952,
"mean_token_accuracy": 0.7885043084621429,
"num_tokens": 5874589484.0,
"step": 2825
},
{
"epoch": 2.9117571391818884,
"grad_norm": 0.17720390856266022,
"learning_rate": 5.036480232330756e-06,
"loss": 0.6936,
"mean_token_accuracy": 0.7888238668441773,
"num_tokens": 5885001020.0,
"step": 2830
},
{
"epoch": 2.9169024954978133,
"grad_norm": 0.16652604937553406,
"learning_rate": 5.032410530373764e-06,
"loss": 0.7,
"mean_token_accuracy": 0.7873083800077438,
"num_tokens": 5895408701.0,
"step": 2835
},
{
"epoch": 2.922047851813738,
"grad_norm": 0.17051072418689728,
"learning_rate": 5.028580965641256e-06,
"loss": 0.6925,
"mean_token_accuracy": 0.7890042126178741,
"num_tokens": 5905806664.0,
"step": 2840
},
{
"epoch": 2.9271932081296628,
"grad_norm": 0.16726641356945038,
"learning_rate": 5.024991661281546e-06,
"loss": 0.6962,
"mean_token_accuracy": 0.7878126442432404,
"num_tokens": 5916184097.0,
"step": 2845
},
{
"epoch": 2.9323385644455877,
"grad_norm": 0.16886568069458008,
"learning_rate": 5.0216427327168295e-06,
"loss": 0.6861,
"mean_token_accuracy": 0.7910468071699143,
"num_tokens": 5926604231.0,
"step": 2850
},
{
"epoch": 2.9374839207615127,
"grad_norm": 0.1645469218492508,
"learning_rate": 5.0185342876394775e-06,
"loss": 0.6954,
"mean_token_accuracy": 0.7883421629667282,
"num_tokens": 5937011174.0,
"step": 2855
},
{
"epoch": 2.9426292770774376,
"grad_norm": 0.17823590338230133,
"learning_rate": 5.0156664260085695e-06,
"loss": 0.6896,
"mean_token_accuracy": 0.7900129020214081,
"num_tokens": 5947409312.0,
"step": 2860
},
{
"epoch": 2.9477746333933625,
"grad_norm": 0.18835224211215973,
"learning_rate": 5.0130392400466835e-06,
"loss": 0.689,
"mean_token_accuracy": 0.7900780767202378,
"num_tokens": 5957805326.0,
"step": 2865
},
{
"epoch": 2.9529199897092875,
"grad_norm": 0.1715887039899826,
"learning_rate": 5.010652814236921e-06,
"loss": 0.6909,
"mean_token_accuracy": 0.7899001896381378,
"num_tokens": 5968218507.0,
"step": 2870
},
{
"epoch": 2.9580653460252124,
"grad_norm": 0.18797433376312256,
"learning_rate": 5.008507225320203e-06,
"loss": 0.704,
"mean_token_accuracy": 0.7860912084579468,
"num_tokens": 5978622283.0,
"step": 2875
},
{
"epoch": 2.9632107023411374,
"grad_norm": 0.17873001098632812,
"learning_rate": 5.00660254229279e-06,
"loss": 0.6968,
"mean_token_accuracy": 0.7881993442773819,
"num_tokens": 5988997989.0,
"step": 2880
},
{
"epoch": 2.968356058657062,
"grad_norm": 0.18319903314113617,
"learning_rate": 5.004938826404073e-06,
"loss": 0.6993,
"mean_token_accuracy": 0.787423062324524,
"num_tokens": 5999359936.0,
"step": 2885
},
{
"epoch": 2.973501414972987,
"grad_norm": 0.1805352121591568,
"learning_rate": 5.003516131154598e-06,
"loss": 0.6972,
"mean_token_accuracy": 0.7876656591892243,
"num_tokens": 6009761882.0,
"step": 2890
},
{
"epoch": 2.9786467712889118,
"grad_norm": 0.18117545545101166,
"learning_rate": 5.002334502294346e-06,
"loss": 0.6947,
"mean_token_accuracy": 0.7882909893989563,
"num_tokens": 6020167222.0,
"step": 2895
},
{
"epoch": 2.9837921276048367,
"grad_norm": 0.1723506599664688,
"learning_rate": 5.001393977821266e-06,
"loss": 0.7042,
"mean_token_accuracy": 0.7860449641942978,
"num_tokens": 6030555418.0,
"step": 2900
},
{
"epoch": 2.9889374839207616,
"grad_norm": 0.19804443418979645,
"learning_rate": 5.00069458798005e-06,
"loss": 0.6997,
"mean_token_accuracy": 0.7871603965759277,
"num_tokens": 6040947002.0,
"step": 2905
},
{
"epoch": 2.994082840236686,
"grad_norm": 0.16171956062316895,
"learning_rate": 5.000236355261159e-06,
"loss": 0.6994,
"mean_token_accuracy": 0.787379264831543,
"num_tokens": 6051343009.0,
"step": 2910
},
{
"epoch": 2.999228196552611,
"grad_norm": 0.17762000858783722,
"learning_rate": 5.000019294400102e-06,
"loss": 0.6875,
"mean_token_accuracy": 0.7905127763748169,
"num_tokens": 6061722567.0,
"step": 2915
}
],
"logging_steps": 5,
"max_steps": 2916,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 450,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.2789425421603045e+19,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}