env_40d8866 / trainer_state.json
bimabk's picture
Upload task output 1
ceca3cb verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 7.912087912087912,
"eval_steps": 500,
"global_step": 2160,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.018315018315018316,
"grad_norm": 26.125,
"learning_rate": 1.2509723013743402e-06,
"loss": 1.4391,
"mean_token_accuracy": 0.7123644590377808,
"num_tokens": 9264.0,
"step": 5
},
{
"epoch": 0.03663003663003663,
"grad_norm": 24.5,
"learning_rate": 2.814687678092266e-06,
"loss": 1.2334,
"mean_token_accuracy": 0.7464994072914124,
"num_tokens": 18573.0,
"step": 10
},
{
"epoch": 0.054945054945054944,
"grad_norm": 14.6875,
"learning_rate": 4.378403054810191e-06,
"loss": 1.2219,
"mean_token_accuracy": 0.7385512232780457,
"num_tokens": 28324.0,
"step": 15
},
{
"epoch": 0.07326007326007326,
"grad_norm": 8.625,
"learning_rate": 5.942118431528117e-06,
"loss": 1.1542,
"mean_token_accuracy": 0.7414855003356934,
"num_tokens": 37365.0,
"step": 20
},
{
"epoch": 0.09157509157509157,
"grad_norm": 7.75,
"learning_rate": 7.505833808246043e-06,
"loss": 0.963,
"mean_token_accuracy": 0.7728639602661133,
"num_tokens": 47169.0,
"step": 25
},
{
"epoch": 0.10989010989010989,
"grad_norm": 7.6875,
"learning_rate": 9.069549184963967e-06,
"loss": 0.687,
"mean_token_accuracy": 0.8294244527816772,
"num_tokens": 56922.0,
"step": 30
},
{
"epoch": 0.1282051282051282,
"grad_norm": 6.09375,
"learning_rate": 1.0633264561681893e-05,
"loss": 0.5571,
"mean_token_accuracy": 0.8617596507072449,
"num_tokens": 65860.0,
"step": 35
},
{
"epoch": 0.14652014652014653,
"grad_norm": 7.0625,
"learning_rate": 1.2196979938399817e-05,
"loss": 0.1916,
"mean_token_accuracy": 0.9526763677597045,
"num_tokens": 75644.0,
"step": 40
},
{
"epoch": 0.16483516483516483,
"grad_norm": 5.9375,
"learning_rate": 1.3760695315117745e-05,
"loss": 0.195,
"mean_token_accuracy": 0.9427057504653931,
"num_tokens": 84597.0,
"step": 45
},
{
"epoch": 0.18315018315018314,
"grad_norm": 2.046875,
"learning_rate": 1.532441069183567e-05,
"loss": 0.272,
"mean_token_accuracy": 0.9327008485794067,
"num_tokens": 93032.0,
"step": 50
},
{
"epoch": 0.20146520146520147,
"grad_norm": 7.5,
"learning_rate": 1.6888126068553595e-05,
"loss": 0.2524,
"mean_token_accuracy": 0.9316304802894593,
"num_tokens": 101726.0,
"step": 55
},
{
"epoch": 0.21978021978021978,
"grad_norm": 2.5625,
"learning_rate": 1.845184144527152e-05,
"loss": 0.2507,
"mean_token_accuracy": 0.9321518659591674,
"num_tokens": 110505.0,
"step": 60
},
{
"epoch": 0.23809523809523808,
"grad_norm": 0.55859375,
"learning_rate": 2.0015556821989444e-05,
"loss": 0.1683,
"mean_token_accuracy": 0.9499430179595947,
"num_tokens": 119502.0,
"step": 65
},
{
"epoch": 0.2564102564102564,
"grad_norm": 5.0625,
"learning_rate": 2.157927219870737e-05,
"loss": 0.3318,
"mean_token_accuracy": 0.911142885684967,
"num_tokens": 129089.0,
"step": 70
},
{
"epoch": 0.27472527472527475,
"grad_norm": 1.59375,
"learning_rate": 2.3142987575425293e-05,
"loss": 0.1793,
"mean_token_accuracy": 0.950410532951355,
"num_tokens": 138645.0,
"step": 75
},
{
"epoch": 0.29304029304029305,
"grad_norm": 1.53125,
"learning_rate": 2.4706702952143225e-05,
"loss": 0.0869,
"mean_token_accuracy": 0.9751996159553528,
"num_tokens": 148602.0,
"step": 80
},
{
"epoch": 0.31135531135531136,
"grad_norm": 2.25,
"learning_rate": 2.627041832886115e-05,
"loss": 0.1275,
"mean_token_accuracy": 0.9607115983963013,
"num_tokens": 157517.0,
"step": 85
},
{
"epoch": 0.32967032967032966,
"grad_norm": 2.03125,
"learning_rate": 2.7834133705579074e-05,
"loss": 0.0966,
"mean_token_accuracy": 0.9684963464736939,
"num_tokens": 166575.0,
"step": 90
},
{
"epoch": 0.34798534798534797,
"grad_norm": 0.55078125,
"learning_rate": 2.9397849082297e-05,
"loss": 0.133,
"mean_token_accuracy": 0.9579505920410156,
"num_tokens": 175735.0,
"step": 95
},
{
"epoch": 0.3663003663003663,
"grad_norm": 1.375,
"learning_rate": 3.096156445901492e-05,
"loss": 0.1709,
"mean_token_accuracy": 0.9461557865142822,
"num_tokens": 183857.0,
"step": 100
},
{
"epoch": 0.38461538461538464,
"grad_norm": 3.015625,
"learning_rate": 3.2525279835732844e-05,
"loss": 0.1466,
"mean_token_accuracy": 0.9522614717483521,
"num_tokens": 191716.0,
"step": 105
},
{
"epoch": 0.40293040293040294,
"grad_norm": 5.65625,
"learning_rate": 3.4088995212450776e-05,
"loss": 0.2514,
"mean_token_accuracy": 0.9299401640892029,
"num_tokens": 200518.0,
"step": 110
},
{
"epoch": 0.42124542124542125,
"grad_norm": 0.92578125,
"learning_rate": 3.56527105891687e-05,
"loss": 0.0687,
"mean_token_accuracy": 0.975311815738678,
"num_tokens": 209961.0,
"step": 115
},
{
"epoch": 0.43956043956043955,
"grad_norm": 2.53125,
"learning_rate": 3.7216425965886625e-05,
"loss": 0.1955,
"mean_token_accuracy": 0.9394341588020325,
"num_tokens": 218838.0,
"step": 120
},
{
"epoch": 0.45787545787545786,
"grad_norm": 1.6953125,
"learning_rate": 3.878014134260455e-05,
"loss": 0.1805,
"mean_token_accuracy": 0.944377863407135,
"num_tokens": 227866.0,
"step": 125
},
{
"epoch": 0.47619047619047616,
"grad_norm": 2.03125,
"learning_rate": 4.0343856719322474e-05,
"loss": 0.2456,
"mean_token_accuracy": 0.9300832152366638,
"num_tokens": 237211.0,
"step": 130
},
{
"epoch": 0.4945054945054945,
"grad_norm": 3.5625,
"learning_rate": 4.19075720960404e-05,
"loss": 0.1599,
"mean_token_accuracy": 0.9527908086776733,
"num_tokens": 247238.0,
"step": 135
},
{
"epoch": 0.5128205128205128,
"grad_norm": 1.0078125,
"learning_rate": 4.3471287472758323e-05,
"loss": 0.1544,
"mean_token_accuracy": 0.9530697703361511,
"num_tokens": 255982.0,
"step": 140
},
{
"epoch": 0.5311355311355311,
"grad_norm": 3.28125,
"learning_rate": 4.5035002849476255e-05,
"loss": 0.1674,
"mean_token_accuracy": 0.949328339099884,
"num_tokens": 265058.0,
"step": 145
},
{
"epoch": 0.5494505494505495,
"grad_norm": 2.375,
"learning_rate": 4.659871822619417e-05,
"loss": 0.1525,
"mean_token_accuracy": 0.9563624501228333,
"num_tokens": 274227.0,
"step": 150
},
{
"epoch": 0.5677655677655677,
"grad_norm": 2.28125,
"learning_rate": 4.8162433602912104e-05,
"loss": 0.2012,
"mean_token_accuracy": 0.940619957447052,
"num_tokens": 282699.0,
"step": 155
},
{
"epoch": 0.5860805860805861,
"grad_norm": 2.078125,
"learning_rate": 4.9726148979630036e-05,
"loss": 0.0786,
"mean_token_accuracy": 0.9756275773048401,
"num_tokens": 292119.0,
"step": 160
},
{
"epoch": 0.6043956043956044,
"grad_norm": 1.40625,
"learning_rate": 5.128986435634795e-05,
"loss": 0.1985,
"mean_token_accuracy": 0.9421133875846863,
"num_tokens": 301406.0,
"step": 165
},
{
"epoch": 0.6227106227106227,
"grad_norm": 1.5078125,
"learning_rate": 5.2853579733065885e-05,
"loss": 0.1154,
"mean_token_accuracy": 0.9618610620498658,
"num_tokens": 310481.0,
"step": 170
},
{
"epoch": 0.6410256410256411,
"grad_norm": 0.359375,
"learning_rate": 5.44172951097838e-05,
"loss": 0.1248,
"mean_token_accuracy": 0.9580595254898071,
"num_tokens": 319932.0,
"step": 175
},
{
"epoch": 0.6593406593406593,
"grad_norm": 1.546875,
"learning_rate": 5.5981010486501734e-05,
"loss": 0.0783,
"mean_token_accuracy": 0.972282862663269,
"num_tokens": 329237.0,
"step": 180
},
{
"epoch": 0.6776556776556777,
"grad_norm": 0.478515625,
"learning_rate": 5.754472586321966e-05,
"loss": 0.0932,
"mean_token_accuracy": 0.9683905124664307,
"num_tokens": 338362.0,
"step": 185
},
{
"epoch": 0.6959706959706959,
"grad_norm": 1.9453125,
"learning_rate": 5.910844123993758e-05,
"loss": 0.0953,
"mean_token_accuracy": 0.9647938251495362,
"num_tokens": 347632.0,
"step": 190
},
{
"epoch": 0.7142857142857143,
"grad_norm": 1.0234375,
"learning_rate": 6.067215661665551e-05,
"loss": 0.1225,
"mean_token_accuracy": 0.9624135136604309,
"num_tokens": 357312.0,
"step": 195
},
{
"epoch": 0.7326007326007326,
"grad_norm": 6.5,
"learning_rate": 6.223587199337343e-05,
"loss": 0.1022,
"mean_token_accuracy": 0.9669649362564087,
"num_tokens": 366372.0,
"step": 200
},
{
"epoch": 0.7509157509157509,
"grad_norm": 1.9609375,
"learning_rate": 6.379958737009136e-05,
"loss": 0.1294,
"mean_token_accuracy": 0.9620457410812377,
"num_tokens": 376261.0,
"step": 205
},
{
"epoch": 0.7692307692307693,
"grad_norm": 2.328125,
"learning_rate": 6.536330274680927e-05,
"loss": 0.1534,
"mean_token_accuracy": 0.9578806042671204,
"num_tokens": 385931.0,
"step": 210
},
{
"epoch": 0.7875457875457875,
"grad_norm": 1.6484375,
"learning_rate": 6.69270181235272e-05,
"loss": 0.1488,
"mean_token_accuracy": 0.9553104996681213,
"num_tokens": 395126.0,
"step": 215
},
{
"epoch": 0.8058608058608059,
"grad_norm": 1.5625,
"learning_rate": 6.849073350024514e-05,
"loss": 0.2034,
"mean_token_accuracy": 0.9399782657623291,
"num_tokens": 404038.0,
"step": 220
},
{
"epoch": 0.8241758241758241,
"grad_norm": 0.95703125,
"learning_rate": 7.005444887696306e-05,
"loss": 0.1123,
"mean_token_accuracy": 0.9649486184120178,
"num_tokens": 412865.0,
"step": 225
},
{
"epoch": 0.8424908424908425,
"grad_norm": 12.125,
"learning_rate": 7.161816425368099e-05,
"loss": 0.2582,
"mean_token_accuracy": 0.9260981917381287,
"num_tokens": 421875.0,
"step": 230
},
{
"epoch": 0.8608058608058609,
"grad_norm": 1.0078125,
"learning_rate": 7.31818796303989e-05,
"loss": 0.1237,
"mean_token_accuracy": 0.958947730064392,
"num_tokens": 430974.0,
"step": 235
},
{
"epoch": 0.8791208791208791,
"grad_norm": 0.890625,
"learning_rate": 7.474559500711684e-05,
"loss": 0.094,
"mean_token_accuracy": 0.9669261336326599,
"num_tokens": 440593.0,
"step": 240
},
{
"epoch": 0.8974358974358975,
"grad_norm": 1.5234375,
"learning_rate": 7.630931038383477e-05,
"loss": 0.1799,
"mean_token_accuracy": 0.9489495992660523,
"num_tokens": 449999.0,
"step": 245
},
{
"epoch": 0.9157509157509157,
"grad_norm": 0.1806640625,
"learning_rate": 7.787302576055269e-05,
"loss": 0.0973,
"mean_token_accuracy": 0.9693061113357544,
"num_tokens": 458710.0,
"step": 250
},
{
"epoch": 0.9340659340659341,
"grad_norm": 1.359375,
"learning_rate": 7.943674113727062e-05,
"loss": 0.1313,
"mean_token_accuracy": 0.9576280236244201,
"num_tokens": 468138.0,
"step": 255
},
{
"epoch": 0.9523809523809523,
"grad_norm": 3.21875,
"learning_rate": 8.100045651398853e-05,
"loss": 0.2788,
"mean_token_accuracy": 0.9308066725730896,
"num_tokens": 478094.0,
"step": 260
},
{
"epoch": 0.9706959706959707,
"grad_norm": 15.75,
"learning_rate": 8.256417189070647e-05,
"loss": 0.1394,
"mean_token_accuracy": 0.9597956895828247,
"num_tokens": 487236.0,
"step": 265
},
{
"epoch": 0.989010989010989,
"grad_norm": 0.73046875,
"learning_rate": 8.41278872674244e-05,
"loss": 0.105,
"mean_token_accuracy": 0.9711844086647033,
"num_tokens": 496836.0,
"step": 270
},
{
"epoch": 1.0073260073260073,
"grad_norm": 4.125,
"learning_rate": 8.537885369635508e-05,
"loss": 0.1623,
"mean_token_accuracy": 0.9566316485404969,
"num_tokens": 505486.0,
"step": 275
},
{
"epoch": 1.0256410256410255,
"grad_norm": 38.0,
"learning_rate": 8.537864816105374e-05,
"loss": 0.1003,
"mean_token_accuracy": 0.9712117314338684,
"num_tokens": 515232.0,
"step": 280
},
{
"epoch": 1.043956043956044,
"grad_norm": 1.671875,
"learning_rate": 8.537814900572437e-05,
"loss": 0.1104,
"mean_token_accuracy": 0.9652750253677368,
"num_tokens": 523979.0,
"step": 285
},
{
"epoch": 1.0622710622710623,
"grad_norm": 0.361328125,
"learning_rate": 8.537735623494464e-05,
"loss": 0.0756,
"mean_token_accuracy": 0.9749327659606933,
"num_tokens": 533573.0,
"step": 290
},
{
"epoch": 1.0805860805860805,
"grad_norm": 3.5,
"learning_rate": 8.537626985598489e-05,
"loss": 0.5942,
"mean_token_accuracy": 0.8817975878715515,
"num_tokens": 543009.0,
"step": 295
},
{
"epoch": 1.098901098901099,
"grad_norm": 1.5390625,
"learning_rate": 8.537488987880808e-05,
"loss": 0.1367,
"mean_token_accuracy": 0.9571704030036926,
"num_tokens": 552940.0,
"step": 300
},
{
"epoch": 1.1172161172161172,
"grad_norm": 0.89453125,
"learning_rate": 8.537321631606968e-05,
"loss": 0.1217,
"mean_token_accuracy": 0.9623419761657714,
"num_tokens": 562364.0,
"step": 305
},
{
"epoch": 1.1355311355311355,
"grad_norm": 1.578125,
"learning_rate": 8.537124918311761e-05,
"loss": 0.1608,
"mean_token_accuracy": 0.949942660331726,
"num_tokens": 571646.0,
"step": 310
},
{
"epoch": 1.1538461538461537,
"grad_norm": 1.5,
"learning_rate": 8.536898849799202e-05,
"loss": 0.0852,
"mean_token_accuracy": 0.971439003944397,
"num_tokens": 581084.0,
"step": 315
},
{
"epoch": 1.1721611721611722,
"grad_norm": 1.8046875,
"learning_rate": 8.53664342814252e-05,
"loss": 0.1247,
"mean_token_accuracy": 0.9593337297439575,
"num_tokens": 590812.0,
"step": 320
},
{
"epoch": 1.1904761904761905,
"grad_norm": 1.2578125,
"learning_rate": 8.536358655684135e-05,
"loss": 0.1186,
"mean_token_accuracy": 0.957237160205841,
"num_tokens": 599646.0,
"step": 325
},
{
"epoch": 1.2087912087912087,
"grad_norm": 0.5078125,
"learning_rate": 8.536044535035635e-05,
"loss": 0.1778,
"mean_token_accuracy": 0.9485100388526917,
"num_tokens": 608962.0,
"step": 330
},
{
"epoch": 1.2271062271062272,
"grad_norm": 0.91796875,
"learning_rate": 8.535701069077756e-05,
"loss": 0.131,
"mean_token_accuracy": 0.9616786003112793,
"num_tokens": 617832.0,
"step": 335
},
{
"epoch": 1.2454212454212454,
"grad_norm": 1.28125,
"learning_rate": 8.535328260960355e-05,
"loss": 0.1983,
"mean_token_accuracy": 0.9394309878349304,
"num_tokens": 626739.0,
"step": 340
},
{
"epoch": 1.2637362637362637,
"grad_norm": 1.5234375,
"learning_rate": 8.534926114102375e-05,
"loss": 0.064,
"mean_token_accuracy": 0.9792219161987304,
"num_tokens": 636553.0,
"step": 345
},
{
"epoch": 1.282051282051282,
"grad_norm": 0.73828125,
"learning_rate": 8.534494632191824e-05,
"loss": 0.1623,
"mean_token_accuracy": 0.9512728333473206,
"num_tokens": 645322.0,
"step": 350
},
{
"epoch": 1.3003663003663004,
"grad_norm": 0.302734375,
"learning_rate": 8.534033819185732e-05,
"loss": 0.1257,
"mean_token_accuracy": 0.9629031181335449,
"num_tokens": 654789.0,
"step": 355
},
{
"epoch": 1.3186813186813187,
"grad_norm": 2.78125,
"learning_rate": 8.533543679310125e-05,
"loss": 0.1226,
"mean_token_accuracy": 0.962236201763153,
"num_tokens": 664164.0,
"step": 360
},
{
"epoch": 1.3369963369963371,
"grad_norm": 0.1669921875,
"learning_rate": 8.533024217059969e-05,
"loss": 0.0789,
"mean_token_accuracy": 0.9735670685768127,
"num_tokens": 674019.0,
"step": 365
},
{
"epoch": 1.3553113553113554,
"grad_norm": 6.84375,
"learning_rate": 8.53247543719915e-05,
"loss": 0.168,
"mean_token_accuracy": 0.9529448866844177,
"num_tokens": 683484.0,
"step": 370
},
{
"epoch": 1.3736263736263736,
"grad_norm": 1.359375,
"learning_rate": 8.531897344760409e-05,
"loss": 0.1943,
"mean_token_accuracy": 0.9423548102378845,
"num_tokens": 691983.0,
"step": 375
},
{
"epoch": 1.3919413919413919,
"grad_norm": 0.97265625,
"learning_rate": 8.531289945045318e-05,
"loss": 0.1563,
"mean_token_accuracy": 0.957783043384552,
"num_tokens": 701252.0,
"step": 380
},
{
"epoch": 1.4102564102564101,
"grad_norm": 0.93359375,
"learning_rate": 8.530653243624211e-05,
"loss": 0.2077,
"mean_token_accuracy": 0.9389472723007202,
"num_tokens": 709727.0,
"step": 385
},
{
"epoch": 1.4285714285714286,
"grad_norm": 11.875,
"learning_rate": 8.529987246336146e-05,
"loss": 0.0909,
"mean_token_accuracy": 0.9703719019889832,
"num_tokens": 719138.0,
"step": 390
},
{
"epoch": 1.4468864468864469,
"grad_norm": 3.4375,
"learning_rate": 8.529291959288849e-05,
"loss": 0.0696,
"mean_token_accuracy": 0.9730043172836303,
"num_tokens": 728302.0,
"step": 395
},
{
"epoch": 1.4652014652014653,
"grad_norm": 0.93359375,
"learning_rate": 8.528567388858653e-05,
"loss": 0.1001,
"mean_token_accuracy": 0.962419056892395,
"num_tokens": 737476.0,
"step": 400
},
{
"epoch": 1.4835164835164836,
"grad_norm": 0.7109375,
"learning_rate": 8.527813541690442e-05,
"loss": 0.1422,
"mean_token_accuracy": 0.9559484243392944,
"num_tokens": 746604.0,
"step": 405
},
{
"epoch": 1.5018315018315018,
"grad_norm": 0.62890625,
"learning_rate": 8.527030424697596e-05,
"loss": 0.1023,
"mean_token_accuracy": 0.9671313047409058,
"num_tokens": 756047.0,
"step": 410
},
{
"epoch": 1.52014652014652,
"grad_norm": 0.54296875,
"learning_rate": 8.526218045061917e-05,
"loss": 0.1209,
"mean_token_accuracy": 0.9592770576477051,
"num_tokens": 764805.0,
"step": 415
},
{
"epoch": 1.5384615384615383,
"grad_norm": 1.1796875,
"learning_rate": 8.525376410233573e-05,
"loss": 0.1895,
"mean_token_accuracy": 0.942843246459961,
"num_tokens": 773770.0,
"step": 420
},
{
"epoch": 1.5567765567765568,
"grad_norm": 0.90625,
"learning_rate": 8.524505527931021e-05,
"loss": 0.1104,
"mean_token_accuracy": 0.9629818797111511,
"num_tokens": 782555.0,
"step": 425
},
{
"epoch": 1.575091575091575,
"grad_norm": 1.953125,
"learning_rate": 8.523605406140945e-05,
"loss": 0.079,
"mean_token_accuracy": 0.9723729610443115,
"num_tokens": 791364.0,
"step": 430
},
{
"epoch": 1.5934065934065935,
"grad_norm": 0.765625,
"learning_rate": 8.522676053118176e-05,
"loss": 0.1355,
"mean_token_accuracy": 0.9603265643119812,
"num_tokens": 801577.0,
"step": 435
},
{
"epoch": 1.6117216117216118,
"grad_norm": 38.0,
"learning_rate": 8.521717477385618e-05,
"loss": 0.0925,
"mean_token_accuracy": 0.9714651226997375,
"num_tokens": 810680.0,
"step": 440
},
{
"epoch": 1.63003663003663,
"grad_norm": 54.0,
"learning_rate": 8.520729687734172e-05,
"loss": 0.4008,
"mean_token_accuracy": 0.9063192009925842,
"num_tokens": 819733.0,
"step": 445
},
{
"epoch": 1.6483516483516483,
"grad_norm": 17.0,
"learning_rate": 8.519712693222653e-05,
"loss": 0.2733,
"mean_token_accuracy": 0.9280066013336181,
"num_tokens": 828640.0,
"step": 450
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.26171875,
"learning_rate": 8.518666503177708e-05,
"loss": 0.3508,
"mean_token_accuracy": 0.912005627155304,
"num_tokens": 837843.0,
"step": 455
},
{
"epoch": 1.684981684981685,
"grad_norm": 0.73046875,
"learning_rate": 8.517591127193731e-05,
"loss": 0.0529,
"mean_token_accuracy": 0.9821884870529175,
"num_tokens": 847611.0,
"step": 460
},
{
"epoch": 1.7032967032967035,
"grad_norm": 6.09375,
"learning_rate": 8.516486575132771e-05,
"loss": 0.1331,
"mean_token_accuracy": 0.959692919254303,
"num_tokens": 856321.0,
"step": 465
},
{
"epoch": 1.7216117216117217,
"grad_norm": 5.0,
"learning_rate": 8.515352857124449e-05,
"loss": 0.0689,
"mean_token_accuracy": 0.9763989567756652,
"num_tokens": 865828.0,
"step": 470
},
{
"epoch": 1.73992673992674,
"grad_norm": 1.5703125,
"learning_rate": 8.514189983565859e-05,
"loss": 0.0946,
"mean_token_accuracy": 0.9694816589355468,
"num_tokens": 875232.0,
"step": 475
},
{
"epoch": 1.7582417582417582,
"grad_norm": 21.25,
"learning_rate": 8.512997965121474e-05,
"loss": 0.0833,
"mean_token_accuracy": 0.9721729278564453,
"num_tokens": 884274.0,
"step": 480
},
{
"epoch": 1.7765567765567765,
"grad_norm": 0.7265625,
"learning_rate": 8.511776812723049e-05,
"loss": 0.0723,
"mean_token_accuracy": 0.9744561910629272,
"num_tokens": 893656.0,
"step": 485
},
{
"epoch": 1.7948717948717947,
"grad_norm": 0.6015625,
"learning_rate": 8.510526537569522e-05,
"loss": 0.0605,
"mean_token_accuracy": 0.9765355348587036,
"num_tokens": 902461.0,
"step": 490
},
{
"epoch": 1.8131868131868132,
"grad_norm": 1.6328125,
"learning_rate": 8.509247151126907e-05,
"loss": 0.097,
"mean_token_accuracy": 0.9699956893920898,
"num_tokens": 911366.0,
"step": 495
},
{
"epoch": 1.8315018315018317,
"grad_norm": 0.5625,
"learning_rate": 8.507938665128194e-05,
"loss": 0.0759,
"mean_token_accuracy": 0.9745470285415649,
"num_tokens": 920856.0,
"step": 500
},
{
"epoch": 1.84981684981685,
"grad_norm": 2.125,
"learning_rate": 8.506601091573238e-05,
"loss": 0.1981,
"mean_token_accuracy": 0.9415134191513062,
"num_tokens": 929641.0,
"step": 505
},
{
"epoch": 1.8681318681318682,
"grad_norm": 0.5625,
"learning_rate": 8.505234442728651e-05,
"loss": 0.1232,
"mean_token_accuracy": 0.9627613186836242,
"num_tokens": 939594.0,
"step": 510
},
{
"epoch": 1.8864468864468864,
"grad_norm": 35.5,
"learning_rate": 8.503838731127686e-05,
"loss": 0.1638,
"mean_token_accuracy": 0.9507665157318115,
"num_tokens": 948528.0,
"step": 515
},
{
"epoch": 1.9047619047619047,
"grad_norm": 1.1640625,
"learning_rate": 8.502413969570129e-05,
"loss": 0.1643,
"mean_token_accuracy": 0.9527613878250122,
"num_tokens": 957817.0,
"step": 520
},
{
"epoch": 1.9230769230769231,
"grad_norm": 0.18359375,
"learning_rate": 8.500960171122171e-05,
"loss": 0.1136,
"mean_token_accuracy": 0.9619654774665832,
"num_tokens": 966584.0,
"step": 525
},
{
"epoch": 1.9413919413919414,
"grad_norm": 2.75,
"learning_rate": 8.4994773491163e-05,
"loss": 0.1588,
"mean_token_accuracy": 0.9544906854629517,
"num_tokens": 975280.0,
"step": 530
},
{
"epoch": 1.9597069597069599,
"grad_norm": 1.2734375,
"learning_rate": 8.497965517151172e-05,
"loss": 0.2298,
"mean_token_accuracy": 0.9354098320007325,
"num_tokens": 984056.0,
"step": 535
},
{
"epoch": 1.978021978021978,
"grad_norm": 0.50390625,
"learning_rate": 8.49642468909148e-05,
"loss": 0.0629,
"mean_token_accuracy": 0.97831951379776,
"num_tokens": 993635.0,
"step": 540
},
{
"epoch": 1.9963369963369964,
"grad_norm": 1.140625,
"learning_rate": 8.494854879067847e-05,
"loss": 0.1468,
"mean_token_accuracy": 0.9564722418785095,
"num_tokens": 1003151.0,
"step": 545
},
{
"epoch": 2.0146520146520146,
"grad_norm": 0.73046875,
"learning_rate": 8.493256101476675e-05,
"loss": 0.1093,
"mean_token_accuracy": 0.9693841338157654,
"num_tokens": 1011069.0,
"step": 550
},
{
"epoch": 2.032967032967033,
"grad_norm": 0.5390625,
"learning_rate": 8.491628370980023e-05,
"loss": 0.1015,
"mean_token_accuracy": 0.9635228157043457,
"num_tokens": 1019386.0,
"step": 555
},
{
"epoch": 2.051282051282051,
"grad_norm": 0.23828125,
"learning_rate": 8.489971702505472e-05,
"loss": 0.105,
"mean_token_accuracy": 0.969475531578064,
"num_tokens": 1028915.0,
"step": 560
},
{
"epoch": 2.06959706959707,
"grad_norm": 0.41796875,
"learning_rate": 8.488286111245985e-05,
"loss": 0.0659,
"mean_token_accuracy": 0.9784932613372803,
"num_tokens": 1038671.0,
"step": 565
},
{
"epoch": 2.087912087912088,
"grad_norm": 0.64453125,
"learning_rate": 8.486571612659775e-05,
"loss": 0.1141,
"mean_token_accuracy": 0.9647136211395264,
"num_tokens": 1048771.0,
"step": 570
},
{
"epoch": 2.1062271062271063,
"grad_norm": 1.1015625,
"learning_rate": 8.484828222470152e-05,
"loss": 0.0762,
"mean_token_accuracy": 0.9740150094032287,
"num_tokens": 1058283.0,
"step": 575
},
{
"epoch": 2.1245421245421245,
"grad_norm": 7.71875,
"learning_rate": 8.48305595666539e-05,
"loss": 0.1661,
"mean_token_accuracy": 0.9484933137893676,
"num_tokens": 1067021.0,
"step": 580
},
{
"epoch": 2.142857142857143,
"grad_norm": 0.21484375,
"learning_rate": 8.481254831498573e-05,
"loss": 0.0481,
"mean_token_accuracy": 0.9831829905509949,
"num_tokens": 1076203.0,
"step": 585
},
{
"epoch": 2.161172161172161,
"grad_norm": 0.88671875,
"learning_rate": 8.479424863487448e-05,
"loss": 0.0859,
"mean_token_accuracy": 0.9719721555709839,
"num_tokens": 1085924.0,
"step": 590
},
{
"epoch": 2.1794871794871793,
"grad_norm": 0.1650390625,
"learning_rate": 8.477566069414271e-05,
"loss": 0.1872,
"mean_token_accuracy": 0.9515769362449646,
"num_tokens": 1095974.0,
"step": 595
},
{
"epoch": 2.197802197802198,
"grad_norm": 83.0,
"learning_rate": 8.475678466325665e-05,
"loss": 0.3268,
"mean_token_accuracy": 0.9221652507781982,
"num_tokens": 1105194.0,
"step": 600
},
{
"epoch": 2.2161172161172162,
"grad_norm": 0.9765625,
"learning_rate": 8.473762071532443e-05,
"loss": 0.0572,
"mean_token_accuracy": 0.9784234523773193,
"num_tokens": 1113857.0,
"step": 605
},
{
"epoch": 2.2344322344322345,
"grad_norm": 0.6171875,
"learning_rate": 8.471816902609471e-05,
"loss": 0.0721,
"mean_token_accuracy": 0.975789201259613,
"num_tokens": 1123835.0,
"step": 610
},
{
"epoch": 2.2527472527472527,
"grad_norm": 0.43359375,
"learning_rate": 8.46984297739549e-05,
"loss": 0.0894,
"mean_token_accuracy": 0.968826174736023,
"num_tokens": 1132971.0,
"step": 615
},
{
"epoch": 2.271062271062271,
"grad_norm": 0.3515625,
"learning_rate": 8.467840313992962e-05,
"loss": 0.044,
"mean_token_accuracy": 0.9843294978141784,
"num_tokens": 1142825.0,
"step": 620
},
{
"epoch": 2.2893772893772892,
"grad_norm": 1.09375,
"learning_rate": 8.465808930767897e-05,
"loss": 0.1133,
"mean_token_accuracy": 0.9606971025466919,
"num_tokens": 1152228.0,
"step": 625
},
{
"epoch": 2.3076923076923075,
"grad_norm": 0.76171875,
"learning_rate": 8.463748846349694e-05,
"loss": 0.0924,
"mean_token_accuracy": 0.968487274646759,
"num_tokens": 1161178.0,
"step": 630
},
{
"epoch": 2.326007326007326,
"grad_norm": 0.6875,
"learning_rate": 8.461660079630962e-05,
"loss": 0.1203,
"mean_token_accuracy": 0.9595974802970886,
"num_tokens": 1170764.0,
"step": 635
},
{
"epoch": 2.3443223443223444,
"grad_norm": 0.6171875,
"learning_rate": 8.45954264976735e-05,
"loss": 0.0458,
"mean_token_accuracy": 0.9816165566444397,
"num_tokens": 1180611.0,
"step": 640
},
{
"epoch": 2.3626373626373627,
"grad_norm": 1.2734375,
"learning_rate": 8.457396576177369e-05,
"loss": 0.109,
"mean_token_accuracy": 0.9632077097892762,
"num_tokens": 1188843.0,
"step": 645
},
{
"epoch": 2.380952380952381,
"grad_norm": 0.5625,
"learning_rate": 8.455221878542219e-05,
"loss": 0.0454,
"mean_token_accuracy": 0.982891297340393,
"num_tokens": 1198270.0,
"step": 650
},
{
"epoch": 2.399267399267399,
"grad_norm": 0.447265625,
"learning_rate": 8.453018576805604e-05,
"loss": 0.0807,
"mean_token_accuracy": 0.9707582116127014,
"num_tokens": 1207459.0,
"step": 655
},
{
"epoch": 2.4175824175824174,
"grad_norm": 0.796875,
"learning_rate": 8.450786691173547e-05,
"loss": 0.1203,
"mean_token_accuracy": 0.9592945575714111,
"num_tokens": 1217120.0,
"step": 660
},
{
"epoch": 2.435897435897436,
"grad_norm": 0.8125,
"learning_rate": 8.448526242114215e-05,
"loss": 0.0962,
"mean_token_accuracy": 0.9632490515708924,
"num_tokens": 1226485.0,
"step": 665
},
{
"epoch": 2.4542124542124544,
"grad_norm": 306.0,
"learning_rate": 8.446237250357716e-05,
"loss": 0.2642,
"mean_token_accuracy": 0.9311501502990722,
"num_tokens": 1234436.0,
"step": 670
},
{
"epoch": 2.4725274725274726,
"grad_norm": 0.6875,
"learning_rate": 8.443919736895923e-05,
"loss": 0.0925,
"mean_token_accuracy": 0.9679561376571655,
"num_tokens": 1243656.0,
"step": 675
},
{
"epoch": 2.490842490842491,
"grad_norm": 0.326171875,
"learning_rate": 8.441573722982275e-05,
"loss": 0.1172,
"mean_token_accuracy": 0.9645622253417969,
"num_tokens": 1252230.0,
"step": 680
},
{
"epoch": 2.509157509157509,
"grad_norm": 4.59375,
"learning_rate": 8.439199230131578e-05,
"loss": 0.2658,
"mean_token_accuracy": 0.9416054487228394,
"num_tokens": 1261738.0,
"step": 685
},
{
"epoch": 2.5274725274725274,
"grad_norm": 16.25,
"learning_rate": 8.436796280119821e-05,
"loss": 0.1691,
"mean_token_accuracy": 0.959836196899414,
"num_tokens": 1270639.0,
"step": 690
},
{
"epoch": 2.5457875457875456,
"grad_norm": 2.140625,
"learning_rate": 8.43436489498396e-05,
"loss": 0.1439,
"mean_token_accuracy": 0.9541051030158997,
"num_tokens": 1279566.0,
"step": 695
},
{
"epoch": 2.564102564102564,
"grad_norm": 184.0,
"learning_rate": 8.431905097021727e-05,
"loss": 0.0963,
"mean_token_accuracy": 0.9705996751785279,
"num_tokens": 1288968.0,
"step": 700
},
{
"epoch": 2.5824175824175826,
"grad_norm": 0.984375,
"learning_rate": 8.429416908791423e-05,
"loss": 0.106,
"mean_token_accuracy": 0.9661186218261719,
"num_tokens": 1298605.0,
"step": 705
},
{
"epoch": 2.600732600732601,
"grad_norm": 0.84375,
"learning_rate": 8.426900353111708e-05,
"loss": 0.1213,
"mean_token_accuracy": 0.9659365892410279,
"num_tokens": 1306963.0,
"step": 710
},
{
"epoch": 2.619047619047619,
"grad_norm": 0.310546875,
"learning_rate": 8.424355453061395e-05,
"loss": 0.11,
"mean_token_accuracy": 0.9625479221343994,
"num_tokens": 1315486.0,
"step": 715
},
{
"epoch": 2.6373626373626373,
"grad_norm": 0.7265625,
"learning_rate": 8.421782231979236e-05,
"loss": 0.095,
"mean_token_accuracy": 0.9687173247337342,
"num_tokens": 1325156.0,
"step": 720
},
{
"epoch": 2.6556776556776556,
"grad_norm": 0.671875,
"learning_rate": 8.419180713463716e-05,
"loss": 0.0597,
"mean_token_accuracy": 0.9778618574142456,
"num_tokens": 1334588.0,
"step": 725
},
{
"epoch": 2.6739926739926743,
"grad_norm": 0.56640625,
"learning_rate": 8.416550921372818e-05,
"loss": 0.0961,
"mean_token_accuracy": 0.965964937210083,
"num_tokens": 1343414.0,
"step": 730
},
{
"epoch": 2.6923076923076925,
"grad_norm": 1.609375,
"learning_rate": 8.413892879823828e-05,
"loss": 0.1179,
"mean_token_accuracy": 0.9629538536071778,
"num_tokens": 1353270.0,
"step": 735
},
{
"epoch": 2.7106227106227108,
"grad_norm": 0.2275390625,
"learning_rate": 8.411206613193094e-05,
"loss": 0.0733,
"mean_token_accuracy": 0.9732595682144165,
"num_tokens": 1362123.0,
"step": 740
},
{
"epoch": 2.728937728937729,
"grad_norm": 0.228515625,
"learning_rate": 8.408492146115815e-05,
"loss": 0.1294,
"mean_token_accuracy": 0.9595796465873718,
"num_tokens": 1371808.0,
"step": 745
},
{
"epoch": 2.7472527472527473,
"grad_norm": 0.185546875,
"learning_rate": 8.405749503485807e-05,
"loss": 0.0698,
"mean_token_accuracy": 0.9742272734642029,
"num_tokens": 1380865.0,
"step": 750
},
{
"epoch": 2.7655677655677655,
"grad_norm": 1.078125,
"learning_rate": 8.402978710455282e-05,
"loss": 0.0939,
"mean_token_accuracy": 0.9644173741340637,
"num_tokens": 1389329.0,
"step": 755
},
{
"epoch": 2.7838827838827838,
"grad_norm": 0.67578125,
"learning_rate": 8.400179792434609e-05,
"loss": 0.0986,
"mean_token_accuracy": 0.9603239297866821,
"num_tokens": 1397560.0,
"step": 760
},
{
"epoch": 2.802197802197802,
"grad_norm": 1.0859375,
"learning_rate": 8.397352775092089e-05,
"loss": 0.1284,
"mean_token_accuracy": 0.9567705154418945,
"num_tokens": 1406742.0,
"step": 765
},
{
"epoch": 2.8205128205128203,
"grad_norm": 0.1708984375,
"learning_rate": 8.394497684353717e-05,
"loss": 0.1174,
"mean_token_accuracy": 0.9635369896888732,
"num_tokens": 1416577.0,
"step": 770
},
{
"epoch": 2.838827838827839,
"grad_norm": 1.09375,
"learning_rate": 8.391614546402936e-05,
"loss": 0.1623,
"mean_token_accuracy": 0.947747004032135,
"num_tokens": 1424952.0,
"step": 775
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.6171875,
"learning_rate": 8.388703387680416e-05,
"loss": 0.0449,
"mean_token_accuracy": 0.9851283431053162,
"num_tokens": 1434524.0,
"step": 780
},
{
"epoch": 2.8754578754578755,
"grad_norm": 0.93359375,
"learning_rate": 8.385764234883788e-05,
"loss": 0.1116,
"mean_token_accuracy": 0.9607040166854859,
"num_tokens": 1443089.0,
"step": 785
},
{
"epoch": 2.8937728937728937,
"grad_norm": 0.59765625,
"learning_rate": 8.382797114967418e-05,
"loss": 0.0757,
"mean_token_accuracy": 0.9740386247634888,
"num_tokens": 1452624.0,
"step": 790
},
{
"epoch": 2.912087912087912,
"grad_norm": 0.76953125,
"learning_rate": 8.379802055142151e-05,
"loss": 0.0641,
"mean_token_accuracy": 0.9792454838752747,
"num_tokens": 1462735.0,
"step": 795
},
{
"epoch": 2.9304029304029307,
"grad_norm": 0.8671875,
"learning_rate": 8.376779082875063e-05,
"loss": 0.0948,
"mean_token_accuracy": 0.9742291688919067,
"num_tokens": 1472169.0,
"step": 800
},
{
"epoch": 2.948717948717949,
"grad_norm": 1.0,
"learning_rate": 8.37372822588921e-05,
"loss": 0.1112,
"mean_token_accuracy": 0.9607684016227722,
"num_tokens": 1481405.0,
"step": 805
},
{
"epoch": 2.967032967032967,
"grad_norm": 0.55859375,
"learning_rate": 8.370649512163369e-05,
"loss": 0.1755,
"mean_token_accuracy": 0.9431592702865601,
"num_tokens": 1490690.0,
"step": 810
},
{
"epoch": 2.9853479853479854,
"grad_norm": 0.1259765625,
"learning_rate": 8.367542969931792e-05,
"loss": 0.0996,
"mean_token_accuracy": 0.9651033759117127,
"num_tokens": 1499659.0,
"step": 815
},
{
"epoch": 3.0036630036630036,
"grad_norm": 0.1181640625,
"learning_rate": 8.364408627683935e-05,
"loss": 0.0761,
"mean_token_accuracy": 0.9716196894645691,
"num_tokens": 1507916.0,
"step": 820
},
{
"epoch": 3.021978021978022,
"grad_norm": 0.080078125,
"learning_rate": 8.361246514164205e-05,
"loss": 0.0558,
"mean_token_accuracy": 0.9798445224761962,
"num_tokens": 1517236.0,
"step": 825
},
{
"epoch": 3.04029304029304,
"grad_norm": 0.30078125,
"learning_rate": 8.358056658371692e-05,
"loss": 0.0628,
"mean_token_accuracy": 0.9788961172103882,
"num_tokens": 1526056.0,
"step": 830
},
{
"epoch": 3.0586080586080584,
"grad_norm": 0.8359375,
"learning_rate": 8.35483908955991e-05,
"loss": 0.064,
"mean_token_accuracy": 0.9767962694168091,
"num_tokens": 1535105.0,
"step": 835
},
{
"epoch": 3.076923076923077,
"grad_norm": 0.71875,
"learning_rate": 8.351593837236514e-05,
"loss": 0.0672,
"mean_token_accuracy": 0.9740965247154236,
"num_tokens": 1543572.0,
"step": 840
},
{
"epoch": 3.0952380952380953,
"grad_norm": 0.765625,
"learning_rate": 8.348320931163043e-05,
"loss": 0.1008,
"mean_token_accuracy": 0.962606143951416,
"num_tokens": 1553371.0,
"step": 845
},
{
"epoch": 3.1135531135531136,
"grad_norm": 1.4765625,
"learning_rate": 8.345020401354646e-05,
"loss": 0.0652,
"mean_token_accuracy": 0.9775573253631592,
"num_tokens": 1563374.0,
"step": 850
},
{
"epoch": 3.131868131868132,
"grad_norm": 1.03125,
"learning_rate": 8.341692278079804e-05,
"loss": 0.0701,
"mean_token_accuracy": 0.9748265624046326,
"num_tokens": 1572311.0,
"step": 855
},
{
"epoch": 3.15018315018315,
"grad_norm": 0.447265625,
"learning_rate": 8.338336591860042e-05,
"loss": 0.0616,
"mean_token_accuracy": 0.9770539045333863,
"num_tokens": 1581662.0,
"step": 860
},
{
"epoch": 3.1684981684981683,
"grad_norm": 0.8046875,
"learning_rate": 8.334953373469673e-05,
"loss": 0.0847,
"mean_token_accuracy": 0.9684791564941406,
"num_tokens": 1590875.0,
"step": 865
},
{
"epoch": 3.186813186813187,
"grad_norm": 0.7265625,
"learning_rate": 8.331542653935491e-05,
"loss": 0.0618,
"mean_token_accuracy": 0.9756833434104919,
"num_tokens": 1599508.0,
"step": 870
},
{
"epoch": 3.2051282051282053,
"grad_norm": 0.484375,
"learning_rate": 8.328104464536502e-05,
"loss": 0.0538,
"mean_token_accuracy": 0.9818952322006226,
"num_tokens": 1609055.0,
"step": 875
},
{
"epoch": 3.2234432234432235,
"grad_norm": 1.578125,
"learning_rate": 8.324638836803633e-05,
"loss": 0.066,
"mean_token_accuracy": 0.9749167203903198,
"num_tokens": 1618211.0,
"step": 880
},
{
"epoch": 3.241758241758242,
"grad_norm": 0.482421875,
"learning_rate": 8.32114580251944e-05,
"loss": 0.0943,
"mean_token_accuracy": 0.9674638390541077,
"num_tokens": 1627192.0,
"step": 885
},
{
"epoch": 3.26007326007326,
"grad_norm": 0.09521484375,
"learning_rate": 8.317625393717823e-05,
"loss": 0.0424,
"mean_token_accuracy": 0.9843096375465393,
"num_tokens": 1636468.0,
"step": 890
},
{
"epoch": 3.2783882783882783,
"grad_norm": 1.1953125,
"learning_rate": 8.314077642683719e-05,
"loss": 0.0866,
"mean_token_accuracy": 0.971860671043396,
"num_tokens": 1645884.0,
"step": 895
},
{
"epoch": 3.2967032967032965,
"grad_norm": 0.796875,
"learning_rate": 8.310502581952828e-05,
"loss": 0.0579,
"mean_token_accuracy": 0.9784857869148255,
"num_tokens": 1655337.0,
"step": 900
},
{
"epoch": 3.315018315018315,
"grad_norm": 0.48828125,
"learning_rate": 8.306900244311288e-05,
"loss": 0.1142,
"mean_token_accuracy": 0.961796247959137,
"num_tokens": 1663976.0,
"step": 905
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.287109375,
"learning_rate": 8.303270662795399e-05,
"loss": 0.075,
"mean_token_accuracy": 0.9726075410842896,
"num_tokens": 1673433.0,
"step": 910
},
{
"epoch": 3.3516483516483517,
"grad_norm": 0.7578125,
"learning_rate": 8.299613870691302e-05,
"loss": 0.0939,
"mean_token_accuracy": 0.9688026666641235,
"num_tokens": 1683030.0,
"step": 915
},
{
"epoch": 3.36996336996337,
"grad_norm": 0.373046875,
"learning_rate": 8.295929901534686e-05,
"loss": 0.0319,
"mean_token_accuracy": 0.9874111294746399,
"num_tokens": 1693029.0,
"step": 920
},
{
"epoch": 3.3882783882783882,
"grad_norm": 0.2392578125,
"learning_rate": 8.29221878911047e-05,
"loss": 0.0532,
"mean_token_accuracy": 0.9800354242324829,
"num_tokens": 1703258.0,
"step": 925
},
{
"epoch": 3.4065934065934065,
"grad_norm": 0.73046875,
"learning_rate": 8.288480567452501e-05,
"loss": 0.1188,
"mean_token_accuracy": 0.9611821174621582,
"num_tokens": 1712754.0,
"step": 930
},
{
"epoch": 3.4249084249084247,
"grad_norm": 0.99609375,
"learning_rate": 8.284715270843238e-05,
"loss": 0.0829,
"mean_token_accuracy": 0.9708463668823242,
"num_tokens": 1721472.0,
"step": 935
},
{
"epoch": 3.4432234432234434,
"grad_norm": 10.3125,
"learning_rate": 8.280922933813442e-05,
"loss": 0.04,
"mean_token_accuracy": 0.9824108600616455,
"num_tokens": 1730959.0,
"step": 940
},
{
"epoch": 3.4615384615384617,
"grad_norm": 0.6015625,
"learning_rate": 8.277103591141852e-05,
"loss": 0.0678,
"mean_token_accuracy": 0.9735846400260926,
"num_tokens": 1739674.0,
"step": 945
},
{
"epoch": 3.47985347985348,
"grad_norm": 0.39453125,
"learning_rate": 8.273257277854872e-05,
"loss": 0.0424,
"mean_token_accuracy": 0.9842739105224609,
"num_tokens": 1749137.0,
"step": 950
},
{
"epoch": 3.498168498168498,
"grad_norm": 0.1630859375,
"learning_rate": 8.269384029226248e-05,
"loss": 0.0285,
"mean_token_accuracy": 0.9885275959968567,
"num_tokens": 1758530.0,
"step": 955
},
{
"epoch": 3.5164835164835164,
"grad_norm": 0.0927734375,
"learning_rate": 8.265483880776745e-05,
"loss": 0.0741,
"mean_token_accuracy": 0.9747227191925049,
"num_tokens": 1767672.0,
"step": 960
},
{
"epoch": 3.5347985347985347,
"grad_norm": 0.44921875,
"learning_rate": 8.26155686827382e-05,
"loss": 0.068,
"mean_token_accuracy": 0.975150191783905,
"num_tokens": 1776694.0,
"step": 965
},
{
"epoch": 3.553113553113553,
"grad_norm": 2.25,
"learning_rate": 8.257603027731291e-05,
"loss": 0.0536,
"mean_token_accuracy": 0.9809759497642517,
"num_tokens": 1785904.0,
"step": 970
},
{
"epoch": 3.571428571428571,
"grad_norm": 0.53125,
"learning_rate": 8.253622395409019e-05,
"loss": 0.0555,
"mean_token_accuracy": 0.9794698238372803,
"num_tokens": 1795028.0,
"step": 975
},
{
"epoch": 3.58974358974359,
"grad_norm": 0.9609375,
"learning_rate": 8.24961500781256e-05,
"loss": 0.1048,
"mean_token_accuracy": 0.9619524002075195,
"num_tokens": 1802957.0,
"step": 980
},
{
"epoch": 3.608058608058608,
"grad_norm": 0.337890625,
"learning_rate": 8.24558090169284e-05,
"loss": 0.0801,
"mean_token_accuracy": 0.9719898581504822,
"num_tokens": 1811233.0,
"step": 985
},
{
"epoch": 3.6263736263736264,
"grad_norm": 0.30859375,
"learning_rate": 8.241520114045813e-05,
"loss": 0.0932,
"mean_token_accuracy": 0.9668406844139099,
"num_tokens": 1820206.0,
"step": 990
},
{
"epoch": 3.6446886446886446,
"grad_norm": 0.291015625,
"learning_rate": 8.237432682112127e-05,
"loss": 0.0814,
"mean_token_accuracy": 0.968066930770874,
"num_tokens": 1828757.0,
"step": 995
},
{
"epoch": 3.663003663003663,
"grad_norm": 0.51171875,
"learning_rate": 8.233318643376773e-05,
"loss": 0.0786,
"mean_token_accuracy": 0.972130823135376,
"num_tokens": 1837693.0,
"step": 1000
},
{
"epoch": 3.6813186813186816,
"grad_norm": 0.138671875,
"learning_rate": 8.229178035568755e-05,
"loss": 0.0772,
"mean_token_accuracy": 0.9723419427871705,
"num_tokens": 1847020.0,
"step": 1005
},
{
"epoch": 3.6996336996337,
"grad_norm": 1.1796875,
"learning_rate": 8.225010896660734e-05,
"loss": 0.1051,
"mean_token_accuracy": 0.9616187572479248,
"num_tokens": 1855982.0,
"step": 1010
},
{
"epoch": 3.717948717948718,
"grad_norm": 0.44140625,
"learning_rate": 8.220817264868678e-05,
"loss": 0.0785,
"mean_token_accuracy": 0.9704046010971069,
"num_tokens": 1865186.0,
"step": 1015
},
{
"epoch": 3.7362637362637363,
"grad_norm": 0.82421875,
"learning_rate": 8.216597178651523e-05,
"loss": 0.0473,
"mean_token_accuracy": 0.9826258182525635,
"num_tokens": 1874733.0,
"step": 1020
},
{
"epoch": 3.7545787545787546,
"grad_norm": 1.078125,
"learning_rate": 8.212350676710807e-05,
"loss": 0.0746,
"mean_token_accuracy": 0.9718662738800049,
"num_tokens": 1884155.0,
"step": 1025
},
{
"epoch": 3.772893772893773,
"grad_norm": 0.58984375,
"learning_rate": 8.208077797990322e-05,
"loss": 0.0739,
"mean_token_accuracy": 0.9724728226661682,
"num_tokens": 1892962.0,
"step": 1030
},
{
"epoch": 3.791208791208791,
"grad_norm": 0.5234375,
"learning_rate": 8.203778581675761e-05,
"loss": 0.0665,
"mean_token_accuracy": 0.9769334554672241,
"num_tokens": 1902461.0,
"step": 1035
},
{
"epoch": 3.8095238095238093,
"grad_norm": 0.6640625,
"learning_rate": 8.199453067194351e-05,
"loss": 0.0952,
"mean_token_accuracy": 0.9705726265907287,
"num_tokens": 1911844.0,
"step": 1040
},
{
"epoch": 3.8278388278388276,
"grad_norm": 0.82421875,
"learning_rate": 8.195101294214486e-05,
"loss": 0.0615,
"mean_token_accuracy": 0.9792343139648437,
"num_tokens": 1921110.0,
"step": 1045
},
{
"epoch": 3.8461538461538463,
"grad_norm": 0.546875,
"learning_rate": 8.190723302645387e-05,
"loss": 0.0671,
"mean_token_accuracy": 0.9760551929473877,
"num_tokens": 1930834.0,
"step": 1050
},
{
"epoch": 3.8644688644688645,
"grad_norm": 0.70703125,
"learning_rate": 8.186319132636706e-05,
"loss": 0.0888,
"mean_token_accuracy": 0.9672855019569397,
"num_tokens": 1939564.0,
"step": 1055
},
{
"epoch": 3.8827838827838828,
"grad_norm": 0.125,
"learning_rate": 8.18188882457818e-05,
"loss": 0.0718,
"mean_token_accuracy": 0.9734614849090576,
"num_tokens": 1948652.0,
"step": 1060
},
{
"epoch": 3.901098901098901,
"grad_norm": 0.52734375,
"learning_rate": 8.177432419099249e-05,
"loss": 0.0496,
"mean_token_accuracy": 0.9841477632522583,
"num_tokens": 1958891.0,
"step": 1065
},
{
"epoch": 3.9194139194139193,
"grad_norm": 0.482421875,
"learning_rate": 8.172949957068689e-05,
"loss": 0.0773,
"mean_token_accuracy": 0.9700749635696411,
"num_tokens": 1968507.0,
"step": 1070
},
{
"epoch": 3.937728937728938,
"grad_norm": 0.90234375,
"learning_rate": 8.168441479594237e-05,
"loss": 0.0839,
"mean_token_accuracy": 0.9697647333145142,
"num_tokens": 1977929.0,
"step": 1075
},
{
"epoch": 3.956043956043956,
"grad_norm": 0.63671875,
"learning_rate": 8.163907028022208e-05,
"loss": 0.0534,
"mean_token_accuracy": 0.9822108268737793,
"num_tokens": 1987374.0,
"step": 1080
},
{
"epoch": 3.9743589743589745,
"grad_norm": 0.490234375,
"learning_rate": 8.159346643937122e-05,
"loss": 0.0895,
"mean_token_accuracy": 0.9678827285766601,
"num_tokens": 1995512.0,
"step": 1085
},
{
"epoch": 3.9926739926739927,
"grad_norm": 1.046875,
"learning_rate": 8.154760369161322e-05,
"loss": 0.0842,
"mean_token_accuracy": 0.9745811820030212,
"num_tokens": 2005014.0,
"step": 1090
},
{
"epoch": 4.010989010989011,
"grad_norm": 5.25,
"learning_rate": 8.150148245754586e-05,
"loss": 0.0784,
"mean_token_accuracy": 0.9694916486740113,
"num_tokens": 2013958.0,
"step": 1095
},
{
"epoch": 4.029304029304029,
"grad_norm": 0.482421875,
"learning_rate": 8.145510316013748e-05,
"loss": 0.0379,
"mean_token_accuracy": 0.9864168405532837,
"num_tokens": 2023416.0,
"step": 1100
},
{
"epoch": 4.0476190476190474,
"grad_norm": 0.11328125,
"learning_rate": 8.140846622472304e-05,
"loss": 0.0336,
"mean_token_accuracy": 0.9863126277923584,
"num_tokens": 2032892.0,
"step": 1105
},
{
"epoch": 4.065934065934066,
"grad_norm": 0.1259765625,
"learning_rate": 8.13615720790003e-05,
"loss": 0.0572,
"mean_token_accuracy": 0.9810018301010132,
"num_tokens": 2042824.0,
"step": 1110
},
{
"epoch": 4.084249084249084,
"grad_norm": 1.078125,
"learning_rate": 8.131442115302573e-05,
"loss": 0.0579,
"mean_token_accuracy": 0.9789334416389466,
"num_tokens": 2052256.0,
"step": 1115
},
{
"epoch": 4.102564102564102,
"grad_norm": 2.3125,
"learning_rate": 8.12670138792108e-05,
"loss": 0.0372,
"mean_token_accuracy": 0.9866159200668335,
"num_tokens": 2061743.0,
"step": 1120
},
{
"epoch": 4.1208791208791204,
"grad_norm": 0.3828125,
"learning_rate": 8.121935069231779e-05,
"loss": 0.0484,
"mean_token_accuracy": 0.9815837264060974,
"num_tokens": 2069937.0,
"step": 1125
},
{
"epoch": 4.13919413919414,
"grad_norm": 1.015625,
"learning_rate": 8.1171432029456e-05,
"loss": 0.0687,
"mean_token_accuracy": 0.9753403425216675,
"num_tokens": 2079121.0,
"step": 1130
},
{
"epoch": 4.157509157509158,
"grad_norm": 0.77734375,
"learning_rate": 8.11232583300776e-05,
"loss": 0.06,
"mean_token_accuracy": 0.9811612010002136,
"num_tokens": 2088105.0,
"step": 1135
},
{
"epoch": 4.175824175824176,
"grad_norm": 1.5234375,
"learning_rate": 8.107483003597365e-05,
"loss": 0.0537,
"mean_token_accuracy": 0.9808408856391907,
"num_tokens": 2096831.0,
"step": 1140
},
{
"epoch": 4.194139194139194,
"grad_norm": 0.1767578125,
"learning_rate": 8.102614759127002e-05,
"loss": 0.0222,
"mean_token_accuracy": 0.9910707116127014,
"num_tokens": 2106634.0,
"step": 1145
},
{
"epoch": 4.212454212454213,
"grad_norm": 0.37109375,
"learning_rate": 8.097721144242338e-05,
"loss": 0.0617,
"mean_token_accuracy": 0.9770854115486145,
"num_tokens": 2116094.0,
"step": 1150
},
{
"epoch": 4.230769230769231,
"grad_norm": 0.1806640625,
"learning_rate": 8.092802203821708e-05,
"loss": 0.0256,
"mean_token_accuracy": 0.9892764806747436,
"num_tokens": 2125097.0,
"step": 1155
},
{
"epoch": 4.249084249084249,
"grad_norm": 0.130859375,
"learning_rate": 8.087857982975698e-05,
"loss": 0.0323,
"mean_token_accuracy": 0.9884976744651794,
"num_tokens": 2134122.0,
"step": 1160
},
{
"epoch": 4.267399267399267,
"grad_norm": 1.625,
"learning_rate": 8.082888527046738e-05,
"loss": 0.0549,
"mean_token_accuracy": 0.9823671579360962,
"num_tokens": 2142806.0,
"step": 1165
},
{
"epoch": 4.285714285714286,
"grad_norm": 0.703125,
"learning_rate": 8.077893881608685e-05,
"loss": 0.0772,
"mean_token_accuracy": 0.9735370635986328,
"num_tokens": 2151281.0,
"step": 1170
},
{
"epoch": 4.304029304029304,
"grad_norm": 0.97265625,
"learning_rate": 8.072874092466398e-05,
"loss": 0.065,
"mean_token_accuracy": 0.9773920774459839,
"num_tokens": 2160764.0,
"step": 1175
},
{
"epoch": 4.322344322344322,
"grad_norm": 0.69140625,
"learning_rate": 8.067829205655333e-05,
"loss": 0.0588,
"mean_token_accuracy": 0.9791547775268554,
"num_tokens": 2169484.0,
"step": 1180
},
{
"epoch": 4.34065934065934,
"grad_norm": 0.64453125,
"learning_rate": 8.062759267441103e-05,
"loss": 0.0444,
"mean_token_accuracy": 0.9826448798179627,
"num_tokens": 2178294.0,
"step": 1185
},
{
"epoch": 4.358974358974359,
"grad_norm": 0.09423828125,
"learning_rate": 8.057664324319065e-05,
"loss": 0.0673,
"mean_token_accuracy": 0.9759621739387512,
"num_tokens": 2187496.0,
"step": 1190
},
{
"epoch": 4.377289377289378,
"grad_norm": 1.2421875,
"learning_rate": 8.052544423013895e-05,
"loss": 0.0366,
"mean_token_accuracy": 0.9850521922111511,
"num_tokens": 2196704.0,
"step": 1195
},
{
"epoch": 4.395604395604396,
"grad_norm": 0.73828125,
"learning_rate": 8.047399610479149e-05,
"loss": 0.0487,
"mean_token_accuracy": 0.9822589874267578,
"num_tokens": 2205968.0,
"step": 1200
},
{
"epoch": 4.413919413919414,
"grad_norm": 0.9140625,
"learning_rate": 8.042229933896844e-05,
"loss": 0.0742,
"mean_token_accuracy": 0.9758718729019165,
"num_tokens": 2215463.0,
"step": 1205
},
{
"epoch": 4.4322344322344325,
"grad_norm": 0.5703125,
"learning_rate": 8.037035440677016e-05,
"loss": 0.0424,
"mean_token_accuracy": 0.9832551598548889,
"num_tokens": 2224409.0,
"step": 1210
},
{
"epoch": 4.450549450549451,
"grad_norm": 0.396484375,
"learning_rate": 8.03181617845729e-05,
"loss": 0.0614,
"mean_token_accuracy": 0.9775595307350159,
"num_tokens": 2233382.0,
"step": 1215
},
{
"epoch": 4.468864468864469,
"grad_norm": 0.671875,
"learning_rate": 8.026572195102447e-05,
"loss": 0.0516,
"mean_token_accuracy": 0.982862401008606,
"num_tokens": 2242990.0,
"step": 1220
},
{
"epoch": 4.487179487179487,
"grad_norm": 0.2255859375,
"learning_rate": 8.021303538703972e-05,
"loss": 0.0649,
"mean_token_accuracy": 0.9760475039482117,
"num_tokens": 2251597.0,
"step": 1225
},
{
"epoch": 4.5054945054945055,
"grad_norm": 0.384765625,
"learning_rate": 8.01601025757963e-05,
"loss": 0.0872,
"mean_token_accuracy": 0.9712528467178345,
"num_tokens": 2260017.0,
"step": 1230
},
{
"epoch": 4.523809523809524,
"grad_norm": 0.1015625,
"learning_rate": 8.010692400273009e-05,
"loss": 0.0514,
"mean_token_accuracy": 0.9804089546203614,
"num_tokens": 2270253.0,
"step": 1235
},
{
"epoch": 4.542124542124542,
"grad_norm": 0.53515625,
"learning_rate": 8.00535001555308e-05,
"loss": 0.0392,
"mean_token_accuracy": 0.9841665983200073,
"num_tokens": 2279714.0,
"step": 1240
},
{
"epoch": 4.56043956043956,
"grad_norm": 0.68359375,
"learning_rate": 7.999983152413753e-05,
"loss": 0.0686,
"mean_token_accuracy": 0.9762724280357361,
"num_tokens": 2289630.0,
"step": 1245
},
{
"epoch": 4.5787545787545785,
"grad_norm": 1.515625,
"learning_rate": 7.994591860073424e-05,
"loss": 0.0641,
"mean_token_accuracy": 0.9772836685180664,
"num_tokens": 2297980.0,
"step": 1250
},
{
"epoch": 4.597069597069597,
"grad_norm": 3.890625,
"learning_rate": 7.989176187974522e-05,
"loss": 0.0768,
"mean_token_accuracy": 0.9789605379104614,
"num_tokens": 2307539.0,
"step": 1255
},
{
"epoch": 4.615384615384615,
"grad_norm": 22.25,
"learning_rate": 7.983736185783057e-05,
"loss": 0.1298,
"mean_token_accuracy": 0.9655901789665222,
"num_tokens": 2316208.0,
"step": 1260
},
{
"epoch": 4.633699633699633,
"grad_norm": 19.875,
"learning_rate": 7.97827190338817e-05,
"loss": 0.068,
"mean_token_accuracy": 0.977760374546051,
"num_tokens": 2325231.0,
"step": 1265
},
{
"epoch": 4.652014652014652,
"grad_norm": 0.404296875,
"learning_rate": 7.972783390901666e-05,
"loss": 0.0608,
"mean_token_accuracy": 0.9790961384773255,
"num_tokens": 2334422.0,
"step": 1270
},
{
"epoch": 4.670329670329671,
"grad_norm": 17.25,
"learning_rate": 7.967270698657563e-05,
"loss": 0.077,
"mean_token_accuracy": 0.9756144642829895,
"num_tokens": 2343150.0,
"step": 1275
},
{
"epoch": 4.688644688644689,
"grad_norm": 0.578125,
"learning_rate": 7.96173387721162e-05,
"loss": 0.0358,
"mean_token_accuracy": 0.9854479551315307,
"num_tokens": 2352172.0,
"step": 1280
},
{
"epoch": 4.706959706959707,
"grad_norm": 0.53125,
"learning_rate": 7.95617297734089e-05,
"loss": 0.0488,
"mean_token_accuracy": 0.9814175963401794,
"num_tokens": 2362355.0,
"step": 1285
},
{
"epoch": 4.725274725274725,
"grad_norm": 0.54296875,
"learning_rate": 7.950588050043236e-05,
"loss": 0.0309,
"mean_token_accuracy": 0.9877290248870849,
"num_tokens": 2372065.0,
"step": 1290
},
{
"epoch": 4.743589743589744,
"grad_norm": 0.37109375,
"learning_rate": 7.944979146536874e-05,
"loss": 0.0425,
"mean_token_accuracy": 0.9832926988601685,
"num_tokens": 2381430.0,
"step": 1295
},
{
"epoch": 4.761904761904762,
"grad_norm": 0.55859375,
"learning_rate": 7.939346318259904e-05,
"loss": 0.0294,
"mean_token_accuracy": 0.9892051696777344,
"num_tokens": 2390948.0,
"step": 1300
},
{
"epoch": 4.78021978021978,
"grad_norm": 0.734375,
"learning_rate": 7.933689616869828e-05,
"loss": 0.0499,
"mean_token_accuracy": 0.9788974165916443,
"num_tokens": 2399937.0,
"step": 1305
},
{
"epoch": 4.798534798534798,
"grad_norm": 1.2265625,
"learning_rate": 7.92800909424309e-05,
"loss": 0.059,
"mean_token_accuracy": 0.9804568767547608,
"num_tokens": 2408310.0,
"step": 1310
},
{
"epoch": 4.816849816849817,
"grad_norm": 0.53515625,
"learning_rate": 7.922304802474593e-05,
"loss": 0.0689,
"mean_token_accuracy": 0.9761590838432312,
"num_tokens": 2417776.0,
"step": 1315
},
{
"epoch": 4.835164835164835,
"grad_norm": 0.1728515625,
"learning_rate": 7.916576793877218e-05,
"loss": 0.096,
"mean_token_accuracy": 0.9676541090011597,
"num_tokens": 2427560.0,
"step": 1320
},
{
"epoch": 4.853479853479853,
"grad_norm": 0.142578125,
"learning_rate": 7.91082512098135e-05,
"loss": 0.0481,
"mean_token_accuracy": 0.9823224782943726,
"num_tokens": 2437579.0,
"step": 1325
},
{
"epoch": 4.871794871794872,
"grad_norm": 0.123046875,
"learning_rate": 7.905049836534396e-05,
"loss": 0.0371,
"mean_token_accuracy": 0.9855931043624878,
"num_tokens": 2446578.0,
"step": 1330
},
{
"epoch": 4.8901098901098905,
"grad_norm": 1.0703125,
"learning_rate": 7.8992509935003e-05,
"loss": 0.0589,
"mean_token_accuracy": 0.9781901359558105,
"num_tokens": 2455332.0,
"step": 1335
},
{
"epoch": 4.908424908424909,
"grad_norm": 0.25,
"learning_rate": 7.893428645059053e-05,
"loss": 0.0478,
"mean_token_accuracy": 0.9811420202255249,
"num_tokens": 2464469.0,
"step": 1340
},
{
"epoch": 4.926739926739927,
"grad_norm": 0.55859375,
"learning_rate": 7.887582844606212e-05,
"loss": 0.0416,
"mean_token_accuracy": 0.9845540761947632,
"num_tokens": 2474332.0,
"step": 1345
},
{
"epoch": 4.945054945054945,
"grad_norm": 0.1943359375,
"learning_rate": 7.881713645752409e-05,
"loss": 0.0567,
"mean_token_accuracy": 0.977370023727417,
"num_tokens": 2483586.0,
"step": 1350
},
{
"epoch": 4.9633699633699635,
"grad_norm": 0.57421875,
"learning_rate": 7.875821102322853e-05,
"loss": 0.0489,
"mean_token_accuracy": 0.9813120841979981,
"num_tokens": 2492757.0,
"step": 1355
},
{
"epoch": 4.981684981684982,
"grad_norm": 0.158203125,
"learning_rate": 7.869905268356847e-05,
"loss": 0.0683,
"mean_token_accuracy": 0.9747755646705627,
"num_tokens": 2501516.0,
"step": 1360
},
{
"epoch": 5.0,
"grad_norm": 0.41015625,
"learning_rate": 7.863966198107285e-05,
"loss": 0.0371,
"mean_token_accuracy": 0.9825921297073364,
"num_tokens": 2509780.0,
"step": 1365
},
{
"epoch": 5.018315018315018,
"grad_norm": 0.1923828125,
"learning_rate": 7.858003946040152e-05,
"loss": 0.0349,
"mean_token_accuracy": 0.9879397273063659,
"num_tokens": 2518456.0,
"step": 1370
},
{
"epoch": 5.0366300366300365,
"grad_norm": 0.38671875,
"learning_rate": 7.852018566834035e-05,
"loss": 0.0623,
"mean_token_accuracy": 0.9814428091049194,
"num_tokens": 2526696.0,
"step": 1375
},
{
"epoch": 5.054945054945055,
"grad_norm": 15.5625,
"learning_rate": 7.846010115379609e-05,
"loss": 0.0851,
"mean_token_accuracy": 0.9754502534866333,
"num_tokens": 2535458.0,
"step": 1380
},
{
"epoch": 5.073260073260073,
"grad_norm": 0.1337890625,
"learning_rate": 7.839978646779148e-05,
"loss": 0.0302,
"mean_token_accuracy": 0.9864336133003235,
"num_tokens": 2544932.0,
"step": 1385
},
{
"epoch": 5.091575091575091,
"grad_norm": 0.474609375,
"learning_rate": 7.833924216346e-05,
"loss": 0.044,
"mean_token_accuracy": 0.9824022054672241,
"num_tokens": 2553707.0,
"step": 1390
},
{
"epoch": 5.1098901098901095,
"grad_norm": 0.62890625,
"learning_rate": 7.827846879604103e-05,
"loss": 0.0476,
"mean_token_accuracy": 0.9825940251350402,
"num_tokens": 2562559.0,
"step": 1395
},
{
"epoch": 5.128205128205128,
"grad_norm": 0.3046875,
"learning_rate": 7.821746692287458e-05,
"loss": 0.0301,
"mean_token_accuracy": 0.9876471161842346,
"num_tokens": 2571694.0,
"step": 1400
},
{
"epoch": 5.146520146520147,
"grad_norm": 0.2236328125,
"learning_rate": 7.815623710339623e-05,
"loss": 0.041,
"mean_token_accuracy": 0.9862527489662171,
"num_tokens": 2580208.0,
"step": 1405
},
{
"epoch": 5.164835164835165,
"grad_norm": 0.96484375,
"learning_rate": 7.809477989913203e-05,
"loss": 0.0618,
"mean_token_accuracy": 0.9779723167419434,
"num_tokens": 2588156.0,
"step": 1410
},
{
"epoch": 5.183150183150183,
"grad_norm": 0.20703125,
"learning_rate": 7.803309587369332e-05,
"loss": 0.0303,
"mean_token_accuracy": 0.9863034844398498,
"num_tokens": 2596974.0,
"step": 1415
},
{
"epoch": 5.201465201465202,
"grad_norm": 0.65234375,
"learning_rate": 7.79711855927716e-05,
"loss": 0.0248,
"mean_token_accuracy": 0.9895213365554809,
"num_tokens": 2605921.0,
"step": 1420
},
{
"epoch": 5.21978021978022,
"grad_norm": 0.6171875,
"learning_rate": 7.790904962413324e-05,
"loss": 0.0588,
"mean_token_accuracy": 0.9793162941932678,
"num_tokens": 2615551.0,
"step": 1425
},
{
"epoch": 5.238095238095238,
"grad_norm": 0.61328125,
"learning_rate": 7.784668853761446e-05,
"loss": 0.0344,
"mean_token_accuracy": 0.987682557106018,
"num_tokens": 2624766.0,
"step": 1430
},
{
"epoch": 5.256410256410256,
"grad_norm": 0.138671875,
"learning_rate": 7.778410290511585e-05,
"loss": 0.0183,
"mean_token_accuracy": 0.9921578407287598,
"num_tokens": 2634654.0,
"step": 1435
},
{
"epoch": 5.274725274725275,
"grad_norm": 0.16796875,
"learning_rate": 7.772129330059739e-05,
"loss": 0.025,
"mean_token_accuracy": 0.991031551361084,
"num_tokens": 2644515.0,
"step": 1440
},
{
"epoch": 5.293040293040293,
"grad_norm": 0.126953125,
"learning_rate": 7.7658260300073e-05,
"loss": 0.0384,
"mean_token_accuracy": 0.9862215161323548,
"num_tokens": 2653798.0,
"step": 1445
},
{
"epoch": 5.311355311355311,
"grad_norm": 5.0,
"learning_rate": 7.759500448160529e-05,
"loss": 0.0418,
"mean_token_accuracy": 0.9845625400543213,
"num_tokens": 2662880.0,
"step": 1450
},
{
"epoch": 5.329670329670329,
"grad_norm": 12.0625,
"learning_rate": 7.753152642530036e-05,
"loss": 0.0456,
"mean_token_accuracy": 0.9851066589355468,
"num_tokens": 2671685.0,
"step": 1455
},
{
"epoch": 5.347985347985348,
"grad_norm": 0.953125,
"learning_rate": 7.746782671330237e-05,
"loss": 0.0476,
"mean_token_accuracy": 0.9837478876113892,
"num_tokens": 2680763.0,
"step": 1460
},
{
"epoch": 5.366300366300366,
"grad_norm": 0.1611328125,
"learning_rate": 7.740390592978824e-05,
"loss": 0.0462,
"mean_token_accuracy": 0.9852417230606079,
"num_tokens": 2690295.0,
"step": 1465
},
{
"epoch": 5.384615384615385,
"grad_norm": 1.109375,
"learning_rate": 7.733976466096226e-05,
"loss": 0.0513,
"mean_token_accuracy": 0.9832407712936402,
"num_tokens": 2699104.0,
"step": 1470
},
{
"epoch": 5.402930402930403,
"grad_norm": 0.3984375,
"learning_rate": 7.727540349505082e-05,
"loss": 0.0598,
"mean_token_accuracy": 0.9803775191307068,
"num_tokens": 2708621.0,
"step": 1475
},
{
"epoch": 5.4212454212454215,
"grad_norm": 0.240234375,
"learning_rate": 7.721082302229688e-05,
"loss": 0.042,
"mean_token_accuracy": 0.9849236011505127,
"num_tokens": 2718857.0,
"step": 1480
},
{
"epoch": 5.43956043956044,
"grad_norm": 0.5078125,
"learning_rate": 7.714602383495464e-05,
"loss": 0.0365,
"mean_token_accuracy": 0.986870002746582,
"num_tokens": 2728231.0,
"step": 1485
},
{
"epoch": 5.457875457875458,
"grad_norm": 0.578125,
"learning_rate": 7.708100652728407e-05,
"loss": 0.076,
"mean_token_accuracy": 0.9744701385498047,
"num_tokens": 2737360.0,
"step": 1490
},
{
"epoch": 5.476190476190476,
"grad_norm": 1.421875,
"learning_rate": 7.70157716955455e-05,
"loss": 0.0334,
"mean_token_accuracy": 0.9870843052864074,
"num_tokens": 2746745.0,
"step": 1495
},
{
"epoch": 5.4945054945054945,
"grad_norm": 3.28125,
"learning_rate": 7.695031993799411e-05,
"loss": 0.0462,
"mean_token_accuracy": 0.982709014415741,
"num_tokens": 2756089.0,
"step": 1500
},
{
"epoch": 5.512820512820513,
"grad_norm": 0.2294921875,
"learning_rate": 7.688465185487447e-05,
"loss": 0.0319,
"mean_token_accuracy": 0.9880306243896484,
"num_tokens": 2766072.0,
"step": 1505
},
{
"epoch": 5.531135531135531,
"grad_norm": 0.390625,
"learning_rate": 7.681876804841504e-05,
"loss": 0.0392,
"mean_token_accuracy": 0.9848615050315856,
"num_tokens": 2775370.0,
"step": 1510
},
{
"epoch": 5.549450549450549,
"grad_norm": 0.8125,
"learning_rate": 7.675266912282259e-05,
"loss": 0.0376,
"mean_token_accuracy": 0.9833124279975891,
"num_tokens": 2784606.0,
"step": 1515
},
{
"epoch": 5.5677655677655675,
"grad_norm": 0.44140625,
"learning_rate": 7.668635568427677e-05,
"loss": 0.0412,
"mean_token_accuracy": 0.9833675742149353,
"num_tokens": 2794109.0,
"step": 1520
},
{
"epoch": 5.586080586080586,
"grad_norm": 8.375,
"learning_rate": 7.661982834092442e-05,
"loss": 0.0441,
"mean_token_accuracy": 0.9847989916801453,
"num_tokens": 2804339.0,
"step": 1525
},
{
"epoch": 5.604395604395604,
"grad_norm": 1.4609375,
"learning_rate": 7.65530877028741e-05,
"loss": 0.0445,
"mean_token_accuracy": 0.9856428861618042,
"num_tokens": 2813328.0,
"step": 1530
},
{
"epoch": 5.622710622710622,
"grad_norm": 0.93359375,
"learning_rate": 7.648613438219043e-05,
"loss": 0.0599,
"mean_token_accuracy": 0.9801060795783997,
"num_tokens": 2822846.0,
"step": 1535
},
{
"epoch": 5.641025641025641,
"grad_norm": 2.09375,
"learning_rate": 7.64189689928885e-05,
"loss": 0.0703,
"mean_token_accuracy": 0.9766062736511231,
"num_tokens": 2832793.0,
"step": 1540
},
{
"epoch": 5.65934065934066,
"grad_norm": 4.40625,
"learning_rate": 7.635159215092825e-05,
"loss": 0.0482,
"mean_token_accuracy": 0.984825873374939,
"num_tokens": 2842566.0,
"step": 1545
},
{
"epoch": 5.677655677655678,
"grad_norm": 0.23828125,
"learning_rate": 7.62840044742088e-05,
"loss": 0.028,
"mean_token_accuracy": 0.9913866996765137,
"num_tokens": 2852366.0,
"step": 1550
},
{
"epoch": 5.695970695970696,
"grad_norm": 0.4921875,
"learning_rate": 7.621620658256279e-05,
"loss": 0.0232,
"mean_token_accuracy": 0.9910756468772888,
"num_tokens": 2861611.0,
"step": 1555
},
{
"epoch": 5.714285714285714,
"grad_norm": 0.55078125,
"learning_rate": 7.61481990977507e-05,
"loss": 0.0417,
"mean_token_accuracy": 0.9838123202323914,
"num_tokens": 2870193.0,
"step": 1560
},
{
"epoch": 5.732600732600733,
"grad_norm": 0.26953125,
"learning_rate": 7.607998264345515e-05,
"loss": 0.0427,
"mean_token_accuracy": 0.9834349632263184,
"num_tokens": 2879533.0,
"step": 1565
},
{
"epoch": 5.750915750915751,
"grad_norm": 0.09814453125,
"learning_rate": 7.601155784527516e-05,
"loss": 0.0395,
"mean_token_accuracy": 0.9865917205810547,
"num_tokens": 2888588.0,
"step": 1570
},
{
"epoch": 5.769230769230769,
"grad_norm": 0.63671875,
"learning_rate": 7.594292533072048e-05,
"loss": 0.0359,
"mean_token_accuracy": 0.9868963241577149,
"num_tokens": 2898690.0,
"step": 1575
},
{
"epoch": 5.787545787545787,
"grad_norm": 0.205078125,
"learning_rate": 7.587408572920568e-05,
"loss": 0.0284,
"mean_token_accuracy": 0.9881658792495728,
"num_tokens": 2908343.0,
"step": 1580
},
{
"epoch": 5.805860805860806,
"grad_norm": 0.3671875,
"learning_rate": 7.58050396720446e-05,
"loss": 0.0257,
"mean_token_accuracy": 0.990417754650116,
"num_tokens": 2918583.0,
"step": 1585
},
{
"epoch": 5.824175824175824,
"grad_norm": 0.470703125,
"learning_rate": 7.573578779244438e-05,
"loss": 0.0461,
"mean_token_accuracy": 0.9842254996299744,
"num_tokens": 2927267.0,
"step": 1590
},
{
"epoch": 5.842490842490842,
"grad_norm": 0.462890625,
"learning_rate": 7.566633072549971e-05,
"loss": 0.0422,
"mean_token_accuracy": 0.9848962306976319,
"num_tokens": 2937053.0,
"step": 1595
},
{
"epoch": 5.860805860805861,
"grad_norm": 0.388671875,
"learning_rate": 7.559666910818704e-05,
"loss": 0.0444,
"mean_token_accuracy": 0.9840786457061768,
"num_tokens": 2945426.0,
"step": 1600
},
{
"epoch": 5.8791208791208796,
"grad_norm": 0.431640625,
"learning_rate": 7.552680357935869e-05,
"loss": 0.04,
"mean_token_accuracy": 0.9841191053390503,
"num_tokens": 2953945.0,
"step": 1605
},
{
"epoch": 5.897435897435898,
"grad_norm": 0.2255859375,
"learning_rate": 7.5456734779737e-05,
"loss": 0.0426,
"mean_token_accuracy": 0.9842138767242432,
"num_tokens": 2962787.0,
"step": 1610
},
{
"epoch": 5.915750915750916,
"grad_norm": 0.671875,
"learning_rate": 7.53864633519085e-05,
"loss": 0.045,
"mean_token_accuracy": 0.9817765593528748,
"num_tokens": 2971473.0,
"step": 1615
},
{
"epoch": 5.934065934065934,
"grad_norm": 0.75390625,
"learning_rate": 7.531598994031796e-05,
"loss": 0.0691,
"mean_token_accuracy": 0.9776899933815002,
"num_tokens": 2980183.0,
"step": 1620
},
{
"epoch": 5.9523809523809526,
"grad_norm": 0.474609375,
"learning_rate": 7.524531519126248e-05,
"loss": 0.0509,
"mean_token_accuracy": 0.9816537737846375,
"num_tokens": 2989666.0,
"step": 1625
},
{
"epoch": 5.970695970695971,
"grad_norm": 0.984375,
"learning_rate": 7.517443975288563e-05,
"loss": 0.0569,
"mean_token_accuracy": 0.9782140016555786,
"num_tokens": 2998079.0,
"step": 1630
},
{
"epoch": 5.989010989010989,
"grad_norm": 1.109375,
"learning_rate": 7.510336427517143e-05,
"loss": 0.0628,
"mean_token_accuracy": 0.9775285959243775,
"num_tokens": 3007389.0,
"step": 1635
},
{
"epoch": 6.007326007326007,
"grad_norm": 0.3203125,
"learning_rate": 7.503208940993842e-05,
"loss": 0.0693,
"mean_token_accuracy": 0.9747921347618103,
"num_tokens": 3015549.0,
"step": 1640
},
{
"epoch": 6.0256410256410255,
"grad_norm": 0.69140625,
"learning_rate": 7.49606158108337e-05,
"loss": 0.0209,
"mean_token_accuracy": 0.9922340869903564,
"num_tokens": 3025486.0,
"step": 1645
},
{
"epoch": 6.043956043956044,
"grad_norm": 0.095703125,
"learning_rate": 7.488894413332689e-05,
"loss": 0.162,
"mean_token_accuracy": 0.9714855909347534,
"num_tokens": 3034529.0,
"step": 1650
},
{
"epoch": 6.062271062271062,
"grad_norm": 26.0,
"learning_rate": 7.481707503470417e-05,
"loss": 0.0312,
"mean_token_accuracy": 0.9905990958213806,
"num_tokens": 3043834.0,
"step": 1655
},
{
"epoch": 6.08058608058608,
"grad_norm": 1.6875,
"learning_rate": 7.474500917406223e-05,
"loss": 0.0467,
"mean_token_accuracy": 0.9854714870452881,
"num_tokens": 3053424.0,
"step": 1660
},
{
"epoch": 6.0989010989010985,
"grad_norm": 0.1240234375,
"learning_rate": 7.467274721230221e-05,
"loss": 0.0179,
"mean_token_accuracy": 0.9933658838272095,
"num_tokens": 3063201.0,
"step": 1665
},
{
"epoch": 6.117216117216117,
"grad_norm": 0.06640625,
"learning_rate": 7.460028981212365e-05,
"loss": 0.0242,
"mean_token_accuracy": 0.9913597822189331,
"num_tokens": 3072991.0,
"step": 1670
},
{
"epoch": 6.135531135531136,
"grad_norm": 0.921875,
"learning_rate": 7.452763763801842e-05,
"loss": 0.032,
"mean_token_accuracy": 0.9883728504180909,
"num_tokens": 3082543.0,
"step": 1675
},
{
"epoch": 6.153846153846154,
"grad_norm": 0.09619140625,
"learning_rate": 7.445479135626463e-05,
"loss": 0.0306,
"mean_token_accuracy": 0.9895648956298828,
"num_tokens": 3091715.0,
"step": 1680
},
{
"epoch": 6.172161172161172,
"grad_norm": 0.345703125,
"learning_rate": 7.43817516349205e-05,
"loss": 0.0357,
"mean_token_accuracy": 0.9860040664672851,
"num_tokens": 3100483.0,
"step": 1685
},
{
"epoch": 6.190476190476191,
"grad_norm": 0.255859375,
"learning_rate": 7.430851914381827e-05,
"loss": 0.0254,
"mean_token_accuracy": 0.9893843650817871,
"num_tokens": 3109988.0,
"step": 1690
},
{
"epoch": 6.208791208791209,
"grad_norm": 0.384765625,
"learning_rate": 7.423509455455799e-05,
"loss": 0.0415,
"mean_token_accuracy": 0.9854371070861816,
"num_tokens": 3119503.0,
"step": 1695
},
{
"epoch": 6.227106227106227,
"grad_norm": 0.095703125,
"learning_rate": 7.416147854050143e-05,
"loss": 0.0345,
"mean_token_accuracy": 0.9878103852272033,
"num_tokens": 3128641.0,
"step": 1700
},
{
"epoch": 6.245421245421245,
"grad_norm": 0.447265625,
"learning_rate": 7.408767177676586e-05,
"loss": 0.0222,
"mean_token_accuracy": 0.9916223526000977,
"num_tokens": 3137766.0,
"step": 1705
},
{
"epoch": 6.263736263736264,
"grad_norm": 1.140625,
"learning_rate": 7.40136749402179e-05,
"loss": 0.0295,
"mean_token_accuracy": 0.9880544662475585,
"num_tokens": 3146326.0,
"step": 1710
},
{
"epoch": 6.282051282051282,
"grad_norm": 0.09765625,
"learning_rate": 7.393948870946729e-05,
"loss": 0.0219,
"mean_token_accuracy": 0.9904427409172059,
"num_tokens": 3155485.0,
"step": 1715
},
{
"epoch": 6.3003663003663,
"grad_norm": 0.62890625,
"learning_rate": 7.386511376486061e-05,
"loss": 0.0399,
"mean_token_accuracy": 0.984571659564972,
"num_tokens": 3164816.0,
"step": 1720
},
{
"epoch": 6.318681318681318,
"grad_norm": 0.318359375,
"learning_rate": 7.37905507884752e-05,
"loss": 0.0201,
"mean_token_accuracy": 0.992521858215332,
"num_tokens": 3174718.0,
"step": 1725
},
{
"epoch": 6.336996336996337,
"grad_norm": 0.578125,
"learning_rate": 7.371580046411267e-05,
"loss": 0.0357,
"mean_token_accuracy": 0.986468493938446,
"num_tokens": 3183409.0,
"step": 1730
},
{
"epoch": 6.355311355311355,
"grad_norm": 0.39453125,
"learning_rate": 7.364086347729285e-05,
"loss": 0.0232,
"mean_token_accuracy": 0.9905255913734436,
"num_tokens": 3192371.0,
"step": 1735
},
{
"epoch": 6.373626373626374,
"grad_norm": 0.333984375,
"learning_rate": 7.356574051524742e-05,
"loss": 0.0307,
"mean_token_accuracy": 0.9887727737426758,
"num_tokens": 3201677.0,
"step": 1740
},
{
"epoch": 6.391941391941392,
"grad_norm": 0.703125,
"learning_rate": 7.349043226691354e-05,
"loss": 0.0274,
"mean_token_accuracy": 0.9895096063613892,
"num_tokens": 3211218.0,
"step": 1745
},
{
"epoch": 6.410256410256411,
"grad_norm": 4.03125,
"learning_rate": 7.341493942292763e-05,
"loss": 0.0364,
"mean_token_accuracy": 0.9867009520530701,
"num_tokens": 3219808.0,
"step": 1750
},
{
"epoch": 6.428571428571429,
"grad_norm": 1.0703125,
"learning_rate": 7.333926267561898e-05,
"loss": 0.0389,
"mean_token_accuracy": 0.9854157328605652,
"num_tokens": 3229195.0,
"step": 1755
},
{
"epoch": 6.446886446886447,
"grad_norm": 0.61328125,
"learning_rate": 7.326340271900346e-05,
"loss": 0.0479,
"mean_token_accuracy": 0.9841797947883606,
"num_tokens": 3237885.0,
"step": 1760
},
{
"epoch": 6.465201465201465,
"grad_norm": 0.95703125,
"learning_rate": 7.318736024877707e-05,
"loss": 0.04,
"mean_token_accuracy": 0.9877835512161255,
"num_tokens": 3247579.0,
"step": 1765
},
{
"epoch": 6.483516483516484,
"grad_norm": 0.1904296875,
"learning_rate": 7.31111359623096e-05,
"loss": 0.0299,
"mean_token_accuracy": 0.9874194860458374,
"num_tokens": 3256966.0,
"step": 1770
},
{
"epoch": 6.501831501831502,
"grad_norm": 0.99609375,
"learning_rate": 7.30347305586383e-05,
"loss": 0.0335,
"mean_token_accuracy": 0.9878918051719665,
"num_tokens": 3266350.0,
"step": 1775
},
{
"epoch": 6.52014652014652,
"grad_norm": 0.57421875,
"learning_rate": 7.295814473846134e-05,
"loss": 0.0326,
"mean_token_accuracy": 0.9886090993881226,
"num_tokens": 3274954.0,
"step": 1780
},
{
"epoch": 6.538461538461538,
"grad_norm": 0.55078125,
"learning_rate": 7.288137920413148e-05,
"loss": 0.0426,
"mean_token_accuracy": 0.9847822427749634,
"num_tokens": 3282952.0,
"step": 1785
},
{
"epoch": 6.556776556776557,
"grad_norm": 0.1943359375,
"learning_rate": 7.280443465964961e-05,
"loss": 0.0197,
"mean_token_accuracy": 0.9919935941696167,
"num_tokens": 3292733.0,
"step": 1790
},
{
"epoch": 6.575091575091575,
"grad_norm": 0.365234375,
"learning_rate": 7.272731181065829e-05,
"loss": 0.0531,
"mean_token_accuracy": 0.9843693256378174,
"num_tokens": 3302237.0,
"step": 1795
},
{
"epoch": 6.593406593406593,
"grad_norm": 0.45703125,
"learning_rate": 7.265001136443525e-05,
"loss": 0.0221,
"mean_token_accuracy": 0.9909010767936707,
"num_tokens": 3311751.0,
"step": 1800
},
{
"epoch": 6.611721611721611,
"grad_norm": 0.275390625,
"learning_rate": 7.257253402988693e-05,
"loss": 0.032,
"mean_token_accuracy": 0.9890513896942139,
"num_tokens": 3320625.0,
"step": 1805
},
{
"epoch": 6.63003663003663,
"grad_norm": 0.48828125,
"learning_rate": 7.249488051754199e-05,
"loss": 0.0278,
"mean_token_accuracy": 0.989040732383728,
"num_tokens": 3329629.0,
"step": 1810
},
{
"epoch": 6.648351648351649,
"grad_norm": 0.158203125,
"learning_rate": 7.241705153954479e-05,
"loss": 0.0291,
"mean_token_accuracy": 0.988949990272522,
"num_tokens": 3338881.0,
"step": 1815
},
{
"epoch": 6.666666666666667,
"grad_norm": 1.484375,
"learning_rate": 7.23390478096488e-05,
"loss": 0.0316,
"mean_token_accuracy": 0.9878135204315186,
"num_tokens": 3348032.0,
"step": 1820
},
{
"epoch": 6.684981684981685,
"grad_norm": 0.65234375,
"learning_rate": 7.226087004321018e-05,
"loss": 0.032,
"mean_token_accuracy": 0.9878805875778198,
"num_tokens": 3357685.0,
"step": 1825
},
{
"epoch": 6.7032967032967035,
"grad_norm": 0.51171875,
"learning_rate": 7.218251895718108e-05,
"loss": 0.0314,
"mean_token_accuracy": 0.9871991038322449,
"num_tokens": 3366081.0,
"step": 1830
},
{
"epoch": 6.721611721611722,
"grad_norm": 0.52734375,
"learning_rate": 7.210399527010315e-05,
"loss": 0.0371,
"mean_token_accuracy": 0.9859683156013489,
"num_tokens": 3375665.0,
"step": 1835
},
{
"epoch": 6.73992673992674,
"grad_norm": 0.6875,
"learning_rate": 7.202529970210093e-05,
"loss": 0.0437,
"mean_token_accuracy": 0.9847039103507995,
"num_tokens": 3385407.0,
"step": 1840
},
{
"epoch": 6.758241758241758,
"grad_norm": 0.9296875,
"learning_rate": 7.194643297487525e-05,
"loss": 0.0482,
"mean_token_accuracy": 0.9819490432739257,
"num_tokens": 3394276.0,
"step": 1845
},
{
"epoch": 6.7765567765567765,
"grad_norm": 0.46875,
"learning_rate": 7.186739581169659e-05,
"loss": 0.0768,
"mean_token_accuracy": 0.9809007167816162,
"num_tokens": 3403876.0,
"step": 1850
},
{
"epoch": 6.794871794871795,
"grad_norm": 0.921875,
"learning_rate": 7.178818893739847e-05,
"loss": 0.0345,
"mean_token_accuracy": 0.9873276352882385,
"num_tokens": 3413010.0,
"step": 1855
},
{
"epoch": 6.813186813186813,
"grad_norm": 0.3203125,
"learning_rate": 7.170881307837081e-05,
"loss": 0.0364,
"mean_token_accuracy": 0.9852291464805603,
"num_tokens": 3420921.0,
"step": 1860
},
{
"epoch": 6.831501831501831,
"grad_norm": 0.44921875,
"learning_rate": 7.162926896255323e-05,
"loss": 0.0379,
"mean_token_accuracy": 0.9871521234512329,
"num_tokens": 3429748.0,
"step": 1865
},
{
"epoch": 6.8498168498168495,
"grad_norm": 0.09765625,
"learning_rate": 7.154955731942842e-05,
"loss": 0.0338,
"mean_token_accuracy": 0.9871858716011047,
"num_tokens": 3438647.0,
"step": 1870
},
{
"epoch": 6.868131868131869,
"grad_norm": 0.111328125,
"learning_rate": 7.146967888001541e-05,
"loss": 0.0384,
"mean_token_accuracy": 0.9856713056564331,
"num_tokens": 3448087.0,
"step": 1875
},
{
"epoch": 6.886446886446887,
"grad_norm": 0.51171875,
"learning_rate": 7.138963437686289e-05,
"loss": 0.0423,
"mean_token_accuracy": 0.9847253203392029,
"num_tokens": 3457095.0,
"step": 1880
},
{
"epoch": 6.904761904761905,
"grad_norm": 0.09521484375,
"learning_rate": 7.13094245440425e-05,
"loss": 0.0335,
"mean_token_accuracy": 0.9881868481636047,
"num_tokens": 3466170.0,
"step": 1885
},
{
"epoch": 6.923076923076923,
"grad_norm": 0.255859375,
"learning_rate": 7.122905011714206e-05,
"loss": 0.0331,
"mean_token_accuracy": 0.987188744544983,
"num_tokens": 3475299.0,
"step": 1890
},
{
"epoch": 6.941391941391942,
"grad_norm": 0.365234375,
"learning_rate": 7.114851183325886e-05,
"loss": 0.0412,
"mean_token_accuracy": 0.984969186782837,
"num_tokens": 3485021.0,
"step": 1895
},
{
"epoch": 6.95970695970696,
"grad_norm": 0.609375,
"learning_rate": 7.10678104309929e-05,
"loss": 0.0348,
"mean_token_accuracy": 0.9886750221252442,
"num_tokens": 3493774.0,
"step": 1900
},
{
"epoch": 6.978021978021978,
"grad_norm": 0.703125,
"learning_rate": 7.098694665044011e-05,
"loss": 0.0339,
"mean_token_accuracy": 0.9876073241233826,
"num_tokens": 3503382.0,
"step": 1905
},
{
"epoch": 6.996336996336996,
"grad_norm": 1.0078125,
"learning_rate": 7.090592123318553e-05,
"loss": 0.0437,
"mean_token_accuracy": 0.9858802318572998,
"num_tokens": 3512668.0,
"step": 1910
},
{
"epoch": 7.014652014652015,
"grad_norm": 0.1640625,
"learning_rate": 7.082473492229653e-05,
"loss": 0.0192,
"mean_token_accuracy": 0.9916712999343872,
"num_tokens": 3520969.0,
"step": 1915
},
{
"epoch": 7.032967032967033,
"grad_norm": 0.52734375,
"learning_rate": 7.074338846231605e-05,
"loss": 0.0239,
"mean_token_accuracy": 0.9903509378433227,
"num_tokens": 3529196.0,
"step": 1920
},
{
"epoch": 7.051282051282051,
"grad_norm": 0.84765625,
"learning_rate": 7.066188259925569e-05,
"loss": 0.0569,
"mean_token_accuracy": 0.9826701760292054,
"num_tokens": 3538654.0,
"step": 1925
},
{
"epoch": 7.069597069597069,
"grad_norm": 0.373046875,
"learning_rate": 7.05802180805889e-05,
"loss": 0.0255,
"mean_token_accuracy": 0.9906298637390136,
"num_tokens": 3547981.0,
"step": 1930
},
{
"epoch": 7.087912087912088,
"grad_norm": 0.94921875,
"learning_rate": 7.049839565524414e-05,
"loss": 0.0212,
"mean_token_accuracy": 0.9925713777542114,
"num_tokens": 3557721.0,
"step": 1935
},
{
"epoch": 7.106227106227106,
"grad_norm": 1.0,
"learning_rate": 7.041641607359798e-05,
"loss": 0.024,
"mean_token_accuracy": 0.991481339931488,
"num_tokens": 3566312.0,
"step": 1940
},
{
"epoch": 7.124542124542124,
"grad_norm": 0.2197265625,
"learning_rate": 7.033428008746831e-05,
"loss": 0.0199,
"mean_token_accuracy": 0.9931520938873291,
"num_tokens": 3575508.0,
"step": 1945
},
{
"epoch": 7.142857142857143,
"grad_norm": 0.21484375,
"learning_rate": 7.025198845010726e-05,
"loss": 0.016,
"mean_token_accuracy": 0.9932388305664063,
"num_tokens": 3584603.0,
"step": 1950
},
{
"epoch": 7.1611721611721615,
"grad_norm": 11.4375,
"learning_rate": 7.016954191619448e-05,
"loss": 0.0315,
"mean_token_accuracy": 0.9889041304588317,
"num_tokens": 3593828.0,
"step": 1955
},
{
"epoch": 7.17948717948718,
"grad_norm": 1.15625,
"learning_rate": 7.008694124183013e-05,
"loss": 0.0246,
"mean_token_accuracy": 0.9922425985336304,
"num_tokens": 3602200.0,
"step": 1960
},
{
"epoch": 7.197802197802198,
"grad_norm": 0.326171875,
"learning_rate": 7.000418718452799e-05,
"loss": 0.0145,
"mean_token_accuracy": 0.993871533870697,
"num_tokens": 3611380.0,
"step": 1965
},
{
"epoch": 7.216117216117216,
"grad_norm": 0.5078125,
"learning_rate": 6.992128050320839e-05,
"loss": 0.024,
"mean_token_accuracy": 0.9902616381645203,
"num_tokens": 3621064.0,
"step": 1970
},
{
"epoch": 7.2344322344322345,
"grad_norm": 0.07763671875,
"learning_rate": 6.983822195819146e-05,
"loss": 0.0157,
"mean_token_accuracy": 0.9932525634765625,
"num_tokens": 3630093.0,
"step": 1975
},
{
"epoch": 7.252747252747253,
"grad_norm": 0.2353515625,
"learning_rate": 6.975501231118994e-05,
"loss": 0.0236,
"mean_token_accuracy": 0.9911327123641968,
"num_tokens": 3639168.0,
"step": 1980
},
{
"epoch": 7.271062271062271,
"grad_norm": 0.5859375,
"learning_rate": 6.967165232530237e-05,
"loss": 0.0216,
"mean_token_accuracy": 0.991173791885376,
"num_tokens": 3647754.0,
"step": 1985
},
{
"epoch": 7.289377289377289,
"grad_norm": 0.2080078125,
"learning_rate": 6.958814276500599e-05,
"loss": 0.0162,
"mean_token_accuracy": 0.9926367402076721,
"num_tokens": 3657191.0,
"step": 1990
},
{
"epoch": 7.3076923076923075,
"grad_norm": 0.1357421875,
"learning_rate": 6.950448439614973e-05,
"loss": 0.0163,
"mean_token_accuracy": 0.9930072546005249,
"num_tokens": 3667054.0,
"step": 1995
},
{
"epoch": 7.326007326007326,
"grad_norm": 0.384765625,
"learning_rate": 6.942067798594726e-05,
"loss": 0.0211,
"mean_token_accuracy": 0.9917723655700683,
"num_tokens": 3677025.0,
"step": 2000
},
{
"epoch": 7.344322344322344,
"grad_norm": 2.3125,
"learning_rate": 6.933672430296986e-05,
"loss": 0.0389,
"mean_token_accuracy": 0.9862228393554687,
"num_tokens": 3685112.0,
"step": 2005
},
{
"epoch": 7.362637362637362,
"grad_norm": 0.126953125,
"learning_rate": 6.925262411713945e-05,
"loss": 0.0225,
"mean_token_accuracy": 0.9903972864151,
"num_tokens": 3694862.0,
"step": 2010
},
{
"epoch": 7.380952380952381,
"grad_norm": 0.26171875,
"learning_rate": 6.916837819972149e-05,
"loss": 0.0274,
"mean_token_accuracy": 0.9918236613273621,
"num_tokens": 3704805.0,
"step": 2015
},
{
"epoch": 7.3992673992674,
"grad_norm": 0.08349609375,
"learning_rate": 6.908398732331793e-05,
"loss": 0.017,
"mean_token_accuracy": 0.9925737380981445,
"num_tokens": 3714304.0,
"step": 2020
},
{
"epoch": 7.417582417582418,
"grad_norm": 0.375,
"learning_rate": 6.899945226186005e-05,
"loss": 0.0219,
"mean_token_accuracy": 0.9916564226150513,
"num_tokens": 3723305.0,
"step": 2025
},
{
"epoch": 7.435897435897436,
"grad_norm": 1.15625,
"learning_rate": 6.89147737906015e-05,
"loss": 0.0269,
"mean_token_accuracy": 0.9908839344978333,
"num_tokens": 3732073.0,
"step": 2030
},
{
"epoch": 7.454212454212454,
"grad_norm": 0.091796875,
"learning_rate": 6.882995268611106e-05,
"loss": 0.0226,
"mean_token_accuracy": 0.9915896058082581,
"num_tokens": 3741071.0,
"step": 2035
},
{
"epoch": 7.472527472527473,
"grad_norm": 0.62890625,
"learning_rate": 6.874498972626559e-05,
"loss": 0.0204,
"mean_token_accuracy": 0.9924831748008728,
"num_tokens": 3750420.0,
"step": 2040
},
{
"epoch": 7.490842490842491,
"grad_norm": 0.15625,
"learning_rate": 6.865988569024286e-05,
"loss": 0.03,
"mean_token_accuracy": 0.9895938873291016,
"num_tokens": 3760153.0,
"step": 2045
},
{
"epoch": 7.509157509157509,
"grad_norm": 0.416015625,
"learning_rate": 6.857464135851444e-05,
"loss": 0.0304,
"mean_token_accuracy": 0.9892897367477417,
"num_tokens": 3768898.0,
"step": 2050
},
{
"epoch": 7.527472527472527,
"grad_norm": 0.2578125,
"learning_rate": 6.848925751283853e-05,
"loss": 0.0203,
"mean_token_accuracy": 0.9921239137649536,
"num_tokens": 3778718.0,
"step": 2055
},
{
"epoch": 7.545787545787546,
"grad_norm": 0.06298828125,
"learning_rate": 6.840373493625274e-05,
"loss": 0.0203,
"mean_token_accuracy": 0.9922136068344116,
"num_tokens": 3788022.0,
"step": 2060
},
{
"epoch": 7.564102564102564,
"grad_norm": 0.69921875,
"learning_rate": 6.831807441306698e-05,
"loss": 0.036,
"mean_token_accuracy": 0.988727355003357,
"num_tokens": 3797131.0,
"step": 2065
},
{
"epoch": 7.582417582417582,
"grad_norm": 0.263671875,
"learning_rate": 6.823227672885628e-05,
"loss": 0.0319,
"mean_token_accuracy": 0.9886006474494934,
"num_tokens": 3806894.0,
"step": 2070
},
{
"epoch": 7.6007326007326,
"grad_norm": 0.435546875,
"learning_rate": 6.814634267045346e-05,
"loss": 0.0245,
"mean_token_accuracy": 0.9917014598846435,
"num_tokens": 3815606.0,
"step": 2075
},
{
"epoch": 7.619047619047619,
"grad_norm": 0.26171875,
"learning_rate": 6.806027302594206e-05,
"loss": 0.0308,
"mean_token_accuracy": 0.988996148109436,
"num_tokens": 3824337.0,
"step": 2080
},
{
"epoch": 7.637362637362637,
"grad_norm": 0.1240234375,
"learning_rate": 6.797406858464905e-05,
"loss": 0.0289,
"mean_token_accuracy": 0.9898035883903503,
"num_tokens": 3833957.0,
"step": 2085
},
{
"epoch": 7.655677655677656,
"grad_norm": 0.369140625,
"learning_rate": 6.788773013713758e-05,
"loss": 0.0265,
"mean_token_accuracy": 0.9909451246261597,
"num_tokens": 3843114.0,
"step": 2090
},
{
"epoch": 7.673992673992674,
"grad_norm": 0.62109375,
"learning_rate": 6.780125847519971e-05,
"loss": 0.0206,
"mean_token_accuracy": 0.9923561453819275,
"num_tokens": 3852882.0,
"step": 2095
},
{
"epoch": 7.6923076923076925,
"grad_norm": 0.3984375,
"learning_rate": 6.771465439184927e-05,
"loss": 0.026,
"mean_token_accuracy": 0.9902096509933471,
"num_tokens": 3862419.0,
"step": 2100
},
{
"epoch": 7.710622710622711,
"grad_norm": 0.69921875,
"learning_rate": 6.762791868131442e-05,
"loss": 0.0226,
"mean_token_accuracy": 0.9907670021057129,
"num_tokens": 3871716.0,
"step": 2105
},
{
"epoch": 7.728937728937729,
"grad_norm": 0.10888671875,
"learning_rate": 6.754105213903045e-05,
"loss": 0.023,
"mean_token_accuracy": 0.9909697294235229,
"num_tokens": 3880542.0,
"step": 2110
},
{
"epoch": 7.747252747252747,
"grad_norm": 0.1943359375,
"learning_rate": 6.745405556163253e-05,
"loss": 0.0346,
"mean_token_accuracy": 0.9859986186027527,
"num_tokens": 3889484.0,
"step": 2115
},
{
"epoch": 7.7655677655677655,
"grad_norm": 0.37890625,
"learning_rate": 6.736692974694833e-05,
"loss": 0.022,
"mean_token_accuracy": 0.9915480494499207,
"num_tokens": 3898636.0,
"step": 2120
},
{
"epoch": 7.783882783882784,
"grad_norm": 0.41796875,
"learning_rate": 6.727967549399072e-05,
"loss": 0.0253,
"mean_token_accuracy": 0.9902077794075013,
"num_tokens": 3907808.0,
"step": 2125
},
{
"epoch": 7.802197802197802,
"grad_norm": 0.1796875,
"learning_rate": 6.719229360295044e-05,
"loss": 0.0352,
"mean_token_accuracy": 0.9878880977630615,
"num_tokens": 3916667.0,
"step": 2130
},
{
"epoch": 7.82051282051282,
"grad_norm": 0.26171875,
"learning_rate": 6.710478487518882e-05,
"loss": 0.0247,
"mean_token_accuracy": 0.9887702345848084,
"num_tokens": 3926085.0,
"step": 2135
},
{
"epoch": 7.8388278388278385,
"grad_norm": 0.5703125,
"learning_rate": 6.701715011323034e-05,
"loss": 0.0318,
"mean_token_accuracy": 0.9896463632583619,
"num_tokens": 3934900.0,
"step": 2140
},
{
"epoch": 7.857142857142857,
"grad_norm": 0.09326171875,
"learning_rate": 6.692939012075532e-05,
"loss": 0.0124,
"mean_token_accuracy": 0.9943976402282715,
"num_tokens": 3944656.0,
"step": 2145
},
{
"epoch": 7.875457875457876,
"grad_norm": 0.291015625,
"learning_rate": 6.684150570259256e-05,
"loss": 0.0216,
"mean_token_accuracy": 0.9906636476516724,
"num_tokens": 3954257.0,
"step": 2150
},
{
"epoch": 7.893772893772894,
"grad_norm": 0.3984375,
"learning_rate": 6.675349766471193e-05,
"loss": 0.0266,
"mean_token_accuracy": 0.9893643379211425,
"num_tokens": 3962549.0,
"step": 2155
},
{
"epoch": 7.912087912087912,
"grad_norm": 0.3125,
"learning_rate": 6.6665366814217e-05,
"loss": 0.0203,
"mean_token_accuracy": 0.9919966578483581,
"num_tokens": 3971970.0,
"step": 2160
}
],
"logging_steps": 5,
"max_steps": 5460,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.623835599352627e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}