gpt-oss-20b-math7k / trainer_state.json
HectorHe's picture
Model save
315932c verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 645,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004662004662004662,
"grad_norm": 206.61297042982778,
"learning_rate": 0.0,
"loss": 6.0047,
"mean_token_accuracy": 0.26166820526123047,
"num_tokens": 9481.0,
"step": 1
},
{
"epoch": 0.009324009324009324,
"grad_norm": 237.90030179764935,
"learning_rate": 1.5384615384615387e-07,
"loss": 5.9896,
"mean_token_accuracy": 0.26549550890922546,
"num_tokens": 18486.0,
"step": 2
},
{
"epoch": 0.013986013986013986,
"grad_norm": 191.565778458644,
"learning_rate": 3.0769230769230774e-07,
"loss": 5.8697,
"mean_token_accuracy": 0.26646704971790314,
"num_tokens": 28054.0,
"step": 3
},
{
"epoch": 0.018648018648018648,
"grad_norm": 173.73423360043478,
"learning_rate": 4.615384615384616e-07,
"loss": 5.7664,
"mean_token_accuracy": 0.28080685436725616,
"num_tokens": 38100.0,
"step": 4
},
{
"epoch": 0.023310023310023312,
"grad_norm": 188.53311568741194,
"learning_rate": 6.153846153846155e-07,
"loss": 6.1116,
"mean_token_accuracy": 0.24714654684066772,
"num_tokens": 46674.0,
"step": 5
},
{
"epoch": 0.027972027972027972,
"grad_norm": 138.50157116139866,
"learning_rate": 7.692307692307694e-07,
"loss": 5.7375,
"mean_token_accuracy": 0.2682064026594162,
"num_tokens": 55892.0,
"step": 6
},
{
"epoch": 0.03263403263403263,
"grad_norm": 137.331644930486,
"learning_rate": 9.230769230769232e-07,
"loss": 5.7865,
"mean_token_accuracy": 0.2567756175994873,
"num_tokens": 64557.0,
"step": 7
},
{
"epoch": 0.037296037296037296,
"grad_norm": 113.93237748074404,
"learning_rate": 1.076923076923077e-06,
"loss": 5.4186,
"mean_token_accuracy": 0.2943734973669052,
"num_tokens": 74534.0,
"step": 8
},
{
"epoch": 0.04195804195804196,
"grad_norm": 97.42189347532377,
"learning_rate": 1.230769230769231e-06,
"loss": 5.1123,
"mean_token_accuracy": 0.29780860245227814,
"num_tokens": 84839.0,
"step": 9
},
{
"epoch": 0.046620046620046623,
"grad_norm": 115.13017773253503,
"learning_rate": 1.3846153846153848e-06,
"loss": 5.0274,
"mean_token_accuracy": 0.30082090198993683,
"num_tokens": 94486.0,
"step": 10
},
{
"epoch": 0.05128205128205128,
"grad_norm": 92.86208928347652,
"learning_rate": 1.5384615384615387e-06,
"loss": 4.8261,
"mean_token_accuracy": 0.31195709109306335,
"num_tokens": 103901.0,
"step": 11
},
{
"epoch": 0.055944055944055944,
"grad_norm": 74.98678911328179,
"learning_rate": 1.6923076923076926e-06,
"loss": 4.3226,
"mean_token_accuracy": 0.332173153758049,
"num_tokens": 113421.0,
"step": 12
},
{
"epoch": 0.06060606060606061,
"grad_norm": 65.42071488824216,
"learning_rate": 1.8461538461538465e-06,
"loss": 4.1213,
"mean_token_accuracy": 0.34372538328170776,
"num_tokens": 122888.0,
"step": 13
},
{
"epoch": 0.06526806526806526,
"grad_norm": 51.76685372566078,
"learning_rate": 2.0000000000000003e-06,
"loss": 3.8684,
"mean_token_accuracy": 0.36909155547618866,
"num_tokens": 132046.0,
"step": 14
},
{
"epoch": 0.06993006993006994,
"grad_norm": 47.330915768289,
"learning_rate": 2.153846153846154e-06,
"loss": 3.5595,
"mean_token_accuracy": 0.4020439237356186,
"num_tokens": 141586.0,
"step": 15
},
{
"epoch": 0.07459207459207459,
"grad_norm": 39.592533375514385,
"learning_rate": 2.307692307692308e-06,
"loss": 3.0525,
"mean_token_accuracy": 0.46130137145519257,
"num_tokens": 151366.0,
"step": 16
},
{
"epoch": 0.07925407925407925,
"grad_norm": 38.681099075404326,
"learning_rate": 2.461538461538462e-06,
"loss": 2.8418,
"mean_token_accuracy": 0.47952745854854584,
"num_tokens": 160361.0,
"step": 17
},
{
"epoch": 0.08391608391608392,
"grad_norm": 26.957564440869795,
"learning_rate": 2.615384615384616e-06,
"loss": 2.4668,
"mean_token_accuracy": 0.5341931283473969,
"num_tokens": 170716.0,
"step": 18
},
{
"epoch": 0.08857808857808858,
"grad_norm": 29.725894005554363,
"learning_rate": 2.7692307692307697e-06,
"loss": 2.4476,
"mean_token_accuracy": 0.530603438615799,
"num_tokens": 179378.0,
"step": 19
},
{
"epoch": 0.09324009324009325,
"grad_norm": 22.45654494245414,
"learning_rate": 2.9230769230769236e-06,
"loss": 1.9608,
"mean_token_accuracy": 0.6144967377185822,
"num_tokens": 189155.0,
"step": 20
},
{
"epoch": 0.0979020979020979,
"grad_norm": 22.420677559887096,
"learning_rate": 3.0769230769230774e-06,
"loss": 1.8408,
"mean_token_accuracy": 0.6224100291728973,
"num_tokens": 198128.0,
"step": 21
},
{
"epoch": 0.10256410256410256,
"grad_norm": 20.045412685294725,
"learning_rate": 3.2307692307692313e-06,
"loss": 1.5358,
"mean_token_accuracy": 0.668096661567688,
"num_tokens": 207526.0,
"step": 22
},
{
"epoch": 0.10722610722610723,
"grad_norm": 17.826429643351467,
"learning_rate": 3.384615384615385e-06,
"loss": 1.3385,
"mean_token_accuracy": 0.7064461410045624,
"num_tokens": 217437.0,
"step": 23
},
{
"epoch": 0.11188811188811189,
"grad_norm": 14.795248732817978,
"learning_rate": 3.538461538461539e-06,
"loss": 1.1874,
"mean_token_accuracy": 0.728604257106781,
"num_tokens": 226725.0,
"step": 24
},
{
"epoch": 0.11655011655011654,
"grad_norm": 16.367290617286876,
"learning_rate": 3.692307692307693e-06,
"loss": 0.9682,
"mean_token_accuracy": 0.7707604169845581,
"num_tokens": 235721.0,
"step": 25
},
{
"epoch": 0.12121212121212122,
"grad_norm": 9.588756303110177,
"learning_rate": 3.846153846153847e-06,
"loss": 0.8884,
"mean_token_accuracy": 0.7929337918758392,
"num_tokens": 244662.0,
"step": 26
},
{
"epoch": 0.1258741258741259,
"grad_norm": 7.015049125629419,
"learning_rate": 4.000000000000001e-06,
"loss": 0.7978,
"mean_token_accuracy": 0.8093675374984741,
"num_tokens": 254047.0,
"step": 27
},
{
"epoch": 0.13053613053613053,
"grad_norm": 5.597049269802009,
"learning_rate": 4.1538461538461545e-06,
"loss": 0.7933,
"mean_token_accuracy": 0.8108646273612976,
"num_tokens": 262770.0,
"step": 28
},
{
"epoch": 0.1351981351981352,
"grad_norm": 12.495340171422823,
"learning_rate": 4.307692307692308e-06,
"loss": 0.7206,
"mean_token_accuracy": 0.8183692693710327,
"num_tokens": 272936.0,
"step": 29
},
{
"epoch": 0.13986013986013987,
"grad_norm": 4.8146353297051325,
"learning_rate": 4.461538461538462e-06,
"loss": 0.7378,
"mean_token_accuracy": 0.8188252151012421,
"num_tokens": 282677.0,
"step": 30
},
{
"epoch": 0.1445221445221445,
"grad_norm": 4.397488614777519,
"learning_rate": 4.615384615384616e-06,
"loss": 0.7168,
"mean_token_accuracy": 0.8155234456062317,
"num_tokens": 291851.0,
"step": 31
},
{
"epoch": 0.14918414918414918,
"grad_norm": 3.8407813105595707,
"learning_rate": 4.76923076923077e-06,
"loss": 0.653,
"mean_token_accuracy": 0.8314312100410461,
"num_tokens": 300775.0,
"step": 32
},
{
"epoch": 0.15384615384615385,
"grad_norm": 3.9490135740465506,
"learning_rate": 4.923076923076924e-06,
"loss": 0.6982,
"mean_token_accuracy": 0.8216440379619598,
"num_tokens": 309003.0,
"step": 33
},
{
"epoch": 0.1585081585081585,
"grad_norm": 3.1111593206661423,
"learning_rate": 5.076923076923077e-06,
"loss": 0.6367,
"mean_token_accuracy": 0.8357088267803192,
"num_tokens": 318722.0,
"step": 34
},
{
"epoch": 0.16317016317016317,
"grad_norm": 3.8540200089616787,
"learning_rate": 5.230769230769232e-06,
"loss": 0.6725,
"mean_token_accuracy": 0.8249399065971375,
"num_tokens": 328363.0,
"step": 35
},
{
"epoch": 0.16783216783216784,
"grad_norm": 3.2279851487302533,
"learning_rate": 5.384615384615385e-06,
"loss": 0.6361,
"mean_token_accuracy": 0.8352257907390594,
"num_tokens": 337298.0,
"step": 36
},
{
"epoch": 0.17249417249417248,
"grad_norm": 3.2846323967634277,
"learning_rate": 5.538461538461539e-06,
"loss": 0.585,
"mean_token_accuracy": 0.8430881202220917,
"num_tokens": 346068.0,
"step": 37
},
{
"epoch": 0.17715617715617715,
"grad_norm": 3.201411224226897,
"learning_rate": 5.692307692307692e-06,
"loss": 0.6067,
"mean_token_accuracy": 0.8457746207714081,
"num_tokens": 354566.0,
"step": 38
},
{
"epoch": 0.18181818181818182,
"grad_norm": 3.603688713367759,
"learning_rate": 5.846153846153847e-06,
"loss": 0.5355,
"mean_token_accuracy": 0.860386848449707,
"num_tokens": 363662.0,
"step": 39
},
{
"epoch": 0.1864801864801865,
"grad_norm": 3.095744451514778,
"learning_rate": 6e-06,
"loss": 0.6101,
"mean_token_accuracy": 0.8404548466205597,
"num_tokens": 372778.0,
"step": 40
},
{
"epoch": 0.19114219114219114,
"grad_norm": 2.851157082704997,
"learning_rate": 6.153846153846155e-06,
"loss": 0.5961,
"mean_token_accuracy": 0.8459599018096924,
"num_tokens": 381990.0,
"step": 41
},
{
"epoch": 0.1958041958041958,
"grad_norm": 2.920481788097288,
"learning_rate": 6.307692307692308e-06,
"loss": 0.6033,
"mean_token_accuracy": 0.8439716398715973,
"num_tokens": 390764.0,
"step": 42
},
{
"epoch": 0.20046620046620048,
"grad_norm": 2.6590805207702877,
"learning_rate": 6.461538461538463e-06,
"loss": 0.5411,
"mean_token_accuracy": 0.8539808988571167,
"num_tokens": 399544.0,
"step": 43
},
{
"epoch": 0.20512820512820512,
"grad_norm": 2.869045415372655,
"learning_rate": 6.615384615384616e-06,
"loss": 0.5994,
"mean_token_accuracy": 0.8429957032203674,
"num_tokens": 408895.0,
"step": 44
},
{
"epoch": 0.2097902097902098,
"grad_norm": 2.5632574285139693,
"learning_rate": 6.76923076923077e-06,
"loss": 0.5474,
"mean_token_accuracy": 0.8559859097003937,
"num_tokens": 417528.0,
"step": 45
},
{
"epoch": 0.21445221445221446,
"grad_norm": 3.0716143926843213,
"learning_rate": 6.923076923076923e-06,
"loss": 0.5659,
"mean_token_accuracy": 0.8473467230796814,
"num_tokens": 426632.0,
"step": 46
},
{
"epoch": 0.2191142191142191,
"grad_norm": 2.816132536269355,
"learning_rate": 7.076923076923078e-06,
"loss": 0.6149,
"mean_token_accuracy": 0.8401601612567902,
"num_tokens": 435698.0,
"step": 47
},
{
"epoch": 0.22377622377622378,
"grad_norm": 2.4694488487146544,
"learning_rate": 7.230769230769231e-06,
"loss": 0.5604,
"mean_token_accuracy": 0.8553016185760498,
"num_tokens": 444865.0,
"step": 48
},
{
"epoch": 0.22843822843822845,
"grad_norm": 2.534416309991065,
"learning_rate": 7.384615384615386e-06,
"loss": 0.5465,
"mean_token_accuracy": 0.8501911461353302,
"num_tokens": 454293.0,
"step": 49
},
{
"epoch": 0.2331002331002331,
"grad_norm": 2.5987473992591994,
"learning_rate": 7.538461538461539e-06,
"loss": 0.5779,
"mean_token_accuracy": 0.846989244222641,
"num_tokens": 463669.0,
"step": 50
},
{
"epoch": 0.23776223776223776,
"grad_norm": 2.9304334416656035,
"learning_rate": 7.692307692307694e-06,
"loss": 0.5951,
"mean_token_accuracy": 0.842944860458374,
"num_tokens": 472913.0,
"step": 51
},
{
"epoch": 0.24242424242424243,
"grad_norm": 2.557007651858603,
"learning_rate": 7.846153846153847e-06,
"loss": 0.5522,
"mean_token_accuracy": 0.8499407768249512,
"num_tokens": 482966.0,
"step": 52
},
{
"epoch": 0.24708624708624707,
"grad_norm": 2.413067290036643,
"learning_rate": 8.000000000000001e-06,
"loss": 0.556,
"mean_token_accuracy": 0.8527302443981171,
"num_tokens": 491361.0,
"step": 53
},
{
"epoch": 0.2517482517482518,
"grad_norm": 2.3167133059321765,
"learning_rate": 8.153846153846154e-06,
"loss": 0.5474,
"mean_token_accuracy": 0.8524684011936188,
"num_tokens": 500582.0,
"step": 54
},
{
"epoch": 0.2564102564102564,
"grad_norm": 2.459059208267421,
"learning_rate": 8.307692307692309e-06,
"loss": 0.5474,
"mean_token_accuracy": 0.8470076322555542,
"num_tokens": 510444.0,
"step": 55
},
{
"epoch": 0.26107226107226106,
"grad_norm": 2.3850596138441955,
"learning_rate": 8.461538461538462e-06,
"loss": 0.5477,
"mean_token_accuracy": 0.8558304607868195,
"num_tokens": 519595.0,
"step": 56
},
{
"epoch": 0.26573426573426573,
"grad_norm": 2.425014600765099,
"learning_rate": 8.615384615384617e-06,
"loss": 0.5312,
"mean_token_accuracy": 0.856484979391098,
"num_tokens": 529256.0,
"step": 57
},
{
"epoch": 0.2703962703962704,
"grad_norm": 2.41544921845608,
"learning_rate": 8.76923076923077e-06,
"loss": 0.5581,
"mean_token_accuracy": 0.8501502573490143,
"num_tokens": 538883.0,
"step": 58
},
{
"epoch": 0.27505827505827507,
"grad_norm": 2.3601513682778537,
"learning_rate": 8.923076923076925e-06,
"loss": 0.5381,
"mean_token_accuracy": 0.8557923436164856,
"num_tokens": 547820.0,
"step": 59
},
{
"epoch": 0.27972027972027974,
"grad_norm": 2.5228638625713815,
"learning_rate": 9.076923076923078e-06,
"loss": 0.5478,
"mean_token_accuracy": 0.8510215282440186,
"num_tokens": 556572.0,
"step": 60
},
{
"epoch": 0.28438228438228436,
"grad_norm": 2.4788861740314925,
"learning_rate": 9.230769230769232e-06,
"loss": 0.5702,
"mean_token_accuracy": 0.8453376293182373,
"num_tokens": 565988.0,
"step": 61
},
{
"epoch": 0.289044289044289,
"grad_norm": 2.443670735215224,
"learning_rate": 9.384615384615385e-06,
"loss": 0.5701,
"mean_token_accuracy": 0.8493904769420624,
"num_tokens": 575388.0,
"step": 62
},
{
"epoch": 0.2937062937062937,
"grad_norm": 2.51188327450779,
"learning_rate": 9.53846153846154e-06,
"loss": 0.6161,
"mean_token_accuracy": 0.8381120562553406,
"num_tokens": 584991.0,
"step": 63
},
{
"epoch": 0.29836829836829837,
"grad_norm": 2.4568944814979763,
"learning_rate": 9.692307692307693e-06,
"loss": 0.5544,
"mean_token_accuracy": 0.8492381870746613,
"num_tokens": 594666.0,
"step": 64
},
{
"epoch": 0.30303030303030304,
"grad_norm": 2.3543409090851872,
"learning_rate": 9.846153846153848e-06,
"loss": 0.5627,
"mean_token_accuracy": 0.8481525778770447,
"num_tokens": 603192.0,
"step": 65
},
{
"epoch": 0.3076923076923077,
"grad_norm": 2.516015882056836,
"learning_rate": 1e-05,
"loss": 0.5518,
"mean_token_accuracy": 0.8509586453437805,
"num_tokens": 612501.0,
"step": 66
},
{
"epoch": 0.3123543123543124,
"grad_norm": 2.5702229906928626,
"learning_rate": 9.999933987646821e-06,
"loss": 0.5652,
"mean_token_accuracy": 0.8515127599239349,
"num_tokens": 621564.0,
"step": 67
},
{
"epoch": 0.317016317016317,
"grad_norm": 2.311235457272914,
"learning_rate": 9.99973595252401e-06,
"loss": 0.5007,
"mean_token_accuracy": 0.8656518757343292,
"num_tokens": 630726.0,
"step": 68
},
{
"epoch": 0.32167832167832167,
"grad_norm": 2.3626760252822736,
"learning_rate": 9.999405900441683e-06,
"loss": 0.5291,
"mean_token_accuracy": 0.8573567271232605,
"num_tokens": 639950.0,
"step": 69
},
{
"epoch": 0.32634032634032634,
"grad_norm": 2.5106829719874733,
"learning_rate": 9.998943841083179e-06,
"loss": 0.5333,
"mean_token_accuracy": 0.8539510667324066,
"num_tokens": 649402.0,
"step": 70
},
{
"epoch": 0.331002331002331,
"grad_norm": 2.374126097719514,
"learning_rate": 9.99834978800478e-06,
"loss": 0.5136,
"mean_token_accuracy": 0.8584230840206146,
"num_tokens": 658474.0,
"step": 71
},
{
"epoch": 0.3356643356643357,
"grad_norm": 2.2734777943143945,
"learning_rate": 9.997623758635298e-06,
"loss": 0.5469,
"mean_token_accuracy": 0.8496910333633423,
"num_tokens": 667908.0,
"step": 72
},
{
"epoch": 0.34032634032634035,
"grad_norm": 2.5255087017576043,
"learning_rate": 9.996765774275587e-06,
"loss": 0.527,
"mean_token_accuracy": 0.8526964783668518,
"num_tokens": 677532.0,
"step": 73
},
{
"epoch": 0.34498834498834496,
"grad_norm": 2.2143213966162216,
"learning_rate": 9.995775860097897e-06,
"loss": 0.5219,
"mean_token_accuracy": 0.8588562309741974,
"num_tokens": 686796.0,
"step": 74
},
{
"epoch": 0.34965034965034963,
"grad_norm": 2.1934169241509665,
"learning_rate": 9.994654045145142e-06,
"loss": 0.5401,
"mean_token_accuracy": 0.8564260303974152,
"num_tokens": 695776.0,
"step": 75
},
{
"epoch": 0.3543123543123543,
"grad_norm": 2.153560775154425,
"learning_rate": 9.993400362330058e-06,
"loss": 0.5446,
"mean_token_accuracy": 0.8583995997905731,
"num_tokens": 704530.0,
"step": 76
},
{
"epoch": 0.358974358974359,
"grad_norm": 2.128801081198065,
"learning_rate": 9.992014848434221e-06,
"loss": 0.5363,
"mean_token_accuracy": 0.8531051576137543,
"num_tokens": 714123.0,
"step": 77
},
{
"epoch": 0.36363636363636365,
"grad_norm": 2.312071712839466,
"learning_rate": 9.990497544106981e-06,
"loss": 0.5344,
"mean_token_accuracy": 0.8557191491127014,
"num_tokens": 723719.0,
"step": 78
},
{
"epoch": 0.3682983682983683,
"grad_norm": 2.4795067817061573,
"learning_rate": 9.988848493864259e-06,
"loss": 0.5597,
"mean_token_accuracy": 0.8507181704044342,
"num_tokens": 733447.0,
"step": 79
},
{
"epoch": 0.372960372960373,
"grad_norm": 2.2998442504007452,
"learning_rate": 9.987067746087251e-06,
"loss": 0.5467,
"mean_token_accuracy": 0.8498886525630951,
"num_tokens": 742567.0,
"step": 80
},
{
"epoch": 0.3776223776223776,
"grad_norm": 2.220780163026348,
"learning_rate": 9.985155353021004e-06,
"loss": 0.538,
"mean_token_accuracy": 0.8577711582183838,
"num_tokens": 751488.0,
"step": 81
},
{
"epoch": 0.3822843822843823,
"grad_norm": 2.23769737502521,
"learning_rate": 9.983111370772877e-06,
"loss": 0.5375,
"mean_token_accuracy": 0.8561404347419739,
"num_tokens": 760518.0,
"step": 82
},
{
"epoch": 0.38694638694638694,
"grad_norm": 2.3468431520556243,
"learning_rate": 9.980935859310907e-06,
"loss": 0.5071,
"mean_token_accuracy": 0.8601345419883728,
"num_tokens": 769855.0,
"step": 83
},
{
"epoch": 0.3916083916083916,
"grad_norm": 2.420088401775177,
"learning_rate": 9.97862888246204e-06,
"loss": 0.5517,
"mean_token_accuracy": 0.8494808971881866,
"num_tokens": 780309.0,
"step": 84
},
{
"epoch": 0.3962703962703963,
"grad_norm": 2.1832026417273864,
"learning_rate": 9.976190507910265e-06,
"loss": 0.5007,
"mean_token_accuracy": 0.8591891229152679,
"num_tokens": 790560.0,
"step": 85
},
{
"epoch": 0.40093240093240096,
"grad_norm": 2.217081826161623,
"learning_rate": 9.97362080719462e-06,
"loss": 0.4945,
"mean_token_accuracy": 0.8628792762756348,
"num_tokens": 799972.0,
"step": 86
},
{
"epoch": 0.40559440559440557,
"grad_norm": 2.1021603924715877,
"learning_rate": 9.970919855707103e-06,
"loss": 0.5149,
"mean_token_accuracy": 0.8582651615142822,
"num_tokens": 808998.0,
"step": 87
},
{
"epoch": 0.41025641025641024,
"grad_norm": 2.2137525851668163,
"learning_rate": 9.968087732690452e-06,
"loss": 0.5643,
"mean_token_accuracy": 0.8490354120731354,
"num_tokens": 818322.0,
"step": 88
},
{
"epoch": 0.4149184149184149,
"grad_norm": 2.1357303202881193,
"learning_rate": 9.965124521235827e-06,
"loss": 0.5465,
"mean_token_accuracy": 0.8538801968097687,
"num_tokens": 828468.0,
"step": 89
},
{
"epoch": 0.4195804195804196,
"grad_norm": 2.144386382606401,
"learning_rate": 9.962030308280363e-06,
"loss": 0.5598,
"mean_token_accuracy": 0.8531892895698547,
"num_tokens": 836909.0,
"step": 90
},
{
"epoch": 0.42424242424242425,
"grad_norm": 2.120216517402183,
"learning_rate": 9.958805184604631e-06,
"loss": 0.508,
"mean_token_accuracy": 0.8650859892368317,
"num_tokens": 845651.0,
"step": 91
},
{
"epoch": 0.4289044289044289,
"grad_norm": 2.3227262446712014,
"learning_rate": 9.955449244829966e-06,
"loss": 0.5533,
"mean_token_accuracy": 0.850572019815445,
"num_tokens": 854858.0,
"step": 92
},
{
"epoch": 0.43356643356643354,
"grad_norm": 2.0524529217213985,
"learning_rate": 9.95196258741569e-06,
"loss": 0.5097,
"mean_token_accuracy": 0.8616639971733093,
"num_tokens": 863666.0,
"step": 93
},
{
"epoch": 0.4382284382284382,
"grad_norm": 2.178830324772212,
"learning_rate": 9.948345314656234e-06,
"loss": 0.513,
"mean_token_accuracy": 0.8623040318489075,
"num_tokens": 873307.0,
"step": 94
},
{
"epoch": 0.4428904428904429,
"grad_norm": 2.178878661646046,
"learning_rate": 9.94459753267812e-06,
"loss": 0.5377,
"mean_token_accuracy": 0.851812869310379,
"num_tokens": 882337.0,
"step": 95
},
{
"epoch": 0.44755244755244755,
"grad_norm": 2.2151123798224366,
"learning_rate": 9.94071935143687e-06,
"loss": 0.527,
"mean_token_accuracy": 0.8585948050022125,
"num_tokens": 891528.0,
"step": 96
},
{
"epoch": 0.4522144522144522,
"grad_norm": 2.153917534671565,
"learning_rate": 9.936710884713752e-06,
"loss": 0.5136,
"mean_token_accuracy": 0.8649525940418243,
"num_tokens": 900782.0,
"step": 97
},
{
"epoch": 0.4568764568764569,
"grad_norm": 2.0747029950901057,
"learning_rate": 9.932572250112469e-06,
"loss": 0.5389,
"mean_token_accuracy": 0.8555485904216766,
"num_tokens": 909811.0,
"step": 98
},
{
"epoch": 0.46153846153846156,
"grad_norm": 1.983982143931433,
"learning_rate": 9.92830356905569e-06,
"loss": 0.4814,
"mean_token_accuracy": 0.8693816661834717,
"num_tokens": 919377.0,
"step": 99
},
{
"epoch": 0.4662004662004662,
"grad_norm": 2.1327543713500603,
"learning_rate": 9.923904966781496e-06,
"loss": 0.5515,
"mean_token_accuracy": 0.8478606641292572,
"num_tokens": 929132.0,
"step": 100
},
{
"epoch": 0.47086247086247085,
"grad_norm": 2.396154142849225,
"learning_rate": 9.919376572339703e-06,
"loss": 0.5521,
"mean_token_accuracy": 0.8526241481304169,
"num_tokens": 938942.0,
"step": 101
},
{
"epoch": 0.4755244755244755,
"grad_norm": 2.1649063337799603,
"learning_rate": 9.914718518588076e-06,
"loss": 0.5689,
"mean_token_accuracy": 0.8482916951179504,
"num_tokens": 948271.0,
"step": 102
},
{
"epoch": 0.4801864801864802,
"grad_norm": 2.0645529438999506,
"learning_rate": 9.909930942188436e-06,
"loss": 0.5219,
"mean_token_accuracy": 0.8553557991981506,
"num_tokens": 957852.0,
"step": 103
},
{
"epoch": 0.48484848484848486,
"grad_norm": 2.221579905396912,
"learning_rate": 9.905013983602639e-06,
"loss": 0.5092,
"mean_token_accuracy": 0.8628838658332825,
"num_tokens": 967455.0,
"step": 104
},
{
"epoch": 0.48951048951048953,
"grad_norm": 2.1265754888438533,
"learning_rate": 9.899967787088468e-06,
"loss": 0.5494,
"mean_token_accuracy": 0.849203497171402,
"num_tokens": 977978.0,
"step": 105
},
{
"epoch": 0.49417249417249415,
"grad_norm": 2.1206984788420145,
"learning_rate": 9.89479250069539e-06,
"loss": 0.513,
"mean_token_accuracy": 0.8621029257774353,
"num_tokens": 987423.0,
"step": 106
},
{
"epoch": 0.4988344988344988,
"grad_norm": 2.2958737279282917,
"learning_rate": 9.889488276260222e-06,
"loss": 0.5562,
"mean_token_accuracy": 0.8555387854576111,
"num_tokens": 996656.0,
"step": 107
},
{
"epoch": 0.5034965034965035,
"grad_norm": 2.2321368229532443,
"learning_rate": 9.88405526940267e-06,
"loss": 0.5579,
"mean_token_accuracy": 0.854218989610672,
"num_tokens": 1005952.0,
"step": 108
},
{
"epoch": 0.5081585081585082,
"grad_norm": 2.145064834641496,
"learning_rate": 9.87849363952076e-06,
"loss": 0.525,
"mean_token_accuracy": 0.8584134578704834,
"num_tokens": 1015020.0,
"step": 109
},
{
"epoch": 0.5128205128205128,
"grad_norm": 2.2441739750519316,
"learning_rate": 9.872803549786177e-06,
"loss": 0.542,
"mean_token_accuracy": 0.8544652462005615,
"num_tokens": 1023815.0,
"step": 110
},
{
"epoch": 0.5174825174825175,
"grad_norm": 2.259110803475594,
"learning_rate": 9.866985167139453e-06,
"loss": 0.5356,
"mean_token_accuracy": 0.8533321619033813,
"num_tokens": 1032375.0,
"step": 111
},
{
"epoch": 0.5221445221445221,
"grad_norm": 2.1512979801268095,
"learning_rate": 9.861038662285093e-06,
"loss": 0.5272,
"mean_token_accuracy": 0.8559562265872955,
"num_tokens": 1041287.0,
"step": 112
},
{
"epoch": 0.5268065268065268,
"grad_norm": 2.0895533173369616,
"learning_rate": 9.854964209686555e-06,
"loss": 0.5578,
"mean_token_accuracy": 0.8457550704479218,
"num_tokens": 1052277.0,
"step": 113
},
{
"epoch": 0.5314685314685315,
"grad_norm": 2.07804701176361,
"learning_rate": 9.848761987561132e-06,
"loss": 0.5587,
"mean_token_accuracy": 0.8497881889343262,
"num_tokens": 1061624.0,
"step": 114
},
{
"epoch": 0.5361305361305362,
"grad_norm": 2.029504980400925,
"learning_rate": 9.842432177874725e-06,
"loss": 0.5222,
"mean_token_accuracy": 0.8595547080039978,
"num_tokens": 1070577.0,
"step": 115
},
{
"epoch": 0.5407925407925408,
"grad_norm": 1.9605500161506988,
"learning_rate": 9.835974966336504e-06,
"loss": 0.49,
"mean_token_accuracy": 0.8654608428478241,
"num_tokens": 1080819.0,
"step": 116
},
{
"epoch": 0.5454545454545454,
"grad_norm": 2.2122252988560223,
"learning_rate": 9.82939054239346e-06,
"loss": 0.5679,
"mean_token_accuracy": 0.8448387086391449,
"num_tokens": 1089899.0,
"step": 117
},
{
"epoch": 0.5501165501165501,
"grad_norm": 2.072420046717628,
"learning_rate": 9.822679099224844e-06,
"loss": 0.5611,
"mean_token_accuracy": 0.8533997237682343,
"num_tokens": 1099422.0,
"step": 118
},
{
"epoch": 0.5547785547785548,
"grad_norm": 2.2414814409383754,
"learning_rate": 9.815840833736508e-06,
"loss": 0.5222,
"mean_token_accuracy": 0.8619909286499023,
"num_tokens": 1108059.0,
"step": 119
},
{
"epoch": 0.5594405594405595,
"grad_norm": 1.945252737941495,
"learning_rate": 9.808875946555109e-06,
"loss": 0.5034,
"mean_token_accuracy": 0.8639355599880219,
"num_tokens": 1117344.0,
"step": 120
},
{
"epoch": 0.5641025641025641,
"grad_norm": 1.8638080342388037,
"learning_rate": 9.801784642022254e-06,
"loss": 0.4565,
"mean_token_accuracy": 0.8745285272598267,
"num_tokens": 1126493.0,
"step": 121
},
{
"epoch": 0.5687645687645687,
"grad_norm": 2.1910077292347006,
"learning_rate": 9.794567128188466e-06,
"loss": 0.5313,
"mean_token_accuracy": 0.8547643721103668,
"num_tokens": 1135454.0,
"step": 122
},
{
"epoch": 0.5734265734265734,
"grad_norm": 2.320759590932711,
"learning_rate": 9.787223616807118e-06,
"loss": 0.5518,
"mean_token_accuracy": 0.8485196530818939,
"num_tokens": 1145120.0,
"step": 123
},
{
"epoch": 0.578088578088578,
"grad_norm": 2.205905846852017,
"learning_rate": 9.779754323328192e-06,
"loss": 0.5135,
"mean_token_accuracy": 0.8605582118034363,
"num_tokens": 1154569.0,
"step": 124
},
{
"epoch": 0.5827505827505828,
"grad_norm": 1.9377848593966973,
"learning_rate": 9.772159466891971e-06,
"loss": 0.4979,
"mean_token_accuracy": 0.8665166199207306,
"num_tokens": 1163964.0,
"step": 125
},
{
"epoch": 0.5874125874125874,
"grad_norm": 2.143716998667602,
"learning_rate": 9.764439270322612e-06,
"loss": 0.5332,
"mean_token_accuracy": 0.8567988276481628,
"num_tokens": 1172705.0,
"step": 126
},
{
"epoch": 0.5920745920745921,
"grad_norm": 1.9575201068560006,
"learning_rate": 9.756593960121598e-06,
"loss": 0.4744,
"mean_token_accuracy": 0.8642941415309906,
"num_tokens": 1182428.0,
"step": 127
},
{
"epoch": 0.5967365967365967,
"grad_norm": 2.057073316572943,
"learning_rate": 9.748623766461101e-06,
"loss": 0.5236,
"mean_token_accuracy": 0.8576280772686005,
"num_tokens": 1191692.0,
"step": 128
},
{
"epoch": 0.6013986013986014,
"grad_norm": 2.315109150829485,
"learning_rate": 9.740528923177227e-06,
"loss": 0.5901,
"mean_token_accuracy": 0.8435404002666473,
"num_tokens": 1200804.0,
"step": 129
},
{
"epoch": 0.6060606060606061,
"grad_norm": 2.160355676496633,
"learning_rate": 9.732309667763158e-06,
"loss": 0.5131,
"mean_token_accuracy": 0.8617720901966095,
"num_tokens": 1209500.0,
"step": 130
},
{
"epoch": 0.6107226107226107,
"grad_norm": 2.1245956205330776,
"learning_rate": 9.723966241362178e-06,
"loss": 0.5508,
"mean_token_accuracy": 0.8526682257652283,
"num_tokens": 1218360.0,
"step": 131
},
{
"epoch": 0.6153846153846154,
"grad_norm": 1.9474554252940732,
"learning_rate": 9.7154988887606e-06,
"loss": 0.5172,
"mean_token_accuracy": 0.8601753413677216,
"num_tokens": 1227995.0,
"step": 132
},
{
"epoch": 0.62004662004662,
"grad_norm": 2.0598106310347886,
"learning_rate": 9.706907858380593e-06,
"loss": 0.5292,
"mean_token_accuracy": 0.853904515504837,
"num_tokens": 1237369.0,
"step": 133
},
{
"epoch": 0.6247086247086248,
"grad_norm": 2.167359943385511,
"learning_rate": 9.69819340227288e-06,
"loss": 0.5337,
"mean_token_accuracy": 0.8580853343009949,
"num_tokens": 1246811.0,
"step": 134
},
{
"epoch": 0.6293706293706294,
"grad_norm": 2.0419581939057703,
"learning_rate": 9.68935577610935e-06,
"loss": 0.5584,
"mean_token_accuracy": 0.8511963188648224,
"num_tokens": 1257728.0,
"step": 135
},
{
"epoch": 0.634032634032634,
"grad_norm": 2.179167067571999,
"learning_rate": 9.680395239175563e-06,
"loss": 0.547,
"mean_token_accuracy": 0.8516505658626556,
"num_tokens": 1267082.0,
"step": 136
},
{
"epoch": 0.6386946386946387,
"grad_norm": 2.1523271385107843,
"learning_rate": 9.671312054363126e-06,
"loss": 0.5097,
"mean_token_accuracy": 0.8639609515666962,
"num_tokens": 1276457.0,
"step": 137
},
{
"epoch": 0.6433566433566433,
"grad_norm": 2.2140190778794717,
"learning_rate": 9.662106488162001e-06,
"loss": 0.5353,
"mean_token_accuracy": 0.8548583686351776,
"num_tokens": 1285816.0,
"step": 138
},
{
"epoch": 0.6480186480186481,
"grad_norm": 1.9892790673234197,
"learning_rate": 9.652778810652669e-06,
"loss": 0.5141,
"mean_token_accuracy": 0.85847008228302,
"num_tokens": 1294833.0,
"step": 139
},
{
"epoch": 0.6526806526806527,
"grad_norm": 2.138516925288519,
"learning_rate": 9.643329295498215e-06,
"loss": 0.5171,
"mean_token_accuracy": 0.8615297675132751,
"num_tokens": 1304626.0,
"step": 140
},
{
"epoch": 0.6573426573426573,
"grad_norm": 2.2563369591379376,
"learning_rate": 9.633758219936299e-06,
"loss": 0.5306,
"mean_token_accuracy": 0.8586675524711609,
"num_tokens": 1314352.0,
"step": 141
},
{
"epoch": 0.662004662004662,
"grad_norm": 2.1291835975325446,
"learning_rate": 9.624065864771017e-06,
"loss": 0.5387,
"mean_token_accuracy": 0.854125052690506,
"num_tokens": 1323815.0,
"step": 142
},
{
"epoch": 0.6666666666666666,
"grad_norm": 1.9289636352361732,
"learning_rate": 9.614252514364671e-06,
"loss": 0.5302,
"mean_token_accuracy": 0.8544224202632904,
"num_tokens": 1333085.0,
"step": 143
},
{
"epoch": 0.6713286713286714,
"grad_norm": 2.142108471781227,
"learning_rate": 9.604318456629415e-06,
"loss": 0.5841,
"mean_token_accuracy": 0.8435969054698944,
"num_tokens": 1342490.0,
"step": 144
},
{
"epoch": 0.675990675990676,
"grad_norm": 2.256848010848965,
"learning_rate": 9.594263983018818e-06,
"loss": 0.5644,
"mean_token_accuracy": 0.8460557162761688,
"num_tokens": 1351579.0,
"step": 145
},
{
"epoch": 0.6806526806526807,
"grad_norm": 2.0607158153594343,
"learning_rate": 9.584089388519307e-06,
"loss": 0.525,
"mean_token_accuracy": 0.8550526797771454,
"num_tokens": 1363017.0,
"step": 146
},
{
"epoch": 0.6853146853146853,
"grad_norm": 2.0404326207258223,
"learning_rate": 9.573794971641518e-06,
"loss": 0.5067,
"mean_token_accuracy": 0.8611325323581696,
"num_tokens": 1372276.0,
"step": 147
},
{
"epoch": 0.6899766899766899,
"grad_norm": 2.017662742079808,
"learning_rate": 9.563381034411529e-06,
"loss": 0.5173,
"mean_token_accuracy": 0.8577908575534821,
"num_tokens": 1381474.0,
"step": 148
},
{
"epoch": 0.6946386946386947,
"grad_norm": 2.2063297261619668,
"learning_rate": 9.55284788236201e-06,
"loss": 0.5575,
"mean_token_accuracy": 0.8511577248573303,
"num_tokens": 1390844.0,
"step": 149
},
{
"epoch": 0.6993006993006993,
"grad_norm": 2.2322560935923295,
"learning_rate": 9.542195824523251e-06,
"loss": 0.4993,
"mean_token_accuracy": 0.8618028461933136,
"num_tokens": 1399649.0,
"step": 150
},
{
"epoch": 0.703962703962704,
"grad_norm": 2.1940354248783667,
"learning_rate": 9.531425173414095e-06,
"loss": 0.5766,
"mean_token_accuracy": 0.8469588458538055,
"num_tokens": 1409096.0,
"step": 151
},
{
"epoch": 0.7086247086247086,
"grad_norm": 2.276091425733425,
"learning_rate": 9.520536245032783e-06,
"loss": 0.6052,
"mean_token_accuracy": 0.8366052806377411,
"num_tokens": 1417944.0,
"step": 152
},
{
"epoch": 0.7132867132867133,
"grad_norm": 2.199611556073951,
"learning_rate": 9.509529358847655e-06,
"loss": 0.5454,
"mean_token_accuracy": 0.853013813495636,
"num_tokens": 1427079.0,
"step": 153
},
{
"epoch": 0.717948717948718,
"grad_norm": 2.080422260259384,
"learning_rate": 9.498404837787811e-06,
"loss": 0.5353,
"mean_token_accuracy": 0.8578924536705017,
"num_tokens": 1436521.0,
"step": 154
},
{
"epoch": 0.7226107226107226,
"grad_norm": 1.9852264975254732,
"learning_rate": 9.48716300823361e-06,
"loss": 0.5259,
"mean_token_accuracy": 0.862900048494339,
"num_tokens": 1445553.0,
"step": 155
},
{
"epoch": 0.7272727272727273,
"grad_norm": 2.0020640140351738,
"learning_rate": 9.475804200007104e-06,
"loss": 0.5327,
"mean_token_accuracy": 0.8615141212940216,
"num_tokens": 1453774.0,
"step": 156
},
{
"epoch": 0.7319347319347319,
"grad_norm": 2.1863487215175392,
"learning_rate": 9.464328746362367e-06,
"loss": 0.5588,
"mean_token_accuracy": 0.8489395678043365,
"num_tokens": 1462611.0,
"step": 157
},
{
"epoch": 0.7365967365967366,
"grad_norm": 2.4020487040671084,
"learning_rate": 9.452736983975708e-06,
"loss": 0.4923,
"mean_token_accuracy": 0.8653963208198547,
"num_tokens": 1471736.0,
"step": 158
},
{
"epoch": 0.7412587412587412,
"grad_norm": 2.1281438451825987,
"learning_rate": 9.441029252935804e-06,
"loss": 0.5397,
"mean_token_accuracy": 0.8503198325634003,
"num_tokens": 1482117.0,
"step": 159
},
{
"epoch": 0.745920745920746,
"grad_norm": 2.1763048717368756,
"learning_rate": 9.429205896733705e-06,
"loss": 0.5566,
"mean_token_accuracy": 0.8503492772579193,
"num_tokens": 1491292.0,
"step": 160
},
{
"epoch": 0.7505827505827506,
"grad_norm": 1.9354492116167334,
"learning_rate": 9.417267262252775e-06,
"loss": 0.503,
"mean_token_accuracy": 0.8627839088439941,
"num_tokens": 1500735.0,
"step": 161
},
{
"epoch": 0.7552447552447552,
"grad_norm": 2.0708959320246887,
"learning_rate": 9.405213699758507e-06,
"loss": 0.5067,
"mean_token_accuracy": 0.8632858693599701,
"num_tokens": 1510190.0,
"step": 162
},
{
"epoch": 0.7599067599067599,
"grad_norm": 2.11618902292271,
"learning_rate": 9.393045562888245e-06,
"loss": 0.5424,
"mean_token_accuracy": 0.8534725308418274,
"num_tokens": 1519207.0,
"step": 163
},
{
"epoch": 0.7645687645687645,
"grad_norm": 2.027410237379273,
"learning_rate": 9.380763208640809e-06,
"loss": 0.5197,
"mean_token_accuracy": 0.8590981066226959,
"num_tokens": 1528808.0,
"step": 164
},
{
"epoch": 0.7692307692307693,
"grad_norm": 2.008086929554116,
"learning_rate": 9.368366997366027e-06,
"loss": 0.5071,
"mean_token_accuracy": 0.8610014617443085,
"num_tokens": 1537765.0,
"step": 165
},
{
"epoch": 0.7738927738927739,
"grad_norm": 2.1365369175139763,
"learning_rate": 9.355857292754152e-06,
"loss": 0.5299,
"mean_token_accuracy": 0.8569334447383881,
"num_tokens": 1547822.0,
"step": 166
},
{
"epoch": 0.7785547785547785,
"grad_norm": 1.8545592775377633,
"learning_rate": 9.343234461825204e-06,
"loss": 0.4791,
"mean_token_accuracy": 0.869120866060257,
"num_tokens": 1557620.0,
"step": 167
},
{
"epoch": 0.7832167832167832,
"grad_norm": 2.028039178942425,
"learning_rate": 9.330498874918191e-06,
"loss": 0.5106,
"mean_token_accuracy": 0.8565393388271332,
"num_tokens": 1567111.0,
"step": 168
},
{
"epoch": 0.7878787878787878,
"grad_norm": 2.0701117186586817,
"learning_rate": 9.317650905680254e-06,
"loss": 0.4913,
"mean_token_accuracy": 0.8696520030498505,
"num_tokens": 1575836.0,
"step": 169
},
{
"epoch": 0.7925407925407926,
"grad_norm": 2.1194332133731235,
"learning_rate": 9.304690931055694e-06,
"loss": 0.5311,
"mean_token_accuracy": 0.8621737062931061,
"num_tokens": 1584822.0,
"step": 170
},
{
"epoch": 0.7972027972027972,
"grad_norm": 2.1282940280365747,
"learning_rate": 9.29161933127492e-06,
"loss": 0.5601,
"mean_token_accuracy": 0.8475571274757385,
"num_tokens": 1594428.0,
"step": 171
},
{
"epoch": 0.8018648018648019,
"grad_norm": 2.0641332133798937,
"learning_rate": 9.278436489843298e-06,
"loss": 0.5373,
"mean_token_accuracy": 0.8547400534152985,
"num_tokens": 1603786.0,
"step": 172
},
{
"epoch": 0.8065268065268065,
"grad_norm": 2.0160796623607804,
"learning_rate": 9.265142793529883e-06,
"loss": 0.4971,
"mean_token_accuracy": 0.8660332560539246,
"num_tokens": 1612747.0,
"step": 173
},
{
"epoch": 0.8111888111888111,
"grad_norm": 1.9129699635418105,
"learning_rate": 9.251738632356086e-06,
"loss": 0.4926,
"mean_token_accuracy": 0.869436115026474,
"num_tokens": 1621819.0,
"step": 174
},
{
"epoch": 0.8158508158508159,
"grad_norm": 2.3884591059212537,
"learning_rate": 9.238224399584232e-06,
"loss": 0.5476,
"mean_token_accuracy": 0.8577151894569397,
"num_tokens": 1631685.0,
"step": 175
},
{
"epoch": 0.8205128205128205,
"grad_norm": 1.8831561540253134,
"learning_rate": 9.224600491706009e-06,
"loss": 0.4786,
"mean_token_accuracy": 0.8686897456645966,
"num_tokens": 1640794.0,
"step": 176
},
{
"epoch": 0.8251748251748252,
"grad_norm": 2.0432546539992247,
"learning_rate": 9.210867308430847e-06,
"loss": 0.5173,
"mean_token_accuracy": 0.8568342328071594,
"num_tokens": 1650186.0,
"step": 177
},
{
"epoch": 0.8298368298368298,
"grad_norm": 2.0098051546686357,
"learning_rate": 9.197025252674192e-06,
"loss": 0.5181,
"mean_token_accuracy": 0.8594348132610321,
"num_tokens": 1659577.0,
"step": 178
},
{
"epoch": 0.8344988344988346,
"grad_norm": 2.0448629279973214,
"learning_rate": 9.183074730545674e-06,
"loss": 0.521,
"mean_token_accuracy": 0.8540050983428955,
"num_tokens": 1669821.0,
"step": 179
},
{
"epoch": 0.8391608391608392,
"grad_norm": 2.139730175214238,
"learning_rate": 9.169016151337202e-06,
"loss": 0.5651,
"mean_token_accuracy": 0.8448547720909119,
"num_tokens": 1679579.0,
"step": 180
},
{
"epoch": 0.8438228438228438,
"grad_norm": 2.0464895767510742,
"learning_rate": 9.15484992751095e-06,
"loss": 0.5221,
"mean_token_accuracy": 0.8626176416873932,
"num_tokens": 1688922.0,
"step": 181
},
{
"epoch": 0.8484848484848485,
"grad_norm": 1.9954837428115382,
"learning_rate": 9.140576474687263e-06,
"loss": 0.5279,
"mean_token_accuracy": 0.8613512217998505,
"num_tokens": 1697949.0,
"step": 182
},
{
"epoch": 0.8531468531468531,
"grad_norm": 1.9567685319451251,
"learning_rate": 9.126196211632456e-06,
"loss": 0.5274,
"mean_token_accuracy": 0.8570147156715393,
"num_tokens": 1708365.0,
"step": 183
},
{
"epoch": 0.8578088578088578,
"grad_norm": 1.9210555989780966,
"learning_rate": 9.11170956024653e-06,
"loss": 0.5269,
"mean_token_accuracy": 0.8548833727836609,
"num_tokens": 1717340.0,
"step": 184
},
{
"epoch": 0.8624708624708625,
"grad_norm": 1.9318021919862647,
"learning_rate": 9.097116945550794e-06,
"loss": 0.5359,
"mean_token_accuracy": 0.8578689694404602,
"num_tokens": 1727305.0,
"step": 185
},
{
"epoch": 0.8671328671328671,
"grad_norm": 2.026417344807131,
"learning_rate": 9.082418795675397e-06,
"loss": 0.5617,
"mean_token_accuracy": 0.8492590188980103,
"num_tokens": 1737093.0,
"step": 186
},
{
"epoch": 0.8717948717948718,
"grad_norm": 1.8991450738896973,
"learning_rate": 9.067615541846768e-06,
"loss": 0.5235,
"mean_token_accuracy": 0.859586089849472,
"num_tokens": 1746772.0,
"step": 187
},
{
"epoch": 0.8764568764568764,
"grad_norm": 1.9522407091267568,
"learning_rate": 9.052707618374958e-06,
"loss": 0.4765,
"mean_token_accuracy": 0.8663023710250854,
"num_tokens": 1756307.0,
"step": 188
},
{
"epoch": 0.8811188811188811,
"grad_norm": 2.048727988149695,
"learning_rate": 9.037695462640908e-06,
"loss": 0.5331,
"mean_token_accuracy": 0.8549124300479889,
"num_tokens": 1766010.0,
"step": 189
},
{
"epoch": 0.8857808857808858,
"grad_norm": 1.9934806378380259,
"learning_rate": 9.022579515083601e-06,
"loss": 0.5421,
"mean_token_accuracy": 0.8553925156593323,
"num_tokens": 1774885.0,
"step": 190
},
{
"epoch": 0.8904428904428905,
"grad_norm": 2.0689376168576286,
"learning_rate": 9.007360219187163e-06,
"loss": 0.514,
"mean_token_accuracy": 0.8535875976085663,
"num_tokens": 1785248.0,
"step": 191
},
{
"epoch": 0.8951048951048951,
"grad_norm": 2.009916861001166,
"learning_rate": 8.99203802146783e-06,
"loss": 0.497,
"mean_token_accuracy": 0.8642706871032715,
"num_tokens": 1794525.0,
"step": 192
},
{
"epoch": 0.8997668997668997,
"grad_norm": 2.0465375766970277,
"learning_rate": 8.976613371460856e-06,
"loss": 0.5271,
"mean_token_accuracy": 0.8549227714538574,
"num_tokens": 1804413.0,
"step": 193
},
{
"epoch": 0.9044289044289044,
"grad_norm": 1.917310132189426,
"learning_rate": 8.961086721707331e-06,
"loss": 0.4938,
"mean_token_accuracy": 0.8640461564064026,
"num_tokens": 1813695.0,
"step": 194
},
{
"epoch": 0.9090909090909091,
"grad_norm": 1.915744851468885,
"learning_rate": 8.945458527740892e-06,
"loss": 0.4707,
"mean_token_accuracy": 0.8734670579433441,
"num_tokens": 1823851.0,
"step": 195
},
{
"epoch": 0.9137529137529138,
"grad_norm": 1.9274655587588858,
"learning_rate": 8.929729248074364e-06,
"loss": 0.5021,
"mean_token_accuracy": 0.8575229346752167,
"num_tokens": 1833828.0,
"step": 196
},
{
"epoch": 0.9184149184149184,
"grad_norm": 2.026284467899399,
"learning_rate": 8.913899344186312e-06,
"loss": 0.5287,
"mean_token_accuracy": 0.8602744936943054,
"num_tokens": 1842998.0,
"step": 197
},
{
"epoch": 0.9230769230769231,
"grad_norm": 2.05400953187219,
"learning_rate": 8.897969280507494e-06,
"loss": 0.5324,
"mean_token_accuracy": 0.862190306186676,
"num_tokens": 1851919.0,
"step": 198
},
{
"epoch": 0.9277389277389277,
"grad_norm": 2.0766187536409078,
"learning_rate": 8.881939524407238e-06,
"loss": 0.5412,
"mean_token_accuracy": 0.8545754849910736,
"num_tokens": 1860906.0,
"step": 199
},
{
"epoch": 0.9324009324009324,
"grad_norm": 1.989815369304169,
"learning_rate": 8.86581054617973e-06,
"loss": 0.5453,
"mean_token_accuracy": 0.8528104722499847,
"num_tokens": 1870470.0,
"step": 200
},
{
"epoch": 0.9370629370629371,
"grad_norm": 2.088847720398196,
"learning_rate": 8.849582819030217e-06,
"loss": 0.5432,
"mean_token_accuracy": 0.8563026189804077,
"num_tokens": 1879984.0,
"step": 201
},
{
"epoch": 0.9417249417249417,
"grad_norm": 2.078290662100905,
"learning_rate": 8.833256819061126e-06,
"loss": 0.4983,
"mean_token_accuracy": 0.8666514456272125,
"num_tokens": 1889348.0,
"step": 202
},
{
"epoch": 0.9463869463869464,
"grad_norm": 1.9223822960306767,
"learning_rate": 8.81683302525809e-06,
"loss": 0.5411,
"mean_token_accuracy": 0.8597702980041504,
"num_tokens": 1898543.0,
"step": 203
},
{
"epoch": 0.951048951048951,
"grad_norm": 1.8855235649147306,
"learning_rate": 8.800311919475902e-06,
"loss": 0.5014,
"mean_token_accuracy": 0.8621995449066162,
"num_tokens": 1908052.0,
"step": 204
},
{
"epoch": 0.9557109557109557,
"grad_norm": 1.9276050868545773,
"learning_rate": 8.783693986424365e-06,
"loss": 0.504,
"mean_token_accuracy": 0.8629100322723389,
"num_tokens": 1917592.0,
"step": 205
},
{
"epoch": 0.9603729603729604,
"grad_norm": 2.143001225129588,
"learning_rate": 8.76697971365409e-06,
"loss": 0.5387,
"mean_token_accuracy": 0.8589153587818146,
"num_tokens": 1927404.0,
"step": 206
},
{
"epoch": 0.965034965034965,
"grad_norm": 2.1777112033624557,
"learning_rate": 8.750169591542177e-06,
"loss": 0.5252,
"mean_token_accuracy": 0.8522741794586182,
"num_tokens": 1936273.0,
"step": 207
},
{
"epoch": 0.9696969696969697,
"grad_norm": 1.91067041727946,
"learning_rate": 8.733264113277832e-06,
"loss": 0.4991,
"mean_token_accuracy": 0.8641117215156555,
"num_tokens": 1944830.0,
"step": 208
},
{
"epoch": 0.9743589743589743,
"grad_norm": 2.004004655149868,
"learning_rate": 8.716263774847902e-06,
"loss": 0.5276,
"mean_token_accuracy": 0.8581711649894714,
"num_tokens": 1954306.0,
"step": 209
},
{
"epoch": 0.9790209790209791,
"grad_norm": 1.968917380031634,
"learning_rate": 8.69916907502232e-06,
"loss": 0.5138,
"mean_token_accuracy": 0.8624255359172821,
"num_tokens": 1964061.0,
"step": 210
},
{
"epoch": 0.9836829836829837,
"grad_norm": 1.9939460274690515,
"learning_rate": 8.681980515339464e-06,
"loss": 0.4816,
"mean_token_accuracy": 0.8704831600189209,
"num_tokens": 1973286.0,
"step": 211
},
{
"epoch": 0.9883449883449883,
"grad_norm": 2.057525517897735,
"learning_rate": 8.664698600091462e-06,
"loss": 0.5221,
"mean_token_accuracy": 0.8580233752727509,
"num_tokens": 1983373.0,
"step": 212
},
{
"epoch": 0.993006993006993,
"grad_norm": 1.79381112298063,
"learning_rate": 8.64732383630937e-06,
"loss": 0.4376,
"mean_token_accuracy": 0.8793376386165619,
"num_tokens": 1992207.0,
"step": 213
},
{
"epoch": 0.9976689976689976,
"grad_norm": 1.933384153410609,
"learning_rate": 8.629856733748325e-06,
"loss": 0.505,
"mean_token_accuracy": 0.85846146941185,
"num_tokens": 2001400.0,
"step": 214
},
{
"epoch": 1.0,
"grad_norm": 1.933384153410609,
"learning_rate": 8.612297804872562e-06,
"loss": 0.4925,
"mean_token_accuracy": 0.8816215991973877,
"num_tokens": 2003256.0,
"step": 215
},
{
"epoch": 1.0046620046620047,
"grad_norm": 2.819257871473016,
"learning_rate": 8.594647564840407e-06,
"loss": 0.3933,
"mean_token_accuracy": 0.8927328288555145,
"num_tokens": 2012460.0,
"step": 216
},
{
"epoch": 1.0093240093240092,
"grad_norm": 1.7635713882513158,
"learning_rate": 8.57690653148913e-06,
"loss": 0.392,
"mean_token_accuracy": 0.887998104095459,
"num_tokens": 2022109.0,
"step": 217
},
{
"epoch": 1.013986013986014,
"grad_norm": 1.7892473419907957,
"learning_rate": 8.559075225319786e-06,
"loss": 0.3855,
"mean_token_accuracy": 0.8922514021396637,
"num_tokens": 2030710.0,
"step": 218
},
{
"epoch": 1.0186480186480187,
"grad_norm": 1.6823650842914313,
"learning_rate": 8.54115416948192e-06,
"loss": 0.356,
"mean_token_accuracy": 0.9053717851638794,
"num_tokens": 2039777.0,
"step": 219
},
{
"epoch": 1.0233100233100234,
"grad_norm": 1.939934426049098,
"learning_rate": 8.523143889758228e-06,
"loss": 0.3694,
"mean_token_accuracy": 0.8991726338863373,
"num_tokens": 2049184.0,
"step": 220
},
{
"epoch": 1.027972027972028,
"grad_norm": 1.752148060977119,
"learning_rate": 8.505044914549131e-06,
"loss": 0.39,
"mean_token_accuracy": 0.8928222954273224,
"num_tokens": 2058845.0,
"step": 221
},
{
"epoch": 1.0326340326340326,
"grad_norm": 2.1158285136058033,
"learning_rate": 8.48685777485727e-06,
"loss": 0.3596,
"mean_token_accuracy": 0.8957322537899017,
"num_tokens": 2067868.0,
"step": 222
},
{
"epoch": 1.0372960372960374,
"grad_norm": 2.1547545690668026,
"learning_rate": 8.46858300427193e-06,
"loss": 0.3831,
"mean_token_accuracy": 0.8963182866573334,
"num_tokens": 2077258.0,
"step": 223
},
{
"epoch": 1.0419580419580419,
"grad_norm": 1.9477915170139626,
"learning_rate": 8.450221138953383e-06,
"loss": 0.3642,
"mean_token_accuracy": 0.8969131708145142,
"num_tokens": 2086219.0,
"step": 224
},
{
"epoch": 1.0466200466200466,
"grad_norm": 1.9674246406742486,
"learning_rate": 8.431772717617154e-06,
"loss": 0.348,
"mean_token_accuracy": 0.9007111489772797,
"num_tokens": 2095430.0,
"step": 225
},
{
"epoch": 1.0512820512820513,
"grad_norm": 2.0055164195358235,
"learning_rate": 8.413238281518225e-06,
"loss": 0.3575,
"mean_token_accuracy": 0.8980507254600525,
"num_tokens": 2104162.0,
"step": 226
},
{
"epoch": 1.055944055944056,
"grad_norm": 1.989341084975444,
"learning_rate": 8.394618374435148e-06,
"loss": 0.3634,
"mean_token_accuracy": 0.8973170518875122,
"num_tokens": 2114619.0,
"step": 227
},
{
"epoch": 1.0606060606060606,
"grad_norm": 2.1674333688477074,
"learning_rate": 8.375913542654093e-06,
"loss": 0.3665,
"mean_token_accuracy": 0.8995259404182434,
"num_tokens": 2124605.0,
"step": 228
},
{
"epoch": 1.0652680652680653,
"grad_norm": 2.0014110756016703,
"learning_rate": 8.357124334952818e-06,
"loss": 0.3884,
"mean_token_accuracy": 0.889716625213623,
"num_tokens": 2133888.0,
"step": 229
},
{
"epoch": 1.06993006993007,
"grad_norm": 1.9633860039478925,
"learning_rate": 8.33825130258458e-06,
"loss": 0.3578,
"mean_token_accuracy": 0.8981423377990723,
"num_tokens": 2142444.0,
"step": 230
},
{
"epoch": 1.0745920745920745,
"grad_norm": 2.013885901701478,
"learning_rate": 8.319294999261941e-06,
"loss": 0.335,
"mean_token_accuracy": 0.9031325578689575,
"num_tokens": 2151447.0,
"step": 231
},
{
"epoch": 1.0792540792540792,
"grad_norm": 1.8717843659595645,
"learning_rate": 8.300255981140544e-06,
"loss": 0.3334,
"mean_token_accuracy": 0.905355840921402,
"num_tokens": 2161278.0,
"step": 232
},
{
"epoch": 1.083916083916084,
"grad_norm": 1.8282559453848013,
"learning_rate": 8.281134806802783e-06,
"loss": 0.3789,
"mean_token_accuracy": 0.8943277895450592,
"num_tokens": 2170704.0,
"step": 233
},
{
"epoch": 1.0885780885780885,
"grad_norm": 2.0761431582961705,
"learning_rate": 8.261932037241418e-06,
"loss": 0.3726,
"mean_token_accuracy": 0.8970981240272522,
"num_tokens": 2180907.0,
"step": 234
},
{
"epoch": 1.0932400932400932,
"grad_norm": 1.8858855419887754,
"learning_rate": 8.242648235843123e-06,
"loss": 0.3135,
"mean_token_accuracy": 0.9130119383335114,
"num_tokens": 2190790.0,
"step": 235
},
{
"epoch": 1.097902097902098,
"grad_norm": 1.8790292725099809,
"learning_rate": 8.223283968371945e-06,
"loss": 0.3485,
"mean_token_accuracy": 0.904576301574707,
"num_tokens": 2199657.0,
"step": 236
},
{
"epoch": 1.1025641025641026,
"grad_norm": 1.9939775129861892,
"learning_rate": 8.203839802952708e-06,
"loss": 0.3518,
"mean_token_accuracy": 0.9006724953651428,
"num_tokens": 2209030.0,
"step": 237
},
{
"epoch": 1.1072261072261071,
"grad_norm": 2.318895943210041,
"learning_rate": 8.184316310054355e-06,
"loss": 0.3702,
"mean_token_accuracy": 0.8964027166366577,
"num_tokens": 2218265.0,
"step": 238
},
{
"epoch": 1.1118881118881119,
"grad_norm": 2.1196825455595003,
"learning_rate": 8.164714062473201e-06,
"loss": 0.3995,
"mean_token_accuracy": 0.889014482498169,
"num_tokens": 2227845.0,
"step": 239
},
{
"epoch": 1.1165501165501166,
"grad_norm": 2.1570886496790247,
"learning_rate": 8.14503363531613e-06,
"loss": 0.367,
"mean_token_accuracy": 0.900328516960144,
"num_tokens": 2237505.0,
"step": 240
},
{
"epoch": 1.121212121212121,
"grad_norm": 1.892665235627306,
"learning_rate": 8.125275605983725e-06,
"loss": 0.3458,
"mean_token_accuracy": 0.9030775427818298,
"num_tokens": 2246730.0,
"step": 241
},
{
"epoch": 1.1258741258741258,
"grad_norm": 1.848983532276759,
"learning_rate": 8.10544055415332e-06,
"loss": 0.3527,
"mean_token_accuracy": 0.9045140743255615,
"num_tokens": 2255845.0,
"step": 242
},
{
"epoch": 1.1305361305361306,
"grad_norm": 1.919023337141047,
"learning_rate": 8.085529061762007e-06,
"loss": 0.3663,
"mean_token_accuracy": 0.8982928693294525,
"num_tokens": 2264305.0,
"step": 243
},
{
"epoch": 1.1351981351981353,
"grad_norm": 1.8321927781965666,
"learning_rate": 8.065541712989546e-06,
"loss": 0.337,
"mean_token_accuracy": 0.9019491672515869,
"num_tokens": 2273361.0,
"step": 244
},
{
"epoch": 1.1398601398601398,
"grad_norm": 2.0269533829700013,
"learning_rate": 8.04547909424124e-06,
"loss": 0.3614,
"mean_token_accuracy": 0.9004016220569611,
"num_tokens": 2283178.0,
"step": 245
},
{
"epoch": 1.1445221445221445,
"grad_norm": 1.9130074634196965,
"learning_rate": 8.025341794130722e-06,
"loss": 0.3482,
"mean_token_accuracy": 0.9014346897602081,
"num_tokens": 2292865.0,
"step": 246
},
{
"epoch": 1.1491841491841492,
"grad_norm": 1.8547690304895703,
"learning_rate": 8.005130403462687e-06,
"loss": 0.3427,
"mean_token_accuracy": 0.9018321931362152,
"num_tokens": 2302310.0,
"step": 247
},
{
"epoch": 1.1538461538461537,
"grad_norm": 1.8535777462460932,
"learning_rate": 7.98484551521556e-06,
"loss": 0.3529,
"mean_token_accuracy": 0.9022264182567596,
"num_tokens": 2312189.0,
"step": 248
},
{
"epoch": 1.1585081585081585,
"grad_norm": 2.435521274717378,
"learning_rate": 7.964487724524105e-06,
"loss": 0.3926,
"mean_token_accuracy": 0.8963950574398041,
"num_tokens": 2321143.0,
"step": 249
},
{
"epoch": 1.1631701631701632,
"grad_norm": 2.3200358499575375,
"learning_rate": 7.944057628661948e-06,
"loss": 0.3675,
"mean_token_accuracy": 0.8951332271099091,
"num_tokens": 2330428.0,
"step": 250
},
{
"epoch": 1.167832167832168,
"grad_norm": 2.033123158008929,
"learning_rate": 7.923555827024069e-06,
"loss": 0.3621,
"mean_token_accuracy": 0.8988972008228302,
"num_tokens": 2339659.0,
"step": 251
},
{
"epoch": 1.1724941724941724,
"grad_norm": 1.9890967367444383,
"learning_rate": 7.902982921109215e-06,
"loss": 0.3734,
"mean_token_accuracy": 0.8945567905902863,
"num_tokens": 2348827.0,
"step": 252
},
{
"epoch": 1.1771561771561772,
"grad_norm": 1.9463270812100881,
"learning_rate": 7.882339514502236e-06,
"loss": 0.3334,
"mean_token_accuracy": 0.9068220555782318,
"num_tokens": 2359744.0,
"step": 253
},
{
"epoch": 1.1818181818181819,
"grad_norm": 1.8550101013313145,
"learning_rate": 7.861626212856404e-06,
"loss": 0.3528,
"mean_token_accuracy": 0.9038570523262024,
"num_tokens": 2368857.0,
"step": 254
},
{
"epoch": 1.1864801864801864,
"grad_norm": 1.8873717371173877,
"learning_rate": 7.840843623875621e-06,
"loss": 0.3496,
"mean_token_accuracy": 0.9028986990451813,
"num_tokens": 2378219.0,
"step": 255
},
{
"epoch": 1.191142191142191,
"grad_norm": 1.9979141390235062,
"learning_rate": 7.8199923572966e-06,
"loss": 0.3789,
"mean_token_accuracy": 0.8938669860363007,
"num_tokens": 2387549.0,
"step": 256
},
{
"epoch": 1.1958041958041958,
"grad_norm": 2.016685234805154,
"learning_rate": 7.799073024870972e-06,
"loss": 0.3611,
"mean_token_accuracy": 0.8983322978019714,
"num_tokens": 2397564.0,
"step": 257
},
{
"epoch": 1.2004662004662006,
"grad_norm": 1.9415097190450272,
"learning_rate": 7.778086240347343e-06,
"loss": 0.3468,
"mean_token_accuracy": 0.9009680449962616,
"num_tokens": 2406482.0,
"step": 258
},
{
"epoch": 1.205128205128205,
"grad_norm": 2.2166846370519435,
"learning_rate": 7.757032619453285e-06,
"loss": 0.3436,
"mean_token_accuracy": 0.9036185145378113,
"num_tokens": 2415453.0,
"step": 259
},
{
"epoch": 1.2097902097902098,
"grad_norm": 1.7953859731976027,
"learning_rate": 7.735912779877266e-06,
"loss": 0.3557,
"mean_token_accuracy": 0.9016382992267609,
"num_tokens": 2424373.0,
"step": 260
},
{
"epoch": 1.2144522144522145,
"grad_norm": 2.0224193055579796,
"learning_rate": 7.714727341250533e-06,
"loss": 0.3206,
"mean_token_accuracy": 0.910454124212265,
"num_tokens": 2433081.0,
"step": 261
},
{
"epoch": 1.219114219114219,
"grad_norm": 2.105326719368714,
"learning_rate": 7.693476925128937e-06,
"loss": 0.4086,
"mean_token_accuracy": 0.8892490267753601,
"num_tokens": 2442248.0,
"step": 262
},
{
"epoch": 1.2237762237762237,
"grad_norm": 2.054961145659142,
"learning_rate": 7.672162154974686e-06,
"loss": 0.3334,
"mean_token_accuracy": 0.9058608114719391,
"num_tokens": 2451308.0,
"step": 263
},
{
"epoch": 1.2284382284382285,
"grad_norm": 2.0190722916864448,
"learning_rate": 7.650783656138065e-06,
"loss": 0.3842,
"mean_token_accuracy": 0.8919758200645447,
"num_tokens": 2461071.0,
"step": 264
},
{
"epoch": 1.2331002331002332,
"grad_norm": 2.0459573081488664,
"learning_rate": 7.629342055839077e-06,
"loss": 0.3466,
"mean_token_accuracy": 0.9024505317211151,
"num_tokens": 2471649.0,
"step": 265
},
{
"epoch": 1.2377622377622377,
"grad_norm": 2.045255678828029,
"learning_rate": 7.607837983149057e-06,
"loss": 0.3902,
"mean_token_accuracy": 0.8912324607372284,
"num_tokens": 2481584.0,
"step": 266
},
{
"epoch": 1.2424242424242424,
"grad_norm": 2.0728524474968553,
"learning_rate": 7.586272068972196e-06,
"loss": 0.4005,
"mean_token_accuracy": 0.8886791169643402,
"num_tokens": 2491498.0,
"step": 267
},
{
"epoch": 1.2470862470862472,
"grad_norm": 1.9682634266288053,
"learning_rate": 7.564644946027049e-06,
"loss": 0.3578,
"mean_token_accuracy": 0.9002366364002228,
"num_tokens": 2501066.0,
"step": 268
},
{
"epoch": 1.2517482517482517,
"grad_norm": 1.7696539616448115,
"learning_rate": 7.5429572488279615e-06,
"loss": 0.3167,
"mean_token_accuracy": 0.9112659692764282,
"num_tokens": 2510213.0,
"step": 269
},
{
"epoch": 1.2564102564102564,
"grad_norm": 1.869201624827092,
"learning_rate": 7.521209613666457e-06,
"loss": 0.3298,
"mean_token_accuracy": 0.9096185564994812,
"num_tokens": 2518850.0,
"step": 270
},
{
"epoch": 1.2610722610722611,
"grad_norm": 1.7896035852004597,
"learning_rate": 7.499402678592568e-06,
"loss": 0.3452,
"mean_token_accuracy": 0.902194082736969,
"num_tokens": 2528729.0,
"step": 271
},
{
"epoch": 1.2657342657342658,
"grad_norm": 1.8578212430775984,
"learning_rate": 7.477537083396114e-06,
"loss": 0.3377,
"mean_token_accuracy": 0.9030162990093231,
"num_tokens": 2538576.0,
"step": 272
},
{
"epoch": 1.2703962703962703,
"grad_norm": 1.8583558341062996,
"learning_rate": 7.45561346958794e-06,
"loss": 0.3327,
"mean_token_accuracy": 0.9063755571842194,
"num_tokens": 2548020.0,
"step": 273
},
{
"epoch": 1.275058275058275,
"grad_norm": 1.945988010631059,
"learning_rate": 7.433632480381083e-06,
"loss": 0.3535,
"mean_token_accuracy": 0.9046582877635956,
"num_tokens": 2556984.0,
"step": 274
},
{
"epoch": 1.2797202797202798,
"grad_norm": 2.0099202078958958,
"learning_rate": 7.4115947606719105e-06,
"loss": 0.3612,
"mean_token_accuracy": 0.899190753698349,
"num_tokens": 2566255.0,
"step": 275
},
{
"epoch": 1.2843822843822843,
"grad_norm": 2.1032377215815,
"learning_rate": 7.389500957021192e-06,
"loss": 0.351,
"mean_token_accuracy": 0.9027476012706757,
"num_tokens": 2575331.0,
"step": 276
},
{
"epoch": 1.289044289044289,
"grad_norm": 2.0973543646568227,
"learning_rate": 7.367351717635136e-06,
"loss": 0.3561,
"mean_token_accuracy": 0.899603396654129,
"num_tokens": 2584609.0,
"step": 277
},
{
"epoch": 1.2937062937062938,
"grad_norm": 1.9524574422408152,
"learning_rate": 7.345147692346373e-06,
"loss": 0.3621,
"mean_token_accuracy": 0.8965962827205658,
"num_tokens": 2593822.0,
"step": 278
},
{
"epoch": 1.2983682983682985,
"grad_norm": 1.8684420517312623,
"learning_rate": 7.3228895325948835e-06,
"loss": 0.3404,
"mean_token_accuracy": 0.9057947397232056,
"num_tokens": 2602990.0,
"step": 279
},
{
"epoch": 1.303030303030303,
"grad_norm": 1.8423583661355911,
"learning_rate": 7.3005778914088895e-06,
"loss": 0.3181,
"mean_token_accuracy": 0.9117348790168762,
"num_tokens": 2612564.0,
"step": 280
},
{
"epoch": 1.3076923076923077,
"grad_norm": 1.8719988018492713,
"learning_rate": 7.278213423385701e-06,
"loss": 0.3633,
"mean_token_accuracy": 0.8996273577213287,
"num_tokens": 2621342.0,
"step": 281
},
{
"epoch": 1.3123543123543124,
"grad_norm": 2.097992632063848,
"learning_rate": 7.255796784672496e-06,
"loss": 0.3772,
"mean_token_accuracy": 0.8940203785896301,
"num_tokens": 2631008.0,
"step": 282
},
{
"epoch": 1.317016317016317,
"grad_norm": 2.1962950013308147,
"learning_rate": 7.233328632947087e-06,
"loss": 0.3176,
"mean_token_accuracy": 0.9080378413200378,
"num_tokens": 2640269.0,
"step": 283
},
{
"epoch": 1.3216783216783217,
"grad_norm": 1.8870826341467308,
"learning_rate": 7.210809627398615e-06,
"loss": 0.4153,
"mean_token_accuracy": 0.8845455944538116,
"num_tokens": 2650339.0,
"step": 284
},
{
"epoch": 1.3263403263403264,
"grad_norm": 2.4502915111296306,
"learning_rate": 7.188240428708211e-06,
"loss": 0.3789,
"mean_token_accuracy": 0.8997355103492737,
"num_tokens": 2659574.0,
"step": 285
},
{
"epoch": 1.3310023310023311,
"grad_norm": 1.8548118165118153,
"learning_rate": 7.165621699029615e-06,
"loss": 0.3484,
"mean_token_accuracy": 0.9010252058506012,
"num_tokens": 2670228.0,
"step": 286
},
{
"epoch": 1.3356643356643356,
"grad_norm": 1.9980570026677085,
"learning_rate": 7.1429541019697505e-06,
"loss": 0.3404,
"mean_token_accuracy": 0.9027169346809387,
"num_tokens": 2679552.0,
"step": 287
},
{
"epoch": 1.3403263403263403,
"grad_norm": 2.0041119431302143,
"learning_rate": 7.120238302569245e-06,
"loss": 0.3582,
"mean_token_accuracy": 0.8998216986656189,
"num_tokens": 2689039.0,
"step": 288
},
{
"epoch": 1.3449883449883449,
"grad_norm": 2.1117609014334198,
"learning_rate": 7.097474967282936e-06,
"loss": 0.3538,
"mean_token_accuracy": 0.9006753861904144,
"num_tokens": 2698186.0,
"step": 289
},
{
"epoch": 1.3496503496503496,
"grad_norm": 2.0486984888660893,
"learning_rate": 7.0746647639602994e-06,
"loss": 0.3735,
"mean_token_accuracy": 0.8939989805221558,
"num_tokens": 2707633.0,
"step": 290
},
{
"epoch": 1.3543123543123543,
"grad_norm": 2.0849324489543966,
"learning_rate": 7.051808361825867e-06,
"loss": 0.3615,
"mean_token_accuracy": 0.8953780829906464,
"num_tokens": 2717211.0,
"step": 291
},
{
"epoch": 1.358974358974359,
"grad_norm": 2.0186929363318846,
"learning_rate": 7.028906431459593e-06,
"loss": 0.3721,
"mean_token_accuracy": 0.8980013132095337,
"num_tokens": 2726028.0,
"step": 292
},
{
"epoch": 1.3636363636363638,
"grad_norm": 2.1845476421129746,
"learning_rate": 7.0059596447771714e-06,
"loss": 0.3582,
"mean_token_accuracy": 0.9010344445705414,
"num_tokens": 2735300.0,
"step": 293
},
{
"epoch": 1.3682983682983683,
"grad_norm": 1.895694935116521,
"learning_rate": 6.982968675010332e-06,
"loss": 0.3425,
"mean_token_accuracy": 0.904049277305603,
"num_tokens": 2745014.0,
"step": 294
},
{
"epoch": 1.372960372960373,
"grad_norm": 2.1465561267774618,
"learning_rate": 6.959934196687079e-06,
"loss": 0.3973,
"mean_token_accuracy": 0.8906912803649902,
"num_tokens": 2754679.0,
"step": 295
},
{
"epoch": 1.3776223776223775,
"grad_norm": 2.1936247573101584,
"learning_rate": 6.93685688561191e-06,
"loss": 0.3958,
"mean_token_accuracy": 0.8900346755981445,
"num_tokens": 2763778.0,
"step": 296
},
{
"epoch": 1.3822843822843822,
"grad_norm": 2.2014931147614707,
"learning_rate": 6.913737418845985e-06,
"loss": 0.3484,
"mean_token_accuracy": 0.8970995843410492,
"num_tokens": 2772924.0,
"step": 297
},
{
"epoch": 1.386946386946387,
"grad_norm": 1.9248709270571454,
"learning_rate": 6.890576474687264e-06,
"loss": 0.374,
"mean_token_accuracy": 0.8932155966758728,
"num_tokens": 2782527.0,
"step": 298
},
{
"epoch": 1.3916083916083917,
"grad_norm": 2.127937137873515,
"learning_rate": 6.8673747326506e-06,
"loss": 0.4019,
"mean_token_accuracy": 0.8896147608757019,
"num_tokens": 2792249.0,
"step": 299
},
{
"epoch": 1.3962703962703964,
"grad_norm": 2.2064832449632656,
"learning_rate": 6.8441328734478115e-06,
"loss": 0.3501,
"mean_token_accuracy": 0.8990257680416107,
"num_tokens": 2801881.0,
"step": 300
},
{
"epoch": 1.400932400932401,
"grad_norm": 1.8573619457430928,
"learning_rate": 6.820851578967708e-06,
"loss": 0.3872,
"mean_token_accuracy": 0.8920771181583405,
"num_tokens": 2811478.0,
"step": 301
},
{
"epoch": 1.4055944055944056,
"grad_norm": 2.0254753288605225,
"learning_rate": 6.797531532256079e-06,
"loss": 0.3734,
"mean_token_accuracy": 0.8915270268917084,
"num_tokens": 2821393.0,
"step": 302
},
{
"epoch": 1.4102564102564101,
"grad_norm": 2.0305791181245927,
"learning_rate": 6.774173417495667e-06,
"loss": 0.3913,
"mean_token_accuracy": 0.8924152255058289,
"num_tokens": 2830925.0,
"step": 303
},
{
"epoch": 1.4149184149184149,
"grad_norm": 2.004426882378005,
"learning_rate": 6.750777919986075e-06,
"loss": 0.3446,
"mean_token_accuracy": 0.900405764579773,
"num_tokens": 2840991.0,
"step": 304
},
{
"epoch": 1.4195804195804196,
"grad_norm": 1.888471422341293,
"learning_rate": 6.727345726123684e-06,
"loss": 0.4219,
"mean_token_accuracy": 0.8845990300178528,
"num_tokens": 2850900.0,
"step": 305
},
{
"epoch": 1.4242424242424243,
"grad_norm": 2.3906699897573067,
"learning_rate": 6.703877523381495e-06,
"loss": 0.339,
"mean_token_accuracy": 0.90447598695755,
"num_tokens": 2860353.0,
"step": 306
},
{
"epoch": 1.428904428904429,
"grad_norm": 1.84404450023143,
"learning_rate": 6.680374000288968e-06,
"loss": 0.3976,
"mean_token_accuracy": 0.8878736197948456,
"num_tokens": 2868792.0,
"step": 307
},
{
"epoch": 1.4335664335664335,
"grad_norm": 2.2669491579710908,
"learning_rate": 6.656835846411824e-06,
"loss": 0.3815,
"mean_token_accuracy": 0.892831951379776,
"num_tokens": 2877774.0,
"step": 308
},
{
"epoch": 1.4382284382284383,
"grad_norm": 2.049722778481883,
"learning_rate": 6.633263752331808e-06,
"loss": 0.341,
"mean_token_accuracy": 0.9073670506477356,
"num_tokens": 2886388.0,
"step": 309
},
{
"epoch": 1.4428904428904428,
"grad_norm": 1.9762730207593977,
"learning_rate": 6.609658409626431e-06,
"loss": 0.3336,
"mean_token_accuracy": 0.9044366478919983,
"num_tokens": 2896615.0,
"step": 310
},
{
"epoch": 1.4475524475524475,
"grad_norm": 2.0368692579640504,
"learning_rate": 6.586020510848676e-06,
"loss": 0.3983,
"mean_token_accuracy": 0.8900512158870697,
"num_tokens": 2905800.0,
"step": 311
},
{
"epoch": 1.4522144522144522,
"grad_norm": 2.2134931405904728,
"learning_rate": 6.562350749506691e-06,
"loss": 0.3625,
"mean_token_accuracy": 0.8963052034378052,
"num_tokens": 2914643.0,
"step": 312
},
{
"epoch": 1.456876456876457,
"grad_norm": 2.0611362055815206,
"learning_rate": 6.538649820043427e-06,
"loss": 0.3273,
"mean_token_accuracy": 0.9081335067749023,
"num_tokens": 2924023.0,
"step": 313
},
{
"epoch": 1.4615384615384617,
"grad_norm": 2.054315048122934,
"learning_rate": 6.514918417816275e-06,
"loss": 0.3644,
"mean_token_accuracy": 0.8971601724624634,
"num_tokens": 2933568.0,
"step": 314
},
{
"epoch": 1.4662004662004662,
"grad_norm": 1.8809804843640063,
"learning_rate": 6.4911572390766575e-06,
"loss": 0.3577,
"mean_token_accuracy": 0.9007624089717865,
"num_tokens": 2943220.0,
"step": 315
},
{
"epoch": 1.470862470862471,
"grad_norm": 2.2331671138955405,
"learning_rate": 6.46736698094961e-06,
"loss": 0.3637,
"mean_token_accuracy": 0.8960618078708649,
"num_tokens": 2953179.0,
"step": 316
},
{
"epoch": 1.4755244755244754,
"grad_norm": 1.9954505430707832,
"learning_rate": 6.443548341413316e-06,
"loss": 0.3644,
"mean_token_accuracy": 0.8977023661136627,
"num_tokens": 2961793.0,
"step": 317
},
{
"epoch": 1.4801864801864801,
"grad_norm": 1.9724699365395055,
"learning_rate": 6.419702019278643e-06,
"loss": 0.3758,
"mean_token_accuracy": 0.8944825232028961,
"num_tokens": 2971416.0,
"step": 318
},
{
"epoch": 1.4848484848484849,
"grad_norm": 2.0154500749099387,
"learning_rate": 6.3958287141686294e-06,
"loss": 0.3699,
"mean_token_accuracy": 0.8967875242233276,
"num_tokens": 2980488.0,
"step": 319
},
{
"epoch": 1.4895104895104896,
"grad_norm": 1.9968714801195095,
"learning_rate": 6.371929126497963e-06,
"loss": 0.3482,
"mean_token_accuracy": 0.8994722068309784,
"num_tokens": 2989801.0,
"step": 320
},
{
"epoch": 1.494172494172494,
"grad_norm": 1.8686828557788273,
"learning_rate": 6.348003957452433e-06,
"loss": 0.3408,
"mean_token_accuracy": 0.9057431817054749,
"num_tokens": 2998497.0,
"step": 321
},
{
"epoch": 1.4988344988344988,
"grad_norm": 1.961491939994152,
"learning_rate": 6.324053908968353e-06,
"loss": 0.3708,
"mean_token_accuracy": 0.898443877696991,
"num_tokens": 3008134.0,
"step": 322
},
{
"epoch": 1.5034965034965035,
"grad_norm": 2.074784243247548,
"learning_rate": 6.300079683711973e-06,
"loss": 0.3431,
"mean_token_accuracy": 0.9046167731285095,
"num_tokens": 3017363.0,
"step": 323
},
{
"epoch": 1.508158508158508,
"grad_norm": 2.0249158717108338,
"learning_rate": 6.276081985058857e-06,
"loss": 0.3313,
"mean_token_accuracy": 0.9048315584659576,
"num_tokens": 3026359.0,
"step": 324
},
{
"epoch": 1.5128205128205128,
"grad_norm": 2.0275965811268506,
"learning_rate": 6.2520615170732555e-06,
"loss": 0.364,
"mean_token_accuracy": 0.8976201117038727,
"num_tokens": 3035205.0,
"step": 325
},
{
"epoch": 1.5174825174825175,
"grad_norm": 2.1956087559799706,
"learning_rate": 6.228018984487443e-06,
"loss": 0.4091,
"mean_token_accuracy": 0.8842986524105072,
"num_tokens": 3044415.0,
"step": 326
},
{
"epoch": 1.5221445221445222,
"grad_norm": 2.454246418757256,
"learning_rate": 6.20395509268104e-06,
"loss": 0.3733,
"mean_token_accuracy": 0.8995526134967804,
"num_tokens": 3053666.0,
"step": 327
},
{
"epoch": 1.526806526806527,
"grad_norm": 2.1884276223847343,
"learning_rate": 6.179870547660326e-06,
"loss": 0.3684,
"mean_token_accuracy": 0.8975951671600342,
"num_tokens": 3063098.0,
"step": 328
},
{
"epoch": 1.5314685314685315,
"grad_norm": 2.1597054844225747,
"learning_rate": 6.15576605603752e-06,
"loss": 0.3949,
"mean_token_accuracy": 0.8913069069385529,
"num_tokens": 3072433.0,
"step": 329
},
{
"epoch": 1.5361305361305362,
"grad_norm": 2.113291542241382,
"learning_rate": 6.13164232501005e-06,
"loss": 0.3543,
"mean_token_accuracy": 0.9012714624404907,
"num_tokens": 3081854.0,
"step": 330
},
{
"epoch": 1.5407925407925407,
"grad_norm": 1.9680970926751453,
"learning_rate": 6.107500062339806e-06,
"loss": 0.3338,
"mean_token_accuracy": 0.906892865896225,
"num_tokens": 3090890.0,
"step": 331
},
{
"epoch": 1.5454545454545454,
"grad_norm": 1.9286059557341297,
"learning_rate": 6.083339976332375e-06,
"loss": 0.3927,
"mean_token_accuracy": 0.8934414088726044,
"num_tokens": 3100652.0,
"step": 332
},
{
"epoch": 1.5501165501165501,
"grad_norm": 2.086637851483338,
"learning_rate": 6.05916277581626e-06,
"loss": 0.3607,
"mean_token_accuracy": 0.8994470834732056,
"num_tokens": 3109678.0,
"step": 333
},
{
"epoch": 1.5547785547785549,
"grad_norm": 2.0191393522647414,
"learning_rate": 6.034969170122079e-06,
"loss": 0.3506,
"mean_token_accuracy": 0.9034914076328278,
"num_tokens": 3118426.0,
"step": 334
},
{
"epoch": 1.5594405594405596,
"grad_norm": 2.0591466593650005,
"learning_rate": 6.010759869061768e-06,
"loss": 0.3887,
"mean_token_accuracy": 0.8912419378757477,
"num_tokens": 3128508.0,
"step": 335
},
{
"epoch": 1.564102564102564,
"grad_norm": 1.9767624630895566,
"learning_rate": 5.986535582907739e-06,
"loss": 0.3392,
"mean_token_accuracy": 0.9024395048618317,
"num_tokens": 3138063.0,
"step": 336
},
{
"epoch": 1.5687645687645686,
"grad_norm": 1.968183719788235,
"learning_rate": 5.96229702237205e-06,
"loss": 0.3617,
"mean_token_accuracy": 0.8971932530403137,
"num_tokens": 3147501.0,
"step": 337
},
{
"epoch": 1.5734265734265733,
"grad_norm": 1.9372262604926125,
"learning_rate": 5.938044898585555e-06,
"loss": 0.3583,
"mean_token_accuracy": 0.9022301137447357,
"num_tokens": 3156896.0,
"step": 338
},
{
"epoch": 1.578088578088578,
"grad_norm": 2.007232850048927,
"learning_rate": 5.913779923077035e-06,
"loss": 0.3512,
"mean_token_accuracy": 0.8955269753932953,
"num_tokens": 3166931.0,
"step": 339
},
{
"epoch": 1.5827505827505828,
"grad_norm": 1.906142795024627,
"learning_rate": 5.889502807752329e-06,
"loss": 0.3772,
"mean_token_accuracy": 0.8949233889579773,
"num_tokens": 3176414.0,
"step": 340
},
{
"epoch": 1.5874125874125875,
"grad_norm": 2.1876602349847154,
"learning_rate": 5.865214264873441e-06,
"loss": 0.4217,
"mean_token_accuracy": 0.8859592080116272,
"num_tokens": 3185570.0,
"step": 341
},
{
"epoch": 1.5920745920745922,
"grad_norm": 2.036501182255171,
"learning_rate": 5.840915007037648e-06,
"loss": 0.3967,
"mean_token_accuracy": 0.891197919845581,
"num_tokens": 3194296.0,
"step": 342
},
{
"epoch": 1.5967365967365967,
"grad_norm": 2.165455182761244,
"learning_rate": 5.816605747156588e-06,
"loss": 0.3858,
"mean_token_accuracy": 0.8901144564151764,
"num_tokens": 3204108.0,
"step": 343
},
{
"epoch": 1.6013986013986012,
"grad_norm": 2.020010750738948,
"learning_rate": 5.792287198435349e-06,
"loss": 0.3321,
"mean_token_accuracy": 0.9072842001914978,
"num_tokens": 3213706.0,
"step": 344
},
{
"epoch": 1.606060606060606,
"grad_norm": 1.8267767685145804,
"learning_rate": 5.767960074351545e-06,
"loss": 0.3166,
"mean_token_accuracy": 0.9123204052448273,
"num_tokens": 3223445.0,
"step": 345
},
{
"epoch": 1.6107226107226107,
"grad_norm": 1.8249584358478421,
"learning_rate": 5.74362508863438e-06,
"loss": 0.3557,
"mean_token_accuracy": 0.9038134515285492,
"num_tokens": 3232428.0,
"step": 346
},
{
"epoch": 1.6153846153846154,
"grad_norm": 2.031208096971931,
"learning_rate": 5.719282955243705e-06,
"loss": 0.3729,
"mean_token_accuracy": 0.8925433158874512,
"num_tokens": 3241575.0,
"step": 347
},
{
"epoch": 1.6200466200466201,
"grad_norm": 2.1078269837893817,
"learning_rate": 5.69493438834908e-06,
"loss": 0.3927,
"mean_token_accuracy": 0.89081871509552,
"num_tokens": 3250577.0,
"step": 348
},
{
"epoch": 1.6247086247086249,
"grad_norm": 2.053266844522503,
"learning_rate": 5.670580102308816e-06,
"loss": 0.3708,
"mean_token_accuracy": 0.8976758718490601,
"num_tokens": 3259305.0,
"step": 349
},
{
"epoch": 1.6293706293706294,
"grad_norm": 2.2217560492827486,
"learning_rate": 5.646220811649013e-06,
"loss": 0.3631,
"mean_token_accuracy": 0.9028047621250153,
"num_tokens": 3268502.0,
"step": 350
},
{
"epoch": 1.6340326340326339,
"grad_norm": 1.9993214416997105,
"learning_rate": 5.6218572310426065e-06,
"loss": 0.3751,
"mean_token_accuracy": 0.8933521211147308,
"num_tokens": 3276568.0,
"step": 351
},
{
"epoch": 1.6386946386946386,
"grad_norm": 2.1734690923322115,
"learning_rate": 5.59749007528839e-06,
"loss": 0.3498,
"mean_token_accuracy": 0.9007490277290344,
"num_tokens": 3285881.0,
"step": 352
},
{
"epoch": 1.6433566433566433,
"grad_norm": 1.98059234929291,
"learning_rate": 5.573120059290047e-06,
"loss": 0.3484,
"mean_token_accuracy": 0.9028733670711517,
"num_tokens": 3295490.0,
"step": 353
},
{
"epoch": 1.648018648018648,
"grad_norm": 1.8606757870005566,
"learning_rate": 5.5487478980351805e-06,
"loss": 0.342,
"mean_token_accuracy": 0.9035466313362122,
"num_tokens": 3304645.0,
"step": 354
},
{
"epoch": 1.6526806526806528,
"grad_norm": 2.0090495432050726,
"learning_rate": 5.524374306574331e-06,
"loss": 0.3516,
"mean_token_accuracy": 0.9037186503410339,
"num_tokens": 3313188.0,
"step": 355
},
{
"epoch": 1.6573426573426573,
"grad_norm": 1.9688667212238595,
"learning_rate": 5.500000000000001e-06,
"loss": 0.3673,
"mean_token_accuracy": 0.9001396596431732,
"num_tokens": 3322514.0,
"step": 356
},
{
"epoch": 1.662004662004662,
"grad_norm": 2.1361286046991568,
"learning_rate": 5.47562569342567e-06,
"loss": 0.3553,
"mean_token_accuracy": 0.9002968966960907,
"num_tokens": 3332207.0,
"step": 357
},
{
"epoch": 1.6666666666666665,
"grad_norm": 1.9983765145158914,
"learning_rate": 5.451252101964821e-06,
"loss": 0.345,
"mean_token_accuracy": 0.9027390778064728,
"num_tokens": 3342114.0,
"step": 358
},
{
"epoch": 1.6713286713286712,
"grad_norm": 2.221522630829983,
"learning_rate": 5.426879940709956e-06,
"loss": 0.4167,
"mean_token_accuracy": 0.8860217034816742,
"num_tokens": 3351663.0,
"step": 359
},
{
"epoch": 1.675990675990676,
"grad_norm": 2.351737145138835,
"learning_rate": 5.402509924711612e-06,
"loss": 0.3787,
"mean_token_accuracy": 0.8979602456092834,
"num_tokens": 3360915.0,
"step": 360
},
{
"epoch": 1.6806526806526807,
"grad_norm": 2.2067080103402366,
"learning_rate": 5.378142768957396e-06,
"loss": 0.3694,
"mean_token_accuracy": 0.8943982720375061,
"num_tokens": 3370401.0,
"step": 361
},
{
"epoch": 1.6853146853146854,
"grad_norm": 2.2270263309806366,
"learning_rate": 5.353779188350989e-06,
"loss": 0.3982,
"mean_token_accuracy": 0.8878339529037476,
"num_tokens": 3380711.0,
"step": 362
},
{
"epoch": 1.68997668997669,
"grad_norm": 1.8254660836200485,
"learning_rate": 5.329419897691187e-06,
"loss": 0.3084,
"mean_token_accuracy": 0.9157176613807678,
"num_tokens": 3390032.0,
"step": 363
},
{
"epoch": 1.6946386946386947,
"grad_norm": 1.8835789154592308,
"learning_rate": 5.305065611650921e-06,
"loss": 0.3653,
"mean_token_accuracy": 0.8960134088993073,
"num_tokens": 3399395.0,
"step": 364
},
{
"epoch": 1.6993006993006992,
"grad_norm": 1.9906870579116909,
"learning_rate": 5.280717044756298e-06,
"loss": 0.3656,
"mean_token_accuracy": 0.8999285995960236,
"num_tokens": 3408677.0,
"step": 365
},
{
"epoch": 1.7039627039627039,
"grad_norm": 2.0749533505721245,
"learning_rate": 5.256374911365621e-06,
"loss": 0.3439,
"mean_token_accuracy": 0.9051547646522522,
"num_tokens": 3418478.0,
"step": 366
},
{
"epoch": 1.7086247086247086,
"grad_norm": 1.8643996151955922,
"learning_rate": 5.232039925648457e-06,
"loss": 0.3449,
"mean_token_accuracy": 0.9024885594844818,
"num_tokens": 3427641.0,
"step": 367
},
{
"epoch": 1.7132867132867133,
"grad_norm": 1.8625024767097098,
"learning_rate": 5.207712801564652e-06,
"loss": 0.3371,
"mean_token_accuracy": 0.9041755497455597,
"num_tokens": 3437250.0,
"step": 368
},
{
"epoch": 1.717948717948718,
"grad_norm": 2.0436576954469823,
"learning_rate": 5.1833942528434145e-06,
"loss": 0.3721,
"mean_token_accuracy": 0.8964725732803345,
"num_tokens": 3446807.0,
"step": 369
},
{
"epoch": 1.7226107226107226,
"grad_norm": 1.9839081705779613,
"learning_rate": 5.159084992962354e-06,
"loss": 0.3303,
"mean_token_accuracy": 0.9086009562015533,
"num_tokens": 3456306.0,
"step": 370
},
{
"epoch": 1.7272727272727273,
"grad_norm": 2.0000086910401667,
"learning_rate": 5.13478573512656e-06,
"loss": 0.3667,
"mean_token_accuracy": 0.8962165117263794,
"num_tokens": 3466027.0,
"step": 371
},
{
"epoch": 1.7319347319347318,
"grad_norm": 2.038609761919514,
"learning_rate": 5.110497192247671e-06,
"loss": 0.3434,
"mean_token_accuracy": 0.9010106325149536,
"num_tokens": 3475253.0,
"step": 372
},
{
"epoch": 1.7365967365967365,
"grad_norm": 2.1715737014463894,
"learning_rate": 5.086220076922965e-06,
"loss": 0.3448,
"mean_token_accuracy": 0.9069474339485168,
"num_tokens": 3484178.0,
"step": 373
},
{
"epoch": 1.7412587412587412,
"grad_norm": 1.867996028633842,
"learning_rate": 5.061955101414448e-06,
"loss": 0.3381,
"mean_token_accuracy": 0.9080156981945038,
"num_tokens": 3494001.0,
"step": 374
},
{
"epoch": 1.745920745920746,
"grad_norm": 2.1685787201422837,
"learning_rate": 5.0377029776279514e-06,
"loss": 0.3693,
"mean_token_accuracy": 0.902137279510498,
"num_tokens": 3503472.0,
"step": 375
},
{
"epoch": 1.7505827505827507,
"grad_norm": 2.054942165163828,
"learning_rate": 5.013464417092263e-06,
"loss": 0.3739,
"mean_token_accuracy": 0.8999918103218079,
"num_tokens": 3512786.0,
"step": 376
},
{
"epoch": 1.7552447552447552,
"grad_norm": 2.0866190139943246,
"learning_rate": 4.989240130938232e-06,
"loss": 0.3298,
"mean_token_accuracy": 0.9082687795162201,
"num_tokens": 3522266.0,
"step": 377
},
{
"epoch": 1.75990675990676,
"grad_norm": 1.9164366875851901,
"learning_rate": 4.9650308298779215e-06,
"loss": 0.3728,
"mean_token_accuracy": 0.8957255184650421,
"num_tokens": 3531741.0,
"step": 378
},
{
"epoch": 1.7645687645687644,
"grad_norm": 2.1286294755388386,
"learning_rate": 4.940837224183742e-06,
"loss": 0.3469,
"mean_token_accuracy": 0.9021869897842407,
"num_tokens": 3541026.0,
"step": 379
},
{
"epoch": 1.7692307692307692,
"grad_norm": 1.9795732637258567,
"learning_rate": 4.916660023667627e-06,
"loss": 0.389,
"mean_token_accuracy": 0.8947827517986298,
"num_tokens": 3550336.0,
"step": 380
},
{
"epoch": 1.7738927738927739,
"grad_norm": 2.2291523062610845,
"learning_rate": 4.892499937660195e-06,
"loss": 0.3698,
"mean_token_accuracy": 0.9012640714645386,
"num_tokens": 3558944.0,
"step": 381
},
{
"epoch": 1.7785547785547786,
"grad_norm": 2.142983628191579,
"learning_rate": 4.8683576749899505e-06,
"loss": 0.3583,
"mean_token_accuracy": 0.896765947341919,
"num_tokens": 3567747.0,
"step": 382
},
{
"epoch": 1.7832167832167833,
"grad_norm": 2.1501200195656134,
"learning_rate": 4.844233943962481e-06,
"loss": 0.3476,
"mean_token_accuracy": 0.9035030007362366,
"num_tokens": 3576606.0,
"step": 383
},
{
"epoch": 1.7878787878787878,
"grad_norm": 1.8885760684090536,
"learning_rate": 4.820129452339676e-06,
"loss": 0.3913,
"mean_token_accuracy": 0.8892778158187866,
"num_tokens": 3586451.0,
"step": 384
},
{
"epoch": 1.7925407925407926,
"grad_norm": 2.3749625105377423,
"learning_rate": 4.796044907318961e-06,
"loss": 0.3493,
"mean_token_accuracy": 0.9036368727684021,
"num_tokens": 3595348.0,
"step": 385
},
{
"epoch": 1.797202797202797,
"grad_norm": 2.1616552910281044,
"learning_rate": 4.771981015512559e-06,
"loss": 0.404,
"mean_token_accuracy": 0.888810396194458,
"num_tokens": 3605507.0,
"step": 386
},
{
"epoch": 1.8018648018648018,
"grad_norm": 1.859001299709581,
"learning_rate": 4.747938482926746e-06,
"loss": 0.3422,
"mean_token_accuracy": 0.9032579958438873,
"num_tokens": 3615183.0,
"step": 387
},
{
"epoch": 1.8065268065268065,
"grad_norm": 2.1152225374726337,
"learning_rate": 4.723918014941144e-06,
"loss": 0.4379,
"mean_token_accuracy": 0.8827772438526154,
"num_tokens": 3624158.0,
"step": 388
},
{
"epoch": 1.8111888111888113,
"grad_norm": 2.149586435718605,
"learning_rate": 4.69992031628803e-06,
"loss": 0.3702,
"mean_token_accuracy": 0.8978259861469269,
"num_tokens": 3633033.0,
"step": 389
},
{
"epoch": 1.815850815850816,
"grad_norm": 2.0006781263840843,
"learning_rate": 4.675946091031648e-06,
"loss": 0.3604,
"mean_token_accuracy": 0.901467889547348,
"num_tokens": 3642167.0,
"step": 390
},
{
"epoch": 1.8205128205128205,
"grad_norm": 1.9865318354604984,
"learning_rate": 4.65199604254757e-06,
"loss": 0.3266,
"mean_token_accuracy": 0.902495414018631,
"num_tokens": 3651178.0,
"step": 391
},
{
"epoch": 1.8251748251748252,
"grad_norm": 1.8174794166059944,
"learning_rate": 4.628070873502038e-06,
"loss": 0.3595,
"mean_token_accuracy": 0.8980917930603027,
"num_tokens": 3660853.0,
"step": 392
},
{
"epoch": 1.8298368298368297,
"grad_norm": 1.8237936767860705,
"learning_rate": 4.604171285831373e-06,
"loss": 0.3247,
"mean_token_accuracy": 0.9076565206050873,
"num_tokens": 3671265.0,
"step": 393
},
{
"epoch": 1.8344988344988344,
"grad_norm": 2.967734088138488,
"learning_rate": 4.5802979807213585e-06,
"loss": 0.3284,
"mean_token_accuracy": 0.9076884090900421,
"num_tokens": 3681935.0,
"step": 394
},
{
"epoch": 1.8391608391608392,
"grad_norm": 1.7848473602660753,
"learning_rate": 4.556451658586687e-06,
"loss": 0.3327,
"mean_token_accuracy": 0.9050241410732269,
"num_tokens": 3690809.0,
"step": 395
},
{
"epoch": 1.843822843822844,
"grad_norm": 2.132215578463288,
"learning_rate": 4.532633019050392e-06,
"loss": 0.3344,
"mean_token_accuracy": 0.9007371068000793,
"num_tokens": 3700254.0,
"step": 396
},
{
"epoch": 1.8484848484848486,
"grad_norm": 1.802687602683309,
"learning_rate": 4.508842760923344e-06,
"loss": 0.3319,
"mean_token_accuracy": 0.9053416550159454,
"num_tokens": 3709572.0,
"step": 397
},
{
"epoch": 1.8531468531468531,
"grad_norm": 2.141026355729347,
"learning_rate": 4.4850815821837265e-06,
"loss": 0.3686,
"mean_token_accuracy": 0.8985418379306793,
"num_tokens": 3718477.0,
"step": 398
},
{
"epoch": 1.8578088578088578,
"grad_norm": 2.064794666005344,
"learning_rate": 4.4613501799565755e-06,
"loss": 0.3776,
"mean_token_accuracy": 0.8916601240634918,
"num_tokens": 3728055.0,
"step": 399
},
{
"epoch": 1.8624708624708624,
"grad_norm": 2.3103287859074477,
"learning_rate": 4.43764925049331e-06,
"loss": 0.3689,
"mean_token_accuracy": 0.8986863493919373,
"num_tokens": 3736756.0,
"step": 400
},
{
"epoch": 1.867132867132867,
"grad_norm": 2.1582853507409765,
"learning_rate": 4.413979489151326e-06,
"loss": 0.3664,
"mean_token_accuracy": 0.8990582525730133,
"num_tokens": 3746049.0,
"step": 401
},
{
"epoch": 1.8717948717948718,
"grad_norm": 2.346758495250227,
"learning_rate": 4.3903415903735725e-06,
"loss": 0.3933,
"mean_token_accuracy": 0.893925815820694,
"num_tokens": 3755161.0,
"step": 402
},
{
"epoch": 1.8764568764568765,
"grad_norm": 2.2276831775466626,
"learning_rate": 4.366736247668194e-06,
"loss": 0.3578,
"mean_token_accuracy": 0.9008547365665436,
"num_tokens": 3763950.0,
"step": 403
},
{
"epoch": 1.8811188811188813,
"grad_norm": 2.005267203414508,
"learning_rate": 4.343164153588176e-06,
"loss": 0.3592,
"mean_token_accuracy": 0.8986201584339142,
"num_tokens": 3773509.0,
"step": 404
},
{
"epoch": 1.8857808857808858,
"grad_norm": 2.07305174671755,
"learning_rate": 4.3196259997110326e-06,
"loss": 0.374,
"mean_token_accuracy": 0.8963338732719421,
"num_tokens": 3782734.0,
"step": 405
},
{
"epoch": 1.8904428904428905,
"grad_norm": 1.8657634956210976,
"learning_rate": 4.296122476618507e-06,
"loss": 0.2938,
"mean_token_accuracy": 0.9148504436016083,
"num_tokens": 3792797.0,
"step": 406
},
{
"epoch": 1.895104895104895,
"grad_norm": 1.7854646245845165,
"learning_rate": 4.2726542738763185e-06,
"loss": 0.3506,
"mean_token_accuracy": 0.9033430516719818,
"num_tokens": 3801566.0,
"step": 407
},
{
"epoch": 1.8997668997668997,
"grad_norm": 2.1277604879204297,
"learning_rate": 4.249222080013927e-06,
"loss": 0.4041,
"mean_token_accuracy": 0.8863241672515869,
"num_tokens": 3810506.0,
"step": 408
},
{
"epoch": 1.9044289044289044,
"grad_norm": 2.203068756546148,
"learning_rate": 4.2258265825043365e-06,
"loss": 0.3564,
"mean_token_accuracy": 0.8995753824710846,
"num_tokens": 3819472.0,
"step": 409
},
{
"epoch": 1.9090909090909092,
"grad_norm": 1.9056766783093182,
"learning_rate": 4.202468467743922e-06,
"loss": 0.3437,
"mean_token_accuracy": 0.9032955765724182,
"num_tokens": 3828386.0,
"step": 410
},
{
"epoch": 1.913752913752914,
"grad_norm": 2.0477422656146715,
"learning_rate": 4.1791484210322945e-06,
"loss": 0.3963,
"mean_token_accuracy": 0.8910067081451416,
"num_tokens": 3837876.0,
"step": 411
},
{
"epoch": 1.9184149184149184,
"grad_norm": 2.003989704673306,
"learning_rate": 4.15586712655219e-06,
"loss": 0.3766,
"mean_token_accuracy": 0.894858181476593,
"num_tokens": 3847317.0,
"step": 412
},
{
"epoch": 1.9230769230769231,
"grad_norm": 2.040339338795703,
"learning_rate": 4.1326252673494006e-06,
"loss": 0.3611,
"mean_token_accuracy": 0.9015864133834839,
"num_tokens": 3856516.0,
"step": 413
},
{
"epoch": 1.9277389277389276,
"grad_norm": 1.9997790559031594,
"learning_rate": 4.109423525312738e-06,
"loss": 0.3524,
"mean_token_accuracy": 0.9032379388809204,
"num_tokens": 3866154.0,
"step": 414
},
{
"epoch": 1.9324009324009324,
"grad_norm": 1.8004052846393808,
"learning_rate": 4.086262581154015e-06,
"loss": 0.3496,
"mean_token_accuracy": 0.9037461578845978,
"num_tokens": 3875591.0,
"step": 415
},
{
"epoch": 1.937062937062937,
"grad_norm": 2.0740585166328422,
"learning_rate": 4.0631431143880915e-06,
"loss": 0.3397,
"mean_token_accuracy": 0.9040902256965637,
"num_tokens": 3885411.0,
"step": 416
},
{
"epoch": 1.9417249417249418,
"grad_norm": 2.0199474623334037,
"learning_rate": 4.040065803312921e-06,
"loss": 0.3619,
"mean_token_accuracy": 0.8977847695350647,
"num_tokens": 3893582.0,
"step": 417
},
{
"epoch": 1.9463869463869465,
"grad_norm": 1.9355091980822123,
"learning_rate": 4.017031324989669e-06,
"loss": 0.3382,
"mean_token_accuracy": 0.9016455113887787,
"num_tokens": 3903096.0,
"step": 418
},
{
"epoch": 1.951048951048951,
"grad_norm": 2.0385295249371067,
"learning_rate": 3.994040355222828e-06,
"loss": 0.3558,
"mean_token_accuracy": 0.899684876203537,
"num_tokens": 3912289.0,
"step": 419
},
{
"epoch": 1.9557109557109555,
"grad_norm": 2.1721644040047225,
"learning_rate": 3.971093568540408e-06,
"loss": 0.3711,
"mean_token_accuracy": 0.893602579832077,
"num_tokens": 3921342.0,
"step": 420
},
{
"epoch": 1.9603729603729603,
"grad_norm": 2.173567052204643,
"learning_rate": 3.948191638174135e-06,
"loss": 0.3468,
"mean_token_accuracy": 0.9026708602905273,
"num_tokens": 3930927.0,
"step": 421
},
{
"epoch": 1.965034965034965,
"grad_norm": 2.183705481115713,
"learning_rate": 3.925335236039702e-06,
"loss": 0.3842,
"mean_token_accuracy": 0.8991383612155914,
"num_tokens": 3939964.0,
"step": 422
},
{
"epoch": 1.9696969696969697,
"grad_norm": 1.8559395167069694,
"learning_rate": 3.902525032717067e-06,
"loss": 0.3593,
"mean_token_accuracy": 0.8971259593963623,
"num_tokens": 3949279.0,
"step": 423
},
{
"epoch": 1.9743589743589745,
"grad_norm": 1.9573191314090432,
"learning_rate": 3.879761697430756e-06,
"loss": 0.3379,
"mean_token_accuracy": 0.9067609906196594,
"num_tokens": 3957777.0,
"step": 424
},
{
"epoch": 1.9790209790209792,
"grad_norm": 2.0757659987817423,
"learning_rate": 3.8570458980302526e-06,
"loss": 0.3598,
"mean_token_accuracy": 0.8982405662536621,
"num_tokens": 3967234.0,
"step": 425
},
{
"epoch": 1.9836829836829837,
"grad_norm": 2.1061841538720314,
"learning_rate": 3.834378300970385e-06,
"loss": 0.3595,
"mean_token_accuracy": 0.9008378386497498,
"num_tokens": 3976342.0,
"step": 426
},
{
"epoch": 1.9883449883449882,
"grad_norm": 2.012444035670508,
"learning_rate": 3.811759571291792e-06,
"loss": 0.3734,
"mean_token_accuracy": 0.8925457000732422,
"num_tokens": 3985787.0,
"step": 427
},
{
"epoch": 1.993006993006993,
"grad_norm": 2.0766910903000557,
"learning_rate": 3.789190372601387e-06,
"loss": 0.3746,
"mean_token_accuracy": 0.8980415463447571,
"num_tokens": 3995313.0,
"step": 428
},
{
"epoch": 1.9976689976689976,
"grad_norm": 2.3515299611822744,
"learning_rate": 3.7666713670529153e-06,
"loss": 0.3756,
"mean_token_accuracy": 0.8946518898010254,
"num_tokens": 4004696.0,
"step": 429
},
{
"epoch": 2.0,
"grad_norm": 2.8965666457221966,
"learning_rate": 3.7442032153275053e-06,
"loss": 0.2816,
"mean_token_accuracy": 0.9159619212150574,
"num_tokens": 4006594.0,
"step": 430
},
{
"epoch": 2.0046620046620047,
"grad_norm": 1.9140541750140387,
"learning_rate": 3.7217865766143014e-06,
"loss": 0.2339,
"mean_token_accuracy": 0.9410516619682312,
"num_tokens": 4016177.0,
"step": 431
},
{
"epoch": 2.0093240093240095,
"grad_norm": 1.9667546418678483,
"learning_rate": 3.6994221085911107e-06,
"loss": 0.2227,
"mean_token_accuracy": 0.9423635900020599,
"num_tokens": 4025325.0,
"step": 432
},
{
"epoch": 2.013986013986014,
"grad_norm": 1.9133395961990365,
"learning_rate": 3.6771104674051184e-06,
"loss": 0.2103,
"mean_token_accuracy": 0.9473540186882019,
"num_tokens": 4034580.0,
"step": 433
},
{
"epoch": 2.0186480186480185,
"grad_norm": 2.010682976434144,
"learning_rate": 3.654852307653628e-06,
"loss": 0.2033,
"mean_token_accuracy": 0.9448116421699524,
"num_tokens": 4043927.0,
"step": 434
},
{
"epoch": 2.023310023310023,
"grad_norm": 2.264710867597396,
"learning_rate": 3.6326482823648656e-06,
"loss": 0.1945,
"mean_token_accuracy": 0.9458509981632233,
"num_tokens": 4052748.0,
"step": 435
},
{
"epoch": 2.027972027972028,
"grad_norm": 3.3055471689921445,
"learning_rate": 3.6104990429788102e-06,
"loss": 0.1809,
"mean_token_accuracy": 0.9497792422771454,
"num_tokens": 4062547.0,
"step": 436
},
{
"epoch": 2.0326340326340326,
"grad_norm": 3.545040453235985,
"learning_rate": 3.588405239328091e-06,
"loss": 0.1825,
"mean_token_accuracy": 0.9520241916179657,
"num_tokens": 4072151.0,
"step": 437
},
{
"epoch": 2.0372960372960374,
"grad_norm": 4.174805075882892,
"learning_rate": 3.5663675196189184e-06,
"loss": 0.2058,
"mean_token_accuracy": 0.9453835189342499,
"num_tokens": 4081740.0,
"step": 438
},
{
"epoch": 2.041958041958042,
"grad_norm": 2.8755369630384786,
"learning_rate": 3.5443865304120607e-06,
"loss": 0.1838,
"mean_token_accuracy": 0.9492034912109375,
"num_tokens": 4090794.0,
"step": 439
},
{
"epoch": 2.046620046620047,
"grad_norm": 2.7985718626593763,
"learning_rate": 3.522462916603887e-06,
"loss": 0.2009,
"mean_token_accuracy": 0.9445820152759552,
"num_tokens": 4099715.0,
"step": 440
},
{
"epoch": 2.051282051282051,
"grad_norm": 2.307748489794371,
"learning_rate": 3.500597321407435e-06,
"loss": 0.1825,
"mean_token_accuracy": 0.9495159089565277,
"num_tokens": 4109312.0,
"step": 441
},
{
"epoch": 2.055944055944056,
"grad_norm": 2.167026623774839,
"learning_rate": 3.478790386333546e-06,
"loss": 0.2046,
"mean_token_accuracy": 0.9436551332473755,
"num_tokens": 4118384.0,
"step": 442
},
{
"epoch": 2.0606060606060606,
"grad_norm": 2.197047391901218,
"learning_rate": 3.45704275117204e-06,
"loss": 0.2117,
"mean_token_accuracy": 0.9415621161460876,
"num_tokens": 4127809.0,
"step": 443
},
{
"epoch": 2.0652680652680653,
"grad_norm": 1.944924615630873,
"learning_rate": 3.435355053972953e-06,
"loss": 0.1827,
"mean_token_accuracy": 0.9510793387889862,
"num_tokens": 4137712.0,
"step": 444
},
{
"epoch": 2.06993006993007,
"grad_norm": 2.3278118252307975,
"learning_rate": 3.4137279310278054e-06,
"loss": 0.2021,
"mean_token_accuracy": 0.9442847967147827,
"num_tokens": 4148072.0,
"step": 445
},
{
"epoch": 2.0745920745920747,
"grad_norm": 2.094002783532905,
"learning_rate": 3.392162016850945e-06,
"loss": 0.1972,
"mean_token_accuracy": 0.9459817409515381,
"num_tokens": 4158042.0,
"step": 446
},
{
"epoch": 2.0792540792540795,
"grad_norm": 2.142035620280267,
"learning_rate": 3.3706579441609224e-06,
"loss": 0.1887,
"mean_token_accuracy": 0.9461734592914581,
"num_tokens": 4167033.0,
"step": 447
},
{
"epoch": 2.0839160839160837,
"grad_norm": 2.4332724683825884,
"learning_rate": 3.3492163438619362e-06,
"loss": 0.2023,
"mean_token_accuracy": 0.9443434476852417,
"num_tokens": 4176760.0,
"step": 448
},
{
"epoch": 2.0885780885780885,
"grad_norm": 2.228646127092191,
"learning_rate": 3.327837845025315e-06,
"loss": 0.1917,
"mean_token_accuracy": 0.9497533142566681,
"num_tokens": 4186226.0,
"step": 449
},
{
"epoch": 2.093240093240093,
"grad_norm": 2.459774250577539,
"learning_rate": 3.3065230748710646e-06,
"loss": 0.2124,
"mean_token_accuracy": 0.9428280293941498,
"num_tokens": 4195499.0,
"step": 450
},
{
"epoch": 2.097902097902098,
"grad_norm": 2.4038605044962686,
"learning_rate": 3.2852726587494673e-06,
"loss": 0.1825,
"mean_token_accuracy": 0.9510272741317749,
"num_tokens": 4204395.0,
"step": 451
},
{
"epoch": 2.1025641025641026,
"grad_norm": 2.4928140282417517,
"learning_rate": 3.2640872201227364e-06,
"loss": 0.1967,
"mean_token_accuracy": 0.9442079365253448,
"num_tokens": 4213737.0,
"step": 452
},
{
"epoch": 2.1072261072261074,
"grad_norm": 2.256378296799341,
"learning_rate": 3.242967380546717e-06,
"loss": 0.1826,
"mean_token_accuracy": 0.947964072227478,
"num_tokens": 4223235.0,
"step": 453
},
{
"epoch": 2.111888111888112,
"grad_norm": 2.1809559104100487,
"learning_rate": 3.221913759652657e-06,
"loss": 0.1738,
"mean_token_accuracy": 0.9498437345027924,
"num_tokens": 4232422.0,
"step": 454
},
{
"epoch": 2.1165501165501164,
"grad_norm": 2.2886117964470643,
"learning_rate": 3.20092697512903e-06,
"loss": 0.2023,
"mean_token_accuracy": 0.9447461664676666,
"num_tokens": 4241879.0,
"step": 455
},
{
"epoch": 2.121212121212121,
"grad_norm": 2.212820158000472,
"learning_rate": 3.180007642703402e-06,
"loss": 0.1865,
"mean_token_accuracy": 0.9497852921485901,
"num_tokens": 4251371.0,
"step": 456
},
{
"epoch": 2.125874125874126,
"grad_norm": 2.2290617079488553,
"learning_rate": 3.1591563761243816e-06,
"loss": 0.1904,
"mean_token_accuracy": 0.9462641477584839,
"num_tokens": 4260491.0,
"step": 457
},
{
"epoch": 2.1305361305361306,
"grad_norm": 2.143157870337979,
"learning_rate": 3.138373787143598e-06,
"loss": 0.1825,
"mean_token_accuracy": 0.9492957592010498,
"num_tokens": 4269810.0,
"step": 458
},
{
"epoch": 2.1351981351981353,
"grad_norm": 2.38741737774062,
"learning_rate": 3.1176604854977665e-06,
"loss": 0.1902,
"mean_token_accuracy": 0.9479398429393768,
"num_tokens": 4279116.0,
"step": 459
},
{
"epoch": 2.13986013986014,
"grad_norm": 2.3435622778544203,
"learning_rate": 3.0970170788907878e-06,
"loss": 0.2016,
"mean_token_accuracy": 0.9475513100624084,
"num_tokens": 4288073.0,
"step": 460
},
{
"epoch": 2.1445221445221447,
"grad_norm": 2.338474318951203,
"learning_rate": 3.076444172975932e-06,
"loss": 0.1734,
"mean_token_accuracy": 0.952549546957016,
"num_tokens": 4296974.0,
"step": 461
},
{
"epoch": 2.149184149184149,
"grad_norm": 2.5215652212493653,
"learning_rate": 3.055942371338052e-06,
"loss": 0.199,
"mean_token_accuracy": 0.9409568309783936,
"num_tokens": 4306323.0,
"step": 462
},
{
"epoch": 2.1538461538461537,
"grad_norm": 2.327317690460305,
"learning_rate": 3.035512275475896e-06,
"loss": 0.1827,
"mean_token_accuracy": 0.9496185481548309,
"num_tokens": 4315250.0,
"step": 463
},
{
"epoch": 2.1585081585081585,
"grad_norm": 2.210597087672196,
"learning_rate": 3.0151544847844385e-06,
"loss": 0.1742,
"mean_token_accuracy": 0.9511962532997131,
"num_tokens": 4324195.0,
"step": 464
},
{
"epoch": 2.163170163170163,
"grad_norm": 2.3071386492578045,
"learning_rate": 2.994869596537316e-06,
"loss": 0.1983,
"mean_token_accuracy": 0.9448690712451935,
"num_tokens": 4333101.0,
"step": 465
},
{
"epoch": 2.167832167832168,
"grad_norm": 2.2596793138798104,
"learning_rate": 2.9746582058692803e-06,
"loss": 0.1803,
"mean_token_accuracy": 0.9491128325462341,
"num_tokens": 4341626.0,
"step": 466
},
{
"epoch": 2.1724941724941726,
"grad_norm": 2.2305243738775964,
"learning_rate": 2.954520905758762e-06,
"loss": 0.1935,
"mean_token_accuracy": 0.9493462145328522,
"num_tokens": 4350773.0,
"step": 467
},
{
"epoch": 2.177156177156177,
"grad_norm": 1.8784409884434285,
"learning_rate": 2.934458287010455e-06,
"loss": 0.1608,
"mean_token_accuracy": 0.9552096724510193,
"num_tokens": 4360531.0,
"step": 468
},
{
"epoch": 2.1818181818181817,
"grad_norm": 2.419757102541458,
"learning_rate": 2.9144709382379955e-06,
"loss": 0.1929,
"mean_token_accuracy": 0.9458503425121307,
"num_tokens": 4370583.0,
"step": 469
},
{
"epoch": 2.1864801864801864,
"grad_norm": 2.1959363632762354,
"learning_rate": 2.894559445846682e-06,
"loss": 0.1891,
"mean_token_accuracy": 0.9474013149738312,
"num_tokens": 4380422.0,
"step": 470
},
{
"epoch": 2.191142191142191,
"grad_norm": 2.4899530178247877,
"learning_rate": 2.8747243940162774e-06,
"loss": 0.2115,
"mean_token_accuracy": 0.9412341713905334,
"num_tokens": 4389278.0,
"step": 471
},
{
"epoch": 2.195804195804196,
"grad_norm": 2.4399237002135146,
"learning_rate": 2.854966364683872e-06,
"loss": 0.1853,
"mean_token_accuracy": 0.9496332406997681,
"num_tokens": 4397770.0,
"step": 472
},
{
"epoch": 2.2004662004662006,
"grad_norm": 2.215224089050249,
"learning_rate": 2.835285937526801e-06,
"loss": 0.1783,
"mean_token_accuracy": 0.9523276388645172,
"num_tokens": 4407968.0,
"step": 473
},
{
"epoch": 2.2051282051282053,
"grad_norm": 2.2820905304008647,
"learning_rate": 2.8156836899456475e-06,
"loss": 0.1972,
"mean_token_accuracy": 0.9460384547710419,
"num_tokens": 4417210.0,
"step": 474
},
{
"epoch": 2.20979020979021,
"grad_norm": 2.261724927462342,
"learning_rate": 2.796160197047294e-06,
"loss": 0.1754,
"mean_token_accuracy": 0.9524807631969452,
"num_tokens": 4427199.0,
"step": 475
},
{
"epoch": 2.2144522144522143,
"grad_norm": 2.2910100819309966,
"learning_rate": 2.7767160316280583e-06,
"loss": 0.187,
"mean_token_accuracy": 0.9480907320976257,
"num_tokens": 4435729.0,
"step": 476
},
{
"epoch": 2.219114219114219,
"grad_norm": 2.5356260307985727,
"learning_rate": 2.757351764156877e-06,
"loss": 0.1898,
"mean_token_accuracy": 0.9483682215213776,
"num_tokens": 4444444.0,
"step": 477
},
{
"epoch": 2.2237762237762237,
"grad_norm": 2.185337315098767,
"learning_rate": 2.7380679627585817e-06,
"loss": 0.1746,
"mean_token_accuracy": 0.9521575570106506,
"num_tokens": 4453166.0,
"step": 478
},
{
"epoch": 2.2284382284382285,
"grad_norm": 2.379831452455389,
"learning_rate": 2.718865193197218e-06,
"loss": 0.1947,
"mean_token_accuracy": 0.9449348151683807,
"num_tokens": 4462042.0,
"step": 479
},
{
"epoch": 2.233100233100233,
"grad_norm": 2.171897352199924,
"learning_rate": 2.699744018859457e-06,
"loss": 0.1717,
"mean_token_accuracy": 0.9506651163101196,
"num_tokens": 4471400.0,
"step": 480
},
{
"epoch": 2.237762237762238,
"grad_norm": 2.216542641182696,
"learning_rate": 2.680705000738061e-06,
"loss": 0.1691,
"mean_token_accuracy": 0.9513165950775146,
"num_tokens": 4481107.0,
"step": 481
},
{
"epoch": 2.242424242424242,
"grad_norm": 2.1820463594567134,
"learning_rate": 2.661748697415423e-06,
"loss": 0.1753,
"mean_token_accuracy": 0.94921013712883,
"num_tokens": 4490865.0,
"step": 482
},
{
"epoch": 2.247086247086247,
"grad_norm": 2.3975273161711868,
"learning_rate": 2.642875665047182e-06,
"loss": 0.2074,
"mean_token_accuracy": 0.9420484900474548,
"num_tokens": 4500700.0,
"step": 483
},
{
"epoch": 2.2517482517482517,
"grad_norm": 2.3576310035425885,
"learning_rate": 2.6240864573459095e-06,
"loss": 0.1933,
"mean_token_accuracy": 0.948281466960907,
"num_tokens": 4509781.0,
"step": 484
},
{
"epoch": 2.2564102564102564,
"grad_norm": 2.1072685233889783,
"learning_rate": 2.6053816255648543e-06,
"loss": 0.1791,
"mean_token_accuracy": 0.9498388171195984,
"num_tokens": 4519055.0,
"step": 485
},
{
"epoch": 2.261072261072261,
"grad_norm": 2.3694467761113365,
"learning_rate": 2.586761718481776e-06,
"loss": 0.2016,
"mean_token_accuracy": 0.9420836865901947,
"num_tokens": 4528368.0,
"step": 486
},
{
"epoch": 2.265734265734266,
"grad_norm": 2.354865943769181,
"learning_rate": 2.5682272823828474e-06,
"loss": 0.195,
"mean_token_accuracy": 0.9475519955158234,
"num_tokens": 4537216.0,
"step": 487
},
{
"epoch": 2.2703962703962706,
"grad_norm": 2.0798315233476146,
"learning_rate": 2.5497788610466177e-06,
"loss": 0.1775,
"mean_token_accuracy": 0.9497499167919159,
"num_tokens": 4546595.0,
"step": 488
},
{
"epoch": 2.2750582750582753,
"grad_norm": 2.245906905354928,
"learning_rate": 2.53141699572807e-06,
"loss": 0.1873,
"mean_token_accuracy": 0.9471332430839539,
"num_tokens": 4555647.0,
"step": 489
},
{
"epoch": 2.2797202797202796,
"grad_norm": 2.3166573667243973,
"learning_rate": 2.5131422251427313e-06,
"loss": 0.179,
"mean_token_accuracy": 0.9487544298171997,
"num_tokens": 4565487.0,
"step": 490
},
{
"epoch": 2.2843822843822843,
"grad_norm": 2.356746974573123,
"learning_rate": 2.4949550854508713e-06,
"loss": 0.1977,
"mean_token_accuracy": 0.9462016522884369,
"num_tokens": 4574193.0,
"step": 491
},
{
"epoch": 2.289044289044289,
"grad_norm": 2.3601704690415213,
"learning_rate": 2.476856110241773e-06,
"loss": 0.1963,
"mean_token_accuracy": 0.9484397768974304,
"num_tokens": 4583652.0,
"step": 492
},
{
"epoch": 2.2937062937062938,
"grad_norm": 2.194739252262633,
"learning_rate": 2.458845830518082e-06,
"loss": 0.1808,
"mean_token_accuracy": 0.948834627866745,
"num_tokens": 4593314.0,
"step": 493
},
{
"epoch": 2.2983682983682985,
"grad_norm": 2.262428951601218,
"learning_rate": 2.440924774680215e-06,
"loss": 0.196,
"mean_token_accuracy": 0.9466191530227661,
"num_tokens": 4602528.0,
"step": 494
},
{
"epoch": 2.303030303030303,
"grad_norm": 2.2737186057132246,
"learning_rate": 2.4230934685108707e-06,
"loss": 0.1959,
"mean_token_accuracy": 0.948832631111145,
"num_tokens": 4611387.0,
"step": 495
},
{
"epoch": 2.3076923076923075,
"grad_norm": 2.19558751291413,
"learning_rate": 2.405352435159595e-06,
"loss": 0.1832,
"mean_token_accuracy": 0.9490224421024323,
"num_tokens": 4620809.0,
"step": 496
},
{
"epoch": 2.312354312354312,
"grad_norm": 2.222802090174067,
"learning_rate": 2.3877021951274374e-06,
"loss": 0.1911,
"mean_token_accuracy": 0.9486294388771057,
"num_tokens": 4629397.0,
"step": 497
},
{
"epoch": 2.317016317016317,
"grad_norm": 2.206389027273452,
"learning_rate": 2.3701432662516772e-06,
"loss": 0.1727,
"mean_token_accuracy": 0.9500547051429749,
"num_tokens": 4638953.0,
"step": 498
},
{
"epoch": 2.3216783216783217,
"grad_norm": 2.5956601776793056,
"learning_rate": 2.3526761636906313e-06,
"loss": 0.1849,
"mean_token_accuracy": 0.9497621655464172,
"num_tokens": 4648328.0,
"step": 499
},
{
"epoch": 2.3263403263403264,
"grad_norm": 2.222447522693371,
"learning_rate": 2.3353013999085402e-06,
"loss": 0.1878,
"mean_token_accuracy": 0.9493353068828583,
"num_tokens": 4658253.0,
"step": 500
},
{
"epoch": 2.331002331002331,
"grad_norm": 2.1725903860996265,
"learning_rate": 2.3180194846605367e-06,
"loss": 0.1731,
"mean_token_accuracy": 0.9540884494781494,
"num_tokens": 4667108.0,
"step": 501
},
{
"epoch": 2.335664335664336,
"grad_norm": 2.2106617960868546,
"learning_rate": 2.300830924977683e-06,
"loss": 0.1827,
"mean_token_accuracy": 0.9487708210945129,
"num_tokens": 4675537.0,
"step": 502
},
{
"epoch": 2.3403263403263406,
"grad_norm": 2.26357210261458,
"learning_rate": 2.283736225152099e-06,
"loss": 0.1781,
"mean_token_accuracy": 0.9498123228549957,
"num_tokens": 4684963.0,
"step": 503
},
{
"epoch": 2.344988344988345,
"grad_norm": 2.2970053926748366,
"learning_rate": 2.26673588672217e-06,
"loss": 0.1947,
"mean_token_accuracy": 0.9470812678337097,
"num_tokens": 4694167.0,
"step": 504
},
{
"epoch": 2.3496503496503496,
"grad_norm": 2.180761219218081,
"learning_rate": 2.249830408457826e-06,
"loss": 0.1734,
"mean_token_accuracy": 0.9532337486743927,
"num_tokens": 4704455.0,
"step": 505
},
{
"epoch": 2.3543123543123543,
"grad_norm": 2.1592931519499854,
"learning_rate": 2.2330202863459123e-06,
"loss": 0.18,
"mean_token_accuracy": 0.9502027928829193,
"num_tokens": 4714149.0,
"step": 506
},
{
"epoch": 2.358974358974359,
"grad_norm": 2.3049252234685382,
"learning_rate": 2.2163060135756364e-06,
"loss": 0.193,
"mean_token_accuracy": 0.9451474845409393,
"num_tokens": 4724018.0,
"step": 507
},
{
"epoch": 2.3636363636363638,
"grad_norm": 2.047360241682356,
"learning_rate": 2.1996880805241e-06,
"loss": 0.1603,
"mean_token_accuracy": 0.9543373584747314,
"num_tokens": 4733728.0,
"step": 508
},
{
"epoch": 2.3682983682983685,
"grad_norm": 2.243883991122542,
"learning_rate": 2.1831669747419093e-06,
"loss": 0.1895,
"mean_token_accuracy": 0.9504224061965942,
"num_tokens": 4743424.0,
"step": 509
},
{
"epoch": 2.3729603729603728,
"grad_norm": 2.0125158383022383,
"learning_rate": 2.166743180938875e-06,
"loss": 0.1592,
"mean_token_accuracy": 0.9573898613452911,
"num_tokens": 4753172.0,
"step": 510
},
{
"epoch": 2.3776223776223775,
"grad_norm": 1.9899532151234143,
"learning_rate": 2.150417180969784e-06,
"loss": 0.1689,
"mean_token_accuracy": 0.9538862705230713,
"num_tokens": 4763449.0,
"step": 511
},
{
"epoch": 2.382284382284382,
"grad_norm": 2.1855347374072602,
"learning_rate": 2.1341894538202726e-06,
"loss": 0.1933,
"mean_token_accuracy": 0.9471964538097382,
"num_tokens": 4773131.0,
"step": 512
},
{
"epoch": 2.386946386946387,
"grad_norm": 2.1191232295552638,
"learning_rate": 2.1180604755927646e-06,
"loss": 0.1705,
"mean_token_accuracy": 0.9518732130527496,
"num_tokens": 4782766.0,
"step": 513
},
{
"epoch": 2.3916083916083917,
"grad_norm": 2.1956924392039268,
"learning_rate": 2.102030719492508e-06,
"loss": 0.1917,
"mean_token_accuracy": 0.9490616321563721,
"num_tokens": 4792428.0,
"step": 514
},
{
"epoch": 2.3962703962703964,
"grad_norm": 2.2810533470677705,
"learning_rate": 2.086100655813688e-06,
"loss": 0.1862,
"mean_token_accuracy": 0.9471859931945801,
"num_tokens": 4801662.0,
"step": 515
},
{
"epoch": 2.400932400932401,
"grad_norm": 2.244633860144498,
"learning_rate": 2.0702707519256365e-06,
"loss": 0.18,
"mean_token_accuracy": 0.9517810344696045,
"num_tokens": 4810704.0,
"step": 516
},
{
"epoch": 2.4055944055944054,
"grad_norm": 2.494051955620123,
"learning_rate": 2.0545414722591096e-06,
"loss": 0.1864,
"mean_token_accuracy": 0.9491457939147949,
"num_tokens": 4819473.0,
"step": 517
},
{
"epoch": 2.41025641025641,
"grad_norm": 2.0714876270735867,
"learning_rate": 2.03891327829267e-06,
"loss": 0.1611,
"mean_token_accuracy": 0.9556960761547089,
"num_tokens": 4829237.0,
"step": 518
},
{
"epoch": 2.414918414918415,
"grad_norm": 2.1368173991326387,
"learning_rate": 2.0233866285391455e-06,
"loss": 0.175,
"mean_token_accuracy": 0.9533757269382477,
"num_tokens": 4838770.0,
"step": 519
},
{
"epoch": 2.4195804195804196,
"grad_norm": 2.4559899729586157,
"learning_rate": 2.0079619785321713e-06,
"loss": 0.178,
"mean_token_accuracy": 0.9503377377986908,
"num_tokens": 4848313.0,
"step": 520
},
{
"epoch": 2.4242424242424243,
"grad_norm": 2.408883608780403,
"learning_rate": 1.992639780812838e-06,
"loss": 0.1797,
"mean_token_accuracy": 0.9508785903453827,
"num_tokens": 4856905.0,
"step": 521
},
{
"epoch": 2.428904428904429,
"grad_norm": 2.4178249725991776,
"learning_rate": 1.9774204849164004e-06,
"loss": 0.1909,
"mean_token_accuracy": 0.9491060078144073,
"num_tokens": 4866602.0,
"step": 522
},
{
"epoch": 2.4335664335664333,
"grad_norm": 2.1757038919566085,
"learning_rate": 1.9623045373590955e-06,
"loss": 0.1734,
"mean_token_accuracy": 0.9516046643257141,
"num_tokens": 4875823.0,
"step": 523
},
{
"epoch": 2.438228438228438,
"grad_norm": 2.27555860775744,
"learning_rate": 1.9472923816250427e-06,
"loss": 0.1869,
"mean_token_accuracy": 0.9469164311885834,
"num_tokens": 4885891.0,
"step": 524
},
{
"epoch": 2.4428904428904428,
"grad_norm": 2.3720687067635517,
"learning_rate": 1.9323844581532334e-06,
"loss": 0.1901,
"mean_token_accuracy": 0.9472060799598694,
"num_tokens": 4895721.0,
"step": 525
},
{
"epoch": 2.4475524475524475,
"grad_norm": 2.3467414380479483,
"learning_rate": 1.9175812043246034e-06,
"loss": 0.1904,
"mean_token_accuracy": 0.9455529749393463,
"num_tokens": 4905678.0,
"step": 526
},
{
"epoch": 2.4522144522144522,
"grad_norm": 2.1506573419595045,
"learning_rate": 1.9028830544492074e-06,
"loss": 0.1903,
"mean_token_accuracy": 0.948146402835846,
"num_tokens": 4915087.0,
"step": 527
},
{
"epoch": 2.456876456876457,
"grad_norm": 2.379645454246707,
"learning_rate": 1.8882904397534705e-06,
"loss": 0.2152,
"mean_token_accuracy": 0.9423483908176422,
"num_tokens": 4924020.0,
"step": 528
},
{
"epoch": 2.4615384615384617,
"grad_norm": 2.264126564788235,
"learning_rate": 1.8738037883675445e-06,
"loss": 0.1844,
"mean_token_accuracy": 0.9492262303829193,
"num_tokens": 4933023.0,
"step": 529
},
{
"epoch": 2.4662004662004664,
"grad_norm": 2.190357275615204,
"learning_rate": 1.8594235253127373e-06,
"loss": 0.1996,
"mean_token_accuracy": 0.9468533098697662,
"num_tokens": 4941720.0,
"step": 530
},
{
"epoch": 2.4708624708624707,
"grad_norm": 2.0683527870115483,
"learning_rate": 1.8451500724890509e-06,
"loss": 0.1852,
"mean_token_accuracy": 0.9456667006015778,
"num_tokens": 4951562.0,
"step": 531
},
{
"epoch": 2.4755244755244754,
"grad_norm": 2.3584045584131617,
"learning_rate": 1.8309838486627995e-06,
"loss": 0.2036,
"mean_token_accuracy": 0.94382044672966,
"num_tokens": 4960478.0,
"step": 532
},
{
"epoch": 2.48018648018648,
"grad_norm": 2.1234920339277044,
"learning_rate": 1.816925269454327e-06,
"loss": 0.1666,
"mean_token_accuracy": 0.9552336037158966,
"num_tokens": 4970160.0,
"step": 533
},
{
"epoch": 2.484848484848485,
"grad_norm": 2.3345270471564303,
"learning_rate": 1.8029747473258092e-06,
"loss": 0.1859,
"mean_token_accuracy": 0.9483939707279205,
"num_tokens": 4979864.0,
"step": 534
},
{
"epoch": 2.4895104895104896,
"grad_norm": 2.407762000225371,
"learning_rate": 1.789132691569153e-06,
"loss": 0.195,
"mean_token_accuracy": 0.9456151127815247,
"num_tokens": 4988667.0,
"step": 535
},
{
"epoch": 2.4941724941724943,
"grad_norm": 2.347393889553957,
"learning_rate": 1.7753995082939932e-06,
"loss": 0.1732,
"mean_token_accuracy": 0.9526363909244537,
"num_tokens": 4998687.0,
"step": 536
},
{
"epoch": 2.4988344988344986,
"grad_norm": 2.5390123853320445,
"learning_rate": 1.7617756004157693e-06,
"loss": 0.1987,
"mean_token_accuracy": 0.945299506187439,
"num_tokens": 5007734.0,
"step": 537
},
{
"epoch": 2.5034965034965033,
"grad_norm": 2.438556510312228,
"learning_rate": 1.7482613676439153e-06,
"loss": 0.1943,
"mean_token_accuracy": 0.9447510838508606,
"num_tokens": 5016545.0,
"step": 538
},
{
"epoch": 2.508158508158508,
"grad_norm": 2.3154480135315194,
"learning_rate": 1.7348572064701188e-06,
"loss": 0.1947,
"mean_token_accuracy": 0.9469634890556335,
"num_tokens": 5025652.0,
"step": 539
},
{
"epoch": 2.5128205128205128,
"grad_norm": 2.149785741270873,
"learning_rate": 1.721563510156704e-06,
"loss": 0.1718,
"mean_token_accuracy": 0.9520467817783356,
"num_tokens": 5035567.0,
"step": 540
},
{
"epoch": 2.5174825174825175,
"grad_norm": 2.4390965002580463,
"learning_rate": 1.7083806687250795e-06,
"loss": 0.1999,
"mean_token_accuracy": 0.947026789188385,
"num_tokens": 5044442.0,
"step": 541
},
{
"epoch": 2.5221445221445222,
"grad_norm": 2.294881333986716,
"learning_rate": 1.6953090689443074e-06,
"loss": 0.1868,
"mean_token_accuracy": 0.9499901831150055,
"num_tokens": 5053663.0,
"step": 542
},
{
"epoch": 2.526806526806527,
"grad_norm": 2.2057605088088197,
"learning_rate": 1.6823490943197473e-06,
"loss": 0.1719,
"mean_token_accuracy": 0.9531058371067047,
"num_tokens": 5063344.0,
"step": 543
},
{
"epoch": 2.5314685314685317,
"grad_norm": 2.4507085825396007,
"learning_rate": 1.6695011250818094e-06,
"loss": 0.1717,
"mean_token_accuracy": 0.9547788798809052,
"num_tokens": 5073038.0,
"step": 544
},
{
"epoch": 2.5361305361305364,
"grad_norm": 2.3638435763054493,
"learning_rate": 1.6567655381747976e-06,
"loss": 0.1927,
"mean_token_accuracy": 0.9466958940029144,
"num_tokens": 5081603.0,
"step": 545
},
{
"epoch": 2.5407925407925407,
"grad_norm": 2.32752114137372,
"learning_rate": 1.6441427072458493e-06,
"loss": 0.1958,
"mean_token_accuracy": 0.9444697499275208,
"num_tokens": 5091091.0,
"step": 546
},
{
"epoch": 2.5454545454545454,
"grad_norm": 2.509857709791671,
"learning_rate": 1.6316330026339743e-06,
"loss": 0.213,
"mean_token_accuracy": 0.941786378622055,
"num_tokens": 5100278.0,
"step": 547
},
{
"epoch": 2.55011655011655,
"grad_norm": 2.0952838106121914,
"learning_rate": 1.6192367913591916e-06,
"loss": 0.1653,
"mean_token_accuracy": 0.9539439380168915,
"num_tokens": 5110291.0,
"step": 548
},
{
"epoch": 2.554778554778555,
"grad_norm": 2.4251986988481704,
"learning_rate": 1.6069544371117556e-06,
"loss": 0.1737,
"mean_token_accuracy": 0.9515743851661682,
"num_tokens": 5119413.0,
"step": 549
},
{
"epoch": 2.5594405594405596,
"grad_norm": 2.2675536817510276,
"learning_rate": 1.5947863002414938e-06,
"loss": 0.1816,
"mean_token_accuracy": 0.9483968019485474,
"num_tokens": 5128968.0,
"step": 550
},
{
"epoch": 2.564102564102564,
"grad_norm": 2.225047159227379,
"learning_rate": 1.5827327377472262e-06,
"loss": 0.1872,
"mean_token_accuracy": 0.9502107799053192,
"num_tokens": 5138040.0,
"step": 551
},
{
"epoch": 2.5687645687645686,
"grad_norm": 2.5646265404291033,
"learning_rate": 1.5707941032662967e-06,
"loss": 0.1941,
"mean_token_accuracy": 0.9476030170917511,
"num_tokens": 5147278.0,
"step": 552
},
{
"epoch": 2.5734265734265733,
"grad_norm": 2.512848592730855,
"learning_rate": 1.558970747064198e-06,
"loss": 0.184,
"mean_token_accuracy": 0.9474705457687378,
"num_tokens": 5156257.0,
"step": 553
},
{
"epoch": 2.578088578088578,
"grad_norm": 2.092641334488186,
"learning_rate": 1.5472630160242921e-06,
"loss": 0.1692,
"mean_token_accuracy": 0.9498989582061768,
"num_tokens": 5166290.0,
"step": 554
},
{
"epoch": 2.582750582750583,
"grad_norm": 2.232715743448255,
"learning_rate": 1.5356712536376345e-06,
"loss": 0.1803,
"mean_token_accuracy": 0.9500272274017334,
"num_tokens": 5176118.0,
"step": 555
},
{
"epoch": 2.5874125874125875,
"grad_norm": 2.5230454773039828,
"learning_rate": 1.5241957999928974e-06,
"loss": 0.1689,
"mean_token_accuracy": 0.952000617980957,
"num_tokens": 5185921.0,
"step": 556
},
{
"epoch": 2.5920745920745922,
"grad_norm": 2.6026889894908978,
"learning_rate": 1.5128369917663924e-06,
"loss": 0.1873,
"mean_token_accuracy": 0.9518662393093109,
"num_tokens": 5194883.0,
"step": 557
},
{
"epoch": 2.596736596736597,
"grad_norm": 2.4331824963283375,
"learning_rate": 1.5015951622121896e-06,
"loss": 0.1782,
"mean_token_accuracy": 0.9511874914169312,
"num_tokens": 5203915.0,
"step": 558
},
{
"epoch": 2.6013986013986012,
"grad_norm": 2.3494840072703598,
"learning_rate": 1.490470641152345e-06,
"loss": 0.1832,
"mean_token_accuracy": 0.9490853250026703,
"num_tokens": 5212979.0,
"step": 559
},
{
"epoch": 2.606060606060606,
"grad_norm": 2.37225992368595,
"learning_rate": 1.4794637549672182e-06,
"loss": 0.2049,
"mean_token_accuracy": 0.9459311068058014,
"num_tokens": 5222060.0,
"step": 560
},
{
"epoch": 2.6107226107226107,
"grad_norm": 2.2581667401414354,
"learning_rate": 1.4685748265859043e-06,
"loss": 0.1853,
"mean_token_accuracy": 0.949056476354599,
"num_tokens": 5230955.0,
"step": 561
},
{
"epoch": 2.6153846153846154,
"grad_norm": 2.049312050186223,
"learning_rate": 1.457804175476751e-06,
"loss": 0.1627,
"mean_token_accuracy": 0.9556883871555328,
"num_tokens": 5241027.0,
"step": 562
},
{
"epoch": 2.62004662004662,
"grad_norm": 2.2972322506270095,
"learning_rate": 1.447152117637992e-06,
"loss": 0.185,
"mean_token_accuracy": 0.9484710693359375,
"num_tokens": 5250218.0,
"step": 563
},
{
"epoch": 2.624708624708625,
"grad_norm": 2.1577151240414443,
"learning_rate": 1.436618965588472e-06,
"loss": 0.1715,
"mean_token_accuracy": 0.9517091810703278,
"num_tokens": 5259812.0,
"step": 564
},
{
"epoch": 2.629370629370629,
"grad_norm": 2.2654942496472836,
"learning_rate": 1.4262050283584836e-06,
"loss": 0.1708,
"mean_token_accuracy": 0.952095627784729,
"num_tokens": 5268750.0,
"step": 565
},
{
"epoch": 2.634032634032634,
"grad_norm": 2.0539513008080132,
"learning_rate": 1.4159106114806943e-06,
"loss": 0.1703,
"mean_token_accuracy": 0.9545489847660065,
"num_tokens": 5277570.0,
"step": 566
},
{
"epoch": 2.6386946386946386,
"grad_norm": 2.2352237949709814,
"learning_rate": 1.4057360169811832e-06,
"loss": 0.1856,
"mean_token_accuracy": 0.9448749423027039,
"num_tokens": 5287563.0,
"step": 567
},
{
"epoch": 2.6433566433566433,
"grad_norm": 2.274026900774616,
"learning_rate": 1.3956815433705861e-06,
"loss": 0.1854,
"mean_token_accuracy": 0.9487999975681305,
"num_tokens": 5296479.0,
"step": 568
},
{
"epoch": 2.648018648018648,
"grad_norm": 2.271359052605945,
"learning_rate": 1.3857474856353299e-06,
"loss": 0.1895,
"mean_token_accuracy": 0.9485887885093689,
"num_tokens": 5305636.0,
"step": 569
},
{
"epoch": 2.652680652680653,
"grad_norm": 2.2832232390511935,
"learning_rate": 1.3759341352289832e-06,
"loss": 0.1919,
"mean_token_accuracy": 0.9442925155162811,
"num_tokens": 5315157.0,
"step": 570
},
{
"epoch": 2.6573426573426575,
"grad_norm": 2.4224807634049874,
"learning_rate": 1.3662417800637023e-06,
"loss": 0.196,
"mean_token_accuracy": 0.9467974901199341,
"num_tokens": 5324117.0,
"step": 571
},
{
"epoch": 2.6620046620046622,
"grad_norm": 2.383821092419705,
"learning_rate": 1.3566707045017867e-06,
"loss": 0.181,
"mean_token_accuracy": 0.9486918449401855,
"num_tokens": 5332491.0,
"step": 572
},
{
"epoch": 2.6666666666666665,
"grad_norm": 2.451846629070992,
"learning_rate": 1.3472211893473327e-06,
"loss": 0.183,
"mean_token_accuracy": 0.9497961103916168,
"num_tokens": 5341476.0,
"step": 573
},
{
"epoch": 2.6713286713286712,
"grad_norm": 2.2050228122309297,
"learning_rate": 1.3378935118380004e-06,
"loss": 0.1664,
"mean_token_accuracy": 0.9522497951984406,
"num_tokens": 5351031.0,
"step": 574
},
{
"epoch": 2.675990675990676,
"grad_norm": 2.221685948440498,
"learning_rate": 1.3286879456368746e-06,
"loss": 0.1868,
"mean_token_accuracy": 0.9497689306735992,
"num_tokens": 5360953.0,
"step": 575
},
{
"epoch": 2.6806526806526807,
"grad_norm": 2.3611824277077895,
"learning_rate": 1.319604760824439e-06,
"loss": 0.1885,
"mean_token_accuracy": 0.9504337906837463,
"num_tokens": 5370450.0,
"step": 576
},
{
"epoch": 2.6853146853146854,
"grad_norm": 2.524561240638075,
"learning_rate": 1.31064422389065e-06,
"loss": 0.1852,
"mean_token_accuracy": 0.947800487279892,
"num_tokens": 5379656.0,
"step": 577
},
{
"epoch": 2.6899766899766897,
"grad_norm": 2.4377644762322848,
"learning_rate": 1.3018065977271215e-06,
"loss": 0.2048,
"mean_token_accuracy": 0.9430462419986725,
"num_tokens": 5388537.0,
"step": 578
},
{
"epoch": 2.6946386946386944,
"grad_norm": 2.399526282941045,
"learning_rate": 1.293092141619407e-06,
"loss": 0.1904,
"mean_token_accuracy": 0.9497084021568298,
"num_tokens": 5398026.0,
"step": 579
},
{
"epoch": 2.699300699300699,
"grad_norm": 2.461708260111587,
"learning_rate": 1.2845011112394e-06,
"loss": 0.1979,
"mean_token_accuracy": 0.9429396092891693,
"num_tokens": 5407274.0,
"step": 580
},
{
"epoch": 2.703962703962704,
"grad_norm": 2.1140139302951506,
"learning_rate": 1.276033758637823e-06,
"loss": 0.1711,
"mean_token_accuracy": 0.954795241355896,
"num_tokens": 5416685.0,
"step": 581
},
{
"epoch": 2.7086247086247086,
"grad_norm": 2.256627061683475,
"learning_rate": 1.2676903322368423e-06,
"loss": 0.1914,
"mean_token_accuracy": 0.9461691081523895,
"num_tokens": 5426216.0,
"step": 582
},
{
"epoch": 2.7132867132867133,
"grad_norm": 2.224029918327043,
"learning_rate": 1.2594710768227734e-06,
"loss": 0.1902,
"mean_token_accuracy": 0.9479357004165649,
"num_tokens": 5435218.0,
"step": 583
},
{
"epoch": 2.717948717948718,
"grad_norm": 2.3441213495740616,
"learning_rate": 1.2513762335389004e-06,
"loss": 0.1952,
"mean_token_accuracy": 0.9469590187072754,
"num_tokens": 5444848.0,
"step": 584
},
{
"epoch": 2.722610722610723,
"grad_norm": 2.2818966905915525,
"learning_rate": 1.2434060398784039e-06,
"loss": 0.1861,
"mean_token_accuracy": 0.9511770606040955,
"num_tokens": 5453935.0,
"step": 585
},
{
"epoch": 2.7272727272727275,
"grad_norm": 2.527822525615972,
"learning_rate": 1.2355607296773896e-06,
"loss": 0.1826,
"mean_token_accuracy": 0.9509838223457336,
"num_tokens": 5463825.0,
"step": 586
},
{
"epoch": 2.731934731934732,
"grad_norm": 2.3014422706991193,
"learning_rate": 1.2278405331080296e-06,
"loss": 0.2017,
"mean_token_accuracy": 0.9433080554008484,
"num_tokens": 5472633.0,
"step": 587
},
{
"epoch": 2.7365967365967365,
"grad_norm": 2.481469694015223,
"learning_rate": 1.2202456766718092e-06,
"loss": 0.1929,
"mean_token_accuracy": 0.9468889534473419,
"num_tokens": 5481508.0,
"step": 588
},
{
"epoch": 2.7412587412587412,
"grad_norm": 2.2243902331538803,
"learning_rate": 1.212776383192883e-06,
"loss": 0.1921,
"mean_token_accuracy": 0.9495699405670166,
"num_tokens": 5490583.0,
"step": 589
},
{
"epoch": 2.745920745920746,
"grad_norm": 2.079107821673962,
"learning_rate": 1.2054328718115336e-06,
"loss": 0.171,
"mean_token_accuracy": 0.9525066316127777,
"num_tokens": 5500701.0,
"step": 590
},
{
"epoch": 2.7505827505827507,
"grad_norm": 2.612790704517144,
"learning_rate": 1.1982153579777483e-06,
"loss": 0.1945,
"mean_token_accuracy": 0.9466931521892548,
"num_tokens": 5509719.0,
"step": 591
},
{
"epoch": 2.755244755244755,
"grad_norm": 2.279498702941535,
"learning_rate": 1.1911240534448899e-06,
"loss": 0.2023,
"mean_token_accuracy": 0.9443814754486084,
"num_tokens": 5518911.0,
"step": 592
},
{
"epoch": 2.7599067599067597,
"grad_norm": 2.302875853828702,
"learning_rate": 1.1841591662634943e-06,
"loss": 0.1782,
"mean_token_accuracy": 0.9504655301570892,
"num_tokens": 5528230.0,
"step": 593
},
{
"epoch": 2.7645687645687644,
"grad_norm": 2.391231643668634,
"learning_rate": 1.1773209007751562e-06,
"loss": 0.1973,
"mean_token_accuracy": 0.9472830295562744,
"num_tokens": 5537899.0,
"step": 594
},
{
"epoch": 2.769230769230769,
"grad_norm": 2.195727031813818,
"learning_rate": 1.1706094576065416e-06,
"loss": 0.1797,
"mean_token_accuracy": 0.9503377079963684,
"num_tokens": 5547675.0,
"step": 595
},
{
"epoch": 2.773892773892774,
"grad_norm": 2.3904334187953777,
"learning_rate": 1.164025033663497e-06,
"loss": 0.2021,
"mean_token_accuracy": 0.9435946643352509,
"num_tokens": 5557022.0,
"step": 596
},
{
"epoch": 2.7785547785547786,
"grad_norm": 2.2336433005731924,
"learning_rate": 1.1575678221252763e-06,
"loss": 0.178,
"mean_token_accuracy": 0.9506051242351532,
"num_tokens": 5566788.0,
"step": 597
},
{
"epoch": 2.7832167832167833,
"grad_norm": 2.45559515222341,
"learning_rate": 1.1512380124388695e-06,
"loss": 0.1885,
"mean_token_accuracy": 0.948212593793869,
"num_tokens": 5575464.0,
"step": 598
},
{
"epoch": 2.787878787878788,
"grad_norm": 2.3652897419571213,
"learning_rate": 1.1450357903134463e-06,
"loss": 0.1838,
"mean_token_accuracy": 0.9474283754825592,
"num_tokens": 5584904.0,
"step": 599
},
{
"epoch": 2.792540792540793,
"grad_norm": 2.3554447950041304,
"learning_rate": 1.1389613377149086e-06,
"loss": 0.1903,
"mean_token_accuracy": 0.9471111297607422,
"num_tokens": 5594389.0,
"step": 600
},
{
"epoch": 2.797202797202797,
"grad_norm": 2.2047518432891633,
"learning_rate": 1.1330148328605484e-06,
"loss": 0.1763,
"mean_token_accuracy": 0.951262503862381,
"num_tokens": 5603819.0,
"step": 601
},
{
"epoch": 2.801864801864802,
"grad_norm": 2.291063530837646,
"learning_rate": 1.127196450213825e-06,
"loss": 0.1773,
"mean_token_accuracy": 0.9493178725242615,
"num_tokens": 5614459.0,
"step": 602
},
{
"epoch": 2.8065268065268065,
"grad_norm": 2.11284252635657,
"learning_rate": 1.1215063604792396e-06,
"loss": 0.1694,
"mean_token_accuracy": 0.9510103464126587,
"num_tokens": 5623744.0,
"step": 603
},
{
"epoch": 2.8111888111888113,
"grad_norm": 2.1857533437327925,
"learning_rate": 1.1159447305973313e-06,
"loss": 0.1835,
"mean_token_accuracy": 0.9497886300086975,
"num_tokens": 5632743.0,
"step": 604
},
{
"epoch": 2.815850815850816,
"grad_norm": 2.3920794052563665,
"learning_rate": 1.1105117237397777e-06,
"loss": 0.1772,
"mean_token_accuracy": 0.95287024974823,
"num_tokens": 5641669.0,
"step": 605
},
{
"epoch": 2.8205128205128203,
"grad_norm": 2.3412796580645745,
"learning_rate": 1.1052074993046102e-06,
"loss": 0.1808,
"mean_token_accuracy": 0.9482883512973785,
"num_tokens": 5650735.0,
"step": 606
},
{
"epoch": 2.825174825174825,
"grad_norm": 2.3440493241719147,
"learning_rate": 1.100032212911533e-06,
"loss": 0.2039,
"mean_token_accuracy": 0.9425942301750183,
"num_tokens": 5659501.0,
"step": 607
},
{
"epoch": 2.8298368298368297,
"grad_norm": 2.0320304598963213,
"learning_rate": 1.0949860163973616e-06,
"loss": 0.1738,
"mean_token_accuracy": 0.9530138373374939,
"num_tokens": 5669759.0,
"step": 608
},
{
"epoch": 2.8344988344988344,
"grad_norm": 2.325052766726883,
"learning_rate": 1.0900690578115643e-06,
"loss": 0.1905,
"mean_token_accuracy": 0.9488844573497772,
"num_tokens": 5678984.0,
"step": 609
},
{
"epoch": 2.839160839160839,
"grad_norm": 2.4642634644073396,
"learning_rate": 1.0852814814119238e-06,
"loss": 0.2002,
"mean_token_accuracy": 0.9488008916378021,
"num_tokens": 5688220.0,
"step": 610
},
{
"epoch": 2.843822843822844,
"grad_norm": 2.345629463948856,
"learning_rate": 1.0806234276602984e-06,
"loss": 0.1949,
"mean_token_accuracy": 0.9474165737628937,
"num_tokens": 5697038.0,
"step": 611
},
{
"epoch": 2.8484848484848486,
"grad_norm": 2.0761774146720398,
"learning_rate": 1.0760950332185055e-06,
"loss": 0.1623,
"mean_token_accuracy": 0.9569342732429504,
"num_tokens": 5706946.0,
"step": 612
},
{
"epoch": 2.8531468531468533,
"grad_norm": 2.1184654015758357,
"learning_rate": 1.071696430944311e-06,
"loss": 0.1657,
"mean_token_accuracy": 0.9557340145111084,
"num_tokens": 5716519.0,
"step": 613
},
{
"epoch": 2.857808857808858,
"grad_norm": 2.7525481756348427,
"learning_rate": 1.0674277498875325e-06,
"loss": 0.192,
"mean_token_accuracy": 0.9446141123771667,
"num_tokens": 5725936.0,
"step": 614
},
{
"epoch": 2.8624708624708624,
"grad_norm": 2.415698399491206,
"learning_rate": 1.0632891152862493e-06,
"loss": 0.1881,
"mean_token_accuracy": 0.9490151107311249,
"num_tokens": 5735600.0,
"step": 615
},
{
"epoch": 2.867132867132867,
"grad_norm": 2.0904164041799644,
"learning_rate": 1.0592806485631326e-06,
"loss": 0.1651,
"mean_token_accuracy": 0.9557408690452576,
"num_tokens": 5745302.0,
"step": 616
},
{
"epoch": 2.871794871794872,
"grad_norm": 2.373611778744568,
"learning_rate": 1.0554024673218808e-06,
"loss": 0.1925,
"mean_token_accuracy": 0.9461594521999359,
"num_tokens": 5754314.0,
"step": 617
},
{
"epoch": 2.8764568764568765,
"grad_norm": 2.3770918318336687,
"learning_rate": 1.0516546853437686e-06,
"loss": 0.1888,
"mean_token_accuracy": 0.9470888376235962,
"num_tokens": 5763507.0,
"step": 618
},
{
"epoch": 2.8811188811188813,
"grad_norm": 2.615459038788364,
"learning_rate": 1.0480374125843114e-06,
"loss": 0.1873,
"mean_token_accuracy": 0.9462830722332001,
"num_tokens": 5772748.0,
"step": 619
},
{
"epoch": 2.8857808857808855,
"grad_norm": 2.412878313629322,
"learning_rate": 1.0445507551700356e-06,
"loss": 0.1896,
"mean_token_accuracy": 0.9492884576320648,
"num_tokens": 5782160.0,
"step": 620
},
{
"epoch": 2.8904428904428903,
"grad_norm": 2.349334019362764,
"learning_rate": 1.0411948153953696e-06,
"loss": 0.194,
"mean_token_accuracy": 0.9473778307437897,
"num_tokens": 5790728.0,
"step": 621
},
{
"epoch": 2.895104895104895,
"grad_norm": 2.342666618452988,
"learning_rate": 1.0379696917196378e-06,
"loss": 0.1764,
"mean_token_accuracy": 0.9477843642234802,
"num_tokens": 5800771.0,
"step": 622
},
{
"epoch": 2.8997668997668997,
"grad_norm": 2.369332709510537,
"learning_rate": 1.0348754787641751e-06,
"loss": 0.1775,
"mean_token_accuracy": 0.949904203414917,
"num_tokens": 5809633.0,
"step": 623
},
{
"epoch": 2.9044289044289044,
"grad_norm": 2.2013635420452173,
"learning_rate": 1.031912267309549e-06,
"loss": 0.1624,
"mean_token_accuracy": 0.9559362530708313,
"num_tokens": 5819766.0,
"step": 624
},
{
"epoch": 2.909090909090909,
"grad_norm": 2.208698138845042,
"learning_rate": 1.029080144292899e-06,
"loss": 0.1829,
"mean_token_accuracy": 0.9497508108615875,
"num_tokens": 5829413.0,
"step": 625
},
{
"epoch": 2.913752913752914,
"grad_norm": 2.332053491912036,
"learning_rate": 1.026379192805382e-06,
"loss": 0.1777,
"mean_token_accuracy": 0.9528988003730774,
"num_tokens": 5839124.0,
"step": 626
},
{
"epoch": 2.9184149184149186,
"grad_norm": 2.4789581729644077,
"learning_rate": 1.0238094920897374e-06,
"loss": 0.173,
"mean_token_accuracy": 0.953327864408493,
"num_tokens": 5848406.0,
"step": 627
},
{
"epoch": 2.9230769230769234,
"grad_norm": 2.2923048279485667,
"learning_rate": 1.0213711175379614e-06,
"loss": 0.171,
"mean_token_accuracy": 0.956305056810379,
"num_tokens": 5857592.0,
"step": 628
},
{
"epoch": 2.9277389277389276,
"grad_norm": 2.34320318426194,
"learning_rate": 1.0190641406890946e-06,
"loss": 0.1831,
"mean_token_accuracy": 0.9514197111129761,
"num_tokens": 5866687.0,
"step": 629
},
{
"epoch": 2.9324009324009324,
"grad_norm": 2.4904828242049293,
"learning_rate": 1.0168886292271246e-06,
"loss": 0.189,
"mean_token_accuracy": 0.9476959109306335,
"num_tokens": 5875890.0,
"step": 630
},
{
"epoch": 2.937062937062937,
"grad_norm": 2.216909312747613,
"learning_rate": 1.0148446469789979e-06,
"loss": 0.179,
"mean_token_accuracy": 0.9502739012241364,
"num_tokens": 5884491.0,
"step": 631
},
{
"epoch": 2.941724941724942,
"grad_norm": 2.393678749036967,
"learning_rate": 1.0129322539127494e-06,
"loss": 0.1815,
"mean_token_accuracy": 0.9475315511226654,
"num_tokens": 5894473.0,
"step": 632
},
{
"epoch": 2.9463869463869465,
"grad_norm": 2.3025122993118443,
"learning_rate": 1.011151506135742e-06,
"loss": 0.1755,
"mean_token_accuracy": 0.9525851905345917,
"num_tokens": 5903874.0,
"step": 633
},
{
"epoch": 2.951048951048951,
"grad_norm": 2.149623052195717,
"learning_rate": 1.0095024558930204e-06,
"loss": 0.1827,
"mean_token_accuracy": 0.9500269293785095,
"num_tokens": 5913132.0,
"step": 634
},
{
"epoch": 2.9557109557109555,
"grad_norm": 2.347852575129903,
"learning_rate": 1.0079851515657794e-06,
"loss": 0.1992,
"mean_token_accuracy": 0.9474380910396576,
"num_tokens": 5923130.0,
"step": 635
},
{
"epoch": 2.9603729603729603,
"grad_norm": 2.4799488075118408,
"learning_rate": 1.006599637669943e-06,
"loss": 0.192,
"mean_token_accuracy": 0.946626216173172,
"num_tokens": 5931948.0,
"step": 636
},
{
"epoch": 2.965034965034965,
"grad_norm": 2.1839881945919055,
"learning_rate": 1.0053459548548582e-06,
"loss": 0.1699,
"mean_token_accuracy": 0.9527485072612762,
"num_tokens": 5941672.0,
"step": 637
},
{
"epoch": 2.9696969696969697,
"grad_norm": 2.3405062342525884,
"learning_rate": 1.004224139902105e-06,
"loss": 0.192,
"mean_token_accuracy": 0.9456218183040619,
"num_tokens": 5951095.0,
"step": 638
},
{
"epoch": 2.9743589743589745,
"grad_norm": 2.1209481683555906,
"learning_rate": 1.0032342257244139e-06,
"loss": 0.17,
"mean_token_accuracy": 0.9527087509632111,
"num_tokens": 5961196.0,
"step": 639
},
{
"epoch": 2.979020979020979,
"grad_norm": 2.3906839410006957,
"learning_rate": 1.0023762413647023e-06,
"loss": 0.19,
"mean_token_accuracy": 0.9449608623981476,
"num_tokens": 5970352.0,
"step": 640
},
{
"epoch": 2.983682983682984,
"grad_norm": 2.2744118662012736,
"learning_rate": 1.0016502119952224e-06,
"loss": 0.1824,
"mean_token_accuracy": 0.9502459466457367,
"num_tokens": 5979759.0,
"step": 641
},
{
"epoch": 2.988344988344988,
"grad_norm": 2.3978292522263933,
"learning_rate": 1.0010561589168217e-06,
"loss": 0.1858,
"mean_token_accuracy": 0.9475610256195068,
"num_tokens": 5989632.0,
"step": 642
},
{
"epoch": 2.993006993006993,
"grad_norm": 2.220217029439386,
"learning_rate": 1.0005940995583183e-06,
"loss": 0.1843,
"mean_token_accuracy": 0.9482160210609436,
"num_tokens": 5999031.0,
"step": 643
},
{
"epoch": 2.9976689976689976,
"grad_norm": 2.303701651736636,
"learning_rate": 1.0002640474759911e-06,
"loss": 0.1822,
"mean_token_accuracy": 0.950013667345047,
"num_tokens": 6008269.0,
"step": 644
},
{
"epoch": 3.0,
"grad_norm": 2.303701651736636,
"learning_rate": 1.0000660123531788e-06,
"loss": 0.1305,
"mean_token_accuracy": 0.9722093343734741,
"num_tokens": 6010434.0,
"step": 645
},
{
"epoch": 3.0,
"step": 645,
"total_flos": 194968109678592.0,
"train_loss": 0.4934643579314845,
"train_runtime": 19223.4417,
"train_samples_per_second": 1.069,
"train_steps_per_second": 0.034
}
],
"logging_steps": 1,
"max_steps": 645,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 194968109678592.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}