gemma2-2b-it-falsereject / trainer_state.json
skdrx's picture
Upload folder using huggingface_hub
86c474a verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 5484,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005470459518599562,
"grad_norm": 9.625,
"learning_rate": 9.983588621444202e-06,
"loss": 1.9919,
"mean_token_accuracy": 0.5373850524425506,
"num_tokens": 25832.0,
"step": 10
},
{
"epoch": 0.010940919037199124,
"grad_norm": 8.625,
"learning_rate": 9.965353756382203e-06,
"loss": 1.6719,
"mean_token_accuracy": 0.5806249260902405,
"num_tokens": 51122.0,
"step": 20
},
{
"epoch": 0.016411378555798686,
"grad_norm": 8.875,
"learning_rate": 9.947118891320205e-06,
"loss": 1.6045,
"mean_token_accuracy": 0.5889442741870881,
"num_tokens": 76631.0,
"step": 30
},
{
"epoch": 0.02188183807439825,
"grad_norm": 9.4375,
"learning_rate": 9.928884026258206e-06,
"loss": 1.5537,
"mean_token_accuracy": 0.6036407649517059,
"num_tokens": 101784.0,
"step": 40
},
{
"epoch": 0.02735229759299781,
"grad_norm": 9.0625,
"learning_rate": 9.910649161196208e-06,
"loss": 1.4784,
"mean_token_accuracy": 0.6087825715541839,
"num_tokens": 127369.0,
"step": 50
},
{
"epoch": 0.03282275711159737,
"grad_norm": 9.0,
"learning_rate": 9.89241429613421e-06,
"loss": 1.4971,
"mean_token_accuracy": 0.6069094657897949,
"num_tokens": 152695.0,
"step": 60
},
{
"epoch": 0.038293216630196934,
"grad_norm": 8.6875,
"learning_rate": 9.874179431072211e-06,
"loss": 1.4571,
"mean_token_accuracy": 0.6164368271827698,
"num_tokens": 178219.0,
"step": 70
},
{
"epoch": 0.0437636761487965,
"grad_norm": 9.1875,
"learning_rate": 9.855944566010213e-06,
"loss": 1.4476,
"mean_token_accuracy": 0.6144214510917664,
"num_tokens": 203604.0,
"step": 80
},
{
"epoch": 0.04923413566739606,
"grad_norm": 8.9375,
"learning_rate": 9.837709700948214e-06,
"loss": 1.4509,
"mean_token_accuracy": 0.6110931515693665,
"num_tokens": 229492.0,
"step": 90
},
{
"epoch": 0.05470459518599562,
"grad_norm": 8.375,
"learning_rate": 9.819474835886216e-06,
"loss": 1.4126,
"mean_token_accuracy": 0.620923125743866,
"num_tokens": 255685.0,
"step": 100
},
{
"epoch": 0.060175054704595186,
"grad_norm": 9.3125,
"learning_rate": 9.801239970824217e-06,
"loss": 1.4117,
"mean_token_accuracy": 0.6242850065231323,
"num_tokens": 280910.0,
"step": 110
},
{
"epoch": 0.06564551422319474,
"grad_norm": 9.125,
"learning_rate": 9.783005105762217e-06,
"loss": 1.3618,
"mean_token_accuracy": 0.6288298785686492,
"num_tokens": 306335.0,
"step": 120
},
{
"epoch": 0.0711159737417943,
"grad_norm": 9.4375,
"learning_rate": 9.76477024070022e-06,
"loss": 1.3932,
"mean_token_accuracy": 0.6229633510112762,
"num_tokens": 331606.0,
"step": 130
},
{
"epoch": 0.07658643326039387,
"grad_norm": 8.9375,
"learning_rate": 9.74653537563822e-06,
"loss": 1.4133,
"mean_token_accuracy": 0.6180586159229279,
"num_tokens": 357437.0,
"step": 140
},
{
"epoch": 0.08205689277899343,
"grad_norm": 9.125,
"learning_rate": 9.728300510576222e-06,
"loss": 1.4235,
"mean_token_accuracy": 0.6152303397655488,
"num_tokens": 382692.0,
"step": 150
},
{
"epoch": 0.087527352297593,
"grad_norm": 8.9375,
"learning_rate": 9.710065645514224e-06,
"loss": 1.382,
"mean_token_accuracy": 0.621176666021347,
"num_tokens": 407995.0,
"step": 160
},
{
"epoch": 0.09299781181619256,
"grad_norm": 9.125,
"learning_rate": 9.691830780452225e-06,
"loss": 1.3665,
"mean_token_accuracy": 0.6267438113689423,
"num_tokens": 433527.0,
"step": 170
},
{
"epoch": 0.09846827133479212,
"grad_norm": 9.1875,
"learning_rate": 9.673595915390227e-06,
"loss": 1.3829,
"mean_token_accuracy": 0.6232609272003173,
"num_tokens": 459043.0,
"step": 180
},
{
"epoch": 0.10393873085339168,
"grad_norm": 9.0,
"learning_rate": 9.655361050328229e-06,
"loss": 1.3797,
"mean_token_accuracy": 0.6250600516796112,
"num_tokens": 484989.0,
"step": 190
},
{
"epoch": 0.10940919037199125,
"grad_norm": 9.0625,
"learning_rate": 9.63712618526623e-06,
"loss": 1.3799,
"mean_token_accuracy": 0.6250615298748017,
"num_tokens": 510010.0,
"step": 200
},
{
"epoch": 0.11487964989059081,
"grad_norm": 9.75,
"learning_rate": 9.618891320204232e-06,
"loss": 1.3541,
"mean_token_accuracy": 0.6320520222187043,
"num_tokens": 535568.0,
"step": 210
},
{
"epoch": 0.12035010940919037,
"grad_norm": 9.1875,
"learning_rate": 9.600656455142233e-06,
"loss": 1.3568,
"mean_token_accuracy": 0.6311915695667267,
"num_tokens": 560786.0,
"step": 220
},
{
"epoch": 0.12582056892778992,
"grad_norm": 9.0,
"learning_rate": 9.582421590080235e-06,
"loss": 1.3751,
"mean_token_accuracy": 0.6243879318237304,
"num_tokens": 586419.0,
"step": 230
},
{
"epoch": 0.13129102844638948,
"grad_norm": 9.5625,
"learning_rate": 9.564186725018235e-06,
"loss": 1.3421,
"mean_token_accuracy": 0.6328221976757049,
"num_tokens": 611683.0,
"step": 240
},
{
"epoch": 0.13676148796498905,
"grad_norm": 9.0,
"learning_rate": 9.545951859956238e-06,
"loss": 1.3199,
"mean_token_accuracy": 0.6378473103046417,
"num_tokens": 636737.0,
"step": 250
},
{
"epoch": 0.1422319474835886,
"grad_norm": 8.6875,
"learning_rate": 9.527716994894238e-06,
"loss": 1.3362,
"mean_token_accuracy": 0.6359954178333282,
"num_tokens": 661611.0,
"step": 260
},
{
"epoch": 0.14770240700218817,
"grad_norm": 9.0,
"learning_rate": 9.50948212983224e-06,
"loss": 1.2803,
"mean_token_accuracy": 0.649063116312027,
"num_tokens": 686464.0,
"step": 270
},
{
"epoch": 0.15317286652078774,
"grad_norm": 8.8125,
"learning_rate": 9.491247264770241e-06,
"loss": 1.3093,
"mean_token_accuracy": 0.6380355060100555,
"num_tokens": 711615.0,
"step": 280
},
{
"epoch": 0.1586433260393873,
"grad_norm": 8.4375,
"learning_rate": 9.473012399708243e-06,
"loss": 1.3034,
"mean_token_accuracy": 0.6415359675884247,
"num_tokens": 736807.0,
"step": 290
},
{
"epoch": 0.16411378555798686,
"grad_norm": 9.3125,
"learning_rate": 9.454777534646244e-06,
"loss": 1.3251,
"mean_token_accuracy": 0.6348649680614471,
"num_tokens": 762209.0,
"step": 300
},
{
"epoch": 0.16958424507658643,
"grad_norm": 8.375,
"learning_rate": 9.436542669584246e-06,
"loss": 1.2911,
"mean_token_accuracy": 0.6452939510345459,
"num_tokens": 787695.0,
"step": 310
},
{
"epoch": 0.175054704595186,
"grad_norm": 8.5,
"learning_rate": 9.418307804522248e-06,
"loss": 1.3028,
"mean_token_accuracy": 0.640759015083313,
"num_tokens": 813451.0,
"step": 320
},
{
"epoch": 0.18052516411378555,
"grad_norm": 9.1875,
"learning_rate": 9.400072939460249e-06,
"loss": 1.2847,
"mean_token_accuracy": 0.6437187314033508,
"num_tokens": 838902.0,
"step": 330
},
{
"epoch": 0.18599562363238512,
"grad_norm": 8.875,
"learning_rate": 9.38183807439825e-06,
"loss": 1.2791,
"mean_token_accuracy": 0.6461306512355804,
"num_tokens": 864097.0,
"step": 340
},
{
"epoch": 0.19146608315098468,
"grad_norm": 8.8125,
"learning_rate": 9.363603209336252e-06,
"loss": 1.2946,
"mean_token_accuracy": 0.6436072170734406,
"num_tokens": 889959.0,
"step": 350
},
{
"epoch": 0.19693654266958424,
"grad_norm": 9.0625,
"learning_rate": 9.345368344274252e-06,
"loss": 1.3278,
"mean_token_accuracy": 0.6354196608066559,
"num_tokens": 915925.0,
"step": 360
},
{
"epoch": 0.2024070021881838,
"grad_norm": 9.5625,
"learning_rate": 9.327133479212255e-06,
"loss": 1.3017,
"mean_token_accuracy": 0.6431743144989014,
"num_tokens": 941137.0,
"step": 370
},
{
"epoch": 0.20787746170678337,
"grad_norm": 8.875,
"learning_rate": 9.308898614150255e-06,
"loss": 1.3143,
"mean_token_accuracy": 0.638333660364151,
"num_tokens": 966309.0,
"step": 380
},
{
"epoch": 0.21334792122538293,
"grad_norm": 8.5625,
"learning_rate": 9.290663749088259e-06,
"loss": 1.2774,
"mean_token_accuracy": 0.6447885036468506,
"num_tokens": 991629.0,
"step": 390
},
{
"epoch": 0.2188183807439825,
"grad_norm": 9.0625,
"learning_rate": 9.272428884026259e-06,
"loss": 1.2702,
"mean_token_accuracy": 0.646415501832962,
"num_tokens": 1017315.0,
"step": 400
},
{
"epoch": 0.22428884026258206,
"grad_norm": 9.125,
"learning_rate": 9.25419401896426e-06,
"loss": 1.2708,
"mean_token_accuracy": 0.6465388536453247,
"num_tokens": 1042374.0,
"step": 410
},
{
"epoch": 0.22975929978118162,
"grad_norm": 9.1875,
"learning_rate": 9.235959153902262e-06,
"loss": 1.2859,
"mean_token_accuracy": 0.6423357188701629,
"num_tokens": 1067942.0,
"step": 420
},
{
"epoch": 0.23522975929978118,
"grad_norm": 8.8125,
"learning_rate": 9.217724288840263e-06,
"loss": 1.2934,
"mean_token_accuracy": 0.6421127438545227,
"num_tokens": 1093790.0,
"step": 430
},
{
"epoch": 0.24070021881838075,
"grad_norm": 8.5,
"learning_rate": 9.199489423778265e-06,
"loss": 1.3007,
"mean_token_accuracy": 0.6417977750301361,
"num_tokens": 1119210.0,
"step": 440
},
{
"epoch": 0.2461706783369803,
"grad_norm": 9.4375,
"learning_rate": 9.181254558716267e-06,
"loss": 1.3159,
"mean_token_accuracy": 0.6400705456733704,
"num_tokens": 1144670.0,
"step": 450
},
{
"epoch": 0.25164113785557984,
"grad_norm": 9.625,
"learning_rate": 9.163019693654268e-06,
"loss": 1.27,
"mean_token_accuracy": 0.6472333431243896,
"num_tokens": 1170288.0,
"step": 460
},
{
"epoch": 0.25711159737417943,
"grad_norm": 9.6875,
"learning_rate": 9.14478482859227e-06,
"loss": 1.2754,
"mean_token_accuracy": 0.6426136493682861,
"num_tokens": 1195969.0,
"step": 470
},
{
"epoch": 0.26258205689277897,
"grad_norm": 9.1875,
"learning_rate": 9.12654996353027e-06,
"loss": 1.2805,
"mean_token_accuracy": 0.6377979755401612,
"num_tokens": 1221472.0,
"step": 480
},
{
"epoch": 0.26805251641137856,
"grad_norm": 9.25,
"learning_rate": 9.108315098468273e-06,
"loss": 1.2649,
"mean_token_accuracy": 0.6489916682243347,
"num_tokens": 1246637.0,
"step": 490
},
{
"epoch": 0.2735229759299781,
"grad_norm": 9.1875,
"learning_rate": 9.090080233406273e-06,
"loss": 1.2838,
"mean_token_accuracy": 0.6428185880184174,
"num_tokens": 1272427.0,
"step": 500
},
{
"epoch": 0.2789934354485777,
"grad_norm": 8.6875,
"learning_rate": 9.071845368344276e-06,
"loss": 1.2394,
"mean_token_accuracy": 0.6552128493785858,
"num_tokens": 1297420.0,
"step": 510
},
{
"epoch": 0.2844638949671772,
"grad_norm": 8.1875,
"learning_rate": 9.053610503282276e-06,
"loss": 1.2798,
"mean_token_accuracy": 0.6457675039768219,
"num_tokens": 1322816.0,
"step": 520
},
{
"epoch": 0.2899343544857768,
"grad_norm": 8.625,
"learning_rate": 9.03537563822028e-06,
"loss": 1.2445,
"mean_token_accuracy": 0.6481238782405854,
"num_tokens": 1348483.0,
"step": 530
},
{
"epoch": 0.29540481400437635,
"grad_norm": 9.25,
"learning_rate": 9.017140773158279e-06,
"loss": 1.2699,
"mean_token_accuracy": 0.6499922752380372,
"num_tokens": 1373968.0,
"step": 540
},
{
"epoch": 0.30087527352297594,
"grad_norm": 9.0,
"learning_rate": 8.99890590809628e-06,
"loss": 1.2504,
"mean_token_accuracy": 0.6519196212291718,
"num_tokens": 1399038.0,
"step": 550
},
{
"epoch": 0.3063457330415755,
"grad_norm": 8.8125,
"learning_rate": 8.980671043034282e-06,
"loss": 1.2576,
"mean_token_accuracy": 0.6471436500549317,
"num_tokens": 1424348.0,
"step": 560
},
{
"epoch": 0.31181619256017507,
"grad_norm": 8.875,
"learning_rate": 8.962436177972284e-06,
"loss": 1.2728,
"mean_token_accuracy": 0.6437059342861176,
"num_tokens": 1450046.0,
"step": 570
},
{
"epoch": 0.3172866520787746,
"grad_norm": 8.9375,
"learning_rate": 8.944201312910286e-06,
"loss": 1.2699,
"mean_token_accuracy": 0.6483521461486816,
"num_tokens": 1475329.0,
"step": 580
},
{
"epoch": 0.3227571115973742,
"grad_norm": 9.0,
"learning_rate": 8.925966447848287e-06,
"loss": 1.2841,
"mean_token_accuracy": 0.6452650845050811,
"num_tokens": 1500705.0,
"step": 590
},
{
"epoch": 0.3282275711159737,
"grad_norm": 9.4375,
"learning_rate": 8.907731582786287e-06,
"loss": 1.2879,
"mean_token_accuracy": 0.6446281552314759,
"num_tokens": 1526093.0,
"step": 600
},
{
"epoch": 0.3336980306345733,
"grad_norm": 9.9375,
"learning_rate": 8.88949671772429e-06,
"loss": 1.2879,
"mean_token_accuracy": 0.6415457963943482,
"num_tokens": 1551249.0,
"step": 610
},
{
"epoch": 0.33916849015317285,
"grad_norm": 8.875,
"learning_rate": 8.87126185266229e-06,
"loss": 1.2563,
"mean_token_accuracy": 0.6477604985237122,
"num_tokens": 1576597.0,
"step": 620
},
{
"epoch": 0.34463894967177244,
"grad_norm": 8.8125,
"learning_rate": 8.853026987600293e-06,
"loss": 1.2371,
"mean_token_accuracy": 0.6518564283847809,
"num_tokens": 1601829.0,
"step": 630
},
{
"epoch": 0.350109409190372,
"grad_norm": 9.0625,
"learning_rate": 8.834792122538293e-06,
"loss": 1.2774,
"mean_token_accuracy": 0.6432484328746796,
"num_tokens": 1627555.0,
"step": 640
},
{
"epoch": 0.35557986870897157,
"grad_norm": 8.75,
"learning_rate": 8.816557257476297e-06,
"loss": 1.2568,
"mean_token_accuracy": 0.6520409166812897,
"num_tokens": 1653045.0,
"step": 650
},
{
"epoch": 0.3610503282275711,
"grad_norm": 8.625,
"learning_rate": 8.798322392414297e-06,
"loss": 1.2746,
"mean_token_accuracy": 0.6470682263374329,
"num_tokens": 1678382.0,
"step": 660
},
{
"epoch": 0.3665207877461707,
"grad_norm": 9.25,
"learning_rate": 8.780087527352298e-06,
"loss": 1.2593,
"mean_token_accuracy": 0.6493343353271485,
"num_tokens": 1703866.0,
"step": 670
},
{
"epoch": 0.37199124726477023,
"grad_norm": 8.625,
"learning_rate": 8.7618526622903e-06,
"loss": 1.2595,
"mean_token_accuracy": 0.6501845002174378,
"num_tokens": 1729154.0,
"step": 680
},
{
"epoch": 0.3774617067833698,
"grad_norm": 9.0,
"learning_rate": 8.743617797228301e-06,
"loss": 1.2543,
"mean_token_accuracy": 0.6474352359771729,
"num_tokens": 1754459.0,
"step": 690
},
{
"epoch": 0.38293216630196936,
"grad_norm": 8.4375,
"learning_rate": 8.725382932166303e-06,
"loss": 1.2385,
"mean_token_accuracy": 0.6522322833538056,
"num_tokens": 1779833.0,
"step": 700
},
{
"epoch": 0.38840262582056895,
"grad_norm": 8.8125,
"learning_rate": 8.707148067104304e-06,
"loss": 1.2308,
"mean_token_accuracy": 0.6520416557788848,
"num_tokens": 1805297.0,
"step": 710
},
{
"epoch": 0.3938730853391685,
"grad_norm": 9.5625,
"learning_rate": 8.688913202042306e-06,
"loss": 1.2602,
"mean_token_accuracy": 0.6470251500606536,
"num_tokens": 1830573.0,
"step": 720
},
{
"epoch": 0.3993435448577681,
"grad_norm": 9.25,
"learning_rate": 8.670678336980308e-06,
"loss": 1.2518,
"mean_token_accuracy": 0.6481884896755219,
"num_tokens": 1855937.0,
"step": 730
},
{
"epoch": 0.4048140043763676,
"grad_norm": 9.125,
"learning_rate": 8.652443471918308e-06,
"loss": 1.2469,
"mean_token_accuracy": 0.6519103825092316,
"num_tokens": 1881096.0,
"step": 740
},
{
"epoch": 0.4102844638949672,
"grad_norm": 9.0625,
"learning_rate": 8.634208606856311e-06,
"loss": 1.236,
"mean_token_accuracy": 0.6525180697441101,
"num_tokens": 1906526.0,
"step": 750
},
{
"epoch": 0.41575492341356673,
"grad_norm": 8.8125,
"learning_rate": 8.61597374179431e-06,
"loss": 1.2583,
"mean_token_accuracy": 0.647457766532898,
"num_tokens": 1931741.0,
"step": 760
},
{
"epoch": 0.4212253829321663,
"grad_norm": 8.9375,
"learning_rate": 8.597738876732314e-06,
"loss": 1.2296,
"mean_token_accuracy": 0.6541453003883362,
"num_tokens": 1956816.0,
"step": 770
},
{
"epoch": 0.42669584245076586,
"grad_norm": 9.625,
"learning_rate": 8.579504011670314e-06,
"loss": 1.2343,
"mean_token_accuracy": 0.6523520529270173,
"num_tokens": 1981932.0,
"step": 780
},
{
"epoch": 0.43216630196936545,
"grad_norm": 9.0625,
"learning_rate": 8.561269146608316e-06,
"loss": 1.2474,
"mean_token_accuracy": 0.6512964367866516,
"num_tokens": 2007382.0,
"step": 790
},
{
"epoch": 0.437636761487965,
"grad_norm": 8.5625,
"learning_rate": 8.543034281546317e-06,
"loss": 1.2184,
"mean_token_accuracy": 0.6564052760601043,
"num_tokens": 2033121.0,
"step": 800
},
{
"epoch": 0.4431072210065646,
"grad_norm": 9.25,
"learning_rate": 8.524799416484319e-06,
"loss": 1.2495,
"mean_token_accuracy": 0.6491189241409302,
"num_tokens": 2058751.0,
"step": 810
},
{
"epoch": 0.4485776805251641,
"grad_norm": 8.8125,
"learning_rate": 8.50656455142232e-06,
"loss": 1.2014,
"mean_token_accuracy": 0.6595002830028533,
"num_tokens": 2083836.0,
"step": 820
},
{
"epoch": 0.4540481400437637,
"grad_norm": 9.0,
"learning_rate": 8.488329686360322e-06,
"loss": 1.2478,
"mean_token_accuracy": 0.650605583190918,
"num_tokens": 2109163.0,
"step": 830
},
{
"epoch": 0.45951859956236324,
"grad_norm": 8.6875,
"learning_rate": 8.470094821298323e-06,
"loss": 1.2486,
"mean_token_accuracy": 0.6492572844028472,
"num_tokens": 2134626.0,
"step": 840
},
{
"epoch": 0.4649890590809628,
"grad_norm": 9.125,
"learning_rate": 8.451859956236325e-06,
"loss": 1.2397,
"mean_token_accuracy": 0.6523317098617554,
"num_tokens": 2159963.0,
"step": 850
},
{
"epoch": 0.47045951859956237,
"grad_norm": 9.125,
"learning_rate": 8.433625091174325e-06,
"loss": 1.2381,
"mean_token_accuracy": 0.6511462509632111,
"num_tokens": 2185310.0,
"step": 860
},
{
"epoch": 0.4759299781181619,
"grad_norm": 9.125,
"learning_rate": 8.415390226112328e-06,
"loss": 1.2015,
"mean_token_accuracy": 0.6603323996067048,
"num_tokens": 2210642.0,
"step": 870
},
{
"epoch": 0.4814004376367615,
"grad_norm": 9.0625,
"learning_rate": 8.397155361050328e-06,
"loss": 1.2369,
"mean_token_accuracy": 0.6551064074039459,
"num_tokens": 2235713.0,
"step": 880
},
{
"epoch": 0.486870897155361,
"grad_norm": 9.25,
"learning_rate": 8.378920495988331e-06,
"loss": 1.2588,
"mean_token_accuracy": 0.6488555371761322,
"num_tokens": 2261549.0,
"step": 890
},
{
"epoch": 0.4923413566739606,
"grad_norm": 9.3125,
"learning_rate": 8.360685630926331e-06,
"loss": 1.2308,
"mean_token_accuracy": 0.6573903739452363,
"num_tokens": 2286614.0,
"step": 900
},
{
"epoch": 0.49781181619256015,
"grad_norm": 9.25,
"learning_rate": 8.342450765864333e-06,
"loss": 1.215,
"mean_token_accuracy": 0.6597650587558747,
"num_tokens": 2312037.0,
"step": 910
},
{
"epoch": 0.5032822757111597,
"grad_norm": 8.75,
"learning_rate": 8.324215900802335e-06,
"loss": 1.2239,
"mean_token_accuracy": 0.6548255145549774,
"num_tokens": 2337544.0,
"step": 920
},
{
"epoch": 0.5087527352297593,
"grad_norm": 8.8125,
"learning_rate": 8.305981035740336e-06,
"loss": 1.2629,
"mean_token_accuracy": 0.6493556082248688,
"num_tokens": 2363200.0,
"step": 930
},
{
"epoch": 0.5142231947483589,
"grad_norm": 9.0625,
"learning_rate": 8.287746170678338e-06,
"loss": 1.21,
"mean_token_accuracy": 0.6589162766933441,
"num_tokens": 2388450.0,
"step": 940
},
{
"epoch": 0.5196936542669585,
"grad_norm": 8.8125,
"learning_rate": 8.26951130561634e-06,
"loss": 1.2373,
"mean_token_accuracy": 0.6519401609897614,
"num_tokens": 2413796.0,
"step": 950
},
{
"epoch": 0.5251641137855579,
"grad_norm": 9.1875,
"learning_rate": 8.251276440554341e-06,
"loss": 1.2194,
"mean_token_accuracy": 0.6579577445983886,
"num_tokens": 2439412.0,
"step": 960
},
{
"epoch": 0.5306345733041575,
"grad_norm": 9.0,
"learning_rate": 8.233041575492342e-06,
"loss": 1.241,
"mean_token_accuracy": 0.6511935293674469,
"num_tokens": 2464594.0,
"step": 970
},
{
"epoch": 0.5361050328227571,
"grad_norm": 8.875,
"learning_rate": 8.214806710430342e-06,
"loss": 1.2325,
"mean_token_accuracy": 0.6544456005096435,
"num_tokens": 2490161.0,
"step": 980
},
{
"epoch": 0.5415754923413567,
"grad_norm": 9.3125,
"learning_rate": 8.196571845368346e-06,
"loss": 1.2768,
"mean_token_accuracy": 0.6460224032402039,
"num_tokens": 2515978.0,
"step": 990
},
{
"epoch": 0.5470459518599562,
"grad_norm": 8.5625,
"learning_rate": 8.178336980306346e-06,
"loss": 1.2196,
"mean_token_accuracy": 0.6573625862598419,
"num_tokens": 2541854.0,
"step": 1000
},
{
"epoch": 0.5525164113785558,
"grad_norm": 8.8125,
"learning_rate": 8.160102115244349e-06,
"loss": 1.2044,
"mean_token_accuracy": 0.6583312273025512,
"num_tokens": 2566969.0,
"step": 1010
},
{
"epoch": 0.5579868708971554,
"grad_norm": 9.25,
"learning_rate": 8.141867250182349e-06,
"loss": 1.2274,
"mean_token_accuracy": 0.6549983143806457,
"num_tokens": 2592252.0,
"step": 1020
},
{
"epoch": 0.563457330415755,
"grad_norm": 9.0625,
"learning_rate": 8.12363238512035e-06,
"loss": 1.2106,
"mean_token_accuracy": 0.6564074397087097,
"num_tokens": 2617509.0,
"step": 1030
},
{
"epoch": 0.5689277899343544,
"grad_norm": 8.625,
"learning_rate": 8.105397520058352e-06,
"loss": 1.239,
"mean_token_accuracy": 0.6544763445854187,
"num_tokens": 2642416.0,
"step": 1040
},
{
"epoch": 0.574398249452954,
"grad_norm": 9.0625,
"learning_rate": 8.087162654996354e-06,
"loss": 1.2273,
"mean_token_accuracy": 0.6501368045806885,
"num_tokens": 2668075.0,
"step": 1050
},
{
"epoch": 0.5798687089715536,
"grad_norm": 9.8125,
"learning_rate": 8.068927789934355e-06,
"loss": 1.2368,
"mean_token_accuracy": 0.6530345678329468,
"num_tokens": 2693387.0,
"step": 1060
},
{
"epoch": 0.5853391684901532,
"grad_norm": 8.5,
"learning_rate": 8.050692924872357e-06,
"loss": 1.223,
"mean_token_accuracy": 0.6532883644104004,
"num_tokens": 2718953.0,
"step": 1070
},
{
"epoch": 0.5908096280087527,
"grad_norm": 9.5625,
"learning_rate": 8.032458059810358e-06,
"loss": 1.2449,
"mean_token_accuracy": 0.6541399002075196,
"num_tokens": 2744468.0,
"step": 1080
},
{
"epoch": 0.5962800875273523,
"grad_norm": 8.8125,
"learning_rate": 8.01422319474836e-06,
"loss": 1.2418,
"mean_token_accuracy": 0.6520101726055145,
"num_tokens": 2770101.0,
"step": 1090
},
{
"epoch": 0.6017505470459519,
"grad_norm": 9.125,
"learning_rate": 7.99598832968636e-06,
"loss": 1.2313,
"mean_token_accuracy": 0.6546747028827667,
"num_tokens": 2795378.0,
"step": 1100
},
{
"epoch": 0.6072210065645515,
"grad_norm": 9.3125,
"learning_rate": 7.977753464624363e-06,
"loss": 1.2274,
"mean_token_accuracy": 0.6587331891059875,
"num_tokens": 2820924.0,
"step": 1110
},
{
"epoch": 0.612691466083151,
"grad_norm": 8.8125,
"learning_rate": 7.959518599562363e-06,
"loss": 1.2248,
"mean_token_accuracy": 0.6550480246543884,
"num_tokens": 2846859.0,
"step": 1120
},
{
"epoch": 0.6181619256017505,
"grad_norm": 9.5,
"learning_rate": 7.941283734500366e-06,
"loss": 1.2137,
"mean_token_accuracy": 0.6592613101005554,
"num_tokens": 2872191.0,
"step": 1130
},
{
"epoch": 0.6236323851203501,
"grad_norm": 9.5625,
"learning_rate": 7.923048869438366e-06,
"loss": 1.2026,
"mean_token_accuracy": 0.661452466249466,
"num_tokens": 2897466.0,
"step": 1140
},
{
"epoch": 0.6291028446389497,
"grad_norm": 8.875,
"learning_rate": 7.90481400437637e-06,
"loss": 1.2344,
"mean_token_accuracy": 0.6545186638832092,
"num_tokens": 2923267.0,
"step": 1150
},
{
"epoch": 0.6345733041575492,
"grad_norm": 8.875,
"learning_rate": 7.88657913931437e-06,
"loss": 1.2219,
"mean_token_accuracy": 0.6601431190967559,
"num_tokens": 2948499.0,
"step": 1160
},
{
"epoch": 0.6400437636761488,
"grad_norm": 9.3125,
"learning_rate": 7.868344274252371e-06,
"loss": 1.2165,
"mean_token_accuracy": 0.6581261634826661,
"num_tokens": 2973709.0,
"step": 1170
},
{
"epoch": 0.6455142231947484,
"grad_norm": 9.1875,
"learning_rate": 7.850109409190373e-06,
"loss": 1.2423,
"mean_token_accuracy": 0.6519295990467071,
"num_tokens": 2999177.0,
"step": 1180
},
{
"epoch": 0.650984682713348,
"grad_norm": 8.625,
"learning_rate": 7.831874544128374e-06,
"loss": 1.2413,
"mean_token_accuracy": 0.6543058276176452,
"num_tokens": 3025180.0,
"step": 1190
},
{
"epoch": 0.6564551422319475,
"grad_norm": 9.25,
"learning_rate": 7.813639679066376e-06,
"loss": 1.2133,
"mean_token_accuracy": 0.6597758173942566,
"num_tokens": 3050754.0,
"step": 1200
},
{
"epoch": 0.661925601750547,
"grad_norm": 9.25,
"learning_rate": 7.795404814004377e-06,
"loss": 1.2227,
"mean_token_accuracy": 0.6577537059783936,
"num_tokens": 3075900.0,
"step": 1210
},
{
"epoch": 0.6673960612691466,
"grad_norm": 9.0625,
"learning_rate": 7.777169948942377e-06,
"loss": 1.2266,
"mean_token_accuracy": 0.655048793554306,
"num_tokens": 3101018.0,
"step": 1220
},
{
"epoch": 0.6728665207877462,
"grad_norm": 9.25,
"learning_rate": 7.75893508388038e-06,
"loss": 1.1967,
"mean_token_accuracy": 0.6654035151004791,
"num_tokens": 3126618.0,
"step": 1230
},
{
"epoch": 0.6783369803063457,
"grad_norm": 9.0625,
"learning_rate": 7.74070021881838e-06,
"loss": 1.2273,
"mean_token_accuracy": 0.6532664895057678,
"num_tokens": 3151368.0,
"step": 1240
},
{
"epoch": 0.6838074398249453,
"grad_norm": 9.0625,
"learning_rate": 7.722465353756384e-06,
"loss": 1.1909,
"mean_token_accuracy": 0.6590378880500793,
"num_tokens": 3176714.0,
"step": 1250
},
{
"epoch": 0.6892778993435449,
"grad_norm": 9.25,
"learning_rate": 7.704230488694384e-06,
"loss": 1.2238,
"mean_token_accuracy": 0.6598979830741882,
"num_tokens": 3202322.0,
"step": 1260
},
{
"epoch": 0.6947483588621444,
"grad_norm": 9.125,
"learning_rate": 7.685995623632387e-06,
"loss": 1.2273,
"mean_token_accuracy": 0.6573567986488342,
"num_tokens": 3227693.0,
"step": 1270
},
{
"epoch": 0.700218818380744,
"grad_norm": 9.375,
"learning_rate": 7.667760758570387e-06,
"loss": 1.2031,
"mean_token_accuracy": 0.6580542623996735,
"num_tokens": 3252807.0,
"step": 1280
},
{
"epoch": 0.7056892778993435,
"grad_norm": 8.9375,
"learning_rate": 7.649525893508388e-06,
"loss": 1.1975,
"mean_token_accuracy": 0.6616823971271515,
"num_tokens": 3278218.0,
"step": 1290
},
{
"epoch": 0.7111597374179431,
"grad_norm": 9.3125,
"learning_rate": 7.63129102844639e-06,
"loss": 1.2157,
"mean_token_accuracy": 0.6570103228092193,
"num_tokens": 3304019.0,
"step": 1300
},
{
"epoch": 0.7166301969365426,
"grad_norm": 9.4375,
"learning_rate": 7.6130561633843915e-06,
"loss": 1.2026,
"mean_token_accuracy": 0.6590713143348694,
"num_tokens": 3329456.0,
"step": 1310
},
{
"epoch": 0.7221006564551422,
"grad_norm": 9.1875,
"learning_rate": 7.594821298322393e-06,
"loss": 1.2449,
"mean_token_accuracy": 0.6509311914443969,
"num_tokens": 3355024.0,
"step": 1320
},
{
"epoch": 0.7275711159737418,
"grad_norm": 8.875,
"learning_rate": 7.576586433260395e-06,
"loss": 1.1989,
"mean_token_accuracy": 0.6584409058094025,
"num_tokens": 3380203.0,
"step": 1330
},
{
"epoch": 0.7330415754923414,
"grad_norm": 9.0,
"learning_rate": 7.5583515681983954e-06,
"loss": 1.1704,
"mean_token_accuracy": 0.6633239209651947,
"num_tokens": 3405735.0,
"step": 1340
},
{
"epoch": 0.7385120350109409,
"grad_norm": 9.3125,
"learning_rate": 7.540116703136398e-06,
"loss": 1.1788,
"mean_token_accuracy": 0.6651252508163452,
"num_tokens": 3431101.0,
"step": 1350
},
{
"epoch": 0.7439824945295405,
"grad_norm": 9.0625,
"learning_rate": 7.521881838074399e-06,
"loss": 1.2175,
"mean_token_accuracy": 0.6579277157783509,
"num_tokens": 3456836.0,
"step": 1360
},
{
"epoch": 0.74945295404814,
"grad_norm": 9.25,
"learning_rate": 7.503646973012401e-06,
"loss": 1.236,
"mean_token_accuracy": 0.6502565503120422,
"num_tokens": 3482262.0,
"step": 1370
},
{
"epoch": 0.7549234135667396,
"grad_norm": 9.4375,
"learning_rate": 7.485412107950402e-06,
"loss": 1.2048,
"mean_token_accuracy": 0.6587684571743011,
"num_tokens": 3507578.0,
"step": 1380
},
{
"epoch": 0.7603938730853391,
"grad_norm": 8.875,
"learning_rate": 7.467177242888403e-06,
"loss": 1.1982,
"mean_token_accuracy": 0.6572374284267426,
"num_tokens": 3533438.0,
"step": 1390
},
{
"epoch": 0.7658643326039387,
"grad_norm": 9.3125,
"learning_rate": 7.448942377826405e-06,
"loss": 1.1968,
"mean_token_accuracy": 0.6594564735889434,
"num_tokens": 3558955.0,
"step": 1400
},
{
"epoch": 0.7713347921225383,
"grad_norm": 9.0,
"learning_rate": 7.430707512764406e-06,
"loss": 1.1535,
"mean_token_accuracy": 0.6660135149955749,
"num_tokens": 3584024.0,
"step": 1410
},
{
"epoch": 0.7768052516411379,
"grad_norm": 8.8125,
"learning_rate": 7.412472647702408e-06,
"loss": 1.1802,
"mean_token_accuracy": 0.6659539043903351,
"num_tokens": 3609518.0,
"step": 1420
},
{
"epoch": 0.7822757111597374,
"grad_norm": 8.8125,
"learning_rate": 7.394237782640409e-06,
"loss": 1.2023,
"mean_token_accuracy": 0.6609654724597931,
"num_tokens": 3635205.0,
"step": 1430
},
{
"epoch": 0.787746170678337,
"grad_norm": 9.0625,
"learning_rate": 7.3760029175784105e-06,
"loss": 1.2011,
"mean_token_accuracy": 0.6600887596607208,
"num_tokens": 3660799.0,
"step": 1440
},
{
"epoch": 0.7932166301969366,
"grad_norm": 9.125,
"learning_rate": 7.357768052516412e-06,
"loss": 1.2188,
"mean_token_accuracy": 0.6550438821315765,
"num_tokens": 3686173.0,
"step": 1450
},
{
"epoch": 0.7986870897155361,
"grad_norm": 8.8125,
"learning_rate": 7.339533187454413e-06,
"loss": 1.1869,
"mean_token_accuracy": 0.6660338282585144,
"num_tokens": 3711634.0,
"step": 1460
},
{
"epoch": 0.8041575492341356,
"grad_norm": 8.5,
"learning_rate": 7.321298322392415e-06,
"loss": 1.2082,
"mean_token_accuracy": 0.6563746392726898,
"num_tokens": 3737277.0,
"step": 1470
},
{
"epoch": 0.8096280087527352,
"grad_norm": 9.125,
"learning_rate": 7.303063457330416e-06,
"loss": 1.2071,
"mean_token_accuracy": 0.6584551155567169,
"num_tokens": 3763061.0,
"step": 1480
},
{
"epoch": 0.8150984682713348,
"grad_norm": 9.375,
"learning_rate": 7.2848285922684185e-06,
"loss": 1.1978,
"mean_token_accuracy": 0.6639045178890228,
"num_tokens": 3788680.0,
"step": 1490
},
{
"epoch": 0.8205689277899344,
"grad_norm": 8.875,
"learning_rate": 7.266593727206419e-06,
"loss": 1.2125,
"mean_token_accuracy": 0.6617669343948365,
"num_tokens": 3814212.0,
"step": 1500
},
{
"epoch": 0.8260393873085339,
"grad_norm": 9.0625,
"learning_rate": 7.248358862144421e-06,
"loss": 1.1955,
"mean_token_accuracy": 0.6609017014503479,
"num_tokens": 3839753.0,
"step": 1510
},
{
"epoch": 0.8315098468271335,
"grad_norm": 9.375,
"learning_rate": 7.230123997082422e-06,
"loss": 1.1969,
"mean_token_accuracy": 0.6626928210258484,
"num_tokens": 3865200.0,
"step": 1520
},
{
"epoch": 0.8369803063457331,
"grad_norm": 8.75,
"learning_rate": 7.211889132020423e-06,
"loss": 1.2099,
"mean_token_accuracy": 0.662108862400055,
"num_tokens": 3891102.0,
"step": 1530
},
{
"epoch": 0.8424507658643327,
"grad_norm": 8.8125,
"learning_rate": 7.1936542669584256e-06,
"loss": 1.1902,
"mean_token_accuracy": 0.6671268463134765,
"num_tokens": 3915979.0,
"step": 1540
},
{
"epoch": 0.8479212253829321,
"grad_norm": 8.875,
"learning_rate": 7.175419401896426e-06,
"loss": 1.2321,
"mean_token_accuracy": 0.651692271232605,
"num_tokens": 3941101.0,
"step": 1550
},
{
"epoch": 0.8533916849015317,
"grad_norm": 8.75,
"learning_rate": 7.157184536834429e-06,
"loss": 1.2018,
"mean_token_accuracy": 0.6599911451339722,
"num_tokens": 3966588.0,
"step": 1560
},
{
"epoch": 0.8588621444201313,
"grad_norm": 9.6875,
"learning_rate": 7.1389496717724295e-06,
"loss": 1.1601,
"mean_token_accuracy": 0.6702842473983764,
"num_tokens": 3991816.0,
"step": 1570
},
{
"epoch": 0.8643326039387309,
"grad_norm": 9.125,
"learning_rate": 7.120714806710431e-06,
"loss": 1.2061,
"mean_token_accuracy": 0.6644167900085449,
"num_tokens": 4017159.0,
"step": 1580
},
{
"epoch": 0.8698030634573304,
"grad_norm": 9.0625,
"learning_rate": 7.102479941648433e-06,
"loss": 1.1982,
"mean_token_accuracy": 0.661655330657959,
"num_tokens": 4042412.0,
"step": 1590
},
{
"epoch": 0.87527352297593,
"grad_norm": 9.25,
"learning_rate": 7.0842450765864334e-06,
"loss": 1.1893,
"mean_token_accuracy": 0.6662971138954162,
"num_tokens": 4068029.0,
"step": 1600
},
{
"epoch": 0.8807439824945296,
"grad_norm": 8.75,
"learning_rate": 7.066010211524436e-06,
"loss": 1.2143,
"mean_token_accuracy": 0.6562692880630493,
"num_tokens": 4093581.0,
"step": 1610
},
{
"epoch": 0.8862144420131292,
"grad_norm": 8.8125,
"learning_rate": 7.047775346462437e-06,
"loss": 1.164,
"mean_token_accuracy": 0.6694628000259399,
"num_tokens": 4118811.0,
"step": 1620
},
{
"epoch": 0.8916849015317286,
"grad_norm": 9.25,
"learning_rate": 7.029540481400439e-06,
"loss": 1.2158,
"mean_token_accuracy": 0.6589863717555999,
"num_tokens": 4144422.0,
"step": 1630
},
{
"epoch": 0.8971553610503282,
"grad_norm": 9.375,
"learning_rate": 7.01130561633844e-06,
"loss": 1.2068,
"mean_token_accuracy": 0.6575411677360534,
"num_tokens": 4169788.0,
"step": 1640
},
{
"epoch": 0.9026258205689278,
"grad_norm": 9.1875,
"learning_rate": 6.9930707512764405e-06,
"loss": 1.2022,
"mean_token_accuracy": 0.6641773998737335,
"num_tokens": 4195416.0,
"step": 1650
},
{
"epoch": 0.9080962800875274,
"grad_norm": 8.9375,
"learning_rate": 6.974835886214443e-06,
"loss": 1.2009,
"mean_token_accuracy": 0.6612559735774994,
"num_tokens": 4220959.0,
"step": 1660
},
{
"epoch": 0.9135667396061269,
"grad_norm": 9.375,
"learning_rate": 6.956601021152444e-06,
"loss": 1.1772,
"mean_token_accuracy": 0.6646326899528503,
"num_tokens": 4246261.0,
"step": 1670
},
{
"epoch": 0.9190371991247265,
"grad_norm": 8.75,
"learning_rate": 6.938366156090446e-06,
"loss": 1.2133,
"mean_token_accuracy": 0.660455447435379,
"num_tokens": 4271550.0,
"step": 1680
},
{
"epoch": 0.9245076586433261,
"grad_norm": 9.5,
"learning_rate": 6.920131291028447e-06,
"loss": 1.198,
"mean_token_accuracy": 0.661461490392685,
"num_tokens": 4297359.0,
"step": 1690
},
{
"epoch": 0.9299781181619255,
"grad_norm": 8.9375,
"learning_rate": 6.901896425966449e-06,
"loss": 1.2274,
"mean_token_accuracy": 0.6552061200141907,
"num_tokens": 4322920.0,
"step": 1700
},
{
"epoch": 0.9354485776805251,
"grad_norm": 9.1875,
"learning_rate": 6.88366156090445e-06,
"loss": 1.1856,
"mean_token_accuracy": 0.6640595495700836,
"num_tokens": 4348175.0,
"step": 1710
},
{
"epoch": 0.9409190371991247,
"grad_norm": 9.125,
"learning_rate": 6.865426695842451e-06,
"loss": 1.2074,
"mean_token_accuracy": 0.6573819875717163,
"num_tokens": 4373898.0,
"step": 1720
},
{
"epoch": 0.9463894967177243,
"grad_norm": 9.0,
"learning_rate": 6.847191830780453e-06,
"loss": 1.2118,
"mean_token_accuracy": 0.6596818029880523,
"num_tokens": 4399398.0,
"step": 1730
},
{
"epoch": 0.9518599562363238,
"grad_norm": 9.125,
"learning_rate": 6.828956965718454e-06,
"loss": 1.1888,
"mean_token_accuracy": 0.6635434806346894,
"num_tokens": 4424889.0,
"step": 1740
},
{
"epoch": 0.9573304157549234,
"grad_norm": 9.0625,
"learning_rate": 6.8107221006564564e-06,
"loss": 1.1966,
"mean_token_accuracy": 0.6608928442001343,
"num_tokens": 4449872.0,
"step": 1750
},
{
"epoch": 0.962800875273523,
"grad_norm": 9.4375,
"learning_rate": 6.792487235594457e-06,
"loss": 1.197,
"mean_token_accuracy": 0.6581675052642822,
"num_tokens": 4475097.0,
"step": 1760
},
{
"epoch": 0.9682713347921226,
"grad_norm": 8.8125,
"learning_rate": 6.774252370532458e-06,
"loss": 1.164,
"mean_token_accuracy": 0.6660536825656891,
"num_tokens": 4500595.0,
"step": 1770
},
{
"epoch": 0.973741794310722,
"grad_norm": 9.0625,
"learning_rate": 6.75601750547046e-06,
"loss": 1.1718,
"mean_token_accuracy": 0.6656007945537568,
"num_tokens": 4526142.0,
"step": 1780
},
{
"epoch": 0.9792122538293216,
"grad_norm": 9.625,
"learning_rate": 6.737782640408461e-06,
"loss": 1.1778,
"mean_token_accuracy": 0.670012629032135,
"num_tokens": 4551204.0,
"step": 1790
},
{
"epoch": 0.9846827133479212,
"grad_norm": 8.625,
"learning_rate": 6.7195477753464636e-06,
"loss": 1.2063,
"mean_token_accuracy": 0.6588987648487091,
"num_tokens": 4576664.0,
"step": 1800
},
{
"epoch": 0.9901531728665208,
"grad_norm": 9.3125,
"learning_rate": 6.701312910284464e-06,
"loss": 1.1862,
"mean_token_accuracy": 0.6618767023086548,
"num_tokens": 4602021.0,
"step": 1810
},
{
"epoch": 0.9956236323851203,
"grad_norm": 8.875,
"learning_rate": 6.683078045222467e-06,
"loss": 1.1911,
"mean_token_accuracy": 0.6659906089305878,
"num_tokens": 4627687.0,
"step": 1820
},
{
"epoch": 1.00109409190372,
"grad_norm": 8.3125,
"learning_rate": 6.6648431801604675e-06,
"loss": 1.2073,
"mean_token_accuracy": 0.6611107409000396,
"num_tokens": 4653161.0,
"step": 1830
},
{
"epoch": 1.0065645514223194,
"grad_norm": 8.8125,
"learning_rate": 6.646608315098468e-06,
"loss": 1.04,
"mean_token_accuracy": 0.6974774897098541,
"num_tokens": 4678651.0,
"step": 1840
},
{
"epoch": 1.012035010940919,
"grad_norm": 9.75,
"learning_rate": 6.628373450036471e-06,
"loss": 1.0189,
"mean_token_accuracy": 0.7027334988117218,
"num_tokens": 4703974.0,
"step": 1850
},
{
"epoch": 1.0175054704595186,
"grad_norm": 9.5625,
"learning_rate": 6.610138584974471e-06,
"loss": 1.0393,
"mean_token_accuracy": 0.6992483794689178,
"num_tokens": 4729182.0,
"step": 1860
},
{
"epoch": 1.0229759299781183,
"grad_norm": 9.1875,
"learning_rate": 6.591903719912474e-06,
"loss": 1.0248,
"mean_token_accuracy": 0.6999619722366333,
"num_tokens": 4754700.0,
"step": 1870
},
{
"epoch": 1.0284463894967177,
"grad_norm": 9.4375,
"learning_rate": 6.573668854850475e-06,
"loss": 1.0292,
"mean_token_accuracy": 0.699607890844345,
"num_tokens": 4780368.0,
"step": 1880
},
{
"epoch": 1.0339168490153172,
"grad_norm": 9.5625,
"learning_rate": 6.555433989788475e-06,
"loss": 1.0111,
"mean_token_accuracy": 0.7044564247131347,
"num_tokens": 4805626.0,
"step": 1890
},
{
"epoch": 1.039387308533917,
"grad_norm": 9.5,
"learning_rate": 6.537199124726478e-06,
"loss": 1.0417,
"mean_token_accuracy": 0.6986953377723694,
"num_tokens": 4830769.0,
"step": 1900
},
{
"epoch": 1.0448577680525164,
"grad_norm": 9.875,
"learning_rate": 6.5189642596644785e-06,
"loss": 0.9978,
"mean_token_accuracy": 0.7059278726577759,
"num_tokens": 4856510.0,
"step": 1910
},
{
"epoch": 1.0503282275711159,
"grad_norm": 9.875,
"learning_rate": 6.500729394602481e-06,
"loss": 1.041,
"mean_token_accuracy": 0.6938862383365632,
"num_tokens": 4881433.0,
"step": 1920
},
{
"epoch": 1.0557986870897156,
"grad_norm": 9.9375,
"learning_rate": 6.482494529540482e-06,
"loss": 1.0299,
"mean_token_accuracy": 0.7019344508647919,
"num_tokens": 4906727.0,
"step": 1930
},
{
"epoch": 1.061269146608315,
"grad_norm": 10.3125,
"learning_rate": 6.464259664478484e-06,
"loss": 1.0445,
"mean_token_accuracy": 0.6943408429622651,
"num_tokens": 4932196.0,
"step": 1940
},
{
"epoch": 1.0667396061269148,
"grad_norm": 10.25,
"learning_rate": 6.446024799416485e-06,
"loss": 1.0244,
"mean_token_accuracy": 0.700201416015625,
"num_tokens": 4957115.0,
"step": 1950
},
{
"epoch": 1.0722100656455142,
"grad_norm": 9.0625,
"learning_rate": 6.427789934354486e-06,
"loss": 1.0142,
"mean_token_accuracy": 0.7004279494285583,
"num_tokens": 4982766.0,
"step": 1960
},
{
"epoch": 1.0776805251641137,
"grad_norm": 10.25,
"learning_rate": 6.409555069292488e-06,
"loss": 1.0371,
"mean_token_accuracy": 0.6925820887088776,
"num_tokens": 5008061.0,
"step": 1970
},
{
"epoch": 1.0831509846827134,
"grad_norm": 9.6875,
"learning_rate": 6.391320204230489e-06,
"loss": 1.0356,
"mean_token_accuracy": 0.7003223180770874,
"num_tokens": 5033178.0,
"step": 1980
},
{
"epoch": 1.088621444201313,
"grad_norm": 10.125,
"learning_rate": 6.373085339168491e-06,
"loss": 1.0496,
"mean_token_accuracy": 0.6927937686443328,
"num_tokens": 5058461.0,
"step": 1990
},
{
"epoch": 1.0940919037199124,
"grad_norm": 9.9375,
"learning_rate": 6.354850474106492e-06,
"loss": 1.0666,
"mean_token_accuracy": 0.6866900682449341,
"num_tokens": 5083870.0,
"step": 2000
},
{
"epoch": 1.099562363238512,
"grad_norm": 10.25,
"learning_rate": 6.3366156090444944e-06,
"loss": 1.0257,
"mean_token_accuracy": 0.6989123821258545,
"num_tokens": 5109168.0,
"step": 2010
},
{
"epoch": 1.1050328227571116,
"grad_norm": 10.5,
"learning_rate": 6.318380743982495e-06,
"loss": 1.0333,
"mean_token_accuracy": 0.6956054985523223,
"num_tokens": 5134952.0,
"step": 2020
},
{
"epoch": 1.1105032822757113,
"grad_norm": 9.625,
"learning_rate": 6.300145878920496e-06,
"loss": 1.0169,
"mean_token_accuracy": 0.6954958021640778,
"num_tokens": 5160606.0,
"step": 2030
},
{
"epoch": 1.1159737417943107,
"grad_norm": 9.125,
"learning_rate": 6.281911013858498e-06,
"loss": 1.023,
"mean_token_accuracy": 0.7010883867740632,
"num_tokens": 5186194.0,
"step": 2040
},
{
"epoch": 1.1214442013129102,
"grad_norm": 10.0625,
"learning_rate": 6.263676148796499e-06,
"loss": 1.0373,
"mean_token_accuracy": 0.694850617647171,
"num_tokens": 5211467.0,
"step": 2050
},
{
"epoch": 1.12691466083151,
"grad_norm": 9.375,
"learning_rate": 6.2454412837345015e-06,
"loss": 1.023,
"mean_token_accuracy": 0.6988016486167907,
"num_tokens": 5236817.0,
"step": 2060
},
{
"epoch": 1.1323851203501094,
"grad_norm": 9.6875,
"learning_rate": 6.227206418672502e-06,
"loss": 1.0054,
"mean_token_accuracy": 0.704439902305603,
"num_tokens": 5262238.0,
"step": 2070
},
{
"epoch": 1.1378555798687089,
"grad_norm": 9.6875,
"learning_rate": 6.208971553610503e-06,
"loss": 1.0176,
"mean_token_accuracy": 0.6989111959934234,
"num_tokens": 5287171.0,
"step": 2080
},
{
"epoch": 1.1433260393873086,
"grad_norm": 9.4375,
"learning_rate": 6.1907366885485055e-06,
"loss": 1.0479,
"mean_token_accuracy": 0.6994122922420501,
"num_tokens": 5312851.0,
"step": 2090
},
{
"epoch": 1.148796498905908,
"grad_norm": 10.0,
"learning_rate": 6.172501823486506e-06,
"loss": 1.0211,
"mean_token_accuracy": 0.7004438996315002,
"num_tokens": 5338784.0,
"step": 2100
},
{
"epoch": 1.1542669584245075,
"grad_norm": 10.0,
"learning_rate": 6.154266958424509e-06,
"loss": 1.0014,
"mean_token_accuracy": 0.7057562172412872,
"num_tokens": 5364113.0,
"step": 2110
},
{
"epoch": 1.1597374179431073,
"grad_norm": 9.9375,
"learning_rate": 6.136032093362509e-06,
"loss": 1.0,
"mean_token_accuracy": 0.7027003407478333,
"num_tokens": 5389799.0,
"step": 2120
},
{
"epoch": 1.1652078774617067,
"grad_norm": 10.0,
"learning_rate": 6.117797228300512e-06,
"loss": 1.0171,
"mean_token_accuracy": 0.7003865003585815,
"num_tokens": 5415045.0,
"step": 2130
},
{
"epoch": 1.1706783369803064,
"grad_norm": 9.75,
"learning_rate": 6.099562363238513e-06,
"loss": 1.0371,
"mean_token_accuracy": 0.6990915298461914,
"num_tokens": 5440323.0,
"step": 2140
},
{
"epoch": 1.176148796498906,
"grad_norm": 9.9375,
"learning_rate": 6.081327498176513e-06,
"loss": 1.0348,
"mean_token_accuracy": 0.7001543581485749,
"num_tokens": 5465572.0,
"step": 2150
},
{
"epoch": 1.1816192560175054,
"grad_norm": 9.8125,
"learning_rate": 6.063092633114516e-06,
"loss": 1.0207,
"mean_token_accuracy": 0.6994670450687408,
"num_tokens": 5491158.0,
"step": 2160
},
{
"epoch": 1.187089715536105,
"grad_norm": 9.875,
"learning_rate": 6.0448577680525165e-06,
"loss": 1.0209,
"mean_token_accuracy": 0.700932627916336,
"num_tokens": 5516927.0,
"step": 2170
},
{
"epoch": 1.1925601750547046,
"grad_norm": 9.9375,
"learning_rate": 6.026622902990519e-06,
"loss": 1.0589,
"mean_token_accuracy": 0.6895469307899476,
"num_tokens": 5542251.0,
"step": 2180
},
{
"epoch": 1.1980306345733043,
"grad_norm": 10.375,
"learning_rate": 6.00838803792852e-06,
"loss": 1.0421,
"mean_token_accuracy": 0.6975519716739654,
"num_tokens": 5567438.0,
"step": 2190
},
{
"epoch": 1.2035010940919038,
"grad_norm": 10.1875,
"learning_rate": 5.9901531728665204e-06,
"loss": 1.0358,
"mean_token_accuracy": 0.6961750984191895,
"num_tokens": 5592726.0,
"step": 2200
},
{
"epoch": 1.2089715536105032,
"grad_norm": 9.4375,
"learning_rate": 5.971918307804523e-06,
"loss": 1.0394,
"mean_token_accuracy": 0.6967484354972839,
"num_tokens": 5618018.0,
"step": 2210
},
{
"epoch": 1.214442013129103,
"grad_norm": 9.5625,
"learning_rate": 5.953683442742524e-06,
"loss": 1.0465,
"mean_token_accuracy": 0.6956069469451904,
"num_tokens": 5643607.0,
"step": 2220
},
{
"epoch": 1.2199124726477024,
"grad_norm": 9.8125,
"learning_rate": 5.935448577680526e-06,
"loss": 1.0473,
"mean_token_accuracy": 0.6949532628059387,
"num_tokens": 5668817.0,
"step": 2230
},
{
"epoch": 1.225382932166302,
"grad_norm": 9.625,
"learning_rate": 5.917213712618527e-06,
"loss": 0.9865,
"mean_token_accuracy": 0.7074730455875397,
"num_tokens": 5694321.0,
"step": 2240
},
{
"epoch": 1.2308533916849016,
"grad_norm": 10.0,
"learning_rate": 5.898978847556529e-06,
"loss": 1.039,
"mean_token_accuracy": 0.6975044906139374,
"num_tokens": 5719832.0,
"step": 2250
},
{
"epoch": 1.236323851203501,
"grad_norm": 9.875,
"learning_rate": 5.88074398249453e-06,
"loss": 1.0395,
"mean_token_accuracy": 0.6937489449977875,
"num_tokens": 5745267.0,
"step": 2260
},
{
"epoch": 1.2417943107221006,
"grad_norm": 9.4375,
"learning_rate": 5.862509117432531e-06,
"loss": 1.0194,
"mean_token_accuracy": 0.6990994691848755,
"num_tokens": 5770849.0,
"step": 2270
},
{
"epoch": 1.2472647702407003,
"grad_norm": 10.1875,
"learning_rate": 5.844274252370533e-06,
"loss": 1.0543,
"mean_token_accuracy": 0.6901531100273133,
"num_tokens": 5795693.0,
"step": 2280
},
{
"epoch": 1.2527352297592997,
"grad_norm": 9.8125,
"learning_rate": 5.826039387308534e-06,
"loss": 1.0315,
"mean_token_accuracy": 0.7007197082042694,
"num_tokens": 5820987.0,
"step": 2290
},
{
"epoch": 1.2582056892778994,
"grad_norm": 10.125,
"learning_rate": 5.807804522246536e-06,
"loss": 1.0334,
"mean_token_accuracy": 0.6947225153446197,
"num_tokens": 5846512.0,
"step": 2300
},
{
"epoch": 1.263676148796499,
"grad_norm": 10.5,
"learning_rate": 5.789569657184537e-06,
"loss": 1.0451,
"mean_token_accuracy": 0.6949298202991485,
"num_tokens": 5872260.0,
"step": 2310
},
{
"epoch": 1.2691466083150984,
"grad_norm": 10.0,
"learning_rate": 5.771334792122538e-06,
"loss": 1.0218,
"mean_token_accuracy": 0.7000970780849457,
"num_tokens": 5897464.0,
"step": 2320
},
{
"epoch": 1.274617067833698,
"grad_norm": 10.0625,
"learning_rate": 5.75309992706054e-06,
"loss": 0.9975,
"mean_token_accuracy": 0.7064898908138275,
"num_tokens": 5922759.0,
"step": 2330
},
{
"epoch": 1.2800875273522976,
"grad_norm": 10.5,
"learning_rate": 5.734865061998541e-06,
"loss": 1.0501,
"mean_token_accuracy": 0.693008977174759,
"num_tokens": 5948112.0,
"step": 2340
},
{
"epoch": 1.2855579868708973,
"grad_norm": 10.5,
"learning_rate": 5.7166301969365435e-06,
"loss": 1.0517,
"mean_token_accuracy": 0.6909720063209533,
"num_tokens": 5973664.0,
"step": 2350
},
{
"epoch": 1.2910284463894968,
"grad_norm": 9.5625,
"learning_rate": 5.698395331874544e-06,
"loss": 1.0002,
"mean_token_accuracy": 0.7026338756084443,
"num_tokens": 5999362.0,
"step": 2360
},
{
"epoch": 1.2964989059080962,
"grad_norm": 9.875,
"learning_rate": 5.680160466812547e-06,
"loss": 1.0145,
"mean_token_accuracy": 0.7004771769046784,
"num_tokens": 6025166.0,
"step": 2370
},
{
"epoch": 1.3019693654266957,
"grad_norm": 10.0,
"learning_rate": 5.661925601750547e-06,
"loss": 1.01,
"mean_token_accuracy": 0.7036912262439727,
"num_tokens": 6050665.0,
"step": 2380
},
{
"epoch": 1.3074398249452954,
"grad_norm": 10.25,
"learning_rate": 5.643690736688548e-06,
"loss": 1.027,
"mean_token_accuracy": 0.7006226062774659,
"num_tokens": 6075993.0,
"step": 2390
},
{
"epoch": 1.312910284463895,
"grad_norm": 10.3125,
"learning_rate": 5.6254558716265506e-06,
"loss": 0.9982,
"mean_token_accuracy": 0.7021169304847718,
"num_tokens": 6101165.0,
"step": 2400
},
{
"epoch": 1.3183807439824946,
"grad_norm": 9.75,
"learning_rate": 5.607221006564551e-06,
"loss": 1.0364,
"mean_token_accuracy": 0.6955066680908203,
"num_tokens": 6126628.0,
"step": 2410
},
{
"epoch": 1.323851203501094,
"grad_norm": 10.5,
"learning_rate": 5.588986141502554e-06,
"loss": 1.0552,
"mean_token_accuracy": 0.6929129481315612,
"num_tokens": 6152116.0,
"step": 2420
},
{
"epoch": 1.3293216630196936,
"grad_norm": 9.8125,
"learning_rate": 5.5707512764405545e-06,
"loss": 1.0342,
"mean_token_accuracy": 0.6941097199916839,
"num_tokens": 6177489.0,
"step": 2430
},
{
"epoch": 1.3347921225382933,
"grad_norm": 9.875,
"learning_rate": 5.552516411378557e-06,
"loss": 1.039,
"mean_token_accuracy": 0.697142231464386,
"num_tokens": 6202873.0,
"step": 2440
},
{
"epoch": 1.3402625820568927,
"grad_norm": 9.375,
"learning_rate": 5.534281546316558e-06,
"loss": 1.0287,
"mean_token_accuracy": 0.6986703455448151,
"num_tokens": 6228667.0,
"step": 2450
},
{
"epoch": 1.3457330415754925,
"grad_norm": 9.75,
"learning_rate": 5.516046681254558e-06,
"loss": 1.0121,
"mean_token_accuracy": 0.7000769674777985,
"num_tokens": 6253923.0,
"step": 2460
},
{
"epoch": 1.351203501094092,
"grad_norm": 9.75,
"learning_rate": 5.497811816192561e-06,
"loss": 1.011,
"mean_token_accuracy": 0.7024608731269837,
"num_tokens": 6279341.0,
"step": 2470
},
{
"epoch": 1.3566739606126914,
"grad_norm": 9.6875,
"learning_rate": 5.479576951130562e-06,
"loss": 1.0576,
"mean_token_accuracy": 0.6930296897888184,
"num_tokens": 6304966.0,
"step": 2480
},
{
"epoch": 1.3621444201312911,
"grad_norm": 10.5,
"learning_rate": 5.461342086068564e-06,
"loss": 0.9841,
"mean_token_accuracy": 0.7117383360862732,
"num_tokens": 6330323.0,
"step": 2490
},
{
"epoch": 1.3676148796498906,
"grad_norm": 9.4375,
"learning_rate": 5.443107221006565e-06,
"loss": 1.0234,
"mean_token_accuracy": 0.697024530172348,
"num_tokens": 6356015.0,
"step": 2500
},
{
"epoch": 1.37308533916849,
"grad_norm": 10.0,
"learning_rate": 5.424872355944566e-06,
"loss": 1.0141,
"mean_token_accuracy": 0.7016679644584656,
"num_tokens": 6381515.0,
"step": 2510
},
{
"epoch": 1.3785557986870898,
"grad_norm": 9.625,
"learning_rate": 5.406637490882568e-06,
"loss": 1.038,
"mean_token_accuracy": 0.6987586081027984,
"num_tokens": 6407329.0,
"step": 2520
},
{
"epoch": 1.3840262582056893,
"grad_norm": 10.0625,
"learning_rate": 5.388402625820569e-06,
"loss": 1.0408,
"mean_token_accuracy": 0.6964347183704376,
"num_tokens": 6433411.0,
"step": 2530
},
{
"epoch": 1.3894967177242887,
"grad_norm": 9.75,
"learning_rate": 5.370167760758571e-06,
"loss": 1.0198,
"mean_token_accuracy": 0.7017770171165466,
"num_tokens": 6459012.0,
"step": 2540
},
{
"epoch": 1.3949671772428884,
"grad_norm": 9.9375,
"learning_rate": 5.351932895696572e-06,
"loss": 1.0705,
"mean_token_accuracy": 0.6890166878700257,
"num_tokens": 6484324.0,
"step": 2550
},
{
"epoch": 1.400437636761488,
"grad_norm": 9.5625,
"learning_rate": 5.333698030634574e-06,
"loss": 1.0185,
"mean_token_accuracy": 0.7025589048862457,
"num_tokens": 6509536.0,
"step": 2560
},
{
"epoch": 1.4059080962800876,
"grad_norm": 10.1875,
"learning_rate": 5.315463165572575e-06,
"loss": 1.0356,
"mean_token_accuracy": 0.7018558621406555,
"num_tokens": 6534962.0,
"step": 2570
},
{
"epoch": 1.411378555798687,
"grad_norm": 9.3125,
"learning_rate": 5.297228300510576e-06,
"loss": 1.0416,
"mean_token_accuracy": 0.6961363673210144,
"num_tokens": 6560702.0,
"step": 2580
},
{
"epoch": 1.4168490153172866,
"grad_norm": 10.4375,
"learning_rate": 5.278993435448578e-06,
"loss": 1.0248,
"mean_token_accuracy": 0.7003186762332916,
"num_tokens": 6586229.0,
"step": 2590
},
{
"epoch": 1.4223194748358863,
"grad_norm": 10.125,
"learning_rate": 5.260758570386579e-06,
"loss": 1.0435,
"mean_token_accuracy": 0.6960846245288849,
"num_tokens": 6611544.0,
"step": 2600
},
{
"epoch": 1.4277899343544858,
"grad_norm": 9.8125,
"learning_rate": 5.2425237053245814e-06,
"loss": 1.0204,
"mean_token_accuracy": 0.7010906517505646,
"num_tokens": 6636774.0,
"step": 2610
},
{
"epoch": 1.4332603938730855,
"grad_norm": 10.25,
"learning_rate": 5.224288840262582e-06,
"loss": 0.9894,
"mean_token_accuracy": 0.7089883327484131,
"num_tokens": 6661874.0,
"step": 2620
},
{
"epoch": 1.438730853391685,
"grad_norm": 10.3125,
"learning_rate": 5.206053975200584e-06,
"loss": 1.0123,
"mean_token_accuracy": 0.7034552216529846,
"num_tokens": 6686911.0,
"step": 2630
},
{
"epoch": 1.4442013129102844,
"grad_norm": 10.375,
"learning_rate": 5.187819110138585e-06,
"loss": 1.0494,
"mean_token_accuracy": 0.6926229059696197,
"num_tokens": 6712162.0,
"step": 2640
},
{
"epoch": 1.449671772428884,
"grad_norm": 10.0625,
"learning_rate": 5.169584245076586e-06,
"loss": 1.006,
"mean_token_accuracy": 0.7052405416965485,
"num_tokens": 6737129.0,
"step": 2650
},
{
"epoch": 1.4551422319474836,
"grad_norm": 10.25,
"learning_rate": 5.1513493800145886e-06,
"loss": 1.0193,
"mean_token_accuracy": 0.7007757365703583,
"num_tokens": 6762564.0,
"step": 2660
},
{
"epoch": 1.460612691466083,
"grad_norm": 9.875,
"learning_rate": 5.133114514952589e-06,
"loss": 1.0164,
"mean_token_accuracy": 0.7008088171482086,
"num_tokens": 6788114.0,
"step": 2670
},
{
"epoch": 1.4660831509846828,
"grad_norm": 10.6875,
"learning_rate": 5.114879649890592e-06,
"loss": 1.0335,
"mean_token_accuracy": 0.6988726854324341,
"num_tokens": 6813529.0,
"step": 2680
},
{
"epoch": 1.4715536105032823,
"grad_norm": 9.9375,
"learning_rate": 5.0966447848285925e-06,
"loss": 1.0457,
"mean_token_accuracy": 0.697508692741394,
"num_tokens": 6839250.0,
"step": 2690
},
{
"epoch": 1.4770240700218817,
"grad_norm": 9.75,
"learning_rate": 5.078409919766594e-06,
"loss": 1.0272,
"mean_token_accuracy": 0.700483775138855,
"num_tokens": 6864969.0,
"step": 2700
},
{
"epoch": 1.4824945295404814,
"grad_norm": 10.5625,
"learning_rate": 5.060175054704596e-06,
"loss": 1.0065,
"mean_token_accuracy": 0.7078005909919739,
"num_tokens": 6890107.0,
"step": 2710
},
{
"epoch": 1.487964989059081,
"grad_norm": 10.3125,
"learning_rate": 5.041940189642596e-06,
"loss": 1.0484,
"mean_token_accuracy": 0.6926610350608826,
"num_tokens": 6915446.0,
"step": 2720
},
{
"epoch": 1.4934354485776806,
"grad_norm": 9.875,
"learning_rate": 5.023705324580599e-06,
"loss": 0.9933,
"mean_token_accuracy": 0.7024588465690613,
"num_tokens": 6940768.0,
"step": 2730
},
{
"epoch": 1.49890590809628,
"grad_norm": 10.0625,
"learning_rate": 5.0054704595186e-06,
"loss": 1.013,
"mean_token_accuracy": 0.7026680052280426,
"num_tokens": 6966595.0,
"step": 2740
},
{
"epoch": 1.5043763676148796,
"grad_norm": 10.6875,
"learning_rate": 4.987235594456601e-06,
"loss": 1.0465,
"mean_token_accuracy": 0.696220201253891,
"num_tokens": 6991472.0,
"step": 2750
},
{
"epoch": 1.509846827133479,
"grad_norm": 10.0625,
"learning_rate": 4.969000729394603e-06,
"loss": 1.0226,
"mean_token_accuracy": 0.7012458741664886,
"num_tokens": 7016868.0,
"step": 2760
},
{
"epoch": 1.5153172866520788,
"grad_norm": 9.5625,
"learning_rate": 4.950765864332604e-06,
"loss": 1.0363,
"mean_token_accuracy": 0.6987705647945404,
"num_tokens": 7042904.0,
"step": 2770
},
{
"epoch": 1.5207877461706785,
"grad_norm": 10.0,
"learning_rate": 4.932530999270605e-06,
"loss": 1.0403,
"mean_token_accuracy": 0.6974053025245667,
"num_tokens": 7068329.0,
"step": 2780
},
{
"epoch": 1.526258205689278,
"grad_norm": 10.5625,
"learning_rate": 4.914296134208607e-06,
"loss": 1.0297,
"mean_token_accuracy": 0.6953822433948517,
"num_tokens": 7093465.0,
"step": 2790
},
{
"epoch": 1.5317286652078774,
"grad_norm": 9.6875,
"learning_rate": 4.896061269146608e-06,
"loss": 1.0098,
"mean_token_accuracy": 0.7032950639724731,
"num_tokens": 7119262.0,
"step": 2800
},
{
"epoch": 1.537199124726477,
"grad_norm": 10.0625,
"learning_rate": 4.87782640408461e-06,
"loss": 1.0469,
"mean_token_accuracy": 0.6940477907657623,
"num_tokens": 7144654.0,
"step": 2810
},
{
"epoch": 1.5426695842450766,
"grad_norm": 9.875,
"learning_rate": 4.8595915390226115e-06,
"loss": 0.9971,
"mean_token_accuracy": 0.706735360622406,
"num_tokens": 7170258.0,
"step": 2820
},
{
"epoch": 1.5481400437636763,
"grad_norm": 9.75,
"learning_rate": 4.841356673960613e-06,
"loss": 1.0352,
"mean_token_accuracy": 0.6959325015544892,
"num_tokens": 7195794.0,
"step": 2830
},
{
"epoch": 1.5536105032822758,
"grad_norm": 9.875,
"learning_rate": 4.823121808898615e-06,
"loss": 1.015,
"mean_token_accuracy": 0.7041930437088013,
"num_tokens": 7221353.0,
"step": 2840
},
{
"epoch": 1.5590809628008753,
"grad_norm": 9.5625,
"learning_rate": 4.804886943836615e-06,
"loss": 0.9925,
"mean_token_accuracy": 0.7065845012664795,
"num_tokens": 7246656.0,
"step": 2850
},
{
"epoch": 1.5645514223194747,
"grad_norm": 10.0625,
"learning_rate": 4.786652078774617e-06,
"loss": 1.0202,
"mean_token_accuracy": 0.7003905832767486,
"num_tokens": 7271876.0,
"step": 2860
},
{
"epoch": 1.5700218818380745,
"grad_norm": 10.0,
"learning_rate": 4.768417213712619e-06,
"loss": 1.0114,
"mean_token_accuracy": 0.7012807905673981,
"num_tokens": 7296834.0,
"step": 2870
},
{
"epoch": 1.575492341356674,
"grad_norm": 9.875,
"learning_rate": 4.75018234865062e-06,
"loss": 1.0231,
"mean_token_accuracy": 0.7027694880962372,
"num_tokens": 7321984.0,
"step": 2880
},
{
"epoch": 1.5809628008752736,
"grad_norm": 10.0625,
"learning_rate": 4.731947483588622e-06,
"loss": 1.0165,
"mean_token_accuracy": 0.7003744542598724,
"num_tokens": 7347322.0,
"step": 2890
},
{
"epoch": 1.5864332603938731,
"grad_norm": 9.9375,
"learning_rate": 4.713712618526623e-06,
"loss": 1.0458,
"mean_token_accuracy": 0.6967971920967102,
"num_tokens": 7372753.0,
"step": 2900
},
{
"epoch": 1.5919037199124726,
"grad_norm": 10.0625,
"learning_rate": 4.695477753464625e-06,
"loss": 1.0454,
"mean_token_accuracy": 0.6949516236782074,
"num_tokens": 7398016.0,
"step": 2910
},
{
"epoch": 1.597374179431072,
"grad_norm": 10.0625,
"learning_rate": 4.677242888402626e-06,
"loss": 1.0547,
"mean_token_accuracy": 0.6950974285602569,
"num_tokens": 7423781.0,
"step": 2920
},
{
"epoch": 1.6028446389496718,
"grad_norm": 9.375,
"learning_rate": 4.659008023340627e-06,
"loss": 1.0073,
"mean_token_accuracy": 0.7060118734836578,
"num_tokens": 7449042.0,
"step": 2930
},
{
"epoch": 1.6083150984682715,
"grad_norm": 9.5625,
"learning_rate": 4.640773158278629e-06,
"loss": 1.0522,
"mean_token_accuracy": 0.6925396621227264,
"num_tokens": 7474666.0,
"step": 2940
},
{
"epoch": 1.613785557986871,
"grad_norm": 10.625,
"learning_rate": 4.6225382932166305e-06,
"loss": 1.0214,
"mean_token_accuracy": 0.698899906873703,
"num_tokens": 7500009.0,
"step": 2950
},
{
"epoch": 1.6192560175054704,
"grad_norm": 10.1875,
"learning_rate": 4.604303428154632e-06,
"loss": 1.002,
"mean_token_accuracy": 0.7080122113227845,
"num_tokens": 7524900.0,
"step": 2960
},
{
"epoch": 1.62472647702407,
"grad_norm": 9.875,
"learning_rate": 4.586068563092634e-06,
"loss": 1.0137,
"mean_token_accuracy": 0.7044197976589203,
"num_tokens": 7549885.0,
"step": 2970
},
{
"epoch": 1.6301969365426696,
"grad_norm": 10.0625,
"learning_rate": 4.567833698030634e-06,
"loss": 1.0224,
"mean_token_accuracy": 0.702244508266449,
"num_tokens": 7575873.0,
"step": 2980
},
{
"epoch": 1.635667396061269,
"grad_norm": 10.0,
"learning_rate": 4.549598832968636e-06,
"loss": 1.0231,
"mean_token_accuracy": 0.7018688678741455,
"num_tokens": 7601140.0,
"step": 2990
},
{
"epoch": 1.6411378555798688,
"grad_norm": 10.375,
"learning_rate": 4.531363967906638e-06,
"loss": 1.0168,
"mean_token_accuracy": 0.7000043153762817,
"num_tokens": 7626744.0,
"step": 3000
},
{
"epoch": 1.6466083150984683,
"grad_norm": 10.125,
"learning_rate": 4.513129102844639e-06,
"loss": 1.0382,
"mean_token_accuracy": 0.6963400661945343,
"num_tokens": 7652205.0,
"step": 3010
},
{
"epoch": 1.6520787746170678,
"grad_norm": 9.875,
"learning_rate": 4.494894237782641e-06,
"loss": 1.0311,
"mean_token_accuracy": 0.6954720914363861,
"num_tokens": 7677302.0,
"step": 3020
},
{
"epoch": 1.6575492341356672,
"grad_norm": 9.6875,
"learning_rate": 4.476659372720642e-06,
"loss": 1.0238,
"mean_token_accuracy": 0.7008414506912232,
"num_tokens": 7702641.0,
"step": 3030
},
{
"epoch": 1.663019693654267,
"grad_norm": 10.6875,
"learning_rate": 4.458424507658644e-06,
"loss": 1.0455,
"mean_token_accuracy": 0.6952131390571594,
"num_tokens": 7728092.0,
"step": 3040
},
{
"epoch": 1.6684901531728666,
"grad_norm": 10.5625,
"learning_rate": 4.440189642596645e-06,
"loss": 1.0563,
"mean_token_accuracy": 0.6946309566497803,
"num_tokens": 7753418.0,
"step": 3050
},
{
"epoch": 1.6739606126914661,
"grad_norm": 9.875,
"learning_rate": 4.421954777534646e-06,
"loss": 1.0137,
"mean_token_accuracy": 0.7010623216629028,
"num_tokens": 7778914.0,
"step": 3060
},
{
"epoch": 1.6794310722100656,
"grad_norm": 9.875,
"learning_rate": 4.403719912472648e-06,
"loss": 1.0461,
"mean_token_accuracy": 0.694762361049652,
"num_tokens": 7803962.0,
"step": 3070
},
{
"epoch": 1.684901531728665,
"grad_norm": 10.0625,
"learning_rate": 4.3854850474106495e-06,
"loss": 1.0144,
"mean_token_accuracy": 0.6980993211269378,
"num_tokens": 7829041.0,
"step": 3080
},
{
"epoch": 1.6903719912472648,
"grad_norm": 9.625,
"learning_rate": 4.367250182348651e-06,
"loss": 1.0323,
"mean_token_accuracy": 0.6983445227146149,
"num_tokens": 7854555.0,
"step": 3090
},
{
"epoch": 1.6958424507658645,
"grad_norm": 9.75,
"learning_rate": 4.349015317286653e-06,
"loss": 1.0601,
"mean_token_accuracy": 0.6918485045433045,
"num_tokens": 7880371.0,
"step": 3100
},
{
"epoch": 1.701312910284464,
"grad_norm": 9.625,
"learning_rate": 4.330780452224654e-06,
"loss": 1.0337,
"mean_token_accuracy": 0.6994279623031616,
"num_tokens": 7905824.0,
"step": 3110
},
{
"epoch": 1.7067833698030634,
"grad_norm": 9.9375,
"learning_rate": 4.312545587162655e-06,
"loss": 1.0253,
"mean_token_accuracy": 0.6975195348262787,
"num_tokens": 7931361.0,
"step": 3120
},
{
"epoch": 1.712253829321663,
"grad_norm": 10.625,
"learning_rate": 4.2943107221006566e-06,
"loss": 1.0498,
"mean_token_accuracy": 0.6924622654914856,
"num_tokens": 7956719.0,
"step": 3130
},
{
"epoch": 1.7177242888402626,
"grad_norm": 10.25,
"learning_rate": 4.276075857038658e-06,
"loss": 1.0236,
"mean_token_accuracy": 0.7029446244239808,
"num_tokens": 7982503.0,
"step": 3140
},
{
"epoch": 1.723194748358862,
"grad_norm": 10.0625,
"learning_rate": 4.25784099197666e-06,
"loss": 1.0426,
"mean_token_accuracy": 0.6945299327373504,
"num_tokens": 8008037.0,
"step": 3150
},
{
"epoch": 1.7286652078774618,
"grad_norm": 10.125,
"learning_rate": 4.239606126914661e-06,
"loss": 1.0232,
"mean_token_accuracy": 0.6983104586601258,
"num_tokens": 8033255.0,
"step": 3160
},
{
"epoch": 1.7341356673960613,
"grad_norm": 10.25,
"learning_rate": 4.221371261852663e-06,
"loss": 1.0267,
"mean_token_accuracy": 0.7013046503067016,
"num_tokens": 8058442.0,
"step": 3170
},
{
"epoch": 1.7396061269146608,
"grad_norm": 10.25,
"learning_rate": 4.203136396790664e-06,
"loss": 1.0473,
"mean_token_accuracy": 0.6944118142127991,
"num_tokens": 8084197.0,
"step": 3180
},
{
"epoch": 1.7450765864332602,
"grad_norm": 9.9375,
"learning_rate": 4.184901531728665e-06,
"loss": 1.0403,
"mean_token_accuracy": 0.6959960579872131,
"num_tokens": 8109669.0,
"step": 3190
},
{
"epoch": 1.75054704595186,
"grad_norm": 9.5625,
"learning_rate": 4.166666666666667e-06,
"loss": 1.0224,
"mean_token_accuracy": 0.7018200099468231,
"num_tokens": 8135074.0,
"step": 3200
},
{
"epoch": 1.7560175054704596,
"grad_norm": 9.8125,
"learning_rate": 4.1484318016046685e-06,
"loss": 1.0563,
"mean_token_accuracy": 0.6930766403675079,
"num_tokens": 8160391.0,
"step": 3210
},
{
"epoch": 1.7614879649890591,
"grad_norm": 10.0625,
"learning_rate": 4.13019693654267e-06,
"loss": 1.029,
"mean_token_accuracy": 0.6969853162765502,
"num_tokens": 8185889.0,
"step": 3220
},
{
"epoch": 1.7669584245076586,
"grad_norm": 10.3125,
"learning_rate": 4.111962071480672e-06,
"loss": 1.0038,
"mean_token_accuracy": 0.7046498537063599,
"num_tokens": 8211618.0,
"step": 3230
},
{
"epoch": 1.772428884026258,
"grad_norm": 10.25,
"learning_rate": 4.093727206418673e-06,
"loss": 1.0284,
"mean_token_accuracy": 0.7003251373767853,
"num_tokens": 8236773.0,
"step": 3240
},
{
"epoch": 1.7778993435448578,
"grad_norm": 10.5,
"learning_rate": 4.075492341356674e-06,
"loss": 1.0244,
"mean_token_accuracy": 0.6951791286468506,
"num_tokens": 8262043.0,
"step": 3250
},
{
"epoch": 1.7833698030634575,
"grad_norm": 10.25,
"learning_rate": 4.0572574762946756e-06,
"loss": 1.0271,
"mean_token_accuracy": 0.6958181917667389,
"num_tokens": 8288205.0,
"step": 3260
},
{
"epoch": 1.788840262582057,
"grad_norm": 10.375,
"learning_rate": 4.039022611232677e-06,
"loss": 1.0131,
"mean_token_accuracy": 0.7018442392349243,
"num_tokens": 8313349.0,
"step": 3270
},
{
"epoch": 1.7943107221006565,
"grad_norm": 10.375,
"learning_rate": 4.020787746170679e-06,
"loss": 1.0274,
"mean_token_accuracy": 0.7001440703868866,
"num_tokens": 8338997.0,
"step": 3280
},
{
"epoch": 1.799781181619256,
"grad_norm": 9.9375,
"learning_rate": 4.00255288110868e-06,
"loss": 1.0354,
"mean_token_accuracy": 0.6997708559036255,
"num_tokens": 8364446.0,
"step": 3290
},
{
"epoch": 1.8052516411378556,
"grad_norm": 10.75,
"learning_rate": 3.984318016046682e-06,
"loss": 1.0228,
"mean_token_accuracy": 0.7024441123008728,
"num_tokens": 8390005.0,
"step": 3300
},
{
"epoch": 1.8107221006564551,
"grad_norm": 10.0625,
"learning_rate": 3.9660831509846835e-06,
"loss": 1.0369,
"mean_token_accuracy": 0.6996273756027221,
"num_tokens": 8415557.0,
"step": 3310
},
{
"epoch": 1.8161925601750548,
"grad_norm": 9.6875,
"learning_rate": 3.947848285922684e-06,
"loss": 1.0282,
"mean_token_accuracy": 0.6991647839546203,
"num_tokens": 8441067.0,
"step": 3320
},
{
"epoch": 1.8216630196936543,
"grad_norm": 10.0625,
"learning_rate": 3.929613420860686e-06,
"loss": 1.0441,
"mean_token_accuracy": 0.6979943752288819,
"num_tokens": 8466748.0,
"step": 3330
},
{
"epoch": 1.8271334792122538,
"grad_norm": 10.8125,
"learning_rate": 3.9113785557986875e-06,
"loss": 1.0425,
"mean_token_accuracy": 0.6951029121875762,
"num_tokens": 8491752.0,
"step": 3340
},
{
"epoch": 1.8326039387308533,
"grad_norm": 10.0625,
"learning_rate": 3.893143690736689e-06,
"loss": 1.0116,
"mean_token_accuracy": 0.7038478553295135,
"num_tokens": 8517225.0,
"step": 3350
},
{
"epoch": 1.838074398249453,
"grad_norm": 9.4375,
"learning_rate": 3.874908825674691e-06,
"loss": 1.0348,
"mean_token_accuracy": 0.6978711068630219,
"num_tokens": 8542666.0,
"step": 3360
},
{
"epoch": 1.8435448577680527,
"grad_norm": 9.875,
"learning_rate": 3.856673960612692e-06,
"loss": 1.0388,
"mean_token_accuracy": 0.6979636132717133,
"num_tokens": 8568498.0,
"step": 3370
},
{
"epoch": 1.8490153172866521,
"grad_norm": 9.625,
"learning_rate": 3.838439095550693e-06,
"loss": 1.0077,
"mean_token_accuracy": 0.7020620405673981,
"num_tokens": 8593770.0,
"step": 3380
},
{
"epoch": 1.8544857768052516,
"grad_norm": 10.625,
"learning_rate": 3.8202042304886946e-06,
"loss": 1.0382,
"mean_token_accuracy": 0.696090292930603,
"num_tokens": 8619623.0,
"step": 3390
},
{
"epoch": 1.859956236323851,
"grad_norm": 11.4375,
"learning_rate": 3.8019693654266957e-06,
"loss": 1.0215,
"mean_token_accuracy": 0.7019686102867126,
"num_tokens": 8644852.0,
"step": 3400
},
{
"epoch": 1.8654266958424508,
"grad_norm": 10.75,
"learning_rate": 3.7837345003646973e-06,
"loss": 1.0507,
"mean_token_accuracy": 0.695297920703888,
"num_tokens": 8670333.0,
"step": 3410
},
{
"epoch": 1.8708971553610503,
"grad_norm": 9.8125,
"learning_rate": 3.765499635302699e-06,
"loss": 1.0505,
"mean_token_accuracy": 0.6986589133739471,
"num_tokens": 8695967.0,
"step": 3420
},
{
"epoch": 1.87636761487965,
"grad_norm": 10.625,
"learning_rate": 3.7472647702407005e-06,
"loss": 1.0098,
"mean_token_accuracy": 0.7004635334014893,
"num_tokens": 8721252.0,
"step": 3430
},
{
"epoch": 1.8818380743982495,
"grad_norm": 10.1875,
"learning_rate": 3.729029905178702e-06,
"loss": 1.0037,
"mean_token_accuracy": 0.7041628360748291,
"num_tokens": 8746944.0,
"step": 3440
},
{
"epoch": 1.887308533916849,
"grad_norm": 10.4375,
"learning_rate": 3.7107950401167037e-06,
"loss": 1.0375,
"mean_token_accuracy": 0.6980779647827149,
"num_tokens": 8771892.0,
"step": 3450
},
{
"epoch": 1.8927789934354484,
"grad_norm": 10.0625,
"learning_rate": 3.692560175054705e-06,
"loss": 0.9951,
"mean_token_accuracy": 0.7054601550102234,
"num_tokens": 8797131.0,
"step": 3460
},
{
"epoch": 1.8982494529540481,
"grad_norm": 10.5,
"learning_rate": 3.674325309992706e-06,
"loss": 0.9912,
"mean_token_accuracy": 0.7073158025741577,
"num_tokens": 8822466.0,
"step": 3470
},
{
"epoch": 1.9037199124726478,
"grad_norm": 9.4375,
"learning_rate": 3.6560904449307076e-06,
"loss": 1.0059,
"mean_token_accuracy": 0.7047175526618957,
"num_tokens": 8847987.0,
"step": 3480
},
{
"epoch": 1.9091903719912473,
"grad_norm": 10.3125,
"learning_rate": 3.637855579868709e-06,
"loss": 1.0167,
"mean_token_accuracy": 0.7027063131332397,
"num_tokens": 8873940.0,
"step": 3490
},
{
"epoch": 1.9146608315098468,
"grad_norm": 9.8125,
"learning_rate": 3.619620714806711e-06,
"loss": 1.0089,
"mean_token_accuracy": 0.7014587640762329,
"num_tokens": 8899633.0,
"step": 3500
},
{
"epoch": 1.9201312910284463,
"grad_norm": 9.5,
"learning_rate": 3.6013858497447124e-06,
"loss": 1.0017,
"mean_token_accuracy": 0.7041820049285888,
"num_tokens": 8924575.0,
"step": 3510
},
{
"epoch": 1.925601750547046,
"grad_norm": 10.375,
"learning_rate": 3.5831509846827136e-06,
"loss": 1.0165,
"mean_token_accuracy": 0.6998761057853699,
"num_tokens": 8950150.0,
"step": 3520
},
{
"epoch": 1.9310722100656457,
"grad_norm": 10.5,
"learning_rate": 3.5649161196207147e-06,
"loss": 0.9949,
"mean_token_accuracy": 0.7060512363910675,
"num_tokens": 8975532.0,
"step": 3530
},
{
"epoch": 1.9365426695842451,
"grad_norm": 10.0625,
"learning_rate": 3.5466812545587163e-06,
"loss": 1.026,
"mean_token_accuracy": 0.7015553712844849,
"num_tokens": 9000990.0,
"step": 3540
},
{
"epoch": 1.9420131291028446,
"grad_norm": 9.8125,
"learning_rate": 3.528446389496718e-06,
"loss": 1.0334,
"mean_token_accuracy": 0.7001884341239929,
"num_tokens": 9026718.0,
"step": 3550
},
{
"epoch": 1.947483588621444,
"grad_norm": 10.1875,
"learning_rate": 3.5102115244347195e-06,
"loss": 1.0248,
"mean_token_accuracy": 0.6993842363357544,
"num_tokens": 9052433.0,
"step": 3560
},
{
"epoch": 1.9529540481400438,
"grad_norm": 9.875,
"learning_rate": 3.491976659372721e-06,
"loss": 1.0273,
"mean_token_accuracy": 0.7003478467464447,
"num_tokens": 9077782.0,
"step": 3570
},
{
"epoch": 1.9584245076586433,
"grad_norm": 10.5,
"learning_rate": 3.4737417943107223e-06,
"loss": 1.0047,
"mean_token_accuracy": 0.7034225165843964,
"num_tokens": 9103349.0,
"step": 3580
},
{
"epoch": 1.963894967177243,
"grad_norm": 10.0,
"learning_rate": 3.455506929248724e-06,
"loss": 1.047,
"mean_token_accuracy": 0.693518990278244,
"num_tokens": 9129033.0,
"step": 3590
},
{
"epoch": 1.9693654266958425,
"grad_norm": 10.25,
"learning_rate": 3.437272064186725e-06,
"loss": 1.042,
"mean_token_accuracy": 0.698636132478714,
"num_tokens": 9154512.0,
"step": 3600
},
{
"epoch": 1.974835886214442,
"grad_norm": 10.1875,
"learning_rate": 3.4190371991247266e-06,
"loss": 1.0232,
"mean_token_accuracy": 0.7020216822624207,
"num_tokens": 9179746.0,
"step": 3610
},
{
"epoch": 1.9803063457330414,
"grad_norm": 10.5625,
"learning_rate": 3.400802334062728e-06,
"loss": 1.0277,
"mean_token_accuracy": 0.6972903072834015,
"num_tokens": 9205381.0,
"step": 3620
},
{
"epoch": 1.9857768052516411,
"grad_norm": 11.0625,
"learning_rate": 3.38256746900073e-06,
"loss": 1.0172,
"mean_token_accuracy": 0.7026154100894928,
"num_tokens": 9230261.0,
"step": 3630
},
{
"epoch": 1.9912472647702408,
"grad_norm": 10.1875,
"learning_rate": 3.3643326039387314e-06,
"loss": 1.01,
"mean_token_accuracy": 0.7005713403224945,
"num_tokens": 9255736.0,
"step": 3640
},
{
"epoch": 1.9967177242888403,
"grad_norm": 9.8125,
"learning_rate": 3.3460977388767325e-06,
"loss": 1.0041,
"mean_token_accuracy": 0.7037198424339295,
"num_tokens": 9281129.0,
"step": 3650
},
{
"epoch": 2.00218818380744,
"grad_norm": 9.875,
"learning_rate": 3.327862873814734e-06,
"loss": 1.0031,
"mean_token_accuracy": 0.7007030785083771,
"num_tokens": 9306380.0,
"step": 3660
},
{
"epoch": 2.0076586433260393,
"grad_norm": 9.875,
"learning_rate": 3.3096280087527353e-06,
"loss": 0.9473,
"mean_token_accuracy": 0.717999279499054,
"num_tokens": 9331366.0,
"step": 3670
},
{
"epoch": 2.0131291028446388,
"grad_norm": 9.625,
"learning_rate": 3.291393143690737e-06,
"loss": 0.9626,
"mean_token_accuracy": 0.717300283908844,
"num_tokens": 9356901.0,
"step": 3680
},
{
"epoch": 2.0185995623632387,
"grad_norm": 10.3125,
"learning_rate": 3.2731582786287385e-06,
"loss": 0.9602,
"mean_token_accuracy": 0.7138874650001525,
"num_tokens": 9382234.0,
"step": 3690
},
{
"epoch": 2.024070021881838,
"grad_norm": 9.75,
"learning_rate": 3.25492341356674e-06,
"loss": 0.9654,
"mean_token_accuracy": 0.712285041809082,
"num_tokens": 9407437.0,
"step": 3700
},
{
"epoch": 2.0295404814004376,
"grad_norm": 10.0625,
"learning_rate": 3.2366885485047412e-06,
"loss": 0.9342,
"mean_token_accuracy": 0.7209572613239288,
"num_tokens": 9432454.0,
"step": 3710
},
{
"epoch": 2.035010940919037,
"grad_norm": 10.75,
"learning_rate": 3.218453683442743e-06,
"loss": 0.9631,
"mean_token_accuracy": 0.7131079971790314,
"num_tokens": 9457857.0,
"step": 3720
},
{
"epoch": 2.0404814004376366,
"grad_norm": 10.3125,
"learning_rate": 3.200218818380744e-06,
"loss": 0.9577,
"mean_token_accuracy": 0.7177065372467041,
"num_tokens": 9482811.0,
"step": 3730
},
{
"epoch": 2.0459518599562365,
"grad_norm": 10.9375,
"learning_rate": 3.1819839533187456e-06,
"loss": 0.9614,
"mean_token_accuracy": 0.7131213903427124,
"num_tokens": 9508186.0,
"step": 3740
},
{
"epoch": 2.051422319474836,
"grad_norm": 10.75,
"learning_rate": 3.163749088256747e-06,
"loss": 0.9639,
"mean_token_accuracy": 0.7165208160877228,
"num_tokens": 9533509.0,
"step": 3750
},
{
"epoch": 2.0568927789934355,
"grad_norm": 10.0625,
"learning_rate": 3.1455142231947488e-06,
"loss": 0.9535,
"mean_token_accuracy": 0.7169022679328918,
"num_tokens": 9558600.0,
"step": 3760
},
{
"epoch": 2.062363238512035,
"grad_norm": 10.5625,
"learning_rate": 3.12727935813275e-06,
"loss": 0.9646,
"mean_token_accuracy": 0.7135210871696472,
"num_tokens": 9584233.0,
"step": 3770
},
{
"epoch": 2.0678336980306344,
"grad_norm": 10.25,
"learning_rate": 3.1090444930707515e-06,
"loss": 0.9786,
"mean_token_accuracy": 0.709892874956131,
"num_tokens": 9609789.0,
"step": 3780
},
{
"epoch": 2.0733041575492344,
"grad_norm": 10.375,
"learning_rate": 3.090809628008753e-06,
"loss": 0.9738,
"mean_token_accuracy": 0.7121241211891174,
"num_tokens": 9634822.0,
"step": 3790
},
{
"epoch": 2.078774617067834,
"grad_norm": 11.25,
"learning_rate": 3.0725747629467543e-06,
"loss": 0.9616,
"mean_token_accuracy": 0.7132679998874665,
"num_tokens": 9660314.0,
"step": 3800
},
{
"epoch": 2.0842450765864333,
"grad_norm": 10.625,
"learning_rate": 3.054339897884756e-06,
"loss": 0.9579,
"mean_token_accuracy": 0.7166781783103943,
"num_tokens": 9685937.0,
"step": 3810
},
{
"epoch": 2.089715536105033,
"grad_norm": 10.8125,
"learning_rate": 3.0361050328227575e-06,
"loss": 0.9741,
"mean_token_accuracy": 0.714539396762848,
"num_tokens": 9711177.0,
"step": 3820
},
{
"epoch": 2.0951859956236323,
"grad_norm": 11.125,
"learning_rate": 3.0178701677607587e-06,
"loss": 0.9463,
"mean_token_accuracy": 0.7216998755931854,
"num_tokens": 9736426.0,
"step": 3830
},
{
"epoch": 2.1006564551422318,
"grad_norm": 10.25,
"learning_rate": 2.9996353026987602e-06,
"loss": 0.9648,
"mean_token_accuracy": 0.7115364730358124,
"num_tokens": 9762127.0,
"step": 3840
},
{
"epoch": 2.1061269146608317,
"grad_norm": 10.75,
"learning_rate": 2.981400437636762e-06,
"loss": 0.9475,
"mean_token_accuracy": 0.7189037084579468,
"num_tokens": 9787178.0,
"step": 3850
},
{
"epoch": 2.111597374179431,
"grad_norm": 10.5,
"learning_rate": 2.9631655725747634e-06,
"loss": 0.9537,
"mean_token_accuracy": 0.7175340712070465,
"num_tokens": 9812717.0,
"step": 3860
},
{
"epoch": 2.1170678336980306,
"grad_norm": 10.1875,
"learning_rate": 2.9449307075127646e-06,
"loss": 0.9472,
"mean_token_accuracy": 0.7212161302566529,
"num_tokens": 9837881.0,
"step": 3870
},
{
"epoch": 2.12253829321663,
"grad_norm": 10.3125,
"learning_rate": 2.926695842450766e-06,
"loss": 0.9608,
"mean_token_accuracy": 0.7193179249763488,
"num_tokens": 9863317.0,
"step": 3880
},
{
"epoch": 2.1280087527352296,
"grad_norm": 10.5625,
"learning_rate": 2.9084609773887674e-06,
"loss": 0.951,
"mean_token_accuracy": 0.7160651385784149,
"num_tokens": 9888868.0,
"step": 3890
},
{
"epoch": 2.1334792122538295,
"grad_norm": 10.25,
"learning_rate": 2.890226112326769e-06,
"loss": 0.9581,
"mean_token_accuracy": 0.716667366027832,
"num_tokens": 9914219.0,
"step": 3900
},
{
"epoch": 2.138949671772429,
"grad_norm": 10.1875,
"learning_rate": 2.8719912472647705e-06,
"loss": 0.9637,
"mean_token_accuracy": 0.7133105576038361,
"num_tokens": 9939476.0,
"step": 3910
},
{
"epoch": 2.1444201312910285,
"grad_norm": 10.3125,
"learning_rate": 2.853756382202772e-06,
"loss": 0.9541,
"mean_token_accuracy": 0.7155322432518005,
"num_tokens": 9964926.0,
"step": 3920
},
{
"epoch": 2.149890590809628,
"grad_norm": 10.4375,
"learning_rate": 2.8355215171407733e-06,
"loss": 0.9801,
"mean_token_accuracy": 0.7113901436328888,
"num_tokens": 9990537.0,
"step": 3930
},
{
"epoch": 2.1553610503282274,
"grad_norm": 10.3125,
"learning_rate": 2.817286652078775e-06,
"loss": 0.9581,
"mean_token_accuracy": 0.7181893765926362,
"num_tokens": 10016002.0,
"step": 3940
},
{
"epoch": 2.160831509846827,
"grad_norm": 10.4375,
"learning_rate": 2.799051787016776e-06,
"loss": 0.947,
"mean_token_accuracy": 0.7155714333057404,
"num_tokens": 10041372.0,
"step": 3950
},
{
"epoch": 2.166301969365427,
"grad_norm": 10.25,
"learning_rate": 2.7808169219547776e-06,
"loss": 0.9951,
"mean_token_accuracy": 0.705439954996109,
"num_tokens": 10066639.0,
"step": 3960
},
{
"epoch": 2.1717724288840263,
"grad_norm": 9.9375,
"learning_rate": 2.7625820568927792e-06,
"loss": 0.9621,
"mean_token_accuracy": 0.715647429227829,
"num_tokens": 10091974.0,
"step": 3970
},
{
"epoch": 2.177242888402626,
"grad_norm": 10.5,
"learning_rate": 2.744347191830781e-06,
"loss": 0.9731,
"mean_token_accuracy": 0.7148273229598999,
"num_tokens": 10117300.0,
"step": 3980
},
{
"epoch": 2.1827133479212253,
"grad_norm": 10.125,
"learning_rate": 2.7261123267687824e-06,
"loss": 0.9625,
"mean_token_accuracy": 0.7132592558860779,
"num_tokens": 10142825.0,
"step": 3990
},
{
"epoch": 2.1881838074398248,
"grad_norm": 10.5625,
"learning_rate": 2.7078774617067836e-06,
"loss": 0.9887,
"mean_token_accuracy": 0.7106162488460541,
"num_tokens": 10168114.0,
"step": 4000
},
{
"epoch": 2.1936542669584247,
"grad_norm": 9.9375,
"learning_rate": 2.6896425966447848e-06,
"loss": 0.9496,
"mean_token_accuracy": 0.721660703420639,
"num_tokens": 10193441.0,
"step": 4010
},
{
"epoch": 2.199124726477024,
"grad_norm": 10.375,
"learning_rate": 2.6714077315827863e-06,
"loss": 0.9785,
"mean_token_accuracy": 0.7132415533065796,
"num_tokens": 10219252.0,
"step": 4020
},
{
"epoch": 2.2045951859956237,
"grad_norm": 10.25,
"learning_rate": 2.653172866520788e-06,
"loss": 0.9639,
"mean_token_accuracy": 0.7168880999088287,
"num_tokens": 10244918.0,
"step": 4030
},
{
"epoch": 2.210065645514223,
"grad_norm": 10.1875,
"learning_rate": 2.6349380014587895e-06,
"loss": 0.9507,
"mean_token_accuracy": 0.7177555739879609,
"num_tokens": 10270446.0,
"step": 4040
},
{
"epoch": 2.2155361050328226,
"grad_norm": 10.4375,
"learning_rate": 2.616703136396791e-06,
"loss": 0.9464,
"mean_token_accuracy": 0.7184527516365051,
"num_tokens": 10295965.0,
"step": 4050
},
{
"epoch": 2.2210065645514225,
"grad_norm": 9.8125,
"learning_rate": 2.5984682713347927e-06,
"loss": 0.9547,
"mean_token_accuracy": 0.7186037957668304,
"num_tokens": 10321256.0,
"step": 4060
},
{
"epoch": 2.226477024070022,
"grad_norm": 10.8125,
"learning_rate": 2.580233406272794e-06,
"loss": 0.9507,
"mean_token_accuracy": 0.7183585882186889,
"num_tokens": 10346406.0,
"step": 4070
},
{
"epoch": 2.2319474835886215,
"grad_norm": 10.5,
"learning_rate": 2.561998541210795e-06,
"loss": 0.9862,
"mean_token_accuracy": 0.710406482219696,
"num_tokens": 10371870.0,
"step": 4080
},
{
"epoch": 2.237417943107221,
"grad_norm": 10.375,
"learning_rate": 2.5437636761487966e-06,
"loss": 0.9938,
"mean_token_accuracy": 0.7080509960651398,
"num_tokens": 10397552.0,
"step": 4090
},
{
"epoch": 2.2428884026258205,
"grad_norm": 10.375,
"learning_rate": 2.5255288110867982e-06,
"loss": 0.962,
"mean_token_accuracy": 0.7114235162734985,
"num_tokens": 10423310.0,
"step": 4100
},
{
"epoch": 2.24835886214442,
"grad_norm": 10.375,
"learning_rate": 2.5072939460248e-06,
"loss": 0.9784,
"mean_token_accuracy": 0.7133292317390442,
"num_tokens": 10448405.0,
"step": 4110
},
{
"epoch": 2.25382932166302,
"grad_norm": 9.8125,
"learning_rate": 2.489059080962801e-06,
"loss": 0.9837,
"mean_token_accuracy": 0.7114726364612579,
"num_tokens": 10474015.0,
"step": 4120
},
{
"epoch": 2.2592997811816193,
"grad_norm": 10.1875,
"learning_rate": 2.4708242159008026e-06,
"loss": 0.9765,
"mean_token_accuracy": 0.7124983072280884,
"num_tokens": 10500122.0,
"step": 4130
},
{
"epoch": 2.264770240700219,
"grad_norm": 11.0,
"learning_rate": 2.452589350838804e-06,
"loss": 1.0051,
"mean_token_accuracy": 0.7033932983875275,
"num_tokens": 10525569.0,
"step": 4140
},
{
"epoch": 2.2702407002188183,
"grad_norm": 11.1875,
"learning_rate": 2.4343544857768053e-06,
"loss": 0.9537,
"mean_token_accuracy": 0.7190083563327789,
"num_tokens": 10550809.0,
"step": 4150
},
{
"epoch": 2.2757111597374178,
"grad_norm": 11.0,
"learning_rate": 2.416119620714807e-06,
"loss": 0.9574,
"mean_token_accuracy": 0.7156289637088775,
"num_tokens": 10576391.0,
"step": 4160
},
{
"epoch": 2.2811816192560173,
"grad_norm": 10.3125,
"learning_rate": 2.3978847556528085e-06,
"loss": 0.9615,
"mean_token_accuracy": 0.7188243508338928,
"num_tokens": 10601261.0,
"step": 4170
},
{
"epoch": 2.286652078774617,
"grad_norm": 10.0625,
"learning_rate": 2.3796498905908097e-06,
"loss": 0.9545,
"mean_token_accuracy": 0.715437775850296,
"num_tokens": 10626305.0,
"step": 4180
},
{
"epoch": 2.2921225382932167,
"grad_norm": 11.0,
"learning_rate": 2.3614150255288113e-06,
"loss": 0.9931,
"mean_token_accuracy": 0.7077142417430877,
"num_tokens": 10652029.0,
"step": 4190
},
{
"epoch": 2.297592997811816,
"grad_norm": 10.25,
"learning_rate": 2.343180160466813e-06,
"loss": 0.976,
"mean_token_accuracy": 0.7103690207004547,
"num_tokens": 10677356.0,
"step": 4200
},
{
"epoch": 2.3030634573304156,
"grad_norm": 10.0625,
"learning_rate": 2.324945295404814e-06,
"loss": 0.9647,
"mean_token_accuracy": 0.7177474439144135,
"num_tokens": 10703431.0,
"step": 4210
},
{
"epoch": 2.308533916849015,
"grad_norm": 10.125,
"learning_rate": 2.3067104303428156e-06,
"loss": 0.9758,
"mean_token_accuracy": 0.7121629536151886,
"num_tokens": 10728992.0,
"step": 4220
},
{
"epoch": 2.314004376367615,
"grad_norm": 10.125,
"learning_rate": 2.2884755652808172e-06,
"loss": 0.934,
"mean_token_accuracy": 0.7213209450244904,
"num_tokens": 10754404.0,
"step": 4230
},
{
"epoch": 2.3194748358862145,
"grad_norm": 9.75,
"learning_rate": 2.270240700218819e-06,
"loss": 0.9461,
"mean_token_accuracy": 0.719011914730072,
"num_tokens": 10779944.0,
"step": 4240
},
{
"epoch": 2.324945295404814,
"grad_norm": 11.0625,
"learning_rate": 2.25200583515682e-06,
"loss": 0.9966,
"mean_token_accuracy": 0.7089413404464722,
"num_tokens": 10805085.0,
"step": 4250
},
{
"epoch": 2.3304157549234135,
"grad_norm": 10.375,
"learning_rate": 2.2337709700948216e-06,
"loss": 0.9707,
"mean_token_accuracy": 0.7119170904159546,
"num_tokens": 10830483.0,
"step": 4260
},
{
"epoch": 2.335886214442013,
"grad_norm": 10.5625,
"learning_rate": 2.215536105032823e-06,
"loss": 0.9641,
"mean_token_accuracy": 0.7150032758712769,
"num_tokens": 10856177.0,
"step": 4270
},
{
"epoch": 2.341356673960613,
"grad_norm": 10.625,
"learning_rate": 2.1973012399708243e-06,
"loss": 0.971,
"mean_token_accuracy": 0.7109645545482636,
"num_tokens": 10882042.0,
"step": 4280
},
{
"epoch": 2.3468271334792123,
"grad_norm": 10.1875,
"learning_rate": 2.179066374908826e-06,
"loss": 0.9215,
"mean_token_accuracy": 0.7236122965812684,
"num_tokens": 10907588.0,
"step": 4290
},
{
"epoch": 2.352297592997812,
"grad_norm": 11.0,
"learning_rate": 2.1608315098468275e-06,
"loss": 0.9629,
"mean_token_accuracy": 0.7140246748924255,
"num_tokens": 10932821.0,
"step": 4300
},
{
"epoch": 2.3577680525164113,
"grad_norm": 11.0,
"learning_rate": 2.1425966447848287e-06,
"loss": 0.9542,
"mean_token_accuracy": 0.712429267168045,
"num_tokens": 10958451.0,
"step": 4310
},
{
"epoch": 2.363238512035011,
"grad_norm": 10.25,
"learning_rate": 2.1243617797228303e-06,
"loss": 0.9602,
"mean_token_accuracy": 0.7146483659744263,
"num_tokens": 10983957.0,
"step": 4320
},
{
"epoch": 2.3687089715536107,
"grad_norm": 10.375,
"learning_rate": 2.106126914660832e-06,
"loss": 0.991,
"mean_token_accuracy": 0.7093435227870941,
"num_tokens": 11009581.0,
"step": 4330
},
{
"epoch": 2.37417943107221,
"grad_norm": 10.5625,
"learning_rate": 2.087892049598833e-06,
"loss": 0.9806,
"mean_token_accuracy": 0.7109127819538117,
"num_tokens": 11035199.0,
"step": 4340
},
{
"epoch": 2.3796498905908097,
"grad_norm": 10.0,
"learning_rate": 2.0696571845368346e-06,
"loss": 0.9605,
"mean_token_accuracy": 0.7149803459644317,
"num_tokens": 11061011.0,
"step": 4350
},
{
"epoch": 2.385120350109409,
"grad_norm": 10.8125,
"learning_rate": 2.0514223194748362e-06,
"loss": 0.9667,
"mean_token_accuracy": 0.7125830769538879,
"num_tokens": 11086545.0,
"step": 4360
},
{
"epoch": 2.3905908096280086,
"grad_norm": 10.25,
"learning_rate": 2.0331874544128374e-06,
"loss": 0.9716,
"mean_token_accuracy": 0.7115569293498993,
"num_tokens": 11111650.0,
"step": 4370
},
{
"epoch": 2.3960612691466086,
"grad_norm": 9.875,
"learning_rate": 2.014952589350839e-06,
"loss": 0.9835,
"mean_token_accuracy": 0.7110448122024536,
"num_tokens": 11136976.0,
"step": 4380
},
{
"epoch": 2.401531728665208,
"grad_norm": 10.5,
"learning_rate": 1.9967177242888406e-06,
"loss": 0.9696,
"mean_token_accuracy": 0.7116839528083801,
"num_tokens": 11162409.0,
"step": 4390
},
{
"epoch": 2.4070021881838075,
"grad_norm": 10.0625,
"learning_rate": 1.9784828592268417e-06,
"loss": 0.9518,
"mean_token_accuracy": 0.7186504125595092,
"num_tokens": 11187779.0,
"step": 4400
},
{
"epoch": 2.412472647702407,
"grad_norm": 10.3125,
"learning_rate": 1.9602479941648433e-06,
"loss": 0.9594,
"mean_token_accuracy": 0.7139133214950562,
"num_tokens": 11213494.0,
"step": 4410
},
{
"epoch": 2.4179431072210065,
"grad_norm": 10.8125,
"learning_rate": 1.942013129102845e-06,
"loss": 0.9563,
"mean_token_accuracy": 0.7193277597427368,
"num_tokens": 11238852.0,
"step": 4420
},
{
"epoch": 2.423413566739606,
"grad_norm": 9.8125,
"learning_rate": 1.923778264040846e-06,
"loss": 0.9477,
"mean_token_accuracy": 0.7188430905342102,
"num_tokens": 11264480.0,
"step": 4430
},
{
"epoch": 2.428884026258206,
"grad_norm": 10.625,
"learning_rate": 1.9055433989788477e-06,
"loss": 0.9579,
"mean_token_accuracy": 0.7149973511695862,
"num_tokens": 11290175.0,
"step": 4440
},
{
"epoch": 2.4343544857768054,
"grad_norm": 10.5625,
"learning_rate": 1.8873085339168493e-06,
"loss": 0.9772,
"mean_token_accuracy": 0.7096493124961853,
"num_tokens": 11315301.0,
"step": 4450
},
{
"epoch": 2.439824945295405,
"grad_norm": 10.125,
"learning_rate": 1.8690736688548504e-06,
"loss": 0.9463,
"mean_token_accuracy": 0.7188547492027283,
"num_tokens": 11340809.0,
"step": 4460
},
{
"epoch": 2.4452954048140043,
"grad_norm": 10.375,
"learning_rate": 1.850838803792852e-06,
"loss": 0.9753,
"mean_token_accuracy": 0.7100773870944976,
"num_tokens": 11366227.0,
"step": 4470
},
{
"epoch": 2.450765864332604,
"grad_norm": 10.8125,
"learning_rate": 1.8326039387308536e-06,
"loss": 0.9786,
"mean_token_accuracy": 0.7101668000221253,
"num_tokens": 11391908.0,
"step": 4480
},
{
"epoch": 2.4562363238512033,
"grad_norm": 10.5625,
"learning_rate": 1.814369073668855e-06,
"loss": 0.9722,
"mean_token_accuracy": 0.7132100880146026,
"num_tokens": 11417099.0,
"step": 4490
},
{
"epoch": 2.461706783369803,
"grad_norm": 10.25,
"learning_rate": 1.7961342086068564e-06,
"loss": 0.9672,
"mean_token_accuracy": 0.7163193345069885,
"num_tokens": 11442529.0,
"step": 4500
},
{
"epoch": 2.4671772428884027,
"grad_norm": 10.5,
"learning_rate": 1.777899343544858e-06,
"loss": 0.9741,
"mean_token_accuracy": 0.7144057810306549,
"num_tokens": 11467943.0,
"step": 4510
},
{
"epoch": 2.472647702407002,
"grad_norm": 10.125,
"learning_rate": 1.7596644784828594e-06,
"loss": 0.99,
"mean_token_accuracy": 0.7097407221794129,
"num_tokens": 11493655.0,
"step": 4520
},
{
"epoch": 2.4781181619256016,
"grad_norm": 9.5625,
"learning_rate": 1.7414296134208607e-06,
"loss": 0.9242,
"mean_token_accuracy": 0.7254151999950409,
"num_tokens": 11519074.0,
"step": 4530
},
{
"epoch": 2.483588621444201,
"grad_norm": 10.6875,
"learning_rate": 1.7231947483588623e-06,
"loss": 0.9623,
"mean_token_accuracy": 0.7154429078102111,
"num_tokens": 11544510.0,
"step": 4540
},
{
"epoch": 2.489059080962801,
"grad_norm": 10.625,
"learning_rate": 1.7049598832968637e-06,
"loss": 0.9405,
"mean_token_accuracy": 0.720922189950943,
"num_tokens": 11569975.0,
"step": 4550
},
{
"epoch": 2.4945295404814005,
"grad_norm": 10.375,
"learning_rate": 1.686725018234865e-06,
"loss": 0.9643,
"mean_token_accuracy": 0.7141575336456298,
"num_tokens": 11595508.0,
"step": 4560
},
{
"epoch": 2.5,
"grad_norm": 10.4375,
"learning_rate": 1.6684901531728667e-06,
"loss": 0.9722,
"mean_token_accuracy": 0.716586035490036,
"num_tokens": 11621057.0,
"step": 4570
},
{
"epoch": 2.5054704595185995,
"grad_norm": 9.875,
"learning_rate": 1.650255288110868e-06,
"loss": 0.9678,
"mean_token_accuracy": 0.7154919922351837,
"num_tokens": 11646277.0,
"step": 4580
},
{
"epoch": 2.510940919037199,
"grad_norm": 10.375,
"learning_rate": 1.6320204230488696e-06,
"loss": 0.9514,
"mean_token_accuracy": 0.7179181814193726,
"num_tokens": 11671463.0,
"step": 4590
},
{
"epoch": 2.516411378555799,
"grad_norm": 10.375,
"learning_rate": 1.613785557986871e-06,
"loss": 0.945,
"mean_token_accuracy": 0.7157581686973572,
"num_tokens": 11696766.0,
"step": 4600
},
{
"epoch": 2.5218818380743984,
"grad_norm": 9.875,
"learning_rate": 1.5955506929248724e-06,
"loss": 0.9507,
"mean_token_accuracy": 0.717986673116684,
"num_tokens": 11722577.0,
"step": 4610
},
{
"epoch": 2.527352297592998,
"grad_norm": 10.4375,
"learning_rate": 1.577315827862874e-06,
"loss": 0.9841,
"mean_token_accuracy": 0.7108976006507873,
"num_tokens": 11747611.0,
"step": 4620
},
{
"epoch": 2.5328227571115973,
"grad_norm": 10.5,
"learning_rate": 1.5590809628008754e-06,
"loss": 0.9691,
"mean_token_accuracy": 0.7137386500835419,
"num_tokens": 11773081.0,
"step": 4630
},
{
"epoch": 2.538293216630197,
"grad_norm": 10.6875,
"learning_rate": 1.5408460977388768e-06,
"loss": 0.9968,
"mean_token_accuracy": 0.7075820744037629,
"num_tokens": 11798361.0,
"step": 4640
},
{
"epoch": 2.5437636761487967,
"grad_norm": 10.0625,
"learning_rate": 1.5226112326768783e-06,
"loss": 0.9582,
"mean_token_accuracy": 0.7191313743591309,
"num_tokens": 11823896.0,
"step": 4650
},
{
"epoch": 2.549234135667396,
"grad_norm": 10.5625,
"learning_rate": 1.5043763676148797e-06,
"loss": 0.9983,
"mean_token_accuracy": 0.7068405508995056,
"num_tokens": 11849650.0,
"step": 4660
},
{
"epoch": 2.5547045951859957,
"grad_norm": 10.3125,
"learning_rate": 1.4861415025528813e-06,
"loss": 0.9546,
"mean_token_accuracy": 0.7164286077022552,
"num_tokens": 11874973.0,
"step": 4670
},
{
"epoch": 2.560175054704595,
"grad_norm": 10.5625,
"learning_rate": 1.4679066374908827e-06,
"loss": 0.9665,
"mean_token_accuracy": 0.7142378628253937,
"num_tokens": 11900214.0,
"step": 4680
},
{
"epoch": 2.5656455142231946,
"grad_norm": 10.5,
"learning_rate": 1.4496717724288843e-06,
"loss": 0.9344,
"mean_token_accuracy": 0.725408935546875,
"num_tokens": 11925985.0,
"step": 4690
},
{
"epoch": 2.5711159737417946,
"grad_norm": 10.5625,
"learning_rate": 1.4314369073668857e-06,
"loss": 0.9571,
"mean_token_accuracy": 0.7170994579792023,
"num_tokens": 11951080.0,
"step": 4700
},
{
"epoch": 2.5765864332603936,
"grad_norm": 10.25,
"learning_rate": 1.413202042304887e-06,
"loss": 0.9723,
"mean_token_accuracy": 0.7106690168380737,
"num_tokens": 11976761.0,
"step": 4710
},
{
"epoch": 2.5820568927789935,
"grad_norm": 11.125,
"learning_rate": 1.3949671772428886e-06,
"loss": 0.9664,
"mean_token_accuracy": 0.7155169665813446,
"num_tokens": 12002057.0,
"step": 4720
},
{
"epoch": 2.587527352297593,
"grad_norm": 10.8125,
"learning_rate": 1.37673231218089e-06,
"loss": 0.9859,
"mean_token_accuracy": 0.7098533868789673,
"num_tokens": 12027349.0,
"step": 4730
},
{
"epoch": 2.5929978118161925,
"grad_norm": 10.1875,
"learning_rate": 1.3584974471188914e-06,
"loss": 0.9681,
"mean_token_accuracy": 0.7169641613960266,
"num_tokens": 12053110.0,
"step": 4740
},
{
"epoch": 2.598468271334792,
"grad_norm": 11.1875,
"learning_rate": 1.340262582056893e-06,
"loss": 0.9769,
"mean_token_accuracy": 0.7144347429275513,
"num_tokens": 12078660.0,
"step": 4750
},
{
"epoch": 2.6039387308533914,
"grad_norm": 10.0,
"learning_rate": 1.3220277169948944e-06,
"loss": 0.9535,
"mean_token_accuracy": 0.717149305343628,
"num_tokens": 12104041.0,
"step": 4760
},
{
"epoch": 2.6094091903719914,
"grad_norm": 10.5,
"learning_rate": 1.3037928519328957e-06,
"loss": 0.9509,
"mean_token_accuracy": 0.7132203102111816,
"num_tokens": 12129327.0,
"step": 4770
},
{
"epoch": 2.614879649890591,
"grad_norm": 10.5,
"learning_rate": 1.2855579868708973e-06,
"loss": 0.9856,
"mean_token_accuracy": 0.7076945900917053,
"num_tokens": 12154514.0,
"step": 4780
},
{
"epoch": 2.6203501094091903,
"grad_norm": 10.4375,
"learning_rate": 1.267323121808899e-06,
"loss": 0.9708,
"mean_token_accuracy": 0.7127987205982208,
"num_tokens": 12180006.0,
"step": 4790
},
{
"epoch": 2.62582056892779,
"grad_norm": 10.5625,
"learning_rate": 1.2490882567469003e-06,
"loss": 0.9789,
"mean_token_accuracy": 0.7122283995151519,
"num_tokens": 12205688.0,
"step": 4800
},
{
"epoch": 2.6312910284463893,
"grad_norm": 10.125,
"learning_rate": 1.2308533916849017e-06,
"loss": 0.9624,
"mean_token_accuracy": 0.71814124584198,
"num_tokens": 12230947.0,
"step": 4810
},
{
"epoch": 2.636761487964989,
"grad_norm": 10.0625,
"learning_rate": 1.212618526622903e-06,
"loss": 0.9897,
"mean_token_accuracy": 0.7073604583740234,
"num_tokens": 12255955.0,
"step": 4820
},
{
"epoch": 2.6422319474835887,
"grad_norm": 10.1875,
"learning_rate": 1.1943836615609047e-06,
"loss": 0.9558,
"mean_token_accuracy": 0.7145933747291565,
"num_tokens": 12281300.0,
"step": 4830
},
{
"epoch": 2.647702407002188,
"grad_norm": 10.75,
"learning_rate": 1.176148796498906e-06,
"loss": 0.9583,
"mean_token_accuracy": 0.7154503166675568,
"num_tokens": 12306967.0,
"step": 4840
},
{
"epoch": 2.6531728665207877,
"grad_norm": 10.625,
"learning_rate": 1.1579139314369074e-06,
"loss": 0.9714,
"mean_token_accuracy": 0.7120263636112213,
"num_tokens": 12332653.0,
"step": 4850
},
{
"epoch": 2.658643326039387,
"grad_norm": 10.3125,
"learning_rate": 1.139679066374909e-06,
"loss": 0.9431,
"mean_token_accuracy": 0.7196206390857697,
"num_tokens": 12357964.0,
"step": 4860
},
{
"epoch": 2.664113785557987,
"grad_norm": 9.875,
"learning_rate": 1.1214442013129104e-06,
"loss": 0.9235,
"mean_token_accuracy": 0.72520210146904,
"num_tokens": 12383359.0,
"step": 4870
},
{
"epoch": 2.6695842450765865,
"grad_norm": 10.8125,
"learning_rate": 1.1032093362509118e-06,
"loss": 0.9541,
"mean_token_accuracy": 0.7171355724334717,
"num_tokens": 12408973.0,
"step": 4880
},
{
"epoch": 2.675054704595186,
"grad_norm": 9.875,
"learning_rate": 1.0849744711889134e-06,
"loss": 0.9588,
"mean_token_accuracy": 0.7152066648006439,
"num_tokens": 12434174.0,
"step": 4890
},
{
"epoch": 2.6805251641137855,
"grad_norm": 9.9375,
"learning_rate": 1.0667396061269147e-06,
"loss": 0.9417,
"mean_token_accuracy": 0.7199950873851776,
"num_tokens": 12459929.0,
"step": 4900
},
{
"epoch": 2.685995623632385,
"grad_norm": 10.3125,
"learning_rate": 1.0485047410649161e-06,
"loss": 0.9403,
"mean_token_accuracy": 0.7192999660968781,
"num_tokens": 12485603.0,
"step": 4910
},
{
"epoch": 2.691466083150985,
"grad_norm": 10.3125,
"learning_rate": 1.0302698760029177e-06,
"loss": 0.9588,
"mean_token_accuracy": 0.7183211863040924,
"num_tokens": 12510854.0,
"step": 4920
},
{
"epoch": 2.6969365426695844,
"grad_norm": 10.0625,
"learning_rate": 1.012035010940919e-06,
"loss": 0.9445,
"mean_token_accuracy": 0.7210539758205414,
"num_tokens": 12536418.0,
"step": 4930
},
{
"epoch": 2.702407002188184,
"grad_norm": 10.5625,
"learning_rate": 9.938001458789205e-07,
"loss": 0.9721,
"mean_token_accuracy": 0.713829755783081,
"num_tokens": 12562114.0,
"step": 4940
},
{
"epoch": 2.7078774617067833,
"grad_norm": 10.0,
"learning_rate": 9.75565280816922e-07,
"loss": 0.9548,
"mean_token_accuracy": 0.7159802138805389,
"num_tokens": 12587542.0,
"step": 4950
},
{
"epoch": 2.713347921225383,
"grad_norm": 10.75,
"learning_rate": 9.573304157549234e-07,
"loss": 0.9454,
"mean_token_accuracy": 0.7186602294445038,
"num_tokens": 12613047.0,
"step": 4960
},
{
"epoch": 2.7188183807439827,
"grad_norm": 11.3125,
"learning_rate": 9.39095550692925e-07,
"loss": 1.0026,
"mean_token_accuracy": 0.7088643789291382,
"num_tokens": 12638277.0,
"step": 4970
},
{
"epoch": 2.7242888402625822,
"grad_norm": 10.5,
"learning_rate": 9.208606856309264e-07,
"loss": 0.9739,
"mean_token_accuracy": 0.7109079241752625,
"num_tokens": 12663642.0,
"step": 4980
},
{
"epoch": 2.7297592997811817,
"grad_norm": 10.5,
"learning_rate": 9.026258205689278e-07,
"loss": 0.9636,
"mean_token_accuracy": 0.712633740901947,
"num_tokens": 12688780.0,
"step": 4990
},
{
"epoch": 2.735229759299781,
"grad_norm": 10.3125,
"learning_rate": 8.843909555069294e-07,
"loss": 0.9593,
"mean_token_accuracy": 0.7172781467437744,
"num_tokens": 12714020.0,
"step": 5000
},
{
"epoch": 2.7407002188183807,
"grad_norm": 10.625,
"learning_rate": 8.661560904449308e-07,
"loss": 0.9668,
"mean_token_accuracy": 0.7146118104457855,
"num_tokens": 12739229.0,
"step": 5010
},
{
"epoch": 2.74617067833698,
"grad_norm": 10.4375,
"learning_rate": 8.479212253829322e-07,
"loss": 0.9617,
"mean_token_accuracy": 0.7141262829303742,
"num_tokens": 12764906.0,
"step": 5020
},
{
"epoch": 2.7516411378555796,
"grad_norm": 11.0625,
"learning_rate": 8.296863603209337e-07,
"loss": 0.9534,
"mean_token_accuracy": 0.7181793093681336,
"num_tokens": 12790046.0,
"step": 5030
},
{
"epoch": 2.7571115973741795,
"grad_norm": 9.8125,
"learning_rate": 8.114514952589351e-07,
"loss": 0.9333,
"mean_token_accuracy": 0.7222773134708405,
"num_tokens": 12815713.0,
"step": 5040
},
{
"epoch": 2.762582056892779,
"grad_norm": 10.625,
"learning_rate": 7.932166301969366e-07,
"loss": 0.9581,
"mean_token_accuracy": 0.7161290049552917,
"num_tokens": 12841290.0,
"step": 5050
},
{
"epoch": 2.7680525164113785,
"grad_norm": 10.4375,
"learning_rate": 7.749817651349381e-07,
"loss": 0.9709,
"mean_token_accuracy": 0.7144385755062104,
"num_tokens": 12867299.0,
"step": 5060
},
{
"epoch": 2.773522975929978,
"grad_norm": 10.3125,
"learning_rate": 7.567469000729396e-07,
"loss": 0.9299,
"mean_token_accuracy": 0.7200077176094055,
"num_tokens": 12892111.0,
"step": 5070
},
{
"epoch": 2.7789934354485775,
"grad_norm": 10.375,
"learning_rate": 7.38512035010941e-07,
"loss": 1.0044,
"mean_token_accuracy": 0.706270956993103,
"num_tokens": 12917547.0,
"step": 5080
},
{
"epoch": 2.7844638949671774,
"grad_norm": 10.75,
"learning_rate": 7.202771699489424e-07,
"loss": 0.9561,
"mean_token_accuracy": 0.7158130586147309,
"num_tokens": 12942824.0,
"step": 5090
},
{
"epoch": 2.789934354485777,
"grad_norm": 11.25,
"learning_rate": 7.020423048869439e-07,
"loss": 0.9943,
"mean_token_accuracy": 0.7063575088977814,
"num_tokens": 12968051.0,
"step": 5100
},
{
"epoch": 2.7954048140043763,
"grad_norm": 10.0,
"learning_rate": 6.838074398249453e-07,
"loss": 0.9686,
"mean_token_accuracy": 0.7124229729175567,
"num_tokens": 12992995.0,
"step": 5110
},
{
"epoch": 2.800875273522976,
"grad_norm": 10.25,
"learning_rate": 6.655725747629469e-07,
"loss": 0.9806,
"mean_token_accuracy": 0.7110258340835571,
"num_tokens": 13018611.0,
"step": 5120
},
{
"epoch": 2.8063457330415753,
"grad_norm": 10.1875,
"learning_rate": 6.473377097009483e-07,
"loss": 0.9462,
"mean_token_accuracy": 0.7187185704708099,
"num_tokens": 13044142.0,
"step": 5130
},
{
"epoch": 2.8118161925601752,
"grad_norm": 10.1875,
"learning_rate": 6.291028446389497e-07,
"loss": 0.9522,
"mean_token_accuracy": 0.7164785265922546,
"num_tokens": 13069649.0,
"step": 5140
},
{
"epoch": 2.8172866520787747,
"grad_norm": 11.0,
"learning_rate": 6.108679795769512e-07,
"loss": 0.9323,
"mean_token_accuracy": 0.7243307530879974,
"num_tokens": 13094938.0,
"step": 5150
},
{
"epoch": 2.822757111597374,
"grad_norm": 10.375,
"learning_rate": 5.926331145149526e-07,
"loss": 0.9351,
"mean_token_accuracy": 0.7236999869346619,
"num_tokens": 13120177.0,
"step": 5160
},
{
"epoch": 2.8282275711159737,
"grad_norm": 10.375,
"learning_rate": 5.743982494529541e-07,
"loss": 1.0089,
"mean_token_accuracy": 0.7033572435379029,
"num_tokens": 13145900.0,
"step": 5170
},
{
"epoch": 2.833698030634573,
"grad_norm": 10.1875,
"learning_rate": 5.561633843909556e-07,
"loss": 0.9821,
"mean_token_accuracy": 0.7097233951091766,
"num_tokens": 13170900.0,
"step": 5180
},
{
"epoch": 2.839168490153173,
"grad_norm": 10.0625,
"learning_rate": 5.37928519328957e-07,
"loss": 0.9525,
"mean_token_accuracy": 0.7147578775882721,
"num_tokens": 13196497.0,
"step": 5190
},
{
"epoch": 2.8446389496717726,
"grad_norm": 10.5,
"learning_rate": 5.196936542669585e-07,
"loss": 0.9752,
"mean_token_accuracy": 0.7121313869953155,
"num_tokens": 13221946.0,
"step": 5200
},
{
"epoch": 2.850109409190372,
"grad_norm": 10.4375,
"learning_rate": 5.014587892049599e-07,
"loss": 0.9862,
"mean_token_accuracy": 0.707955265045166,
"num_tokens": 13247347.0,
"step": 5210
},
{
"epoch": 2.8555798687089715,
"grad_norm": 10.5625,
"learning_rate": 4.832239241429613e-07,
"loss": 0.976,
"mean_token_accuracy": 0.7143423020839691,
"num_tokens": 13273017.0,
"step": 5220
},
{
"epoch": 2.861050328227571,
"grad_norm": 10.5,
"learning_rate": 4.649890590809628e-07,
"loss": 0.9889,
"mean_token_accuracy": 0.7104144990444183,
"num_tokens": 13298377.0,
"step": 5230
},
{
"epoch": 2.866520787746171,
"grad_norm": 10.1875,
"learning_rate": 4.467541940189643e-07,
"loss": 0.9631,
"mean_token_accuracy": 0.7177930176258087,
"num_tokens": 13323779.0,
"step": 5240
},
{
"epoch": 2.8719912472647704,
"grad_norm": 10.375,
"learning_rate": 4.285193289569658e-07,
"loss": 0.9779,
"mean_token_accuracy": 0.7117682516574859,
"num_tokens": 13349327.0,
"step": 5250
},
{
"epoch": 2.87746170678337,
"grad_norm": 9.9375,
"learning_rate": 4.102844638949672e-07,
"loss": 0.9936,
"mean_token_accuracy": 0.7063680112361908,
"num_tokens": 13375032.0,
"step": 5260
},
{
"epoch": 2.8829321663019694,
"grad_norm": 10.25,
"learning_rate": 3.9204959883296864e-07,
"loss": 0.9747,
"mean_token_accuracy": 0.7139563202857971,
"num_tokens": 13400329.0,
"step": 5270
},
{
"epoch": 2.888402625820569,
"grad_norm": 10.625,
"learning_rate": 3.7381473377097013e-07,
"loss": 0.9668,
"mean_token_accuracy": 0.7155350327491761,
"num_tokens": 13426059.0,
"step": 5280
},
{
"epoch": 2.8938730853391688,
"grad_norm": 10.8125,
"learning_rate": 3.5557986870897156e-07,
"loss": 0.958,
"mean_token_accuracy": 0.7152288734912873,
"num_tokens": 13451832.0,
"step": 5290
},
{
"epoch": 2.899343544857768,
"grad_norm": 10.75,
"learning_rate": 3.3734500364697305e-07,
"loss": 0.9859,
"mean_token_accuracy": 0.7133601427078247,
"num_tokens": 13477401.0,
"step": 5300
},
{
"epoch": 2.9048140043763677,
"grad_norm": 10.5625,
"learning_rate": 3.1911013858497453e-07,
"loss": 0.9678,
"mean_token_accuracy": 0.7171884000301361,
"num_tokens": 13502705.0,
"step": 5310
},
{
"epoch": 2.910284463894967,
"grad_norm": 11.0625,
"learning_rate": 3.0087527352297597e-07,
"loss": 0.9621,
"mean_token_accuracy": 0.717541116476059,
"num_tokens": 13528062.0,
"step": 5320
},
{
"epoch": 2.9157549234135667,
"grad_norm": 10.3125,
"learning_rate": 2.826404084609774e-07,
"loss": 0.9622,
"mean_token_accuracy": 0.7154664099216461,
"num_tokens": 13553750.0,
"step": 5330
},
{
"epoch": 2.921225382932166,
"grad_norm": 10.4375,
"learning_rate": 2.644055433989789e-07,
"loss": 0.9516,
"mean_token_accuracy": 0.717375636100769,
"num_tokens": 13578936.0,
"step": 5340
},
{
"epoch": 2.9266958424507656,
"grad_norm": 11.0,
"learning_rate": 2.461706783369803e-07,
"loss": 0.9518,
"mean_token_accuracy": 0.7167813301086425,
"num_tokens": 13604064.0,
"step": 5350
},
{
"epoch": 2.9321663019693656,
"grad_norm": 10.75,
"learning_rate": 2.279358132749818e-07,
"loss": 0.9842,
"mean_token_accuracy": 0.7137976944446563,
"num_tokens": 13629601.0,
"step": 5360
},
{
"epoch": 2.937636761487965,
"grad_norm": 9.8125,
"learning_rate": 2.0970094821298323e-07,
"loss": 0.9598,
"mean_token_accuracy": 0.7161750555038452,
"num_tokens": 13655161.0,
"step": 5370
},
{
"epoch": 2.9431072210065645,
"grad_norm": 10.0,
"learning_rate": 1.914660831509847e-07,
"loss": 0.9538,
"mean_token_accuracy": 0.718994963169098,
"num_tokens": 13680024.0,
"step": 5380
},
{
"epoch": 2.948577680525164,
"grad_norm": 10.5,
"learning_rate": 1.7323121808898615e-07,
"loss": 0.962,
"mean_token_accuracy": 0.7169410228729248,
"num_tokens": 13705181.0,
"step": 5390
},
{
"epoch": 2.9540481400437635,
"grad_norm": 10.5,
"learning_rate": 1.549963530269876e-07,
"loss": 0.9574,
"mean_token_accuracy": 0.7161156296730041,
"num_tokens": 13730505.0,
"step": 5400
},
{
"epoch": 2.9595185995623634,
"grad_norm": 10.5625,
"learning_rate": 1.3676148796498907e-07,
"loss": 0.9383,
"mean_token_accuracy": 0.7220743417739868,
"num_tokens": 13755959.0,
"step": 5410
},
{
"epoch": 2.964989059080963,
"grad_norm": 10.375,
"learning_rate": 1.1852662290299053e-07,
"loss": 0.9787,
"mean_token_accuracy": 0.7099229753017425,
"num_tokens": 13781338.0,
"step": 5420
},
{
"epoch": 2.9704595185995624,
"grad_norm": 11.0,
"learning_rate": 1.0029175784099199e-07,
"loss": 0.9792,
"mean_token_accuracy": 0.7099815905094147,
"num_tokens": 13806784.0,
"step": 5430
},
{
"epoch": 2.975929978118162,
"grad_norm": 10.5625,
"learning_rate": 8.205689277899343e-08,
"loss": 0.9774,
"mean_token_accuracy": 0.7121619462966919,
"num_tokens": 13831942.0,
"step": 5440
},
{
"epoch": 2.9814004376367613,
"grad_norm": 10.5,
"learning_rate": 6.38220277169949e-08,
"loss": 0.9726,
"mean_token_accuracy": 0.7091731190681457,
"num_tokens": 13857296.0,
"step": 5450
},
{
"epoch": 2.9868708971553612,
"grad_norm": 11.0,
"learning_rate": 4.558716265499636e-08,
"loss": 0.9846,
"mean_token_accuracy": 0.7077378630638123,
"num_tokens": 13883120.0,
"step": 5460
},
{
"epoch": 2.9923413566739607,
"grad_norm": 10.25,
"learning_rate": 2.735229759299781e-08,
"loss": 0.9573,
"mean_token_accuracy": 0.7156158804893493,
"num_tokens": 13908659.0,
"step": 5470
},
{
"epoch": 2.99781181619256,
"grad_norm": 10.9375,
"learning_rate": 9.117432530999272e-09,
"loss": 0.9851,
"mean_token_accuracy": 0.7107800126075745,
"num_tokens": 13933848.0,
"step": 5480
}
],
"logging_steps": 10,
"max_steps": 5484,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.9139458472604058e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}