{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 5484, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005470459518599562, "grad_norm": 9.625, "learning_rate": 9.983588621444202e-06, "loss": 1.9919, "mean_token_accuracy": 0.5373850524425506, "num_tokens": 25832.0, "step": 10 }, { "epoch": 0.010940919037199124, "grad_norm": 8.625, "learning_rate": 9.965353756382203e-06, "loss": 1.6719, "mean_token_accuracy": 0.5806249260902405, "num_tokens": 51122.0, "step": 20 }, { "epoch": 0.016411378555798686, "grad_norm": 8.875, "learning_rate": 9.947118891320205e-06, "loss": 1.6045, "mean_token_accuracy": 0.5889442741870881, "num_tokens": 76631.0, "step": 30 }, { "epoch": 0.02188183807439825, "grad_norm": 9.4375, "learning_rate": 9.928884026258206e-06, "loss": 1.5537, "mean_token_accuracy": 0.6036407649517059, "num_tokens": 101784.0, "step": 40 }, { "epoch": 0.02735229759299781, "grad_norm": 9.0625, "learning_rate": 9.910649161196208e-06, "loss": 1.4784, "mean_token_accuracy": 0.6087825715541839, "num_tokens": 127369.0, "step": 50 }, { "epoch": 0.03282275711159737, "grad_norm": 9.0, "learning_rate": 9.89241429613421e-06, "loss": 1.4971, "mean_token_accuracy": 0.6069094657897949, "num_tokens": 152695.0, "step": 60 }, { "epoch": 0.038293216630196934, "grad_norm": 8.6875, "learning_rate": 9.874179431072211e-06, "loss": 1.4571, "mean_token_accuracy": 0.6164368271827698, "num_tokens": 178219.0, "step": 70 }, { "epoch": 0.0437636761487965, "grad_norm": 9.1875, "learning_rate": 9.855944566010213e-06, "loss": 1.4476, "mean_token_accuracy": 0.6144214510917664, "num_tokens": 203604.0, "step": 80 }, { "epoch": 0.04923413566739606, "grad_norm": 8.9375, "learning_rate": 9.837709700948214e-06, "loss": 1.4509, "mean_token_accuracy": 0.6110931515693665, "num_tokens": 229492.0, "step": 90 }, { "epoch": 0.05470459518599562, "grad_norm": 8.375, "learning_rate": 9.819474835886216e-06, "loss": 1.4126, "mean_token_accuracy": 0.620923125743866, "num_tokens": 255685.0, "step": 100 }, { "epoch": 0.060175054704595186, "grad_norm": 9.3125, "learning_rate": 9.801239970824217e-06, "loss": 1.4117, "mean_token_accuracy": 0.6242850065231323, "num_tokens": 280910.0, "step": 110 }, { "epoch": 0.06564551422319474, "grad_norm": 9.125, "learning_rate": 9.783005105762217e-06, "loss": 1.3618, "mean_token_accuracy": 0.6288298785686492, "num_tokens": 306335.0, "step": 120 }, { "epoch": 0.0711159737417943, "grad_norm": 9.4375, "learning_rate": 9.76477024070022e-06, "loss": 1.3932, "mean_token_accuracy": 0.6229633510112762, "num_tokens": 331606.0, "step": 130 }, { "epoch": 0.07658643326039387, "grad_norm": 8.9375, "learning_rate": 9.74653537563822e-06, "loss": 1.4133, "mean_token_accuracy": 0.6180586159229279, "num_tokens": 357437.0, "step": 140 }, { "epoch": 0.08205689277899343, "grad_norm": 9.125, "learning_rate": 9.728300510576222e-06, "loss": 1.4235, "mean_token_accuracy": 0.6152303397655488, "num_tokens": 382692.0, "step": 150 }, { "epoch": 0.087527352297593, "grad_norm": 8.9375, "learning_rate": 9.710065645514224e-06, "loss": 1.382, "mean_token_accuracy": 0.621176666021347, "num_tokens": 407995.0, "step": 160 }, { "epoch": 0.09299781181619256, "grad_norm": 9.125, "learning_rate": 9.691830780452225e-06, "loss": 1.3665, "mean_token_accuracy": 0.6267438113689423, "num_tokens": 433527.0, "step": 170 }, { "epoch": 0.09846827133479212, "grad_norm": 9.1875, "learning_rate": 9.673595915390227e-06, "loss": 1.3829, "mean_token_accuracy": 0.6232609272003173, "num_tokens": 459043.0, "step": 180 }, { "epoch": 0.10393873085339168, "grad_norm": 9.0, "learning_rate": 9.655361050328229e-06, "loss": 1.3797, "mean_token_accuracy": 0.6250600516796112, "num_tokens": 484989.0, "step": 190 }, { "epoch": 0.10940919037199125, "grad_norm": 9.0625, "learning_rate": 9.63712618526623e-06, "loss": 1.3799, "mean_token_accuracy": 0.6250615298748017, "num_tokens": 510010.0, "step": 200 }, { "epoch": 0.11487964989059081, "grad_norm": 9.75, "learning_rate": 9.618891320204232e-06, "loss": 1.3541, "mean_token_accuracy": 0.6320520222187043, "num_tokens": 535568.0, "step": 210 }, { "epoch": 0.12035010940919037, "grad_norm": 9.1875, "learning_rate": 9.600656455142233e-06, "loss": 1.3568, "mean_token_accuracy": 0.6311915695667267, "num_tokens": 560786.0, "step": 220 }, { "epoch": 0.12582056892778992, "grad_norm": 9.0, "learning_rate": 9.582421590080235e-06, "loss": 1.3751, "mean_token_accuracy": 0.6243879318237304, "num_tokens": 586419.0, "step": 230 }, { "epoch": 0.13129102844638948, "grad_norm": 9.5625, "learning_rate": 9.564186725018235e-06, "loss": 1.3421, "mean_token_accuracy": 0.6328221976757049, "num_tokens": 611683.0, "step": 240 }, { "epoch": 0.13676148796498905, "grad_norm": 9.0, "learning_rate": 9.545951859956238e-06, "loss": 1.3199, "mean_token_accuracy": 0.6378473103046417, "num_tokens": 636737.0, "step": 250 }, { "epoch": 0.1422319474835886, "grad_norm": 8.6875, "learning_rate": 9.527716994894238e-06, "loss": 1.3362, "mean_token_accuracy": 0.6359954178333282, "num_tokens": 661611.0, "step": 260 }, { "epoch": 0.14770240700218817, "grad_norm": 9.0, "learning_rate": 9.50948212983224e-06, "loss": 1.2803, "mean_token_accuracy": 0.649063116312027, "num_tokens": 686464.0, "step": 270 }, { "epoch": 0.15317286652078774, "grad_norm": 8.8125, "learning_rate": 9.491247264770241e-06, "loss": 1.3093, "mean_token_accuracy": 0.6380355060100555, "num_tokens": 711615.0, "step": 280 }, { "epoch": 0.1586433260393873, "grad_norm": 8.4375, "learning_rate": 9.473012399708243e-06, "loss": 1.3034, "mean_token_accuracy": 0.6415359675884247, "num_tokens": 736807.0, "step": 290 }, { "epoch": 0.16411378555798686, "grad_norm": 9.3125, "learning_rate": 9.454777534646244e-06, "loss": 1.3251, "mean_token_accuracy": 0.6348649680614471, "num_tokens": 762209.0, "step": 300 }, { "epoch": 0.16958424507658643, "grad_norm": 8.375, "learning_rate": 9.436542669584246e-06, "loss": 1.2911, "mean_token_accuracy": 0.6452939510345459, "num_tokens": 787695.0, "step": 310 }, { "epoch": 0.175054704595186, "grad_norm": 8.5, "learning_rate": 9.418307804522248e-06, "loss": 1.3028, "mean_token_accuracy": 0.640759015083313, "num_tokens": 813451.0, "step": 320 }, { "epoch": 0.18052516411378555, "grad_norm": 9.1875, "learning_rate": 9.400072939460249e-06, "loss": 1.2847, "mean_token_accuracy": 0.6437187314033508, "num_tokens": 838902.0, "step": 330 }, { "epoch": 0.18599562363238512, "grad_norm": 8.875, "learning_rate": 9.38183807439825e-06, "loss": 1.2791, "mean_token_accuracy": 0.6461306512355804, "num_tokens": 864097.0, "step": 340 }, { "epoch": 0.19146608315098468, "grad_norm": 8.8125, "learning_rate": 9.363603209336252e-06, "loss": 1.2946, "mean_token_accuracy": 0.6436072170734406, "num_tokens": 889959.0, "step": 350 }, { "epoch": 0.19693654266958424, "grad_norm": 9.0625, "learning_rate": 9.345368344274252e-06, "loss": 1.3278, "mean_token_accuracy": 0.6354196608066559, "num_tokens": 915925.0, "step": 360 }, { "epoch": 0.2024070021881838, "grad_norm": 9.5625, "learning_rate": 9.327133479212255e-06, "loss": 1.3017, "mean_token_accuracy": 0.6431743144989014, "num_tokens": 941137.0, "step": 370 }, { "epoch": 0.20787746170678337, "grad_norm": 8.875, "learning_rate": 9.308898614150255e-06, "loss": 1.3143, "mean_token_accuracy": 0.638333660364151, "num_tokens": 966309.0, "step": 380 }, { "epoch": 0.21334792122538293, "grad_norm": 8.5625, "learning_rate": 9.290663749088259e-06, "loss": 1.2774, "mean_token_accuracy": 0.6447885036468506, "num_tokens": 991629.0, "step": 390 }, { "epoch": 0.2188183807439825, "grad_norm": 9.0625, "learning_rate": 9.272428884026259e-06, "loss": 1.2702, "mean_token_accuracy": 0.646415501832962, "num_tokens": 1017315.0, "step": 400 }, { "epoch": 0.22428884026258206, "grad_norm": 9.125, "learning_rate": 9.25419401896426e-06, "loss": 1.2708, "mean_token_accuracy": 0.6465388536453247, "num_tokens": 1042374.0, "step": 410 }, { "epoch": 0.22975929978118162, "grad_norm": 9.1875, "learning_rate": 9.235959153902262e-06, "loss": 1.2859, "mean_token_accuracy": 0.6423357188701629, "num_tokens": 1067942.0, "step": 420 }, { "epoch": 0.23522975929978118, "grad_norm": 8.8125, "learning_rate": 9.217724288840263e-06, "loss": 1.2934, "mean_token_accuracy": 0.6421127438545227, "num_tokens": 1093790.0, "step": 430 }, { "epoch": 0.24070021881838075, "grad_norm": 8.5, "learning_rate": 9.199489423778265e-06, "loss": 1.3007, "mean_token_accuracy": 0.6417977750301361, "num_tokens": 1119210.0, "step": 440 }, { "epoch": 0.2461706783369803, "grad_norm": 9.4375, "learning_rate": 9.181254558716267e-06, "loss": 1.3159, "mean_token_accuracy": 0.6400705456733704, "num_tokens": 1144670.0, "step": 450 }, { "epoch": 0.25164113785557984, "grad_norm": 9.625, "learning_rate": 9.163019693654268e-06, "loss": 1.27, "mean_token_accuracy": 0.6472333431243896, "num_tokens": 1170288.0, "step": 460 }, { "epoch": 0.25711159737417943, "grad_norm": 9.6875, "learning_rate": 9.14478482859227e-06, "loss": 1.2754, "mean_token_accuracy": 0.6426136493682861, "num_tokens": 1195969.0, "step": 470 }, { "epoch": 0.26258205689277897, "grad_norm": 9.1875, "learning_rate": 9.12654996353027e-06, "loss": 1.2805, "mean_token_accuracy": 0.6377979755401612, "num_tokens": 1221472.0, "step": 480 }, { "epoch": 0.26805251641137856, "grad_norm": 9.25, "learning_rate": 9.108315098468273e-06, "loss": 1.2649, "mean_token_accuracy": 0.6489916682243347, "num_tokens": 1246637.0, "step": 490 }, { "epoch": 0.2735229759299781, "grad_norm": 9.1875, "learning_rate": 9.090080233406273e-06, "loss": 1.2838, "mean_token_accuracy": 0.6428185880184174, "num_tokens": 1272427.0, "step": 500 }, { "epoch": 0.2789934354485777, "grad_norm": 8.6875, "learning_rate": 9.071845368344276e-06, "loss": 1.2394, "mean_token_accuracy": 0.6552128493785858, "num_tokens": 1297420.0, "step": 510 }, { "epoch": 0.2844638949671772, "grad_norm": 8.1875, "learning_rate": 9.053610503282276e-06, "loss": 1.2798, "mean_token_accuracy": 0.6457675039768219, "num_tokens": 1322816.0, "step": 520 }, { "epoch": 0.2899343544857768, "grad_norm": 8.625, "learning_rate": 9.03537563822028e-06, "loss": 1.2445, "mean_token_accuracy": 0.6481238782405854, "num_tokens": 1348483.0, "step": 530 }, { "epoch": 0.29540481400437635, "grad_norm": 9.25, "learning_rate": 9.017140773158279e-06, "loss": 1.2699, "mean_token_accuracy": 0.6499922752380372, "num_tokens": 1373968.0, "step": 540 }, { "epoch": 0.30087527352297594, "grad_norm": 9.0, "learning_rate": 8.99890590809628e-06, "loss": 1.2504, "mean_token_accuracy": 0.6519196212291718, "num_tokens": 1399038.0, "step": 550 }, { "epoch": 0.3063457330415755, "grad_norm": 8.8125, "learning_rate": 8.980671043034282e-06, "loss": 1.2576, "mean_token_accuracy": 0.6471436500549317, "num_tokens": 1424348.0, "step": 560 }, { "epoch": 0.31181619256017507, "grad_norm": 8.875, "learning_rate": 8.962436177972284e-06, "loss": 1.2728, "mean_token_accuracy": 0.6437059342861176, "num_tokens": 1450046.0, "step": 570 }, { "epoch": 0.3172866520787746, "grad_norm": 8.9375, "learning_rate": 8.944201312910286e-06, "loss": 1.2699, "mean_token_accuracy": 0.6483521461486816, "num_tokens": 1475329.0, "step": 580 }, { "epoch": 0.3227571115973742, "grad_norm": 9.0, "learning_rate": 8.925966447848287e-06, "loss": 1.2841, "mean_token_accuracy": 0.6452650845050811, "num_tokens": 1500705.0, "step": 590 }, { "epoch": 0.3282275711159737, "grad_norm": 9.4375, "learning_rate": 8.907731582786287e-06, "loss": 1.2879, "mean_token_accuracy": 0.6446281552314759, "num_tokens": 1526093.0, "step": 600 }, { "epoch": 0.3336980306345733, "grad_norm": 9.9375, "learning_rate": 8.88949671772429e-06, "loss": 1.2879, "mean_token_accuracy": 0.6415457963943482, "num_tokens": 1551249.0, "step": 610 }, { "epoch": 0.33916849015317285, "grad_norm": 8.875, "learning_rate": 8.87126185266229e-06, "loss": 1.2563, "mean_token_accuracy": 0.6477604985237122, "num_tokens": 1576597.0, "step": 620 }, { "epoch": 0.34463894967177244, "grad_norm": 8.8125, "learning_rate": 8.853026987600293e-06, "loss": 1.2371, "mean_token_accuracy": 0.6518564283847809, "num_tokens": 1601829.0, "step": 630 }, { "epoch": 0.350109409190372, "grad_norm": 9.0625, "learning_rate": 8.834792122538293e-06, "loss": 1.2774, "mean_token_accuracy": 0.6432484328746796, "num_tokens": 1627555.0, "step": 640 }, { "epoch": 0.35557986870897157, "grad_norm": 8.75, "learning_rate": 8.816557257476297e-06, "loss": 1.2568, "mean_token_accuracy": 0.6520409166812897, "num_tokens": 1653045.0, "step": 650 }, { "epoch": 0.3610503282275711, "grad_norm": 8.625, "learning_rate": 8.798322392414297e-06, "loss": 1.2746, "mean_token_accuracy": 0.6470682263374329, "num_tokens": 1678382.0, "step": 660 }, { "epoch": 0.3665207877461707, "grad_norm": 9.25, "learning_rate": 8.780087527352298e-06, "loss": 1.2593, "mean_token_accuracy": 0.6493343353271485, "num_tokens": 1703866.0, "step": 670 }, { "epoch": 0.37199124726477023, "grad_norm": 8.625, "learning_rate": 8.7618526622903e-06, "loss": 1.2595, "mean_token_accuracy": 0.6501845002174378, "num_tokens": 1729154.0, "step": 680 }, { "epoch": 0.3774617067833698, "grad_norm": 9.0, "learning_rate": 8.743617797228301e-06, "loss": 1.2543, "mean_token_accuracy": 0.6474352359771729, "num_tokens": 1754459.0, "step": 690 }, { "epoch": 0.38293216630196936, "grad_norm": 8.4375, "learning_rate": 8.725382932166303e-06, "loss": 1.2385, "mean_token_accuracy": 0.6522322833538056, "num_tokens": 1779833.0, "step": 700 }, { "epoch": 0.38840262582056895, "grad_norm": 8.8125, "learning_rate": 8.707148067104304e-06, "loss": 1.2308, "mean_token_accuracy": 0.6520416557788848, "num_tokens": 1805297.0, "step": 710 }, { "epoch": 0.3938730853391685, "grad_norm": 9.5625, "learning_rate": 8.688913202042306e-06, "loss": 1.2602, "mean_token_accuracy": 0.6470251500606536, "num_tokens": 1830573.0, "step": 720 }, { "epoch": 0.3993435448577681, "grad_norm": 9.25, "learning_rate": 8.670678336980308e-06, "loss": 1.2518, "mean_token_accuracy": 0.6481884896755219, "num_tokens": 1855937.0, "step": 730 }, { "epoch": 0.4048140043763676, "grad_norm": 9.125, "learning_rate": 8.652443471918308e-06, "loss": 1.2469, "mean_token_accuracy": 0.6519103825092316, "num_tokens": 1881096.0, "step": 740 }, { "epoch": 0.4102844638949672, "grad_norm": 9.0625, "learning_rate": 8.634208606856311e-06, "loss": 1.236, "mean_token_accuracy": 0.6525180697441101, "num_tokens": 1906526.0, "step": 750 }, { "epoch": 0.41575492341356673, "grad_norm": 8.8125, "learning_rate": 8.61597374179431e-06, "loss": 1.2583, "mean_token_accuracy": 0.647457766532898, "num_tokens": 1931741.0, "step": 760 }, { "epoch": 0.4212253829321663, "grad_norm": 8.9375, "learning_rate": 8.597738876732314e-06, "loss": 1.2296, "mean_token_accuracy": 0.6541453003883362, "num_tokens": 1956816.0, "step": 770 }, { "epoch": 0.42669584245076586, "grad_norm": 9.625, "learning_rate": 8.579504011670314e-06, "loss": 1.2343, "mean_token_accuracy": 0.6523520529270173, "num_tokens": 1981932.0, "step": 780 }, { "epoch": 0.43216630196936545, "grad_norm": 9.0625, "learning_rate": 8.561269146608316e-06, "loss": 1.2474, "mean_token_accuracy": 0.6512964367866516, "num_tokens": 2007382.0, "step": 790 }, { "epoch": 0.437636761487965, "grad_norm": 8.5625, "learning_rate": 8.543034281546317e-06, "loss": 1.2184, "mean_token_accuracy": 0.6564052760601043, "num_tokens": 2033121.0, "step": 800 }, { "epoch": 0.4431072210065646, "grad_norm": 9.25, "learning_rate": 8.524799416484319e-06, "loss": 1.2495, "mean_token_accuracy": 0.6491189241409302, "num_tokens": 2058751.0, "step": 810 }, { "epoch": 0.4485776805251641, "grad_norm": 8.8125, "learning_rate": 8.50656455142232e-06, "loss": 1.2014, "mean_token_accuracy": 0.6595002830028533, "num_tokens": 2083836.0, "step": 820 }, { "epoch": 0.4540481400437637, "grad_norm": 9.0, "learning_rate": 8.488329686360322e-06, "loss": 1.2478, "mean_token_accuracy": 0.650605583190918, "num_tokens": 2109163.0, "step": 830 }, { "epoch": 0.45951859956236324, "grad_norm": 8.6875, "learning_rate": 8.470094821298323e-06, "loss": 1.2486, "mean_token_accuracy": 0.6492572844028472, "num_tokens": 2134626.0, "step": 840 }, { "epoch": 0.4649890590809628, "grad_norm": 9.125, "learning_rate": 8.451859956236325e-06, "loss": 1.2397, "mean_token_accuracy": 0.6523317098617554, "num_tokens": 2159963.0, "step": 850 }, { "epoch": 0.47045951859956237, "grad_norm": 9.125, "learning_rate": 8.433625091174325e-06, "loss": 1.2381, "mean_token_accuracy": 0.6511462509632111, "num_tokens": 2185310.0, "step": 860 }, { "epoch": 0.4759299781181619, "grad_norm": 9.125, "learning_rate": 8.415390226112328e-06, "loss": 1.2015, "mean_token_accuracy": 0.6603323996067048, "num_tokens": 2210642.0, "step": 870 }, { "epoch": 0.4814004376367615, "grad_norm": 9.0625, "learning_rate": 8.397155361050328e-06, "loss": 1.2369, "mean_token_accuracy": 0.6551064074039459, "num_tokens": 2235713.0, "step": 880 }, { "epoch": 0.486870897155361, "grad_norm": 9.25, "learning_rate": 8.378920495988331e-06, "loss": 1.2588, "mean_token_accuracy": 0.6488555371761322, "num_tokens": 2261549.0, "step": 890 }, { "epoch": 0.4923413566739606, "grad_norm": 9.3125, "learning_rate": 8.360685630926331e-06, "loss": 1.2308, "mean_token_accuracy": 0.6573903739452363, "num_tokens": 2286614.0, "step": 900 }, { "epoch": 0.49781181619256015, "grad_norm": 9.25, "learning_rate": 8.342450765864333e-06, "loss": 1.215, "mean_token_accuracy": 0.6597650587558747, "num_tokens": 2312037.0, "step": 910 }, { "epoch": 0.5032822757111597, "grad_norm": 8.75, "learning_rate": 8.324215900802335e-06, "loss": 1.2239, "mean_token_accuracy": 0.6548255145549774, "num_tokens": 2337544.0, "step": 920 }, { "epoch": 0.5087527352297593, "grad_norm": 8.8125, "learning_rate": 8.305981035740336e-06, "loss": 1.2629, "mean_token_accuracy": 0.6493556082248688, "num_tokens": 2363200.0, "step": 930 }, { "epoch": 0.5142231947483589, "grad_norm": 9.0625, "learning_rate": 8.287746170678338e-06, "loss": 1.21, "mean_token_accuracy": 0.6589162766933441, "num_tokens": 2388450.0, "step": 940 }, { "epoch": 0.5196936542669585, "grad_norm": 8.8125, "learning_rate": 8.26951130561634e-06, "loss": 1.2373, "mean_token_accuracy": 0.6519401609897614, "num_tokens": 2413796.0, "step": 950 }, { "epoch": 0.5251641137855579, "grad_norm": 9.1875, "learning_rate": 8.251276440554341e-06, "loss": 1.2194, "mean_token_accuracy": 0.6579577445983886, "num_tokens": 2439412.0, "step": 960 }, { "epoch": 0.5306345733041575, "grad_norm": 9.0, "learning_rate": 8.233041575492342e-06, "loss": 1.241, "mean_token_accuracy": 0.6511935293674469, "num_tokens": 2464594.0, "step": 970 }, { "epoch": 0.5361050328227571, "grad_norm": 8.875, "learning_rate": 8.214806710430342e-06, "loss": 1.2325, "mean_token_accuracy": 0.6544456005096435, "num_tokens": 2490161.0, "step": 980 }, { "epoch": 0.5415754923413567, "grad_norm": 9.3125, "learning_rate": 8.196571845368346e-06, "loss": 1.2768, "mean_token_accuracy": 0.6460224032402039, "num_tokens": 2515978.0, "step": 990 }, { "epoch": 0.5470459518599562, "grad_norm": 8.5625, "learning_rate": 8.178336980306346e-06, "loss": 1.2196, "mean_token_accuracy": 0.6573625862598419, "num_tokens": 2541854.0, "step": 1000 }, { "epoch": 0.5525164113785558, "grad_norm": 8.8125, "learning_rate": 8.160102115244349e-06, "loss": 1.2044, "mean_token_accuracy": 0.6583312273025512, "num_tokens": 2566969.0, "step": 1010 }, { "epoch": 0.5579868708971554, "grad_norm": 9.25, "learning_rate": 8.141867250182349e-06, "loss": 1.2274, "mean_token_accuracy": 0.6549983143806457, "num_tokens": 2592252.0, "step": 1020 }, { "epoch": 0.563457330415755, "grad_norm": 9.0625, "learning_rate": 8.12363238512035e-06, "loss": 1.2106, "mean_token_accuracy": 0.6564074397087097, "num_tokens": 2617509.0, "step": 1030 }, { "epoch": 0.5689277899343544, "grad_norm": 8.625, "learning_rate": 8.105397520058352e-06, "loss": 1.239, "mean_token_accuracy": 0.6544763445854187, "num_tokens": 2642416.0, "step": 1040 }, { "epoch": 0.574398249452954, "grad_norm": 9.0625, "learning_rate": 8.087162654996354e-06, "loss": 1.2273, "mean_token_accuracy": 0.6501368045806885, "num_tokens": 2668075.0, "step": 1050 }, { "epoch": 0.5798687089715536, "grad_norm": 9.8125, "learning_rate": 8.068927789934355e-06, "loss": 1.2368, "mean_token_accuracy": 0.6530345678329468, "num_tokens": 2693387.0, "step": 1060 }, { "epoch": 0.5853391684901532, "grad_norm": 8.5, "learning_rate": 8.050692924872357e-06, "loss": 1.223, "mean_token_accuracy": 0.6532883644104004, "num_tokens": 2718953.0, "step": 1070 }, { "epoch": 0.5908096280087527, "grad_norm": 9.5625, "learning_rate": 8.032458059810358e-06, "loss": 1.2449, "mean_token_accuracy": 0.6541399002075196, "num_tokens": 2744468.0, "step": 1080 }, { "epoch": 0.5962800875273523, "grad_norm": 8.8125, "learning_rate": 8.01422319474836e-06, "loss": 1.2418, "mean_token_accuracy": 0.6520101726055145, "num_tokens": 2770101.0, "step": 1090 }, { "epoch": 0.6017505470459519, "grad_norm": 9.125, "learning_rate": 7.99598832968636e-06, "loss": 1.2313, "mean_token_accuracy": 0.6546747028827667, "num_tokens": 2795378.0, "step": 1100 }, { "epoch": 0.6072210065645515, "grad_norm": 9.3125, "learning_rate": 7.977753464624363e-06, "loss": 1.2274, "mean_token_accuracy": 0.6587331891059875, "num_tokens": 2820924.0, "step": 1110 }, { "epoch": 0.612691466083151, "grad_norm": 8.8125, "learning_rate": 7.959518599562363e-06, "loss": 1.2248, "mean_token_accuracy": 0.6550480246543884, "num_tokens": 2846859.0, "step": 1120 }, { "epoch": 0.6181619256017505, "grad_norm": 9.5, "learning_rate": 7.941283734500366e-06, "loss": 1.2137, "mean_token_accuracy": 0.6592613101005554, "num_tokens": 2872191.0, "step": 1130 }, { "epoch": 0.6236323851203501, "grad_norm": 9.5625, "learning_rate": 7.923048869438366e-06, "loss": 1.2026, "mean_token_accuracy": 0.661452466249466, "num_tokens": 2897466.0, "step": 1140 }, { "epoch": 0.6291028446389497, "grad_norm": 8.875, "learning_rate": 7.90481400437637e-06, "loss": 1.2344, "mean_token_accuracy": 0.6545186638832092, "num_tokens": 2923267.0, "step": 1150 }, { "epoch": 0.6345733041575492, "grad_norm": 8.875, "learning_rate": 7.88657913931437e-06, "loss": 1.2219, "mean_token_accuracy": 0.6601431190967559, "num_tokens": 2948499.0, "step": 1160 }, { "epoch": 0.6400437636761488, "grad_norm": 9.3125, "learning_rate": 7.868344274252371e-06, "loss": 1.2165, "mean_token_accuracy": 0.6581261634826661, "num_tokens": 2973709.0, "step": 1170 }, { "epoch": 0.6455142231947484, "grad_norm": 9.1875, "learning_rate": 7.850109409190373e-06, "loss": 1.2423, "mean_token_accuracy": 0.6519295990467071, "num_tokens": 2999177.0, "step": 1180 }, { "epoch": 0.650984682713348, "grad_norm": 8.625, "learning_rate": 7.831874544128374e-06, "loss": 1.2413, "mean_token_accuracy": 0.6543058276176452, "num_tokens": 3025180.0, "step": 1190 }, { "epoch": 0.6564551422319475, "grad_norm": 9.25, "learning_rate": 7.813639679066376e-06, "loss": 1.2133, "mean_token_accuracy": 0.6597758173942566, "num_tokens": 3050754.0, "step": 1200 }, { "epoch": 0.661925601750547, "grad_norm": 9.25, "learning_rate": 7.795404814004377e-06, "loss": 1.2227, "mean_token_accuracy": 0.6577537059783936, "num_tokens": 3075900.0, "step": 1210 }, { "epoch": 0.6673960612691466, "grad_norm": 9.0625, "learning_rate": 7.777169948942377e-06, "loss": 1.2266, "mean_token_accuracy": 0.655048793554306, "num_tokens": 3101018.0, "step": 1220 }, { "epoch": 0.6728665207877462, "grad_norm": 9.25, "learning_rate": 7.75893508388038e-06, "loss": 1.1967, "mean_token_accuracy": 0.6654035151004791, "num_tokens": 3126618.0, "step": 1230 }, { "epoch": 0.6783369803063457, "grad_norm": 9.0625, "learning_rate": 7.74070021881838e-06, "loss": 1.2273, "mean_token_accuracy": 0.6532664895057678, "num_tokens": 3151368.0, "step": 1240 }, { "epoch": 0.6838074398249453, "grad_norm": 9.0625, "learning_rate": 7.722465353756384e-06, "loss": 1.1909, "mean_token_accuracy": 0.6590378880500793, "num_tokens": 3176714.0, "step": 1250 }, { "epoch": 0.6892778993435449, "grad_norm": 9.25, "learning_rate": 7.704230488694384e-06, "loss": 1.2238, "mean_token_accuracy": 0.6598979830741882, "num_tokens": 3202322.0, "step": 1260 }, { "epoch": 0.6947483588621444, "grad_norm": 9.125, "learning_rate": 7.685995623632387e-06, "loss": 1.2273, "mean_token_accuracy": 0.6573567986488342, "num_tokens": 3227693.0, "step": 1270 }, { "epoch": 0.700218818380744, "grad_norm": 9.375, "learning_rate": 7.667760758570387e-06, "loss": 1.2031, "mean_token_accuracy": 0.6580542623996735, "num_tokens": 3252807.0, "step": 1280 }, { "epoch": 0.7056892778993435, "grad_norm": 8.9375, "learning_rate": 7.649525893508388e-06, "loss": 1.1975, "mean_token_accuracy": 0.6616823971271515, "num_tokens": 3278218.0, "step": 1290 }, { "epoch": 0.7111597374179431, "grad_norm": 9.3125, "learning_rate": 7.63129102844639e-06, "loss": 1.2157, "mean_token_accuracy": 0.6570103228092193, "num_tokens": 3304019.0, "step": 1300 }, { "epoch": 0.7166301969365426, "grad_norm": 9.4375, "learning_rate": 7.6130561633843915e-06, "loss": 1.2026, "mean_token_accuracy": 0.6590713143348694, "num_tokens": 3329456.0, "step": 1310 }, { "epoch": 0.7221006564551422, "grad_norm": 9.1875, "learning_rate": 7.594821298322393e-06, "loss": 1.2449, "mean_token_accuracy": 0.6509311914443969, "num_tokens": 3355024.0, "step": 1320 }, { "epoch": 0.7275711159737418, "grad_norm": 8.875, "learning_rate": 7.576586433260395e-06, "loss": 1.1989, "mean_token_accuracy": 0.6584409058094025, "num_tokens": 3380203.0, "step": 1330 }, { "epoch": 0.7330415754923414, "grad_norm": 9.0, "learning_rate": 7.5583515681983954e-06, "loss": 1.1704, "mean_token_accuracy": 0.6633239209651947, "num_tokens": 3405735.0, "step": 1340 }, { "epoch": 0.7385120350109409, "grad_norm": 9.3125, "learning_rate": 7.540116703136398e-06, "loss": 1.1788, "mean_token_accuracy": 0.6651252508163452, "num_tokens": 3431101.0, "step": 1350 }, { "epoch": 0.7439824945295405, "grad_norm": 9.0625, "learning_rate": 7.521881838074399e-06, "loss": 1.2175, "mean_token_accuracy": 0.6579277157783509, "num_tokens": 3456836.0, "step": 1360 }, { "epoch": 0.74945295404814, "grad_norm": 9.25, "learning_rate": 7.503646973012401e-06, "loss": 1.236, "mean_token_accuracy": 0.6502565503120422, "num_tokens": 3482262.0, "step": 1370 }, { "epoch": 0.7549234135667396, "grad_norm": 9.4375, "learning_rate": 7.485412107950402e-06, "loss": 1.2048, "mean_token_accuracy": 0.6587684571743011, "num_tokens": 3507578.0, "step": 1380 }, { "epoch": 0.7603938730853391, "grad_norm": 8.875, "learning_rate": 7.467177242888403e-06, "loss": 1.1982, "mean_token_accuracy": 0.6572374284267426, "num_tokens": 3533438.0, "step": 1390 }, { "epoch": 0.7658643326039387, "grad_norm": 9.3125, "learning_rate": 7.448942377826405e-06, "loss": 1.1968, "mean_token_accuracy": 0.6594564735889434, "num_tokens": 3558955.0, "step": 1400 }, { "epoch": 0.7713347921225383, "grad_norm": 9.0, "learning_rate": 7.430707512764406e-06, "loss": 1.1535, "mean_token_accuracy": 0.6660135149955749, "num_tokens": 3584024.0, "step": 1410 }, { "epoch": 0.7768052516411379, "grad_norm": 8.8125, "learning_rate": 7.412472647702408e-06, "loss": 1.1802, "mean_token_accuracy": 0.6659539043903351, "num_tokens": 3609518.0, "step": 1420 }, { "epoch": 0.7822757111597374, "grad_norm": 8.8125, "learning_rate": 7.394237782640409e-06, "loss": 1.2023, "mean_token_accuracy": 0.6609654724597931, "num_tokens": 3635205.0, "step": 1430 }, { "epoch": 0.787746170678337, "grad_norm": 9.0625, "learning_rate": 7.3760029175784105e-06, "loss": 1.2011, "mean_token_accuracy": 0.6600887596607208, "num_tokens": 3660799.0, "step": 1440 }, { "epoch": 0.7932166301969366, "grad_norm": 9.125, "learning_rate": 7.357768052516412e-06, "loss": 1.2188, "mean_token_accuracy": 0.6550438821315765, "num_tokens": 3686173.0, "step": 1450 }, { "epoch": 0.7986870897155361, "grad_norm": 8.8125, "learning_rate": 7.339533187454413e-06, "loss": 1.1869, "mean_token_accuracy": 0.6660338282585144, "num_tokens": 3711634.0, "step": 1460 }, { "epoch": 0.8041575492341356, "grad_norm": 8.5, "learning_rate": 7.321298322392415e-06, "loss": 1.2082, "mean_token_accuracy": 0.6563746392726898, "num_tokens": 3737277.0, "step": 1470 }, { "epoch": 0.8096280087527352, "grad_norm": 9.125, "learning_rate": 7.303063457330416e-06, "loss": 1.2071, "mean_token_accuracy": 0.6584551155567169, "num_tokens": 3763061.0, "step": 1480 }, { "epoch": 0.8150984682713348, "grad_norm": 9.375, "learning_rate": 7.2848285922684185e-06, "loss": 1.1978, "mean_token_accuracy": 0.6639045178890228, "num_tokens": 3788680.0, "step": 1490 }, { "epoch": 0.8205689277899344, "grad_norm": 8.875, "learning_rate": 7.266593727206419e-06, "loss": 1.2125, "mean_token_accuracy": 0.6617669343948365, "num_tokens": 3814212.0, "step": 1500 }, { "epoch": 0.8260393873085339, "grad_norm": 9.0625, "learning_rate": 7.248358862144421e-06, "loss": 1.1955, "mean_token_accuracy": 0.6609017014503479, "num_tokens": 3839753.0, "step": 1510 }, { "epoch": 0.8315098468271335, "grad_norm": 9.375, "learning_rate": 7.230123997082422e-06, "loss": 1.1969, "mean_token_accuracy": 0.6626928210258484, "num_tokens": 3865200.0, "step": 1520 }, { "epoch": 0.8369803063457331, "grad_norm": 8.75, "learning_rate": 7.211889132020423e-06, "loss": 1.2099, "mean_token_accuracy": 0.662108862400055, "num_tokens": 3891102.0, "step": 1530 }, { "epoch": 0.8424507658643327, "grad_norm": 8.8125, "learning_rate": 7.1936542669584256e-06, "loss": 1.1902, "mean_token_accuracy": 0.6671268463134765, "num_tokens": 3915979.0, "step": 1540 }, { "epoch": 0.8479212253829321, "grad_norm": 8.875, "learning_rate": 7.175419401896426e-06, "loss": 1.2321, "mean_token_accuracy": 0.651692271232605, "num_tokens": 3941101.0, "step": 1550 }, { "epoch": 0.8533916849015317, "grad_norm": 8.75, "learning_rate": 7.157184536834429e-06, "loss": 1.2018, "mean_token_accuracy": 0.6599911451339722, "num_tokens": 3966588.0, "step": 1560 }, { "epoch": 0.8588621444201313, "grad_norm": 9.6875, "learning_rate": 7.1389496717724295e-06, "loss": 1.1601, "mean_token_accuracy": 0.6702842473983764, "num_tokens": 3991816.0, "step": 1570 }, { "epoch": 0.8643326039387309, "grad_norm": 9.125, "learning_rate": 7.120714806710431e-06, "loss": 1.2061, "mean_token_accuracy": 0.6644167900085449, "num_tokens": 4017159.0, "step": 1580 }, { "epoch": 0.8698030634573304, "grad_norm": 9.0625, "learning_rate": 7.102479941648433e-06, "loss": 1.1982, "mean_token_accuracy": 0.661655330657959, "num_tokens": 4042412.0, "step": 1590 }, { "epoch": 0.87527352297593, "grad_norm": 9.25, "learning_rate": 7.0842450765864334e-06, "loss": 1.1893, "mean_token_accuracy": 0.6662971138954162, "num_tokens": 4068029.0, "step": 1600 }, { "epoch": 0.8807439824945296, "grad_norm": 8.75, "learning_rate": 7.066010211524436e-06, "loss": 1.2143, "mean_token_accuracy": 0.6562692880630493, "num_tokens": 4093581.0, "step": 1610 }, { "epoch": 0.8862144420131292, "grad_norm": 8.8125, "learning_rate": 7.047775346462437e-06, "loss": 1.164, "mean_token_accuracy": 0.6694628000259399, "num_tokens": 4118811.0, "step": 1620 }, { "epoch": 0.8916849015317286, "grad_norm": 9.25, "learning_rate": 7.029540481400439e-06, "loss": 1.2158, "mean_token_accuracy": 0.6589863717555999, "num_tokens": 4144422.0, "step": 1630 }, { "epoch": 0.8971553610503282, "grad_norm": 9.375, "learning_rate": 7.01130561633844e-06, "loss": 1.2068, "mean_token_accuracy": 0.6575411677360534, "num_tokens": 4169788.0, "step": 1640 }, { "epoch": 0.9026258205689278, "grad_norm": 9.1875, "learning_rate": 6.9930707512764405e-06, "loss": 1.2022, "mean_token_accuracy": 0.6641773998737335, "num_tokens": 4195416.0, "step": 1650 }, { "epoch": 0.9080962800875274, "grad_norm": 8.9375, "learning_rate": 6.974835886214443e-06, "loss": 1.2009, "mean_token_accuracy": 0.6612559735774994, "num_tokens": 4220959.0, "step": 1660 }, { "epoch": 0.9135667396061269, "grad_norm": 9.375, "learning_rate": 6.956601021152444e-06, "loss": 1.1772, "mean_token_accuracy": 0.6646326899528503, "num_tokens": 4246261.0, "step": 1670 }, { "epoch": 0.9190371991247265, "grad_norm": 8.75, "learning_rate": 6.938366156090446e-06, "loss": 1.2133, "mean_token_accuracy": 0.660455447435379, "num_tokens": 4271550.0, "step": 1680 }, { "epoch": 0.9245076586433261, "grad_norm": 9.5, "learning_rate": 6.920131291028447e-06, "loss": 1.198, "mean_token_accuracy": 0.661461490392685, "num_tokens": 4297359.0, "step": 1690 }, { "epoch": 0.9299781181619255, "grad_norm": 8.9375, "learning_rate": 6.901896425966449e-06, "loss": 1.2274, "mean_token_accuracy": 0.6552061200141907, "num_tokens": 4322920.0, "step": 1700 }, { "epoch": 0.9354485776805251, "grad_norm": 9.1875, "learning_rate": 6.88366156090445e-06, "loss": 1.1856, "mean_token_accuracy": 0.6640595495700836, "num_tokens": 4348175.0, "step": 1710 }, { "epoch": 0.9409190371991247, "grad_norm": 9.125, "learning_rate": 6.865426695842451e-06, "loss": 1.2074, "mean_token_accuracy": 0.6573819875717163, "num_tokens": 4373898.0, "step": 1720 }, { "epoch": 0.9463894967177243, "grad_norm": 9.0, "learning_rate": 6.847191830780453e-06, "loss": 1.2118, "mean_token_accuracy": 0.6596818029880523, "num_tokens": 4399398.0, "step": 1730 }, { "epoch": 0.9518599562363238, "grad_norm": 9.125, "learning_rate": 6.828956965718454e-06, "loss": 1.1888, "mean_token_accuracy": 0.6635434806346894, "num_tokens": 4424889.0, "step": 1740 }, { "epoch": 0.9573304157549234, "grad_norm": 9.0625, "learning_rate": 6.8107221006564564e-06, "loss": 1.1966, "mean_token_accuracy": 0.6608928442001343, "num_tokens": 4449872.0, "step": 1750 }, { "epoch": 0.962800875273523, "grad_norm": 9.4375, "learning_rate": 6.792487235594457e-06, "loss": 1.197, "mean_token_accuracy": 0.6581675052642822, "num_tokens": 4475097.0, "step": 1760 }, { "epoch": 0.9682713347921226, "grad_norm": 8.8125, "learning_rate": 6.774252370532458e-06, "loss": 1.164, "mean_token_accuracy": 0.6660536825656891, "num_tokens": 4500595.0, "step": 1770 }, { "epoch": 0.973741794310722, "grad_norm": 9.0625, "learning_rate": 6.75601750547046e-06, "loss": 1.1718, "mean_token_accuracy": 0.6656007945537568, "num_tokens": 4526142.0, "step": 1780 }, { "epoch": 0.9792122538293216, "grad_norm": 9.625, "learning_rate": 6.737782640408461e-06, "loss": 1.1778, "mean_token_accuracy": 0.670012629032135, "num_tokens": 4551204.0, "step": 1790 }, { "epoch": 0.9846827133479212, "grad_norm": 8.625, "learning_rate": 6.7195477753464636e-06, "loss": 1.2063, "mean_token_accuracy": 0.6588987648487091, "num_tokens": 4576664.0, "step": 1800 }, { "epoch": 0.9901531728665208, "grad_norm": 9.3125, "learning_rate": 6.701312910284464e-06, "loss": 1.1862, "mean_token_accuracy": 0.6618767023086548, "num_tokens": 4602021.0, "step": 1810 }, { "epoch": 0.9956236323851203, "grad_norm": 8.875, "learning_rate": 6.683078045222467e-06, "loss": 1.1911, "mean_token_accuracy": 0.6659906089305878, "num_tokens": 4627687.0, "step": 1820 }, { "epoch": 1.00109409190372, "grad_norm": 8.3125, "learning_rate": 6.6648431801604675e-06, "loss": 1.2073, "mean_token_accuracy": 0.6611107409000396, "num_tokens": 4653161.0, "step": 1830 }, { "epoch": 1.0065645514223194, "grad_norm": 8.8125, "learning_rate": 6.646608315098468e-06, "loss": 1.04, "mean_token_accuracy": 0.6974774897098541, "num_tokens": 4678651.0, "step": 1840 }, { "epoch": 1.012035010940919, "grad_norm": 9.75, "learning_rate": 6.628373450036471e-06, "loss": 1.0189, "mean_token_accuracy": 0.7027334988117218, "num_tokens": 4703974.0, "step": 1850 }, { "epoch": 1.0175054704595186, "grad_norm": 9.5625, "learning_rate": 6.610138584974471e-06, "loss": 1.0393, "mean_token_accuracy": 0.6992483794689178, "num_tokens": 4729182.0, "step": 1860 }, { "epoch": 1.0229759299781183, "grad_norm": 9.1875, "learning_rate": 6.591903719912474e-06, "loss": 1.0248, "mean_token_accuracy": 0.6999619722366333, "num_tokens": 4754700.0, "step": 1870 }, { "epoch": 1.0284463894967177, "grad_norm": 9.4375, "learning_rate": 6.573668854850475e-06, "loss": 1.0292, "mean_token_accuracy": 0.699607890844345, "num_tokens": 4780368.0, "step": 1880 }, { "epoch": 1.0339168490153172, "grad_norm": 9.5625, "learning_rate": 6.555433989788475e-06, "loss": 1.0111, "mean_token_accuracy": 0.7044564247131347, "num_tokens": 4805626.0, "step": 1890 }, { "epoch": 1.039387308533917, "grad_norm": 9.5, "learning_rate": 6.537199124726478e-06, "loss": 1.0417, "mean_token_accuracy": 0.6986953377723694, "num_tokens": 4830769.0, "step": 1900 }, { "epoch": 1.0448577680525164, "grad_norm": 9.875, "learning_rate": 6.5189642596644785e-06, "loss": 0.9978, "mean_token_accuracy": 0.7059278726577759, "num_tokens": 4856510.0, "step": 1910 }, { "epoch": 1.0503282275711159, "grad_norm": 9.875, "learning_rate": 6.500729394602481e-06, "loss": 1.041, "mean_token_accuracy": 0.6938862383365632, "num_tokens": 4881433.0, "step": 1920 }, { "epoch": 1.0557986870897156, "grad_norm": 9.9375, "learning_rate": 6.482494529540482e-06, "loss": 1.0299, "mean_token_accuracy": 0.7019344508647919, "num_tokens": 4906727.0, "step": 1930 }, { "epoch": 1.061269146608315, "grad_norm": 10.3125, "learning_rate": 6.464259664478484e-06, "loss": 1.0445, "mean_token_accuracy": 0.6943408429622651, "num_tokens": 4932196.0, "step": 1940 }, { "epoch": 1.0667396061269148, "grad_norm": 10.25, "learning_rate": 6.446024799416485e-06, "loss": 1.0244, "mean_token_accuracy": 0.700201416015625, "num_tokens": 4957115.0, "step": 1950 }, { "epoch": 1.0722100656455142, "grad_norm": 9.0625, "learning_rate": 6.427789934354486e-06, "loss": 1.0142, "mean_token_accuracy": 0.7004279494285583, "num_tokens": 4982766.0, "step": 1960 }, { "epoch": 1.0776805251641137, "grad_norm": 10.25, "learning_rate": 6.409555069292488e-06, "loss": 1.0371, "mean_token_accuracy": 0.6925820887088776, "num_tokens": 5008061.0, "step": 1970 }, { "epoch": 1.0831509846827134, "grad_norm": 9.6875, "learning_rate": 6.391320204230489e-06, "loss": 1.0356, "mean_token_accuracy": 0.7003223180770874, "num_tokens": 5033178.0, "step": 1980 }, { "epoch": 1.088621444201313, "grad_norm": 10.125, "learning_rate": 6.373085339168491e-06, "loss": 1.0496, "mean_token_accuracy": 0.6927937686443328, "num_tokens": 5058461.0, "step": 1990 }, { "epoch": 1.0940919037199124, "grad_norm": 9.9375, "learning_rate": 6.354850474106492e-06, "loss": 1.0666, "mean_token_accuracy": 0.6866900682449341, "num_tokens": 5083870.0, "step": 2000 }, { "epoch": 1.099562363238512, "grad_norm": 10.25, "learning_rate": 6.3366156090444944e-06, "loss": 1.0257, "mean_token_accuracy": 0.6989123821258545, "num_tokens": 5109168.0, "step": 2010 }, { "epoch": 1.1050328227571116, "grad_norm": 10.5, "learning_rate": 6.318380743982495e-06, "loss": 1.0333, "mean_token_accuracy": 0.6956054985523223, "num_tokens": 5134952.0, "step": 2020 }, { "epoch": 1.1105032822757113, "grad_norm": 9.625, "learning_rate": 6.300145878920496e-06, "loss": 1.0169, "mean_token_accuracy": 0.6954958021640778, "num_tokens": 5160606.0, "step": 2030 }, { "epoch": 1.1159737417943107, "grad_norm": 9.125, "learning_rate": 6.281911013858498e-06, "loss": 1.023, "mean_token_accuracy": 0.7010883867740632, "num_tokens": 5186194.0, "step": 2040 }, { "epoch": 1.1214442013129102, "grad_norm": 10.0625, "learning_rate": 6.263676148796499e-06, "loss": 1.0373, "mean_token_accuracy": 0.694850617647171, "num_tokens": 5211467.0, "step": 2050 }, { "epoch": 1.12691466083151, "grad_norm": 9.375, "learning_rate": 6.2454412837345015e-06, "loss": 1.023, "mean_token_accuracy": 0.6988016486167907, "num_tokens": 5236817.0, "step": 2060 }, { "epoch": 1.1323851203501094, "grad_norm": 9.6875, "learning_rate": 6.227206418672502e-06, "loss": 1.0054, "mean_token_accuracy": 0.704439902305603, "num_tokens": 5262238.0, "step": 2070 }, { "epoch": 1.1378555798687089, "grad_norm": 9.6875, "learning_rate": 6.208971553610503e-06, "loss": 1.0176, "mean_token_accuracy": 0.6989111959934234, "num_tokens": 5287171.0, "step": 2080 }, { "epoch": 1.1433260393873086, "grad_norm": 9.4375, "learning_rate": 6.1907366885485055e-06, "loss": 1.0479, "mean_token_accuracy": 0.6994122922420501, "num_tokens": 5312851.0, "step": 2090 }, { "epoch": 1.148796498905908, "grad_norm": 10.0, "learning_rate": 6.172501823486506e-06, "loss": 1.0211, "mean_token_accuracy": 0.7004438996315002, "num_tokens": 5338784.0, "step": 2100 }, { "epoch": 1.1542669584245075, "grad_norm": 10.0, "learning_rate": 6.154266958424509e-06, "loss": 1.0014, "mean_token_accuracy": 0.7057562172412872, "num_tokens": 5364113.0, "step": 2110 }, { "epoch": 1.1597374179431073, "grad_norm": 9.9375, "learning_rate": 6.136032093362509e-06, "loss": 1.0, "mean_token_accuracy": 0.7027003407478333, "num_tokens": 5389799.0, "step": 2120 }, { "epoch": 1.1652078774617067, "grad_norm": 10.0, "learning_rate": 6.117797228300512e-06, "loss": 1.0171, "mean_token_accuracy": 0.7003865003585815, "num_tokens": 5415045.0, "step": 2130 }, { "epoch": 1.1706783369803064, "grad_norm": 9.75, "learning_rate": 6.099562363238513e-06, "loss": 1.0371, "mean_token_accuracy": 0.6990915298461914, "num_tokens": 5440323.0, "step": 2140 }, { "epoch": 1.176148796498906, "grad_norm": 9.9375, "learning_rate": 6.081327498176513e-06, "loss": 1.0348, "mean_token_accuracy": 0.7001543581485749, "num_tokens": 5465572.0, "step": 2150 }, { "epoch": 1.1816192560175054, "grad_norm": 9.8125, "learning_rate": 6.063092633114516e-06, "loss": 1.0207, "mean_token_accuracy": 0.6994670450687408, "num_tokens": 5491158.0, "step": 2160 }, { "epoch": 1.187089715536105, "grad_norm": 9.875, "learning_rate": 6.0448577680525165e-06, "loss": 1.0209, "mean_token_accuracy": 0.700932627916336, "num_tokens": 5516927.0, "step": 2170 }, { "epoch": 1.1925601750547046, "grad_norm": 9.9375, "learning_rate": 6.026622902990519e-06, "loss": 1.0589, "mean_token_accuracy": 0.6895469307899476, "num_tokens": 5542251.0, "step": 2180 }, { "epoch": 1.1980306345733043, "grad_norm": 10.375, "learning_rate": 6.00838803792852e-06, "loss": 1.0421, "mean_token_accuracy": 0.6975519716739654, "num_tokens": 5567438.0, "step": 2190 }, { "epoch": 1.2035010940919038, "grad_norm": 10.1875, "learning_rate": 5.9901531728665204e-06, "loss": 1.0358, "mean_token_accuracy": 0.6961750984191895, "num_tokens": 5592726.0, "step": 2200 }, { "epoch": 1.2089715536105032, "grad_norm": 9.4375, "learning_rate": 5.971918307804523e-06, "loss": 1.0394, "mean_token_accuracy": 0.6967484354972839, "num_tokens": 5618018.0, "step": 2210 }, { "epoch": 1.214442013129103, "grad_norm": 9.5625, "learning_rate": 5.953683442742524e-06, "loss": 1.0465, "mean_token_accuracy": 0.6956069469451904, "num_tokens": 5643607.0, "step": 2220 }, { "epoch": 1.2199124726477024, "grad_norm": 9.8125, "learning_rate": 5.935448577680526e-06, "loss": 1.0473, "mean_token_accuracy": 0.6949532628059387, "num_tokens": 5668817.0, "step": 2230 }, { "epoch": 1.225382932166302, "grad_norm": 9.625, "learning_rate": 5.917213712618527e-06, "loss": 0.9865, "mean_token_accuracy": 0.7074730455875397, "num_tokens": 5694321.0, "step": 2240 }, { "epoch": 1.2308533916849016, "grad_norm": 10.0, "learning_rate": 5.898978847556529e-06, "loss": 1.039, "mean_token_accuracy": 0.6975044906139374, "num_tokens": 5719832.0, "step": 2250 }, { "epoch": 1.236323851203501, "grad_norm": 9.875, "learning_rate": 5.88074398249453e-06, "loss": 1.0395, "mean_token_accuracy": 0.6937489449977875, "num_tokens": 5745267.0, "step": 2260 }, { "epoch": 1.2417943107221006, "grad_norm": 9.4375, "learning_rate": 5.862509117432531e-06, "loss": 1.0194, "mean_token_accuracy": 0.6990994691848755, "num_tokens": 5770849.0, "step": 2270 }, { "epoch": 1.2472647702407003, "grad_norm": 10.1875, "learning_rate": 5.844274252370533e-06, "loss": 1.0543, "mean_token_accuracy": 0.6901531100273133, "num_tokens": 5795693.0, "step": 2280 }, { "epoch": 1.2527352297592997, "grad_norm": 9.8125, "learning_rate": 5.826039387308534e-06, "loss": 1.0315, "mean_token_accuracy": 0.7007197082042694, "num_tokens": 5820987.0, "step": 2290 }, { "epoch": 1.2582056892778994, "grad_norm": 10.125, "learning_rate": 5.807804522246536e-06, "loss": 1.0334, "mean_token_accuracy": 0.6947225153446197, "num_tokens": 5846512.0, "step": 2300 }, { "epoch": 1.263676148796499, "grad_norm": 10.5, "learning_rate": 5.789569657184537e-06, "loss": 1.0451, "mean_token_accuracy": 0.6949298202991485, "num_tokens": 5872260.0, "step": 2310 }, { "epoch": 1.2691466083150984, "grad_norm": 10.0, "learning_rate": 5.771334792122538e-06, "loss": 1.0218, "mean_token_accuracy": 0.7000970780849457, "num_tokens": 5897464.0, "step": 2320 }, { "epoch": 1.274617067833698, "grad_norm": 10.0625, "learning_rate": 5.75309992706054e-06, "loss": 0.9975, "mean_token_accuracy": 0.7064898908138275, "num_tokens": 5922759.0, "step": 2330 }, { "epoch": 1.2800875273522976, "grad_norm": 10.5, "learning_rate": 5.734865061998541e-06, "loss": 1.0501, "mean_token_accuracy": 0.693008977174759, "num_tokens": 5948112.0, "step": 2340 }, { "epoch": 1.2855579868708973, "grad_norm": 10.5, "learning_rate": 5.7166301969365435e-06, "loss": 1.0517, "mean_token_accuracy": 0.6909720063209533, "num_tokens": 5973664.0, "step": 2350 }, { "epoch": 1.2910284463894968, "grad_norm": 9.5625, "learning_rate": 5.698395331874544e-06, "loss": 1.0002, "mean_token_accuracy": 0.7026338756084443, "num_tokens": 5999362.0, "step": 2360 }, { "epoch": 1.2964989059080962, "grad_norm": 9.875, "learning_rate": 5.680160466812547e-06, "loss": 1.0145, "mean_token_accuracy": 0.7004771769046784, "num_tokens": 6025166.0, "step": 2370 }, { "epoch": 1.3019693654266957, "grad_norm": 10.0, "learning_rate": 5.661925601750547e-06, "loss": 1.01, "mean_token_accuracy": 0.7036912262439727, "num_tokens": 6050665.0, "step": 2380 }, { "epoch": 1.3074398249452954, "grad_norm": 10.25, "learning_rate": 5.643690736688548e-06, "loss": 1.027, "mean_token_accuracy": 0.7006226062774659, "num_tokens": 6075993.0, "step": 2390 }, { "epoch": 1.312910284463895, "grad_norm": 10.3125, "learning_rate": 5.6254558716265506e-06, "loss": 0.9982, "mean_token_accuracy": 0.7021169304847718, "num_tokens": 6101165.0, "step": 2400 }, { "epoch": 1.3183807439824946, "grad_norm": 9.75, "learning_rate": 5.607221006564551e-06, "loss": 1.0364, "mean_token_accuracy": 0.6955066680908203, "num_tokens": 6126628.0, "step": 2410 }, { "epoch": 1.323851203501094, "grad_norm": 10.5, "learning_rate": 5.588986141502554e-06, "loss": 1.0552, "mean_token_accuracy": 0.6929129481315612, "num_tokens": 6152116.0, "step": 2420 }, { "epoch": 1.3293216630196936, "grad_norm": 9.8125, "learning_rate": 5.5707512764405545e-06, "loss": 1.0342, "mean_token_accuracy": 0.6941097199916839, "num_tokens": 6177489.0, "step": 2430 }, { "epoch": 1.3347921225382933, "grad_norm": 9.875, "learning_rate": 5.552516411378557e-06, "loss": 1.039, "mean_token_accuracy": 0.697142231464386, "num_tokens": 6202873.0, "step": 2440 }, { "epoch": 1.3402625820568927, "grad_norm": 9.375, "learning_rate": 5.534281546316558e-06, "loss": 1.0287, "mean_token_accuracy": 0.6986703455448151, "num_tokens": 6228667.0, "step": 2450 }, { "epoch": 1.3457330415754925, "grad_norm": 9.75, "learning_rate": 5.516046681254558e-06, "loss": 1.0121, "mean_token_accuracy": 0.7000769674777985, "num_tokens": 6253923.0, "step": 2460 }, { "epoch": 1.351203501094092, "grad_norm": 9.75, "learning_rate": 5.497811816192561e-06, "loss": 1.011, "mean_token_accuracy": 0.7024608731269837, "num_tokens": 6279341.0, "step": 2470 }, { "epoch": 1.3566739606126914, "grad_norm": 9.6875, "learning_rate": 5.479576951130562e-06, "loss": 1.0576, "mean_token_accuracy": 0.6930296897888184, "num_tokens": 6304966.0, "step": 2480 }, { "epoch": 1.3621444201312911, "grad_norm": 10.5, "learning_rate": 5.461342086068564e-06, "loss": 0.9841, "mean_token_accuracy": 0.7117383360862732, "num_tokens": 6330323.0, "step": 2490 }, { "epoch": 1.3676148796498906, "grad_norm": 9.4375, "learning_rate": 5.443107221006565e-06, "loss": 1.0234, "mean_token_accuracy": 0.697024530172348, "num_tokens": 6356015.0, "step": 2500 }, { "epoch": 1.37308533916849, "grad_norm": 10.0, "learning_rate": 5.424872355944566e-06, "loss": 1.0141, "mean_token_accuracy": 0.7016679644584656, "num_tokens": 6381515.0, "step": 2510 }, { "epoch": 1.3785557986870898, "grad_norm": 9.625, "learning_rate": 5.406637490882568e-06, "loss": 1.038, "mean_token_accuracy": 0.6987586081027984, "num_tokens": 6407329.0, "step": 2520 }, { "epoch": 1.3840262582056893, "grad_norm": 10.0625, "learning_rate": 5.388402625820569e-06, "loss": 1.0408, "mean_token_accuracy": 0.6964347183704376, "num_tokens": 6433411.0, "step": 2530 }, { "epoch": 1.3894967177242887, "grad_norm": 9.75, "learning_rate": 5.370167760758571e-06, "loss": 1.0198, "mean_token_accuracy": 0.7017770171165466, "num_tokens": 6459012.0, "step": 2540 }, { "epoch": 1.3949671772428884, "grad_norm": 9.9375, "learning_rate": 5.351932895696572e-06, "loss": 1.0705, "mean_token_accuracy": 0.6890166878700257, "num_tokens": 6484324.0, "step": 2550 }, { "epoch": 1.400437636761488, "grad_norm": 9.5625, "learning_rate": 5.333698030634574e-06, "loss": 1.0185, "mean_token_accuracy": 0.7025589048862457, "num_tokens": 6509536.0, "step": 2560 }, { "epoch": 1.4059080962800876, "grad_norm": 10.1875, "learning_rate": 5.315463165572575e-06, "loss": 1.0356, "mean_token_accuracy": 0.7018558621406555, "num_tokens": 6534962.0, "step": 2570 }, { "epoch": 1.411378555798687, "grad_norm": 9.3125, "learning_rate": 5.297228300510576e-06, "loss": 1.0416, "mean_token_accuracy": 0.6961363673210144, "num_tokens": 6560702.0, "step": 2580 }, { "epoch": 1.4168490153172866, "grad_norm": 10.4375, "learning_rate": 5.278993435448578e-06, "loss": 1.0248, "mean_token_accuracy": 0.7003186762332916, "num_tokens": 6586229.0, "step": 2590 }, { "epoch": 1.4223194748358863, "grad_norm": 10.125, "learning_rate": 5.260758570386579e-06, "loss": 1.0435, "mean_token_accuracy": 0.6960846245288849, "num_tokens": 6611544.0, "step": 2600 }, { "epoch": 1.4277899343544858, "grad_norm": 9.8125, "learning_rate": 5.2425237053245814e-06, "loss": 1.0204, "mean_token_accuracy": 0.7010906517505646, "num_tokens": 6636774.0, "step": 2610 }, { "epoch": 1.4332603938730855, "grad_norm": 10.25, "learning_rate": 5.224288840262582e-06, "loss": 0.9894, "mean_token_accuracy": 0.7089883327484131, "num_tokens": 6661874.0, "step": 2620 }, { "epoch": 1.438730853391685, "grad_norm": 10.3125, "learning_rate": 5.206053975200584e-06, "loss": 1.0123, "mean_token_accuracy": 0.7034552216529846, "num_tokens": 6686911.0, "step": 2630 }, { "epoch": 1.4442013129102844, "grad_norm": 10.375, "learning_rate": 5.187819110138585e-06, "loss": 1.0494, "mean_token_accuracy": 0.6926229059696197, "num_tokens": 6712162.0, "step": 2640 }, { "epoch": 1.449671772428884, "grad_norm": 10.0625, "learning_rate": 5.169584245076586e-06, "loss": 1.006, "mean_token_accuracy": 0.7052405416965485, "num_tokens": 6737129.0, "step": 2650 }, { "epoch": 1.4551422319474836, "grad_norm": 10.25, "learning_rate": 5.1513493800145886e-06, "loss": 1.0193, "mean_token_accuracy": 0.7007757365703583, "num_tokens": 6762564.0, "step": 2660 }, { "epoch": 1.460612691466083, "grad_norm": 9.875, "learning_rate": 5.133114514952589e-06, "loss": 1.0164, "mean_token_accuracy": 0.7008088171482086, "num_tokens": 6788114.0, "step": 2670 }, { "epoch": 1.4660831509846828, "grad_norm": 10.6875, "learning_rate": 5.114879649890592e-06, "loss": 1.0335, "mean_token_accuracy": 0.6988726854324341, "num_tokens": 6813529.0, "step": 2680 }, { "epoch": 1.4715536105032823, "grad_norm": 9.9375, "learning_rate": 5.0966447848285925e-06, "loss": 1.0457, "mean_token_accuracy": 0.697508692741394, "num_tokens": 6839250.0, "step": 2690 }, { "epoch": 1.4770240700218817, "grad_norm": 9.75, "learning_rate": 5.078409919766594e-06, "loss": 1.0272, "mean_token_accuracy": 0.700483775138855, "num_tokens": 6864969.0, "step": 2700 }, { "epoch": 1.4824945295404814, "grad_norm": 10.5625, "learning_rate": 5.060175054704596e-06, "loss": 1.0065, "mean_token_accuracy": 0.7078005909919739, "num_tokens": 6890107.0, "step": 2710 }, { "epoch": 1.487964989059081, "grad_norm": 10.3125, "learning_rate": 5.041940189642596e-06, "loss": 1.0484, "mean_token_accuracy": 0.6926610350608826, "num_tokens": 6915446.0, "step": 2720 }, { "epoch": 1.4934354485776806, "grad_norm": 9.875, "learning_rate": 5.023705324580599e-06, "loss": 0.9933, "mean_token_accuracy": 0.7024588465690613, "num_tokens": 6940768.0, "step": 2730 }, { "epoch": 1.49890590809628, "grad_norm": 10.0625, "learning_rate": 5.0054704595186e-06, "loss": 1.013, "mean_token_accuracy": 0.7026680052280426, "num_tokens": 6966595.0, "step": 2740 }, { "epoch": 1.5043763676148796, "grad_norm": 10.6875, "learning_rate": 4.987235594456601e-06, "loss": 1.0465, "mean_token_accuracy": 0.696220201253891, "num_tokens": 6991472.0, "step": 2750 }, { "epoch": 1.509846827133479, "grad_norm": 10.0625, "learning_rate": 4.969000729394603e-06, "loss": 1.0226, "mean_token_accuracy": 0.7012458741664886, "num_tokens": 7016868.0, "step": 2760 }, { "epoch": 1.5153172866520788, "grad_norm": 9.5625, "learning_rate": 4.950765864332604e-06, "loss": 1.0363, "mean_token_accuracy": 0.6987705647945404, "num_tokens": 7042904.0, "step": 2770 }, { "epoch": 1.5207877461706785, "grad_norm": 10.0, "learning_rate": 4.932530999270605e-06, "loss": 1.0403, "mean_token_accuracy": 0.6974053025245667, "num_tokens": 7068329.0, "step": 2780 }, { "epoch": 1.526258205689278, "grad_norm": 10.5625, "learning_rate": 4.914296134208607e-06, "loss": 1.0297, "mean_token_accuracy": 0.6953822433948517, "num_tokens": 7093465.0, "step": 2790 }, { "epoch": 1.5317286652078774, "grad_norm": 9.6875, "learning_rate": 4.896061269146608e-06, "loss": 1.0098, "mean_token_accuracy": 0.7032950639724731, "num_tokens": 7119262.0, "step": 2800 }, { "epoch": 1.537199124726477, "grad_norm": 10.0625, "learning_rate": 4.87782640408461e-06, "loss": 1.0469, "mean_token_accuracy": 0.6940477907657623, "num_tokens": 7144654.0, "step": 2810 }, { "epoch": 1.5426695842450766, "grad_norm": 9.875, "learning_rate": 4.8595915390226115e-06, "loss": 0.9971, "mean_token_accuracy": 0.706735360622406, "num_tokens": 7170258.0, "step": 2820 }, { "epoch": 1.5481400437636763, "grad_norm": 9.75, "learning_rate": 4.841356673960613e-06, "loss": 1.0352, "mean_token_accuracy": 0.6959325015544892, "num_tokens": 7195794.0, "step": 2830 }, { "epoch": 1.5536105032822758, "grad_norm": 9.875, "learning_rate": 4.823121808898615e-06, "loss": 1.015, "mean_token_accuracy": 0.7041930437088013, "num_tokens": 7221353.0, "step": 2840 }, { "epoch": 1.5590809628008753, "grad_norm": 9.5625, "learning_rate": 4.804886943836615e-06, "loss": 0.9925, "mean_token_accuracy": 0.7065845012664795, "num_tokens": 7246656.0, "step": 2850 }, { "epoch": 1.5645514223194747, "grad_norm": 10.0625, "learning_rate": 4.786652078774617e-06, "loss": 1.0202, "mean_token_accuracy": 0.7003905832767486, "num_tokens": 7271876.0, "step": 2860 }, { "epoch": 1.5700218818380745, "grad_norm": 10.0, "learning_rate": 4.768417213712619e-06, "loss": 1.0114, "mean_token_accuracy": 0.7012807905673981, "num_tokens": 7296834.0, "step": 2870 }, { "epoch": 1.575492341356674, "grad_norm": 9.875, "learning_rate": 4.75018234865062e-06, "loss": 1.0231, "mean_token_accuracy": 0.7027694880962372, "num_tokens": 7321984.0, "step": 2880 }, { "epoch": 1.5809628008752736, "grad_norm": 10.0625, "learning_rate": 4.731947483588622e-06, "loss": 1.0165, "mean_token_accuracy": 0.7003744542598724, "num_tokens": 7347322.0, "step": 2890 }, { "epoch": 1.5864332603938731, "grad_norm": 9.9375, "learning_rate": 4.713712618526623e-06, "loss": 1.0458, "mean_token_accuracy": 0.6967971920967102, "num_tokens": 7372753.0, "step": 2900 }, { "epoch": 1.5919037199124726, "grad_norm": 10.0625, "learning_rate": 4.695477753464625e-06, "loss": 1.0454, "mean_token_accuracy": 0.6949516236782074, "num_tokens": 7398016.0, "step": 2910 }, { "epoch": 1.597374179431072, "grad_norm": 10.0625, "learning_rate": 4.677242888402626e-06, "loss": 1.0547, "mean_token_accuracy": 0.6950974285602569, "num_tokens": 7423781.0, "step": 2920 }, { "epoch": 1.6028446389496718, "grad_norm": 9.375, "learning_rate": 4.659008023340627e-06, "loss": 1.0073, "mean_token_accuracy": 0.7060118734836578, "num_tokens": 7449042.0, "step": 2930 }, { "epoch": 1.6083150984682715, "grad_norm": 9.5625, "learning_rate": 4.640773158278629e-06, "loss": 1.0522, "mean_token_accuracy": 0.6925396621227264, "num_tokens": 7474666.0, "step": 2940 }, { "epoch": 1.613785557986871, "grad_norm": 10.625, "learning_rate": 4.6225382932166305e-06, "loss": 1.0214, "mean_token_accuracy": 0.698899906873703, "num_tokens": 7500009.0, "step": 2950 }, { "epoch": 1.6192560175054704, "grad_norm": 10.1875, "learning_rate": 4.604303428154632e-06, "loss": 1.002, "mean_token_accuracy": 0.7080122113227845, "num_tokens": 7524900.0, "step": 2960 }, { "epoch": 1.62472647702407, "grad_norm": 9.875, "learning_rate": 4.586068563092634e-06, "loss": 1.0137, "mean_token_accuracy": 0.7044197976589203, "num_tokens": 7549885.0, "step": 2970 }, { "epoch": 1.6301969365426696, "grad_norm": 10.0625, "learning_rate": 4.567833698030634e-06, "loss": 1.0224, "mean_token_accuracy": 0.702244508266449, "num_tokens": 7575873.0, "step": 2980 }, { "epoch": 1.635667396061269, "grad_norm": 10.0, "learning_rate": 4.549598832968636e-06, "loss": 1.0231, "mean_token_accuracy": 0.7018688678741455, "num_tokens": 7601140.0, "step": 2990 }, { "epoch": 1.6411378555798688, "grad_norm": 10.375, "learning_rate": 4.531363967906638e-06, "loss": 1.0168, "mean_token_accuracy": 0.7000043153762817, "num_tokens": 7626744.0, "step": 3000 }, { "epoch": 1.6466083150984683, "grad_norm": 10.125, "learning_rate": 4.513129102844639e-06, "loss": 1.0382, "mean_token_accuracy": 0.6963400661945343, "num_tokens": 7652205.0, "step": 3010 }, { "epoch": 1.6520787746170678, "grad_norm": 9.875, "learning_rate": 4.494894237782641e-06, "loss": 1.0311, "mean_token_accuracy": 0.6954720914363861, "num_tokens": 7677302.0, "step": 3020 }, { "epoch": 1.6575492341356672, "grad_norm": 9.6875, "learning_rate": 4.476659372720642e-06, "loss": 1.0238, "mean_token_accuracy": 0.7008414506912232, "num_tokens": 7702641.0, "step": 3030 }, { "epoch": 1.663019693654267, "grad_norm": 10.6875, "learning_rate": 4.458424507658644e-06, "loss": 1.0455, "mean_token_accuracy": 0.6952131390571594, "num_tokens": 7728092.0, "step": 3040 }, { "epoch": 1.6684901531728666, "grad_norm": 10.5625, "learning_rate": 4.440189642596645e-06, "loss": 1.0563, "mean_token_accuracy": 0.6946309566497803, "num_tokens": 7753418.0, "step": 3050 }, { "epoch": 1.6739606126914661, "grad_norm": 9.875, "learning_rate": 4.421954777534646e-06, "loss": 1.0137, "mean_token_accuracy": 0.7010623216629028, "num_tokens": 7778914.0, "step": 3060 }, { "epoch": 1.6794310722100656, "grad_norm": 9.875, "learning_rate": 4.403719912472648e-06, "loss": 1.0461, "mean_token_accuracy": 0.694762361049652, "num_tokens": 7803962.0, "step": 3070 }, { "epoch": 1.684901531728665, "grad_norm": 10.0625, "learning_rate": 4.3854850474106495e-06, "loss": 1.0144, "mean_token_accuracy": 0.6980993211269378, "num_tokens": 7829041.0, "step": 3080 }, { "epoch": 1.6903719912472648, "grad_norm": 9.625, "learning_rate": 4.367250182348651e-06, "loss": 1.0323, "mean_token_accuracy": 0.6983445227146149, "num_tokens": 7854555.0, "step": 3090 }, { "epoch": 1.6958424507658645, "grad_norm": 9.75, "learning_rate": 4.349015317286653e-06, "loss": 1.0601, "mean_token_accuracy": 0.6918485045433045, "num_tokens": 7880371.0, "step": 3100 }, { "epoch": 1.701312910284464, "grad_norm": 9.625, "learning_rate": 4.330780452224654e-06, "loss": 1.0337, "mean_token_accuracy": 0.6994279623031616, "num_tokens": 7905824.0, "step": 3110 }, { "epoch": 1.7067833698030634, "grad_norm": 9.9375, "learning_rate": 4.312545587162655e-06, "loss": 1.0253, "mean_token_accuracy": 0.6975195348262787, "num_tokens": 7931361.0, "step": 3120 }, { "epoch": 1.712253829321663, "grad_norm": 10.625, "learning_rate": 4.2943107221006566e-06, "loss": 1.0498, "mean_token_accuracy": 0.6924622654914856, "num_tokens": 7956719.0, "step": 3130 }, { "epoch": 1.7177242888402626, "grad_norm": 10.25, "learning_rate": 4.276075857038658e-06, "loss": 1.0236, "mean_token_accuracy": 0.7029446244239808, "num_tokens": 7982503.0, "step": 3140 }, { "epoch": 1.723194748358862, "grad_norm": 10.0625, "learning_rate": 4.25784099197666e-06, "loss": 1.0426, "mean_token_accuracy": 0.6945299327373504, "num_tokens": 8008037.0, "step": 3150 }, { "epoch": 1.7286652078774618, "grad_norm": 10.125, "learning_rate": 4.239606126914661e-06, "loss": 1.0232, "mean_token_accuracy": 0.6983104586601258, "num_tokens": 8033255.0, "step": 3160 }, { "epoch": 1.7341356673960613, "grad_norm": 10.25, "learning_rate": 4.221371261852663e-06, "loss": 1.0267, "mean_token_accuracy": 0.7013046503067016, "num_tokens": 8058442.0, "step": 3170 }, { "epoch": 1.7396061269146608, "grad_norm": 10.25, "learning_rate": 4.203136396790664e-06, "loss": 1.0473, "mean_token_accuracy": 0.6944118142127991, "num_tokens": 8084197.0, "step": 3180 }, { "epoch": 1.7450765864332602, "grad_norm": 9.9375, "learning_rate": 4.184901531728665e-06, "loss": 1.0403, "mean_token_accuracy": 0.6959960579872131, "num_tokens": 8109669.0, "step": 3190 }, { "epoch": 1.75054704595186, "grad_norm": 9.5625, "learning_rate": 4.166666666666667e-06, "loss": 1.0224, "mean_token_accuracy": 0.7018200099468231, "num_tokens": 8135074.0, "step": 3200 }, { "epoch": 1.7560175054704596, "grad_norm": 9.8125, "learning_rate": 4.1484318016046685e-06, "loss": 1.0563, "mean_token_accuracy": 0.6930766403675079, "num_tokens": 8160391.0, "step": 3210 }, { "epoch": 1.7614879649890591, "grad_norm": 10.0625, "learning_rate": 4.13019693654267e-06, "loss": 1.029, "mean_token_accuracy": 0.6969853162765502, "num_tokens": 8185889.0, "step": 3220 }, { "epoch": 1.7669584245076586, "grad_norm": 10.3125, "learning_rate": 4.111962071480672e-06, "loss": 1.0038, "mean_token_accuracy": 0.7046498537063599, "num_tokens": 8211618.0, "step": 3230 }, { "epoch": 1.772428884026258, "grad_norm": 10.25, "learning_rate": 4.093727206418673e-06, "loss": 1.0284, "mean_token_accuracy": 0.7003251373767853, "num_tokens": 8236773.0, "step": 3240 }, { "epoch": 1.7778993435448578, "grad_norm": 10.5, "learning_rate": 4.075492341356674e-06, "loss": 1.0244, "mean_token_accuracy": 0.6951791286468506, "num_tokens": 8262043.0, "step": 3250 }, { "epoch": 1.7833698030634575, "grad_norm": 10.25, "learning_rate": 4.0572574762946756e-06, "loss": 1.0271, "mean_token_accuracy": 0.6958181917667389, "num_tokens": 8288205.0, "step": 3260 }, { "epoch": 1.788840262582057, "grad_norm": 10.375, "learning_rate": 4.039022611232677e-06, "loss": 1.0131, "mean_token_accuracy": 0.7018442392349243, "num_tokens": 8313349.0, "step": 3270 }, { "epoch": 1.7943107221006565, "grad_norm": 10.375, "learning_rate": 4.020787746170679e-06, "loss": 1.0274, "mean_token_accuracy": 0.7001440703868866, "num_tokens": 8338997.0, "step": 3280 }, { "epoch": 1.799781181619256, "grad_norm": 9.9375, "learning_rate": 4.00255288110868e-06, "loss": 1.0354, "mean_token_accuracy": 0.6997708559036255, "num_tokens": 8364446.0, "step": 3290 }, { "epoch": 1.8052516411378556, "grad_norm": 10.75, "learning_rate": 3.984318016046682e-06, "loss": 1.0228, "mean_token_accuracy": 0.7024441123008728, "num_tokens": 8390005.0, "step": 3300 }, { "epoch": 1.8107221006564551, "grad_norm": 10.0625, "learning_rate": 3.9660831509846835e-06, "loss": 1.0369, "mean_token_accuracy": 0.6996273756027221, "num_tokens": 8415557.0, "step": 3310 }, { "epoch": 1.8161925601750548, "grad_norm": 9.6875, "learning_rate": 3.947848285922684e-06, "loss": 1.0282, "mean_token_accuracy": 0.6991647839546203, "num_tokens": 8441067.0, "step": 3320 }, { "epoch": 1.8216630196936543, "grad_norm": 10.0625, "learning_rate": 3.929613420860686e-06, "loss": 1.0441, "mean_token_accuracy": 0.6979943752288819, "num_tokens": 8466748.0, "step": 3330 }, { "epoch": 1.8271334792122538, "grad_norm": 10.8125, "learning_rate": 3.9113785557986875e-06, "loss": 1.0425, "mean_token_accuracy": 0.6951029121875762, "num_tokens": 8491752.0, "step": 3340 }, { "epoch": 1.8326039387308533, "grad_norm": 10.0625, "learning_rate": 3.893143690736689e-06, "loss": 1.0116, "mean_token_accuracy": 0.7038478553295135, "num_tokens": 8517225.0, "step": 3350 }, { "epoch": 1.838074398249453, "grad_norm": 9.4375, "learning_rate": 3.874908825674691e-06, "loss": 1.0348, "mean_token_accuracy": 0.6978711068630219, "num_tokens": 8542666.0, "step": 3360 }, { "epoch": 1.8435448577680527, "grad_norm": 9.875, "learning_rate": 3.856673960612692e-06, "loss": 1.0388, "mean_token_accuracy": 0.6979636132717133, "num_tokens": 8568498.0, "step": 3370 }, { "epoch": 1.8490153172866521, "grad_norm": 9.625, "learning_rate": 3.838439095550693e-06, "loss": 1.0077, "mean_token_accuracy": 0.7020620405673981, "num_tokens": 8593770.0, "step": 3380 }, { "epoch": 1.8544857768052516, "grad_norm": 10.625, "learning_rate": 3.8202042304886946e-06, "loss": 1.0382, "mean_token_accuracy": 0.696090292930603, "num_tokens": 8619623.0, "step": 3390 }, { "epoch": 1.859956236323851, "grad_norm": 11.4375, "learning_rate": 3.8019693654266957e-06, "loss": 1.0215, "mean_token_accuracy": 0.7019686102867126, "num_tokens": 8644852.0, "step": 3400 }, { "epoch": 1.8654266958424508, "grad_norm": 10.75, "learning_rate": 3.7837345003646973e-06, "loss": 1.0507, "mean_token_accuracy": 0.695297920703888, "num_tokens": 8670333.0, "step": 3410 }, { "epoch": 1.8708971553610503, "grad_norm": 9.8125, "learning_rate": 3.765499635302699e-06, "loss": 1.0505, "mean_token_accuracy": 0.6986589133739471, "num_tokens": 8695967.0, "step": 3420 }, { "epoch": 1.87636761487965, "grad_norm": 10.625, "learning_rate": 3.7472647702407005e-06, "loss": 1.0098, "mean_token_accuracy": 0.7004635334014893, "num_tokens": 8721252.0, "step": 3430 }, { "epoch": 1.8818380743982495, "grad_norm": 10.1875, "learning_rate": 3.729029905178702e-06, "loss": 1.0037, "mean_token_accuracy": 0.7041628360748291, "num_tokens": 8746944.0, "step": 3440 }, { "epoch": 1.887308533916849, "grad_norm": 10.4375, "learning_rate": 3.7107950401167037e-06, "loss": 1.0375, "mean_token_accuracy": 0.6980779647827149, "num_tokens": 8771892.0, "step": 3450 }, { "epoch": 1.8927789934354484, "grad_norm": 10.0625, "learning_rate": 3.692560175054705e-06, "loss": 0.9951, "mean_token_accuracy": 0.7054601550102234, "num_tokens": 8797131.0, "step": 3460 }, { "epoch": 1.8982494529540481, "grad_norm": 10.5, "learning_rate": 3.674325309992706e-06, "loss": 0.9912, "mean_token_accuracy": 0.7073158025741577, "num_tokens": 8822466.0, "step": 3470 }, { "epoch": 1.9037199124726478, "grad_norm": 9.4375, "learning_rate": 3.6560904449307076e-06, "loss": 1.0059, "mean_token_accuracy": 0.7047175526618957, "num_tokens": 8847987.0, "step": 3480 }, { "epoch": 1.9091903719912473, "grad_norm": 10.3125, "learning_rate": 3.637855579868709e-06, "loss": 1.0167, "mean_token_accuracy": 0.7027063131332397, "num_tokens": 8873940.0, "step": 3490 }, { "epoch": 1.9146608315098468, "grad_norm": 9.8125, "learning_rate": 3.619620714806711e-06, "loss": 1.0089, "mean_token_accuracy": 0.7014587640762329, "num_tokens": 8899633.0, "step": 3500 }, { "epoch": 1.9201312910284463, "grad_norm": 9.5, "learning_rate": 3.6013858497447124e-06, "loss": 1.0017, "mean_token_accuracy": 0.7041820049285888, "num_tokens": 8924575.0, "step": 3510 }, { "epoch": 1.925601750547046, "grad_norm": 10.375, "learning_rate": 3.5831509846827136e-06, "loss": 1.0165, "mean_token_accuracy": 0.6998761057853699, "num_tokens": 8950150.0, "step": 3520 }, { "epoch": 1.9310722100656457, "grad_norm": 10.5, "learning_rate": 3.5649161196207147e-06, "loss": 0.9949, "mean_token_accuracy": 0.7060512363910675, "num_tokens": 8975532.0, "step": 3530 }, { "epoch": 1.9365426695842451, "grad_norm": 10.0625, "learning_rate": 3.5466812545587163e-06, "loss": 1.026, "mean_token_accuracy": 0.7015553712844849, "num_tokens": 9000990.0, "step": 3540 }, { "epoch": 1.9420131291028446, "grad_norm": 9.8125, "learning_rate": 3.528446389496718e-06, "loss": 1.0334, "mean_token_accuracy": 0.7001884341239929, "num_tokens": 9026718.0, "step": 3550 }, { "epoch": 1.947483588621444, "grad_norm": 10.1875, "learning_rate": 3.5102115244347195e-06, "loss": 1.0248, "mean_token_accuracy": 0.6993842363357544, "num_tokens": 9052433.0, "step": 3560 }, { "epoch": 1.9529540481400438, "grad_norm": 9.875, "learning_rate": 3.491976659372721e-06, "loss": 1.0273, "mean_token_accuracy": 0.7003478467464447, "num_tokens": 9077782.0, "step": 3570 }, { "epoch": 1.9584245076586433, "grad_norm": 10.5, "learning_rate": 3.4737417943107223e-06, "loss": 1.0047, "mean_token_accuracy": 0.7034225165843964, "num_tokens": 9103349.0, "step": 3580 }, { "epoch": 1.963894967177243, "grad_norm": 10.0, "learning_rate": 3.455506929248724e-06, "loss": 1.047, "mean_token_accuracy": 0.693518990278244, "num_tokens": 9129033.0, "step": 3590 }, { "epoch": 1.9693654266958425, "grad_norm": 10.25, "learning_rate": 3.437272064186725e-06, "loss": 1.042, "mean_token_accuracy": 0.698636132478714, "num_tokens": 9154512.0, "step": 3600 }, { "epoch": 1.974835886214442, "grad_norm": 10.1875, "learning_rate": 3.4190371991247266e-06, "loss": 1.0232, "mean_token_accuracy": 0.7020216822624207, "num_tokens": 9179746.0, "step": 3610 }, { "epoch": 1.9803063457330414, "grad_norm": 10.5625, "learning_rate": 3.400802334062728e-06, "loss": 1.0277, "mean_token_accuracy": 0.6972903072834015, "num_tokens": 9205381.0, "step": 3620 }, { "epoch": 1.9857768052516411, "grad_norm": 11.0625, "learning_rate": 3.38256746900073e-06, "loss": 1.0172, "mean_token_accuracy": 0.7026154100894928, "num_tokens": 9230261.0, "step": 3630 }, { "epoch": 1.9912472647702408, "grad_norm": 10.1875, "learning_rate": 3.3643326039387314e-06, "loss": 1.01, "mean_token_accuracy": 0.7005713403224945, "num_tokens": 9255736.0, "step": 3640 }, { "epoch": 1.9967177242888403, "grad_norm": 9.8125, "learning_rate": 3.3460977388767325e-06, "loss": 1.0041, "mean_token_accuracy": 0.7037198424339295, "num_tokens": 9281129.0, "step": 3650 }, { "epoch": 2.00218818380744, "grad_norm": 9.875, "learning_rate": 3.327862873814734e-06, "loss": 1.0031, "mean_token_accuracy": 0.7007030785083771, "num_tokens": 9306380.0, "step": 3660 }, { "epoch": 2.0076586433260393, "grad_norm": 9.875, "learning_rate": 3.3096280087527353e-06, "loss": 0.9473, "mean_token_accuracy": 0.717999279499054, "num_tokens": 9331366.0, "step": 3670 }, { "epoch": 2.0131291028446388, "grad_norm": 9.625, "learning_rate": 3.291393143690737e-06, "loss": 0.9626, "mean_token_accuracy": 0.717300283908844, "num_tokens": 9356901.0, "step": 3680 }, { "epoch": 2.0185995623632387, "grad_norm": 10.3125, "learning_rate": 3.2731582786287385e-06, "loss": 0.9602, "mean_token_accuracy": 0.7138874650001525, "num_tokens": 9382234.0, "step": 3690 }, { "epoch": 2.024070021881838, "grad_norm": 9.75, "learning_rate": 3.25492341356674e-06, "loss": 0.9654, "mean_token_accuracy": 0.712285041809082, "num_tokens": 9407437.0, "step": 3700 }, { "epoch": 2.0295404814004376, "grad_norm": 10.0625, "learning_rate": 3.2366885485047412e-06, "loss": 0.9342, "mean_token_accuracy": 0.7209572613239288, "num_tokens": 9432454.0, "step": 3710 }, { "epoch": 2.035010940919037, "grad_norm": 10.75, "learning_rate": 3.218453683442743e-06, "loss": 0.9631, "mean_token_accuracy": 0.7131079971790314, "num_tokens": 9457857.0, "step": 3720 }, { "epoch": 2.0404814004376366, "grad_norm": 10.3125, "learning_rate": 3.200218818380744e-06, "loss": 0.9577, "mean_token_accuracy": 0.7177065372467041, "num_tokens": 9482811.0, "step": 3730 }, { "epoch": 2.0459518599562365, "grad_norm": 10.9375, "learning_rate": 3.1819839533187456e-06, "loss": 0.9614, "mean_token_accuracy": 0.7131213903427124, "num_tokens": 9508186.0, "step": 3740 }, { "epoch": 2.051422319474836, "grad_norm": 10.75, "learning_rate": 3.163749088256747e-06, "loss": 0.9639, "mean_token_accuracy": 0.7165208160877228, "num_tokens": 9533509.0, "step": 3750 }, { "epoch": 2.0568927789934355, "grad_norm": 10.0625, "learning_rate": 3.1455142231947488e-06, "loss": 0.9535, "mean_token_accuracy": 0.7169022679328918, "num_tokens": 9558600.0, "step": 3760 }, { "epoch": 2.062363238512035, "grad_norm": 10.5625, "learning_rate": 3.12727935813275e-06, "loss": 0.9646, "mean_token_accuracy": 0.7135210871696472, "num_tokens": 9584233.0, "step": 3770 }, { "epoch": 2.0678336980306344, "grad_norm": 10.25, "learning_rate": 3.1090444930707515e-06, "loss": 0.9786, "mean_token_accuracy": 0.709892874956131, "num_tokens": 9609789.0, "step": 3780 }, { "epoch": 2.0733041575492344, "grad_norm": 10.375, "learning_rate": 3.090809628008753e-06, "loss": 0.9738, "mean_token_accuracy": 0.7121241211891174, "num_tokens": 9634822.0, "step": 3790 }, { "epoch": 2.078774617067834, "grad_norm": 11.25, "learning_rate": 3.0725747629467543e-06, "loss": 0.9616, "mean_token_accuracy": 0.7132679998874665, "num_tokens": 9660314.0, "step": 3800 }, { "epoch": 2.0842450765864333, "grad_norm": 10.625, "learning_rate": 3.054339897884756e-06, "loss": 0.9579, "mean_token_accuracy": 0.7166781783103943, "num_tokens": 9685937.0, "step": 3810 }, { "epoch": 2.089715536105033, "grad_norm": 10.8125, "learning_rate": 3.0361050328227575e-06, "loss": 0.9741, "mean_token_accuracy": 0.714539396762848, "num_tokens": 9711177.0, "step": 3820 }, { "epoch": 2.0951859956236323, "grad_norm": 11.125, "learning_rate": 3.0178701677607587e-06, "loss": 0.9463, "mean_token_accuracy": 0.7216998755931854, "num_tokens": 9736426.0, "step": 3830 }, { "epoch": 2.1006564551422318, "grad_norm": 10.25, "learning_rate": 2.9996353026987602e-06, "loss": 0.9648, "mean_token_accuracy": 0.7115364730358124, "num_tokens": 9762127.0, "step": 3840 }, { "epoch": 2.1061269146608317, "grad_norm": 10.75, "learning_rate": 2.981400437636762e-06, "loss": 0.9475, "mean_token_accuracy": 0.7189037084579468, "num_tokens": 9787178.0, "step": 3850 }, { "epoch": 2.111597374179431, "grad_norm": 10.5, "learning_rate": 2.9631655725747634e-06, "loss": 0.9537, "mean_token_accuracy": 0.7175340712070465, "num_tokens": 9812717.0, "step": 3860 }, { "epoch": 2.1170678336980306, "grad_norm": 10.1875, "learning_rate": 2.9449307075127646e-06, "loss": 0.9472, "mean_token_accuracy": 0.7212161302566529, "num_tokens": 9837881.0, "step": 3870 }, { "epoch": 2.12253829321663, "grad_norm": 10.3125, "learning_rate": 2.926695842450766e-06, "loss": 0.9608, "mean_token_accuracy": 0.7193179249763488, "num_tokens": 9863317.0, "step": 3880 }, { "epoch": 2.1280087527352296, "grad_norm": 10.5625, "learning_rate": 2.9084609773887674e-06, "loss": 0.951, "mean_token_accuracy": 0.7160651385784149, "num_tokens": 9888868.0, "step": 3890 }, { "epoch": 2.1334792122538295, "grad_norm": 10.25, "learning_rate": 2.890226112326769e-06, "loss": 0.9581, "mean_token_accuracy": 0.716667366027832, "num_tokens": 9914219.0, "step": 3900 }, { "epoch": 2.138949671772429, "grad_norm": 10.1875, "learning_rate": 2.8719912472647705e-06, "loss": 0.9637, "mean_token_accuracy": 0.7133105576038361, "num_tokens": 9939476.0, "step": 3910 }, { "epoch": 2.1444201312910285, "grad_norm": 10.3125, "learning_rate": 2.853756382202772e-06, "loss": 0.9541, "mean_token_accuracy": 0.7155322432518005, "num_tokens": 9964926.0, "step": 3920 }, { "epoch": 2.149890590809628, "grad_norm": 10.4375, "learning_rate": 2.8355215171407733e-06, "loss": 0.9801, "mean_token_accuracy": 0.7113901436328888, "num_tokens": 9990537.0, "step": 3930 }, { "epoch": 2.1553610503282274, "grad_norm": 10.3125, "learning_rate": 2.817286652078775e-06, "loss": 0.9581, "mean_token_accuracy": 0.7181893765926362, "num_tokens": 10016002.0, "step": 3940 }, { "epoch": 2.160831509846827, "grad_norm": 10.4375, "learning_rate": 2.799051787016776e-06, "loss": 0.947, "mean_token_accuracy": 0.7155714333057404, "num_tokens": 10041372.0, "step": 3950 }, { "epoch": 2.166301969365427, "grad_norm": 10.25, "learning_rate": 2.7808169219547776e-06, "loss": 0.9951, "mean_token_accuracy": 0.705439954996109, "num_tokens": 10066639.0, "step": 3960 }, { "epoch": 2.1717724288840263, "grad_norm": 9.9375, "learning_rate": 2.7625820568927792e-06, "loss": 0.9621, "mean_token_accuracy": 0.715647429227829, "num_tokens": 10091974.0, "step": 3970 }, { "epoch": 2.177242888402626, "grad_norm": 10.5, "learning_rate": 2.744347191830781e-06, "loss": 0.9731, "mean_token_accuracy": 0.7148273229598999, "num_tokens": 10117300.0, "step": 3980 }, { "epoch": 2.1827133479212253, "grad_norm": 10.125, "learning_rate": 2.7261123267687824e-06, "loss": 0.9625, "mean_token_accuracy": 0.7132592558860779, "num_tokens": 10142825.0, "step": 3990 }, { "epoch": 2.1881838074398248, "grad_norm": 10.5625, "learning_rate": 2.7078774617067836e-06, "loss": 0.9887, "mean_token_accuracy": 0.7106162488460541, "num_tokens": 10168114.0, "step": 4000 }, { "epoch": 2.1936542669584247, "grad_norm": 9.9375, "learning_rate": 2.6896425966447848e-06, "loss": 0.9496, "mean_token_accuracy": 0.721660703420639, "num_tokens": 10193441.0, "step": 4010 }, { "epoch": 2.199124726477024, "grad_norm": 10.375, "learning_rate": 2.6714077315827863e-06, "loss": 0.9785, "mean_token_accuracy": 0.7132415533065796, "num_tokens": 10219252.0, "step": 4020 }, { "epoch": 2.2045951859956237, "grad_norm": 10.25, "learning_rate": 2.653172866520788e-06, "loss": 0.9639, "mean_token_accuracy": 0.7168880999088287, "num_tokens": 10244918.0, "step": 4030 }, { "epoch": 2.210065645514223, "grad_norm": 10.1875, "learning_rate": 2.6349380014587895e-06, "loss": 0.9507, "mean_token_accuracy": 0.7177555739879609, "num_tokens": 10270446.0, "step": 4040 }, { "epoch": 2.2155361050328226, "grad_norm": 10.4375, "learning_rate": 2.616703136396791e-06, "loss": 0.9464, "mean_token_accuracy": 0.7184527516365051, "num_tokens": 10295965.0, "step": 4050 }, { "epoch": 2.2210065645514225, "grad_norm": 9.8125, "learning_rate": 2.5984682713347927e-06, "loss": 0.9547, "mean_token_accuracy": 0.7186037957668304, "num_tokens": 10321256.0, "step": 4060 }, { "epoch": 2.226477024070022, "grad_norm": 10.8125, "learning_rate": 2.580233406272794e-06, "loss": 0.9507, "mean_token_accuracy": 0.7183585882186889, "num_tokens": 10346406.0, "step": 4070 }, { "epoch": 2.2319474835886215, "grad_norm": 10.5, "learning_rate": 2.561998541210795e-06, "loss": 0.9862, "mean_token_accuracy": 0.710406482219696, "num_tokens": 10371870.0, "step": 4080 }, { "epoch": 2.237417943107221, "grad_norm": 10.375, "learning_rate": 2.5437636761487966e-06, "loss": 0.9938, "mean_token_accuracy": 0.7080509960651398, "num_tokens": 10397552.0, "step": 4090 }, { "epoch": 2.2428884026258205, "grad_norm": 10.375, "learning_rate": 2.5255288110867982e-06, "loss": 0.962, "mean_token_accuracy": 0.7114235162734985, "num_tokens": 10423310.0, "step": 4100 }, { "epoch": 2.24835886214442, "grad_norm": 10.375, "learning_rate": 2.5072939460248e-06, "loss": 0.9784, "mean_token_accuracy": 0.7133292317390442, "num_tokens": 10448405.0, "step": 4110 }, { "epoch": 2.25382932166302, "grad_norm": 9.8125, "learning_rate": 2.489059080962801e-06, "loss": 0.9837, "mean_token_accuracy": 0.7114726364612579, "num_tokens": 10474015.0, "step": 4120 }, { "epoch": 2.2592997811816193, "grad_norm": 10.1875, "learning_rate": 2.4708242159008026e-06, "loss": 0.9765, "mean_token_accuracy": 0.7124983072280884, "num_tokens": 10500122.0, "step": 4130 }, { "epoch": 2.264770240700219, "grad_norm": 11.0, "learning_rate": 2.452589350838804e-06, "loss": 1.0051, "mean_token_accuracy": 0.7033932983875275, "num_tokens": 10525569.0, "step": 4140 }, { "epoch": 2.2702407002188183, "grad_norm": 11.1875, "learning_rate": 2.4343544857768053e-06, "loss": 0.9537, "mean_token_accuracy": 0.7190083563327789, "num_tokens": 10550809.0, "step": 4150 }, { "epoch": 2.2757111597374178, "grad_norm": 11.0, "learning_rate": 2.416119620714807e-06, "loss": 0.9574, "mean_token_accuracy": 0.7156289637088775, "num_tokens": 10576391.0, "step": 4160 }, { "epoch": 2.2811816192560173, "grad_norm": 10.3125, "learning_rate": 2.3978847556528085e-06, "loss": 0.9615, "mean_token_accuracy": 0.7188243508338928, "num_tokens": 10601261.0, "step": 4170 }, { "epoch": 2.286652078774617, "grad_norm": 10.0625, "learning_rate": 2.3796498905908097e-06, "loss": 0.9545, "mean_token_accuracy": 0.715437775850296, "num_tokens": 10626305.0, "step": 4180 }, { "epoch": 2.2921225382932167, "grad_norm": 11.0, "learning_rate": 2.3614150255288113e-06, "loss": 0.9931, "mean_token_accuracy": 0.7077142417430877, "num_tokens": 10652029.0, "step": 4190 }, { "epoch": 2.297592997811816, "grad_norm": 10.25, "learning_rate": 2.343180160466813e-06, "loss": 0.976, "mean_token_accuracy": 0.7103690207004547, "num_tokens": 10677356.0, "step": 4200 }, { "epoch": 2.3030634573304156, "grad_norm": 10.0625, "learning_rate": 2.324945295404814e-06, "loss": 0.9647, "mean_token_accuracy": 0.7177474439144135, "num_tokens": 10703431.0, "step": 4210 }, { "epoch": 2.308533916849015, "grad_norm": 10.125, "learning_rate": 2.3067104303428156e-06, "loss": 0.9758, "mean_token_accuracy": 0.7121629536151886, "num_tokens": 10728992.0, "step": 4220 }, { "epoch": 2.314004376367615, "grad_norm": 10.125, "learning_rate": 2.2884755652808172e-06, "loss": 0.934, "mean_token_accuracy": 0.7213209450244904, "num_tokens": 10754404.0, "step": 4230 }, { "epoch": 2.3194748358862145, "grad_norm": 9.75, "learning_rate": 2.270240700218819e-06, "loss": 0.9461, "mean_token_accuracy": 0.719011914730072, "num_tokens": 10779944.0, "step": 4240 }, { "epoch": 2.324945295404814, "grad_norm": 11.0625, "learning_rate": 2.25200583515682e-06, "loss": 0.9966, "mean_token_accuracy": 0.7089413404464722, "num_tokens": 10805085.0, "step": 4250 }, { "epoch": 2.3304157549234135, "grad_norm": 10.375, "learning_rate": 2.2337709700948216e-06, "loss": 0.9707, "mean_token_accuracy": 0.7119170904159546, "num_tokens": 10830483.0, "step": 4260 }, { "epoch": 2.335886214442013, "grad_norm": 10.5625, "learning_rate": 2.215536105032823e-06, "loss": 0.9641, "mean_token_accuracy": 0.7150032758712769, "num_tokens": 10856177.0, "step": 4270 }, { "epoch": 2.341356673960613, "grad_norm": 10.625, "learning_rate": 2.1973012399708243e-06, "loss": 0.971, "mean_token_accuracy": 0.7109645545482636, "num_tokens": 10882042.0, "step": 4280 }, { "epoch": 2.3468271334792123, "grad_norm": 10.1875, "learning_rate": 2.179066374908826e-06, "loss": 0.9215, "mean_token_accuracy": 0.7236122965812684, "num_tokens": 10907588.0, "step": 4290 }, { "epoch": 2.352297592997812, "grad_norm": 11.0, "learning_rate": 2.1608315098468275e-06, "loss": 0.9629, "mean_token_accuracy": 0.7140246748924255, "num_tokens": 10932821.0, "step": 4300 }, { "epoch": 2.3577680525164113, "grad_norm": 11.0, "learning_rate": 2.1425966447848287e-06, "loss": 0.9542, "mean_token_accuracy": 0.712429267168045, "num_tokens": 10958451.0, "step": 4310 }, { "epoch": 2.363238512035011, "grad_norm": 10.25, "learning_rate": 2.1243617797228303e-06, "loss": 0.9602, "mean_token_accuracy": 0.7146483659744263, "num_tokens": 10983957.0, "step": 4320 }, { "epoch": 2.3687089715536107, "grad_norm": 10.375, "learning_rate": 2.106126914660832e-06, "loss": 0.991, "mean_token_accuracy": 0.7093435227870941, "num_tokens": 11009581.0, "step": 4330 }, { "epoch": 2.37417943107221, "grad_norm": 10.5625, "learning_rate": 2.087892049598833e-06, "loss": 0.9806, "mean_token_accuracy": 0.7109127819538117, "num_tokens": 11035199.0, "step": 4340 }, { "epoch": 2.3796498905908097, "grad_norm": 10.0, "learning_rate": 2.0696571845368346e-06, "loss": 0.9605, "mean_token_accuracy": 0.7149803459644317, "num_tokens": 11061011.0, "step": 4350 }, { "epoch": 2.385120350109409, "grad_norm": 10.8125, "learning_rate": 2.0514223194748362e-06, "loss": 0.9667, "mean_token_accuracy": 0.7125830769538879, "num_tokens": 11086545.0, "step": 4360 }, { "epoch": 2.3905908096280086, "grad_norm": 10.25, "learning_rate": 2.0331874544128374e-06, "loss": 0.9716, "mean_token_accuracy": 0.7115569293498993, "num_tokens": 11111650.0, "step": 4370 }, { "epoch": 2.3960612691466086, "grad_norm": 9.875, "learning_rate": 2.014952589350839e-06, "loss": 0.9835, "mean_token_accuracy": 0.7110448122024536, "num_tokens": 11136976.0, "step": 4380 }, { "epoch": 2.401531728665208, "grad_norm": 10.5, "learning_rate": 1.9967177242888406e-06, "loss": 0.9696, "mean_token_accuracy": 0.7116839528083801, "num_tokens": 11162409.0, "step": 4390 }, { "epoch": 2.4070021881838075, "grad_norm": 10.0625, "learning_rate": 1.9784828592268417e-06, "loss": 0.9518, "mean_token_accuracy": 0.7186504125595092, "num_tokens": 11187779.0, "step": 4400 }, { "epoch": 2.412472647702407, "grad_norm": 10.3125, "learning_rate": 1.9602479941648433e-06, "loss": 0.9594, "mean_token_accuracy": 0.7139133214950562, "num_tokens": 11213494.0, "step": 4410 }, { "epoch": 2.4179431072210065, "grad_norm": 10.8125, "learning_rate": 1.942013129102845e-06, "loss": 0.9563, "mean_token_accuracy": 0.7193277597427368, "num_tokens": 11238852.0, "step": 4420 }, { "epoch": 2.423413566739606, "grad_norm": 9.8125, "learning_rate": 1.923778264040846e-06, "loss": 0.9477, "mean_token_accuracy": 0.7188430905342102, "num_tokens": 11264480.0, "step": 4430 }, { "epoch": 2.428884026258206, "grad_norm": 10.625, "learning_rate": 1.9055433989788477e-06, "loss": 0.9579, "mean_token_accuracy": 0.7149973511695862, "num_tokens": 11290175.0, "step": 4440 }, { "epoch": 2.4343544857768054, "grad_norm": 10.5625, "learning_rate": 1.8873085339168493e-06, "loss": 0.9772, "mean_token_accuracy": 0.7096493124961853, "num_tokens": 11315301.0, "step": 4450 }, { "epoch": 2.439824945295405, "grad_norm": 10.125, "learning_rate": 1.8690736688548504e-06, "loss": 0.9463, "mean_token_accuracy": 0.7188547492027283, "num_tokens": 11340809.0, "step": 4460 }, { "epoch": 2.4452954048140043, "grad_norm": 10.375, "learning_rate": 1.850838803792852e-06, "loss": 0.9753, "mean_token_accuracy": 0.7100773870944976, "num_tokens": 11366227.0, "step": 4470 }, { "epoch": 2.450765864332604, "grad_norm": 10.8125, "learning_rate": 1.8326039387308536e-06, "loss": 0.9786, "mean_token_accuracy": 0.7101668000221253, "num_tokens": 11391908.0, "step": 4480 }, { "epoch": 2.4562363238512033, "grad_norm": 10.5625, "learning_rate": 1.814369073668855e-06, "loss": 0.9722, "mean_token_accuracy": 0.7132100880146026, "num_tokens": 11417099.0, "step": 4490 }, { "epoch": 2.461706783369803, "grad_norm": 10.25, "learning_rate": 1.7961342086068564e-06, "loss": 0.9672, "mean_token_accuracy": 0.7163193345069885, "num_tokens": 11442529.0, "step": 4500 }, { "epoch": 2.4671772428884027, "grad_norm": 10.5, "learning_rate": 1.777899343544858e-06, "loss": 0.9741, "mean_token_accuracy": 0.7144057810306549, "num_tokens": 11467943.0, "step": 4510 }, { "epoch": 2.472647702407002, "grad_norm": 10.125, "learning_rate": 1.7596644784828594e-06, "loss": 0.99, "mean_token_accuracy": 0.7097407221794129, "num_tokens": 11493655.0, "step": 4520 }, { "epoch": 2.4781181619256016, "grad_norm": 9.5625, "learning_rate": 1.7414296134208607e-06, "loss": 0.9242, "mean_token_accuracy": 0.7254151999950409, "num_tokens": 11519074.0, "step": 4530 }, { "epoch": 2.483588621444201, "grad_norm": 10.6875, "learning_rate": 1.7231947483588623e-06, "loss": 0.9623, "mean_token_accuracy": 0.7154429078102111, "num_tokens": 11544510.0, "step": 4540 }, { "epoch": 2.489059080962801, "grad_norm": 10.625, "learning_rate": 1.7049598832968637e-06, "loss": 0.9405, "mean_token_accuracy": 0.720922189950943, "num_tokens": 11569975.0, "step": 4550 }, { "epoch": 2.4945295404814005, "grad_norm": 10.375, "learning_rate": 1.686725018234865e-06, "loss": 0.9643, "mean_token_accuracy": 0.7141575336456298, "num_tokens": 11595508.0, "step": 4560 }, { "epoch": 2.5, "grad_norm": 10.4375, "learning_rate": 1.6684901531728667e-06, "loss": 0.9722, "mean_token_accuracy": 0.716586035490036, "num_tokens": 11621057.0, "step": 4570 }, { "epoch": 2.5054704595185995, "grad_norm": 9.875, "learning_rate": 1.650255288110868e-06, "loss": 0.9678, "mean_token_accuracy": 0.7154919922351837, "num_tokens": 11646277.0, "step": 4580 }, { "epoch": 2.510940919037199, "grad_norm": 10.375, "learning_rate": 1.6320204230488696e-06, "loss": 0.9514, "mean_token_accuracy": 0.7179181814193726, "num_tokens": 11671463.0, "step": 4590 }, { "epoch": 2.516411378555799, "grad_norm": 10.375, "learning_rate": 1.613785557986871e-06, "loss": 0.945, "mean_token_accuracy": 0.7157581686973572, "num_tokens": 11696766.0, "step": 4600 }, { "epoch": 2.5218818380743984, "grad_norm": 9.875, "learning_rate": 1.5955506929248724e-06, "loss": 0.9507, "mean_token_accuracy": 0.717986673116684, "num_tokens": 11722577.0, "step": 4610 }, { "epoch": 2.527352297592998, "grad_norm": 10.4375, "learning_rate": 1.577315827862874e-06, "loss": 0.9841, "mean_token_accuracy": 0.7108976006507873, "num_tokens": 11747611.0, "step": 4620 }, { "epoch": 2.5328227571115973, "grad_norm": 10.5, "learning_rate": 1.5590809628008754e-06, "loss": 0.9691, "mean_token_accuracy": 0.7137386500835419, "num_tokens": 11773081.0, "step": 4630 }, { "epoch": 2.538293216630197, "grad_norm": 10.6875, "learning_rate": 1.5408460977388768e-06, "loss": 0.9968, "mean_token_accuracy": 0.7075820744037629, "num_tokens": 11798361.0, "step": 4640 }, { "epoch": 2.5437636761487967, "grad_norm": 10.0625, "learning_rate": 1.5226112326768783e-06, "loss": 0.9582, "mean_token_accuracy": 0.7191313743591309, "num_tokens": 11823896.0, "step": 4650 }, { "epoch": 2.549234135667396, "grad_norm": 10.5625, "learning_rate": 1.5043763676148797e-06, "loss": 0.9983, "mean_token_accuracy": 0.7068405508995056, "num_tokens": 11849650.0, "step": 4660 }, { "epoch": 2.5547045951859957, "grad_norm": 10.3125, "learning_rate": 1.4861415025528813e-06, "loss": 0.9546, "mean_token_accuracy": 0.7164286077022552, "num_tokens": 11874973.0, "step": 4670 }, { "epoch": 2.560175054704595, "grad_norm": 10.5625, "learning_rate": 1.4679066374908827e-06, "loss": 0.9665, "mean_token_accuracy": 0.7142378628253937, "num_tokens": 11900214.0, "step": 4680 }, { "epoch": 2.5656455142231946, "grad_norm": 10.5, "learning_rate": 1.4496717724288843e-06, "loss": 0.9344, "mean_token_accuracy": 0.725408935546875, "num_tokens": 11925985.0, "step": 4690 }, { "epoch": 2.5711159737417946, "grad_norm": 10.5625, "learning_rate": 1.4314369073668857e-06, "loss": 0.9571, "mean_token_accuracy": 0.7170994579792023, "num_tokens": 11951080.0, "step": 4700 }, { "epoch": 2.5765864332603936, "grad_norm": 10.25, "learning_rate": 1.413202042304887e-06, "loss": 0.9723, "mean_token_accuracy": 0.7106690168380737, "num_tokens": 11976761.0, "step": 4710 }, { "epoch": 2.5820568927789935, "grad_norm": 11.125, "learning_rate": 1.3949671772428886e-06, "loss": 0.9664, "mean_token_accuracy": 0.7155169665813446, "num_tokens": 12002057.0, "step": 4720 }, { "epoch": 2.587527352297593, "grad_norm": 10.8125, "learning_rate": 1.37673231218089e-06, "loss": 0.9859, "mean_token_accuracy": 0.7098533868789673, "num_tokens": 12027349.0, "step": 4730 }, { "epoch": 2.5929978118161925, "grad_norm": 10.1875, "learning_rate": 1.3584974471188914e-06, "loss": 0.9681, "mean_token_accuracy": 0.7169641613960266, "num_tokens": 12053110.0, "step": 4740 }, { "epoch": 2.598468271334792, "grad_norm": 11.1875, "learning_rate": 1.340262582056893e-06, "loss": 0.9769, "mean_token_accuracy": 0.7144347429275513, "num_tokens": 12078660.0, "step": 4750 }, { "epoch": 2.6039387308533914, "grad_norm": 10.0, "learning_rate": 1.3220277169948944e-06, "loss": 0.9535, "mean_token_accuracy": 0.717149305343628, "num_tokens": 12104041.0, "step": 4760 }, { "epoch": 2.6094091903719914, "grad_norm": 10.5, "learning_rate": 1.3037928519328957e-06, "loss": 0.9509, "mean_token_accuracy": 0.7132203102111816, "num_tokens": 12129327.0, "step": 4770 }, { "epoch": 2.614879649890591, "grad_norm": 10.5, "learning_rate": 1.2855579868708973e-06, "loss": 0.9856, "mean_token_accuracy": 0.7076945900917053, "num_tokens": 12154514.0, "step": 4780 }, { "epoch": 2.6203501094091903, "grad_norm": 10.4375, "learning_rate": 1.267323121808899e-06, "loss": 0.9708, "mean_token_accuracy": 0.7127987205982208, "num_tokens": 12180006.0, "step": 4790 }, { "epoch": 2.62582056892779, "grad_norm": 10.5625, "learning_rate": 1.2490882567469003e-06, "loss": 0.9789, "mean_token_accuracy": 0.7122283995151519, "num_tokens": 12205688.0, "step": 4800 }, { "epoch": 2.6312910284463893, "grad_norm": 10.125, "learning_rate": 1.2308533916849017e-06, "loss": 0.9624, "mean_token_accuracy": 0.71814124584198, "num_tokens": 12230947.0, "step": 4810 }, { "epoch": 2.636761487964989, "grad_norm": 10.0625, "learning_rate": 1.212618526622903e-06, "loss": 0.9897, "mean_token_accuracy": 0.7073604583740234, "num_tokens": 12255955.0, "step": 4820 }, { "epoch": 2.6422319474835887, "grad_norm": 10.1875, "learning_rate": 1.1943836615609047e-06, "loss": 0.9558, "mean_token_accuracy": 0.7145933747291565, "num_tokens": 12281300.0, "step": 4830 }, { "epoch": 2.647702407002188, "grad_norm": 10.75, "learning_rate": 1.176148796498906e-06, "loss": 0.9583, "mean_token_accuracy": 0.7154503166675568, "num_tokens": 12306967.0, "step": 4840 }, { "epoch": 2.6531728665207877, "grad_norm": 10.625, "learning_rate": 1.1579139314369074e-06, "loss": 0.9714, "mean_token_accuracy": 0.7120263636112213, "num_tokens": 12332653.0, "step": 4850 }, { "epoch": 2.658643326039387, "grad_norm": 10.3125, "learning_rate": 1.139679066374909e-06, "loss": 0.9431, "mean_token_accuracy": 0.7196206390857697, "num_tokens": 12357964.0, "step": 4860 }, { "epoch": 2.664113785557987, "grad_norm": 9.875, "learning_rate": 1.1214442013129104e-06, "loss": 0.9235, "mean_token_accuracy": 0.72520210146904, "num_tokens": 12383359.0, "step": 4870 }, { "epoch": 2.6695842450765865, "grad_norm": 10.8125, "learning_rate": 1.1032093362509118e-06, "loss": 0.9541, "mean_token_accuracy": 0.7171355724334717, "num_tokens": 12408973.0, "step": 4880 }, { "epoch": 2.675054704595186, "grad_norm": 9.875, "learning_rate": 1.0849744711889134e-06, "loss": 0.9588, "mean_token_accuracy": 0.7152066648006439, "num_tokens": 12434174.0, "step": 4890 }, { "epoch": 2.6805251641137855, "grad_norm": 9.9375, "learning_rate": 1.0667396061269147e-06, "loss": 0.9417, "mean_token_accuracy": 0.7199950873851776, "num_tokens": 12459929.0, "step": 4900 }, { "epoch": 2.685995623632385, "grad_norm": 10.3125, "learning_rate": 1.0485047410649161e-06, "loss": 0.9403, "mean_token_accuracy": 0.7192999660968781, "num_tokens": 12485603.0, "step": 4910 }, { "epoch": 2.691466083150985, "grad_norm": 10.3125, "learning_rate": 1.0302698760029177e-06, "loss": 0.9588, "mean_token_accuracy": 0.7183211863040924, "num_tokens": 12510854.0, "step": 4920 }, { "epoch": 2.6969365426695844, "grad_norm": 10.0625, "learning_rate": 1.012035010940919e-06, "loss": 0.9445, "mean_token_accuracy": 0.7210539758205414, "num_tokens": 12536418.0, "step": 4930 }, { "epoch": 2.702407002188184, "grad_norm": 10.5625, "learning_rate": 9.938001458789205e-07, "loss": 0.9721, "mean_token_accuracy": 0.713829755783081, "num_tokens": 12562114.0, "step": 4940 }, { "epoch": 2.7078774617067833, "grad_norm": 10.0, "learning_rate": 9.75565280816922e-07, "loss": 0.9548, "mean_token_accuracy": 0.7159802138805389, "num_tokens": 12587542.0, "step": 4950 }, { "epoch": 2.713347921225383, "grad_norm": 10.75, "learning_rate": 9.573304157549234e-07, "loss": 0.9454, "mean_token_accuracy": 0.7186602294445038, "num_tokens": 12613047.0, "step": 4960 }, { "epoch": 2.7188183807439827, "grad_norm": 11.3125, "learning_rate": 9.39095550692925e-07, "loss": 1.0026, "mean_token_accuracy": 0.7088643789291382, "num_tokens": 12638277.0, "step": 4970 }, { "epoch": 2.7242888402625822, "grad_norm": 10.5, "learning_rate": 9.208606856309264e-07, "loss": 0.9739, "mean_token_accuracy": 0.7109079241752625, "num_tokens": 12663642.0, "step": 4980 }, { "epoch": 2.7297592997811817, "grad_norm": 10.5, "learning_rate": 9.026258205689278e-07, "loss": 0.9636, "mean_token_accuracy": 0.712633740901947, "num_tokens": 12688780.0, "step": 4990 }, { "epoch": 2.735229759299781, "grad_norm": 10.3125, "learning_rate": 8.843909555069294e-07, "loss": 0.9593, "mean_token_accuracy": 0.7172781467437744, "num_tokens": 12714020.0, "step": 5000 }, { "epoch": 2.7407002188183807, "grad_norm": 10.625, "learning_rate": 8.661560904449308e-07, "loss": 0.9668, "mean_token_accuracy": 0.7146118104457855, "num_tokens": 12739229.0, "step": 5010 }, { "epoch": 2.74617067833698, "grad_norm": 10.4375, "learning_rate": 8.479212253829322e-07, "loss": 0.9617, "mean_token_accuracy": 0.7141262829303742, "num_tokens": 12764906.0, "step": 5020 }, { "epoch": 2.7516411378555796, "grad_norm": 11.0625, "learning_rate": 8.296863603209337e-07, "loss": 0.9534, "mean_token_accuracy": 0.7181793093681336, "num_tokens": 12790046.0, "step": 5030 }, { "epoch": 2.7571115973741795, "grad_norm": 9.8125, "learning_rate": 8.114514952589351e-07, "loss": 0.9333, "mean_token_accuracy": 0.7222773134708405, "num_tokens": 12815713.0, "step": 5040 }, { "epoch": 2.762582056892779, "grad_norm": 10.625, "learning_rate": 7.932166301969366e-07, "loss": 0.9581, "mean_token_accuracy": 0.7161290049552917, "num_tokens": 12841290.0, "step": 5050 }, { "epoch": 2.7680525164113785, "grad_norm": 10.4375, "learning_rate": 7.749817651349381e-07, "loss": 0.9709, "mean_token_accuracy": 0.7144385755062104, "num_tokens": 12867299.0, "step": 5060 }, { "epoch": 2.773522975929978, "grad_norm": 10.3125, "learning_rate": 7.567469000729396e-07, "loss": 0.9299, "mean_token_accuracy": 0.7200077176094055, "num_tokens": 12892111.0, "step": 5070 }, { "epoch": 2.7789934354485775, "grad_norm": 10.375, "learning_rate": 7.38512035010941e-07, "loss": 1.0044, "mean_token_accuracy": 0.706270956993103, "num_tokens": 12917547.0, "step": 5080 }, { "epoch": 2.7844638949671774, "grad_norm": 10.75, "learning_rate": 7.202771699489424e-07, "loss": 0.9561, "mean_token_accuracy": 0.7158130586147309, "num_tokens": 12942824.0, "step": 5090 }, { "epoch": 2.789934354485777, "grad_norm": 11.25, "learning_rate": 7.020423048869439e-07, "loss": 0.9943, "mean_token_accuracy": 0.7063575088977814, "num_tokens": 12968051.0, "step": 5100 }, { "epoch": 2.7954048140043763, "grad_norm": 10.0, "learning_rate": 6.838074398249453e-07, "loss": 0.9686, "mean_token_accuracy": 0.7124229729175567, "num_tokens": 12992995.0, "step": 5110 }, { "epoch": 2.800875273522976, "grad_norm": 10.25, "learning_rate": 6.655725747629469e-07, "loss": 0.9806, "mean_token_accuracy": 0.7110258340835571, "num_tokens": 13018611.0, "step": 5120 }, { "epoch": 2.8063457330415753, "grad_norm": 10.1875, "learning_rate": 6.473377097009483e-07, "loss": 0.9462, "mean_token_accuracy": 0.7187185704708099, "num_tokens": 13044142.0, "step": 5130 }, { "epoch": 2.8118161925601752, "grad_norm": 10.1875, "learning_rate": 6.291028446389497e-07, "loss": 0.9522, "mean_token_accuracy": 0.7164785265922546, "num_tokens": 13069649.0, "step": 5140 }, { "epoch": 2.8172866520787747, "grad_norm": 11.0, "learning_rate": 6.108679795769512e-07, "loss": 0.9323, "mean_token_accuracy": 0.7243307530879974, "num_tokens": 13094938.0, "step": 5150 }, { "epoch": 2.822757111597374, "grad_norm": 10.375, "learning_rate": 5.926331145149526e-07, "loss": 0.9351, "mean_token_accuracy": 0.7236999869346619, "num_tokens": 13120177.0, "step": 5160 }, { "epoch": 2.8282275711159737, "grad_norm": 10.375, "learning_rate": 5.743982494529541e-07, "loss": 1.0089, "mean_token_accuracy": 0.7033572435379029, "num_tokens": 13145900.0, "step": 5170 }, { "epoch": 2.833698030634573, "grad_norm": 10.1875, "learning_rate": 5.561633843909556e-07, "loss": 0.9821, "mean_token_accuracy": 0.7097233951091766, "num_tokens": 13170900.0, "step": 5180 }, { "epoch": 2.839168490153173, "grad_norm": 10.0625, "learning_rate": 5.37928519328957e-07, "loss": 0.9525, "mean_token_accuracy": 0.7147578775882721, "num_tokens": 13196497.0, "step": 5190 }, { "epoch": 2.8446389496717726, "grad_norm": 10.5, "learning_rate": 5.196936542669585e-07, "loss": 0.9752, "mean_token_accuracy": 0.7121313869953155, "num_tokens": 13221946.0, "step": 5200 }, { "epoch": 2.850109409190372, "grad_norm": 10.4375, "learning_rate": 5.014587892049599e-07, "loss": 0.9862, "mean_token_accuracy": 0.707955265045166, "num_tokens": 13247347.0, "step": 5210 }, { "epoch": 2.8555798687089715, "grad_norm": 10.5625, "learning_rate": 4.832239241429613e-07, "loss": 0.976, "mean_token_accuracy": 0.7143423020839691, "num_tokens": 13273017.0, "step": 5220 }, { "epoch": 2.861050328227571, "grad_norm": 10.5, "learning_rate": 4.649890590809628e-07, "loss": 0.9889, "mean_token_accuracy": 0.7104144990444183, "num_tokens": 13298377.0, "step": 5230 }, { "epoch": 2.866520787746171, "grad_norm": 10.1875, "learning_rate": 4.467541940189643e-07, "loss": 0.9631, "mean_token_accuracy": 0.7177930176258087, "num_tokens": 13323779.0, "step": 5240 }, { "epoch": 2.8719912472647704, "grad_norm": 10.375, "learning_rate": 4.285193289569658e-07, "loss": 0.9779, "mean_token_accuracy": 0.7117682516574859, "num_tokens": 13349327.0, "step": 5250 }, { "epoch": 2.87746170678337, "grad_norm": 9.9375, "learning_rate": 4.102844638949672e-07, "loss": 0.9936, "mean_token_accuracy": 0.7063680112361908, "num_tokens": 13375032.0, "step": 5260 }, { "epoch": 2.8829321663019694, "grad_norm": 10.25, "learning_rate": 3.9204959883296864e-07, "loss": 0.9747, "mean_token_accuracy": 0.7139563202857971, "num_tokens": 13400329.0, "step": 5270 }, { "epoch": 2.888402625820569, "grad_norm": 10.625, "learning_rate": 3.7381473377097013e-07, "loss": 0.9668, "mean_token_accuracy": 0.7155350327491761, "num_tokens": 13426059.0, "step": 5280 }, { "epoch": 2.8938730853391688, "grad_norm": 10.8125, "learning_rate": 3.5557986870897156e-07, "loss": 0.958, "mean_token_accuracy": 0.7152288734912873, "num_tokens": 13451832.0, "step": 5290 }, { "epoch": 2.899343544857768, "grad_norm": 10.75, "learning_rate": 3.3734500364697305e-07, "loss": 0.9859, "mean_token_accuracy": 0.7133601427078247, "num_tokens": 13477401.0, "step": 5300 }, { "epoch": 2.9048140043763677, "grad_norm": 10.5625, "learning_rate": 3.1911013858497453e-07, "loss": 0.9678, "mean_token_accuracy": 0.7171884000301361, "num_tokens": 13502705.0, "step": 5310 }, { "epoch": 2.910284463894967, "grad_norm": 11.0625, "learning_rate": 3.0087527352297597e-07, "loss": 0.9621, "mean_token_accuracy": 0.717541116476059, "num_tokens": 13528062.0, "step": 5320 }, { "epoch": 2.9157549234135667, "grad_norm": 10.3125, "learning_rate": 2.826404084609774e-07, "loss": 0.9622, "mean_token_accuracy": 0.7154664099216461, "num_tokens": 13553750.0, "step": 5330 }, { "epoch": 2.921225382932166, "grad_norm": 10.4375, "learning_rate": 2.644055433989789e-07, "loss": 0.9516, "mean_token_accuracy": 0.717375636100769, "num_tokens": 13578936.0, "step": 5340 }, { "epoch": 2.9266958424507656, "grad_norm": 11.0, "learning_rate": 2.461706783369803e-07, "loss": 0.9518, "mean_token_accuracy": 0.7167813301086425, "num_tokens": 13604064.0, "step": 5350 }, { "epoch": 2.9321663019693656, "grad_norm": 10.75, "learning_rate": 2.279358132749818e-07, "loss": 0.9842, "mean_token_accuracy": 0.7137976944446563, "num_tokens": 13629601.0, "step": 5360 }, { "epoch": 2.937636761487965, "grad_norm": 9.8125, "learning_rate": 2.0970094821298323e-07, "loss": 0.9598, "mean_token_accuracy": 0.7161750555038452, "num_tokens": 13655161.0, "step": 5370 }, { "epoch": 2.9431072210065645, "grad_norm": 10.0, "learning_rate": 1.914660831509847e-07, "loss": 0.9538, "mean_token_accuracy": 0.718994963169098, "num_tokens": 13680024.0, "step": 5380 }, { "epoch": 2.948577680525164, "grad_norm": 10.5, "learning_rate": 1.7323121808898615e-07, "loss": 0.962, "mean_token_accuracy": 0.7169410228729248, "num_tokens": 13705181.0, "step": 5390 }, { "epoch": 2.9540481400437635, "grad_norm": 10.5, "learning_rate": 1.549963530269876e-07, "loss": 0.9574, "mean_token_accuracy": 0.7161156296730041, "num_tokens": 13730505.0, "step": 5400 }, { "epoch": 2.9595185995623634, "grad_norm": 10.5625, "learning_rate": 1.3676148796498907e-07, "loss": 0.9383, "mean_token_accuracy": 0.7220743417739868, "num_tokens": 13755959.0, "step": 5410 }, { "epoch": 2.964989059080963, "grad_norm": 10.375, "learning_rate": 1.1852662290299053e-07, "loss": 0.9787, "mean_token_accuracy": 0.7099229753017425, "num_tokens": 13781338.0, "step": 5420 }, { "epoch": 2.9704595185995624, "grad_norm": 11.0, "learning_rate": 1.0029175784099199e-07, "loss": 0.9792, "mean_token_accuracy": 0.7099815905094147, "num_tokens": 13806784.0, "step": 5430 }, { "epoch": 2.975929978118162, "grad_norm": 10.5625, "learning_rate": 8.205689277899343e-08, "loss": 0.9774, "mean_token_accuracy": 0.7121619462966919, "num_tokens": 13831942.0, "step": 5440 }, { "epoch": 2.9814004376367613, "grad_norm": 10.5, "learning_rate": 6.38220277169949e-08, "loss": 0.9726, "mean_token_accuracy": 0.7091731190681457, "num_tokens": 13857296.0, "step": 5450 }, { "epoch": 2.9868708971553612, "grad_norm": 11.0, "learning_rate": 4.558716265499636e-08, "loss": 0.9846, "mean_token_accuracy": 0.7077378630638123, "num_tokens": 13883120.0, "step": 5460 }, { "epoch": 2.9923413566739607, "grad_norm": 10.25, "learning_rate": 2.735229759299781e-08, "loss": 0.9573, "mean_token_accuracy": 0.7156158804893493, "num_tokens": 13908659.0, "step": 5470 }, { "epoch": 2.99781181619256, "grad_norm": 10.9375, "learning_rate": 9.117432530999272e-09, "loss": 0.9851, "mean_token_accuracy": 0.7107800126075745, "num_tokens": 13933848.0, "step": 5480 } ], "logging_steps": 10, "max_steps": 5484, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.9139458472604058e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }