| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 5484, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.005470459518599562, | |
| "grad_norm": 9.625, | |
| "learning_rate": 9.983588621444202e-06, | |
| "loss": 1.9919, | |
| "mean_token_accuracy": 0.5373850524425506, | |
| "num_tokens": 25832.0, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.010940919037199124, | |
| "grad_norm": 8.625, | |
| "learning_rate": 9.965353756382203e-06, | |
| "loss": 1.6719, | |
| "mean_token_accuracy": 0.5806249260902405, | |
| "num_tokens": 51122.0, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.016411378555798686, | |
| "grad_norm": 8.875, | |
| "learning_rate": 9.947118891320205e-06, | |
| "loss": 1.6045, | |
| "mean_token_accuracy": 0.5889442741870881, | |
| "num_tokens": 76631.0, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.02188183807439825, | |
| "grad_norm": 9.4375, | |
| "learning_rate": 9.928884026258206e-06, | |
| "loss": 1.5537, | |
| "mean_token_accuracy": 0.6036407649517059, | |
| "num_tokens": 101784.0, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.02735229759299781, | |
| "grad_norm": 9.0625, | |
| "learning_rate": 9.910649161196208e-06, | |
| "loss": 1.4784, | |
| "mean_token_accuracy": 0.6087825715541839, | |
| "num_tokens": 127369.0, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.03282275711159737, | |
| "grad_norm": 9.0, | |
| "learning_rate": 9.89241429613421e-06, | |
| "loss": 1.4971, | |
| "mean_token_accuracy": 0.6069094657897949, | |
| "num_tokens": 152695.0, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.038293216630196934, | |
| "grad_norm": 8.6875, | |
| "learning_rate": 9.874179431072211e-06, | |
| "loss": 1.4571, | |
| "mean_token_accuracy": 0.6164368271827698, | |
| "num_tokens": 178219.0, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.0437636761487965, | |
| "grad_norm": 9.1875, | |
| "learning_rate": 9.855944566010213e-06, | |
| "loss": 1.4476, | |
| "mean_token_accuracy": 0.6144214510917664, | |
| "num_tokens": 203604.0, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.04923413566739606, | |
| "grad_norm": 8.9375, | |
| "learning_rate": 9.837709700948214e-06, | |
| "loss": 1.4509, | |
| "mean_token_accuracy": 0.6110931515693665, | |
| "num_tokens": 229492.0, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.05470459518599562, | |
| "grad_norm": 8.375, | |
| "learning_rate": 9.819474835886216e-06, | |
| "loss": 1.4126, | |
| "mean_token_accuracy": 0.620923125743866, | |
| "num_tokens": 255685.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.060175054704595186, | |
| "grad_norm": 9.3125, | |
| "learning_rate": 9.801239970824217e-06, | |
| "loss": 1.4117, | |
| "mean_token_accuracy": 0.6242850065231323, | |
| "num_tokens": 280910.0, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.06564551422319474, | |
| "grad_norm": 9.125, | |
| "learning_rate": 9.783005105762217e-06, | |
| "loss": 1.3618, | |
| "mean_token_accuracy": 0.6288298785686492, | |
| "num_tokens": 306335.0, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.0711159737417943, | |
| "grad_norm": 9.4375, | |
| "learning_rate": 9.76477024070022e-06, | |
| "loss": 1.3932, | |
| "mean_token_accuracy": 0.6229633510112762, | |
| "num_tokens": 331606.0, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.07658643326039387, | |
| "grad_norm": 8.9375, | |
| "learning_rate": 9.74653537563822e-06, | |
| "loss": 1.4133, | |
| "mean_token_accuracy": 0.6180586159229279, | |
| "num_tokens": 357437.0, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.08205689277899343, | |
| "grad_norm": 9.125, | |
| "learning_rate": 9.728300510576222e-06, | |
| "loss": 1.4235, | |
| "mean_token_accuracy": 0.6152303397655488, | |
| "num_tokens": 382692.0, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.087527352297593, | |
| "grad_norm": 8.9375, | |
| "learning_rate": 9.710065645514224e-06, | |
| "loss": 1.382, | |
| "mean_token_accuracy": 0.621176666021347, | |
| "num_tokens": 407995.0, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.09299781181619256, | |
| "grad_norm": 9.125, | |
| "learning_rate": 9.691830780452225e-06, | |
| "loss": 1.3665, | |
| "mean_token_accuracy": 0.6267438113689423, | |
| "num_tokens": 433527.0, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.09846827133479212, | |
| "grad_norm": 9.1875, | |
| "learning_rate": 9.673595915390227e-06, | |
| "loss": 1.3829, | |
| "mean_token_accuracy": 0.6232609272003173, | |
| "num_tokens": 459043.0, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.10393873085339168, | |
| "grad_norm": 9.0, | |
| "learning_rate": 9.655361050328229e-06, | |
| "loss": 1.3797, | |
| "mean_token_accuracy": 0.6250600516796112, | |
| "num_tokens": 484989.0, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.10940919037199125, | |
| "grad_norm": 9.0625, | |
| "learning_rate": 9.63712618526623e-06, | |
| "loss": 1.3799, | |
| "mean_token_accuracy": 0.6250615298748017, | |
| "num_tokens": 510010.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.11487964989059081, | |
| "grad_norm": 9.75, | |
| "learning_rate": 9.618891320204232e-06, | |
| "loss": 1.3541, | |
| "mean_token_accuracy": 0.6320520222187043, | |
| "num_tokens": 535568.0, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.12035010940919037, | |
| "grad_norm": 9.1875, | |
| "learning_rate": 9.600656455142233e-06, | |
| "loss": 1.3568, | |
| "mean_token_accuracy": 0.6311915695667267, | |
| "num_tokens": 560786.0, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.12582056892778992, | |
| "grad_norm": 9.0, | |
| "learning_rate": 9.582421590080235e-06, | |
| "loss": 1.3751, | |
| "mean_token_accuracy": 0.6243879318237304, | |
| "num_tokens": 586419.0, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.13129102844638948, | |
| "grad_norm": 9.5625, | |
| "learning_rate": 9.564186725018235e-06, | |
| "loss": 1.3421, | |
| "mean_token_accuracy": 0.6328221976757049, | |
| "num_tokens": 611683.0, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.13676148796498905, | |
| "grad_norm": 9.0, | |
| "learning_rate": 9.545951859956238e-06, | |
| "loss": 1.3199, | |
| "mean_token_accuracy": 0.6378473103046417, | |
| "num_tokens": 636737.0, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.1422319474835886, | |
| "grad_norm": 8.6875, | |
| "learning_rate": 9.527716994894238e-06, | |
| "loss": 1.3362, | |
| "mean_token_accuracy": 0.6359954178333282, | |
| "num_tokens": 661611.0, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.14770240700218817, | |
| "grad_norm": 9.0, | |
| "learning_rate": 9.50948212983224e-06, | |
| "loss": 1.2803, | |
| "mean_token_accuracy": 0.649063116312027, | |
| "num_tokens": 686464.0, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.15317286652078774, | |
| "grad_norm": 8.8125, | |
| "learning_rate": 9.491247264770241e-06, | |
| "loss": 1.3093, | |
| "mean_token_accuracy": 0.6380355060100555, | |
| "num_tokens": 711615.0, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.1586433260393873, | |
| "grad_norm": 8.4375, | |
| "learning_rate": 9.473012399708243e-06, | |
| "loss": 1.3034, | |
| "mean_token_accuracy": 0.6415359675884247, | |
| "num_tokens": 736807.0, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.16411378555798686, | |
| "grad_norm": 9.3125, | |
| "learning_rate": 9.454777534646244e-06, | |
| "loss": 1.3251, | |
| "mean_token_accuracy": 0.6348649680614471, | |
| "num_tokens": 762209.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.16958424507658643, | |
| "grad_norm": 8.375, | |
| "learning_rate": 9.436542669584246e-06, | |
| "loss": 1.2911, | |
| "mean_token_accuracy": 0.6452939510345459, | |
| "num_tokens": 787695.0, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.175054704595186, | |
| "grad_norm": 8.5, | |
| "learning_rate": 9.418307804522248e-06, | |
| "loss": 1.3028, | |
| "mean_token_accuracy": 0.640759015083313, | |
| "num_tokens": 813451.0, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.18052516411378555, | |
| "grad_norm": 9.1875, | |
| "learning_rate": 9.400072939460249e-06, | |
| "loss": 1.2847, | |
| "mean_token_accuracy": 0.6437187314033508, | |
| "num_tokens": 838902.0, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.18599562363238512, | |
| "grad_norm": 8.875, | |
| "learning_rate": 9.38183807439825e-06, | |
| "loss": 1.2791, | |
| "mean_token_accuracy": 0.6461306512355804, | |
| "num_tokens": 864097.0, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.19146608315098468, | |
| "grad_norm": 8.8125, | |
| "learning_rate": 9.363603209336252e-06, | |
| "loss": 1.2946, | |
| "mean_token_accuracy": 0.6436072170734406, | |
| "num_tokens": 889959.0, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.19693654266958424, | |
| "grad_norm": 9.0625, | |
| "learning_rate": 9.345368344274252e-06, | |
| "loss": 1.3278, | |
| "mean_token_accuracy": 0.6354196608066559, | |
| "num_tokens": 915925.0, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.2024070021881838, | |
| "grad_norm": 9.5625, | |
| "learning_rate": 9.327133479212255e-06, | |
| "loss": 1.3017, | |
| "mean_token_accuracy": 0.6431743144989014, | |
| "num_tokens": 941137.0, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.20787746170678337, | |
| "grad_norm": 8.875, | |
| "learning_rate": 9.308898614150255e-06, | |
| "loss": 1.3143, | |
| "mean_token_accuracy": 0.638333660364151, | |
| "num_tokens": 966309.0, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.21334792122538293, | |
| "grad_norm": 8.5625, | |
| "learning_rate": 9.290663749088259e-06, | |
| "loss": 1.2774, | |
| "mean_token_accuracy": 0.6447885036468506, | |
| "num_tokens": 991629.0, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.2188183807439825, | |
| "grad_norm": 9.0625, | |
| "learning_rate": 9.272428884026259e-06, | |
| "loss": 1.2702, | |
| "mean_token_accuracy": 0.646415501832962, | |
| "num_tokens": 1017315.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.22428884026258206, | |
| "grad_norm": 9.125, | |
| "learning_rate": 9.25419401896426e-06, | |
| "loss": 1.2708, | |
| "mean_token_accuracy": 0.6465388536453247, | |
| "num_tokens": 1042374.0, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.22975929978118162, | |
| "grad_norm": 9.1875, | |
| "learning_rate": 9.235959153902262e-06, | |
| "loss": 1.2859, | |
| "mean_token_accuracy": 0.6423357188701629, | |
| "num_tokens": 1067942.0, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.23522975929978118, | |
| "grad_norm": 8.8125, | |
| "learning_rate": 9.217724288840263e-06, | |
| "loss": 1.2934, | |
| "mean_token_accuracy": 0.6421127438545227, | |
| "num_tokens": 1093790.0, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.24070021881838075, | |
| "grad_norm": 8.5, | |
| "learning_rate": 9.199489423778265e-06, | |
| "loss": 1.3007, | |
| "mean_token_accuracy": 0.6417977750301361, | |
| "num_tokens": 1119210.0, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.2461706783369803, | |
| "grad_norm": 9.4375, | |
| "learning_rate": 9.181254558716267e-06, | |
| "loss": 1.3159, | |
| "mean_token_accuracy": 0.6400705456733704, | |
| "num_tokens": 1144670.0, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.25164113785557984, | |
| "grad_norm": 9.625, | |
| "learning_rate": 9.163019693654268e-06, | |
| "loss": 1.27, | |
| "mean_token_accuracy": 0.6472333431243896, | |
| "num_tokens": 1170288.0, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.25711159737417943, | |
| "grad_norm": 9.6875, | |
| "learning_rate": 9.14478482859227e-06, | |
| "loss": 1.2754, | |
| "mean_token_accuracy": 0.6426136493682861, | |
| "num_tokens": 1195969.0, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.26258205689277897, | |
| "grad_norm": 9.1875, | |
| "learning_rate": 9.12654996353027e-06, | |
| "loss": 1.2805, | |
| "mean_token_accuracy": 0.6377979755401612, | |
| "num_tokens": 1221472.0, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.26805251641137856, | |
| "grad_norm": 9.25, | |
| "learning_rate": 9.108315098468273e-06, | |
| "loss": 1.2649, | |
| "mean_token_accuracy": 0.6489916682243347, | |
| "num_tokens": 1246637.0, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.2735229759299781, | |
| "grad_norm": 9.1875, | |
| "learning_rate": 9.090080233406273e-06, | |
| "loss": 1.2838, | |
| "mean_token_accuracy": 0.6428185880184174, | |
| "num_tokens": 1272427.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.2789934354485777, | |
| "grad_norm": 8.6875, | |
| "learning_rate": 9.071845368344276e-06, | |
| "loss": 1.2394, | |
| "mean_token_accuracy": 0.6552128493785858, | |
| "num_tokens": 1297420.0, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.2844638949671772, | |
| "grad_norm": 8.1875, | |
| "learning_rate": 9.053610503282276e-06, | |
| "loss": 1.2798, | |
| "mean_token_accuracy": 0.6457675039768219, | |
| "num_tokens": 1322816.0, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.2899343544857768, | |
| "grad_norm": 8.625, | |
| "learning_rate": 9.03537563822028e-06, | |
| "loss": 1.2445, | |
| "mean_token_accuracy": 0.6481238782405854, | |
| "num_tokens": 1348483.0, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.29540481400437635, | |
| "grad_norm": 9.25, | |
| "learning_rate": 9.017140773158279e-06, | |
| "loss": 1.2699, | |
| "mean_token_accuracy": 0.6499922752380372, | |
| "num_tokens": 1373968.0, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.30087527352297594, | |
| "grad_norm": 9.0, | |
| "learning_rate": 8.99890590809628e-06, | |
| "loss": 1.2504, | |
| "mean_token_accuracy": 0.6519196212291718, | |
| "num_tokens": 1399038.0, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.3063457330415755, | |
| "grad_norm": 8.8125, | |
| "learning_rate": 8.980671043034282e-06, | |
| "loss": 1.2576, | |
| "mean_token_accuracy": 0.6471436500549317, | |
| "num_tokens": 1424348.0, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.31181619256017507, | |
| "grad_norm": 8.875, | |
| "learning_rate": 8.962436177972284e-06, | |
| "loss": 1.2728, | |
| "mean_token_accuracy": 0.6437059342861176, | |
| "num_tokens": 1450046.0, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.3172866520787746, | |
| "grad_norm": 8.9375, | |
| "learning_rate": 8.944201312910286e-06, | |
| "loss": 1.2699, | |
| "mean_token_accuracy": 0.6483521461486816, | |
| "num_tokens": 1475329.0, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.3227571115973742, | |
| "grad_norm": 9.0, | |
| "learning_rate": 8.925966447848287e-06, | |
| "loss": 1.2841, | |
| "mean_token_accuracy": 0.6452650845050811, | |
| "num_tokens": 1500705.0, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.3282275711159737, | |
| "grad_norm": 9.4375, | |
| "learning_rate": 8.907731582786287e-06, | |
| "loss": 1.2879, | |
| "mean_token_accuracy": 0.6446281552314759, | |
| "num_tokens": 1526093.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.3336980306345733, | |
| "grad_norm": 9.9375, | |
| "learning_rate": 8.88949671772429e-06, | |
| "loss": 1.2879, | |
| "mean_token_accuracy": 0.6415457963943482, | |
| "num_tokens": 1551249.0, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.33916849015317285, | |
| "grad_norm": 8.875, | |
| "learning_rate": 8.87126185266229e-06, | |
| "loss": 1.2563, | |
| "mean_token_accuracy": 0.6477604985237122, | |
| "num_tokens": 1576597.0, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.34463894967177244, | |
| "grad_norm": 8.8125, | |
| "learning_rate": 8.853026987600293e-06, | |
| "loss": 1.2371, | |
| "mean_token_accuracy": 0.6518564283847809, | |
| "num_tokens": 1601829.0, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.350109409190372, | |
| "grad_norm": 9.0625, | |
| "learning_rate": 8.834792122538293e-06, | |
| "loss": 1.2774, | |
| "mean_token_accuracy": 0.6432484328746796, | |
| "num_tokens": 1627555.0, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.35557986870897157, | |
| "grad_norm": 8.75, | |
| "learning_rate": 8.816557257476297e-06, | |
| "loss": 1.2568, | |
| "mean_token_accuracy": 0.6520409166812897, | |
| "num_tokens": 1653045.0, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.3610503282275711, | |
| "grad_norm": 8.625, | |
| "learning_rate": 8.798322392414297e-06, | |
| "loss": 1.2746, | |
| "mean_token_accuracy": 0.6470682263374329, | |
| "num_tokens": 1678382.0, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.3665207877461707, | |
| "grad_norm": 9.25, | |
| "learning_rate": 8.780087527352298e-06, | |
| "loss": 1.2593, | |
| "mean_token_accuracy": 0.6493343353271485, | |
| "num_tokens": 1703866.0, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.37199124726477023, | |
| "grad_norm": 8.625, | |
| "learning_rate": 8.7618526622903e-06, | |
| "loss": 1.2595, | |
| "mean_token_accuracy": 0.6501845002174378, | |
| "num_tokens": 1729154.0, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.3774617067833698, | |
| "grad_norm": 9.0, | |
| "learning_rate": 8.743617797228301e-06, | |
| "loss": 1.2543, | |
| "mean_token_accuracy": 0.6474352359771729, | |
| "num_tokens": 1754459.0, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.38293216630196936, | |
| "grad_norm": 8.4375, | |
| "learning_rate": 8.725382932166303e-06, | |
| "loss": 1.2385, | |
| "mean_token_accuracy": 0.6522322833538056, | |
| "num_tokens": 1779833.0, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.38840262582056895, | |
| "grad_norm": 8.8125, | |
| "learning_rate": 8.707148067104304e-06, | |
| "loss": 1.2308, | |
| "mean_token_accuracy": 0.6520416557788848, | |
| "num_tokens": 1805297.0, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.3938730853391685, | |
| "grad_norm": 9.5625, | |
| "learning_rate": 8.688913202042306e-06, | |
| "loss": 1.2602, | |
| "mean_token_accuracy": 0.6470251500606536, | |
| "num_tokens": 1830573.0, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.3993435448577681, | |
| "grad_norm": 9.25, | |
| "learning_rate": 8.670678336980308e-06, | |
| "loss": 1.2518, | |
| "mean_token_accuracy": 0.6481884896755219, | |
| "num_tokens": 1855937.0, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.4048140043763676, | |
| "grad_norm": 9.125, | |
| "learning_rate": 8.652443471918308e-06, | |
| "loss": 1.2469, | |
| "mean_token_accuracy": 0.6519103825092316, | |
| "num_tokens": 1881096.0, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.4102844638949672, | |
| "grad_norm": 9.0625, | |
| "learning_rate": 8.634208606856311e-06, | |
| "loss": 1.236, | |
| "mean_token_accuracy": 0.6525180697441101, | |
| "num_tokens": 1906526.0, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.41575492341356673, | |
| "grad_norm": 8.8125, | |
| "learning_rate": 8.61597374179431e-06, | |
| "loss": 1.2583, | |
| "mean_token_accuracy": 0.647457766532898, | |
| "num_tokens": 1931741.0, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.4212253829321663, | |
| "grad_norm": 8.9375, | |
| "learning_rate": 8.597738876732314e-06, | |
| "loss": 1.2296, | |
| "mean_token_accuracy": 0.6541453003883362, | |
| "num_tokens": 1956816.0, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.42669584245076586, | |
| "grad_norm": 9.625, | |
| "learning_rate": 8.579504011670314e-06, | |
| "loss": 1.2343, | |
| "mean_token_accuracy": 0.6523520529270173, | |
| "num_tokens": 1981932.0, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.43216630196936545, | |
| "grad_norm": 9.0625, | |
| "learning_rate": 8.561269146608316e-06, | |
| "loss": 1.2474, | |
| "mean_token_accuracy": 0.6512964367866516, | |
| "num_tokens": 2007382.0, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.437636761487965, | |
| "grad_norm": 8.5625, | |
| "learning_rate": 8.543034281546317e-06, | |
| "loss": 1.2184, | |
| "mean_token_accuracy": 0.6564052760601043, | |
| "num_tokens": 2033121.0, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.4431072210065646, | |
| "grad_norm": 9.25, | |
| "learning_rate": 8.524799416484319e-06, | |
| "loss": 1.2495, | |
| "mean_token_accuracy": 0.6491189241409302, | |
| "num_tokens": 2058751.0, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.4485776805251641, | |
| "grad_norm": 8.8125, | |
| "learning_rate": 8.50656455142232e-06, | |
| "loss": 1.2014, | |
| "mean_token_accuracy": 0.6595002830028533, | |
| "num_tokens": 2083836.0, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.4540481400437637, | |
| "grad_norm": 9.0, | |
| "learning_rate": 8.488329686360322e-06, | |
| "loss": 1.2478, | |
| "mean_token_accuracy": 0.650605583190918, | |
| "num_tokens": 2109163.0, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.45951859956236324, | |
| "grad_norm": 8.6875, | |
| "learning_rate": 8.470094821298323e-06, | |
| "loss": 1.2486, | |
| "mean_token_accuracy": 0.6492572844028472, | |
| "num_tokens": 2134626.0, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.4649890590809628, | |
| "grad_norm": 9.125, | |
| "learning_rate": 8.451859956236325e-06, | |
| "loss": 1.2397, | |
| "mean_token_accuracy": 0.6523317098617554, | |
| "num_tokens": 2159963.0, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.47045951859956237, | |
| "grad_norm": 9.125, | |
| "learning_rate": 8.433625091174325e-06, | |
| "loss": 1.2381, | |
| "mean_token_accuracy": 0.6511462509632111, | |
| "num_tokens": 2185310.0, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.4759299781181619, | |
| "grad_norm": 9.125, | |
| "learning_rate": 8.415390226112328e-06, | |
| "loss": 1.2015, | |
| "mean_token_accuracy": 0.6603323996067048, | |
| "num_tokens": 2210642.0, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.4814004376367615, | |
| "grad_norm": 9.0625, | |
| "learning_rate": 8.397155361050328e-06, | |
| "loss": 1.2369, | |
| "mean_token_accuracy": 0.6551064074039459, | |
| "num_tokens": 2235713.0, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.486870897155361, | |
| "grad_norm": 9.25, | |
| "learning_rate": 8.378920495988331e-06, | |
| "loss": 1.2588, | |
| "mean_token_accuracy": 0.6488555371761322, | |
| "num_tokens": 2261549.0, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.4923413566739606, | |
| "grad_norm": 9.3125, | |
| "learning_rate": 8.360685630926331e-06, | |
| "loss": 1.2308, | |
| "mean_token_accuracy": 0.6573903739452363, | |
| "num_tokens": 2286614.0, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.49781181619256015, | |
| "grad_norm": 9.25, | |
| "learning_rate": 8.342450765864333e-06, | |
| "loss": 1.215, | |
| "mean_token_accuracy": 0.6597650587558747, | |
| "num_tokens": 2312037.0, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.5032822757111597, | |
| "grad_norm": 8.75, | |
| "learning_rate": 8.324215900802335e-06, | |
| "loss": 1.2239, | |
| "mean_token_accuracy": 0.6548255145549774, | |
| "num_tokens": 2337544.0, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.5087527352297593, | |
| "grad_norm": 8.8125, | |
| "learning_rate": 8.305981035740336e-06, | |
| "loss": 1.2629, | |
| "mean_token_accuracy": 0.6493556082248688, | |
| "num_tokens": 2363200.0, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.5142231947483589, | |
| "grad_norm": 9.0625, | |
| "learning_rate": 8.287746170678338e-06, | |
| "loss": 1.21, | |
| "mean_token_accuracy": 0.6589162766933441, | |
| "num_tokens": 2388450.0, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.5196936542669585, | |
| "grad_norm": 8.8125, | |
| "learning_rate": 8.26951130561634e-06, | |
| "loss": 1.2373, | |
| "mean_token_accuracy": 0.6519401609897614, | |
| "num_tokens": 2413796.0, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.5251641137855579, | |
| "grad_norm": 9.1875, | |
| "learning_rate": 8.251276440554341e-06, | |
| "loss": 1.2194, | |
| "mean_token_accuracy": 0.6579577445983886, | |
| "num_tokens": 2439412.0, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.5306345733041575, | |
| "grad_norm": 9.0, | |
| "learning_rate": 8.233041575492342e-06, | |
| "loss": 1.241, | |
| "mean_token_accuracy": 0.6511935293674469, | |
| "num_tokens": 2464594.0, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.5361050328227571, | |
| "grad_norm": 8.875, | |
| "learning_rate": 8.214806710430342e-06, | |
| "loss": 1.2325, | |
| "mean_token_accuracy": 0.6544456005096435, | |
| "num_tokens": 2490161.0, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.5415754923413567, | |
| "grad_norm": 9.3125, | |
| "learning_rate": 8.196571845368346e-06, | |
| "loss": 1.2768, | |
| "mean_token_accuracy": 0.6460224032402039, | |
| "num_tokens": 2515978.0, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.5470459518599562, | |
| "grad_norm": 8.5625, | |
| "learning_rate": 8.178336980306346e-06, | |
| "loss": 1.2196, | |
| "mean_token_accuracy": 0.6573625862598419, | |
| "num_tokens": 2541854.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.5525164113785558, | |
| "grad_norm": 8.8125, | |
| "learning_rate": 8.160102115244349e-06, | |
| "loss": 1.2044, | |
| "mean_token_accuracy": 0.6583312273025512, | |
| "num_tokens": 2566969.0, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.5579868708971554, | |
| "grad_norm": 9.25, | |
| "learning_rate": 8.141867250182349e-06, | |
| "loss": 1.2274, | |
| "mean_token_accuracy": 0.6549983143806457, | |
| "num_tokens": 2592252.0, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.563457330415755, | |
| "grad_norm": 9.0625, | |
| "learning_rate": 8.12363238512035e-06, | |
| "loss": 1.2106, | |
| "mean_token_accuracy": 0.6564074397087097, | |
| "num_tokens": 2617509.0, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.5689277899343544, | |
| "grad_norm": 8.625, | |
| "learning_rate": 8.105397520058352e-06, | |
| "loss": 1.239, | |
| "mean_token_accuracy": 0.6544763445854187, | |
| "num_tokens": 2642416.0, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.574398249452954, | |
| "grad_norm": 9.0625, | |
| "learning_rate": 8.087162654996354e-06, | |
| "loss": 1.2273, | |
| "mean_token_accuracy": 0.6501368045806885, | |
| "num_tokens": 2668075.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.5798687089715536, | |
| "grad_norm": 9.8125, | |
| "learning_rate": 8.068927789934355e-06, | |
| "loss": 1.2368, | |
| "mean_token_accuracy": 0.6530345678329468, | |
| "num_tokens": 2693387.0, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.5853391684901532, | |
| "grad_norm": 8.5, | |
| "learning_rate": 8.050692924872357e-06, | |
| "loss": 1.223, | |
| "mean_token_accuracy": 0.6532883644104004, | |
| "num_tokens": 2718953.0, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.5908096280087527, | |
| "grad_norm": 9.5625, | |
| "learning_rate": 8.032458059810358e-06, | |
| "loss": 1.2449, | |
| "mean_token_accuracy": 0.6541399002075196, | |
| "num_tokens": 2744468.0, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.5962800875273523, | |
| "grad_norm": 8.8125, | |
| "learning_rate": 8.01422319474836e-06, | |
| "loss": 1.2418, | |
| "mean_token_accuracy": 0.6520101726055145, | |
| "num_tokens": 2770101.0, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.6017505470459519, | |
| "grad_norm": 9.125, | |
| "learning_rate": 7.99598832968636e-06, | |
| "loss": 1.2313, | |
| "mean_token_accuracy": 0.6546747028827667, | |
| "num_tokens": 2795378.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.6072210065645515, | |
| "grad_norm": 9.3125, | |
| "learning_rate": 7.977753464624363e-06, | |
| "loss": 1.2274, | |
| "mean_token_accuracy": 0.6587331891059875, | |
| "num_tokens": 2820924.0, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.612691466083151, | |
| "grad_norm": 8.8125, | |
| "learning_rate": 7.959518599562363e-06, | |
| "loss": 1.2248, | |
| "mean_token_accuracy": 0.6550480246543884, | |
| "num_tokens": 2846859.0, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.6181619256017505, | |
| "grad_norm": 9.5, | |
| "learning_rate": 7.941283734500366e-06, | |
| "loss": 1.2137, | |
| "mean_token_accuracy": 0.6592613101005554, | |
| "num_tokens": 2872191.0, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.6236323851203501, | |
| "grad_norm": 9.5625, | |
| "learning_rate": 7.923048869438366e-06, | |
| "loss": 1.2026, | |
| "mean_token_accuracy": 0.661452466249466, | |
| "num_tokens": 2897466.0, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.6291028446389497, | |
| "grad_norm": 8.875, | |
| "learning_rate": 7.90481400437637e-06, | |
| "loss": 1.2344, | |
| "mean_token_accuracy": 0.6545186638832092, | |
| "num_tokens": 2923267.0, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.6345733041575492, | |
| "grad_norm": 8.875, | |
| "learning_rate": 7.88657913931437e-06, | |
| "loss": 1.2219, | |
| "mean_token_accuracy": 0.6601431190967559, | |
| "num_tokens": 2948499.0, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.6400437636761488, | |
| "grad_norm": 9.3125, | |
| "learning_rate": 7.868344274252371e-06, | |
| "loss": 1.2165, | |
| "mean_token_accuracy": 0.6581261634826661, | |
| "num_tokens": 2973709.0, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.6455142231947484, | |
| "grad_norm": 9.1875, | |
| "learning_rate": 7.850109409190373e-06, | |
| "loss": 1.2423, | |
| "mean_token_accuracy": 0.6519295990467071, | |
| "num_tokens": 2999177.0, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.650984682713348, | |
| "grad_norm": 8.625, | |
| "learning_rate": 7.831874544128374e-06, | |
| "loss": 1.2413, | |
| "mean_token_accuracy": 0.6543058276176452, | |
| "num_tokens": 3025180.0, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.6564551422319475, | |
| "grad_norm": 9.25, | |
| "learning_rate": 7.813639679066376e-06, | |
| "loss": 1.2133, | |
| "mean_token_accuracy": 0.6597758173942566, | |
| "num_tokens": 3050754.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.661925601750547, | |
| "grad_norm": 9.25, | |
| "learning_rate": 7.795404814004377e-06, | |
| "loss": 1.2227, | |
| "mean_token_accuracy": 0.6577537059783936, | |
| "num_tokens": 3075900.0, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.6673960612691466, | |
| "grad_norm": 9.0625, | |
| "learning_rate": 7.777169948942377e-06, | |
| "loss": 1.2266, | |
| "mean_token_accuracy": 0.655048793554306, | |
| "num_tokens": 3101018.0, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.6728665207877462, | |
| "grad_norm": 9.25, | |
| "learning_rate": 7.75893508388038e-06, | |
| "loss": 1.1967, | |
| "mean_token_accuracy": 0.6654035151004791, | |
| "num_tokens": 3126618.0, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.6783369803063457, | |
| "grad_norm": 9.0625, | |
| "learning_rate": 7.74070021881838e-06, | |
| "loss": 1.2273, | |
| "mean_token_accuracy": 0.6532664895057678, | |
| "num_tokens": 3151368.0, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.6838074398249453, | |
| "grad_norm": 9.0625, | |
| "learning_rate": 7.722465353756384e-06, | |
| "loss": 1.1909, | |
| "mean_token_accuracy": 0.6590378880500793, | |
| "num_tokens": 3176714.0, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.6892778993435449, | |
| "grad_norm": 9.25, | |
| "learning_rate": 7.704230488694384e-06, | |
| "loss": 1.2238, | |
| "mean_token_accuracy": 0.6598979830741882, | |
| "num_tokens": 3202322.0, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.6947483588621444, | |
| "grad_norm": 9.125, | |
| "learning_rate": 7.685995623632387e-06, | |
| "loss": 1.2273, | |
| "mean_token_accuracy": 0.6573567986488342, | |
| "num_tokens": 3227693.0, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.700218818380744, | |
| "grad_norm": 9.375, | |
| "learning_rate": 7.667760758570387e-06, | |
| "loss": 1.2031, | |
| "mean_token_accuracy": 0.6580542623996735, | |
| "num_tokens": 3252807.0, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.7056892778993435, | |
| "grad_norm": 8.9375, | |
| "learning_rate": 7.649525893508388e-06, | |
| "loss": 1.1975, | |
| "mean_token_accuracy": 0.6616823971271515, | |
| "num_tokens": 3278218.0, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.7111597374179431, | |
| "grad_norm": 9.3125, | |
| "learning_rate": 7.63129102844639e-06, | |
| "loss": 1.2157, | |
| "mean_token_accuracy": 0.6570103228092193, | |
| "num_tokens": 3304019.0, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.7166301969365426, | |
| "grad_norm": 9.4375, | |
| "learning_rate": 7.6130561633843915e-06, | |
| "loss": 1.2026, | |
| "mean_token_accuracy": 0.6590713143348694, | |
| "num_tokens": 3329456.0, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.7221006564551422, | |
| "grad_norm": 9.1875, | |
| "learning_rate": 7.594821298322393e-06, | |
| "loss": 1.2449, | |
| "mean_token_accuracy": 0.6509311914443969, | |
| "num_tokens": 3355024.0, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.7275711159737418, | |
| "grad_norm": 8.875, | |
| "learning_rate": 7.576586433260395e-06, | |
| "loss": 1.1989, | |
| "mean_token_accuracy": 0.6584409058094025, | |
| "num_tokens": 3380203.0, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.7330415754923414, | |
| "grad_norm": 9.0, | |
| "learning_rate": 7.5583515681983954e-06, | |
| "loss": 1.1704, | |
| "mean_token_accuracy": 0.6633239209651947, | |
| "num_tokens": 3405735.0, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.7385120350109409, | |
| "grad_norm": 9.3125, | |
| "learning_rate": 7.540116703136398e-06, | |
| "loss": 1.1788, | |
| "mean_token_accuracy": 0.6651252508163452, | |
| "num_tokens": 3431101.0, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.7439824945295405, | |
| "grad_norm": 9.0625, | |
| "learning_rate": 7.521881838074399e-06, | |
| "loss": 1.2175, | |
| "mean_token_accuracy": 0.6579277157783509, | |
| "num_tokens": 3456836.0, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.74945295404814, | |
| "grad_norm": 9.25, | |
| "learning_rate": 7.503646973012401e-06, | |
| "loss": 1.236, | |
| "mean_token_accuracy": 0.6502565503120422, | |
| "num_tokens": 3482262.0, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.7549234135667396, | |
| "grad_norm": 9.4375, | |
| "learning_rate": 7.485412107950402e-06, | |
| "loss": 1.2048, | |
| "mean_token_accuracy": 0.6587684571743011, | |
| "num_tokens": 3507578.0, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.7603938730853391, | |
| "grad_norm": 8.875, | |
| "learning_rate": 7.467177242888403e-06, | |
| "loss": 1.1982, | |
| "mean_token_accuracy": 0.6572374284267426, | |
| "num_tokens": 3533438.0, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.7658643326039387, | |
| "grad_norm": 9.3125, | |
| "learning_rate": 7.448942377826405e-06, | |
| "loss": 1.1968, | |
| "mean_token_accuracy": 0.6594564735889434, | |
| "num_tokens": 3558955.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.7713347921225383, | |
| "grad_norm": 9.0, | |
| "learning_rate": 7.430707512764406e-06, | |
| "loss": 1.1535, | |
| "mean_token_accuracy": 0.6660135149955749, | |
| "num_tokens": 3584024.0, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.7768052516411379, | |
| "grad_norm": 8.8125, | |
| "learning_rate": 7.412472647702408e-06, | |
| "loss": 1.1802, | |
| "mean_token_accuracy": 0.6659539043903351, | |
| "num_tokens": 3609518.0, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.7822757111597374, | |
| "grad_norm": 8.8125, | |
| "learning_rate": 7.394237782640409e-06, | |
| "loss": 1.2023, | |
| "mean_token_accuracy": 0.6609654724597931, | |
| "num_tokens": 3635205.0, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.787746170678337, | |
| "grad_norm": 9.0625, | |
| "learning_rate": 7.3760029175784105e-06, | |
| "loss": 1.2011, | |
| "mean_token_accuracy": 0.6600887596607208, | |
| "num_tokens": 3660799.0, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.7932166301969366, | |
| "grad_norm": 9.125, | |
| "learning_rate": 7.357768052516412e-06, | |
| "loss": 1.2188, | |
| "mean_token_accuracy": 0.6550438821315765, | |
| "num_tokens": 3686173.0, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.7986870897155361, | |
| "grad_norm": 8.8125, | |
| "learning_rate": 7.339533187454413e-06, | |
| "loss": 1.1869, | |
| "mean_token_accuracy": 0.6660338282585144, | |
| "num_tokens": 3711634.0, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.8041575492341356, | |
| "grad_norm": 8.5, | |
| "learning_rate": 7.321298322392415e-06, | |
| "loss": 1.2082, | |
| "mean_token_accuracy": 0.6563746392726898, | |
| "num_tokens": 3737277.0, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.8096280087527352, | |
| "grad_norm": 9.125, | |
| "learning_rate": 7.303063457330416e-06, | |
| "loss": 1.2071, | |
| "mean_token_accuracy": 0.6584551155567169, | |
| "num_tokens": 3763061.0, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.8150984682713348, | |
| "grad_norm": 9.375, | |
| "learning_rate": 7.2848285922684185e-06, | |
| "loss": 1.1978, | |
| "mean_token_accuracy": 0.6639045178890228, | |
| "num_tokens": 3788680.0, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.8205689277899344, | |
| "grad_norm": 8.875, | |
| "learning_rate": 7.266593727206419e-06, | |
| "loss": 1.2125, | |
| "mean_token_accuracy": 0.6617669343948365, | |
| "num_tokens": 3814212.0, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.8260393873085339, | |
| "grad_norm": 9.0625, | |
| "learning_rate": 7.248358862144421e-06, | |
| "loss": 1.1955, | |
| "mean_token_accuracy": 0.6609017014503479, | |
| "num_tokens": 3839753.0, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.8315098468271335, | |
| "grad_norm": 9.375, | |
| "learning_rate": 7.230123997082422e-06, | |
| "loss": 1.1969, | |
| "mean_token_accuracy": 0.6626928210258484, | |
| "num_tokens": 3865200.0, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.8369803063457331, | |
| "grad_norm": 8.75, | |
| "learning_rate": 7.211889132020423e-06, | |
| "loss": 1.2099, | |
| "mean_token_accuracy": 0.662108862400055, | |
| "num_tokens": 3891102.0, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.8424507658643327, | |
| "grad_norm": 8.8125, | |
| "learning_rate": 7.1936542669584256e-06, | |
| "loss": 1.1902, | |
| "mean_token_accuracy": 0.6671268463134765, | |
| "num_tokens": 3915979.0, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.8479212253829321, | |
| "grad_norm": 8.875, | |
| "learning_rate": 7.175419401896426e-06, | |
| "loss": 1.2321, | |
| "mean_token_accuracy": 0.651692271232605, | |
| "num_tokens": 3941101.0, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.8533916849015317, | |
| "grad_norm": 8.75, | |
| "learning_rate": 7.157184536834429e-06, | |
| "loss": 1.2018, | |
| "mean_token_accuracy": 0.6599911451339722, | |
| "num_tokens": 3966588.0, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.8588621444201313, | |
| "grad_norm": 9.6875, | |
| "learning_rate": 7.1389496717724295e-06, | |
| "loss": 1.1601, | |
| "mean_token_accuracy": 0.6702842473983764, | |
| "num_tokens": 3991816.0, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.8643326039387309, | |
| "grad_norm": 9.125, | |
| "learning_rate": 7.120714806710431e-06, | |
| "loss": 1.2061, | |
| "mean_token_accuracy": 0.6644167900085449, | |
| "num_tokens": 4017159.0, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.8698030634573304, | |
| "grad_norm": 9.0625, | |
| "learning_rate": 7.102479941648433e-06, | |
| "loss": 1.1982, | |
| "mean_token_accuracy": 0.661655330657959, | |
| "num_tokens": 4042412.0, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.87527352297593, | |
| "grad_norm": 9.25, | |
| "learning_rate": 7.0842450765864334e-06, | |
| "loss": 1.1893, | |
| "mean_token_accuracy": 0.6662971138954162, | |
| "num_tokens": 4068029.0, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.8807439824945296, | |
| "grad_norm": 8.75, | |
| "learning_rate": 7.066010211524436e-06, | |
| "loss": 1.2143, | |
| "mean_token_accuracy": 0.6562692880630493, | |
| "num_tokens": 4093581.0, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.8862144420131292, | |
| "grad_norm": 8.8125, | |
| "learning_rate": 7.047775346462437e-06, | |
| "loss": 1.164, | |
| "mean_token_accuracy": 0.6694628000259399, | |
| "num_tokens": 4118811.0, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.8916849015317286, | |
| "grad_norm": 9.25, | |
| "learning_rate": 7.029540481400439e-06, | |
| "loss": 1.2158, | |
| "mean_token_accuracy": 0.6589863717555999, | |
| "num_tokens": 4144422.0, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.8971553610503282, | |
| "grad_norm": 9.375, | |
| "learning_rate": 7.01130561633844e-06, | |
| "loss": 1.2068, | |
| "mean_token_accuracy": 0.6575411677360534, | |
| "num_tokens": 4169788.0, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.9026258205689278, | |
| "grad_norm": 9.1875, | |
| "learning_rate": 6.9930707512764405e-06, | |
| "loss": 1.2022, | |
| "mean_token_accuracy": 0.6641773998737335, | |
| "num_tokens": 4195416.0, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.9080962800875274, | |
| "grad_norm": 8.9375, | |
| "learning_rate": 6.974835886214443e-06, | |
| "loss": 1.2009, | |
| "mean_token_accuracy": 0.6612559735774994, | |
| "num_tokens": 4220959.0, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.9135667396061269, | |
| "grad_norm": 9.375, | |
| "learning_rate": 6.956601021152444e-06, | |
| "loss": 1.1772, | |
| "mean_token_accuracy": 0.6646326899528503, | |
| "num_tokens": 4246261.0, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.9190371991247265, | |
| "grad_norm": 8.75, | |
| "learning_rate": 6.938366156090446e-06, | |
| "loss": 1.2133, | |
| "mean_token_accuracy": 0.660455447435379, | |
| "num_tokens": 4271550.0, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.9245076586433261, | |
| "grad_norm": 9.5, | |
| "learning_rate": 6.920131291028447e-06, | |
| "loss": 1.198, | |
| "mean_token_accuracy": 0.661461490392685, | |
| "num_tokens": 4297359.0, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.9299781181619255, | |
| "grad_norm": 8.9375, | |
| "learning_rate": 6.901896425966449e-06, | |
| "loss": 1.2274, | |
| "mean_token_accuracy": 0.6552061200141907, | |
| "num_tokens": 4322920.0, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.9354485776805251, | |
| "grad_norm": 9.1875, | |
| "learning_rate": 6.88366156090445e-06, | |
| "loss": 1.1856, | |
| "mean_token_accuracy": 0.6640595495700836, | |
| "num_tokens": 4348175.0, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.9409190371991247, | |
| "grad_norm": 9.125, | |
| "learning_rate": 6.865426695842451e-06, | |
| "loss": 1.2074, | |
| "mean_token_accuracy": 0.6573819875717163, | |
| "num_tokens": 4373898.0, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.9463894967177243, | |
| "grad_norm": 9.0, | |
| "learning_rate": 6.847191830780453e-06, | |
| "loss": 1.2118, | |
| "mean_token_accuracy": 0.6596818029880523, | |
| "num_tokens": 4399398.0, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.9518599562363238, | |
| "grad_norm": 9.125, | |
| "learning_rate": 6.828956965718454e-06, | |
| "loss": 1.1888, | |
| "mean_token_accuracy": 0.6635434806346894, | |
| "num_tokens": 4424889.0, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.9573304157549234, | |
| "grad_norm": 9.0625, | |
| "learning_rate": 6.8107221006564564e-06, | |
| "loss": 1.1966, | |
| "mean_token_accuracy": 0.6608928442001343, | |
| "num_tokens": 4449872.0, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.962800875273523, | |
| "grad_norm": 9.4375, | |
| "learning_rate": 6.792487235594457e-06, | |
| "loss": 1.197, | |
| "mean_token_accuracy": 0.6581675052642822, | |
| "num_tokens": 4475097.0, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.9682713347921226, | |
| "grad_norm": 8.8125, | |
| "learning_rate": 6.774252370532458e-06, | |
| "loss": 1.164, | |
| "mean_token_accuracy": 0.6660536825656891, | |
| "num_tokens": 4500595.0, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.973741794310722, | |
| "grad_norm": 9.0625, | |
| "learning_rate": 6.75601750547046e-06, | |
| "loss": 1.1718, | |
| "mean_token_accuracy": 0.6656007945537568, | |
| "num_tokens": 4526142.0, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.9792122538293216, | |
| "grad_norm": 9.625, | |
| "learning_rate": 6.737782640408461e-06, | |
| "loss": 1.1778, | |
| "mean_token_accuracy": 0.670012629032135, | |
| "num_tokens": 4551204.0, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.9846827133479212, | |
| "grad_norm": 8.625, | |
| "learning_rate": 6.7195477753464636e-06, | |
| "loss": 1.2063, | |
| "mean_token_accuracy": 0.6588987648487091, | |
| "num_tokens": 4576664.0, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.9901531728665208, | |
| "grad_norm": 9.3125, | |
| "learning_rate": 6.701312910284464e-06, | |
| "loss": 1.1862, | |
| "mean_token_accuracy": 0.6618767023086548, | |
| "num_tokens": 4602021.0, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.9956236323851203, | |
| "grad_norm": 8.875, | |
| "learning_rate": 6.683078045222467e-06, | |
| "loss": 1.1911, | |
| "mean_token_accuracy": 0.6659906089305878, | |
| "num_tokens": 4627687.0, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 1.00109409190372, | |
| "grad_norm": 8.3125, | |
| "learning_rate": 6.6648431801604675e-06, | |
| "loss": 1.2073, | |
| "mean_token_accuracy": 0.6611107409000396, | |
| "num_tokens": 4653161.0, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 1.0065645514223194, | |
| "grad_norm": 8.8125, | |
| "learning_rate": 6.646608315098468e-06, | |
| "loss": 1.04, | |
| "mean_token_accuracy": 0.6974774897098541, | |
| "num_tokens": 4678651.0, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 1.012035010940919, | |
| "grad_norm": 9.75, | |
| "learning_rate": 6.628373450036471e-06, | |
| "loss": 1.0189, | |
| "mean_token_accuracy": 0.7027334988117218, | |
| "num_tokens": 4703974.0, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.0175054704595186, | |
| "grad_norm": 9.5625, | |
| "learning_rate": 6.610138584974471e-06, | |
| "loss": 1.0393, | |
| "mean_token_accuracy": 0.6992483794689178, | |
| "num_tokens": 4729182.0, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 1.0229759299781183, | |
| "grad_norm": 9.1875, | |
| "learning_rate": 6.591903719912474e-06, | |
| "loss": 1.0248, | |
| "mean_token_accuracy": 0.6999619722366333, | |
| "num_tokens": 4754700.0, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 1.0284463894967177, | |
| "grad_norm": 9.4375, | |
| "learning_rate": 6.573668854850475e-06, | |
| "loss": 1.0292, | |
| "mean_token_accuracy": 0.699607890844345, | |
| "num_tokens": 4780368.0, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 1.0339168490153172, | |
| "grad_norm": 9.5625, | |
| "learning_rate": 6.555433989788475e-06, | |
| "loss": 1.0111, | |
| "mean_token_accuracy": 0.7044564247131347, | |
| "num_tokens": 4805626.0, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 1.039387308533917, | |
| "grad_norm": 9.5, | |
| "learning_rate": 6.537199124726478e-06, | |
| "loss": 1.0417, | |
| "mean_token_accuracy": 0.6986953377723694, | |
| "num_tokens": 4830769.0, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.0448577680525164, | |
| "grad_norm": 9.875, | |
| "learning_rate": 6.5189642596644785e-06, | |
| "loss": 0.9978, | |
| "mean_token_accuracy": 0.7059278726577759, | |
| "num_tokens": 4856510.0, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 1.0503282275711159, | |
| "grad_norm": 9.875, | |
| "learning_rate": 6.500729394602481e-06, | |
| "loss": 1.041, | |
| "mean_token_accuracy": 0.6938862383365632, | |
| "num_tokens": 4881433.0, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 1.0557986870897156, | |
| "grad_norm": 9.9375, | |
| "learning_rate": 6.482494529540482e-06, | |
| "loss": 1.0299, | |
| "mean_token_accuracy": 0.7019344508647919, | |
| "num_tokens": 4906727.0, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 1.061269146608315, | |
| "grad_norm": 10.3125, | |
| "learning_rate": 6.464259664478484e-06, | |
| "loss": 1.0445, | |
| "mean_token_accuracy": 0.6943408429622651, | |
| "num_tokens": 4932196.0, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 1.0667396061269148, | |
| "grad_norm": 10.25, | |
| "learning_rate": 6.446024799416485e-06, | |
| "loss": 1.0244, | |
| "mean_token_accuracy": 0.700201416015625, | |
| "num_tokens": 4957115.0, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 1.0722100656455142, | |
| "grad_norm": 9.0625, | |
| "learning_rate": 6.427789934354486e-06, | |
| "loss": 1.0142, | |
| "mean_token_accuracy": 0.7004279494285583, | |
| "num_tokens": 4982766.0, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 1.0776805251641137, | |
| "grad_norm": 10.25, | |
| "learning_rate": 6.409555069292488e-06, | |
| "loss": 1.0371, | |
| "mean_token_accuracy": 0.6925820887088776, | |
| "num_tokens": 5008061.0, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 1.0831509846827134, | |
| "grad_norm": 9.6875, | |
| "learning_rate": 6.391320204230489e-06, | |
| "loss": 1.0356, | |
| "mean_token_accuracy": 0.7003223180770874, | |
| "num_tokens": 5033178.0, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 1.088621444201313, | |
| "grad_norm": 10.125, | |
| "learning_rate": 6.373085339168491e-06, | |
| "loss": 1.0496, | |
| "mean_token_accuracy": 0.6927937686443328, | |
| "num_tokens": 5058461.0, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 1.0940919037199124, | |
| "grad_norm": 9.9375, | |
| "learning_rate": 6.354850474106492e-06, | |
| "loss": 1.0666, | |
| "mean_token_accuracy": 0.6866900682449341, | |
| "num_tokens": 5083870.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.099562363238512, | |
| "grad_norm": 10.25, | |
| "learning_rate": 6.3366156090444944e-06, | |
| "loss": 1.0257, | |
| "mean_token_accuracy": 0.6989123821258545, | |
| "num_tokens": 5109168.0, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 1.1050328227571116, | |
| "grad_norm": 10.5, | |
| "learning_rate": 6.318380743982495e-06, | |
| "loss": 1.0333, | |
| "mean_token_accuracy": 0.6956054985523223, | |
| "num_tokens": 5134952.0, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 1.1105032822757113, | |
| "grad_norm": 9.625, | |
| "learning_rate": 6.300145878920496e-06, | |
| "loss": 1.0169, | |
| "mean_token_accuracy": 0.6954958021640778, | |
| "num_tokens": 5160606.0, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 1.1159737417943107, | |
| "grad_norm": 9.125, | |
| "learning_rate": 6.281911013858498e-06, | |
| "loss": 1.023, | |
| "mean_token_accuracy": 0.7010883867740632, | |
| "num_tokens": 5186194.0, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 1.1214442013129102, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 6.263676148796499e-06, | |
| "loss": 1.0373, | |
| "mean_token_accuracy": 0.694850617647171, | |
| "num_tokens": 5211467.0, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 1.12691466083151, | |
| "grad_norm": 9.375, | |
| "learning_rate": 6.2454412837345015e-06, | |
| "loss": 1.023, | |
| "mean_token_accuracy": 0.6988016486167907, | |
| "num_tokens": 5236817.0, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 1.1323851203501094, | |
| "grad_norm": 9.6875, | |
| "learning_rate": 6.227206418672502e-06, | |
| "loss": 1.0054, | |
| "mean_token_accuracy": 0.704439902305603, | |
| "num_tokens": 5262238.0, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 1.1378555798687089, | |
| "grad_norm": 9.6875, | |
| "learning_rate": 6.208971553610503e-06, | |
| "loss": 1.0176, | |
| "mean_token_accuracy": 0.6989111959934234, | |
| "num_tokens": 5287171.0, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 1.1433260393873086, | |
| "grad_norm": 9.4375, | |
| "learning_rate": 6.1907366885485055e-06, | |
| "loss": 1.0479, | |
| "mean_token_accuracy": 0.6994122922420501, | |
| "num_tokens": 5312851.0, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 1.148796498905908, | |
| "grad_norm": 10.0, | |
| "learning_rate": 6.172501823486506e-06, | |
| "loss": 1.0211, | |
| "mean_token_accuracy": 0.7004438996315002, | |
| "num_tokens": 5338784.0, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 1.1542669584245075, | |
| "grad_norm": 10.0, | |
| "learning_rate": 6.154266958424509e-06, | |
| "loss": 1.0014, | |
| "mean_token_accuracy": 0.7057562172412872, | |
| "num_tokens": 5364113.0, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 1.1597374179431073, | |
| "grad_norm": 9.9375, | |
| "learning_rate": 6.136032093362509e-06, | |
| "loss": 1.0, | |
| "mean_token_accuracy": 0.7027003407478333, | |
| "num_tokens": 5389799.0, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 1.1652078774617067, | |
| "grad_norm": 10.0, | |
| "learning_rate": 6.117797228300512e-06, | |
| "loss": 1.0171, | |
| "mean_token_accuracy": 0.7003865003585815, | |
| "num_tokens": 5415045.0, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 1.1706783369803064, | |
| "grad_norm": 9.75, | |
| "learning_rate": 6.099562363238513e-06, | |
| "loss": 1.0371, | |
| "mean_token_accuracy": 0.6990915298461914, | |
| "num_tokens": 5440323.0, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 1.176148796498906, | |
| "grad_norm": 9.9375, | |
| "learning_rate": 6.081327498176513e-06, | |
| "loss": 1.0348, | |
| "mean_token_accuracy": 0.7001543581485749, | |
| "num_tokens": 5465572.0, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 1.1816192560175054, | |
| "grad_norm": 9.8125, | |
| "learning_rate": 6.063092633114516e-06, | |
| "loss": 1.0207, | |
| "mean_token_accuracy": 0.6994670450687408, | |
| "num_tokens": 5491158.0, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 1.187089715536105, | |
| "grad_norm": 9.875, | |
| "learning_rate": 6.0448577680525165e-06, | |
| "loss": 1.0209, | |
| "mean_token_accuracy": 0.700932627916336, | |
| "num_tokens": 5516927.0, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 1.1925601750547046, | |
| "grad_norm": 9.9375, | |
| "learning_rate": 6.026622902990519e-06, | |
| "loss": 1.0589, | |
| "mean_token_accuracy": 0.6895469307899476, | |
| "num_tokens": 5542251.0, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 1.1980306345733043, | |
| "grad_norm": 10.375, | |
| "learning_rate": 6.00838803792852e-06, | |
| "loss": 1.0421, | |
| "mean_token_accuracy": 0.6975519716739654, | |
| "num_tokens": 5567438.0, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 1.2035010940919038, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 5.9901531728665204e-06, | |
| "loss": 1.0358, | |
| "mean_token_accuracy": 0.6961750984191895, | |
| "num_tokens": 5592726.0, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.2089715536105032, | |
| "grad_norm": 9.4375, | |
| "learning_rate": 5.971918307804523e-06, | |
| "loss": 1.0394, | |
| "mean_token_accuracy": 0.6967484354972839, | |
| "num_tokens": 5618018.0, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 1.214442013129103, | |
| "grad_norm": 9.5625, | |
| "learning_rate": 5.953683442742524e-06, | |
| "loss": 1.0465, | |
| "mean_token_accuracy": 0.6956069469451904, | |
| "num_tokens": 5643607.0, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 1.2199124726477024, | |
| "grad_norm": 9.8125, | |
| "learning_rate": 5.935448577680526e-06, | |
| "loss": 1.0473, | |
| "mean_token_accuracy": 0.6949532628059387, | |
| "num_tokens": 5668817.0, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 1.225382932166302, | |
| "grad_norm": 9.625, | |
| "learning_rate": 5.917213712618527e-06, | |
| "loss": 0.9865, | |
| "mean_token_accuracy": 0.7074730455875397, | |
| "num_tokens": 5694321.0, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 1.2308533916849016, | |
| "grad_norm": 10.0, | |
| "learning_rate": 5.898978847556529e-06, | |
| "loss": 1.039, | |
| "mean_token_accuracy": 0.6975044906139374, | |
| "num_tokens": 5719832.0, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 1.236323851203501, | |
| "grad_norm": 9.875, | |
| "learning_rate": 5.88074398249453e-06, | |
| "loss": 1.0395, | |
| "mean_token_accuracy": 0.6937489449977875, | |
| "num_tokens": 5745267.0, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 1.2417943107221006, | |
| "grad_norm": 9.4375, | |
| "learning_rate": 5.862509117432531e-06, | |
| "loss": 1.0194, | |
| "mean_token_accuracy": 0.6990994691848755, | |
| "num_tokens": 5770849.0, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 1.2472647702407003, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 5.844274252370533e-06, | |
| "loss": 1.0543, | |
| "mean_token_accuracy": 0.6901531100273133, | |
| "num_tokens": 5795693.0, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 1.2527352297592997, | |
| "grad_norm": 9.8125, | |
| "learning_rate": 5.826039387308534e-06, | |
| "loss": 1.0315, | |
| "mean_token_accuracy": 0.7007197082042694, | |
| "num_tokens": 5820987.0, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 1.2582056892778994, | |
| "grad_norm": 10.125, | |
| "learning_rate": 5.807804522246536e-06, | |
| "loss": 1.0334, | |
| "mean_token_accuracy": 0.6947225153446197, | |
| "num_tokens": 5846512.0, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 1.263676148796499, | |
| "grad_norm": 10.5, | |
| "learning_rate": 5.789569657184537e-06, | |
| "loss": 1.0451, | |
| "mean_token_accuracy": 0.6949298202991485, | |
| "num_tokens": 5872260.0, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 1.2691466083150984, | |
| "grad_norm": 10.0, | |
| "learning_rate": 5.771334792122538e-06, | |
| "loss": 1.0218, | |
| "mean_token_accuracy": 0.7000970780849457, | |
| "num_tokens": 5897464.0, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 1.274617067833698, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 5.75309992706054e-06, | |
| "loss": 0.9975, | |
| "mean_token_accuracy": 0.7064898908138275, | |
| "num_tokens": 5922759.0, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 1.2800875273522976, | |
| "grad_norm": 10.5, | |
| "learning_rate": 5.734865061998541e-06, | |
| "loss": 1.0501, | |
| "mean_token_accuracy": 0.693008977174759, | |
| "num_tokens": 5948112.0, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 1.2855579868708973, | |
| "grad_norm": 10.5, | |
| "learning_rate": 5.7166301969365435e-06, | |
| "loss": 1.0517, | |
| "mean_token_accuracy": 0.6909720063209533, | |
| "num_tokens": 5973664.0, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 1.2910284463894968, | |
| "grad_norm": 9.5625, | |
| "learning_rate": 5.698395331874544e-06, | |
| "loss": 1.0002, | |
| "mean_token_accuracy": 0.7026338756084443, | |
| "num_tokens": 5999362.0, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 1.2964989059080962, | |
| "grad_norm": 9.875, | |
| "learning_rate": 5.680160466812547e-06, | |
| "loss": 1.0145, | |
| "mean_token_accuracy": 0.7004771769046784, | |
| "num_tokens": 6025166.0, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 1.3019693654266957, | |
| "grad_norm": 10.0, | |
| "learning_rate": 5.661925601750547e-06, | |
| "loss": 1.01, | |
| "mean_token_accuracy": 0.7036912262439727, | |
| "num_tokens": 6050665.0, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 1.3074398249452954, | |
| "grad_norm": 10.25, | |
| "learning_rate": 5.643690736688548e-06, | |
| "loss": 1.027, | |
| "mean_token_accuracy": 0.7006226062774659, | |
| "num_tokens": 6075993.0, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 1.312910284463895, | |
| "grad_norm": 10.3125, | |
| "learning_rate": 5.6254558716265506e-06, | |
| "loss": 0.9982, | |
| "mean_token_accuracy": 0.7021169304847718, | |
| "num_tokens": 6101165.0, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 1.3183807439824946, | |
| "grad_norm": 9.75, | |
| "learning_rate": 5.607221006564551e-06, | |
| "loss": 1.0364, | |
| "mean_token_accuracy": 0.6955066680908203, | |
| "num_tokens": 6126628.0, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 1.323851203501094, | |
| "grad_norm": 10.5, | |
| "learning_rate": 5.588986141502554e-06, | |
| "loss": 1.0552, | |
| "mean_token_accuracy": 0.6929129481315612, | |
| "num_tokens": 6152116.0, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 1.3293216630196936, | |
| "grad_norm": 9.8125, | |
| "learning_rate": 5.5707512764405545e-06, | |
| "loss": 1.0342, | |
| "mean_token_accuracy": 0.6941097199916839, | |
| "num_tokens": 6177489.0, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 1.3347921225382933, | |
| "grad_norm": 9.875, | |
| "learning_rate": 5.552516411378557e-06, | |
| "loss": 1.039, | |
| "mean_token_accuracy": 0.697142231464386, | |
| "num_tokens": 6202873.0, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 1.3402625820568927, | |
| "grad_norm": 9.375, | |
| "learning_rate": 5.534281546316558e-06, | |
| "loss": 1.0287, | |
| "mean_token_accuracy": 0.6986703455448151, | |
| "num_tokens": 6228667.0, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 1.3457330415754925, | |
| "grad_norm": 9.75, | |
| "learning_rate": 5.516046681254558e-06, | |
| "loss": 1.0121, | |
| "mean_token_accuracy": 0.7000769674777985, | |
| "num_tokens": 6253923.0, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 1.351203501094092, | |
| "grad_norm": 9.75, | |
| "learning_rate": 5.497811816192561e-06, | |
| "loss": 1.011, | |
| "mean_token_accuracy": 0.7024608731269837, | |
| "num_tokens": 6279341.0, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 1.3566739606126914, | |
| "grad_norm": 9.6875, | |
| "learning_rate": 5.479576951130562e-06, | |
| "loss": 1.0576, | |
| "mean_token_accuracy": 0.6930296897888184, | |
| "num_tokens": 6304966.0, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 1.3621444201312911, | |
| "grad_norm": 10.5, | |
| "learning_rate": 5.461342086068564e-06, | |
| "loss": 0.9841, | |
| "mean_token_accuracy": 0.7117383360862732, | |
| "num_tokens": 6330323.0, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 1.3676148796498906, | |
| "grad_norm": 9.4375, | |
| "learning_rate": 5.443107221006565e-06, | |
| "loss": 1.0234, | |
| "mean_token_accuracy": 0.697024530172348, | |
| "num_tokens": 6356015.0, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 1.37308533916849, | |
| "grad_norm": 10.0, | |
| "learning_rate": 5.424872355944566e-06, | |
| "loss": 1.0141, | |
| "mean_token_accuracy": 0.7016679644584656, | |
| "num_tokens": 6381515.0, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 1.3785557986870898, | |
| "grad_norm": 9.625, | |
| "learning_rate": 5.406637490882568e-06, | |
| "loss": 1.038, | |
| "mean_token_accuracy": 0.6987586081027984, | |
| "num_tokens": 6407329.0, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 1.3840262582056893, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 5.388402625820569e-06, | |
| "loss": 1.0408, | |
| "mean_token_accuracy": 0.6964347183704376, | |
| "num_tokens": 6433411.0, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 1.3894967177242887, | |
| "grad_norm": 9.75, | |
| "learning_rate": 5.370167760758571e-06, | |
| "loss": 1.0198, | |
| "mean_token_accuracy": 0.7017770171165466, | |
| "num_tokens": 6459012.0, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 1.3949671772428884, | |
| "grad_norm": 9.9375, | |
| "learning_rate": 5.351932895696572e-06, | |
| "loss": 1.0705, | |
| "mean_token_accuracy": 0.6890166878700257, | |
| "num_tokens": 6484324.0, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 1.400437636761488, | |
| "grad_norm": 9.5625, | |
| "learning_rate": 5.333698030634574e-06, | |
| "loss": 1.0185, | |
| "mean_token_accuracy": 0.7025589048862457, | |
| "num_tokens": 6509536.0, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 1.4059080962800876, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 5.315463165572575e-06, | |
| "loss": 1.0356, | |
| "mean_token_accuracy": 0.7018558621406555, | |
| "num_tokens": 6534962.0, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 1.411378555798687, | |
| "grad_norm": 9.3125, | |
| "learning_rate": 5.297228300510576e-06, | |
| "loss": 1.0416, | |
| "mean_token_accuracy": 0.6961363673210144, | |
| "num_tokens": 6560702.0, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 1.4168490153172866, | |
| "grad_norm": 10.4375, | |
| "learning_rate": 5.278993435448578e-06, | |
| "loss": 1.0248, | |
| "mean_token_accuracy": 0.7003186762332916, | |
| "num_tokens": 6586229.0, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 1.4223194748358863, | |
| "grad_norm": 10.125, | |
| "learning_rate": 5.260758570386579e-06, | |
| "loss": 1.0435, | |
| "mean_token_accuracy": 0.6960846245288849, | |
| "num_tokens": 6611544.0, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 1.4277899343544858, | |
| "grad_norm": 9.8125, | |
| "learning_rate": 5.2425237053245814e-06, | |
| "loss": 1.0204, | |
| "mean_token_accuracy": 0.7010906517505646, | |
| "num_tokens": 6636774.0, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 1.4332603938730855, | |
| "grad_norm": 10.25, | |
| "learning_rate": 5.224288840262582e-06, | |
| "loss": 0.9894, | |
| "mean_token_accuracy": 0.7089883327484131, | |
| "num_tokens": 6661874.0, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 1.438730853391685, | |
| "grad_norm": 10.3125, | |
| "learning_rate": 5.206053975200584e-06, | |
| "loss": 1.0123, | |
| "mean_token_accuracy": 0.7034552216529846, | |
| "num_tokens": 6686911.0, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 1.4442013129102844, | |
| "grad_norm": 10.375, | |
| "learning_rate": 5.187819110138585e-06, | |
| "loss": 1.0494, | |
| "mean_token_accuracy": 0.6926229059696197, | |
| "num_tokens": 6712162.0, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 1.449671772428884, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 5.169584245076586e-06, | |
| "loss": 1.006, | |
| "mean_token_accuracy": 0.7052405416965485, | |
| "num_tokens": 6737129.0, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 1.4551422319474836, | |
| "grad_norm": 10.25, | |
| "learning_rate": 5.1513493800145886e-06, | |
| "loss": 1.0193, | |
| "mean_token_accuracy": 0.7007757365703583, | |
| "num_tokens": 6762564.0, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 1.460612691466083, | |
| "grad_norm": 9.875, | |
| "learning_rate": 5.133114514952589e-06, | |
| "loss": 1.0164, | |
| "mean_token_accuracy": 0.7008088171482086, | |
| "num_tokens": 6788114.0, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 1.4660831509846828, | |
| "grad_norm": 10.6875, | |
| "learning_rate": 5.114879649890592e-06, | |
| "loss": 1.0335, | |
| "mean_token_accuracy": 0.6988726854324341, | |
| "num_tokens": 6813529.0, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 1.4715536105032823, | |
| "grad_norm": 9.9375, | |
| "learning_rate": 5.0966447848285925e-06, | |
| "loss": 1.0457, | |
| "mean_token_accuracy": 0.697508692741394, | |
| "num_tokens": 6839250.0, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 1.4770240700218817, | |
| "grad_norm": 9.75, | |
| "learning_rate": 5.078409919766594e-06, | |
| "loss": 1.0272, | |
| "mean_token_accuracy": 0.700483775138855, | |
| "num_tokens": 6864969.0, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 1.4824945295404814, | |
| "grad_norm": 10.5625, | |
| "learning_rate": 5.060175054704596e-06, | |
| "loss": 1.0065, | |
| "mean_token_accuracy": 0.7078005909919739, | |
| "num_tokens": 6890107.0, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 1.487964989059081, | |
| "grad_norm": 10.3125, | |
| "learning_rate": 5.041940189642596e-06, | |
| "loss": 1.0484, | |
| "mean_token_accuracy": 0.6926610350608826, | |
| "num_tokens": 6915446.0, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 1.4934354485776806, | |
| "grad_norm": 9.875, | |
| "learning_rate": 5.023705324580599e-06, | |
| "loss": 0.9933, | |
| "mean_token_accuracy": 0.7024588465690613, | |
| "num_tokens": 6940768.0, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 1.49890590809628, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 5.0054704595186e-06, | |
| "loss": 1.013, | |
| "mean_token_accuracy": 0.7026680052280426, | |
| "num_tokens": 6966595.0, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 1.5043763676148796, | |
| "grad_norm": 10.6875, | |
| "learning_rate": 4.987235594456601e-06, | |
| "loss": 1.0465, | |
| "mean_token_accuracy": 0.696220201253891, | |
| "num_tokens": 6991472.0, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 1.509846827133479, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 4.969000729394603e-06, | |
| "loss": 1.0226, | |
| "mean_token_accuracy": 0.7012458741664886, | |
| "num_tokens": 7016868.0, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 1.5153172866520788, | |
| "grad_norm": 9.5625, | |
| "learning_rate": 4.950765864332604e-06, | |
| "loss": 1.0363, | |
| "mean_token_accuracy": 0.6987705647945404, | |
| "num_tokens": 7042904.0, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 1.5207877461706785, | |
| "grad_norm": 10.0, | |
| "learning_rate": 4.932530999270605e-06, | |
| "loss": 1.0403, | |
| "mean_token_accuracy": 0.6974053025245667, | |
| "num_tokens": 7068329.0, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 1.526258205689278, | |
| "grad_norm": 10.5625, | |
| "learning_rate": 4.914296134208607e-06, | |
| "loss": 1.0297, | |
| "mean_token_accuracy": 0.6953822433948517, | |
| "num_tokens": 7093465.0, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 1.5317286652078774, | |
| "grad_norm": 9.6875, | |
| "learning_rate": 4.896061269146608e-06, | |
| "loss": 1.0098, | |
| "mean_token_accuracy": 0.7032950639724731, | |
| "num_tokens": 7119262.0, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 1.537199124726477, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 4.87782640408461e-06, | |
| "loss": 1.0469, | |
| "mean_token_accuracy": 0.6940477907657623, | |
| "num_tokens": 7144654.0, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 1.5426695842450766, | |
| "grad_norm": 9.875, | |
| "learning_rate": 4.8595915390226115e-06, | |
| "loss": 0.9971, | |
| "mean_token_accuracy": 0.706735360622406, | |
| "num_tokens": 7170258.0, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 1.5481400437636763, | |
| "grad_norm": 9.75, | |
| "learning_rate": 4.841356673960613e-06, | |
| "loss": 1.0352, | |
| "mean_token_accuracy": 0.6959325015544892, | |
| "num_tokens": 7195794.0, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 1.5536105032822758, | |
| "grad_norm": 9.875, | |
| "learning_rate": 4.823121808898615e-06, | |
| "loss": 1.015, | |
| "mean_token_accuracy": 0.7041930437088013, | |
| "num_tokens": 7221353.0, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 1.5590809628008753, | |
| "grad_norm": 9.5625, | |
| "learning_rate": 4.804886943836615e-06, | |
| "loss": 0.9925, | |
| "mean_token_accuracy": 0.7065845012664795, | |
| "num_tokens": 7246656.0, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 1.5645514223194747, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 4.786652078774617e-06, | |
| "loss": 1.0202, | |
| "mean_token_accuracy": 0.7003905832767486, | |
| "num_tokens": 7271876.0, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 1.5700218818380745, | |
| "grad_norm": 10.0, | |
| "learning_rate": 4.768417213712619e-06, | |
| "loss": 1.0114, | |
| "mean_token_accuracy": 0.7012807905673981, | |
| "num_tokens": 7296834.0, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 1.575492341356674, | |
| "grad_norm": 9.875, | |
| "learning_rate": 4.75018234865062e-06, | |
| "loss": 1.0231, | |
| "mean_token_accuracy": 0.7027694880962372, | |
| "num_tokens": 7321984.0, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 1.5809628008752736, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 4.731947483588622e-06, | |
| "loss": 1.0165, | |
| "mean_token_accuracy": 0.7003744542598724, | |
| "num_tokens": 7347322.0, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 1.5864332603938731, | |
| "grad_norm": 9.9375, | |
| "learning_rate": 4.713712618526623e-06, | |
| "loss": 1.0458, | |
| "mean_token_accuracy": 0.6967971920967102, | |
| "num_tokens": 7372753.0, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 1.5919037199124726, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 4.695477753464625e-06, | |
| "loss": 1.0454, | |
| "mean_token_accuracy": 0.6949516236782074, | |
| "num_tokens": 7398016.0, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 1.597374179431072, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 4.677242888402626e-06, | |
| "loss": 1.0547, | |
| "mean_token_accuracy": 0.6950974285602569, | |
| "num_tokens": 7423781.0, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 1.6028446389496718, | |
| "grad_norm": 9.375, | |
| "learning_rate": 4.659008023340627e-06, | |
| "loss": 1.0073, | |
| "mean_token_accuracy": 0.7060118734836578, | |
| "num_tokens": 7449042.0, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 1.6083150984682715, | |
| "grad_norm": 9.5625, | |
| "learning_rate": 4.640773158278629e-06, | |
| "loss": 1.0522, | |
| "mean_token_accuracy": 0.6925396621227264, | |
| "num_tokens": 7474666.0, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 1.613785557986871, | |
| "grad_norm": 10.625, | |
| "learning_rate": 4.6225382932166305e-06, | |
| "loss": 1.0214, | |
| "mean_token_accuracy": 0.698899906873703, | |
| "num_tokens": 7500009.0, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 1.6192560175054704, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 4.604303428154632e-06, | |
| "loss": 1.002, | |
| "mean_token_accuracy": 0.7080122113227845, | |
| "num_tokens": 7524900.0, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 1.62472647702407, | |
| "grad_norm": 9.875, | |
| "learning_rate": 4.586068563092634e-06, | |
| "loss": 1.0137, | |
| "mean_token_accuracy": 0.7044197976589203, | |
| "num_tokens": 7549885.0, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 1.6301969365426696, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 4.567833698030634e-06, | |
| "loss": 1.0224, | |
| "mean_token_accuracy": 0.702244508266449, | |
| "num_tokens": 7575873.0, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 1.635667396061269, | |
| "grad_norm": 10.0, | |
| "learning_rate": 4.549598832968636e-06, | |
| "loss": 1.0231, | |
| "mean_token_accuracy": 0.7018688678741455, | |
| "num_tokens": 7601140.0, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 1.6411378555798688, | |
| "grad_norm": 10.375, | |
| "learning_rate": 4.531363967906638e-06, | |
| "loss": 1.0168, | |
| "mean_token_accuracy": 0.7000043153762817, | |
| "num_tokens": 7626744.0, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.6466083150984683, | |
| "grad_norm": 10.125, | |
| "learning_rate": 4.513129102844639e-06, | |
| "loss": 1.0382, | |
| "mean_token_accuracy": 0.6963400661945343, | |
| "num_tokens": 7652205.0, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 1.6520787746170678, | |
| "grad_norm": 9.875, | |
| "learning_rate": 4.494894237782641e-06, | |
| "loss": 1.0311, | |
| "mean_token_accuracy": 0.6954720914363861, | |
| "num_tokens": 7677302.0, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 1.6575492341356672, | |
| "grad_norm": 9.6875, | |
| "learning_rate": 4.476659372720642e-06, | |
| "loss": 1.0238, | |
| "mean_token_accuracy": 0.7008414506912232, | |
| "num_tokens": 7702641.0, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 1.663019693654267, | |
| "grad_norm": 10.6875, | |
| "learning_rate": 4.458424507658644e-06, | |
| "loss": 1.0455, | |
| "mean_token_accuracy": 0.6952131390571594, | |
| "num_tokens": 7728092.0, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 1.6684901531728666, | |
| "grad_norm": 10.5625, | |
| "learning_rate": 4.440189642596645e-06, | |
| "loss": 1.0563, | |
| "mean_token_accuracy": 0.6946309566497803, | |
| "num_tokens": 7753418.0, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 1.6739606126914661, | |
| "grad_norm": 9.875, | |
| "learning_rate": 4.421954777534646e-06, | |
| "loss": 1.0137, | |
| "mean_token_accuracy": 0.7010623216629028, | |
| "num_tokens": 7778914.0, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 1.6794310722100656, | |
| "grad_norm": 9.875, | |
| "learning_rate": 4.403719912472648e-06, | |
| "loss": 1.0461, | |
| "mean_token_accuracy": 0.694762361049652, | |
| "num_tokens": 7803962.0, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 1.684901531728665, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 4.3854850474106495e-06, | |
| "loss": 1.0144, | |
| "mean_token_accuracy": 0.6980993211269378, | |
| "num_tokens": 7829041.0, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 1.6903719912472648, | |
| "grad_norm": 9.625, | |
| "learning_rate": 4.367250182348651e-06, | |
| "loss": 1.0323, | |
| "mean_token_accuracy": 0.6983445227146149, | |
| "num_tokens": 7854555.0, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 1.6958424507658645, | |
| "grad_norm": 9.75, | |
| "learning_rate": 4.349015317286653e-06, | |
| "loss": 1.0601, | |
| "mean_token_accuracy": 0.6918485045433045, | |
| "num_tokens": 7880371.0, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 1.701312910284464, | |
| "grad_norm": 9.625, | |
| "learning_rate": 4.330780452224654e-06, | |
| "loss": 1.0337, | |
| "mean_token_accuracy": 0.6994279623031616, | |
| "num_tokens": 7905824.0, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 1.7067833698030634, | |
| "grad_norm": 9.9375, | |
| "learning_rate": 4.312545587162655e-06, | |
| "loss": 1.0253, | |
| "mean_token_accuracy": 0.6975195348262787, | |
| "num_tokens": 7931361.0, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 1.712253829321663, | |
| "grad_norm": 10.625, | |
| "learning_rate": 4.2943107221006566e-06, | |
| "loss": 1.0498, | |
| "mean_token_accuracy": 0.6924622654914856, | |
| "num_tokens": 7956719.0, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 1.7177242888402626, | |
| "grad_norm": 10.25, | |
| "learning_rate": 4.276075857038658e-06, | |
| "loss": 1.0236, | |
| "mean_token_accuracy": 0.7029446244239808, | |
| "num_tokens": 7982503.0, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 1.723194748358862, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 4.25784099197666e-06, | |
| "loss": 1.0426, | |
| "mean_token_accuracy": 0.6945299327373504, | |
| "num_tokens": 8008037.0, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 1.7286652078774618, | |
| "grad_norm": 10.125, | |
| "learning_rate": 4.239606126914661e-06, | |
| "loss": 1.0232, | |
| "mean_token_accuracy": 0.6983104586601258, | |
| "num_tokens": 8033255.0, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 1.7341356673960613, | |
| "grad_norm": 10.25, | |
| "learning_rate": 4.221371261852663e-06, | |
| "loss": 1.0267, | |
| "mean_token_accuracy": 0.7013046503067016, | |
| "num_tokens": 8058442.0, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 1.7396061269146608, | |
| "grad_norm": 10.25, | |
| "learning_rate": 4.203136396790664e-06, | |
| "loss": 1.0473, | |
| "mean_token_accuracy": 0.6944118142127991, | |
| "num_tokens": 8084197.0, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 1.7450765864332602, | |
| "grad_norm": 9.9375, | |
| "learning_rate": 4.184901531728665e-06, | |
| "loss": 1.0403, | |
| "mean_token_accuracy": 0.6959960579872131, | |
| "num_tokens": 8109669.0, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 1.75054704595186, | |
| "grad_norm": 9.5625, | |
| "learning_rate": 4.166666666666667e-06, | |
| "loss": 1.0224, | |
| "mean_token_accuracy": 0.7018200099468231, | |
| "num_tokens": 8135074.0, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.7560175054704596, | |
| "grad_norm": 9.8125, | |
| "learning_rate": 4.1484318016046685e-06, | |
| "loss": 1.0563, | |
| "mean_token_accuracy": 0.6930766403675079, | |
| "num_tokens": 8160391.0, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 1.7614879649890591, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 4.13019693654267e-06, | |
| "loss": 1.029, | |
| "mean_token_accuracy": 0.6969853162765502, | |
| "num_tokens": 8185889.0, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 1.7669584245076586, | |
| "grad_norm": 10.3125, | |
| "learning_rate": 4.111962071480672e-06, | |
| "loss": 1.0038, | |
| "mean_token_accuracy": 0.7046498537063599, | |
| "num_tokens": 8211618.0, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 1.772428884026258, | |
| "grad_norm": 10.25, | |
| "learning_rate": 4.093727206418673e-06, | |
| "loss": 1.0284, | |
| "mean_token_accuracy": 0.7003251373767853, | |
| "num_tokens": 8236773.0, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 1.7778993435448578, | |
| "grad_norm": 10.5, | |
| "learning_rate": 4.075492341356674e-06, | |
| "loss": 1.0244, | |
| "mean_token_accuracy": 0.6951791286468506, | |
| "num_tokens": 8262043.0, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 1.7833698030634575, | |
| "grad_norm": 10.25, | |
| "learning_rate": 4.0572574762946756e-06, | |
| "loss": 1.0271, | |
| "mean_token_accuracy": 0.6958181917667389, | |
| "num_tokens": 8288205.0, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 1.788840262582057, | |
| "grad_norm": 10.375, | |
| "learning_rate": 4.039022611232677e-06, | |
| "loss": 1.0131, | |
| "mean_token_accuracy": 0.7018442392349243, | |
| "num_tokens": 8313349.0, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 1.7943107221006565, | |
| "grad_norm": 10.375, | |
| "learning_rate": 4.020787746170679e-06, | |
| "loss": 1.0274, | |
| "mean_token_accuracy": 0.7001440703868866, | |
| "num_tokens": 8338997.0, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 1.799781181619256, | |
| "grad_norm": 9.9375, | |
| "learning_rate": 4.00255288110868e-06, | |
| "loss": 1.0354, | |
| "mean_token_accuracy": 0.6997708559036255, | |
| "num_tokens": 8364446.0, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 1.8052516411378556, | |
| "grad_norm": 10.75, | |
| "learning_rate": 3.984318016046682e-06, | |
| "loss": 1.0228, | |
| "mean_token_accuracy": 0.7024441123008728, | |
| "num_tokens": 8390005.0, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.8107221006564551, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 3.9660831509846835e-06, | |
| "loss": 1.0369, | |
| "mean_token_accuracy": 0.6996273756027221, | |
| "num_tokens": 8415557.0, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 1.8161925601750548, | |
| "grad_norm": 9.6875, | |
| "learning_rate": 3.947848285922684e-06, | |
| "loss": 1.0282, | |
| "mean_token_accuracy": 0.6991647839546203, | |
| "num_tokens": 8441067.0, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 1.8216630196936543, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 3.929613420860686e-06, | |
| "loss": 1.0441, | |
| "mean_token_accuracy": 0.6979943752288819, | |
| "num_tokens": 8466748.0, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 1.8271334792122538, | |
| "grad_norm": 10.8125, | |
| "learning_rate": 3.9113785557986875e-06, | |
| "loss": 1.0425, | |
| "mean_token_accuracy": 0.6951029121875762, | |
| "num_tokens": 8491752.0, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 1.8326039387308533, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 3.893143690736689e-06, | |
| "loss": 1.0116, | |
| "mean_token_accuracy": 0.7038478553295135, | |
| "num_tokens": 8517225.0, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 1.838074398249453, | |
| "grad_norm": 9.4375, | |
| "learning_rate": 3.874908825674691e-06, | |
| "loss": 1.0348, | |
| "mean_token_accuracy": 0.6978711068630219, | |
| "num_tokens": 8542666.0, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 1.8435448577680527, | |
| "grad_norm": 9.875, | |
| "learning_rate": 3.856673960612692e-06, | |
| "loss": 1.0388, | |
| "mean_token_accuracy": 0.6979636132717133, | |
| "num_tokens": 8568498.0, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 1.8490153172866521, | |
| "grad_norm": 9.625, | |
| "learning_rate": 3.838439095550693e-06, | |
| "loss": 1.0077, | |
| "mean_token_accuracy": 0.7020620405673981, | |
| "num_tokens": 8593770.0, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 1.8544857768052516, | |
| "grad_norm": 10.625, | |
| "learning_rate": 3.8202042304886946e-06, | |
| "loss": 1.0382, | |
| "mean_token_accuracy": 0.696090292930603, | |
| "num_tokens": 8619623.0, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 1.859956236323851, | |
| "grad_norm": 11.4375, | |
| "learning_rate": 3.8019693654266957e-06, | |
| "loss": 1.0215, | |
| "mean_token_accuracy": 0.7019686102867126, | |
| "num_tokens": 8644852.0, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.8654266958424508, | |
| "grad_norm": 10.75, | |
| "learning_rate": 3.7837345003646973e-06, | |
| "loss": 1.0507, | |
| "mean_token_accuracy": 0.695297920703888, | |
| "num_tokens": 8670333.0, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 1.8708971553610503, | |
| "grad_norm": 9.8125, | |
| "learning_rate": 3.765499635302699e-06, | |
| "loss": 1.0505, | |
| "mean_token_accuracy": 0.6986589133739471, | |
| "num_tokens": 8695967.0, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 1.87636761487965, | |
| "grad_norm": 10.625, | |
| "learning_rate": 3.7472647702407005e-06, | |
| "loss": 1.0098, | |
| "mean_token_accuracy": 0.7004635334014893, | |
| "num_tokens": 8721252.0, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 1.8818380743982495, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 3.729029905178702e-06, | |
| "loss": 1.0037, | |
| "mean_token_accuracy": 0.7041628360748291, | |
| "num_tokens": 8746944.0, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 1.887308533916849, | |
| "grad_norm": 10.4375, | |
| "learning_rate": 3.7107950401167037e-06, | |
| "loss": 1.0375, | |
| "mean_token_accuracy": 0.6980779647827149, | |
| "num_tokens": 8771892.0, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 1.8927789934354484, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 3.692560175054705e-06, | |
| "loss": 0.9951, | |
| "mean_token_accuracy": 0.7054601550102234, | |
| "num_tokens": 8797131.0, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 1.8982494529540481, | |
| "grad_norm": 10.5, | |
| "learning_rate": 3.674325309992706e-06, | |
| "loss": 0.9912, | |
| "mean_token_accuracy": 0.7073158025741577, | |
| "num_tokens": 8822466.0, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 1.9037199124726478, | |
| "grad_norm": 9.4375, | |
| "learning_rate": 3.6560904449307076e-06, | |
| "loss": 1.0059, | |
| "mean_token_accuracy": 0.7047175526618957, | |
| "num_tokens": 8847987.0, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 1.9091903719912473, | |
| "grad_norm": 10.3125, | |
| "learning_rate": 3.637855579868709e-06, | |
| "loss": 1.0167, | |
| "mean_token_accuracy": 0.7027063131332397, | |
| "num_tokens": 8873940.0, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 1.9146608315098468, | |
| "grad_norm": 9.8125, | |
| "learning_rate": 3.619620714806711e-06, | |
| "loss": 1.0089, | |
| "mean_token_accuracy": 0.7014587640762329, | |
| "num_tokens": 8899633.0, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.9201312910284463, | |
| "grad_norm": 9.5, | |
| "learning_rate": 3.6013858497447124e-06, | |
| "loss": 1.0017, | |
| "mean_token_accuracy": 0.7041820049285888, | |
| "num_tokens": 8924575.0, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 1.925601750547046, | |
| "grad_norm": 10.375, | |
| "learning_rate": 3.5831509846827136e-06, | |
| "loss": 1.0165, | |
| "mean_token_accuracy": 0.6998761057853699, | |
| "num_tokens": 8950150.0, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 1.9310722100656457, | |
| "grad_norm": 10.5, | |
| "learning_rate": 3.5649161196207147e-06, | |
| "loss": 0.9949, | |
| "mean_token_accuracy": 0.7060512363910675, | |
| "num_tokens": 8975532.0, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 1.9365426695842451, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 3.5466812545587163e-06, | |
| "loss": 1.026, | |
| "mean_token_accuracy": 0.7015553712844849, | |
| "num_tokens": 9000990.0, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 1.9420131291028446, | |
| "grad_norm": 9.8125, | |
| "learning_rate": 3.528446389496718e-06, | |
| "loss": 1.0334, | |
| "mean_token_accuracy": 0.7001884341239929, | |
| "num_tokens": 9026718.0, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 1.947483588621444, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 3.5102115244347195e-06, | |
| "loss": 1.0248, | |
| "mean_token_accuracy": 0.6993842363357544, | |
| "num_tokens": 9052433.0, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 1.9529540481400438, | |
| "grad_norm": 9.875, | |
| "learning_rate": 3.491976659372721e-06, | |
| "loss": 1.0273, | |
| "mean_token_accuracy": 0.7003478467464447, | |
| "num_tokens": 9077782.0, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 1.9584245076586433, | |
| "grad_norm": 10.5, | |
| "learning_rate": 3.4737417943107223e-06, | |
| "loss": 1.0047, | |
| "mean_token_accuracy": 0.7034225165843964, | |
| "num_tokens": 9103349.0, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 1.963894967177243, | |
| "grad_norm": 10.0, | |
| "learning_rate": 3.455506929248724e-06, | |
| "loss": 1.047, | |
| "mean_token_accuracy": 0.693518990278244, | |
| "num_tokens": 9129033.0, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 1.9693654266958425, | |
| "grad_norm": 10.25, | |
| "learning_rate": 3.437272064186725e-06, | |
| "loss": 1.042, | |
| "mean_token_accuracy": 0.698636132478714, | |
| "num_tokens": 9154512.0, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.974835886214442, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 3.4190371991247266e-06, | |
| "loss": 1.0232, | |
| "mean_token_accuracy": 0.7020216822624207, | |
| "num_tokens": 9179746.0, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 1.9803063457330414, | |
| "grad_norm": 10.5625, | |
| "learning_rate": 3.400802334062728e-06, | |
| "loss": 1.0277, | |
| "mean_token_accuracy": 0.6972903072834015, | |
| "num_tokens": 9205381.0, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 1.9857768052516411, | |
| "grad_norm": 11.0625, | |
| "learning_rate": 3.38256746900073e-06, | |
| "loss": 1.0172, | |
| "mean_token_accuracy": 0.7026154100894928, | |
| "num_tokens": 9230261.0, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 1.9912472647702408, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 3.3643326039387314e-06, | |
| "loss": 1.01, | |
| "mean_token_accuracy": 0.7005713403224945, | |
| "num_tokens": 9255736.0, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 1.9967177242888403, | |
| "grad_norm": 9.8125, | |
| "learning_rate": 3.3460977388767325e-06, | |
| "loss": 1.0041, | |
| "mean_token_accuracy": 0.7037198424339295, | |
| "num_tokens": 9281129.0, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 2.00218818380744, | |
| "grad_norm": 9.875, | |
| "learning_rate": 3.327862873814734e-06, | |
| "loss": 1.0031, | |
| "mean_token_accuracy": 0.7007030785083771, | |
| "num_tokens": 9306380.0, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 2.0076586433260393, | |
| "grad_norm": 9.875, | |
| "learning_rate": 3.3096280087527353e-06, | |
| "loss": 0.9473, | |
| "mean_token_accuracy": 0.717999279499054, | |
| "num_tokens": 9331366.0, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 2.0131291028446388, | |
| "grad_norm": 9.625, | |
| "learning_rate": 3.291393143690737e-06, | |
| "loss": 0.9626, | |
| "mean_token_accuracy": 0.717300283908844, | |
| "num_tokens": 9356901.0, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 2.0185995623632387, | |
| "grad_norm": 10.3125, | |
| "learning_rate": 3.2731582786287385e-06, | |
| "loss": 0.9602, | |
| "mean_token_accuracy": 0.7138874650001525, | |
| "num_tokens": 9382234.0, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 2.024070021881838, | |
| "grad_norm": 9.75, | |
| "learning_rate": 3.25492341356674e-06, | |
| "loss": 0.9654, | |
| "mean_token_accuracy": 0.712285041809082, | |
| "num_tokens": 9407437.0, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 2.0295404814004376, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 3.2366885485047412e-06, | |
| "loss": 0.9342, | |
| "mean_token_accuracy": 0.7209572613239288, | |
| "num_tokens": 9432454.0, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 2.035010940919037, | |
| "grad_norm": 10.75, | |
| "learning_rate": 3.218453683442743e-06, | |
| "loss": 0.9631, | |
| "mean_token_accuracy": 0.7131079971790314, | |
| "num_tokens": 9457857.0, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 2.0404814004376366, | |
| "grad_norm": 10.3125, | |
| "learning_rate": 3.200218818380744e-06, | |
| "loss": 0.9577, | |
| "mean_token_accuracy": 0.7177065372467041, | |
| "num_tokens": 9482811.0, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 2.0459518599562365, | |
| "grad_norm": 10.9375, | |
| "learning_rate": 3.1819839533187456e-06, | |
| "loss": 0.9614, | |
| "mean_token_accuracy": 0.7131213903427124, | |
| "num_tokens": 9508186.0, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 2.051422319474836, | |
| "grad_norm": 10.75, | |
| "learning_rate": 3.163749088256747e-06, | |
| "loss": 0.9639, | |
| "mean_token_accuracy": 0.7165208160877228, | |
| "num_tokens": 9533509.0, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 2.0568927789934355, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 3.1455142231947488e-06, | |
| "loss": 0.9535, | |
| "mean_token_accuracy": 0.7169022679328918, | |
| "num_tokens": 9558600.0, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 2.062363238512035, | |
| "grad_norm": 10.5625, | |
| "learning_rate": 3.12727935813275e-06, | |
| "loss": 0.9646, | |
| "mean_token_accuracy": 0.7135210871696472, | |
| "num_tokens": 9584233.0, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 2.0678336980306344, | |
| "grad_norm": 10.25, | |
| "learning_rate": 3.1090444930707515e-06, | |
| "loss": 0.9786, | |
| "mean_token_accuracy": 0.709892874956131, | |
| "num_tokens": 9609789.0, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 2.0733041575492344, | |
| "grad_norm": 10.375, | |
| "learning_rate": 3.090809628008753e-06, | |
| "loss": 0.9738, | |
| "mean_token_accuracy": 0.7121241211891174, | |
| "num_tokens": 9634822.0, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 2.078774617067834, | |
| "grad_norm": 11.25, | |
| "learning_rate": 3.0725747629467543e-06, | |
| "loss": 0.9616, | |
| "mean_token_accuracy": 0.7132679998874665, | |
| "num_tokens": 9660314.0, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 2.0842450765864333, | |
| "grad_norm": 10.625, | |
| "learning_rate": 3.054339897884756e-06, | |
| "loss": 0.9579, | |
| "mean_token_accuracy": 0.7166781783103943, | |
| "num_tokens": 9685937.0, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 2.089715536105033, | |
| "grad_norm": 10.8125, | |
| "learning_rate": 3.0361050328227575e-06, | |
| "loss": 0.9741, | |
| "mean_token_accuracy": 0.714539396762848, | |
| "num_tokens": 9711177.0, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 2.0951859956236323, | |
| "grad_norm": 11.125, | |
| "learning_rate": 3.0178701677607587e-06, | |
| "loss": 0.9463, | |
| "mean_token_accuracy": 0.7216998755931854, | |
| "num_tokens": 9736426.0, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 2.1006564551422318, | |
| "grad_norm": 10.25, | |
| "learning_rate": 2.9996353026987602e-06, | |
| "loss": 0.9648, | |
| "mean_token_accuracy": 0.7115364730358124, | |
| "num_tokens": 9762127.0, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 2.1061269146608317, | |
| "grad_norm": 10.75, | |
| "learning_rate": 2.981400437636762e-06, | |
| "loss": 0.9475, | |
| "mean_token_accuracy": 0.7189037084579468, | |
| "num_tokens": 9787178.0, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 2.111597374179431, | |
| "grad_norm": 10.5, | |
| "learning_rate": 2.9631655725747634e-06, | |
| "loss": 0.9537, | |
| "mean_token_accuracy": 0.7175340712070465, | |
| "num_tokens": 9812717.0, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 2.1170678336980306, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 2.9449307075127646e-06, | |
| "loss": 0.9472, | |
| "mean_token_accuracy": 0.7212161302566529, | |
| "num_tokens": 9837881.0, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 2.12253829321663, | |
| "grad_norm": 10.3125, | |
| "learning_rate": 2.926695842450766e-06, | |
| "loss": 0.9608, | |
| "mean_token_accuracy": 0.7193179249763488, | |
| "num_tokens": 9863317.0, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 2.1280087527352296, | |
| "grad_norm": 10.5625, | |
| "learning_rate": 2.9084609773887674e-06, | |
| "loss": 0.951, | |
| "mean_token_accuracy": 0.7160651385784149, | |
| "num_tokens": 9888868.0, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 2.1334792122538295, | |
| "grad_norm": 10.25, | |
| "learning_rate": 2.890226112326769e-06, | |
| "loss": 0.9581, | |
| "mean_token_accuracy": 0.716667366027832, | |
| "num_tokens": 9914219.0, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 2.138949671772429, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 2.8719912472647705e-06, | |
| "loss": 0.9637, | |
| "mean_token_accuracy": 0.7133105576038361, | |
| "num_tokens": 9939476.0, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 2.1444201312910285, | |
| "grad_norm": 10.3125, | |
| "learning_rate": 2.853756382202772e-06, | |
| "loss": 0.9541, | |
| "mean_token_accuracy": 0.7155322432518005, | |
| "num_tokens": 9964926.0, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 2.149890590809628, | |
| "grad_norm": 10.4375, | |
| "learning_rate": 2.8355215171407733e-06, | |
| "loss": 0.9801, | |
| "mean_token_accuracy": 0.7113901436328888, | |
| "num_tokens": 9990537.0, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 2.1553610503282274, | |
| "grad_norm": 10.3125, | |
| "learning_rate": 2.817286652078775e-06, | |
| "loss": 0.9581, | |
| "mean_token_accuracy": 0.7181893765926362, | |
| "num_tokens": 10016002.0, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 2.160831509846827, | |
| "grad_norm": 10.4375, | |
| "learning_rate": 2.799051787016776e-06, | |
| "loss": 0.947, | |
| "mean_token_accuracy": 0.7155714333057404, | |
| "num_tokens": 10041372.0, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 2.166301969365427, | |
| "grad_norm": 10.25, | |
| "learning_rate": 2.7808169219547776e-06, | |
| "loss": 0.9951, | |
| "mean_token_accuracy": 0.705439954996109, | |
| "num_tokens": 10066639.0, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 2.1717724288840263, | |
| "grad_norm": 9.9375, | |
| "learning_rate": 2.7625820568927792e-06, | |
| "loss": 0.9621, | |
| "mean_token_accuracy": 0.715647429227829, | |
| "num_tokens": 10091974.0, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 2.177242888402626, | |
| "grad_norm": 10.5, | |
| "learning_rate": 2.744347191830781e-06, | |
| "loss": 0.9731, | |
| "mean_token_accuracy": 0.7148273229598999, | |
| "num_tokens": 10117300.0, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 2.1827133479212253, | |
| "grad_norm": 10.125, | |
| "learning_rate": 2.7261123267687824e-06, | |
| "loss": 0.9625, | |
| "mean_token_accuracy": 0.7132592558860779, | |
| "num_tokens": 10142825.0, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 2.1881838074398248, | |
| "grad_norm": 10.5625, | |
| "learning_rate": 2.7078774617067836e-06, | |
| "loss": 0.9887, | |
| "mean_token_accuracy": 0.7106162488460541, | |
| "num_tokens": 10168114.0, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 2.1936542669584247, | |
| "grad_norm": 9.9375, | |
| "learning_rate": 2.6896425966447848e-06, | |
| "loss": 0.9496, | |
| "mean_token_accuracy": 0.721660703420639, | |
| "num_tokens": 10193441.0, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 2.199124726477024, | |
| "grad_norm": 10.375, | |
| "learning_rate": 2.6714077315827863e-06, | |
| "loss": 0.9785, | |
| "mean_token_accuracy": 0.7132415533065796, | |
| "num_tokens": 10219252.0, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 2.2045951859956237, | |
| "grad_norm": 10.25, | |
| "learning_rate": 2.653172866520788e-06, | |
| "loss": 0.9639, | |
| "mean_token_accuracy": 0.7168880999088287, | |
| "num_tokens": 10244918.0, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 2.210065645514223, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 2.6349380014587895e-06, | |
| "loss": 0.9507, | |
| "mean_token_accuracy": 0.7177555739879609, | |
| "num_tokens": 10270446.0, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 2.2155361050328226, | |
| "grad_norm": 10.4375, | |
| "learning_rate": 2.616703136396791e-06, | |
| "loss": 0.9464, | |
| "mean_token_accuracy": 0.7184527516365051, | |
| "num_tokens": 10295965.0, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 2.2210065645514225, | |
| "grad_norm": 9.8125, | |
| "learning_rate": 2.5984682713347927e-06, | |
| "loss": 0.9547, | |
| "mean_token_accuracy": 0.7186037957668304, | |
| "num_tokens": 10321256.0, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 2.226477024070022, | |
| "grad_norm": 10.8125, | |
| "learning_rate": 2.580233406272794e-06, | |
| "loss": 0.9507, | |
| "mean_token_accuracy": 0.7183585882186889, | |
| "num_tokens": 10346406.0, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 2.2319474835886215, | |
| "grad_norm": 10.5, | |
| "learning_rate": 2.561998541210795e-06, | |
| "loss": 0.9862, | |
| "mean_token_accuracy": 0.710406482219696, | |
| "num_tokens": 10371870.0, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 2.237417943107221, | |
| "grad_norm": 10.375, | |
| "learning_rate": 2.5437636761487966e-06, | |
| "loss": 0.9938, | |
| "mean_token_accuracy": 0.7080509960651398, | |
| "num_tokens": 10397552.0, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 2.2428884026258205, | |
| "grad_norm": 10.375, | |
| "learning_rate": 2.5255288110867982e-06, | |
| "loss": 0.962, | |
| "mean_token_accuracy": 0.7114235162734985, | |
| "num_tokens": 10423310.0, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 2.24835886214442, | |
| "grad_norm": 10.375, | |
| "learning_rate": 2.5072939460248e-06, | |
| "loss": 0.9784, | |
| "mean_token_accuracy": 0.7133292317390442, | |
| "num_tokens": 10448405.0, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 2.25382932166302, | |
| "grad_norm": 9.8125, | |
| "learning_rate": 2.489059080962801e-06, | |
| "loss": 0.9837, | |
| "mean_token_accuracy": 0.7114726364612579, | |
| "num_tokens": 10474015.0, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 2.2592997811816193, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 2.4708242159008026e-06, | |
| "loss": 0.9765, | |
| "mean_token_accuracy": 0.7124983072280884, | |
| "num_tokens": 10500122.0, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 2.264770240700219, | |
| "grad_norm": 11.0, | |
| "learning_rate": 2.452589350838804e-06, | |
| "loss": 1.0051, | |
| "mean_token_accuracy": 0.7033932983875275, | |
| "num_tokens": 10525569.0, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 2.2702407002188183, | |
| "grad_norm": 11.1875, | |
| "learning_rate": 2.4343544857768053e-06, | |
| "loss": 0.9537, | |
| "mean_token_accuracy": 0.7190083563327789, | |
| "num_tokens": 10550809.0, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 2.2757111597374178, | |
| "grad_norm": 11.0, | |
| "learning_rate": 2.416119620714807e-06, | |
| "loss": 0.9574, | |
| "mean_token_accuracy": 0.7156289637088775, | |
| "num_tokens": 10576391.0, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 2.2811816192560173, | |
| "grad_norm": 10.3125, | |
| "learning_rate": 2.3978847556528085e-06, | |
| "loss": 0.9615, | |
| "mean_token_accuracy": 0.7188243508338928, | |
| "num_tokens": 10601261.0, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 2.286652078774617, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 2.3796498905908097e-06, | |
| "loss": 0.9545, | |
| "mean_token_accuracy": 0.715437775850296, | |
| "num_tokens": 10626305.0, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 2.2921225382932167, | |
| "grad_norm": 11.0, | |
| "learning_rate": 2.3614150255288113e-06, | |
| "loss": 0.9931, | |
| "mean_token_accuracy": 0.7077142417430877, | |
| "num_tokens": 10652029.0, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 2.297592997811816, | |
| "grad_norm": 10.25, | |
| "learning_rate": 2.343180160466813e-06, | |
| "loss": 0.976, | |
| "mean_token_accuracy": 0.7103690207004547, | |
| "num_tokens": 10677356.0, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 2.3030634573304156, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 2.324945295404814e-06, | |
| "loss": 0.9647, | |
| "mean_token_accuracy": 0.7177474439144135, | |
| "num_tokens": 10703431.0, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 2.308533916849015, | |
| "grad_norm": 10.125, | |
| "learning_rate": 2.3067104303428156e-06, | |
| "loss": 0.9758, | |
| "mean_token_accuracy": 0.7121629536151886, | |
| "num_tokens": 10728992.0, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 2.314004376367615, | |
| "grad_norm": 10.125, | |
| "learning_rate": 2.2884755652808172e-06, | |
| "loss": 0.934, | |
| "mean_token_accuracy": 0.7213209450244904, | |
| "num_tokens": 10754404.0, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 2.3194748358862145, | |
| "grad_norm": 9.75, | |
| "learning_rate": 2.270240700218819e-06, | |
| "loss": 0.9461, | |
| "mean_token_accuracy": 0.719011914730072, | |
| "num_tokens": 10779944.0, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 2.324945295404814, | |
| "grad_norm": 11.0625, | |
| "learning_rate": 2.25200583515682e-06, | |
| "loss": 0.9966, | |
| "mean_token_accuracy": 0.7089413404464722, | |
| "num_tokens": 10805085.0, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 2.3304157549234135, | |
| "grad_norm": 10.375, | |
| "learning_rate": 2.2337709700948216e-06, | |
| "loss": 0.9707, | |
| "mean_token_accuracy": 0.7119170904159546, | |
| "num_tokens": 10830483.0, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 2.335886214442013, | |
| "grad_norm": 10.5625, | |
| "learning_rate": 2.215536105032823e-06, | |
| "loss": 0.9641, | |
| "mean_token_accuracy": 0.7150032758712769, | |
| "num_tokens": 10856177.0, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 2.341356673960613, | |
| "grad_norm": 10.625, | |
| "learning_rate": 2.1973012399708243e-06, | |
| "loss": 0.971, | |
| "mean_token_accuracy": 0.7109645545482636, | |
| "num_tokens": 10882042.0, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 2.3468271334792123, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 2.179066374908826e-06, | |
| "loss": 0.9215, | |
| "mean_token_accuracy": 0.7236122965812684, | |
| "num_tokens": 10907588.0, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 2.352297592997812, | |
| "grad_norm": 11.0, | |
| "learning_rate": 2.1608315098468275e-06, | |
| "loss": 0.9629, | |
| "mean_token_accuracy": 0.7140246748924255, | |
| "num_tokens": 10932821.0, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 2.3577680525164113, | |
| "grad_norm": 11.0, | |
| "learning_rate": 2.1425966447848287e-06, | |
| "loss": 0.9542, | |
| "mean_token_accuracy": 0.712429267168045, | |
| "num_tokens": 10958451.0, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 2.363238512035011, | |
| "grad_norm": 10.25, | |
| "learning_rate": 2.1243617797228303e-06, | |
| "loss": 0.9602, | |
| "mean_token_accuracy": 0.7146483659744263, | |
| "num_tokens": 10983957.0, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 2.3687089715536107, | |
| "grad_norm": 10.375, | |
| "learning_rate": 2.106126914660832e-06, | |
| "loss": 0.991, | |
| "mean_token_accuracy": 0.7093435227870941, | |
| "num_tokens": 11009581.0, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 2.37417943107221, | |
| "grad_norm": 10.5625, | |
| "learning_rate": 2.087892049598833e-06, | |
| "loss": 0.9806, | |
| "mean_token_accuracy": 0.7109127819538117, | |
| "num_tokens": 11035199.0, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 2.3796498905908097, | |
| "grad_norm": 10.0, | |
| "learning_rate": 2.0696571845368346e-06, | |
| "loss": 0.9605, | |
| "mean_token_accuracy": 0.7149803459644317, | |
| "num_tokens": 11061011.0, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 2.385120350109409, | |
| "grad_norm": 10.8125, | |
| "learning_rate": 2.0514223194748362e-06, | |
| "loss": 0.9667, | |
| "mean_token_accuracy": 0.7125830769538879, | |
| "num_tokens": 11086545.0, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 2.3905908096280086, | |
| "grad_norm": 10.25, | |
| "learning_rate": 2.0331874544128374e-06, | |
| "loss": 0.9716, | |
| "mean_token_accuracy": 0.7115569293498993, | |
| "num_tokens": 11111650.0, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 2.3960612691466086, | |
| "grad_norm": 9.875, | |
| "learning_rate": 2.014952589350839e-06, | |
| "loss": 0.9835, | |
| "mean_token_accuracy": 0.7110448122024536, | |
| "num_tokens": 11136976.0, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 2.401531728665208, | |
| "grad_norm": 10.5, | |
| "learning_rate": 1.9967177242888406e-06, | |
| "loss": 0.9696, | |
| "mean_token_accuracy": 0.7116839528083801, | |
| "num_tokens": 11162409.0, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 2.4070021881838075, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 1.9784828592268417e-06, | |
| "loss": 0.9518, | |
| "mean_token_accuracy": 0.7186504125595092, | |
| "num_tokens": 11187779.0, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 2.412472647702407, | |
| "grad_norm": 10.3125, | |
| "learning_rate": 1.9602479941648433e-06, | |
| "loss": 0.9594, | |
| "mean_token_accuracy": 0.7139133214950562, | |
| "num_tokens": 11213494.0, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 2.4179431072210065, | |
| "grad_norm": 10.8125, | |
| "learning_rate": 1.942013129102845e-06, | |
| "loss": 0.9563, | |
| "mean_token_accuracy": 0.7193277597427368, | |
| "num_tokens": 11238852.0, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 2.423413566739606, | |
| "grad_norm": 9.8125, | |
| "learning_rate": 1.923778264040846e-06, | |
| "loss": 0.9477, | |
| "mean_token_accuracy": 0.7188430905342102, | |
| "num_tokens": 11264480.0, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 2.428884026258206, | |
| "grad_norm": 10.625, | |
| "learning_rate": 1.9055433989788477e-06, | |
| "loss": 0.9579, | |
| "mean_token_accuracy": 0.7149973511695862, | |
| "num_tokens": 11290175.0, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 2.4343544857768054, | |
| "grad_norm": 10.5625, | |
| "learning_rate": 1.8873085339168493e-06, | |
| "loss": 0.9772, | |
| "mean_token_accuracy": 0.7096493124961853, | |
| "num_tokens": 11315301.0, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 2.439824945295405, | |
| "grad_norm": 10.125, | |
| "learning_rate": 1.8690736688548504e-06, | |
| "loss": 0.9463, | |
| "mean_token_accuracy": 0.7188547492027283, | |
| "num_tokens": 11340809.0, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 2.4452954048140043, | |
| "grad_norm": 10.375, | |
| "learning_rate": 1.850838803792852e-06, | |
| "loss": 0.9753, | |
| "mean_token_accuracy": 0.7100773870944976, | |
| "num_tokens": 11366227.0, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 2.450765864332604, | |
| "grad_norm": 10.8125, | |
| "learning_rate": 1.8326039387308536e-06, | |
| "loss": 0.9786, | |
| "mean_token_accuracy": 0.7101668000221253, | |
| "num_tokens": 11391908.0, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 2.4562363238512033, | |
| "grad_norm": 10.5625, | |
| "learning_rate": 1.814369073668855e-06, | |
| "loss": 0.9722, | |
| "mean_token_accuracy": 0.7132100880146026, | |
| "num_tokens": 11417099.0, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 2.461706783369803, | |
| "grad_norm": 10.25, | |
| "learning_rate": 1.7961342086068564e-06, | |
| "loss": 0.9672, | |
| "mean_token_accuracy": 0.7163193345069885, | |
| "num_tokens": 11442529.0, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 2.4671772428884027, | |
| "grad_norm": 10.5, | |
| "learning_rate": 1.777899343544858e-06, | |
| "loss": 0.9741, | |
| "mean_token_accuracy": 0.7144057810306549, | |
| "num_tokens": 11467943.0, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 2.472647702407002, | |
| "grad_norm": 10.125, | |
| "learning_rate": 1.7596644784828594e-06, | |
| "loss": 0.99, | |
| "mean_token_accuracy": 0.7097407221794129, | |
| "num_tokens": 11493655.0, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 2.4781181619256016, | |
| "grad_norm": 9.5625, | |
| "learning_rate": 1.7414296134208607e-06, | |
| "loss": 0.9242, | |
| "mean_token_accuracy": 0.7254151999950409, | |
| "num_tokens": 11519074.0, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 2.483588621444201, | |
| "grad_norm": 10.6875, | |
| "learning_rate": 1.7231947483588623e-06, | |
| "loss": 0.9623, | |
| "mean_token_accuracy": 0.7154429078102111, | |
| "num_tokens": 11544510.0, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 2.489059080962801, | |
| "grad_norm": 10.625, | |
| "learning_rate": 1.7049598832968637e-06, | |
| "loss": 0.9405, | |
| "mean_token_accuracy": 0.720922189950943, | |
| "num_tokens": 11569975.0, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 2.4945295404814005, | |
| "grad_norm": 10.375, | |
| "learning_rate": 1.686725018234865e-06, | |
| "loss": 0.9643, | |
| "mean_token_accuracy": 0.7141575336456298, | |
| "num_tokens": 11595508.0, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 10.4375, | |
| "learning_rate": 1.6684901531728667e-06, | |
| "loss": 0.9722, | |
| "mean_token_accuracy": 0.716586035490036, | |
| "num_tokens": 11621057.0, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 2.5054704595185995, | |
| "grad_norm": 9.875, | |
| "learning_rate": 1.650255288110868e-06, | |
| "loss": 0.9678, | |
| "mean_token_accuracy": 0.7154919922351837, | |
| "num_tokens": 11646277.0, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 2.510940919037199, | |
| "grad_norm": 10.375, | |
| "learning_rate": 1.6320204230488696e-06, | |
| "loss": 0.9514, | |
| "mean_token_accuracy": 0.7179181814193726, | |
| "num_tokens": 11671463.0, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 2.516411378555799, | |
| "grad_norm": 10.375, | |
| "learning_rate": 1.613785557986871e-06, | |
| "loss": 0.945, | |
| "mean_token_accuracy": 0.7157581686973572, | |
| "num_tokens": 11696766.0, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 2.5218818380743984, | |
| "grad_norm": 9.875, | |
| "learning_rate": 1.5955506929248724e-06, | |
| "loss": 0.9507, | |
| "mean_token_accuracy": 0.717986673116684, | |
| "num_tokens": 11722577.0, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 2.527352297592998, | |
| "grad_norm": 10.4375, | |
| "learning_rate": 1.577315827862874e-06, | |
| "loss": 0.9841, | |
| "mean_token_accuracy": 0.7108976006507873, | |
| "num_tokens": 11747611.0, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 2.5328227571115973, | |
| "grad_norm": 10.5, | |
| "learning_rate": 1.5590809628008754e-06, | |
| "loss": 0.9691, | |
| "mean_token_accuracy": 0.7137386500835419, | |
| "num_tokens": 11773081.0, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 2.538293216630197, | |
| "grad_norm": 10.6875, | |
| "learning_rate": 1.5408460977388768e-06, | |
| "loss": 0.9968, | |
| "mean_token_accuracy": 0.7075820744037629, | |
| "num_tokens": 11798361.0, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 2.5437636761487967, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 1.5226112326768783e-06, | |
| "loss": 0.9582, | |
| "mean_token_accuracy": 0.7191313743591309, | |
| "num_tokens": 11823896.0, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 2.549234135667396, | |
| "grad_norm": 10.5625, | |
| "learning_rate": 1.5043763676148797e-06, | |
| "loss": 0.9983, | |
| "mean_token_accuracy": 0.7068405508995056, | |
| "num_tokens": 11849650.0, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 2.5547045951859957, | |
| "grad_norm": 10.3125, | |
| "learning_rate": 1.4861415025528813e-06, | |
| "loss": 0.9546, | |
| "mean_token_accuracy": 0.7164286077022552, | |
| "num_tokens": 11874973.0, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 2.560175054704595, | |
| "grad_norm": 10.5625, | |
| "learning_rate": 1.4679066374908827e-06, | |
| "loss": 0.9665, | |
| "mean_token_accuracy": 0.7142378628253937, | |
| "num_tokens": 11900214.0, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 2.5656455142231946, | |
| "grad_norm": 10.5, | |
| "learning_rate": 1.4496717724288843e-06, | |
| "loss": 0.9344, | |
| "mean_token_accuracy": 0.725408935546875, | |
| "num_tokens": 11925985.0, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 2.5711159737417946, | |
| "grad_norm": 10.5625, | |
| "learning_rate": 1.4314369073668857e-06, | |
| "loss": 0.9571, | |
| "mean_token_accuracy": 0.7170994579792023, | |
| "num_tokens": 11951080.0, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 2.5765864332603936, | |
| "grad_norm": 10.25, | |
| "learning_rate": 1.413202042304887e-06, | |
| "loss": 0.9723, | |
| "mean_token_accuracy": 0.7106690168380737, | |
| "num_tokens": 11976761.0, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 2.5820568927789935, | |
| "grad_norm": 11.125, | |
| "learning_rate": 1.3949671772428886e-06, | |
| "loss": 0.9664, | |
| "mean_token_accuracy": 0.7155169665813446, | |
| "num_tokens": 12002057.0, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 2.587527352297593, | |
| "grad_norm": 10.8125, | |
| "learning_rate": 1.37673231218089e-06, | |
| "loss": 0.9859, | |
| "mean_token_accuracy": 0.7098533868789673, | |
| "num_tokens": 12027349.0, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 2.5929978118161925, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 1.3584974471188914e-06, | |
| "loss": 0.9681, | |
| "mean_token_accuracy": 0.7169641613960266, | |
| "num_tokens": 12053110.0, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 2.598468271334792, | |
| "grad_norm": 11.1875, | |
| "learning_rate": 1.340262582056893e-06, | |
| "loss": 0.9769, | |
| "mean_token_accuracy": 0.7144347429275513, | |
| "num_tokens": 12078660.0, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 2.6039387308533914, | |
| "grad_norm": 10.0, | |
| "learning_rate": 1.3220277169948944e-06, | |
| "loss": 0.9535, | |
| "mean_token_accuracy": 0.717149305343628, | |
| "num_tokens": 12104041.0, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 2.6094091903719914, | |
| "grad_norm": 10.5, | |
| "learning_rate": 1.3037928519328957e-06, | |
| "loss": 0.9509, | |
| "mean_token_accuracy": 0.7132203102111816, | |
| "num_tokens": 12129327.0, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 2.614879649890591, | |
| "grad_norm": 10.5, | |
| "learning_rate": 1.2855579868708973e-06, | |
| "loss": 0.9856, | |
| "mean_token_accuracy": 0.7076945900917053, | |
| "num_tokens": 12154514.0, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 2.6203501094091903, | |
| "grad_norm": 10.4375, | |
| "learning_rate": 1.267323121808899e-06, | |
| "loss": 0.9708, | |
| "mean_token_accuracy": 0.7127987205982208, | |
| "num_tokens": 12180006.0, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 2.62582056892779, | |
| "grad_norm": 10.5625, | |
| "learning_rate": 1.2490882567469003e-06, | |
| "loss": 0.9789, | |
| "mean_token_accuracy": 0.7122283995151519, | |
| "num_tokens": 12205688.0, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 2.6312910284463893, | |
| "grad_norm": 10.125, | |
| "learning_rate": 1.2308533916849017e-06, | |
| "loss": 0.9624, | |
| "mean_token_accuracy": 0.71814124584198, | |
| "num_tokens": 12230947.0, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 2.636761487964989, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 1.212618526622903e-06, | |
| "loss": 0.9897, | |
| "mean_token_accuracy": 0.7073604583740234, | |
| "num_tokens": 12255955.0, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 2.6422319474835887, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 1.1943836615609047e-06, | |
| "loss": 0.9558, | |
| "mean_token_accuracy": 0.7145933747291565, | |
| "num_tokens": 12281300.0, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 2.647702407002188, | |
| "grad_norm": 10.75, | |
| "learning_rate": 1.176148796498906e-06, | |
| "loss": 0.9583, | |
| "mean_token_accuracy": 0.7154503166675568, | |
| "num_tokens": 12306967.0, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 2.6531728665207877, | |
| "grad_norm": 10.625, | |
| "learning_rate": 1.1579139314369074e-06, | |
| "loss": 0.9714, | |
| "mean_token_accuracy": 0.7120263636112213, | |
| "num_tokens": 12332653.0, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 2.658643326039387, | |
| "grad_norm": 10.3125, | |
| "learning_rate": 1.139679066374909e-06, | |
| "loss": 0.9431, | |
| "mean_token_accuracy": 0.7196206390857697, | |
| "num_tokens": 12357964.0, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 2.664113785557987, | |
| "grad_norm": 9.875, | |
| "learning_rate": 1.1214442013129104e-06, | |
| "loss": 0.9235, | |
| "mean_token_accuracy": 0.72520210146904, | |
| "num_tokens": 12383359.0, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 2.6695842450765865, | |
| "grad_norm": 10.8125, | |
| "learning_rate": 1.1032093362509118e-06, | |
| "loss": 0.9541, | |
| "mean_token_accuracy": 0.7171355724334717, | |
| "num_tokens": 12408973.0, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 2.675054704595186, | |
| "grad_norm": 9.875, | |
| "learning_rate": 1.0849744711889134e-06, | |
| "loss": 0.9588, | |
| "mean_token_accuracy": 0.7152066648006439, | |
| "num_tokens": 12434174.0, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 2.6805251641137855, | |
| "grad_norm": 9.9375, | |
| "learning_rate": 1.0667396061269147e-06, | |
| "loss": 0.9417, | |
| "mean_token_accuracy": 0.7199950873851776, | |
| "num_tokens": 12459929.0, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 2.685995623632385, | |
| "grad_norm": 10.3125, | |
| "learning_rate": 1.0485047410649161e-06, | |
| "loss": 0.9403, | |
| "mean_token_accuracy": 0.7192999660968781, | |
| "num_tokens": 12485603.0, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 2.691466083150985, | |
| "grad_norm": 10.3125, | |
| "learning_rate": 1.0302698760029177e-06, | |
| "loss": 0.9588, | |
| "mean_token_accuracy": 0.7183211863040924, | |
| "num_tokens": 12510854.0, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 2.6969365426695844, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 1.012035010940919e-06, | |
| "loss": 0.9445, | |
| "mean_token_accuracy": 0.7210539758205414, | |
| "num_tokens": 12536418.0, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 2.702407002188184, | |
| "grad_norm": 10.5625, | |
| "learning_rate": 9.938001458789205e-07, | |
| "loss": 0.9721, | |
| "mean_token_accuracy": 0.713829755783081, | |
| "num_tokens": 12562114.0, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 2.7078774617067833, | |
| "grad_norm": 10.0, | |
| "learning_rate": 9.75565280816922e-07, | |
| "loss": 0.9548, | |
| "mean_token_accuracy": 0.7159802138805389, | |
| "num_tokens": 12587542.0, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 2.713347921225383, | |
| "grad_norm": 10.75, | |
| "learning_rate": 9.573304157549234e-07, | |
| "loss": 0.9454, | |
| "mean_token_accuracy": 0.7186602294445038, | |
| "num_tokens": 12613047.0, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 2.7188183807439827, | |
| "grad_norm": 11.3125, | |
| "learning_rate": 9.39095550692925e-07, | |
| "loss": 1.0026, | |
| "mean_token_accuracy": 0.7088643789291382, | |
| "num_tokens": 12638277.0, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 2.7242888402625822, | |
| "grad_norm": 10.5, | |
| "learning_rate": 9.208606856309264e-07, | |
| "loss": 0.9739, | |
| "mean_token_accuracy": 0.7109079241752625, | |
| "num_tokens": 12663642.0, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 2.7297592997811817, | |
| "grad_norm": 10.5, | |
| "learning_rate": 9.026258205689278e-07, | |
| "loss": 0.9636, | |
| "mean_token_accuracy": 0.712633740901947, | |
| "num_tokens": 12688780.0, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 2.735229759299781, | |
| "grad_norm": 10.3125, | |
| "learning_rate": 8.843909555069294e-07, | |
| "loss": 0.9593, | |
| "mean_token_accuracy": 0.7172781467437744, | |
| "num_tokens": 12714020.0, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 2.7407002188183807, | |
| "grad_norm": 10.625, | |
| "learning_rate": 8.661560904449308e-07, | |
| "loss": 0.9668, | |
| "mean_token_accuracy": 0.7146118104457855, | |
| "num_tokens": 12739229.0, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 2.74617067833698, | |
| "grad_norm": 10.4375, | |
| "learning_rate": 8.479212253829322e-07, | |
| "loss": 0.9617, | |
| "mean_token_accuracy": 0.7141262829303742, | |
| "num_tokens": 12764906.0, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 2.7516411378555796, | |
| "grad_norm": 11.0625, | |
| "learning_rate": 8.296863603209337e-07, | |
| "loss": 0.9534, | |
| "mean_token_accuracy": 0.7181793093681336, | |
| "num_tokens": 12790046.0, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 2.7571115973741795, | |
| "grad_norm": 9.8125, | |
| "learning_rate": 8.114514952589351e-07, | |
| "loss": 0.9333, | |
| "mean_token_accuracy": 0.7222773134708405, | |
| "num_tokens": 12815713.0, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 2.762582056892779, | |
| "grad_norm": 10.625, | |
| "learning_rate": 7.932166301969366e-07, | |
| "loss": 0.9581, | |
| "mean_token_accuracy": 0.7161290049552917, | |
| "num_tokens": 12841290.0, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 2.7680525164113785, | |
| "grad_norm": 10.4375, | |
| "learning_rate": 7.749817651349381e-07, | |
| "loss": 0.9709, | |
| "mean_token_accuracy": 0.7144385755062104, | |
| "num_tokens": 12867299.0, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 2.773522975929978, | |
| "grad_norm": 10.3125, | |
| "learning_rate": 7.567469000729396e-07, | |
| "loss": 0.9299, | |
| "mean_token_accuracy": 0.7200077176094055, | |
| "num_tokens": 12892111.0, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 2.7789934354485775, | |
| "grad_norm": 10.375, | |
| "learning_rate": 7.38512035010941e-07, | |
| "loss": 1.0044, | |
| "mean_token_accuracy": 0.706270956993103, | |
| "num_tokens": 12917547.0, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 2.7844638949671774, | |
| "grad_norm": 10.75, | |
| "learning_rate": 7.202771699489424e-07, | |
| "loss": 0.9561, | |
| "mean_token_accuracy": 0.7158130586147309, | |
| "num_tokens": 12942824.0, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 2.789934354485777, | |
| "grad_norm": 11.25, | |
| "learning_rate": 7.020423048869439e-07, | |
| "loss": 0.9943, | |
| "mean_token_accuracy": 0.7063575088977814, | |
| "num_tokens": 12968051.0, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 2.7954048140043763, | |
| "grad_norm": 10.0, | |
| "learning_rate": 6.838074398249453e-07, | |
| "loss": 0.9686, | |
| "mean_token_accuracy": 0.7124229729175567, | |
| "num_tokens": 12992995.0, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 2.800875273522976, | |
| "grad_norm": 10.25, | |
| "learning_rate": 6.655725747629469e-07, | |
| "loss": 0.9806, | |
| "mean_token_accuracy": 0.7110258340835571, | |
| "num_tokens": 13018611.0, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 2.8063457330415753, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 6.473377097009483e-07, | |
| "loss": 0.9462, | |
| "mean_token_accuracy": 0.7187185704708099, | |
| "num_tokens": 13044142.0, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 2.8118161925601752, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 6.291028446389497e-07, | |
| "loss": 0.9522, | |
| "mean_token_accuracy": 0.7164785265922546, | |
| "num_tokens": 13069649.0, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 2.8172866520787747, | |
| "grad_norm": 11.0, | |
| "learning_rate": 6.108679795769512e-07, | |
| "loss": 0.9323, | |
| "mean_token_accuracy": 0.7243307530879974, | |
| "num_tokens": 13094938.0, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 2.822757111597374, | |
| "grad_norm": 10.375, | |
| "learning_rate": 5.926331145149526e-07, | |
| "loss": 0.9351, | |
| "mean_token_accuracy": 0.7236999869346619, | |
| "num_tokens": 13120177.0, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 2.8282275711159737, | |
| "grad_norm": 10.375, | |
| "learning_rate": 5.743982494529541e-07, | |
| "loss": 1.0089, | |
| "mean_token_accuracy": 0.7033572435379029, | |
| "num_tokens": 13145900.0, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 2.833698030634573, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 5.561633843909556e-07, | |
| "loss": 0.9821, | |
| "mean_token_accuracy": 0.7097233951091766, | |
| "num_tokens": 13170900.0, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 2.839168490153173, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 5.37928519328957e-07, | |
| "loss": 0.9525, | |
| "mean_token_accuracy": 0.7147578775882721, | |
| "num_tokens": 13196497.0, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 2.8446389496717726, | |
| "grad_norm": 10.5, | |
| "learning_rate": 5.196936542669585e-07, | |
| "loss": 0.9752, | |
| "mean_token_accuracy": 0.7121313869953155, | |
| "num_tokens": 13221946.0, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 2.850109409190372, | |
| "grad_norm": 10.4375, | |
| "learning_rate": 5.014587892049599e-07, | |
| "loss": 0.9862, | |
| "mean_token_accuracy": 0.707955265045166, | |
| "num_tokens": 13247347.0, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 2.8555798687089715, | |
| "grad_norm": 10.5625, | |
| "learning_rate": 4.832239241429613e-07, | |
| "loss": 0.976, | |
| "mean_token_accuracy": 0.7143423020839691, | |
| "num_tokens": 13273017.0, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 2.861050328227571, | |
| "grad_norm": 10.5, | |
| "learning_rate": 4.649890590809628e-07, | |
| "loss": 0.9889, | |
| "mean_token_accuracy": 0.7104144990444183, | |
| "num_tokens": 13298377.0, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 2.866520787746171, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 4.467541940189643e-07, | |
| "loss": 0.9631, | |
| "mean_token_accuracy": 0.7177930176258087, | |
| "num_tokens": 13323779.0, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 2.8719912472647704, | |
| "grad_norm": 10.375, | |
| "learning_rate": 4.285193289569658e-07, | |
| "loss": 0.9779, | |
| "mean_token_accuracy": 0.7117682516574859, | |
| "num_tokens": 13349327.0, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 2.87746170678337, | |
| "grad_norm": 9.9375, | |
| "learning_rate": 4.102844638949672e-07, | |
| "loss": 0.9936, | |
| "mean_token_accuracy": 0.7063680112361908, | |
| "num_tokens": 13375032.0, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 2.8829321663019694, | |
| "grad_norm": 10.25, | |
| "learning_rate": 3.9204959883296864e-07, | |
| "loss": 0.9747, | |
| "mean_token_accuracy": 0.7139563202857971, | |
| "num_tokens": 13400329.0, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 2.888402625820569, | |
| "grad_norm": 10.625, | |
| "learning_rate": 3.7381473377097013e-07, | |
| "loss": 0.9668, | |
| "mean_token_accuracy": 0.7155350327491761, | |
| "num_tokens": 13426059.0, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 2.8938730853391688, | |
| "grad_norm": 10.8125, | |
| "learning_rate": 3.5557986870897156e-07, | |
| "loss": 0.958, | |
| "mean_token_accuracy": 0.7152288734912873, | |
| "num_tokens": 13451832.0, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 2.899343544857768, | |
| "grad_norm": 10.75, | |
| "learning_rate": 3.3734500364697305e-07, | |
| "loss": 0.9859, | |
| "mean_token_accuracy": 0.7133601427078247, | |
| "num_tokens": 13477401.0, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 2.9048140043763677, | |
| "grad_norm": 10.5625, | |
| "learning_rate": 3.1911013858497453e-07, | |
| "loss": 0.9678, | |
| "mean_token_accuracy": 0.7171884000301361, | |
| "num_tokens": 13502705.0, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 2.910284463894967, | |
| "grad_norm": 11.0625, | |
| "learning_rate": 3.0087527352297597e-07, | |
| "loss": 0.9621, | |
| "mean_token_accuracy": 0.717541116476059, | |
| "num_tokens": 13528062.0, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 2.9157549234135667, | |
| "grad_norm": 10.3125, | |
| "learning_rate": 2.826404084609774e-07, | |
| "loss": 0.9622, | |
| "mean_token_accuracy": 0.7154664099216461, | |
| "num_tokens": 13553750.0, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 2.921225382932166, | |
| "grad_norm": 10.4375, | |
| "learning_rate": 2.644055433989789e-07, | |
| "loss": 0.9516, | |
| "mean_token_accuracy": 0.717375636100769, | |
| "num_tokens": 13578936.0, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 2.9266958424507656, | |
| "grad_norm": 11.0, | |
| "learning_rate": 2.461706783369803e-07, | |
| "loss": 0.9518, | |
| "mean_token_accuracy": 0.7167813301086425, | |
| "num_tokens": 13604064.0, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 2.9321663019693656, | |
| "grad_norm": 10.75, | |
| "learning_rate": 2.279358132749818e-07, | |
| "loss": 0.9842, | |
| "mean_token_accuracy": 0.7137976944446563, | |
| "num_tokens": 13629601.0, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 2.937636761487965, | |
| "grad_norm": 9.8125, | |
| "learning_rate": 2.0970094821298323e-07, | |
| "loss": 0.9598, | |
| "mean_token_accuracy": 0.7161750555038452, | |
| "num_tokens": 13655161.0, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 2.9431072210065645, | |
| "grad_norm": 10.0, | |
| "learning_rate": 1.914660831509847e-07, | |
| "loss": 0.9538, | |
| "mean_token_accuracy": 0.718994963169098, | |
| "num_tokens": 13680024.0, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 2.948577680525164, | |
| "grad_norm": 10.5, | |
| "learning_rate": 1.7323121808898615e-07, | |
| "loss": 0.962, | |
| "mean_token_accuracy": 0.7169410228729248, | |
| "num_tokens": 13705181.0, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 2.9540481400437635, | |
| "grad_norm": 10.5, | |
| "learning_rate": 1.549963530269876e-07, | |
| "loss": 0.9574, | |
| "mean_token_accuracy": 0.7161156296730041, | |
| "num_tokens": 13730505.0, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 2.9595185995623634, | |
| "grad_norm": 10.5625, | |
| "learning_rate": 1.3676148796498907e-07, | |
| "loss": 0.9383, | |
| "mean_token_accuracy": 0.7220743417739868, | |
| "num_tokens": 13755959.0, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 2.964989059080963, | |
| "grad_norm": 10.375, | |
| "learning_rate": 1.1852662290299053e-07, | |
| "loss": 0.9787, | |
| "mean_token_accuracy": 0.7099229753017425, | |
| "num_tokens": 13781338.0, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 2.9704595185995624, | |
| "grad_norm": 11.0, | |
| "learning_rate": 1.0029175784099199e-07, | |
| "loss": 0.9792, | |
| "mean_token_accuracy": 0.7099815905094147, | |
| "num_tokens": 13806784.0, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 2.975929978118162, | |
| "grad_norm": 10.5625, | |
| "learning_rate": 8.205689277899343e-08, | |
| "loss": 0.9774, | |
| "mean_token_accuracy": 0.7121619462966919, | |
| "num_tokens": 13831942.0, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 2.9814004376367613, | |
| "grad_norm": 10.5, | |
| "learning_rate": 6.38220277169949e-08, | |
| "loss": 0.9726, | |
| "mean_token_accuracy": 0.7091731190681457, | |
| "num_tokens": 13857296.0, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 2.9868708971553612, | |
| "grad_norm": 11.0, | |
| "learning_rate": 4.558716265499636e-08, | |
| "loss": 0.9846, | |
| "mean_token_accuracy": 0.7077378630638123, | |
| "num_tokens": 13883120.0, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 2.9923413566739607, | |
| "grad_norm": 10.25, | |
| "learning_rate": 2.735229759299781e-08, | |
| "loss": 0.9573, | |
| "mean_token_accuracy": 0.7156158804893493, | |
| "num_tokens": 13908659.0, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 2.99781181619256, | |
| "grad_norm": 10.9375, | |
| "learning_rate": 9.117432530999272e-09, | |
| "loss": 0.9851, | |
| "mean_token_accuracy": 0.7107800126075745, | |
| "num_tokens": 13933848.0, | |
| "step": 5480 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 5484, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.9139458472604058e+17, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |