| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 24.934228187919462, |
| "eval_steps": 500, |
| "global_step": 9300, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.026845637583892617, |
| "grad_norm": 3.416386604309082, |
| "learning_rate": 3.2258064516129035e-07, |
| "loss": 2.4117, |
| "mean_token_accuracy": 0.6175116240978241, |
| "num_tokens": 24255.0, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.053691275167785234, |
| "grad_norm": 3.772399663925171, |
| "learning_rate": 6.810035842293908e-07, |
| "loss": 2.4628, |
| "mean_token_accuracy": 0.6110778599977493, |
| "num_tokens": 47320.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.08053691275167785, |
| "grad_norm": 3.545194625854492, |
| "learning_rate": 1.039426523297491e-06, |
| "loss": 2.5032, |
| "mean_token_accuracy": 0.6162596017122268, |
| "num_tokens": 69404.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.10738255033557047, |
| "grad_norm": 3.8445169925689697, |
| "learning_rate": 1.3978494623655913e-06, |
| "loss": 2.655, |
| "mean_token_accuracy": 0.5948106974363327, |
| "num_tokens": 90245.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.1342281879194631, |
| "grad_norm": 3.7103028297424316, |
| "learning_rate": 1.7562724014336918e-06, |
| "loss": 2.7566, |
| "mean_token_accuracy": 0.5874058037996293, |
| "num_tokens": 109437.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.1610738255033557, |
| "grad_norm": 3.137907028198242, |
| "learning_rate": 2.1146953405017924e-06, |
| "loss": 2.3534, |
| "mean_token_accuracy": 0.6221999943256378, |
| "num_tokens": 133516.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.18791946308724833, |
| "grad_norm": 3.168318748474121, |
| "learning_rate": 2.4731182795698927e-06, |
| "loss": 2.3546, |
| "mean_token_accuracy": 0.6222240030765533, |
| "num_tokens": 156399.0, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.21476510067114093, |
| "grad_norm": 2.6762025356292725, |
| "learning_rate": 2.831541218637993e-06, |
| "loss": 2.2976, |
| "mean_token_accuracy": 0.6325597822666168, |
| "num_tokens": 178233.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.24161073825503357, |
| "grad_norm": 2.528851270675659, |
| "learning_rate": 3.1899641577060937e-06, |
| "loss": 2.3289, |
| "mean_token_accuracy": 0.6177177101373672, |
| "num_tokens": 198867.0, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.2684563758389262, |
| "grad_norm": 2.180152654647827, |
| "learning_rate": 3.548387096774194e-06, |
| "loss": 2.2287, |
| "mean_token_accuracy": 0.6231148332357407, |
| "num_tokens": 218120.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.2953020134228188, |
| "grad_norm": 1.8343068361282349, |
| "learning_rate": 3.906810035842294e-06, |
| "loss": 1.8435, |
| "mean_token_accuracy": 0.6591462314128875, |
| "num_tokens": 242238.0, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.3221476510067114, |
| "grad_norm": 1.6624016761779785, |
| "learning_rate": 4.265232974910394e-06, |
| "loss": 1.6839, |
| "mean_token_accuracy": 0.6699382126331329, |
| "num_tokens": 265176.0, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.348993288590604, |
| "grad_norm": 1.4153294563293457, |
| "learning_rate": 4.623655913978495e-06, |
| "loss": 1.4978, |
| "mean_token_accuracy": 0.6955495923757553, |
| "num_tokens": 287280.0, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.37583892617449666, |
| "grad_norm": 1.4868972301483154, |
| "learning_rate": 4.982078853046595e-06, |
| "loss": 1.37, |
| "mean_token_accuracy": 0.7068142563104629, |
| "num_tokens": 308485.0, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.40268456375838924, |
| "grad_norm": 1.5649815797805786, |
| "learning_rate": 5.340501792114696e-06, |
| "loss": 1.1729, |
| "mean_token_accuracy": 0.7447071671485901, |
| "num_tokens": 327940.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.42953020134228187, |
| "grad_norm": 1.2401491403579712, |
| "learning_rate": 5.698924731182796e-06, |
| "loss": 0.9397, |
| "mean_token_accuracy": 0.7840387046337127, |
| "num_tokens": 352050.0, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.4563758389261745, |
| "grad_norm": 1.1661632061004639, |
| "learning_rate": 6.057347670250897e-06, |
| "loss": 0.7066, |
| "mean_token_accuracy": 0.8346484929323197, |
| "num_tokens": 374886.0, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.48322147651006714, |
| "grad_norm": 1.1714835166931152, |
| "learning_rate": 6.415770609318996e-06, |
| "loss": 0.4859, |
| "mean_token_accuracy": 0.8801979720592499, |
| "num_tokens": 396874.0, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.5100671140939598, |
| "grad_norm": 0.5148192048072815, |
| "learning_rate": 6.774193548387097e-06, |
| "loss": 0.4031, |
| "mean_token_accuracy": 0.9046478867530823, |
| "num_tokens": 417713.0, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.5369127516778524, |
| "grad_norm": 0.43239253759384155, |
| "learning_rate": 7.1326164874551975e-06, |
| "loss": 0.3182, |
| "mean_token_accuracy": 0.9254436582326889, |
| "num_tokens": 437046.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.5637583892617449, |
| "grad_norm": 0.417090505361557, |
| "learning_rate": 7.491039426523297e-06, |
| "loss": 0.3797, |
| "mean_token_accuracy": 0.9017556846141815, |
| "num_tokens": 461134.0, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.5906040268456376, |
| "grad_norm": 0.3316299617290497, |
| "learning_rate": 7.849462365591398e-06, |
| "loss": 0.304, |
| "mean_token_accuracy": 0.9168222814798355, |
| "num_tokens": 484137.0, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.6174496644295302, |
| "grad_norm": 0.3776572644710541, |
| "learning_rate": 8.207885304659498e-06, |
| "loss": 0.2544, |
| "mean_token_accuracy": 0.9300281196832657, |
| "num_tokens": 506286.0, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.6442953020134228, |
| "grad_norm": 0.5413280725479126, |
| "learning_rate": 8.5663082437276e-06, |
| "loss": 0.2859, |
| "mean_token_accuracy": 0.9251022487878799, |
| "num_tokens": 527336.0, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.6711409395973155, |
| "grad_norm": 0.5370568037033081, |
| "learning_rate": 8.9247311827957e-06, |
| "loss": 0.2791, |
| "mean_token_accuracy": 0.9306178212165832, |
| "num_tokens": 546762.0, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.697986577181208, |
| "grad_norm": 0.4821404218673706, |
| "learning_rate": 9.2831541218638e-06, |
| "loss": 0.2656, |
| "mean_token_accuracy": 0.9255098283290863, |
| "num_tokens": 570752.0, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.7248322147651006, |
| "grad_norm": 0.49469488859176636, |
| "learning_rate": 9.641577060931901e-06, |
| "loss": 0.2273, |
| "mean_token_accuracy": 0.9351761728525162, |
| "num_tokens": 593533.0, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.7516778523489933, |
| "grad_norm": 0.5606919527053833, |
| "learning_rate": 1e-05, |
| "loss": 0.2094, |
| "mean_token_accuracy": 0.9411542862653732, |
| "num_tokens": 615365.0, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.7785234899328859, |
| "grad_norm": 0.39467954635620117, |
| "learning_rate": 9.999969679947463e-06, |
| "loss": 0.2808, |
| "mean_token_accuracy": 0.9259979039430618, |
| "num_tokens": 635705.0, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.8053691275167785, |
| "grad_norm": 0.29947659373283386, |
| "learning_rate": 9.999878720157571e-06, |
| "loss": 0.21, |
| "mean_token_accuracy": 0.9384137988090515, |
| "num_tokens": 654906.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.8322147651006712, |
| "grad_norm": 0.3251740634441376, |
| "learning_rate": 9.99972712173349e-06, |
| "loss": 0.2109, |
| "mean_token_accuracy": 0.9341928958892822, |
| "num_tokens": 679030.0, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.8590604026845637, |
| "grad_norm": 0.2648128271102905, |
| "learning_rate": 9.999514886513808e-06, |
| "loss": 0.192, |
| "mean_token_accuracy": 0.9401267766952515, |
| "num_tokens": 701960.0, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.8859060402684564, |
| "grad_norm": 0.289673775434494, |
| "learning_rate": 9.999242017072517e-06, |
| "loss": 0.1515, |
| "mean_token_accuracy": 0.9540554910898209, |
| "num_tokens": 724008.0, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.912751677852349, |
| "grad_norm": 0.3500693440437317, |
| "learning_rate": 9.998908516718984e-06, |
| "loss": 0.24, |
| "mean_token_accuracy": 0.9336700767278672, |
| "num_tokens": 744883.0, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.9395973154362416, |
| "grad_norm": 0.285081684589386, |
| "learning_rate": 9.998514389497907e-06, |
| "loss": 0.2127, |
| "mean_token_accuracy": 0.9403306484222412, |
| "num_tokens": 764242.0, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.9664429530201343, |
| "grad_norm": 0.5001574754714966, |
| "learning_rate": 9.99805964018927e-06, |
| "loss": 0.1803, |
| "mean_token_accuracy": 0.944737109541893, |
| "num_tokens": 787542.0, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.9932885906040269, |
| "grad_norm": 0.4394875764846802, |
| "learning_rate": 9.997544274308282e-06, |
| "loss": 0.2054, |
| "mean_token_accuracy": 0.9406224071979523, |
| "num_tokens": 808350.0, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.018791946308725, |
| "grad_norm": 0.3800680637359619, |
| "learning_rate": 9.996968298105313e-06, |
| "loss": 0.1784, |
| "mean_token_accuracy": 0.9486302858904788, |
| "num_tokens": 829396.0, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.0456375838926175, |
| "grad_norm": 0.29024776816368103, |
| "learning_rate": 9.996331718565812e-06, |
| "loss": 0.1667, |
| "mean_token_accuracy": 0.9462663173675537, |
| "num_tokens": 852700.0, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.07248322147651, |
| "grad_norm": 0.3312149941921234, |
| "learning_rate": 9.995634543410231e-06, |
| "loss": 0.1696, |
| "mean_token_accuracy": 0.9464462220668792, |
| "num_tokens": 875028.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.0993288590604027, |
| "grad_norm": 0.41938960552215576, |
| "learning_rate": 9.994876781093923e-06, |
| "loss": 0.2146, |
| "mean_token_accuracy": 0.937797623872757, |
| "num_tokens": 896237.0, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.1261744966442953, |
| "grad_norm": 0.3066196143627167, |
| "learning_rate": 9.994058440807047e-06, |
| "loss": 0.2241, |
| "mean_token_accuracy": 0.9370527178049087, |
| "num_tokens": 915897.0, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.1530201342281878, |
| "grad_norm": 0.3139925003051758, |
| "learning_rate": 9.99317953247445e-06, |
| "loss": 0.1757, |
| "mean_token_accuracy": 0.9461874514818192, |
| "num_tokens": 938499.0, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.1798657718120806, |
| "grad_norm": 0.3327377438545227, |
| "learning_rate": 9.992240066755554e-06, |
| "loss": 0.181, |
| "mean_token_accuracy": 0.9421871662139892, |
| "num_tokens": 961628.0, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.2067114093959732, |
| "grad_norm": 0.30847689509391785, |
| "learning_rate": 9.991240055044214e-06, |
| "loss": 0.1556, |
| "mean_token_accuracy": 0.9528691500425339, |
| "num_tokens": 983811.0, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.2335570469798658, |
| "grad_norm": 0.4445633292198181, |
| "learning_rate": 9.990179509468595e-06, |
| "loss": 0.2034, |
| "mean_token_accuracy": 0.9405299901962281, |
| "num_tokens": 1005030.0, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.2604026845637584, |
| "grad_norm": 0.35106924176216125, |
| "learning_rate": 9.989058442891018e-06, |
| "loss": 0.2153, |
| "mean_token_accuracy": 0.93738272190094, |
| "num_tokens": 1024629.0, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.287248322147651, |
| "grad_norm": 0.30040764808654785, |
| "learning_rate": 9.9878768689078e-06, |
| "loss": 0.1688, |
| "mean_token_accuracy": 0.9484542042016983, |
| "num_tokens": 1047241.0, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.3140939597315437, |
| "grad_norm": 0.29934918880462646, |
| "learning_rate": 9.986634801849093e-06, |
| "loss": 0.1672, |
| "mean_token_accuracy": 0.9444483011960983, |
| "num_tokens": 1070350.0, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.3409395973154363, |
| "grad_norm": 0.3902275860309601, |
| "learning_rate": 9.985332256778719e-06, |
| "loss": 0.1622, |
| "mean_token_accuracy": 0.9500592857599258, |
| "num_tokens": 1092615.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.367785234899329, |
| "grad_norm": 0.4580014944076538, |
| "learning_rate": 9.983969249493964e-06, |
| "loss": 0.1825, |
| "mean_token_accuracy": 0.9451580941677094, |
| "num_tokens": 1114038.0, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.3946308724832215, |
| "grad_norm": 0.3057954013347626, |
| "learning_rate": 9.982545796525416e-06, |
| "loss": 0.2153, |
| "mean_token_accuracy": 0.9356414407491684, |
| "num_tokens": 1133791.0, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.421476510067114, |
| "grad_norm": 0.317940890789032, |
| "learning_rate": 9.981061915136737e-06, |
| "loss": 0.1642, |
| "mean_token_accuracy": 0.950609314441681, |
| "num_tokens": 1156476.0, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.4483221476510066, |
| "grad_norm": 0.29728659987449646, |
| "learning_rate": 9.979517623324475e-06, |
| "loss": 0.1585, |
| "mean_token_accuracy": 0.9496109008789062, |
| "num_tokens": 1179818.0, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.4751677852348992, |
| "grad_norm": 0.3028015196323395, |
| "learning_rate": 9.977912939817833e-06, |
| "loss": 0.1547, |
| "mean_token_accuracy": 0.9496413081884384, |
| "num_tokens": 1202050.0, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.5020134228187918, |
| "grad_norm": 0.5277287364006042, |
| "learning_rate": 9.976247884078445e-06, |
| "loss": 0.1941, |
| "mean_token_accuracy": 0.9424160838127136, |
| "num_tokens": 1223368.0, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.5288590604026846, |
| "grad_norm": 0.34933868050575256, |
| "learning_rate": 9.974522476300144e-06, |
| "loss": 0.2241, |
| "mean_token_accuracy": 0.9350442677736283, |
| "num_tokens": 1243174.0, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.5557046979865772, |
| "grad_norm": 0.34490832686424255, |
| "learning_rate": 9.97273673740871e-06, |
| "loss": 0.1554, |
| "mean_token_accuracy": 0.9518986463546752, |
| "num_tokens": 1265937.0, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.5825503355704698, |
| "grad_norm": 0.3113323748111725, |
| "learning_rate": 9.970890689061622e-06, |
| "loss": 0.1523, |
| "mean_token_accuracy": 0.9522456258535386, |
| "num_tokens": 1289186.0, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.6093959731543626, |
| "grad_norm": 0.2945059835910797, |
| "learning_rate": 9.968984353647796e-06, |
| "loss": 0.1594, |
| "mean_token_accuracy": 0.950118288397789, |
| "num_tokens": 1311437.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.6362416107382551, |
| "grad_norm": 0.40153175592422485, |
| "learning_rate": 9.967017754287303e-06, |
| "loss": 0.1924, |
| "mean_token_accuracy": 0.9425857335329055, |
| "num_tokens": 1332682.0, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.6630872483221477, |
| "grad_norm": 0.31573766469955444, |
| "learning_rate": 9.964990914831104e-06, |
| "loss": 0.2142, |
| "mean_token_accuracy": 0.93615363240242, |
| "num_tokens": 1352512.0, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.6899328859060403, |
| "grad_norm": 0.33446043729782104, |
| "learning_rate": 9.96290385986075e-06, |
| "loss": 0.154, |
| "mean_token_accuracy": 0.9519079566001892, |
| "num_tokens": 1375202.0, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.7167785234899329, |
| "grad_norm": 0.2740935683250427, |
| "learning_rate": 9.960756614688089e-06, |
| "loss": 0.1515, |
| "mean_token_accuracy": 0.9505283504724502, |
| "num_tokens": 1398311.0, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.7436241610738255, |
| "grad_norm": 0.2914665639400482, |
| "learning_rate": 9.958549205354956e-06, |
| "loss": 0.1472, |
| "mean_token_accuracy": 0.9522065281867981, |
| "num_tokens": 1420467.0, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.770469798657718, |
| "grad_norm": 0.48217689990997314, |
| "learning_rate": 9.956281658632856e-06, |
| "loss": 0.1726, |
| "mean_token_accuracy": 0.9457607984542846, |
| "num_tokens": 1441750.0, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.7973154362416106, |
| "grad_norm": 0.3143758773803711, |
| "learning_rate": 9.953954002022643e-06, |
| "loss": 0.2142, |
| "mean_token_accuracy": 0.9394429564476013, |
| "num_tokens": 1461475.0, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.8241610738255034, |
| "grad_norm": 0.32394182682037354, |
| "learning_rate": 9.951566263754184e-06, |
| "loss": 0.1495, |
| "mean_token_accuracy": 0.9530263602733612, |
| "num_tokens": 1484202.0, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.851006711409396, |
| "grad_norm": 0.32636895775794983, |
| "learning_rate": 9.949118472786024e-06, |
| "loss": 0.1461, |
| "mean_token_accuracy": 0.9514715582132339, |
| "num_tokens": 1507538.0, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.8778523489932886, |
| "grad_norm": 0.2944177985191345, |
| "learning_rate": 9.946610658805018e-06, |
| "loss": 0.1629, |
| "mean_token_accuracy": 0.949513065814972, |
| "num_tokens": 1529841.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.9046979865771814, |
| "grad_norm": 0.36996525526046753, |
| "learning_rate": 9.944042852225991e-06, |
| "loss": 0.1742, |
| "mean_token_accuracy": 0.9453697711229324, |
| "num_tokens": 1551120.0, |
| "step": 710 |
| }, |
| { |
| "epoch": 1.931543624161074, |
| "grad_norm": 0.3420737087726593, |
| "learning_rate": 9.94141508419135e-06, |
| "loss": 0.1999, |
| "mean_token_accuracy": 0.9439471960067749, |
| "num_tokens": 1570757.0, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.9583892617449665, |
| "grad_norm": 0.3249684274196625, |
| "learning_rate": 9.938727386570727e-06, |
| "loss": 0.144, |
| "mean_token_accuracy": 0.9547680050134659, |
| "num_tokens": 1593057.0, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.985234899328859, |
| "grad_norm": 0.39704540371894836, |
| "learning_rate": 9.935979791960571e-06, |
| "loss": 0.191, |
| "mean_token_accuracy": 0.9425975173711777, |
| "num_tokens": 1614741.0, |
| "step": 740 |
| }, |
| { |
| "epoch": 2.010738255033557, |
| "grad_norm": 0.29178890585899353, |
| "learning_rate": 9.933172333683768e-06, |
| "loss": 0.1716, |
| "mean_token_accuracy": 0.9465354273193761, |
| "num_tokens": 1634588.0, |
| "step": 750 |
| }, |
| { |
| "epoch": 2.03758389261745, |
| "grad_norm": 0.2685829699039459, |
| "learning_rate": 9.93030504578923e-06, |
| "loss": 0.1492, |
| "mean_token_accuracy": 0.9501548141241074, |
| "num_tokens": 1658163.0, |
| "step": 760 |
| }, |
| { |
| "epoch": 2.0644295302013425, |
| "grad_norm": 0.2868586778640747, |
| "learning_rate": 9.927377963051488e-06, |
| "loss": 0.148, |
| "mean_token_accuracy": 0.9499404489994049, |
| "num_tokens": 1680633.0, |
| "step": 770 |
| }, |
| { |
| "epoch": 2.091275167785235, |
| "grad_norm": 0.3987250030040741, |
| "learning_rate": 9.924391120970262e-06, |
| "loss": 0.1737, |
| "mean_token_accuracy": 0.9475592643022537, |
| "num_tokens": 1702264.0, |
| "step": 780 |
| }, |
| { |
| "epoch": 2.1181208053691276, |
| "grad_norm": 0.330563485622406, |
| "learning_rate": 9.921344555770033e-06, |
| "loss": 0.2076, |
| "mean_token_accuracy": 0.9379633396863938, |
| "num_tokens": 1722321.0, |
| "step": 790 |
| }, |
| { |
| "epoch": 2.14496644295302, |
| "grad_norm": 0.32567116618156433, |
| "learning_rate": 9.91823830439961e-06, |
| "loss": 0.1545, |
| "mean_token_accuracy": 0.9527941286563874, |
| "num_tokens": 1743586.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 2.1718120805369128, |
| "grad_norm": 0.280172735452652, |
| "learning_rate": 9.915072404531675e-06, |
| "loss": 0.1337, |
| "mean_token_accuracy": 0.9554655253887177, |
| "num_tokens": 1767189.0, |
| "step": 810 |
| }, |
| { |
| "epoch": 2.1986577181208053, |
| "grad_norm": 0.37030693888664246, |
| "learning_rate": 9.911846894562325e-06, |
| "loss": 0.1634, |
| "mean_token_accuracy": 0.9463264971971512, |
| "num_tokens": 1789747.0, |
| "step": 820 |
| }, |
| { |
| "epoch": 2.225503355704698, |
| "grad_norm": 0.38652828335762024, |
| "learning_rate": 9.908561813610615e-06, |
| "loss": 0.1542, |
| "mean_token_accuracy": 0.9503043502569198, |
| "num_tokens": 1811448.0, |
| "step": 830 |
| }, |
| { |
| "epoch": 2.2523489932885905, |
| "grad_norm": 0.31967300176620483, |
| "learning_rate": 9.905217201518079e-06, |
| "loss": 0.2012, |
| "mean_token_accuracy": 0.9394483864307404, |
| "num_tokens": 1831513.0, |
| "step": 840 |
| }, |
| { |
| "epoch": 2.279194630872483, |
| "grad_norm": 0.3107980489730835, |
| "learning_rate": 9.901813098848238e-06, |
| "loss": 0.1536, |
| "mean_token_accuracy": 0.9533493936061859, |
| "num_tokens": 1852698.0, |
| "step": 850 |
| }, |
| { |
| "epoch": 2.3060402684563757, |
| "grad_norm": 0.28780773282051086, |
| "learning_rate": 9.898349546886123e-06, |
| "loss": 0.1396, |
| "mean_token_accuracy": 0.955009663105011, |
| "num_tokens": 1876231.0, |
| "step": 860 |
| }, |
| { |
| "epoch": 2.3328859060402687, |
| "grad_norm": 0.2966178059577942, |
| "learning_rate": 9.894826587637764e-06, |
| "loss": 0.1367, |
| "mean_token_accuracy": 0.9548171132802963, |
| "num_tokens": 1898748.0, |
| "step": 870 |
| }, |
| { |
| "epoch": 2.3597315436241613, |
| "grad_norm": 0.3404034972190857, |
| "learning_rate": 9.891244263829685e-06, |
| "loss": 0.1484, |
| "mean_token_accuracy": 0.9537680149078369, |
| "num_tokens": 1920372.0, |
| "step": 880 |
| }, |
| { |
| "epoch": 2.386577181208054, |
| "grad_norm": 0.3345947861671448, |
| "learning_rate": 9.887602618908384e-06, |
| "loss": 0.2133, |
| "mean_token_accuracy": 0.9383688002824784, |
| "num_tokens": 1940538.0, |
| "step": 890 |
| }, |
| { |
| "epoch": 2.4134228187919464, |
| "grad_norm": 0.3276788294315338, |
| "learning_rate": 9.883901697039809e-06, |
| "loss": 0.1556, |
| "mean_token_accuracy": 0.9519165605306625, |
| "num_tokens": 1961802.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 2.440268456375839, |
| "grad_norm": 0.30133068561553955, |
| "learning_rate": 9.880141543108816e-06, |
| "loss": 0.1413, |
| "mean_token_accuracy": 0.9523742258548736, |
| "num_tokens": 1985264.0, |
| "step": 910 |
| }, |
| { |
| "epoch": 2.4671140939597316, |
| "grad_norm": 0.33560168743133545, |
| "learning_rate": 9.876322202718633e-06, |
| "loss": 0.1488, |
| "mean_token_accuracy": 0.9502911448478699, |
| "num_tokens": 2007692.0, |
| "step": 920 |
| }, |
| { |
| "epoch": 2.493959731543624, |
| "grad_norm": 0.33598729968070984, |
| "learning_rate": 9.8724437221903e-06, |
| "loss": 0.1609, |
| "mean_token_accuracy": 0.9487929224967957, |
| "num_tokens": 2029244.0, |
| "step": 930 |
| }, |
| { |
| "epoch": 2.5208053691275167, |
| "grad_norm": 0.3846241533756256, |
| "learning_rate": 9.868506148562107e-06, |
| "loss": 0.2107, |
| "mean_token_accuracy": 0.9374660611152649, |
| "num_tokens": 2049380.0, |
| "step": 940 |
| }, |
| { |
| "epoch": 2.5476510067114093, |
| "grad_norm": 0.33732545375823975, |
| "learning_rate": 9.864509529589034e-06, |
| "loss": 0.1465, |
| "mean_token_accuracy": 0.9547136723995209, |
| "num_tokens": 2070666.0, |
| "step": 950 |
| }, |
| { |
| "epoch": 2.574496644295302, |
| "grad_norm": 0.310533732175827, |
| "learning_rate": 9.860453913742158e-06, |
| "loss": 0.1427, |
| "mean_token_accuracy": 0.952840319275856, |
| "num_tokens": 2094161.0, |
| "step": 960 |
| }, |
| { |
| "epoch": 2.6013422818791945, |
| "grad_norm": 0.2950354218482971, |
| "learning_rate": 9.856339350208073e-06, |
| "loss": 0.1539, |
| "mean_token_accuracy": 0.9479620784521103, |
| "num_tokens": 2116615.0, |
| "step": 970 |
| }, |
| { |
| "epoch": 2.6281879194630875, |
| "grad_norm": 0.37915998697280884, |
| "learning_rate": 9.852165888888294e-06, |
| "loss": 0.1475, |
| "mean_token_accuracy": 0.9523311793804169, |
| "num_tokens": 2138198.0, |
| "step": 980 |
| }, |
| { |
| "epoch": 2.6550335570469796, |
| "grad_norm": 0.39890754222869873, |
| "learning_rate": 9.847933580398645e-06, |
| "loss": 0.2259, |
| "mean_token_accuracy": 0.9327600687742233, |
| "num_tokens": 2158392.0, |
| "step": 990 |
| }, |
| { |
| "epoch": 2.6818791946308727, |
| "grad_norm": 0.38095587491989136, |
| "learning_rate": 9.843642476068654e-06, |
| "loss": 0.1577, |
| "mean_token_accuracy": 0.9513375163078308, |
| "num_tokens": 2179772.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 2.7087248322147652, |
| "grad_norm": 0.2671376168727875, |
| "learning_rate": 9.839292627940924e-06, |
| "loss": 0.1362, |
| "mean_token_accuracy": 0.9555162519216538, |
| "num_tokens": 2203441.0, |
| "step": 1010 |
| }, |
| { |
| "epoch": 2.735570469798658, |
| "grad_norm": 0.3001173436641693, |
| "learning_rate": 9.834884088770504e-06, |
| "loss": 0.1532, |
| "mean_token_accuracy": 0.9491879791021347, |
| "num_tokens": 2226076.0, |
| "step": 1020 |
| }, |
| { |
| "epoch": 2.7624161073825504, |
| "grad_norm": 0.36988887190818787, |
| "learning_rate": 9.83041691202425e-06, |
| "loss": 0.1406, |
| "mean_token_accuracy": 0.9558630377054215, |
| "num_tokens": 2247831.0, |
| "step": 1030 |
| }, |
| { |
| "epoch": 2.789261744966443, |
| "grad_norm": 0.3370855450630188, |
| "learning_rate": 9.825891151880176e-06, |
| "loss": 0.2107, |
| "mean_token_accuracy": 0.9384757906198502, |
| "num_tokens": 2268043.0, |
| "step": 1040 |
| }, |
| { |
| "epoch": 2.8161073825503355, |
| "grad_norm": 0.40412017703056335, |
| "learning_rate": 9.821306863226796e-06, |
| "loss": 0.1582, |
| "mean_token_accuracy": 0.95028136074543, |
| "num_tokens": 2289294.0, |
| "step": 1050 |
| }, |
| { |
| "epoch": 2.842953020134228, |
| "grad_norm": 0.32156631350517273, |
| "learning_rate": 9.816664101662458e-06, |
| "loss": 0.1344, |
| "mean_token_accuracy": 0.9564105361700058, |
| "num_tokens": 2312830.0, |
| "step": 1060 |
| }, |
| { |
| "epoch": 2.8697986577181207, |
| "grad_norm": 0.3493017554283142, |
| "learning_rate": 9.811962923494674e-06, |
| "loss": 0.1551, |
| "mean_token_accuracy": 0.9494620323181152, |
| "num_tokens": 2335317.0, |
| "step": 1070 |
| }, |
| { |
| "epoch": 2.8966442953020133, |
| "grad_norm": 0.36799290776252747, |
| "learning_rate": 9.80720338573943e-06, |
| "loss": 0.1578, |
| "mean_token_accuracy": 0.9477977395057678, |
| "num_tokens": 2356912.0, |
| "step": 1080 |
| }, |
| { |
| "epoch": 2.9234899328859063, |
| "grad_norm": 0.34753313660621643, |
| "learning_rate": 9.802385546120498e-06, |
| "loss": 0.2169, |
| "mean_token_accuracy": 0.9361104846000672, |
| "num_tokens": 2377088.0, |
| "step": 1090 |
| }, |
| { |
| "epoch": 2.9503355704697984, |
| "grad_norm": 0.3603476881980896, |
| "learning_rate": 9.797509463068743e-06, |
| "loss": 0.1571, |
| "mean_token_accuracy": 0.9502560168504715, |
| "num_tokens": 2398159.0, |
| "step": 1100 |
| }, |
| { |
| "epoch": 2.9771812080536915, |
| "grad_norm": 0.3038477301597595, |
| "learning_rate": 9.7925751957214e-06, |
| "loss": 0.152, |
| "mean_token_accuracy": 0.949856498837471, |
| "num_tokens": 2420698.0, |
| "step": 1110 |
| }, |
| { |
| "epoch": 3.002684563758389, |
| "grad_norm": 0.33368000388145447, |
| "learning_rate": 9.787582803921366e-06, |
| "loss": 0.1827, |
| "mean_token_accuracy": 0.9435529865716633, |
| "num_tokens": 2439628.0, |
| "step": 1120 |
| }, |
| { |
| "epoch": 3.029530201342282, |
| "grad_norm": 0.2882815897464752, |
| "learning_rate": 9.782532348216475e-06, |
| "loss": 0.1371, |
| "mean_token_accuracy": 0.9531569749116897, |
| "num_tokens": 2463670.0, |
| "step": 1130 |
| }, |
| { |
| "epoch": 3.0563758389261744, |
| "grad_norm": 0.3468344211578369, |
| "learning_rate": 9.777423889858759e-06, |
| "loss": 0.1413, |
| "mean_token_accuracy": 0.9515006303787231, |
| "num_tokens": 2486545.0, |
| "step": 1140 |
| }, |
| { |
| "epoch": 3.083221476510067, |
| "grad_norm": 0.3093087077140808, |
| "learning_rate": 9.77225749080371e-06, |
| "loss": 0.1362, |
| "mean_token_accuracy": 0.9559379696846009, |
| "num_tokens": 2508549.0, |
| "step": 1150 |
| }, |
| { |
| "epoch": 3.1100671140939595, |
| "grad_norm": 0.4076331853866577, |
| "learning_rate": 9.767033213709525e-06, |
| "loss": 0.1911, |
| "mean_token_accuracy": 0.9427547425031662, |
| "num_tokens": 2529303.0, |
| "step": 1160 |
| }, |
| { |
| "epoch": 3.1369127516778526, |
| "grad_norm": 0.4216303825378418, |
| "learning_rate": 9.761751121936342e-06, |
| "loss": 0.1698, |
| "mean_token_accuracy": 0.947964558005333, |
| "num_tokens": 2549099.0, |
| "step": 1170 |
| }, |
| { |
| "epoch": 3.163758389261745, |
| "grad_norm": 0.2814992070198059, |
| "learning_rate": 9.756411279545486e-06, |
| "loss": 0.1329, |
| "mean_token_accuracy": 0.9554579347372055, |
| "num_tokens": 2573053.0, |
| "step": 1180 |
| }, |
| { |
| "epoch": 3.1906040268456377, |
| "grad_norm": 0.31067585945129395, |
| "learning_rate": 9.751013751298674e-06, |
| "loss": 0.1416, |
| "mean_token_accuracy": 0.951316300034523, |
| "num_tokens": 2595891.0, |
| "step": 1190 |
| }, |
| { |
| "epoch": 3.2174496644295303, |
| "grad_norm": 0.3256937265396118, |
| "learning_rate": 9.745558602657244e-06, |
| "loss": 0.1418, |
| "mean_token_accuracy": 0.952664366364479, |
| "num_tokens": 2617778.0, |
| "step": 1200 |
| }, |
| { |
| "epoch": 3.244295302013423, |
| "grad_norm": 0.35490092635154724, |
| "learning_rate": 9.740045899781353e-06, |
| "loss": 0.1947, |
| "mean_token_accuracy": 0.9404710203409195, |
| "num_tokens": 2638355.0, |
| "step": 1210 |
| }, |
| { |
| "epoch": 3.2711409395973154, |
| "grad_norm": 0.34426459670066833, |
| "learning_rate": 9.734475709529177e-06, |
| "loss": 0.1674, |
| "mean_token_accuracy": 0.9495470136404037, |
| "num_tokens": 2658221.0, |
| "step": 1220 |
| }, |
| { |
| "epoch": 3.297986577181208, |
| "grad_norm": 0.3160524070262909, |
| "learning_rate": 9.7288480994561e-06, |
| "loss": 0.1375, |
| "mean_token_accuracy": 0.9538306951522827, |
| "num_tokens": 2682262.0, |
| "step": 1230 |
| }, |
| { |
| "epoch": 3.3248322147651006, |
| "grad_norm": 0.33500564098358154, |
| "learning_rate": 9.723163137813898e-06, |
| "loss": 0.1418, |
| "mean_token_accuracy": 0.9511039316654205, |
| "num_tokens": 2705103.0, |
| "step": 1240 |
| }, |
| { |
| "epoch": 3.351677852348993, |
| "grad_norm": 0.34578338265419006, |
| "learning_rate": 9.717420893549902e-06, |
| "loss": 0.1413, |
| "mean_token_accuracy": 0.9548570722341537, |
| "num_tokens": 2727045.0, |
| "step": 1250 |
| }, |
| { |
| "epoch": 3.3785234899328858, |
| "grad_norm": 0.33035096526145935, |
| "learning_rate": 9.711621436306172e-06, |
| "loss": 0.198, |
| "mean_token_accuracy": 0.9407137960195542, |
| "num_tokens": 2747667.0, |
| "step": 1260 |
| }, |
| { |
| "epoch": 3.4053691275167783, |
| "grad_norm": 0.34439653158187866, |
| "learning_rate": 9.705764836418648e-06, |
| "loss": 0.1598, |
| "mean_token_accuracy": 0.9528309881687165, |
| "num_tokens": 2767458.0, |
| "step": 1270 |
| }, |
| { |
| "epoch": 3.432214765100671, |
| "grad_norm": 0.32619956135749817, |
| "learning_rate": 9.699851164916296e-06, |
| "loss": 0.1356, |
| "mean_token_accuracy": 0.9537905603647232, |
| "num_tokens": 2791419.0, |
| "step": 1280 |
| }, |
| { |
| "epoch": 3.459060402684564, |
| "grad_norm": 0.33809950947761536, |
| "learning_rate": 9.69388049352025e-06, |
| "loss": 0.1447, |
| "mean_token_accuracy": 0.9516083031892777, |
| "num_tokens": 2814299.0, |
| "step": 1290 |
| }, |
| { |
| "epoch": 3.4859060402684565, |
| "grad_norm": 0.34236571192741394, |
| "learning_rate": 9.687852894642932e-06, |
| "loss": 0.1411, |
| "mean_token_accuracy": 0.9545169979333877, |
| "num_tokens": 2836250.0, |
| "step": 1300 |
| }, |
| { |
| "epoch": 3.512751677852349, |
| "grad_norm": 0.35245418548583984, |
| "learning_rate": 9.681768441387195e-06, |
| "loss": 0.1905, |
| "mean_token_accuracy": 0.9411760956048966, |
| "num_tokens": 2856939.0, |
| "step": 1310 |
| }, |
| { |
| "epoch": 3.5395973154362417, |
| "grad_norm": 0.33470526337623596, |
| "learning_rate": 9.675627207545415e-06, |
| "loss": 0.1641, |
| "mean_token_accuracy": 0.9509621292352677, |
| "num_tokens": 2876789.0, |
| "step": 1320 |
| }, |
| { |
| "epoch": 3.5664429530201343, |
| "grad_norm": 0.29923608899116516, |
| "learning_rate": 9.669429267598603e-06, |
| "loss": 0.1301, |
| "mean_token_accuracy": 0.9571765452623368, |
| "num_tokens": 2900661.0, |
| "step": 1330 |
| }, |
| { |
| "epoch": 3.593288590604027, |
| "grad_norm": 0.30686208605766296, |
| "learning_rate": 9.663174696715502e-06, |
| "loss": 0.144, |
| "mean_token_accuracy": 0.9498992472887039, |
| "num_tokens": 2923496.0, |
| "step": 1340 |
| }, |
| { |
| "epoch": 3.6201342281879194, |
| "grad_norm": 0.34822651743888855, |
| "learning_rate": 9.656863570751687e-06, |
| "loss": 0.1491, |
| "mean_token_accuracy": 0.9523457109928131, |
| "num_tokens": 2945425.0, |
| "step": 1350 |
| }, |
| { |
| "epoch": 3.646979865771812, |
| "grad_norm": 0.411933034658432, |
| "learning_rate": 9.650495966248618e-06, |
| "loss": 0.2078, |
| "mean_token_accuracy": 0.9376524448394775, |
| "num_tokens": 2965952.0, |
| "step": 1360 |
| }, |
| { |
| "epoch": 3.6738255033557046, |
| "grad_norm": 0.33340126276016235, |
| "learning_rate": 9.644071960432741e-06, |
| "loss": 0.161, |
| "mean_token_accuracy": 0.9514278948307038, |
| "num_tokens": 2985755.0, |
| "step": 1370 |
| }, |
| { |
| "epoch": 3.7006711409395976, |
| "grad_norm": 0.3414279520511627, |
| "learning_rate": 9.637591631214535e-06, |
| "loss": 0.1323, |
| "mean_token_accuracy": 0.9548977851867676, |
| "num_tokens": 3009591.0, |
| "step": 1380 |
| }, |
| { |
| "epoch": 3.7275167785234897, |
| "grad_norm": 0.3117181658744812, |
| "learning_rate": 9.631055057187564e-06, |
| "loss": 0.1443, |
| "mean_token_accuracy": 0.951587375998497, |
| "num_tokens": 3032349.0, |
| "step": 1390 |
| }, |
| { |
| "epoch": 3.7543624161073827, |
| "grad_norm": 0.3485643267631531, |
| "learning_rate": 9.624462317627538e-06, |
| "loss": 0.1343, |
| "mean_token_accuracy": 0.9559377074241638, |
| "num_tokens": 3054236.0, |
| "step": 1400 |
| }, |
| { |
| "epoch": 3.7812080536912753, |
| "grad_norm": 0.40424180030822754, |
| "learning_rate": 9.61781349249134e-06, |
| "loss": 0.2071, |
| "mean_token_accuracy": 0.935996612906456, |
| "num_tokens": 3075030.0, |
| "step": 1410 |
| }, |
| { |
| "epoch": 3.808053691275168, |
| "grad_norm": 0.36545875668525696, |
| "learning_rate": 9.611108662416064e-06, |
| "loss": 0.1588, |
| "mean_token_accuracy": 0.9502278298139573, |
| "num_tokens": 3094888.0, |
| "step": 1420 |
| }, |
| { |
| "epoch": 3.8348993288590605, |
| "grad_norm": 0.3279842138290405, |
| "learning_rate": 9.604347908718026e-06, |
| "loss": 0.131, |
| "mean_token_accuracy": 0.9550918698310852, |
| "num_tokens": 3118692.0, |
| "step": 1430 |
| }, |
| { |
| "epoch": 3.861744966442953, |
| "grad_norm": 0.310715913772583, |
| "learning_rate": 9.59753131339179e-06, |
| "loss": 0.146, |
| "mean_token_accuracy": 0.9505369186401367, |
| "num_tokens": 3141385.0, |
| "step": 1440 |
| }, |
| { |
| "epoch": 3.8885906040268456, |
| "grad_norm": 0.3354150652885437, |
| "learning_rate": 9.590658959109168e-06, |
| "loss": 0.1311, |
| "mean_token_accuracy": 0.9580255270004272, |
| "num_tokens": 3163253.0, |
| "step": 1450 |
| }, |
| { |
| "epoch": 3.915436241610738, |
| "grad_norm": 0.3825208842754364, |
| "learning_rate": 9.583730929218218e-06, |
| "loss": 0.2052, |
| "mean_token_accuracy": 0.9373064041137695, |
| "num_tokens": 3183796.0, |
| "step": 1460 |
| }, |
| { |
| "epoch": 3.942281879194631, |
| "grad_norm": 0.36658021807670593, |
| "learning_rate": 9.576747307742231e-06, |
| "loss": 0.1639, |
| "mean_token_accuracy": 0.9496467560529709, |
| "num_tokens": 3203577.0, |
| "step": 1470 |
| }, |
| { |
| "epoch": 3.9691275167785234, |
| "grad_norm": 0.31084001064300537, |
| "learning_rate": 9.569708179378716e-06, |
| "loss": 0.1406, |
| "mean_token_accuracy": 0.9523321092128754, |
| "num_tokens": 3226669.0, |
| "step": 1480 |
| }, |
| { |
| "epoch": 3.995973154362416, |
| "grad_norm": 0.36023128032684326, |
| "learning_rate": 9.562613629498367e-06, |
| "loss": 0.1767, |
| "mean_token_accuracy": 0.9448588013648986, |
| "num_tokens": 3247335.0, |
| "step": 1490 |
| }, |
| { |
| "epoch": 4.021476510067114, |
| "grad_norm": 0.3163889944553375, |
| "learning_rate": 9.555463744144037e-06, |
| "loss": 0.1318, |
| "mean_token_accuracy": 0.9579417329085501, |
| "num_tokens": 3268914.0, |
| "step": 1500 |
| }, |
| { |
| "epoch": 4.048322147651007, |
| "grad_norm": 0.3244967758655548, |
| "learning_rate": 9.548258610029684e-06, |
| "loss": 0.1397, |
| "mean_token_accuracy": 0.9520616948604583, |
| "num_tokens": 3292003.0, |
| "step": 1510 |
| }, |
| { |
| "epoch": 4.0751677852349, |
| "grad_norm": 0.3769804835319519, |
| "learning_rate": 9.540998314539327e-06, |
| "loss": 0.1414, |
| "mean_token_accuracy": 0.9535090059041977, |
| "num_tokens": 3314200.0, |
| "step": 1520 |
| }, |
| { |
| "epoch": 4.102013422818792, |
| "grad_norm": 0.4516792595386505, |
| "learning_rate": 9.533682945725984e-06, |
| "loss": 0.1624, |
| "mean_token_accuracy": 0.9462276846170425, |
| "num_tokens": 3335382.0, |
| "step": 1530 |
| }, |
| { |
| "epoch": 4.128859060402685, |
| "grad_norm": 0.37825873494148254, |
| "learning_rate": 9.526312592310597e-06, |
| "loss": 0.1781, |
| "mean_token_accuracy": 0.9461793184280396, |
| "num_tokens": 3354942.0, |
| "step": 1540 |
| }, |
| { |
| "epoch": 4.155704697986577, |
| "grad_norm": 0.28725379705429077, |
| "learning_rate": 9.518887343680971e-06, |
| "loss": 0.1305, |
| "mean_token_accuracy": 0.9573172956705094, |
| "num_tokens": 3378126.0, |
| "step": 1550 |
| }, |
| { |
| "epoch": 4.18255033557047, |
| "grad_norm": 0.3456558585166931, |
| "learning_rate": 9.511407289890678e-06, |
| "loss": 0.1397, |
| "mean_token_accuracy": 0.9526260673999787, |
| "num_tokens": 3401323.0, |
| "step": 1560 |
| }, |
| { |
| "epoch": 4.209395973154362, |
| "grad_norm": 0.3543192148208618, |
| "learning_rate": 9.503872521657964e-06, |
| "loss": 0.1453, |
| "mean_token_accuracy": 0.9511166512966156, |
| "num_tokens": 3423565.0, |
| "step": 1570 |
| }, |
| { |
| "epoch": 4.236241610738255, |
| "grad_norm": 0.45514950156211853, |
| "learning_rate": 9.496283130364658e-06, |
| "loss": 0.17, |
| "mean_token_accuracy": 0.9460733950138092, |
| "num_tokens": 3444827.0, |
| "step": 1580 |
| }, |
| { |
| "epoch": 4.263087248322147, |
| "grad_norm": 0.3951168358325958, |
| "learning_rate": 9.488639208055059e-06, |
| "loss": 0.1844, |
| "mean_token_accuracy": 0.9429652452468872, |
| "num_tokens": 3464503.0, |
| "step": 1590 |
| }, |
| { |
| "epoch": 4.28993288590604, |
| "grad_norm": 0.31996405124664307, |
| "learning_rate": 9.480940847434814e-06, |
| "loss": 0.1239, |
| "mean_token_accuracy": 0.9602243095636368, |
| "num_tokens": 3487664.0, |
| "step": 1600 |
| }, |
| { |
| "epoch": 4.3167785234899325, |
| "grad_norm": 0.36431530117988586, |
| "learning_rate": 9.473188141869804e-06, |
| "loss": 0.1465, |
| "mean_token_accuracy": 0.949748307466507, |
| "num_tokens": 3510658.0, |
| "step": 1610 |
| }, |
| { |
| "epoch": 4.3436241610738255, |
| "grad_norm": 0.3072509765625, |
| "learning_rate": 9.465381185385008e-06, |
| "loss": 0.1297, |
| "mean_token_accuracy": 0.9579643219709396, |
| "num_tokens": 3532695.0, |
| "step": 1620 |
| }, |
| { |
| "epoch": 4.370469798657718, |
| "grad_norm": 0.46137064695358276, |
| "learning_rate": 9.457520072663353e-06, |
| "loss": 0.176, |
| "mean_token_accuracy": 0.9446452677249908, |
| "num_tokens": 3553814.0, |
| "step": 1630 |
| }, |
| { |
| "epoch": 4.397315436241611, |
| "grad_norm": 0.39676016569137573, |
| "learning_rate": 9.449604899044583e-06, |
| "loss": 0.1807, |
| "mean_token_accuracy": 0.9439645022153854, |
| "num_tokens": 3573399.0, |
| "step": 1640 |
| }, |
| { |
| "epoch": 4.424161073825504, |
| "grad_norm": 0.3147033751010895, |
| "learning_rate": 9.441635760524087e-06, |
| "loss": 0.13, |
| "mean_token_accuracy": 0.9564067393541336, |
| "num_tokens": 3596524.0, |
| "step": 1650 |
| }, |
| { |
| "epoch": 4.451006711409396, |
| "grad_norm": 0.32451656460762024, |
| "learning_rate": 9.433612753751748e-06, |
| "loss": 0.1347, |
| "mean_token_accuracy": 0.9525512009859085, |
| "num_tokens": 3619695.0, |
| "step": 1660 |
| }, |
| { |
| "epoch": 4.477852348993289, |
| "grad_norm": 0.34751221537590027, |
| "learning_rate": 9.425535976030758e-06, |
| "loss": 0.1346, |
| "mean_token_accuracy": 0.9561400681734085, |
| "num_tokens": 3641796.0, |
| "step": 1670 |
| }, |
| { |
| "epoch": 4.504697986577181, |
| "grad_norm": 0.476945161819458, |
| "learning_rate": 9.417405525316448e-06, |
| "loss": 0.1663, |
| "mean_token_accuracy": 0.9473035573959351, |
| "num_tokens": 3662816.0, |
| "step": 1680 |
| }, |
| { |
| "epoch": 4.531543624161074, |
| "grad_norm": 0.3843275308609009, |
| "learning_rate": 9.409221500215096e-06, |
| "loss": 0.1809, |
| "mean_token_accuracy": 0.9466674089431762, |
| "num_tokens": 3682421.0, |
| "step": 1690 |
| }, |
| { |
| "epoch": 4.558389261744966, |
| "grad_norm": 0.2982370853424072, |
| "learning_rate": 9.400983999982729e-06, |
| "loss": 0.1258, |
| "mean_token_accuracy": 0.9580628424882889, |
| "num_tokens": 3705605.0, |
| "step": 1700 |
| }, |
| { |
| "epoch": 4.585234899328859, |
| "grad_norm": 0.36844635009765625, |
| "learning_rate": 9.392693124523925e-06, |
| "loss": 0.1346, |
| "mean_token_accuracy": 0.9527536749839782, |
| "num_tokens": 3728772.0, |
| "step": 1710 |
| }, |
| { |
| "epoch": 4.612080536912751, |
| "grad_norm": 0.3254833519458771, |
| "learning_rate": 9.38434897439059e-06, |
| "loss": 0.1413, |
| "mean_token_accuracy": 0.9538274705410004, |
| "num_tokens": 3750862.0, |
| "step": 1720 |
| }, |
| { |
| "epoch": 4.638926174496644, |
| "grad_norm": 0.40839093923568726, |
| "learning_rate": 9.375951650780759e-06, |
| "loss": 0.1698, |
| "mean_token_accuracy": 0.946934774518013, |
| "num_tokens": 3772020.0, |
| "step": 1730 |
| }, |
| { |
| "epoch": 4.665771812080537, |
| "grad_norm": 0.4056667685508728, |
| "learning_rate": 9.367501255537347e-06, |
| "loss": 0.1711, |
| "mean_token_accuracy": 0.9489806234836579, |
| "num_tokens": 3791528.0, |
| "step": 1740 |
| }, |
| { |
| "epoch": 4.6926174496644295, |
| "grad_norm": 0.30787405371665955, |
| "learning_rate": 9.358997891146924e-06, |
| "loss": 0.1335, |
| "mean_token_accuracy": 0.9545555591583252, |
| "num_tokens": 3814594.0, |
| "step": 1750 |
| }, |
| { |
| "epoch": 4.7194630872483225, |
| "grad_norm": 0.33151739835739136, |
| "learning_rate": 9.350441660738472e-06, |
| "loss": 0.1354, |
| "mean_token_accuracy": 0.9516535818576812, |
| "num_tokens": 3837666.0, |
| "step": 1760 |
| }, |
| { |
| "epoch": 4.746308724832215, |
| "grad_norm": 0.35133394598960876, |
| "learning_rate": 9.341832668082136e-06, |
| "loss": 0.1361, |
| "mean_token_accuracy": 0.9538627177476883, |
| "num_tokens": 3859873.0, |
| "step": 1770 |
| }, |
| { |
| "epoch": 4.773154362416108, |
| "grad_norm": 0.4053596556186676, |
| "learning_rate": 9.333171017587956e-06, |
| "loss": 0.1711, |
| "mean_token_accuracy": 0.94505635201931, |
| "num_tokens": 3880988.0, |
| "step": 1780 |
| }, |
| { |
| "epoch": 4.8, |
| "grad_norm": 0.40388065576553345, |
| "learning_rate": 9.324456814304614e-06, |
| "loss": 0.187, |
| "mean_token_accuracy": 0.9447333663702011, |
| "num_tokens": 3900559.0, |
| "step": 1790 |
| }, |
| { |
| "epoch": 4.826845637583893, |
| "grad_norm": 0.3266175389289856, |
| "learning_rate": 9.315690163918147e-06, |
| "loss": 0.129, |
| "mean_token_accuracy": 0.9574593305587769, |
| "num_tokens": 3923718.0, |
| "step": 1800 |
| }, |
| { |
| "epoch": 4.853691275167785, |
| "grad_norm": 0.2933380603790283, |
| "learning_rate": 9.30687117275068e-06, |
| "loss": 0.1438, |
| "mean_token_accuracy": 0.9510160237550735, |
| "num_tokens": 3946770.0, |
| "step": 1810 |
| }, |
| { |
| "epoch": 4.880536912751678, |
| "grad_norm": 0.36102089285850525, |
| "learning_rate": 9.29799994775912e-06, |
| "loss": 0.1404, |
| "mean_token_accuracy": 0.9519519448280335, |
| "num_tokens": 3968801.0, |
| "step": 1820 |
| }, |
| { |
| "epoch": 4.90738255033557, |
| "grad_norm": 0.41366609930992126, |
| "learning_rate": 9.289076596533873e-06, |
| "loss": 0.1868, |
| "mean_token_accuracy": 0.9425925433635711, |
| "num_tokens": 3989599.0, |
| "step": 1830 |
| }, |
| { |
| "epoch": 4.934228187919463, |
| "grad_norm": 0.391011118888855, |
| "learning_rate": 9.280101227297526e-06, |
| "loss": 0.1801, |
| "mean_token_accuracy": 0.9462247163057327, |
| "num_tokens": 4009110.0, |
| "step": 1840 |
| }, |
| { |
| "epoch": 4.961073825503355, |
| "grad_norm": 0.3075045645236969, |
| "learning_rate": 9.271073948903548e-06, |
| "loss": 0.1345, |
| "mean_token_accuracy": 0.9573867440223693, |
| "num_tokens": 4031827.0, |
| "step": 1850 |
| }, |
| { |
| "epoch": 4.987919463087248, |
| "grad_norm": 0.3926655054092407, |
| "learning_rate": 9.26199487083496e-06, |
| "loss": 0.1438, |
| "mean_token_accuracy": 0.9512310355901719, |
| "num_tokens": 4053816.0, |
| "step": 1860 |
| }, |
| { |
| "epoch": 5.0134228187919465, |
| "grad_norm": 0.31910833716392517, |
| "learning_rate": 9.252864103203015e-06, |
| "loss": 0.1382, |
| "mean_token_accuracy": 0.9562591439799258, |
| "num_tokens": 4074082.0, |
| "step": 1870 |
| }, |
| { |
| "epoch": 5.040268456375839, |
| "grad_norm": 0.3303743898868561, |
| "learning_rate": 9.243681756745851e-06, |
| "loss": 0.129, |
| "mean_token_accuracy": 0.9561623185873032, |
| "num_tokens": 4097543.0, |
| "step": 1880 |
| }, |
| { |
| "epoch": 5.067114093959732, |
| "grad_norm": 0.3294975459575653, |
| "learning_rate": 9.23444794282716e-06, |
| "loss": 0.1429, |
| "mean_token_accuracy": 0.9520007222890854, |
| "num_tokens": 4119984.0, |
| "step": 1890 |
| }, |
| { |
| "epoch": 5.093959731543624, |
| "grad_norm": 0.43249550461769104, |
| "learning_rate": 9.225162773434831e-06, |
| "loss": 0.1441, |
| "mean_token_accuracy": 0.9515981763601303, |
| "num_tokens": 4141494.0, |
| "step": 1900 |
| }, |
| { |
| "epoch": 5.120805369127517, |
| "grad_norm": 0.3819236159324646, |
| "learning_rate": 9.215826361179596e-06, |
| "loss": 0.1759, |
| "mean_token_accuracy": 0.945612245798111, |
| "num_tokens": 4161327.0, |
| "step": 1910 |
| }, |
| { |
| "epoch": 5.14765100671141, |
| "grad_norm": 0.3246872127056122, |
| "learning_rate": 9.206438819293654e-06, |
| "loss": 0.1335, |
| "mean_token_accuracy": 0.9576434195041656, |
| "num_tokens": 4182978.0, |
| "step": 1920 |
| }, |
| { |
| "epoch": 5.174496644295302, |
| "grad_norm": 0.33821627497673035, |
| "learning_rate": 9.197000261629314e-06, |
| "loss": 0.1318, |
| "mean_token_accuracy": 0.9542952626943588, |
| "num_tokens": 4206394.0, |
| "step": 1930 |
| }, |
| { |
| "epoch": 5.201342281879195, |
| "grad_norm": 0.3124513328075409, |
| "learning_rate": 9.187510802657601e-06, |
| "loss": 0.1353, |
| "mean_token_accuracy": 0.9533496767282486, |
| "num_tokens": 4228836.0, |
| "step": 1940 |
| }, |
| { |
| "epoch": 5.228187919463087, |
| "grad_norm": 0.44123515486717224, |
| "learning_rate": 9.177970557466873e-06, |
| "loss": 0.1396, |
| "mean_token_accuracy": 0.9535221606492996, |
| "num_tokens": 4250312.0, |
| "step": 1950 |
| }, |
| { |
| "epoch": 5.25503355704698, |
| "grad_norm": 0.43715864419937134, |
| "learning_rate": 9.168379641761425e-06, |
| "loss": 0.1894, |
| "mean_token_accuracy": 0.9410462647676467, |
| "num_tokens": 4270261.0, |
| "step": 1960 |
| }, |
| { |
| "epoch": 5.281879194630872, |
| "grad_norm": 0.3679194748401642, |
| "learning_rate": 9.158738171860081e-06, |
| "loss": 0.1346, |
| "mean_token_accuracy": 0.9566879123449326, |
| "num_tokens": 4291980.0, |
| "step": 1970 |
| }, |
| { |
| "epoch": 5.308724832214765, |
| "grad_norm": 0.3097969889640808, |
| "learning_rate": 9.149046264694795e-06, |
| "loss": 0.1282, |
| "mean_token_accuracy": 0.9558116465806961, |
| "num_tokens": 4315357.0, |
| "step": 1980 |
| }, |
| { |
| "epoch": 5.3355704697986575, |
| "grad_norm": 0.3311108350753784, |
| "learning_rate": 9.139304037809216e-06, |
| "loss": 0.1361, |
| "mean_token_accuracy": 0.953630456328392, |
| "num_tokens": 4337801.0, |
| "step": 1990 |
| }, |
| { |
| "epoch": 5.3624161073825505, |
| "grad_norm": 0.3906519114971161, |
| "learning_rate": 9.12951160935728e-06, |
| "loss": 0.1506, |
| "mean_token_accuracy": 0.9501311779022217, |
| "num_tokens": 4359347.0, |
| "step": 2000 |
| }, |
| { |
| "epoch": 5.389261744966443, |
| "grad_norm": 0.44028791785240173, |
| "learning_rate": 9.119669098101764e-06, |
| "loss": 0.1911, |
| "mean_token_accuracy": 0.9415321052074432, |
| "num_tokens": 4379414.0, |
| "step": 2010 |
| }, |
| { |
| "epoch": 5.416107382550336, |
| "grad_norm": 0.3951742649078369, |
| "learning_rate": 9.10977662341285e-06, |
| "loss": 0.1363, |
| "mean_token_accuracy": 0.9573418200016022, |
| "num_tokens": 4401148.0, |
| "step": 2020 |
| }, |
| { |
| "epoch": 5.442953020134228, |
| "grad_norm": 0.3792262077331543, |
| "learning_rate": 9.099834305266681e-06, |
| "loss": 0.1286, |
| "mean_token_accuracy": 0.9557218492031098, |
| "num_tokens": 4424751.0, |
| "step": 2030 |
| }, |
| { |
| "epoch": 5.469798657718121, |
| "grad_norm": 0.39373981952667236, |
| "learning_rate": 9.0898422642439e-06, |
| "loss": 0.1432, |
| "mean_token_accuracy": 0.9514533162117005, |
| "num_tokens": 4447202.0, |
| "step": 2040 |
| }, |
| { |
| "epoch": 5.496644295302014, |
| "grad_norm": 0.38414108753204346, |
| "learning_rate": 9.07980062152819e-06, |
| "loss": 0.1523, |
| "mean_token_accuracy": 0.9517021149396896, |
| "num_tokens": 4468742.0, |
| "step": 2050 |
| }, |
| { |
| "epoch": 5.523489932885906, |
| "grad_norm": 0.42570510506629944, |
| "learning_rate": 9.069709498904803e-06, |
| "loss": 0.1928, |
| "mean_token_accuracy": 0.9422121673822403, |
| "num_tokens": 4488651.0, |
| "step": 2060 |
| }, |
| { |
| "epoch": 5.550335570469799, |
| "grad_norm": 0.3122999370098114, |
| "learning_rate": 9.059569018759092e-06, |
| "loss": 0.1347, |
| "mean_token_accuracy": 0.9576286196708679, |
| "num_tokens": 4510368.0, |
| "step": 2070 |
| }, |
| { |
| "epoch": 5.577181208053691, |
| "grad_norm": 0.342123419046402, |
| "learning_rate": 9.049379304075009e-06, |
| "loss": 0.1243, |
| "mean_token_accuracy": 0.9563952952623367, |
| "num_tokens": 4533853.0, |
| "step": 2080 |
| }, |
| { |
| "epoch": 5.604026845637584, |
| "grad_norm": 0.3129931390285492, |
| "learning_rate": 9.039140478433625e-06, |
| "loss": 0.1435, |
| "mean_token_accuracy": 0.9513378292322159, |
| "num_tokens": 4556327.0, |
| "step": 2090 |
| }, |
| { |
| "epoch": 5.630872483221476, |
| "grad_norm": 0.4187992215156555, |
| "learning_rate": 9.028852666011638e-06, |
| "loss": 0.145, |
| "mean_token_accuracy": 0.9517636507749557, |
| "num_tokens": 4577971.0, |
| "step": 2100 |
| }, |
| { |
| "epoch": 5.657718120805369, |
| "grad_norm": 0.422378808259964, |
| "learning_rate": 9.018515991579851e-06, |
| "loss": 0.1844, |
| "mean_token_accuracy": 0.9434764713048935, |
| "num_tokens": 4598122.0, |
| "step": 2110 |
| }, |
| { |
| "epoch": 5.684563758389261, |
| "grad_norm": 0.36106374859809875, |
| "learning_rate": 9.008130580501669e-06, |
| "loss": 0.143, |
| "mean_token_accuracy": 0.9556879609823227, |
| "num_tokens": 4619856.0, |
| "step": 2120 |
| }, |
| { |
| "epoch": 5.7114093959731544, |
| "grad_norm": 0.38176390528678894, |
| "learning_rate": 8.997696558731575e-06, |
| "loss": 0.1264, |
| "mean_token_accuracy": 0.9565722495317459, |
| "num_tokens": 4643297.0, |
| "step": 2130 |
| }, |
| { |
| "epoch": 5.7382550335570475, |
| "grad_norm": 0.41863518953323364, |
| "learning_rate": 8.987214052813605e-06, |
| "loss": 0.1361, |
| "mean_token_accuracy": 0.9535513520240784, |
| "num_tokens": 4665658.0, |
| "step": 2140 |
| }, |
| { |
| "epoch": 5.76510067114094, |
| "grad_norm": 0.4007248282432556, |
| "learning_rate": 8.976683189879811e-06, |
| "loss": 0.1335, |
| "mean_token_accuracy": 0.9553972989320755, |
| "num_tokens": 4687201.0, |
| "step": 2150 |
| }, |
| { |
| "epoch": 5.791946308724832, |
| "grad_norm": 0.4537723958492279, |
| "learning_rate": 8.966104097648721e-06, |
| "loss": 0.1943, |
| "mean_token_accuracy": 0.9408634662628174, |
| "num_tokens": 4707358.0, |
| "step": 2160 |
| }, |
| { |
| "epoch": 5.818791946308725, |
| "grad_norm": 0.3353871703147888, |
| "learning_rate": 8.955476904423785e-06, |
| "loss": 0.1388, |
| "mean_token_accuracy": 0.9545477360486985, |
| "num_tokens": 4729147.0, |
| "step": 2170 |
| }, |
| { |
| "epoch": 5.845637583892618, |
| "grad_norm": 0.42482447624206543, |
| "learning_rate": 8.944801739091831e-06, |
| "loss": 0.1331, |
| "mean_token_accuracy": 0.9544930905103683, |
| "num_tokens": 4752510.0, |
| "step": 2180 |
| }, |
| { |
| "epoch": 5.87248322147651, |
| "grad_norm": 0.3644810616970062, |
| "learning_rate": 8.934078731121482e-06, |
| "loss": 0.1315, |
| "mean_token_accuracy": 0.9561907082796097, |
| "num_tokens": 4774905.0, |
| "step": 2190 |
| }, |
| { |
| "epoch": 5.899328859060403, |
| "grad_norm": 0.48736846446990967, |
| "learning_rate": 8.923308010561608e-06, |
| "loss": 0.1427, |
| "mean_token_accuracy": 0.9533969014883041, |
| "num_tokens": 4796405.0, |
| "step": 2200 |
| }, |
| { |
| "epoch": 5.926174496644295, |
| "grad_norm": 0.429516464471817, |
| "learning_rate": 8.912489708039734e-06, |
| "loss": 0.1918, |
| "mean_token_accuracy": 0.9409243553876877, |
| "num_tokens": 4816438.0, |
| "step": 2210 |
| }, |
| { |
| "epoch": 5.953020134228188, |
| "grad_norm": 0.3750719130039215, |
| "learning_rate": 8.90162395476046e-06, |
| "loss": 0.1382, |
| "mean_token_accuracy": 0.9537917494773864, |
| "num_tokens": 4838019.0, |
| "step": 2220 |
| }, |
| { |
| "epoch": 5.97986577181208, |
| "grad_norm": 0.431819349527359, |
| "learning_rate": 8.89071088250387e-06, |
| "loss": 0.1419, |
| "mean_token_accuracy": 0.9530791282653809, |
| "num_tokens": 4860114.0, |
| "step": 2230 |
| }, |
| { |
| "epoch": 6.005369127516778, |
| "grad_norm": 0.35386669635772705, |
| "learning_rate": 8.879750623623932e-06, |
| "loss": 0.1549, |
| "mean_token_accuracy": 0.9517395119918021, |
| "num_tokens": 4879178.0, |
| "step": 2240 |
| }, |
| { |
| "epoch": 6.0322147651006714, |
| "grad_norm": 0.3473912179470062, |
| "learning_rate": 8.8687433110469e-06, |
| "loss": 0.1239, |
| "mean_token_accuracy": 0.9572397619485855, |
| "num_tokens": 4903000.0, |
| "step": 2250 |
| }, |
| { |
| "epoch": 6.059060402684564, |
| "grad_norm": 0.3461743891239166, |
| "learning_rate": 8.857689078269688e-06, |
| "loss": 0.1367, |
| "mean_token_accuracy": 0.9519617527723312, |
| "num_tokens": 4925763.0, |
| "step": 2260 |
| }, |
| { |
| "epoch": 6.085906040268457, |
| "grad_norm": 0.38205376267433167, |
| "learning_rate": 8.846588059358265e-06, |
| "loss": 0.1291, |
| "mean_token_accuracy": 0.9563361287117005, |
| "num_tokens": 4947525.0, |
| "step": 2270 |
| }, |
| { |
| "epoch": 6.112751677852349, |
| "grad_norm": 0.410349041223526, |
| "learning_rate": 8.835440388946025e-06, |
| "loss": 0.183, |
| "mean_token_accuracy": 0.9442010521888733, |
| "num_tokens": 4967890.0, |
| "step": 2280 |
| }, |
| { |
| "epoch": 6.139597315436242, |
| "grad_norm": 0.4093804657459259, |
| "learning_rate": 8.824246202232142e-06, |
| "loss": 0.1428, |
| "mean_token_accuracy": 0.9542681604623795, |
| "num_tokens": 4988239.0, |
| "step": 2290 |
| }, |
| { |
| "epoch": 6.166442953020134, |
| "grad_norm": 0.39240196347236633, |
| "learning_rate": 8.813005634979954e-06, |
| "loss": 0.1258, |
| "mean_token_accuracy": 0.9575669139623642, |
| "num_tokens": 5012161.0, |
| "step": 2300 |
| }, |
| { |
| "epoch": 6.193288590604027, |
| "grad_norm": 0.3647288680076599, |
| "learning_rate": 8.801718823515293e-06, |
| "loss": 0.1303, |
| "mean_token_accuracy": 0.955005195736885, |
| "num_tokens": 5034857.0, |
| "step": 2310 |
| }, |
| { |
| "epoch": 6.220134228187919, |
| "grad_norm": 0.41007348895072937, |
| "learning_rate": 8.790385904724848e-06, |
| "loss": 0.1275, |
| "mean_token_accuracy": 0.9576222687959671, |
| "num_tokens": 5056763.0, |
| "step": 2320 |
| }, |
| { |
| "epoch": 6.246979865771812, |
| "grad_norm": 0.45653489232063293, |
| "learning_rate": 8.779007016054496e-06, |
| "loss": 0.1887, |
| "mean_token_accuracy": 0.9412017434835434, |
| "num_tokens": 5077221.0, |
| "step": 2330 |
| }, |
| { |
| "epoch": 6.273825503355705, |
| "grad_norm": 0.39891692996025085, |
| "learning_rate": 8.767582295507637e-06, |
| "loss": 0.1432, |
| "mean_token_accuracy": 0.9543320029973984, |
| "num_tokens": 5097569.0, |
| "step": 2340 |
| }, |
| { |
| "epoch": 6.300671140939597, |
| "grad_norm": 0.3695172071456909, |
| "learning_rate": 8.75611188164352e-06, |
| "loss": 0.1224, |
| "mean_token_accuracy": 0.9566197484731674, |
| "num_tokens": 5121371.0, |
| "step": 2350 |
| }, |
| { |
| "epoch": 6.32751677852349, |
| "grad_norm": 0.39350956678390503, |
| "learning_rate": 8.744595913575572e-06, |
| "loss": 0.1374, |
| "mean_token_accuracy": 0.9534388244152069, |
| "num_tokens": 5144160.0, |
| "step": 2360 |
| }, |
| { |
| "epoch": 6.354362416107382, |
| "grad_norm": 0.42946743965148926, |
| "learning_rate": 8.733034530969688e-06, |
| "loss": 0.1352, |
| "mean_token_accuracy": 0.9549015939235688, |
| "num_tokens": 5166042.0, |
| "step": 2370 |
| }, |
| { |
| "epoch": 6.381208053691275, |
| "grad_norm": 0.5302273035049438, |
| "learning_rate": 8.721427874042563e-06, |
| "loss": 0.1971, |
| "mean_token_accuracy": 0.9400767356157302, |
| "num_tokens": 5186480.0, |
| "step": 2380 |
| }, |
| { |
| "epoch": 6.4080536912751676, |
| "grad_norm": 0.4178345799446106, |
| "learning_rate": 8.709776083559978e-06, |
| "loss": 0.1486, |
| "mean_token_accuracy": 0.9536987513303756, |
| "num_tokens": 5206790.0, |
| "step": 2390 |
| }, |
| { |
| "epoch": 6.434899328859061, |
| "grad_norm": 0.408120334148407, |
| "learning_rate": 8.698079300835088e-06, |
| "loss": 0.1213, |
| "mean_token_accuracy": 0.9586103349924088, |
| "num_tokens": 5230497.0, |
| "step": 2400 |
| }, |
| { |
| "epoch": 6.461744966442953, |
| "grad_norm": 0.34466466307640076, |
| "learning_rate": 8.686337667726723e-06, |
| "loss": 0.1321, |
| "mean_token_accuracy": 0.953861802816391, |
| "num_tokens": 5253213.0, |
| "step": 2410 |
| }, |
| { |
| "epoch": 6.488590604026846, |
| "grad_norm": 0.43265581130981445, |
| "learning_rate": 8.674551326637655e-06, |
| "loss": 0.125, |
| "mean_token_accuracy": 0.958708542585373, |
| "num_tokens": 5274989.0, |
| "step": 2420 |
| }, |
| { |
| "epoch": 6.515436241610738, |
| "grad_norm": 0.5245379209518433, |
| "learning_rate": 8.662720420512877e-06, |
| "loss": 0.1889, |
| "mean_token_accuracy": 0.9418635576963424, |
| "num_tokens": 5295319.0, |
| "step": 2430 |
| }, |
| { |
| "epoch": 6.542281879194631, |
| "grad_norm": 0.3666956126689911, |
| "learning_rate": 8.650845092837867e-06, |
| "loss": 0.1407, |
| "mean_token_accuracy": 0.9555400878190994, |
| "num_tokens": 5315538.0, |
| "step": 2440 |
| }, |
| { |
| "epoch": 6.569127516778524, |
| "grad_norm": 0.4082365930080414, |
| "learning_rate": 8.638925487636847e-06, |
| "loss": 0.1319, |
| "mean_token_accuracy": 0.9544797509908676, |
| "num_tokens": 5339222.0, |
| "step": 2450 |
| }, |
| { |
| "epoch": 6.595973154362416, |
| "grad_norm": 0.3805497884750366, |
| "learning_rate": 8.626961749471044e-06, |
| "loss": 0.1421, |
| "mean_token_accuracy": 0.9522867351770401, |
| "num_tokens": 5361763.0, |
| "step": 2460 |
| }, |
| { |
| "epoch": 6.622818791946309, |
| "grad_norm": 0.3796834945678711, |
| "learning_rate": 8.61495402343692e-06, |
| "loss": 0.1271, |
| "mean_token_accuracy": 0.9572306245565414, |
| "num_tokens": 5383570.0, |
| "step": 2470 |
| }, |
| { |
| "epoch": 6.649664429530201, |
| "grad_norm": 0.5255109667778015, |
| "learning_rate": 8.602902455164432e-06, |
| "loss": 0.1863, |
| "mean_token_accuracy": 0.9410823851823806, |
| "num_tokens": 5404105.0, |
| "step": 2480 |
| }, |
| { |
| "epoch": 6.676510067114094, |
| "grad_norm": 0.39988207817077637, |
| "learning_rate": 8.590807190815254e-06, |
| "loss": 0.1472, |
| "mean_token_accuracy": 0.9552285671234131, |
| "num_tokens": 5424459.0, |
| "step": 2490 |
| }, |
| { |
| "epoch": 6.703355704697986, |
| "grad_norm": 0.37969970703125, |
| "learning_rate": 8.578668377081001e-06, |
| "loss": 0.1231, |
| "mean_token_accuracy": 0.9572932302951813, |
| "num_tokens": 5448334.0, |
| "step": 2500 |
| }, |
| { |
| "epoch": 6.730201342281879, |
| "grad_norm": 0.36511608958244324, |
| "learning_rate": 8.56648616118147e-06, |
| "loss": 0.1309, |
| "mean_token_accuracy": 0.9542735308408737, |
| "num_tokens": 5471058.0, |
| "step": 2510 |
| }, |
| { |
| "epoch": 6.7570469798657715, |
| "grad_norm": 0.48128047585487366, |
| "learning_rate": 8.554260690862824e-06, |
| "loss": 0.134, |
| "mean_token_accuracy": 0.9557673066854477, |
| "num_tokens": 5492789.0, |
| "step": 2520 |
| }, |
| { |
| "epoch": 6.7838926174496645, |
| "grad_norm": 0.46509677171707153, |
| "learning_rate": 8.541992114395825e-06, |
| "loss": 0.1749, |
| "mean_token_accuracy": 0.9453763455152512, |
| "num_tokens": 5513232.0, |
| "step": 2530 |
| }, |
| { |
| "epoch": 6.810738255033557, |
| "grad_norm": 0.3718424439430237, |
| "learning_rate": 8.529680580574028e-06, |
| "loss": 0.1456, |
| "mean_token_accuracy": 0.9548163831233978, |
| "num_tokens": 5533489.0, |
| "step": 2540 |
| }, |
| { |
| "epoch": 6.83758389261745, |
| "grad_norm": 0.40027740597724915, |
| "learning_rate": 8.517326238711976e-06, |
| "loss": 0.1222, |
| "mean_token_accuracy": 0.9584755569696426, |
| "num_tokens": 5557178.0, |
| "step": 2550 |
| }, |
| { |
| "epoch": 6.864429530201342, |
| "grad_norm": 0.4006039798259735, |
| "learning_rate": 8.504929238643381e-06, |
| "loss": 0.1362, |
| "mean_token_accuracy": 0.9527265220880509, |
| "num_tokens": 5579955.0, |
| "step": 2560 |
| }, |
| { |
| "epoch": 6.891275167785235, |
| "grad_norm": 0.371866375207901, |
| "learning_rate": 8.492489730719325e-06, |
| "loss": 0.1299, |
| "mean_token_accuracy": 0.9573649376630783, |
| "num_tokens": 5601887.0, |
| "step": 2570 |
| }, |
| { |
| "epoch": 6.918120805369128, |
| "grad_norm": 0.5413601398468018, |
| "learning_rate": 8.48000786580642e-06, |
| "loss": 0.1712, |
| "mean_token_accuracy": 0.944629642367363, |
| "num_tokens": 5622673.0, |
| "step": 2580 |
| }, |
| { |
| "epoch": 6.94496644295302, |
| "grad_norm": 0.3805997669696808, |
| "learning_rate": 8.467483795284987e-06, |
| "loss": 0.1388, |
| "mean_token_accuracy": 0.9575351625680923, |
| "num_tokens": 5643003.0, |
| "step": 2590 |
| }, |
| { |
| "epoch": 6.971812080536913, |
| "grad_norm": 0.3734448552131653, |
| "learning_rate": 8.454917671047213e-06, |
| "loss": 0.1271, |
| "mean_token_accuracy": 0.9564762502908707, |
| "num_tokens": 5665927.0, |
| "step": 2600 |
| }, |
| { |
| "epoch": 6.998657718120805, |
| "grad_norm": 0.5456552505493164, |
| "learning_rate": 8.442309645495322e-06, |
| "loss": 0.1593, |
| "mean_token_accuracy": 0.9499319314956665, |
| "num_tokens": 5686349.0, |
| "step": 2610 |
| }, |
| { |
| "epoch": 7.024161073825503, |
| "grad_norm": 0.40639278292655945, |
| "learning_rate": 8.429659871539709e-06, |
| "loss": 0.1108, |
| "mean_token_accuracy": 0.9616394952723855, |
| "num_tokens": 5708418.0, |
| "step": 2620 |
| }, |
| { |
| "epoch": 7.051006711409396, |
| "grad_norm": 0.44582584500312805, |
| "learning_rate": 8.416968502597101e-06, |
| "loss": 0.1311, |
| "mean_token_accuracy": 0.9550166130065918, |
| "num_tokens": 5731562.0, |
| "step": 2630 |
| }, |
| { |
| "epoch": 7.0778523489932885, |
| "grad_norm": 0.42072245478630066, |
| "learning_rate": 8.404235692588682e-06, |
| "loss": 0.1242, |
| "mean_token_accuracy": 0.9576824128627777, |
| "num_tokens": 5753681.0, |
| "step": 2640 |
| }, |
| { |
| "epoch": 7.1046979865771815, |
| "grad_norm": 0.5467314720153809, |
| "learning_rate": 8.391461595938245e-06, |
| "loss": 0.1659, |
| "mean_token_accuracy": 0.9464216500520706, |
| "num_tokens": 5774718.0, |
| "step": 2650 |
| }, |
| { |
| "epoch": 7.131543624161074, |
| "grad_norm": 0.45191726088523865, |
| "learning_rate": 8.378646367570302e-06, |
| "loss": 0.1661, |
| "mean_token_accuracy": 0.9496298342943191, |
| "num_tokens": 5794201.0, |
| "step": 2660 |
| }, |
| { |
| "epoch": 7.158389261744967, |
| "grad_norm": 0.4229516088962555, |
| "learning_rate": 8.36579016290821e-06, |
| "loss": 0.1168, |
| "mean_token_accuracy": 0.9608386904001236, |
| "num_tokens": 5817768.0, |
| "step": 2670 |
| }, |
| { |
| "epoch": 7.185234899328859, |
| "grad_norm": 0.42818018794059753, |
| "learning_rate": 8.352893137872292e-06, |
| "loss": 0.1219, |
| "mean_token_accuracy": 0.9569632887840271, |
| "num_tokens": 5840800.0, |
| "step": 2680 |
| }, |
| { |
| "epoch": 7.212080536912752, |
| "grad_norm": 0.3931344151496887, |
| "learning_rate": 8.339955448877934e-06, |
| "loss": 0.1284, |
| "mean_token_accuracy": 0.9573963195085525, |
| "num_tokens": 5862870.0, |
| "step": 2690 |
| }, |
| { |
| "epoch": 7.238926174496644, |
| "grad_norm": 0.5881303548812866, |
| "learning_rate": 8.326977252833704e-06, |
| "loss": 0.1682, |
| "mean_token_accuracy": 0.9461307436227798, |
| "num_tokens": 5883809.0, |
| "step": 2700 |
| }, |
| { |
| "epoch": 7.265771812080537, |
| "grad_norm": 0.44616442918777466, |
| "learning_rate": 8.313958707139434e-06, |
| "loss": 0.1528, |
| "mean_token_accuracy": 0.9527498662471772, |
| "num_tokens": 5903277.0, |
| "step": 2710 |
| }, |
| { |
| "epoch": 7.292617449664429, |
| "grad_norm": 0.39954400062561035, |
| "learning_rate": 8.300899969684322e-06, |
| "loss": 0.1201, |
| "mean_token_accuracy": 0.9591354012489319, |
| "num_tokens": 5926773.0, |
| "step": 2720 |
| }, |
| { |
| "epoch": 7.319463087248322, |
| "grad_norm": 0.4336886405944824, |
| "learning_rate": 8.28780119884501e-06, |
| "loss": 0.1326, |
| "mean_token_accuracy": 0.9533321857452393, |
| "num_tokens": 5949796.0, |
| "step": 2730 |
| }, |
| { |
| "epoch": 7.346308724832214, |
| "grad_norm": 0.45649003982543945, |
| "learning_rate": 8.274662553483662e-06, |
| "loss": 0.12, |
| "mean_token_accuracy": 0.9590807974338531, |
| "num_tokens": 5971946.0, |
| "step": 2740 |
| }, |
| { |
| "epoch": 7.373154362416107, |
| "grad_norm": 0.5981083512306213, |
| "learning_rate": 8.26148419294605e-06, |
| "loss": 0.1539, |
| "mean_token_accuracy": 0.9494126617908478, |
| "num_tokens": 5993002.0, |
| "step": 2750 |
| }, |
| { |
| "epoch": 7.4, |
| "grad_norm": 0.5191400051116943, |
| "learning_rate": 8.248266277059607e-06, |
| "loss": 0.1636, |
| "mean_token_accuracy": 0.9508023709058762, |
| "num_tokens": 6012457.0, |
| "step": 2760 |
| }, |
| { |
| "epoch": 7.4268456375838925, |
| "grad_norm": 0.45730146765708923, |
| "learning_rate": 8.235008966131492e-06, |
| "loss": 0.1257, |
| "mean_token_accuracy": 0.9582333266735077, |
| "num_tokens": 6035995.0, |
| "step": 2770 |
| }, |
| { |
| "epoch": 7.4536912751677855, |
| "grad_norm": 0.4261474907398224, |
| "learning_rate": 8.221712420946651e-06, |
| "loss": 0.1313, |
| "mean_token_accuracy": 0.9531764149665832, |
| "num_tokens": 6058903.0, |
| "step": 2780 |
| }, |
| { |
| "epoch": 7.480536912751678, |
| "grad_norm": 0.458927720785141, |
| "learning_rate": 8.208376802765866e-06, |
| "loss": 0.1296, |
| "mean_token_accuracy": 0.9571897268295289, |
| "num_tokens": 6080945.0, |
| "step": 2790 |
| }, |
| { |
| "epoch": 7.507382550335571, |
| "grad_norm": 0.6553865671157837, |
| "learning_rate": 8.195002273323792e-06, |
| "loss": 0.1593, |
| "mean_token_accuracy": 0.9487423598766327, |
| "num_tokens": 6101890.0, |
| "step": 2800 |
| }, |
| { |
| "epoch": 7.534228187919463, |
| "grad_norm": 0.4802383780479431, |
| "learning_rate": 8.181588994827005e-06, |
| "loss": 0.1583, |
| "mean_token_accuracy": 0.9515509098768234, |
| "num_tokens": 6121346.0, |
| "step": 2810 |
| }, |
| { |
| "epoch": 7.561073825503356, |
| "grad_norm": 0.4876711666584015, |
| "learning_rate": 8.168137129952027e-06, |
| "loss": 0.1225, |
| "mean_token_accuracy": 0.9582916587591171, |
| "num_tokens": 6144990.0, |
| "step": 2820 |
| }, |
| { |
| "epoch": 7.587919463087248, |
| "grad_norm": 0.45218655467033386, |
| "learning_rate": 8.154646841843358e-06, |
| "loss": 0.1328, |
| "mean_token_accuracy": 0.9530595809221267, |
| "num_tokens": 6168024.0, |
| "step": 2830 |
| }, |
| { |
| "epoch": 7.614765100671141, |
| "grad_norm": 0.463436484336853, |
| "learning_rate": 8.141118294111496e-06, |
| "loss": 0.1327, |
| "mean_token_accuracy": 0.9549932539463043, |
| "num_tokens": 6190149.0, |
| "step": 2840 |
| }, |
| { |
| "epoch": 7.641610738255034, |
| "grad_norm": 0.6307859420776367, |
| "learning_rate": 8.127551650830954e-06, |
| "loss": 0.1635, |
| "mean_token_accuracy": 0.9493362814188003, |
| "num_tokens": 6211195.0, |
| "step": 2850 |
| }, |
| { |
| "epoch": 7.668456375838926, |
| "grad_norm": 0.4908117949962616, |
| "learning_rate": 8.113947076538264e-06, |
| "loss": 0.1687, |
| "mean_token_accuracy": 0.9480661511421203, |
| "num_tokens": 6230721.0, |
| "step": 2860 |
| }, |
| { |
| "epoch": 7.695302013422819, |
| "grad_norm": 0.3950389623641968, |
| "learning_rate": 8.100304736229991e-06, |
| "loss": 0.1197, |
| "mean_token_accuracy": 0.9598756283521652, |
| "num_tokens": 6254447.0, |
| "step": 2870 |
| }, |
| { |
| "epoch": 7.722147651006711, |
| "grad_norm": 0.4120844006538391, |
| "learning_rate": 8.086624795360723e-06, |
| "loss": 0.1295, |
| "mean_token_accuracy": 0.9543819516897202, |
| "num_tokens": 6277445.0, |
| "step": 2880 |
| }, |
| { |
| "epoch": 7.748993288590604, |
| "grad_norm": 0.3652952313423157, |
| "learning_rate": 8.07290741984107e-06, |
| "loss": 0.1231, |
| "mean_token_accuracy": 0.9587823182344437, |
| "num_tokens": 6299402.0, |
| "step": 2890 |
| }, |
| { |
| "epoch": 7.7758389261744965, |
| "grad_norm": 0.5701743960380554, |
| "learning_rate": 8.059152776035653e-06, |
| "loss": 0.1654, |
| "mean_token_accuracy": 0.9458971083164215, |
| "num_tokens": 6320105.0, |
| "step": 2900 |
| }, |
| { |
| "epoch": 7.8026845637583895, |
| "grad_norm": 0.4986805319786072, |
| "learning_rate": 8.045361030761082e-06, |
| "loss": 0.1462, |
| "mean_token_accuracy": 0.9533874779939652, |
| "num_tokens": 6339460.0, |
| "step": 2910 |
| }, |
| { |
| "epoch": 7.829530201342282, |
| "grad_norm": 0.45700371265411377, |
| "learning_rate": 8.03153235128393e-06, |
| "loss": 0.1116, |
| "mean_token_accuracy": 0.9626628488302231, |
| "num_tokens": 6363130.0, |
| "step": 2920 |
| }, |
| { |
| "epoch": 7.856375838926175, |
| "grad_norm": 0.4490987956523895, |
| "learning_rate": 8.017666905318712e-06, |
| "loss": 0.1313, |
| "mean_token_accuracy": 0.9540461421012878, |
| "num_tokens": 6386118.0, |
| "step": 2930 |
| }, |
| { |
| "epoch": 7.883221476510067, |
| "grad_norm": 0.3797796666622162, |
| "learning_rate": 8.003764861025853e-06, |
| "loss": 0.1231, |
| "mean_token_accuracy": 0.9578628242015839, |
| "num_tokens": 6408209.0, |
| "step": 2940 |
| }, |
| { |
| "epoch": 7.91006711409396, |
| "grad_norm": 0.5887638330459595, |
| "learning_rate": 7.989826387009634e-06, |
| "loss": 0.1537, |
| "mean_token_accuracy": 0.9492853492498398, |
| "num_tokens": 6429237.0, |
| "step": 2950 |
| }, |
| { |
| "epoch": 7.936912751677852, |
| "grad_norm": 0.5216886401176453, |
| "learning_rate": 7.975851652316162e-06, |
| "loss": 0.1568, |
| "mean_token_accuracy": 0.9521301418542862, |
| "num_tokens": 6448673.0, |
| "step": 2960 |
| }, |
| { |
| "epoch": 7.963758389261745, |
| "grad_norm": 0.4166746735572815, |
| "learning_rate": 7.961840826431314e-06, |
| "loss": 0.1264, |
| "mean_token_accuracy": 0.957261809706688, |
| "num_tokens": 6471731.0, |
| "step": 2970 |
| }, |
| { |
| "epoch": 7.990604026845638, |
| "grad_norm": 0.5581911206245422, |
| "learning_rate": 7.947794079278678e-06, |
| "loss": 0.1457, |
| "mean_token_accuracy": 0.9534464627504349, |
| "num_tokens": 6492971.0, |
| "step": 2980 |
| }, |
| { |
| "epoch": 8.016107382550336, |
| "grad_norm": 0.4519020915031433, |
| "learning_rate": 7.933711581217501e-06, |
| "loss": 0.119, |
| "mean_token_accuracy": 0.9589253067970276, |
| "num_tokens": 6513617.0, |
| "step": 2990 |
| }, |
| { |
| "epoch": 8.042953020134227, |
| "grad_norm": 0.44895055890083313, |
| "learning_rate": 7.919593503040616e-06, |
| "loss": 0.1246, |
| "mean_token_accuracy": 0.9571912050247192, |
| "num_tokens": 6536816.0, |
| "step": 3000 |
| }, |
| { |
| "epoch": 8.06979865771812, |
| "grad_norm": 0.4848621189594269, |
| "learning_rate": 7.905440015972372e-06, |
| "loss": 0.1255, |
| "mean_token_accuracy": 0.9568954467773437, |
| "num_tokens": 6559060.0, |
| "step": 3010 |
| }, |
| { |
| "epoch": 8.096644295302013, |
| "grad_norm": 0.7106390595436096, |
| "learning_rate": 7.891251291666554e-06, |
| "loss": 0.1547, |
| "mean_token_accuracy": 0.9491069823503494, |
| "num_tokens": 6580168.0, |
| "step": 3020 |
| }, |
| { |
| "epoch": 8.123489932885906, |
| "grad_norm": 0.5733054876327515, |
| "learning_rate": 7.877027502204311e-06, |
| "loss": 0.1588, |
| "mean_token_accuracy": 0.9492790251970291, |
| "num_tokens": 6599902.0, |
| "step": 3030 |
| }, |
| { |
| "epoch": 8.1503355704698, |
| "grad_norm": 0.5422059297561646, |
| "learning_rate": 7.862768820092061e-06, |
| "loss": 0.1234, |
| "mean_token_accuracy": 0.9600847691297532, |
| "num_tokens": 6622088.0, |
| "step": 3040 |
| }, |
| { |
| "epoch": 8.17718120805369, |
| "grad_norm": 0.4836737811565399, |
| "learning_rate": 7.848475418259399e-06, |
| "loss": 0.121, |
| "mean_token_accuracy": 0.9583497941493988, |
| "num_tokens": 6645425.0, |
| "step": 3050 |
| }, |
| { |
| "epoch": 8.204026845637584, |
| "grad_norm": 0.4681214988231659, |
| "learning_rate": 7.834147470057006e-06, |
| "loss": 0.1252, |
| "mean_token_accuracy": 0.957104617357254, |
| "num_tokens": 6667702.0, |
| "step": 3060 |
| }, |
| { |
| "epoch": 8.230872483221477, |
| "grad_norm": 0.6612587571144104, |
| "learning_rate": 7.819785149254534e-06, |
| "loss": 0.135, |
| "mean_token_accuracy": 0.9549255698919297, |
| "num_tokens": 6689095.0, |
| "step": 3070 |
| }, |
| { |
| "epoch": 8.25771812080537, |
| "grad_norm": 0.5453577637672424, |
| "learning_rate": 7.805388630038512e-06, |
| "loss": 0.1644, |
| "mean_token_accuracy": 0.9499281167984008, |
| "num_tokens": 6709062.0, |
| "step": 3080 |
| }, |
| { |
| "epoch": 8.284563758389261, |
| "grad_norm": 0.5287258625030518, |
| "learning_rate": 7.790958087010234e-06, |
| "loss": 0.1275, |
| "mean_token_accuracy": 0.9588546067476272, |
| "num_tokens": 6731280.0, |
| "step": 3090 |
| }, |
| { |
| "epoch": 8.311409395973154, |
| "grad_norm": 0.5968199372291565, |
| "learning_rate": 7.776493695183623e-06, |
| "loss": 0.1239, |
| "mean_token_accuracy": 0.9554890125989914, |
| "num_tokens": 6754515.0, |
| "step": 3100 |
| }, |
| { |
| "epoch": 8.338255033557047, |
| "grad_norm": 0.4913158118724823, |
| "learning_rate": 7.761995629983129e-06, |
| "loss": 0.1253, |
| "mean_token_accuracy": 0.9566126644611359, |
| "num_tokens": 6776753.0, |
| "step": 3110 |
| }, |
| { |
| "epoch": 8.36510067114094, |
| "grad_norm": 0.853453516960144, |
| "learning_rate": 7.74746406724159e-06, |
| "loss": 0.1529, |
| "mean_token_accuracy": 0.9489505797624588, |
| "num_tokens": 6797907.0, |
| "step": 3120 |
| }, |
| { |
| "epoch": 8.391946308724831, |
| "grad_norm": 0.5716426968574524, |
| "learning_rate": 7.732899183198108e-06, |
| "loss": 0.1569, |
| "mean_token_accuracy": 0.9509849786758423, |
| "num_tokens": 6817642.0, |
| "step": 3130 |
| }, |
| { |
| "epoch": 8.418791946308724, |
| "grad_norm": 0.4912736117839813, |
| "learning_rate": 7.718301154495897e-06, |
| "loss": 0.1211, |
| "mean_token_accuracy": 0.9595335066318512, |
| "num_tokens": 6839853.0, |
| "step": 3140 |
| }, |
| { |
| "epoch": 8.445637583892617, |
| "grad_norm": 0.5178288817405701, |
| "learning_rate": 7.70367015818016e-06, |
| "loss": 0.1162, |
| "mean_token_accuracy": 0.9598266303539276, |
| "num_tokens": 6863268.0, |
| "step": 3150 |
| }, |
| { |
| "epoch": 8.47248322147651, |
| "grad_norm": 0.4956505000591278, |
| "learning_rate": 7.689006371695928e-06, |
| "loss": 0.1278, |
| "mean_token_accuracy": 0.957508260011673, |
| "num_tokens": 6885638.0, |
| "step": 3160 |
| }, |
| { |
| "epoch": 8.499328859060403, |
| "grad_norm": 0.7209757566452026, |
| "learning_rate": 7.674309972885909e-06, |
| "loss": 0.1411, |
| "mean_token_accuracy": 0.9534810066223145, |
| "num_tokens": 6907038.0, |
| "step": 3170 |
| }, |
| { |
| "epoch": 8.526174496644295, |
| "grad_norm": 0.6008272767066956, |
| "learning_rate": 7.659581139988339e-06, |
| "loss": 0.1622, |
| "mean_token_accuracy": 0.9486708849668503, |
| "num_tokens": 6926858.0, |
| "step": 3180 |
| }, |
| { |
| "epoch": 8.553020134228188, |
| "grad_norm": 0.5501519441604614, |
| "learning_rate": 7.644820051634813e-06, |
| "loss": 0.1162, |
| "mean_token_accuracy": 0.9615104466676712, |
| "num_tokens": 6949093.0, |
| "step": 3190 |
| }, |
| { |
| "epoch": 8.57986577181208, |
| "grad_norm": 0.6659561991691589, |
| "learning_rate": 7.630026886848118e-06, |
| "loss": 0.1237, |
| "mean_token_accuracy": 0.9558080345392227, |
| "num_tokens": 6972467.0, |
| "step": 3200 |
| }, |
| { |
| "epoch": 8.606711409395974, |
| "grad_norm": 0.5133163332939148, |
| "learning_rate": 7.61520182504007e-06, |
| "loss": 0.1239, |
| "mean_token_accuracy": 0.957629781961441, |
| "num_tokens": 6994729.0, |
| "step": 3210 |
| }, |
| { |
| "epoch": 8.633557046979865, |
| "grad_norm": 0.7774618864059448, |
| "learning_rate": 7.60034504600933e-06, |
| "loss": 0.1402, |
| "mean_token_accuracy": 0.9532318562269211, |
| "num_tokens": 7015998.0, |
| "step": 3220 |
| }, |
| { |
| "epoch": 8.660402684563758, |
| "grad_norm": 0.604790985584259, |
| "learning_rate": 7.585456729939225e-06, |
| "loss": 0.1663, |
| "mean_token_accuracy": 0.9489589869976044, |
| "num_tokens": 7035712.0, |
| "step": 3230 |
| }, |
| { |
| "epoch": 8.687248322147651, |
| "grad_norm": 0.5196418166160583, |
| "learning_rate": 7.570537057395566e-06, |
| "loss": 0.1141, |
| "mean_token_accuracy": 0.9628522455692291, |
| "num_tokens": 7057968.0, |
| "step": 3240 |
| }, |
| { |
| "epoch": 8.714093959731544, |
| "grad_norm": 0.5068167448043823, |
| "learning_rate": 7.555586209324455e-06, |
| "loss": 0.1159, |
| "mean_token_accuracy": 0.9594800651073456, |
| "num_tokens": 7081387.0, |
| "step": 3250 |
| }, |
| { |
| "epoch": 8.740939597315435, |
| "grad_norm": 0.504896879196167, |
| "learning_rate": 7.540604367050091e-06, |
| "loss": 0.1341, |
| "mean_token_accuracy": 0.9530034631490707, |
| "num_tokens": 7103855.0, |
| "step": 3260 |
| }, |
| { |
| "epoch": 8.767785234899328, |
| "grad_norm": 0.5507974624633789, |
| "learning_rate": 7.525591712272574e-06, |
| "loss": 0.1216, |
| "mean_token_accuracy": 0.9571410089731216, |
| "num_tokens": 7125521.0, |
| "step": 3270 |
| }, |
| { |
| "epoch": 8.794630872483221, |
| "grad_norm": 0.5960702300071716, |
| "learning_rate": 7.510548427065693e-06, |
| "loss": 0.1717, |
| "mean_token_accuracy": 0.9465545684099197, |
| "num_tokens": 7145574.0, |
| "step": 3280 |
| }, |
| { |
| "epoch": 8.821476510067114, |
| "grad_norm": 0.4573246240615845, |
| "learning_rate": 7.495474693874731e-06, |
| "loss": 0.1201, |
| "mean_token_accuracy": 0.9599078118801116, |
| "num_tokens": 7167649.0, |
| "step": 3290 |
| }, |
| { |
| "epoch": 8.848322147651007, |
| "grad_norm": 0.4620523154735565, |
| "learning_rate": 7.4803706955142385e-06, |
| "loss": 0.1269, |
| "mean_token_accuracy": 0.9555114895105362, |
| "num_tokens": 7190906.0, |
| "step": 3300 |
| }, |
| { |
| "epoch": 8.875167785234899, |
| "grad_norm": 0.438799113035202, |
| "learning_rate": 7.465236615165826e-06, |
| "loss": 0.1204, |
| "mean_token_accuracy": 0.9585326343774796, |
| "num_tokens": 7213279.0, |
| "step": 3310 |
| }, |
| { |
| "epoch": 8.902013422818792, |
| "grad_norm": 0.7440369129180908, |
| "learning_rate": 7.450072636375939e-06, |
| "loss": 0.1266, |
| "mean_token_accuracy": 0.9576111942529678, |
| "num_tokens": 7234946.0, |
| "step": 3320 |
| }, |
| { |
| "epoch": 8.928859060402685, |
| "grad_norm": 0.6318687796592712, |
| "learning_rate": 7.4348789430536275e-06, |
| "loss": 0.1714, |
| "mean_token_accuracy": 0.9475503444671631, |
| "num_tokens": 7255133.0, |
| "step": 3330 |
| }, |
| { |
| "epoch": 8.955704697986578, |
| "grad_norm": 0.523369312286377, |
| "learning_rate": 7.4196557194683265e-06, |
| "loss": 0.1218, |
| "mean_token_accuracy": 0.9601493507623673, |
| "num_tokens": 7277115.0, |
| "step": 3340 |
| }, |
| { |
| "epoch": 8.982550335570469, |
| "grad_norm": 0.592207133769989, |
| "learning_rate": 7.40440315024761e-06, |
| "loss": 0.1281, |
| "mean_token_accuracy": 0.9569019913673401, |
| "num_tokens": 7299266.0, |
| "step": 3350 |
| }, |
| { |
| "epoch": 9.008053691275167, |
| "grad_norm": 0.656379759311676, |
| "learning_rate": 7.389121420374961e-06, |
| "loss": 0.1389, |
| "mean_token_accuracy": 0.9550388643616124, |
| "num_tokens": 7318761.0, |
| "step": 3360 |
| }, |
| { |
| "epoch": 9.03489932885906, |
| "grad_norm": 0.5115005970001221, |
| "learning_rate": 7.373810715187516e-06, |
| "loss": 0.1067, |
| "mean_token_accuracy": 0.9614180713891983, |
| "num_tokens": 7342447.0, |
| "step": 3370 |
| }, |
| { |
| "epoch": 9.061744966442953, |
| "grad_norm": 0.4467087984085083, |
| "learning_rate": 7.358471220373831e-06, |
| "loss": 0.1185, |
| "mean_token_accuracy": 0.9584500521421433, |
| "num_tokens": 7365127.0, |
| "step": 3380 |
| }, |
| { |
| "epoch": 9.088590604026846, |
| "grad_norm": 0.6290796399116516, |
| "learning_rate": 7.343103121971623e-06, |
| "loss": 0.1196, |
| "mean_token_accuracy": 0.9590140283107758, |
| "num_tokens": 7386969.0, |
| "step": 3390 |
| }, |
| { |
| "epoch": 9.115436241610738, |
| "grad_norm": 0.684598445892334, |
| "learning_rate": 7.327706606365512e-06, |
| "loss": 0.1561, |
| "mean_token_accuracy": 0.9502658367156982, |
| "num_tokens": 7407550.0, |
| "step": 3400 |
| }, |
| { |
| "epoch": 9.14228187919463, |
| "grad_norm": 0.7307904362678528, |
| "learning_rate": 7.3122818602847624e-06, |
| "loss": 0.129, |
| "mean_token_accuracy": 0.9587647318840027, |
| "num_tokens": 7428380.0, |
| "step": 3410 |
| }, |
| { |
| "epoch": 9.169127516778524, |
| "grad_norm": 0.6057212352752686, |
| "learning_rate": 7.296829070801017e-06, |
| "loss": 0.1098, |
| "mean_token_accuracy": 0.9636854767799378, |
| "num_tokens": 7452040.0, |
| "step": 3420 |
| }, |
| { |
| "epoch": 9.195973154362417, |
| "grad_norm": 0.589817225933075, |
| "learning_rate": 7.281348425326034e-06, |
| "loss": 0.1226, |
| "mean_token_accuracy": 0.9586849749088288, |
| "num_tokens": 7474538.0, |
| "step": 3430 |
| }, |
| { |
| "epoch": 9.22281879194631, |
| "grad_norm": 0.8453562259674072, |
| "learning_rate": 7.265840111609405e-06, |
| "loss": 0.1187, |
| "mean_token_accuracy": 0.9598051875829696, |
| "num_tokens": 7496163.0, |
| "step": 3440 |
| }, |
| { |
| "epoch": 9.2496644295302, |
| "grad_norm": 0.6440286636352539, |
| "learning_rate": 7.250304317736286e-06, |
| "loss": 0.1622, |
| "mean_token_accuracy": 0.9493000030517578, |
| "num_tokens": 7516288.0, |
| "step": 3450 |
| }, |
| { |
| "epoch": 9.276510067114094, |
| "grad_norm": 0.7584598064422607, |
| "learning_rate": 7.234741232125111e-06, |
| "loss": 0.1232, |
| "mean_token_accuracy": 0.9604197144508362, |
| "num_tokens": 7537035.0, |
| "step": 3460 |
| }, |
| { |
| "epoch": 9.303355704697987, |
| "grad_norm": 0.623458743095398, |
| "learning_rate": 7.219151043525311e-06, |
| "loss": 0.1179, |
| "mean_token_accuracy": 0.9581877291202545, |
| "num_tokens": 7560587.0, |
| "step": 3470 |
| }, |
| { |
| "epoch": 9.33020134228188, |
| "grad_norm": 0.5248258113861084, |
| "learning_rate": 7.203533941015019e-06, |
| "loss": 0.1235, |
| "mean_token_accuracy": 0.9568084627389908, |
| "num_tokens": 7583176.0, |
| "step": 3480 |
| }, |
| { |
| "epoch": 9.357046979865771, |
| "grad_norm": 0.8949034214019775, |
| "learning_rate": 7.1878901139987826e-06, |
| "loss": 0.1232, |
| "mean_token_accuracy": 0.9586084365844727, |
| "num_tokens": 7604880.0, |
| "step": 3490 |
| }, |
| { |
| "epoch": 9.383892617449664, |
| "grad_norm": 0.7485925555229187, |
| "learning_rate": 7.172219752205265e-06, |
| "loss": 0.1614, |
| "mean_token_accuracy": 0.9506101936101914, |
| "num_tokens": 7625182.0, |
| "step": 3500 |
| }, |
| { |
| "epoch": 9.410738255033557, |
| "grad_norm": 0.6359798312187195, |
| "learning_rate": 7.156523045684944e-06, |
| "loss": 0.1216, |
| "mean_token_accuracy": 0.9604356437921524, |
| "num_tokens": 7645893.0, |
| "step": 3510 |
| }, |
| { |
| "epoch": 9.43758389261745, |
| "grad_norm": 0.6602040529251099, |
| "learning_rate": 7.140800184807805e-06, |
| "loss": 0.1085, |
| "mean_token_accuracy": 0.9612102717161178, |
| "num_tokens": 7669544.0, |
| "step": 3520 |
| }, |
| { |
| "epoch": 9.464429530201341, |
| "grad_norm": 0.5637961030006409, |
| "learning_rate": 7.1250513602610364e-06, |
| "loss": 0.1246, |
| "mean_token_accuracy": 0.9569024622440339, |
| "num_tokens": 7692213.0, |
| "step": 3530 |
| }, |
| { |
| "epoch": 9.491275167785235, |
| "grad_norm": 0.7590091228485107, |
| "learning_rate": 7.109276763046713e-06, |
| "loss": 0.1172, |
| "mean_token_accuracy": 0.9609732508659363, |
| "num_tokens": 7713906.0, |
| "step": 3540 |
| }, |
| { |
| "epoch": 9.518120805369128, |
| "grad_norm": 0.8090452551841736, |
| "learning_rate": 7.09347658447948e-06, |
| "loss": 0.1623, |
| "mean_token_accuracy": 0.947560715675354, |
| "num_tokens": 7734201.0, |
| "step": 3550 |
| }, |
| { |
| "epoch": 9.54496644295302, |
| "grad_norm": 0.5834486484527588, |
| "learning_rate": 7.077651016184235e-06, |
| "loss": 0.1259, |
| "mean_token_accuracy": 0.9605411350727081, |
| "num_tokens": 7755049.0, |
| "step": 3560 |
| }, |
| { |
| "epoch": 9.571812080536914, |
| "grad_norm": 0.541755735874176, |
| "learning_rate": 7.061800250093804e-06, |
| "loss": 0.116, |
| "mean_token_accuracy": 0.9606775552034378, |
| "num_tokens": 7778737.0, |
| "step": 3570 |
| }, |
| { |
| "epoch": 9.598657718120805, |
| "grad_norm": 0.5823907852172852, |
| "learning_rate": 7.0459244784466115e-06, |
| "loss": 0.1249, |
| "mean_token_accuracy": 0.9562828868627549, |
| "num_tokens": 7801497.0, |
| "step": 3580 |
| }, |
| { |
| "epoch": 9.625503355704698, |
| "grad_norm": 0.7929330468177795, |
| "learning_rate": 7.03002389378435e-06, |
| "loss": 0.1255, |
| "mean_token_accuracy": 0.9591152399778367, |
| "num_tokens": 7823289.0, |
| "step": 3590 |
| }, |
| { |
| "epoch": 9.65234899328859, |
| "grad_norm": 0.7745299935340881, |
| "learning_rate": 7.014098688949643e-06, |
| "loss": 0.168, |
| "mean_token_accuracy": 0.9466694802045822, |
| "num_tokens": 7843625.0, |
| "step": 3600 |
| }, |
| { |
| "epoch": 9.679194630872484, |
| "grad_norm": 0.6910697817802429, |
| "learning_rate": 6.998149057083711e-06, |
| "loss": 0.1208, |
| "mean_token_accuracy": 0.9632305800914764, |
| "num_tokens": 7864423.0, |
| "step": 3610 |
| }, |
| { |
| "epoch": 9.706040268456375, |
| "grad_norm": 0.7385261654853821, |
| "learning_rate": 6.982175191624022e-06, |
| "loss": 0.1148, |
| "mean_token_accuracy": 0.9590048015117645, |
| "num_tokens": 7888184.0, |
| "step": 3620 |
| }, |
| { |
| "epoch": 9.732885906040268, |
| "grad_norm": 0.6523067355155945, |
| "learning_rate": 6.966177286301954e-06, |
| "loss": 0.1232, |
| "mean_token_accuracy": 0.9572295129299164, |
| "num_tokens": 7910819.0, |
| "step": 3630 |
| }, |
| { |
| "epoch": 9.759731543624161, |
| "grad_norm": 0.7242245674133301, |
| "learning_rate": 6.950155535140439e-06, |
| "loss": 0.1198, |
| "mean_token_accuracy": 0.9601730585098267, |
| "num_tokens": 7932588.0, |
| "step": 3640 |
| }, |
| { |
| "epoch": 9.786577181208054, |
| "grad_norm": 0.724774181842804, |
| "learning_rate": 6.934110132451611e-06, |
| "loss": 0.1595, |
| "mean_token_accuracy": 0.9477441519498825, |
| "num_tokens": 7953004.0, |
| "step": 3650 |
| }, |
| { |
| "epoch": 9.813422818791945, |
| "grad_norm": 0.5886031985282898, |
| "learning_rate": 6.918041272834451e-06, |
| "loss": 0.1317, |
| "mean_token_accuracy": 0.9589419364929199, |
| "num_tokens": 7973860.0, |
| "step": 3660 |
| }, |
| { |
| "epoch": 9.840268456375838, |
| "grad_norm": 0.5724550485610962, |
| "learning_rate": 6.901949151172427e-06, |
| "loss": 0.1139, |
| "mean_token_accuracy": 0.9589464545249939, |
| "num_tokens": 7997506.0, |
| "step": 3670 |
| }, |
| { |
| "epoch": 9.867114093959731, |
| "grad_norm": 0.6673897504806519, |
| "learning_rate": 6.885833962631126e-06, |
| "loss": 0.1317, |
| "mean_token_accuracy": 0.954210615158081, |
| "num_tokens": 8020020.0, |
| "step": 3680 |
| }, |
| { |
| "epoch": 9.893959731543625, |
| "grad_norm": 0.7852922081947327, |
| "learning_rate": 6.869695902655898e-06, |
| "loss": 0.1195, |
| "mean_token_accuracy": 0.9600458711385726, |
| "num_tokens": 8041729.0, |
| "step": 3690 |
| }, |
| { |
| "epoch": 9.920805369127518, |
| "grad_norm": 0.6804185509681702, |
| "learning_rate": 6.8535351669694694e-06, |
| "loss": 0.1654, |
| "mean_token_accuracy": 0.947740015387535, |
| "num_tokens": 8061952.0, |
| "step": 3700 |
| }, |
| { |
| "epoch": 9.947651006711409, |
| "grad_norm": 0.5824105143547058, |
| "learning_rate": 6.837351951569584e-06, |
| "loss": 0.1306, |
| "mean_token_accuracy": 0.9590909868478775, |
| "num_tokens": 8082665.0, |
| "step": 3710 |
| }, |
| { |
| "epoch": 9.974496644295302, |
| "grad_norm": 0.5961267948150635, |
| "learning_rate": 6.821146452726617e-06, |
| "loss": 0.1166, |
| "mean_token_accuracy": 0.959626880288124, |
| "num_tokens": 8105297.0, |
| "step": 3720 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 1.5416700839996338, |
| "learning_rate": 6.8049188669812024e-06, |
| "loss": 0.1366, |
| "mean_token_accuracy": 0.9568514949397037, |
| "num_tokens": 8123690.0, |
| "step": 3730 |
| }, |
| { |
| "epoch": 10.026845637583893, |
| "grad_norm": 0.5632114410400391, |
| "learning_rate": 6.788669391141837e-06, |
| "loss": 0.0991, |
| "mean_token_accuracy": 0.9649410545825958, |
| "num_tokens": 8147756.0, |
| "step": 3740 |
| }, |
| { |
| "epoch": 10.053691275167786, |
| "grad_norm": 0.6619052290916443, |
| "learning_rate": 6.772398222282507e-06, |
| "loss": 0.1241, |
| "mean_token_accuracy": 0.9557583898305893, |
| "num_tokens": 8170704.0, |
| "step": 3750 |
| }, |
| { |
| "epoch": 10.080536912751677, |
| "grad_norm": 0.7061659097671509, |
| "learning_rate": 6.756105557740289e-06, |
| "loss": 0.1112, |
| "mean_token_accuracy": 0.961877191066742, |
| "num_tokens": 8192810.0, |
| "step": 3760 |
| }, |
| { |
| "epoch": 10.10738255033557, |
| "grad_norm": 0.8877546787261963, |
| "learning_rate": 6.739791595112964e-06, |
| "loss": 0.1295, |
| "mean_token_accuracy": 0.9570236325263977, |
| "num_tokens": 8213881.0, |
| "step": 3770 |
| }, |
| { |
| "epoch": 10.134228187919463, |
| "grad_norm": 0.7186614871025085, |
| "learning_rate": 6.7234565322566116e-06, |
| "loss": 0.1394, |
| "mean_token_accuracy": 0.9572271972894668, |
| "num_tokens": 8233288.0, |
| "step": 3780 |
| }, |
| { |
| "epoch": 10.161073825503356, |
| "grad_norm": 0.8258345723152161, |
| "learning_rate": 6.707100567283217e-06, |
| "loss": 0.1079, |
| "mean_token_accuracy": 0.9627465546131134, |
| "num_tokens": 8257423.0, |
| "step": 3790 |
| }, |
| { |
| "epoch": 10.187919463087248, |
| "grad_norm": 0.721269428730011, |
| "learning_rate": 6.690723898558267e-06, |
| "loss": 0.1125, |
| "mean_token_accuracy": 0.959991529583931, |
| "num_tokens": 8280282.0, |
| "step": 3800 |
| }, |
| { |
| "epoch": 10.21476510067114, |
| "grad_norm": 0.6415335536003113, |
| "learning_rate": 6.6743267246983445e-06, |
| "loss": 0.1063, |
| "mean_token_accuracy": 0.9647485375404358, |
| "num_tokens": 8302156.0, |
| "step": 3810 |
| }, |
| { |
| "epoch": 10.241610738255034, |
| "grad_norm": 1.024770736694336, |
| "learning_rate": 6.657909244568721e-06, |
| "loss": 0.1499, |
| "mean_token_accuracy": 0.9528948366641998, |
| "num_tokens": 8322759.0, |
| "step": 3820 |
| }, |
| { |
| "epoch": 10.268456375838927, |
| "grad_norm": 0.6429427266120911, |
| "learning_rate": 6.641471657280937e-06, |
| "loss": 0.132, |
| "mean_token_accuracy": 0.959844994544983, |
| "num_tokens": 8342010.0, |
| "step": 3830 |
| }, |
| { |
| "epoch": 10.29530201342282, |
| "grad_norm": 0.7093128561973572, |
| "learning_rate": 6.625014162190397e-06, |
| "loss": 0.1042, |
| "mean_token_accuracy": 0.9622471898794174, |
| "num_tokens": 8366113.0, |
| "step": 3840 |
| }, |
| { |
| "epoch": 10.322147651006711, |
| "grad_norm": 0.5344433784484863, |
| "learning_rate": 6.608536958893948e-06, |
| "loss": 0.1119, |
| "mean_token_accuracy": 0.9615410745143891, |
| "num_tokens": 8388950.0, |
| "step": 3850 |
| }, |
| { |
| "epoch": 10.348993288590604, |
| "grad_norm": 0.6051499843597412, |
| "learning_rate": 6.59204024722746e-06, |
| "loss": 0.1083, |
| "mean_token_accuracy": 0.9628863126039505, |
| "num_tokens": 8410787.0, |
| "step": 3860 |
| }, |
| { |
| "epoch": 10.375838926174497, |
| "grad_norm": 0.9594293236732483, |
| "learning_rate": 6.575524227263397e-06, |
| "loss": 0.1526, |
| "mean_token_accuracy": 0.951352596282959, |
| "num_tokens": 8431450.0, |
| "step": 3870 |
| }, |
| { |
| "epoch": 10.40268456375839, |
| "grad_norm": 0.7486820816993713, |
| "learning_rate": 6.5589890993083934e-06, |
| "loss": 0.1314, |
| "mean_token_accuracy": 0.9604231595993042, |
| "num_tokens": 8450653.0, |
| "step": 3880 |
| }, |
| { |
| "epoch": 10.429530201342281, |
| "grad_norm": 0.8094140887260437, |
| "learning_rate": 6.542435063900834e-06, |
| "loss": 0.1078, |
| "mean_token_accuracy": 0.9626900613307953, |
| "num_tokens": 8474784.0, |
| "step": 3890 |
| }, |
| { |
| "epoch": 10.456375838926174, |
| "grad_norm": 1.2699894905090332, |
| "learning_rate": 6.525862321808403e-06, |
| "loss": 0.1137, |
| "mean_token_accuracy": 0.9591136366128922, |
| "num_tokens": 8497679.0, |
| "step": 3900 |
| }, |
| { |
| "epoch": 10.483221476510067, |
| "grad_norm": 0.7747824788093567, |
| "learning_rate": 6.509271074025668e-06, |
| "loss": 0.1107, |
| "mean_token_accuracy": 0.9634034723043442, |
| "num_tokens": 8519621.0, |
| "step": 3910 |
| }, |
| { |
| "epoch": 10.51006711409396, |
| "grad_norm": 0.882911741733551, |
| "learning_rate": 6.49266152177163e-06, |
| "loss": 0.153, |
| "mean_token_accuracy": 0.9512349933385849, |
| "num_tokens": 8540214.0, |
| "step": 3920 |
| }, |
| { |
| "epoch": 10.536912751677852, |
| "grad_norm": 0.6655781865119934, |
| "learning_rate": 6.476033866487287e-06, |
| "loss": 0.1283, |
| "mean_token_accuracy": 0.9606866925954819, |
| "num_tokens": 8559450.0, |
| "step": 3930 |
| }, |
| { |
| "epoch": 10.563758389261745, |
| "grad_norm": 0.6774150729179382, |
| "learning_rate": 6.459388309833193e-06, |
| "loss": 0.1069, |
| "mean_token_accuracy": 0.9630162745714188, |
| "num_tokens": 8583592.0, |
| "step": 3940 |
| }, |
| { |
| "epoch": 10.590604026845638, |
| "grad_norm": 0.764737606048584, |
| "learning_rate": 6.442725053687009e-06, |
| "loss": 0.1205, |
| "mean_token_accuracy": 0.9564787149429321, |
| "num_tokens": 8606534.0, |
| "step": 3950 |
| }, |
| { |
| "epoch": 10.61744966442953, |
| "grad_norm": 0.6622138023376465, |
| "learning_rate": 6.426044300141054e-06, |
| "loss": 0.1116, |
| "mean_token_accuracy": 0.9616225004196167, |
| "num_tokens": 8628580.0, |
| "step": 3960 |
| }, |
| { |
| "epoch": 10.644295302013422, |
| "grad_norm": 1.1456904411315918, |
| "learning_rate": 6.409346251499859e-06, |
| "loss": 0.147, |
| "mean_token_accuracy": 0.9527129501104354, |
| "num_tokens": 8649304.0, |
| "step": 3970 |
| }, |
| { |
| "epoch": 10.671140939597315, |
| "grad_norm": 0.723003625869751, |
| "learning_rate": 6.392631110277707e-06, |
| "loss": 0.1241, |
| "mean_token_accuracy": 0.9604730904102325, |
| "num_tokens": 8668520.0, |
| "step": 3980 |
| }, |
| { |
| "epoch": 10.697986577181208, |
| "grad_norm": 0.81331866979599, |
| "learning_rate": 6.375899079196184e-06, |
| "loss": 0.1078, |
| "mean_token_accuracy": 0.9621582269668579, |
| "num_tokens": 8692652.0, |
| "step": 3990 |
| }, |
| { |
| "epoch": 10.724832214765101, |
| "grad_norm": 0.6479800343513489, |
| "learning_rate": 6.3591503611817155e-06, |
| "loss": 0.1157, |
| "mean_token_accuracy": 0.9594815254211426, |
| "num_tokens": 8715659.0, |
| "step": 4000 |
| }, |
| { |
| "epoch": 10.751677852348994, |
| "grad_norm": 0.7673507332801819, |
| "learning_rate": 6.342385159363102e-06, |
| "loss": 0.1183, |
| "mean_token_accuracy": 0.959958502650261, |
| "num_tokens": 8737763.0, |
| "step": 4010 |
| }, |
| { |
| "epoch": 10.778523489932885, |
| "grad_norm": 1.0533620119094849, |
| "learning_rate": 6.325603677069067e-06, |
| "loss": 0.1515, |
| "mean_token_accuracy": 0.9495168924331665, |
| "num_tokens": 8758611.0, |
| "step": 4020 |
| }, |
| { |
| "epoch": 10.805369127516778, |
| "grad_norm": 0.8211329579353333, |
| "learning_rate": 6.308806117825777e-06, |
| "loss": 0.1358, |
| "mean_token_accuracy": 0.9576447039842606, |
| "num_tokens": 8777989.0, |
| "step": 4030 |
| }, |
| { |
| "epoch": 10.832214765100671, |
| "grad_norm": 0.7280214428901672, |
| "learning_rate": 6.291992685354386e-06, |
| "loss": 0.1117, |
| "mean_token_accuracy": 0.961353474855423, |
| "num_tokens": 8802066.0, |
| "step": 4040 |
| }, |
| { |
| "epoch": 10.859060402684564, |
| "grad_norm": 0.7263833284378052, |
| "learning_rate": 6.2751635835685575e-06, |
| "loss": 0.1164, |
| "mean_token_accuracy": 0.9582950919866562, |
| "num_tokens": 8824929.0, |
| "step": 4050 |
| }, |
| { |
| "epoch": 10.885906040268456, |
| "grad_norm": 0.7844878435134888, |
| "learning_rate": 6.25831901657199e-06, |
| "loss": 0.1105, |
| "mean_token_accuracy": 0.9616876095533371, |
| "num_tokens": 8846953.0, |
| "step": 4060 |
| }, |
| { |
| "epoch": 10.912751677852349, |
| "grad_norm": 1.225616693496704, |
| "learning_rate": 6.241459188655944e-06, |
| "loss": 0.1451, |
| "mean_token_accuracy": 0.9530714869499206, |
| "num_tokens": 8867986.0, |
| "step": 4070 |
| }, |
| { |
| "epoch": 10.939597315436242, |
| "grad_norm": 0.8232021927833557, |
| "learning_rate": 6.224584304296769e-06, |
| "loss": 0.1358, |
| "mean_token_accuracy": 0.9574162900447846, |
| "num_tokens": 8887395.0, |
| "step": 4080 |
| }, |
| { |
| "epoch": 10.966442953020135, |
| "grad_norm": 0.7063604593276978, |
| "learning_rate": 6.207694568153418e-06, |
| "loss": 0.1131, |
| "mean_token_accuracy": 0.9610060393810272, |
| "num_tokens": 8910823.0, |
| "step": 4090 |
| }, |
| { |
| "epoch": 10.993288590604028, |
| "grad_norm": 0.9226030707359314, |
| "learning_rate": 6.1907901850649636e-06, |
| "loss": 0.1344, |
| "mean_token_accuracy": 0.9559124350547791, |
| "num_tokens": 8931972.0, |
| "step": 4100 |
| }, |
| { |
| "epoch": 11.018791946308724, |
| "grad_norm": 0.6862479448318481, |
| "learning_rate": 6.1738713600481205e-06, |
| "loss": 0.106, |
| "mean_token_accuracy": 0.9632336026743838, |
| "num_tokens": 8953111.0, |
| "step": 4110 |
| }, |
| { |
| "epoch": 11.045637583892617, |
| "grad_norm": 0.8745356202125549, |
| "learning_rate": 6.156938298294752e-06, |
| "loss": 0.1082, |
| "mean_token_accuracy": 0.9619537621736527, |
| "num_tokens": 8976409.0, |
| "step": 4120 |
| }, |
| { |
| "epoch": 11.07248322147651, |
| "grad_norm": 0.7044370770454407, |
| "learning_rate": 6.139991205169391e-06, |
| "loss": 0.1108, |
| "mean_token_accuracy": 0.9622800439596176, |
| "num_tokens": 8998696.0, |
| "step": 4130 |
| }, |
| { |
| "epoch": 11.099328859060403, |
| "grad_norm": 1.0560659170150757, |
| "learning_rate": 6.123030286206736e-06, |
| "loss": 0.1132, |
| "mean_token_accuracy": 0.9616888105869293, |
| "num_tokens": 9020113.0, |
| "step": 4140 |
| }, |
| { |
| "epoch": 11.126174496644296, |
| "grad_norm": 0.7895893454551697, |
| "learning_rate": 6.106055747109169e-06, |
| "loss": 0.1325, |
| "mean_token_accuracy": 0.9582968652248383, |
| "num_tokens": 9039986.0, |
| "step": 4150 |
| }, |
| { |
| "epoch": 11.153020134228187, |
| "grad_norm": 0.9621571898460388, |
| "learning_rate": 6.089067793744258e-06, |
| "loss": 0.1044, |
| "mean_token_accuracy": 0.9661745488643646, |
| "num_tokens": 9062685.0, |
| "step": 4160 |
| }, |
| { |
| "epoch": 11.17986577181208, |
| "grad_norm": 0.9362059831619263, |
| "learning_rate": 6.0720666321422574e-06, |
| "loss": 0.1076, |
| "mean_token_accuracy": 0.9622334897518158, |
| "num_tokens": 9085989.0, |
| "step": 4170 |
| }, |
| { |
| "epoch": 11.206711409395973, |
| "grad_norm": 1.2377800941467285, |
| "learning_rate": 6.055052468493614e-06, |
| "loss": 0.114, |
| "mean_token_accuracy": 0.9611568659543991, |
| "num_tokens": 9108336.0, |
| "step": 4180 |
| }, |
| { |
| "epoch": 11.233557046979866, |
| "grad_norm": 1.4495049715042114, |
| "learning_rate": 6.038025509146459e-06, |
| "loss": 0.1238, |
| "mean_token_accuracy": 0.9592485100030899, |
| "num_tokens": 9129631.0, |
| "step": 4190 |
| }, |
| { |
| "epoch": 11.260402684563758, |
| "grad_norm": 0.8522602915763855, |
| "learning_rate": 6.020985960604115e-06, |
| "loss": 0.1382, |
| "mean_token_accuracy": 0.9578620493412018, |
| "num_tokens": 9149360.0, |
| "step": 4200 |
| }, |
| { |
| "epoch": 11.28724832214765, |
| "grad_norm": 1.1033200025558472, |
| "learning_rate": 6.0039340295225845e-06, |
| "loss": 0.1053, |
| "mean_token_accuracy": 0.9642931789159774, |
| "num_tokens": 9172104.0, |
| "step": 4210 |
| }, |
| { |
| "epoch": 11.314093959731544, |
| "grad_norm": 0.9398940205574036, |
| "learning_rate": 5.986869922708048e-06, |
| "loss": 0.1073, |
| "mean_token_accuracy": 0.9612972408533096, |
| "num_tokens": 9195342.0, |
| "step": 4220 |
| }, |
| { |
| "epoch": 11.340939597315437, |
| "grad_norm": 0.7499808073043823, |
| "learning_rate": 5.969793847114349e-06, |
| "loss": 0.1083, |
| "mean_token_accuracy": 0.9620550066232681, |
| "num_tokens": 9217655.0, |
| "step": 4230 |
| }, |
| { |
| "epoch": 11.367785234899328, |
| "grad_norm": 1.2773305177688599, |
| "learning_rate": 5.952706009840491e-06, |
| "loss": 0.1168, |
| "mean_token_accuracy": 0.9608386933803559, |
| "num_tokens": 9239082.0, |
| "step": 4240 |
| }, |
| { |
| "epoch": 11.394630872483221, |
| "grad_norm": 0.9053451418876648, |
| "learning_rate": 5.935606618128124e-06, |
| "loss": 0.1434, |
| "mean_token_accuracy": 0.9550743252038956, |
| "num_tokens": 9258824.0, |
| "step": 4250 |
| }, |
| { |
| "epoch": 11.421476510067114, |
| "grad_norm": 0.9365808367729187, |
| "learning_rate": 5.918495879359032e-06, |
| "loss": 0.1042, |
| "mean_token_accuracy": 0.9643994092941284, |
| "num_tokens": 9281610.0, |
| "step": 4260 |
| }, |
| { |
| "epoch": 11.448322147651007, |
| "grad_norm": 0.958976149559021, |
| "learning_rate": 5.901374001052614e-06, |
| "loss": 0.1048, |
| "mean_token_accuracy": 0.9623458862304688, |
| "num_tokens": 9304906.0, |
| "step": 4270 |
| }, |
| { |
| "epoch": 11.4751677852349, |
| "grad_norm": 1.1884407997131348, |
| "learning_rate": 5.884241190863367e-06, |
| "loss": 0.1097, |
| "mean_token_accuracy": 0.9642301768064498, |
| "num_tokens": 9327141.0, |
| "step": 4280 |
| }, |
| { |
| "epoch": 11.502013422818791, |
| "grad_norm": 1.3043391704559326, |
| "learning_rate": 5.867097656578375e-06, |
| "loss": 0.1276, |
| "mean_token_accuracy": 0.9575917005538941, |
| "num_tokens": 9348426.0, |
| "step": 4290 |
| }, |
| { |
| "epoch": 11.528859060402684, |
| "grad_norm": 0.8189520835876465, |
| "learning_rate": 5.849943606114782e-06, |
| "loss": 0.1394, |
| "mean_token_accuracy": 0.9571978777647019, |
| "num_tokens": 9368198.0, |
| "step": 4300 |
| }, |
| { |
| "epoch": 11.555704697986577, |
| "grad_norm": 0.9798721075057983, |
| "learning_rate": 5.832779247517273e-06, |
| "loss": 0.1064, |
| "mean_token_accuracy": 0.9647763341665268, |
| "num_tokens": 9390810.0, |
| "step": 4310 |
| }, |
| { |
| "epoch": 11.58255033557047, |
| "grad_norm": 0.7926787734031677, |
| "learning_rate": 5.815604788955549e-06, |
| "loss": 0.1155, |
| "mean_token_accuracy": 0.9591783225536347, |
| "num_tokens": 9413905.0, |
| "step": 4320 |
| }, |
| { |
| "epoch": 11.609395973154362, |
| "grad_norm": 0.7051096558570862, |
| "learning_rate": 5.798420438721804e-06, |
| "loss": 0.1091, |
| "mean_token_accuracy": 0.9627108782529831, |
| "num_tokens": 9436047.0, |
| "step": 4330 |
| }, |
| { |
| "epoch": 11.636241610738255, |
| "grad_norm": 1.5358829498291016, |
| "learning_rate": 5.781226405228201e-06, |
| "loss": 0.1291, |
| "mean_token_accuracy": 0.956219607591629, |
| "num_tokens": 9457284.0, |
| "step": 4340 |
| }, |
| { |
| "epoch": 11.663087248322148, |
| "grad_norm": 0.8946168422698975, |
| "learning_rate": 5.764022897004336e-06, |
| "loss": 0.1376, |
| "mean_token_accuracy": 0.957264369726181, |
| "num_tokens": 9477014.0, |
| "step": 4350 |
| }, |
| { |
| "epoch": 11.68993288590604, |
| "grad_norm": 0.8450965285301208, |
| "learning_rate": 5.74681012269472e-06, |
| "loss": 0.1076, |
| "mean_token_accuracy": 0.9647704660892487, |
| "num_tokens": 9499686.0, |
| "step": 4360 |
| }, |
| { |
| "epoch": 11.716778523489932, |
| "grad_norm": 0.8520516157150269, |
| "learning_rate": 5.729588291056243e-06, |
| "loss": 0.1049, |
| "mean_token_accuracy": 0.9633782804012299, |
| "num_tokens": 9522983.0, |
| "step": 4370 |
| }, |
| { |
| "epoch": 11.743624161073825, |
| "grad_norm": 0.7502100467681885, |
| "learning_rate": 5.7123576109556386e-06, |
| "loss": 0.1043, |
| "mean_token_accuracy": 0.9639011263847351, |
| "num_tokens": 9545186.0, |
| "step": 4380 |
| }, |
| { |
| "epoch": 11.770469798657718, |
| "grad_norm": 1.220030665397644, |
| "learning_rate": 5.695118291366959e-06, |
| "loss": 0.1245, |
| "mean_token_accuracy": 0.9588271796703338, |
| "num_tokens": 9566314.0, |
| "step": 4390 |
| }, |
| { |
| "epoch": 11.797315436241611, |
| "grad_norm": 1.0362236499786377, |
| "learning_rate": 5.677870541369034e-06, |
| "loss": 0.1407, |
| "mean_token_accuracy": 0.9561160743236542, |
| "num_tokens": 9585969.0, |
| "step": 4400 |
| }, |
| { |
| "epoch": 11.824161073825504, |
| "grad_norm": 1.0146335363388062, |
| "learning_rate": 5.660614570142938e-06, |
| "loss": 0.1032, |
| "mean_token_accuracy": 0.9659360885620117, |
| "num_tokens": 9608480.0, |
| "step": 4410 |
| }, |
| { |
| "epoch": 11.851006711409395, |
| "grad_norm": 1.0298837423324585, |
| "learning_rate": 5.643350586969453e-06, |
| "loss": 0.1067, |
| "mean_token_accuracy": 0.9613613903522491, |
| "num_tokens": 9631569.0, |
| "step": 4420 |
| }, |
| { |
| "epoch": 11.877852348993288, |
| "grad_norm": 0.7821735739707947, |
| "learning_rate": 5.626078801226528e-06, |
| "loss": 0.1113, |
| "mean_token_accuracy": 0.961196494102478, |
| "num_tokens": 9653749.0, |
| "step": 4430 |
| }, |
| { |
| "epoch": 11.904697986577181, |
| "grad_norm": 2.3024230003356934, |
| "learning_rate": 5.608799422386744e-06, |
| "loss": 0.1292, |
| "mean_token_accuracy": 0.9577797710895538, |
| "num_tokens": 9674872.0, |
| "step": 4440 |
| }, |
| { |
| "epoch": 11.931543624161074, |
| "grad_norm": 0.9050582051277161, |
| "learning_rate": 5.591512660014773e-06, |
| "loss": 0.1369, |
| "mean_token_accuracy": 0.9575184613466263, |
| "num_tokens": 9694480.0, |
| "step": 4450 |
| }, |
| { |
| "epoch": 11.958389261744966, |
| "grad_norm": 0.9048384428024292, |
| "learning_rate": 5.57421872376483e-06, |
| "loss": 0.1071, |
| "mean_token_accuracy": 0.9633950978517533, |
| "num_tokens": 9716740.0, |
| "step": 4460 |
| }, |
| { |
| "epoch": 11.985234899328859, |
| "grad_norm": 1.0750577449798584, |
| "learning_rate": 5.5569178233781384e-06, |
| "loss": 0.1131, |
| "mean_token_accuracy": 0.9621215164661407, |
| "num_tokens": 9738481.0, |
| "step": 4470 |
| }, |
| { |
| "epoch": 12.010738255033557, |
| "grad_norm": 0.8347861170768738, |
| "learning_rate": 5.539610168680381e-06, |
| "loss": 0.1152, |
| "mean_token_accuracy": 0.964070222879711, |
| "num_tokens": 9758315.0, |
| "step": 4480 |
| }, |
| { |
| "epoch": 12.03758389261745, |
| "grad_norm": 0.9832549691200256, |
| "learning_rate": 5.522295969579157e-06, |
| "loss": 0.0956, |
| "mean_token_accuracy": 0.9664029866456986, |
| "num_tokens": 9782015.0, |
| "step": 4490 |
| }, |
| { |
| "epoch": 12.064429530201343, |
| "grad_norm": 0.913514256477356, |
| "learning_rate": 5.50497543606144e-06, |
| "loss": 0.1038, |
| "mean_token_accuracy": 0.9633048325777054, |
| "num_tokens": 9804720.0, |
| "step": 4500 |
| }, |
| { |
| "epoch": 12.091275167785234, |
| "grad_norm": 2.7291688919067383, |
| "learning_rate": 5.487648778191021e-06, |
| "loss": 0.1018, |
| "mean_token_accuracy": 0.9650433152914047, |
| "num_tokens": 9826495.0, |
| "step": 4510 |
| }, |
| { |
| "epoch": 12.118120805369127, |
| "grad_norm": 1.1719425916671753, |
| "learning_rate": 5.470316206105971e-06, |
| "loss": 0.131, |
| "mean_token_accuracy": 0.9595217138528824, |
| "num_tokens": 9846765.0, |
| "step": 4520 |
| }, |
| { |
| "epoch": 12.14496644295302, |
| "grad_norm": 1.3220218420028687, |
| "learning_rate": 5.45297793001609e-06, |
| "loss": 0.1046, |
| "mean_token_accuracy": 0.9682448267936706, |
| "num_tokens": 9868041.0, |
| "step": 4530 |
| }, |
| { |
| "epoch": 12.171812080536913, |
| "grad_norm": 1.0875526666641235, |
| "learning_rate": 5.435634160200355e-06, |
| "loss": 0.0976, |
| "mean_token_accuracy": 0.9661436587572098, |
| "num_tokens": 9891664.0, |
| "step": 4540 |
| }, |
| { |
| "epoch": 12.198657718120806, |
| "grad_norm": 1.1350823640823364, |
| "learning_rate": 5.418285107004372e-06, |
| "loss": 0.1048, |
| "mean_token_accuracy": 0.9631434679031372, |
| "num_tokens": 9914268.0, |
| "step": 4550 |
| }, |
| { |
| "epoch": 12.225503355704697, |
| "grad_norm": 1.8183422088623047, |
| "learning_rate": 5.4009309808378185e-06, |
| "loss": 0.0967, |
| "mean_token_accuracy": 0.9672815710306167, |
| "num_tokens": 9936030.0, |
| "step": 4560 |
| }, |
| { |
| "epoch": 12.25234899328859, |
| "grad_norm": 1.1410795450210571, |
| "learning_rate": 5.383571992171904e-06, |
| "loss": 0.1385, |
| "mean_token_accuracy": 0.9560069739818573, |
| "num_tokens": 9956185.0, |
| "step": 4570 |
| }, |
| { |
| "epoch": 12.279194630872484, |
| "grad_norm": 1.5358262062072754, |
| "learning_rate": 5.366208351536809e-06, |
| "loss": 0.1037, |
| "mean_token_accuracy": 0.9659923285245895, |
| "num_tokens": 9977425.0, |
| "step": 4580 |
| }, |
| { |
| "epoch": 12.306040268456377, |
| "grad_norm": 1.0602080821990967, |
| "learning_rate": 5.34884026951913e-06, |
| "loss": 0.0997, |
| "mean_token_accuracy": 0.9650217086076737, |
| "num_tokens": 10001018.0, |
| "step": 4590 |
| }, |
| { |
| "epoch": 12.332885906040268, |
| "grad_norm": 1.0410484075546265, |
| "learning_rate": 5.331467956759331e-06, |
| "loss": 0.1144, |
| "mean_token_accuracy": 0.9604771822690964, |
| "num_tokens": 10023449.0, |
| "step": 4600 |
| }, |
| { |
| "epoch": 12.35973154362416, |
| "grad_norm": 1.281563401222229, |
| "learning_rate": 5.314091623949187e-06, |
| "loss": 0.1055, |
| "mean_token_accuracy": 0.9640044838190078, |
| "num_tokens": 10045006.0, |
| "step": 4610 |
| }, |
| { |
| "epoch": 12.386577181208054, |
| "grad_norm": 1.2597219944000244, |
| "learning_rate": 5.296711481829227e-06, |
| "loss": 0.1392, |
| "mean_token_accuracy": 0.9558876633644104, |
| "num_tokens": 10065200.0, |
| "step": 4620 |
| }, |
| { |
| "epoch": 12.413422818791947, |
| "grad_norm": 1.0641788244247437, |
| "learning_rate": 5.279327741186179e-06, |
| "loss": 0.1021, |
| "mean_token_accuracy": 0.9678240925073623, |
| "num_tokens": 10086432.0, |
| "step": 4630 |
| }, |
| { |
| "epoch": 12.440268456375838, |
| "grad_norm": 1.1310027837753296, |
| "learning_rate": 5.261940612850418e-06, |
| "loss": 0.0987, |
| "mean_token_accuracy": 0.9654311060905456, |
| "num_tokens": 10109941.0, |
| "step": 4640 |
| }, |
| { |
| "epoch": 12.467114093959731, |
| "grad_norm": 0.8842890858650208, |
| "learning_rate": 5.244550307693398e-06, |
| "loss": 0.1013, |
| "mean_token_accuracy": 0.9641594380140305, |
| "num_tokens": 10132421.0, |
| "step": 4650 |
| }, |
| { |
| "epoch": 12.493959731543624, |
| "grad_norm": 1.3634861707687378, |
| "learning_rate": 5.227157036625108e-06, |
| "loss": 0.1003, |
| "mean_token_accuracy": 0.9646014750003815, |
| "num_tokens": 10153998.0, |
| "step": 4660 |
| }, |
| { |
| "epoch": 12.520805369127517, |
| "grad_norm": 1.1904743909835815, |
| "learning_rate": 5.209761010591503e-06, |
| "loss": 0.1374, |
| "mean_token_accuracy": 0.9562786787748336, |
| "num_tokens": 10173970.0, |
| "step": 4670 |
| }, |
| { |
| "epoch": 12.54765100671141, |
| "grad_norm": 1.3061332702636719, |
| "learning_rate": 5.192362440571955e-06, |
| "loss": 0.1064, |
| "mean_token_accuracy": 0.965582725405693, |
| "num_tokens": 10195289.0, |
| "step": 4680 |
| }, |
| { |
| "epoch": 12.574496644295301, |
| "grad_norm": 1.0577377080917358, |
| "learning_rate": 5.174961537576685e-06, |
| "loss": 0.1013, |
| "mean_token_accuracy": 0.9635748207569123, |
| "num_tokens": 10218795.0, |
| "step": 4690 |
| }, |
| { |
| "epoch": 12.601342281879194, |
| "grad_norm": 0.8413182497024536, |
| "learning_rate": 5.15755851264421e-06, |
| "loss": 0.1114, |
| "mean_token_accuracy": 0.9613074272871017, |
| "num_tokens": 10241329.0, |
| "step": 4700 |
| }, |
| { |
| "epoch": 12.628187919463087, |
| "grad_norm": 3.429868221282959, |
| "learning_rate": 5.140153576838781e-06, |
| "loss": 0.1125, |
| "mean_token_accuracy": 0.9631948232650757, |
| "num_tokens": 10262842.0, |
| "step": 4710 |
| }, |
| { |
| "epoch": 12.65503355704698, |
| "grad_norm": 1.2405115365982056, |
| "learning_rate": 5.122746941247828e-06, |
| "loss": 0.1364, |
| "mean_token_accuracy": 0.958396029472351, |
| "num_tokens": 10282940.0, |
| "step": 4720 |
| }, |
| { |
| "epoch": 12.681879194630872, |
| "grad_norm": 1.0881459712982178, |
| "learning_rate": 5.105338816979393e-06, |
| "loss": 0.1116, |
| "mean_token_accuracy": 0.9615271121263504, |
| "num_tokens": 10304123.0, |
| "step": 4730 |
| }, |
| { |
| "epoch": 12.708724832214765, |
| "grad_norm": 1.1950105428695679, |
| "learning_rate": 5.087929415159571e-06, |
| "loss": 0.1009, |
| "mean_token_accuracy": 0.9633850902318954, |
| "num_tokens": 10327510.0, |
| "step": 4740 |
| }, |
| { |
| "epoch": 12.735570469798658, |
| "grad_norm": 0.8143473267555237, |
| "learning_rate": 5.070518946929954e-06, |
| "loss": 0.1052, |
| "mean_token_accuracy": 0.9625776678323745, |
| "num_tokens": 10349965.0, |
| "step": 4750 |
| }, |
| { |
| "epoch": 12.76241610738255, |
| "grad_norm": 1.3037388324737549, |
| "learning_rate": 5.053107623445067e-06, |
| "loss": 0.101, |
| "mean_token_accuracy": 0.9653885364532471, |
| "num_tokens": 10371686.0, |
| "step": 4760 |
| }, |
| { |
| "epoch": 12.789261744966442, |
| "grad_norm": 1.3862019777297974, |
| "learning_rate": 5.035695655869808e-06, |
| "loss": 0.137, |
| "mean_token_accuracy": 0.9558083891868592, |
| "num_tokens": 10392121.0, |
| "step": 4770 |
| }, |
| { |
| "epoch": 12.816107382550335, |
| "grad_norm": 1.0056508779525757, |
| "learning_rate": 5.018283255376882e-06, |
| "loss": 0.1083, |
| "mean_token_accuracy": 0.9667860418558121, |
| "num_tokens": 10413476.0, |
| "step": 4780 |
| }, |
| { |
| "epoch": 12.842953020134228, |
| "grad_norm": 1.75967538356781, |
| "learning_rate": 5.000870633144252e-06, |
| "loss": 0.0992, |
| "mean_token_accuracy": 0.965215852856636, |
| "num_tokens": 10437053.0, |
| "step": 4790 |
| }, |
| { |
| "epoch": 12.869798657718121, |
| "grad_norm": 0.9346075654029846, |
| "learning_rate": 4.983458000352565e-06, |
| "loss": 0.1126, |
| "mean_token_accuracy": 0.96172194480896, |
| "num_tokens": 10459596.0, |
| "step": 4800 |
| }, |
| { |
| "epoch": 12.896644295302014, |
| "grad_norm": 1.8567789793014526, |
| "learning_rate": 4.966045568182596e-06, |
| "loss": 0.1108, |
| "mean_token_accuracy": 0.9629136115312577, |
| "num_tokens": 10481183.0, |
| "step": 4810 |
| }, |
| { |
| "epoch": 12.923489932885905, |
| "grad_norm": 3.24837064743042, |
| "learning_rate": 4.948633547812691e-06, |
| "loss": 0.1334, |
| "mean_token_accuracy": 0.958159077167511, |
| "num_tokens": 10501315.0, |
| "step": 4820 |
| }, |
| { |
| "epoch": 12.950335570469798, |
| "grad_norm": 0.9896105527877808, |
| "learning_rate": 4.931222150416197e-06, |
| "loss": 0.0997, |
| "mean_token_accuracy": 0.9687875539064408, |
| "num_tokens": 10522428.0, |
| "step": 4830 |
| }, |
| { |
| "epoch": 12.977181208053691, |
| "grad_norm": 1.075074553489685, |
| "learning_rate": 4.913811587158908e-06, |
| "loss": 0.1053, |
| "mean_token_accuracy": 0.9634338021278381, |
| "num_tokens": 10544661.0, |
| "step": 4840 |
| }, |
| { |
| "epoch": 13.00268456375839, |
| "grad_norm": 0.8715270757675171, |
| "learning_rate": 4.896402069196502e-06, |
| "loss": 0.1193, |
| "mean_token_accuracy": 0.9614172013182389, |
| "num_tokens": 10563307.0, |
| "step": 4850 |
| }, |
| { |
| "epoch": 13.029530201342283, |
| "grad_norm": 0.9880079627037048, |
| "learning_rate": 4.878993807671976e-06, |
| "loss": 0.0906, |
| "mean_token_accuracy": 0.9682508319616318, |
| "num_tokens": 10587248.0, |
| "step": 4860 |
| }, |
| { |
| "epoch": 13.056375838926174, |
| "grad_norm": 1.9502968788146973, |
| "learning_rate": 4.861587013713096e-06, |
| "loss": 0.1007, |
| "mean_token_accuracy": 0.9648101240396499, |
| "num_tokens": 10610106.0, |
| "step": 4870 |
| }, |
| { |
| "epoch": 13.083221476510067, |
| "grad_norm": 0.9801583290100098, |
| "learning_rate": 4.8441818984298204e-06, |
| "loss": 0.0912, |
| "mean_token_accuracy": 0.9686092883348465, |
| "num_tokens": 10632149.0, |
| "step": 4880 |
| }, |
| { |
| "epoch": 13.11006711409396, |
| "grad_norm": 1.2754795551300049, |
| "learning_rate": 4.826778672911757e-06, |
| "loss": 0.1184, |
| "mean_token_accuracy": 0.9613798499107361, |
| "num_tokens": 10653072.0, |
| "step": 4890 |
| }, |
| { |
| "epoch": 13.136912751677853, |
| "grad_norm": 1.2431989908218384, |
| "learning_rate": 4.809377548225589e-06, |
| "loss": 0.1108, |
| "mean_token_accuracy": 0.9650191992521286, |
| "num_tokens": 10672975.0, |
| "step": 4900 |
| }, |
| { |
| "epoch": 13.163758389261744, |
| "grad_norm": 1.0186421871185303, |
| "learning_rate": 4.79197873541252e-06, |
| "loss": 0.0892, |
| "mean_token_accuracy": 0.9677982062101365, |
| "num_tokens": 10696744.0, |
| "step": 4910 |
| }, |
| { |
| "epoch": 13.190604026845637, |
| "grad_norm": 1.6983952522277832, |
| "learning_rate": 4.774582445485721e-06, |
| "loss": 0.0998, |
| "mean_token_accuracy": 0.9657606661319733, |
| "num_tokens": 10719417.0, |
| "step": 4920 |
| }, |
| { |
| "epoch": 13.21744966442953, |
| "grad_norm": 1.8801636695861816, |
| "learning_rate": 4.757188889427761e-06, |
| "loss": 0.0969, |
| "mean_token_accuracy": 0.9664499372243881, |
| "num_tokens": 10741268.0, |
| "step": 4930 |
| }, |
| { |
| "epoch": 13.244295302013423, |
| "grad_norm": 1.604750633239746, |
| "learning_rate": 4.73979827818805e-06, |
| "loss": 0.1261, |
| "mean_token_accuracy": 0.9594014555215835, |
| "num_tokens": 10761836.0, |
| "step": 4940 |
| }, |
| { |
| "epoch": 13.271140939597316, |
| "grad_norm": 1.1507794857025146, |
| "learning_rate": 4.7224108226802915e-06, |
| "loss": 0.1045, |
| "mean_token_accuracy": 0.9674178540706635, |
| "num_tokens": 10781662.0, |
| "step": 4950 |
| }, |
| { |
| "epoch": 13.297986577181208, |
| "grad_norm": 1.8615467548370361, |
| "learning_rate": 4.7050267337799074e-06, |
| "loss": 0.0936, |
| "mean_token_accuracy": 0.9667486816644668, |
| "num_tokens": 10805617.0, |
| "step": 4960 |
| }, |
| { |
| "epoch": 13.3248322147651, |
| "grad_norm": 1.1684675216674805, |
| "learning_rate": 4.687646222321496e-06, |
| "loss": 0.1075, |
| "mean_token_accuracy": 0.9631021231412887, |
| "num_tokens": 10828423.0, |
| "step": 4970 |
| }, |
| { |
| "epoch": 13.351677852348994, |
| "grad_norm": 0.9275521039962769, |
| "learning_rate": 4.670269499096266e-06, |
| "loss": 0.0935, |
| "mean_token_accuracy": 0.9685831665992737, |
| "num_tokens": 10850355.0, |
| "step": 4980 |
| }, |
| { |
| "epoch": 13.378523489932887, |
| "grad_norm": 1.5694960355758667, |
| "learning_rate": 4.652896774849477e-06, |
| "loss": 0.1288, |
| "mean_token_accuracy": 0.9589329659938812, |
| "num_tokens": 10870919.0, |
| "step": 4990 |
| }, |
| { |
| "epoch": 13.405369127516778, |
| "grad_norm": 1.0490679740905762, |
| "learning_rate": 4.635528260277899e-06, |
| "loss": 0.1067, |
| "mean_token_accuracy": 0.9686432272195816, |
| "num_tokens": 10890675.0, |
| "step": 5000 |
| }, |
| { |
| "epoch": 13.432214765100671, |
| "grad_norm": 1.5049335956573486, |
| "learning_rate": 4.618164166027238e-06, |
| "loss": 0.094, |
| "mean_token_accuracy": 0.9673222094774246, |
| "num_tokens": 10914713.0, |
| "step": 5010 |
| }, |
| { |
| "epoch": 13.459060402684564, |
| "grad_norm": 1.205875277519226, |
| "learning_rate": 4.600804702689598e-06, |
| "loss": 0.1022, |
| "mean_token_accuracy": 0.9632074743509292, |
| "num_tokens": 10937577.0, |
| "step": 5020 |
| }, |
| { |
| "epoch": 13.485906040268457, |
| "grad_norm": 1.0139328241348267, |
| "learning_rate": 4.583450080800912e-06, |
| "loss": 0.0957, |
| "mean_token_accuracy": 0.9681812196969986, |
| "num_tokens": 10959493.0, |
| "step": 5030 |
| }, |
| { |
| "epoch": 13.512751677852348, |
| "grad_norm": 1.8275400400161743, |
| "learning_rate": 4.5661005108384e-06, |
| "loss": 0.1238, |
| "mean_token_accuracy": 0.9604158759117126, |
| "num_tokens": 10980260.0, |
| "step": 5040 |
| }, |
| { |
| "epoch": 13.539597315436241, |
| "grad_norm": 1.1797428131103516, |
| "learning_rate": 4.54875620321801e-06, |
| "loss": 0.1142, |
| "mean_token_accuracy": 0.9651793152093887, |
| "num_tokens": 11000267.0, |
| "step": 5050 |
| }, |
| { |
| "epoch": 13.566442953020134, |
| "grad_norm": 1.460530161857605, |
| "learning_rate": 4.5314173682918704e-06, |
| "loss": 0.0945, |
| "mean_token_accuracy": 0.9683007091283798, |
| "num_tokens": 11024206.0, |
| "step": 5060 |
| }, |
| { |
| "epoch": 13.593288590604027, |
| "grad_norm": 1.042712926864624, |
| "learning_rate": 4.514084216345736e-06, |
| "loss": 0.1014, |
| "mean_token_accuracy": 0.9641075730323792, |
| "num_tokens": 11046962.0, |
| "step": 5070 |
| }, |
| { |
| "epoch": 13.620134228187919, |
| "grad_norm": 1.3945727348327637, |
| "learning_rate": 4.496756957596438e-06, |
| "loss": 0.0934, |
| "mean_token_accuracy": 0.9680504709482193, |
| "num_tokens": 11068787.0, |
| "step": 5080 |
| }, |
| { |
| "epoch": 13.646979865771812, |
| "grad_norm": 1.7909672260284424, |
| "learning_rate": 4.479435802189332e-06, |
| "loss": 0.1278, |
| "mean_token_accuracy": 0.9593035578727722, |
| "num_tokens": 11089125.0, |
| "step": 5090 |
| }, |
| { |
| "epoch": 13.673825503355705, |
| "grad_norm": 1.2251805067062378, |
| "learning_rate": 4.4621209601957585e-06, |
| "loss": 0.1054, |
| "mean_token_accuracy": 0.9684852123260498, |
| "num_tokens": 11108756.0, |
| "step": 5100 |
| }, |
| { |
| "epoch": 13.700671140939598, |
| "grad_norm": 1.1306813955307007, |
| "learning_rate": 4.444812641610482e-06, |
| "loss": 0.0941, |
| "mean_token_accuracy": 0.9665086060762406, |
| "num_tokens": 11132698.0, |
| "step": 5110 |
| }, |
| { |
| "epoch": 13.72751677852349, |
| "grad_norm": 1.2338292598724365, |
| "learning_rate": 4.427511056349157e-06, |
| "loss": 0.1035, |
| "mean_token_accuracy": 0.9634651213884353, |
| "num_tokens": 11155580.0, |
| "step": 5120 |
| }, |
| { |
| "epoch": 13.754362416107382, |
| "grad_norm": 1.1115108728408813, |
| "learning_rate": 4.410216414245771e-06, |
| "loss": 0.0954, |
| "mean_token_accuracy": 0.9664014279842377, |
| "num_tokens": 11177543.0, |
| "step": 5130 |
| }, |
| { |
| "epoch": 13.781208053691275, |
| "grad_norm": 1.7830060720443726, |
| "learning_rate": 4.392928925050106e-06, |
| "loss": 0.1326, |
| "mean_token_accuracy": 0.9572756230831146, |
| "num_tokens": 11198259.0, |
| "step": 5140 |
| }, |
| { |
| "epoch": 13.808053691275168, |
| "grad_norm": 1.142449140548706, |
| "learning_rate": 4.375648798425197e-06, |
| "loss": 0.1074, |
| "mean_token_accuracy": 0.96616330742836, |
| "num_tokens": 11218098.0, |
| "step": 5150 |
| }, |
| { |
| "epoch": 13.834899328859061, |
| "grad_norm": 1.3761417865753174, |
| "learning_rate": 4.358376243944782e-06, |
| "loss": 0.0923, |
| "mean_token_accuracy": 0.9684065580368042, |
| "num_tokens": 11242063.0, |
| "step": 5160 |
| }, |
| { |
| "epoch": 13.861744966442952, |
| "grad_norm": 0.9292601346969604, |
| "learning_rate": 4.341111471090762e-06, |
| "loss": 0.0965, |
| "mean_token_accuracy": 0.9653068006038665, |
| "num_tokens": 11264924.0, |
| "step": 5170 |
| }, |
| { |
| "epoch": 13.888590604026845, |
| "grad_norm": 0.9202330708503723, |
| "learning_rate": 4.323854689250669e-06, |
| "loss": 0.0927, |
| "mean_token_accuracy": 0.9676958501338959, |
| "num_tokens": 11286770.0, |
| "step": 5180 |
| }, |
| { |
| "epoch": 13.915436241610738, |
| "grad_norm": 1.5448764562606812, |
| "learning_rate": 4.3066061077151124e-06, |
| "loss": 0.1261, |
| "mean_token_accuracy": 0.9603673964738846, |
| "num_tokens": 11307393.0, |
| "step": 5190 |
| }, |
| { |
| "epoch": 13.942281879194631, |
| "grad_norm": 1.1237270832061768, |
| "learning_rate": 4.289365935675255e-06, |
| "loss": 0.109, |
| "mean_token_accuracy": 0.9644716501235961, |
| "num_tokens": 11327277.0, |
| "step": 5200 |
| }, |
| { |
| "epoch": 13.969127516778524, |
| "grad_norm": 0.9842494130134583, |
| "learning_rate": 4.272134382220263e-06, |
| "loss": 0.099, |
| "mean_token_accuracy": 0.9658454984426499, |
| "num_tokens": 11350369.0, |
| "step": 5210 |
| }, |
| { |
| "epoch": 13.995973154362416, |
| "grad_norm": 1.408265233039856, |
| "learning_rate": 4.254911656334778e-06, |
| "loss": 0.1152, |
| "mean_token_accuracy": 0.963257348537445, |
| "num_tokens": 11371043.0, |
| "step": 5220 |
| }, |
| { |
| "epoch": 14.021476510067114, |
| "grad_norm": 1.1198426485061646, |
| "learning_rate": 4.237697966896385e-06, |
| "loss": 0.0816, |
| "mean_token_accuracy": 0.971764558239987, |
| "num_tokens": 11392630.0, |
| "step": 5230 |
| }, |
| { |
| "epoch": 14.048322147651007, |
| "grad_norm": 1.1393156051635742, |
| "learning_rate": 4.220493522673067e-06, |
| "loss": 0.0969, |
| "mean_token_accuracy": 0.9652393728494644, |
| "num_tokens": 11415708.0, |
| "step": 5240 |
| }, |
| { |
| "epoch": 14.0751677852349, |
| "grad_norm": 1.3301637172698975, |
| "learning_rate": 4.20329853232069e-06, |
| "loss": 0.095, |
| "mean_token_accuracy": 0.9665391176939011, |
| "num_tokens": 11437808.0, |
| "step": 5250 |
| }, |
| { |
| "epoch": 14.102013422818793, |
| "grad_norm": 2.872422218322754, |
| "learning_rate": 4.1861132043804555e-06, |
| "loss": 0.1017, |
| "mean_token_accuracy": 0.9674932867288589, |
| "num_tokens": 11458991.0, |
| "step": 5260 |
| }, |
| { |
| "epoch": 14.128859060402684, |
| "grad_norm": 1.053300142288208, |
| "learning_rate": 4.168937747276381e-06, |
| "loss": 0.1138, |
| "mean_token_accuracy": 0.965873995423317, |
| "num_tokens": 11478611.0, |
| "step": 5270 |
| }, |
| { |
| "epoch": 14.155704697986577, |
| "grad_norm": 1.5345606803894043, |
| "learning_rate": 4.151772369312772e-06, |
| "loss": 0.0877, |
| "mean_token_accuracy": 0.9710632592439652, |
| "num_tokens": 11501870.0, |
| "step": 5280 |
| }, |
| { |
| "epoch": 14.18255033557047, |
| "grad_norm": 1.1510871648788452, |
| "learning_rate": 4.134617278671694e-06, |
| "loss": 0.0991, |
| "mean_token_accuracy": 0.9647044003009796, |
| "num_tokens": 11525013.0, |
| "step": 5290 |
| }, |
| { |
| "epoch": 14.209395973154363, |
| "grad_norm": 1.239437460899353, |
| "learning_rate": 4.117472683410446e-06, |
| "loss": 0.0911, |
| "mean_token_accuracy": 0.9685541450977325, |
| "num_tokens": 11547218.0, |
| "step": 5300 |
| }, |
| { |
| "epoch": 14.236241610738254, |
| "grad_norm": 2.640573263168335, |
| "learning_rate": 4.100338791459042e-06, |
| "loss": 0.1, |
| "mean_token_accuracy": 0.9668088883161545, |
| "num_tokens": 11568484.0, |
| "step": 5310 |
| }, |
| { |
| "epoch": 14.263087248322147, |
| "grad_norm": 1.1691937446594238, |
| "learning_rate": 4.083215810617678e-06, |
| "loss": 0.1192, |
| "mean_token_accuracy": 0.9638256758451462, |
| "num_tokens": 11588180.0, |
| "step": 5320 |
| }, |
| { |
| "epoch": 14.28993288590604, |
| "grad_norm": 1.7287318706512451, |
| "learning_rate": 4.0661039485542326e-06, |
| "loss": 0.0895, |
| "mean_token_accuracy": 0.9701910257339478, |
| "num_tokens": 11611340.0, |
| "step": 5330 |
| }, |
| { |
| "epoch": 14.316778523489933, |
| "grad_norm": 1.492462396621704, |
| "learning_rate": 4.049003412801724e-06, |
| "loss": 0.0927, |
| "mean_token_accuracy": 0.9668333351612091, |
| "num_tokens": 11634481.0, |
| "step": 5340 |
| }, |
| { |
| "epoch": 14.343624161073825, |
| "grad_norm": 1.4218974113464355, |
| "learning_rate": 4.031914410755809e-06, |
| "loss": 0.0927, |
| "mean_token_accuracy": 0.9681075811386108, |
| "num_tokens": 11656604.0, |
| "step": 5350 |
| }, |
| { |
| "epoch": 14.370469798657718, |
| "grad_norm": 2.2234318256378174, |
| "learning_rate": 4.014837149672266e-06, |
| "loss": 0.1087, |
| "mean_token_accuracy": 0.9646641999483109, |
| "num_tokens": 11677627.0, |
| "step": 5360 |
| }, |
| { |
| "epoch": 14.39731543624161, |
| "grad_norm": 1.1645985841751099, |
| "learning_rate": 3.997771836664473e-06, |
| "loss": 0.1092, |
| "mean_token_accuracy": 0.9664602816104889, |
| "num_tokens": 11697187.0, |
| "step": 5370 |
| }, |
| { |
| "epoch": 14.424161073825504, |
| "grad_norm": 1.700769305229187, |
| "learning_rate": 3.980718678700909e-06, |
| "loss": 0.0866, |
| "mean_token_accuracy": 0.9712590277194977, |
| "num_tokens": 11720310.0, |
| "step": 5380 |
| }, |
| { |
| "epoch": 14.451006711409397, |
| "grad_norm": 1.1342298984527588, |
| "learning_rate": 3.96367788260263e-06, |
| "loss": 0.094, |
| "mean_token_accuracy": 0.9667182564735413, |
| "num_tokens": 11743414.0, |
| "step": 5390 |
| }, |
| { |
| "epoch": 14.477852348993288, |
| "grad_norm": 1.848952054977417, |
| "learning_rate": 3.9466496550407675e-06, |
| "loss": 0.0927, |
| "mean_token_accuracy": 0.9685674637556076, |
| "num_tokens": 11765609.0, |
| "step": 5400 |
| }, |
| { |
| "epoch": 14.504697986577181, |
| "grad_norm": 1.5314711332321167, |
| "learning_rate": 3.929634202534026e-06, |
| "loss": 0.1052, |
| "mean_token_accuracy": 0.9641919821500778, |
| "num_tokens": 11786687.0, |
| "step": 5410 |
| }, |
| { |
| "epoch": 14.531543624161074, |
| "grad_norm": 1.143330693244934, |
| "learning_rate": 3.912631731446168e-06, |
| "loss": 0.1086, |
| "mean_token_accuracy": 0.9661551773548126, |
| "num_tokens": 11806174.0, |
| "step": 5420 |
| }, |
| { |
| "epoch": 14.558389261744967, |
| "grad_norm": 1.3766965866088867, |
| "learning_rate": 3.895642447983525e-06, |
| "loss": 0.0949, |
| "mean_token_accuracy": 0.9679251462221146, |
| "num_tokens": 11829254.0, |
| "step": 5430 |
| }, |
| { |
| "epoch": 14.585234899328858, |
| "grad_norm": 1.2601746320724487, |
| "learning_rate": 3.8786665581924805e-06, |
| "loss": 0.0968, |
| "mean_token_accuracy": 0.965695607662201, |
| "num_tokens": 11852275.0, |
| "step": 5440 |
| }, |
| { |
| "epoch": 14.612080536912751, |
| "grad_norm": 1.1301180124282837, |
| "learning_rate": 3.8617042679569805e-06, |
| "loss": 0.0908, |
| "mean_token_accuracy": 0.9697641730308533, |
| "num_tokens": 11874374.0, |
| "step": 5450 |
| }, |
| { |
| "epoch": 14.638926174496644, |
| "grad_norm": 2.486135959625244, |
| "learning_rate": 3.844755782996043e-06, |
| "loss": 0.1104, |
| "mean_token_accuracy": 0.9635231077671051, |
| "num_tokens": 11895508.0, |
| "step": 5460 |
| }, |
| { |
| "epoch": 14.665771812080537, |
| "grad_norm": 1.2749981880187988, |
| "learning_rate": 3.827821308861244e-06, |
| "loss": 0.1214, |
| "mean_token_accuracy": 0.9636542230844498, |
| "num_tokens": 11915109.0, |
| "step": 5470 |
| }, |
| { |
| "epoch": 14.692617449664429, |
| "grad_norm": 1.4531991481781006, |
| "learning_rate": 3.810901050934247e-06, |
| "loss": 0.0841, |
| "mean_token_accuracy": 0.9712864845991135, |
| "num_tokens": 11938194.0, |
| "step": 5480 |
| }, |
| { |
| "epoch": 14.719463087248322, |
| "grad_norm": 1.8714344501495361, |
| "learning_rate": 3.793995214424292e-06, |
| "loss": 0.0918, |
| "mean_token_accuracy": 0.9681427836418152, |
| "num_tokens": 11961341.0, |
| "step": 5490 |
| }, |
| { |
| "epoch": 14.746308724832215, |
| "grad_norm": 1.3651467561721802, |
| "learning_rate": 3.777104004365721e-06, |
| "loss": 0.0932, |
| "mean_token_accuracy": 0.9683119237422944, |
| "num_tokens": 11983507.0, |
| "step": 5500 |
| }, |
| { |
| "epoch": 14.773154362416108, |
| "grad_norm": 1.960383415222168, |
| "learning_rate": 3.7602276256154852e-06, |
| "loss": 0.1098, |
| "mean_token_accuracy": 0.9644652128219604, |
| "num_tokens": 12004614.0, |
| "step": 5510 |
| }, |
| { |
| "epoch": 14.8, |
| "grad_norm": 1.2198858261108398, |
| "learning_rate": 3.7433662828506577e-06, |
| "loss": 0.1114, |
| "mean_token_accuracy": 0.965359115600586, |
| "num_tokens": 12024189.0, |
| "step": 5520 |
| }, |
| { |
| "epoch": 14.826845637583892, |
| "grad_norm": 1.7983379364013672, |
| "learning_rate": 3.7265201805659618e-06, |
| "loss": 0.0863, |
| "mean_token_accuracy": 0.9709195643663406, |
| "num_tokens": 12047241.0, |
| "step": 5530 |
| }, |
| { |
| "epoch": 14.853691275167785, |
| "grad_norm": 1.500483751296997, |
| "learning_rate": 3.709689523071277e-06, |
| "loss": 0.0965, |
| "mean_token_accuracy": 0.965771397948265, |
| "num_tokens": 12070280.0, |
| "step": 5540 |
| }, |
| { |
| "epoch": 14.880536912751678, |
| "grad_norm": 1.2716580629348755, |
| "learning_rate": 3.6928745144891733e-06, |
| "loss": 0.0954, |
| "mean_token_accuracy": 0.9673113852739335, |
| "num_tokens": 12092414.0, |
| "step": 5550 |
| }, |
| { |
| "epoch": 14.907382550335571, |
| "grad_norm": 1.794506549835205, |
| "learning_rate": 3.676075358752426e-06, |
| "loss": 0.1128, |
| "mean_token_accuracy": 0.9624302506446838, |
| "num_tokens": 12113463.0, |
| "step": 5560 |
| }, |
| { |
| "epoch": 14.934228187919462, |
| "grad_norm": 1.2952619791030884, |
| "learning_rate": 3.6592922596015516e-06, |
| "loss": 0.1053, |
| "mean_token_accuracy": 0.9676823377609253, |
| "num_tokens": 12132960.0, |
| "step": 5570 |
| }, |
| { |
| "epoch": 14.961073825503355, |
| "grad_norm": 1.7229140996932983, |
| "learning_rate": 3.6425254205823256e-06, |
| "loss": 0.0879, |
| "mean_token_accuracy": 0.9696457594633102, |
| "num_tokens": 12155835.0, |
| "step": 5580 |
| }, |
| { |
| "epoch": 14.987919463087248, |
| "grad_norm": 5.81431245803833, |
| "learning_rate": 3.6257750450433284e-06, |
| "loss": 0.1062, |
| "mean_token_accuracy": 0.9640632271766663, |
| "num_tokens": 12177517.0, |
| "step": 5590 |
| }, |
| { |
| "epoch": 15.013422818791947, |
| "grad_norm": 1.7150989770889282, |
| "learning_rate": 3.609041336133462e-06, |
| "loss": 0.0895, |
| "mean_token_accuracy": 0.9702811994050679, |
| "num_tokens": 12197826.0, |
| "step": 5600 |
| }, |
| { |
| "epoch": 15.04026845637584, |
| "grad_norm": 1.458148717880249, |
| "learning_rate": 3.5923244967995045e-06, |
| "loss": 0.0819, |
| "mean_token_accuracy": 0.9716112166643143, |
| "num_tokens": 12221328.0, |
| "step": 5610 |
| }, |
| { |
| "epoch": 15.06711409395973, |
| "grad_norm": 1.431036114692688, |
| "learning_rate": 3.575624729783632e-06, |
| "loss": 0.0909, |
| "mean_token_accuracy": 0.9683985263109207, |
| "num_tokens": 12243817.0, |
| "step": 5620 |
| }, |
| { |
| "epoch": 15.093959731543624, |
| "grad_norm": 1.949104905128479, |
| "learning_rate": 3.558942237620968e-06, |
| "loss": 0.0826, |
| "mean_token_accuracy": 0.9721886366605759, |
| "num_tokens": 12265391.0, |
| "step": 5630 |
| }, |
| { |
| "epoch": 15.120805369127517, |
| "grad_norm": 1.5431227684020996, |
| "learning_rate": 3.5422772226371315e-06, |
| "loss": 0.1096, |
| "mean_token_accuracy": 0.9656341940164566, |
| "num_tokens": 12285431.0, |
| "step": 5640 |
| }, |
| { |
| "epoch": 15.14765100671141, |
| "grad_norm": 1.9090901613235474, |
| "learning_rate": 3.5256298869457715e-06, |
| "loss": 0.0866, |
| "mean_token_accuracy": 0.9727503031492233, |
| "num_tokens": 12307130.0, |
| "step": 5650 |
| }, |
| { |
| "epoch": 15.174496644295303, |
| "grad_norm": 1.5246036052703857, |
| "learning_rate": 3.509000432446128e-06, |
| "loss": 0.087, |
| "mean_token_accuracy": 0.9687469393014908, |
| "num_tokens": 12330587.0, |
| "step": 5660 |
| }, |
| { |
| "epoch": 15.201342281879194, |
| "grad_norm": 1.1196630001068115, |
| "learning_rate": 3.492389060820574e-06, |
| "loss": 0.0874, |
| "mean_token_accuracy": 0.9701620578765869, |
| "num_tokens": 12352973.0, |
| "step": 5670 |
| }, |
| { |
| "epoch": 15.228187919463087, |
| "grad_norm": 2.1184024810791016, |
| "learning_rate": 3.47579597353217e-06, |
| "loss": 0.0897, |
| "mean_token_accuracy": 0.9701414495706558, |
| "num_tokens": 12374519.0, |
| "step": 5680 |
| }, |
| { |
| "epoch": 15.25503355704698, |
| "grad_norm": 1.739675760269165, |
| "learning_rate": 3.4592213718222335e-06, |
| "loss": 0.1136, |
| "mean_token_accuracy": 0.9656203925609589, |
| "num_tokens": 12394483.0, |
| "step": 5690 |
| }, |
| { |
| "epoch": 15.281879194630873, |
| "grad_norm": 1.7435755729675293, |
| "learning_rate": 3.4426654567078753e-06, |
| "loss": 0.0856, |
| "mean_token_accuracy": 0.9721616327762603, |
| "num_tokens": 12416218.0, |
| "step": 5700 |
| }, |
| { |
| "epoch": 15.308724832214764, |
| "grad_norm": 1.2572535276412964, |
| "learning_rate": 3.426128428979589e-06, |
| "loss": 0.0898, |
| "mean_token_accuracy": 0.9683011502027512, |
| "num_tokens": 12439695.0, |
| "step": 5710 |
| }, |
| { |
| "epoch": 15.335570469798657, |
| "grad_norm": 1.2672688961029053, |
| "learning_rate": 3.4096104891987903e-06, |
| "loss": 0.0936, |
| "mean_token_accuracy": 0.9671826392412186, |
| "num_tokens": 12462245.0, |
| "step": 5720 |
| }, |
| { |
| "epoch": 15.36241610738255, |
| "grad_norm": 1.6163733005523682, |
| "learning_rate": 3.3931118376953986e-06, |
| "loss": 0.0903, |
| "mean_token_accuracy": 0.9696721345186233, |
| "num_tokens": 12483912.0, |
| "step": 5730 |
| }, |
| { |
| "epoch": 15.389261744966444, |
| "grad_norm": 1.7301290035247803, |
| "learning_rate": 3.376632674565411e-06, |
| "loss": 0.1248, |
| "mean_token_accuracy": 0.9635996103286744, |
| "num_tokens": 12504064.0, |
| "step": 5740 |
| }, |
| { |
| "epoch": 15.416107382550335, |
| "grad_norm": 1.6507128477096558, |
| "learning_rate": 3.3601731996684584e-06, |
| "loss": 0.0857, |
| "mean_token_accuracy": 0.97254838347435, |
| "num_tokens": 12525725.0, |
| "step": 5750 |
| }, |
| { |
| "epoch": 15.442953020134228, |
| "grad_norm": 2.455329656600952, |
| "learning_rate": 3.343733612625404e-06, |
| "loss": 0.0848, |
| "mean_token_accuracy": 0.9704270303249359, |
| "num_tokens": 12549124.0, |
| "step": 5760 |
| }, |
| { |
| "epoch": 15.46979865771812, |
| "grad_norm": 2.2060821056365967, |
| "learning_rate": 3.3273141128159005e-06, |
| "loss": 0.0907, |
| "mean_token_accuracy": 0.9694718390703201, |
| "num_tokens": 12571502.0, |
| "step": 5770 |
| }, |
| { |
| "epoch": 15.496644295302014, |
| "grad_norm": 1.91788911819458, |
| "learning_rate": 3.310914899375989e-06, |
| "loss": 0.089, |
| "mean_token_accuracy": 0.9696966052055359, |
| "num_tokens": 12592999.0, |
| "step": 5780 |
| }, |
| { |
| "epoch": 15.523489932885907, |
| "grad_norm": 1.589574933052063, |
| "learning_rate": 3.294536171195673e-06, |
| "loss": 0.1121, |
| "mean_token_accuracy": 0.9663246095180511, |
| "num_tokens": 12613099.0, |
| "step": 5790 |
| }, |
| { |
| "epoch": 15.550335570469798, |
| "grad_norm": 1.5774067640304565, |
| "learning_rate": 3.278178126916515e-06, |
| "loss": 0.0853, |
| "mean_token_accuracy": 0.9742209672927856, |
| "num_tokens": 12634907.0, |
| "step": 5800 |
| }, |
| { |
| "epoch": 15.577181208053691, |
| "grad_norm": 1.7764393091201782, |
| "learning_rate": 3.2618409649292153e-06, |
| "loss": 0.0894, |
| "mean_token_accuracy": 0.9685896605253219, |
| "num_tokens": 12658442.0, |
| "step": 5810 |
| }, |
| { |
| "epoch": 15.604026845637584, |
| "grad_norm": 1.441229224205017, |
| "learning_rate": 3.2455248833712226e-06, |
| "loss": 0.0875, |
| "mean_token_accuracy": 0.9684127897024155, |
| "num_tokens": 12680988.0, |
| "step": 5820 |
| }, |
| { |
| "epoch": 15.630872483221477, |
| "grad_norm": 1.5752300024032593, |
| "learning_rate": 3.2292300801243133e-06, |
| "loss": 0.091, |
| "mean_token_accuracy": 0.969354122877121, |
| "num_tokens": 12702567.0, |
| "step": 5830 |
| }, |
| { |
| "epoch": 15.657718120805368, |
| "grad_norm": 1.5445083379745483, |
| "learning_rate": 3.212956752812206e-06, |
| "loss": 0.1182, |
| "mean_token_accuracy": 0.9640820533037185, |
| "num_tokens": 12722654.0, |
| "step": 5840 |
| }, |
| { |
| "epoch": 15.684563758389261, |
| "grad_norm": 1.6099363565444946, |
| "learning_rate": 3.196705098798156e-06, |
| "loss": 0.0893, |
| "mean_token_accuracy": 0.9708084911108017, |
| "num_tokens": 12744530.0, |
| "step": 5850 |
| }, |
| { |
| "epoch": 15.711409395973154, |
| "grad_norm": 1.6343265771865845, |
| "learning_rate": 3.180475315182563e-06, |
| "loss": 0.086, |
| "mean_token_accuracy": 0.9693089485168457, |
| "num_tokens": 12767788.0, |
| "step": 5860 |
| }, |
| { |
| "epoch": 15.738255033557047, |
| "grad_norm": 1.7334041595458984, |
| "learning_rate": 3.1642675988005854e-06, |
| "loss": 0.0929, |
| "mean_token_accuracy": 0.9685852974653244, |
| "num_tokens": 12790039.0, |
| "step": 5870 |
| }, |
| { |
| "epoch": 15.765100671140939, |
| "grad_norm": 1.8182802200317383, |
| "learning_rate": 3.1480821462197464e-06, |
| "loss": 0.0944, |
| "mean_token_accuracy": 0.9677254110574722, |
| "num_tokens": 12811427.0, |
| "step": 5880 |
| }, |
| { |
| "epoch": 15.791946308724832, |
| "grad_norm": 2.699237585067749, |
| "learning_rate": 3.1319191537375577e-06, |
| "loss": 0.1086, |
| "mean_token_accuracy": 0.9659045994281769, |
| "num_tokens": 12831229.0, |
| "step": 5890 |
| }, |
| { |
| "epoch": 15.818791946308725, |
| "grad_norm": 1.9692622423171997, |
| "learning_rate": 3.1157788173791303e-06, |
| "loss": 0.0908, |
| "mean_token_accuracy": 0.9693997651338577, |
| "num_tokens": 12852938.0, |
| "step": 5900 |
| }, |
| { |
| "epoch": 15.845637583892618, |
| "grad_norm": 1.4347225427627563, |
| "learning_rate": 3.0996613328948006e-06, |
| "loss": 0.0835, |
| "mean_token_accuracy": 0.9709456175565719, |
| "num_tokens": 12876425.0, |
| "step": 5910 |
| }, |
| { |
| "epoch": 15.87248322147651, |
| "grad_norm": 1.4364287853240967, |
| "learning_rate": 3.0835668957577636e-06, |
| "loss": 0.0968, |
| "mean_token_accuracy": 0.9675709336996079, |
| "num_tokens": 12898803.0, |
| "step": 5920 |
| }, |
| { |
| "epoch": 15.899328859060402, |
| "grad_norm": 1.7437158823013306, |
| "learning_rate": 3.067495701161686e-06, |
| "loss": 0.0937, |
| "mean_token_accuracy": 0.9686466962099075, |
| "num_tokens": 12920276.0, |
| "step": 5930 |
| }, |
| { |
| "epoch": 15.926174496644295, |
| "grad_norm": 1.5922681093215942, |
| "learning_rate": 3.051447944018359e-06, |
| "loss": 0.1165, |
| "mean_token_accuracy": 0.9641006350517273, |
| "num_tokens": 12940184.0, |
| "step": 5940 |
| }, |
| { |
| "epoch": 15.953020134228188, |
| "grad_norm": 1.3997697830200195, |
| "learning_rate": 3.035423818955316e-06, |
| "loss": 0.0841, |
| "mean_token_accuracy": 0.9725989639759064, |
| "num_tokens": 12961557.0, |
| "step": 5950 |
| }, |
| { |
| "epoch": 15.979865771812081, |
| "grad_norm": 1.8167921304702759, |
| "learning_rate": 3.01942352031348e-06, |
| "loss": 0.0958, |
| "mean_token_accuracy": 0.9666165590286255, |
| "num_tokens": 12983780.0, |
| "step": 5960 |
| }, |
| { |
| "epoch": 16.00536912751678, |
| "grad_norm": 1.1939505338668823, |
| "learning_rate": 3.0034472421448134e-06, |
| "loss": 0.0984, |
| "mean_token_accuracy": 0.9686258435249329, |
| "num_tokens": 13002890.0, |
| "step": 5970 |
| }, |
| { |
| "epoch": 16.032214765100672, |
| "grad_norm": 1.3747490644454956, |
| "learning_rate": 2.987495178209951e-06, |
| "loss": 0.0782, |
| "mean_token_accuracy": 0.9740465998649597, |
| "num_tokens": 13026596.0, |
| "step": 5980 |
| }, |
| { |
| "epoch": 16.059060402684565, |
| "grad_norm": 1.6247341632843018, |
| "learning_rate": 2.9715675219758598e-06, |
| "loss": 0.0878, |
| "mean_token_accuracy": 0.9689295053482055, |
| "num_tokens": 13049138.0, |
| "step": 5990 |
| }, |
| { |
| "epoch": 16.085906040268455, |
| "grad_norm": 1.530810832977295, |
| "learning_rate": 2.9556644666134903e-06, |
| "loss": 0.072, |
| "mean_token_accuracy": 0.9749242842197419, |
| "num_tokens": 13070906.0, |
| "step": 6000 |
| }, |
| { |
| "epoch": 16.112751677852348, |
| "grad_norm": 1.8794249296188354, |
| "learning_rate": 2.9397862049954307e-06, |
| "loss": 0.1095, |
| "mean_token_accuracy": 0.9656890034675598, |
| "num_tokens": 13091256.0, |
| "step": 6010 |
| }, |
| { |
| "epoch": 16.13959731543624, |
| "grad_norm": 3.501293659210205, |
| "learning_rate": 2.9239329296935726e-06, |
| "loss": 0.0853, |
| "mean_token_accuracy": 0.973246031999588, |
| "num_tokens": 13111492.0, |
| "step": 6020 |
| }, |
| { |
| "epoch": 16.166442953020134, |
| "grad_norm": 1.5681910514831543, |
| "learning_rate": 2.908104832976773e-06, |
| "loss": 0.0822, |
| "mean_token_accuracy": 0.9714659541845322, |
| "num_tokens": 13135289.0, |
| "step": 6030 |
| }, |
| { |
| "epoch": 16.193288590604027, |
| "grad_norm": 1.32642662525177, |
| "learning_rate": 2.892302106808519e-06, |
| "loss": 0.0867, |
| "mean_token_accuracy": 0.9693911045789718, |
| "num_tokens": 13158186.0, |
| "step": 6040 |
| }, |
| { |
| "epoch": 16.22013422818792, |
| "grad_norm": 1.3679111003875732, |
| "learning_rate": 2.8765249428446074e-06, |
| "loss": 0.0841, |
| "mean_token_accuracy": 0.9712060689926147, |
| "num_tokens": 13180185.0, |
| "step": 6050 |
| }, |
| { |
| "epoch": 16.246979865771813, |
| "grad_norm": 1.9701911211013794, |
| "learning_rate": 2.860773532430814e-06, |
| "loss": 0.1007, |
| "mean_token_accuracy": 0.9682783782482147, |
| "num_tokens": 13200699.0, |
| "step": 6060 |
| }, |
| { |
| "epoch": 16.273825503355706, |
| "grad_norm": 1.7542141675949097, |
| "learning_rate": 2.8450480666005743e-06, |
| "loss": 0.0865, |
| "mean_token_accuracy": 0.9719853222370147, |
| "num_tokens": 13220970.0, |
| "step": 6070 |
| }, |
| { |
| "epoch": 16.3006711409396, |
| "grad_norm": 1.3732870817184448, |
| "learning_rate": 2.8293487360726703e-06, |
| "loss": 0.0778, |
| "mean_token_accuracy": 0.9729690462350845, |
| "num_tokens": 13244725.0, |
| "step": 6080 |
| }, |
| { |
| "epoch": 16.32751677852349, |
| "grad_norm": 1.513179898262024, |
| "learning_rate": 2.8136757312489104e-06, |
| "loss": 0.0874, |
| "mean_token_accuracy": 0.9692198872566223, |
| "num_tokens": 13267350.0, |
| "step": 6090 |
| }, |
| { |
| "epoch": 16.35436241610738, |
| "grad_norm": 1.625817060470581, |
| "learning_rate": 2.7980292422118282e-06, |
| "loss": 0.0788, |
| "mean_token_accuracy": 0.9728165179491043, |
| "num_tokens": 13289168.0, |
| "step": 6100 |
| }, |
| { |
| "epoch": 16.381208053691275, |
| "grad_norm": 2.6137592792510986, |
| "learning_rate": 2.782409458722371e-06, |
| "loss": 0.1082, |
| "mean_token_accuracy": 0.9670576930046082, |
| "num_tokens": 13309717.0, |
| "step": 6110 |
| }, |
| { |
| "epoch": 16.408053691275168, |
| "grad_norm": 1.512478232383728, |
| "learning_rate": 2.7668165702176007e-06, |
| "loss": 0.0881, |
| "mean_token_accuracy": 0.9715318799018859, |
| "num_tokens": 13330146.0, |
| "step": 6120 |
| }, |
| { |
| "epoch": 16.43489932885906, |
| "grad_norm": 1.7888556718826294, |
| "learning_rate": 2.7512507658083996e-06, |
| "loss": 0.0808, |
| "mean_token_accuracy": 0.9712006777524949, |
| "num_tokens": 13354002.0, |
| "step": 6130 |
| }, |
| { |
| "epoch": 16.461744966442954, |
| "grad_norm": 1.4530067443847656, |
| "learning_rate": 2.735712234277165e-06, |
| "loss": 0.082, |
| "mean_token_accuracy": 0.9709073066711426, |
| "num_tokens": 13376841.0, |
| "step": 6140 |
| }, |
| { |
| "epoch": 16.488590604026847, |
| "grad_norm": 1.5421147346496582, |
| "learning_rate": 2.72020116407554e-06, |
| "loss": 0.0817, |
| "mean_token_accuracy": 0.9726418137550354, |
| "num_tokens": 13398706.0, |
| "step": 6150 |
| }, |
| { |
| "epoch": 16.51543624161074, |
| "grad_norm": 2.2229270935058594, |
| "learning_rate": 2.704717743322104e-06, |
| "loss": 0.1123, |
| "mean_token_accuracy": 0.9643840521574021, |
| "num_tokens": 13419292.0, |
| "step": 6160 |
| }, |
| { |
| "epoch": 16.542281879194633, |
| "grad_norm": 1.7774611711502075, |
| "learning_rate": 2.6892621598001157e-06, |
| "loss": 0.0896, |
| "mean_token_accuracy": 0.9724551647901535, |
| "num_tokens": 13439730.0, |
| "step": 6170 |
| }, |
| { |
| "epoch": 16.569127516778522, |
| "grad_norm": 1.6754381656646729, |
| "learning_rate": 2.673834600955212e-06, |
| "loss": 0.0851, |
| "mean_token_accuracy": 0.971163061261177, |
| "num_tokens": 13463427.0, |
| "step": 6180 |
| }, |
| { |
| "epoch": 16.595973154362415, |
| "grad_norm": 1.3491127490997314, |
| "learning_rate": 2.6584352538931523e-06, |
| "loss": 0.0913, |
| "mean_token_accuracy": 0.9694003283977508, |
| "num_tokens": 13486109.0, |
| "step": 6190 |
| }, |
| { |
| "epoch": 16.622818791946308, |
| "grad_norm": 1.4250203371047974, |
| "learning_rate": 2.643064305377542e-06, |
| "loss": 0.084, |
| "mean_token_accuracy": 0.9725743800401687, |
| "num_tokens": 13507988.0, |
| "step": 6200 |
| }, |
| { |
| "epoch": 16.6496644295302, |
| "grad_norm": 5.250053882598877, |
| "learning_rate": 2.627721941827568e-06, |
| "loss": 0.1066, |
| "mean_token_accuracy": 0.9665826559066772, |
| "num_tokens": 13528436.0, |
| "step": 6210 |
| }, |
| { |
| "epoch": 16.676510067114094, |
| "grad_norm": 2.1783525943756104, |
| "learning_rate": 2.612408349315734e-06, |
| "loss": 0.0907, |
| "mean_token_accuracy": 0.9724477410316468, |
| "num_tokens": 13548694.0, |
| "step": 6220 |
| }, |
| { |
| "epoch": 16.703355704697987, |
| "grad_norm": 1.5104924440383911, |
| "learning_rate": 2.597123713565618e-06, |
| "loss": 0.0786, |
| "mean_token_accuracy": 0.9716423004865646, |
| "num_tokens": 13572456.0, |
| "step": 6230 |
| }, |
| { |
| "epoch": 16.73020134228188, |
| "grad_norm": 1.8810374736785889, |
| "learning_rate": 2.581868219949597e-06, |
| "loss": 0.0873, |
| "mean_token_accuracy": 0.9687312304973602, |
| "num_tokens": 13595077.0, |
| "step": 6240 |
| }, |
| { |
| "epoch": 16.757046979865773, |
| "grad_norm": 3.1727447509765625, |
| "learning_rate": 2.5666420534866256e-06, |
| "loss": 0.0875, |
| "mean_token_accuracy": 0.9711047947406769, |
| "num_tokens": 13616808.0, |
| "step": 6250 |
| }, |
| { |
| "epoch": 16.783892617449663, |
| "grad_norm": 2.609102725982666, |
| "learning_rate": 2.551445398839964e-06, |
| "loss": 0.1102, |
| "mean_token_accuracy": 0.9661127954721451, |
| "num_tokens": 13637241.0, |
| "step": 6260 |
| }, |
| { |
| "epoch": 16.810738255033556, |
| "grad_norm": 1.5663530826568604, |
| "learning_rate": 2.536278440314962e-06, |
| "loss": 0.0907, |
| "mean_token_accuracy": 0.9729632198810577, |
| "num_tokens": 13657546.0, |
| "step": 6270 |
| }, |
| { |
| "epoch": 16.83758389261745, |
| "grad_norm": 3.344959020614624, |
| "learning_rate": 2.5211413618568114e-06, |
| "loss": 0.0807, |
| "mean_token_accuracy": 0.9719722241163253, |
| "num_tokens": 13681423.0, |
| "step": 6280 |
| }, |
| { |
| "epoch": 16.864429530201342, |
| "grad_norm": 1.5950130224227905, |
| "learning_rate": 2.5060343470483173e-06, |
| "loss": 0.0918, |
| "mean_token_accuracy": 0.9679073393344879, |
| "num_tokens": 13704189.0, |
| "step": 6290 |
| }, |
| { |
| "epoch": 16.891275167785235, |
| "grad_norm": 1.7153663635253906, |
| "learning_rate": 2.490957579107673e-06, |
| "loss": 0.078, |
| "mean_token_accuracy": 0.9733668386936187, |
| "num_tokens": 13726058.0, |
| "step": 6300 |
| }, |
| { |
| "epoch": 16.918120805369128, |
| "grad_norm": 4.971838474273682, |
| "learning_rate": 2.4759112408862366e-06, |
| "loss": 0.1162, |
| "mean_token_accuracy": 0.9651578456163407, |
| "num_tokens": 13746683.0, |
| "step": 6310 |
| }, |
| { |
| "epoch": 16.94496644295302, |
| "grad_norm": 1.5700833797454834, |
| "learning_rate": 2.460895514866315e-06, |
| "loss": 0.0913, |
| "mean_token_accuracy": 0.9729659497737885, |
| "num_tokens": 13766985.0, |
| "step": 6320 |
| }, |
| { |
| "epoch": 16.971812080536914, |
| "grad_norm": 4.7659382820129395, |
| "learning_rate": 2.445910583158948e-06, |
| "loss": 0.0814, |
| "mean_token_accuracy": 0.9713144838809967, |
| "num_tokens": 13789886.0, |
| "step": 6330 |
| }, |
| { |
| "epoch": 16.998657718120807, |
| "grad_norm": 1.5581632852554321, |
| "learning_rate": 2.4309566275017027e-06, |
| "loss": 0.0994, |
| "mean_token_accuracy": 0.9688288152217865, |
| "num_tokens": 13810043.0, |
| "step": 6340 |
| }, |
| { |
| "epoch": 17.024161073825503, |
| "grad_norm": 1.6904538869857788, |
| "learning_rate": 2.4160338292564685e-06, |
| "loss": 0.0794, |
| "mean_token_accuracy": 0.9736049488971108, |
| "num_tokens": 13832080.0, |
| "step": 6350 |
| }, |
| { |
| "epoch": 17.051006711409396, |
| "grad_norm": 1.6213840246200562, |
| "learning_rate": 2.401142369407256e-06, |
| "loss": 0.0797, |
| "mean_token_accuracy": 0.9715674847364426, |
| "num_tokens": 13854971.0, |
| "step": 6360 |
| }, |
| { |
| "epoch": 17.07785234899329, |
| "grad_norm": 1.3479030132293701, |
| "learning_rate": 2.386282428558001e-06, |
| "loss": 0.0758, |
| "mean_token_accuracy": 0.9742376655340195, |
| "num_tokens": 13876881.0, |
| "step": 6370 |
| }, |
| { |
| "epoch": 17.104697986577182, |
| "grad_norm": 2.28167724609375, |
| "learning_rate": 2.37145418693038e-06, |
| "loss": 0.1018, |
| "mean_token_accuracy": 0.9682648777961731, |
| "num_tokens": 13897674.0, |
| "step": 6380 |
| }, |
| { |
| "epoch": 17.131543624161075, |
| "grad_norm": 1.3986122608184814, |
| "learning_rate": 2.3566578243616184e-06, |
| "loss": 0.0943, |
| "mean_token_accuracy": 0.9730989217758179, |
| "num_tokens": 13917109.0, |
| "step": 6390 |
| }, |
| { |
| "epoch": 17.158389261744965, |
| "grad_norm": 1.8013077974319458, |
| "learning_rate": 2.341893520302313e-06, |
| "loss": 0.0748, |
| "mean_token_accuracy": 0.9754731118679046, |
| "num_tokens": 13940804.0, |
| "step": 6400 |
| }, |
| { |
| "epoch": 17.185234899328858, |
| "grad_norm": 1.4591470956802368, |
| "learning_rate": 2.327161453814254e-06, |
| "loss": 0.0875, |
| "mean_token_accuracy": 0.9693176746368408, |
| "num_tokens": 13963800.0, |
| "step": 6410 |
| }, |
| { |
| "epoch": 17.21208053691275, |
| "grad_norm": 1.3876402378082275, |
| "learning_rate": 2.3124618035682523e-06, |
| "loss": 0.0772, |
| "mean_token_accuracy": 0.9730733752250671, |
| "num_tokens": 13985821.0, |
| "step": 6420 |
| }, |
| { |
| "epoch": 17.238926174496644, |
| "grad_norm": 2.060427665710449, |
| "learning_rate": 2.297794747841976e-06, |
| "loss": 0.1029, |
| "mean_token_accuracy": 0.9671726226806641, |
| "num_tokens": 14006527.0, |
| "step": 6430 |
| }, |
| { |
| "epoch": 17.265771812080537, |
| "grad_norm": 1.4501941204071045, |
| "learning_rate": 2.2831604645177867e-06, |
| "loss": 0.0924, |
| "mean_token_accuracy": 0.9718647956848144, |
| "num_tokens": 14026022.0, |
| "step": 6440 |
| }, |
| { |
| "epoch": 17.29261744966443, |
| "grad_norm": 4.501188278198242, |
| "learning_rate": 2.2685591310805743e-06, |
| "loss": 0.0745, |
| "mean_token_accuracy": 0.9747655302286148, |
| "num_tokens": 14049610.0, |
| "step": 6450 |
| }, |
| { |
| "epoch": 17.319463087248323, |
| "grad_norm": 1.406905174255371, |
| "learning_rate": 2.2539909246156257e-06, |
| "loss": 0.0787, |
| "mean_token_accuracy": 0.9729764729738235, |
| "num_tokens": 14072599.0, |
| "step": 6460 |
| }, |
| { |
| "epoch": 17.346308724832216, |
| "grad_norm": 1.2447459697723389, |
| "learning_rate": 2.2394560218064464e-06, |
| "loss": 0.0757, |
| "mean_token_accuracy": 0.9738614737987519, |
| "num_tokens": 14094676.0, |
| "step": 6470 |
| }, |
| { |
| "epoch": 17.37315436241611, |
| "grad_norm": 3.209287643432617, |
| "learning_rate": 2.2249545989326516e-06, |
| "loss": 0.0944, |
| "mean_token_accuracy": 0.9688991487026215, |
| "num_tokens": 14115606.0, |
| "step": 6480 |
| }, |
| { |
| "epoch": 17.4, |
| "grad_norm": 1.2575204372406006, |
| "learning_rate": 2.2104868318677963e-06, |
| "loss": 0.0878, |
| "mean_token_accuracy": 0.9742684334516525, |
| "num_tokens": 14134927.0, |
| "step": 6490 |
| }, |
| { |
| "epoch": 17.42684563758389, |
| "grad_norm": 3.82460880279541, |
| "learning_rate": 2.1960528960772666e-06, |
| "loss": 0.0751, |
| "mean_token_accuracy": 0.9734666824340821, |
| "num_tokens": 14158556.0, |
| "step": 6500 |
| }, |
| { |
| "epoch": 17.453691275167785, |
| "grad_norm": 2.3200035095214844, |
| "learning_rate": 2.1816529666161378e-06, |
| "loss": 0.0832, |
| "mean_token_accuracy": 0.9702016830444335, |
| "num_tokens": 14181722.0, |
| "step": 6510 |
| }, |
| { |
| "epoch": 17.480536912751678, |
| "grad_norm": 2.4491753578186035, |
| "learning_rate": 2.1672872181270575e-06, |
| "loss": 0.0806, |
| "mean_token_accuracy": 0.9737491935491562, |
| "num_tokens": 14203907.0, |
| "step": 6520 |
| }, |
| { |
| "epoch": 17.50738255033557, |
| "grad_norm": 4.129950046539307, |
| "learning_rate": 2.1529558248381254e-06, |
| "loss": 0.0952, |
| "mean_token_accuracy": 0.9714792817831039, |
| "num_tokens": 14224816.0, |
| "step": 6530 |
| }, |
| { |
| "epoch": 17.534228187919464, |
| "grad_norm": 1.8445805311203003, |
| "learning_rate": 2.1386589605607826e-06, |
| "loss": 0.0908, |
| "mean_token_accuracy": 0.9711948812007904, |
| "num_tokens": 14244208.0, |
| "step": 6540 |
| }, |
| { |
| "epoch": 17.561073825503357, |
| "grad_norm": 1.3207927942276, |
| "learning_rate": 2.1243967986876933e-06, |
| "loss": 0.071, |
| "mean_token_accuracy": 0.9752148389816284, |
| "num_tokens": 14267815.0, |
| "step": 6550 |
| }, |
| { |
| "epoch": 17.58791946308725, |
| "grad_norm": 1.794206142425537, |
| "learning_rate": 2.110169512190664e-06, |
| "loss": 0.0793, |
| "mean_token_accuracy": 0.9713789284229278, |
| "num_tokens": 14290856.0, |
| "step": 6560 |
| }, |
| { |
| "epoch": 17.614765100671143, |
| "grad_norm": 1.4987738132476807, |
| "learning_rate": 2.0959772736185174e-06, |
| "loss": 0.0809, |
| "mean_token_accuracy": 0.9734682083129883, |
| "num_tokens": 14312993.0, |
| "step": 6570 |
| }, |
| { |
| "epoch": 17.641610738255032, |
| "grad_norm": 2.3780603408813477, |
| "learning_rate": 2.081820255095028e-06, |
| "loss": 0.0924, |
| "mean_token_accuracy": 0.9696302711963654, |
| "num_tokens": 14334097.0, |
| "step": 6580 |
| }, |
| { |
| "epoch": 17.668456375838925, |
| "grad_norm": 1.3051165342330933, |
| "learning_rate": 2.0676986283168083e-06, |
| "loss": 0.0973, |
| "mean_token_accuracy": 0.9718139797449112, |
| "num_tokens": 14353612.0, |
| "step": 6590 |
| }, |
| { |
| "epoch": 17.69530201342282, |
| "grad_norm": 1.6877503395080566, |
| "learning_rate": 2.0536125645512473e-06, |
| "loss": 0.079, |
| "mean_token_accuracy": 0.9738190263509751, |
| "num_tokens": 14377191.0, |
| "step": 6600 |
| }, |
| { |
| "epoch": 17.72214765100671, |
| "grad_norm": 1.623205304145813, |
| "learning_rate": 2.0395622346344213e-06, |
| "loss": 0.0852, |
| "mean_token_accuracy": 0.9697360932826996, |
| "num_tokens": 14400189.0, |
| "step": 6610 |
| }, |
| { |
| "epoch": 17.748993288590604, |
| "grad_norm": 1.23772394657135, |
| "learning_rate": 2.025547808969028e-06, |
| "loss": 0.0741, |
| "mean_token_accuracy": 0.974305123090744, |
| "num_tokens": 14422242.0, |
| "step": 6620 |
| }, |
| { |
| "epoch": 17.775838926174497, |
| "grad_norm": 3.2685928344726562, |
| "learning_rate": 2.011569457522315e-06, |
| "loss": 0.0987, |
| "mean_token_accuracy": 0.968069052696228, |
| "num_tokens": 14443178.0, |
| "step": 6630 |
| }, |
| { |
| "epoch": 17.80268456375839, |
| "grad_norm": 1.7577261924743652, |
| "learning_rate": 1.9976273498240234e-06, |
| "loss": 0.095, |
| "mean_token_accuracy": 0.9713614732027054, |
| "num_tokens": 14462670.0, |
| "step": 6640 |
| }, |
| { |
| "epoch": 17.829530201342283, |
| "grad_norm": 1.8098061084747314, |
| "learning_rate": 1.9837216549643285e-06, |
| "loss": 0.076, |
| "mean_token_accuracy": 0.9737594068050385, |
| "num_tokens": 14486332.0, |
| "step": 6650 |
| }, |
| { |
| "epoch": 17.856375838926173, |
| "grad_norm": 1.7305619716644287, |
| "learning_rate": 1.969852541591789e-06, |
| "loss": 0.0883, |
| "mean_token_accuracy": 0.9701765835285187, |
| "num_tokens": 14509387.0, |
| "step": 6660 |
| }, |
| { |
| "epoch": 17.883221476510066, |
| "grad_norm": 3.777503728866577, |
| "learning_rate": 1.9560201779113056e-06, |
| "loss": 0.0837, |
| "mean_token_accuracy": 0.972148808836937, |
| "num_tokens": 14531516.0, |
| "step": 6670 |
| }, |
| { |
| "epoch": 17.91006711409396, |
| "grad_norm": 1.9363383054733276, |
| "learning_rate": 1.94222473168207e-06, |
| "loss": 0.0969, |
| "mean_token_accuracy": 0.9687934070825577, |
| "num_tokens": 14552725.0, |
| "step": 6680 |
| }, |
| { |
| "epoch": 17.936912751677852, |
| "grad_norm": 1.2105554342269897, |
| "learning_rate": 1.928466370215552e-06, |
| "loss": 0.0959, |
| "mean_token_accuracy": 0.971455842256546, |
| "num_tokens": 14572133.0, |
| "step": 6690 |
| }, |
| { |
| "epoch": 17.963758389261745, |
| "grad_norm": 1.5113040208816528, |
| "learning_rate": 1.9147452603734402e-06, |
| "loss": 0.0787, |
| "mean_token_accuracy": 0.9724821716547012, |
| "num_tokens": 14595076.0, |
| "step": 6700 |
| }, |
| { |
| "epoch": 17.990604026845638, |
| "grad_norm": 2.3052010536193848, |
| "learning_rate": 1.9010615685656514e-06, |
| "loss": 0.0875, |
| "mean_token_accuracy": 0.9719449728727341, |
| "num_tokens": 14616603.0, |
| "step": 6710 |
| }, |
| { |
| "epoch": 18.016107382550334, |
| "grad_norm": 1.6915111541748047, |
| "learning_rate": 1.8874154607482815e-06, |
| "loss": 0.0849, |
| "mean_token_accuracy": 0.9732024732388949, |
| "num_tokens": 14637290.0, |
| "step": 6720 |
| }, |
| { |
| "epoch": 18.042953020134227, |
| "grad_norm": 1.7393372058868408, |
| "learning_rate": 1.8738071024216141e-06, |
| "loss": 0.0734, |
| "mean_token_accuracy": 0.9738708108663559, |
| "num_tokens": 14660686.0, |
| "step": 6730 |
| }, |
| { |
| "epoch": 18.06979865771812, |
| "grad_norm": 1.477663516998291, |
| "learning_rate": 1.8602366586281063e-06, |
| "loss": 0.0787, |
| "mean_token_accuracy": 0.9723863214254379, |
| "num_tokens": 14683101.0, |
| "step": 6740 |
| }, |
| { |
| "epoch": 18.096644295302013, |
| "grad_norm": 2.2596659660339355, |
| "learning_rate": 1.8467042939503844e-06, |
| "loss": 0.0765, |
| "mean_token_accuracy": 0.9746802181005478, |
| "num_tokens": 14704571.0, |
| "step": 6750 |
| }, |
| { |
| "epoch": 18.123489932885906, |
| "grad_norm": 1.6076984405517578, |
| "learning_rate": 1.8332101725092522e-06, |
| "loss": 0.1067, |
| "mean_token_accuracy": 0.96855249106884, |
| "num_tokens": 14724451.0, |
| "step": 6760 |
| }, |
| { |
| "epoch": 18.1503355704698, |
| "grad_norm": 2.0869836807250977, |
| "learning_rate": 1.8197544579616998e-06, |
| "loss": 0.0792, |
| "mean_token_accuracy": 0.9745613276958466, |
| "num_tokens": 14746707.0, |
| "step": 6770 |
| }, |
| { |
| "epoch": 18.177181208053693, |
| "grad_norm": 2.008155345916748, |
| "learning_rate": 1.8063373134989104e-06, |
| "loss": 0.0758, |
| "mean_token_accuracy": 0.9732336699962616, |
| "num_tokens": 14770082.0, |
| "step": 6780 |
| }, |
| { |
| "epoch": 18.204026845637586, |
| "grad_norm": 1.347522497177124, |
| "learning_rate": 1.7929589018443016e-06, |
| "loss": 0.0788, |
| "mean_token_accuracy": 0.9738054692745208, |
| "num_tokens": 14792471.0, |
| "step": 6790 |
| }, |
| { |
| "epoch": 18.230872483221475, |
| "grad_norm": 2.535391092300415, |
| "learning_rate": 1.7796193852515258e-06, |
| "loss": 0.0839, |
| "mean_token_accuracy": 0.9719479292631149, |
| "num_tokens": 14813925.0, |
| "step": 6800 |
| }, |
| { |
| "epoch": 18.257718120805368, |
| "grad_norm": 1.823934555053711, |
| "learning_rate": 1.766318925502522e-06, |
| "loss": 0.0927, |
| "mean_token_accuracy": 0.973075395822525, |
| "num_tokens": 14833771.0, |
| "step": 6810 |
| }, |
| { |
| "epoch": 18.28456375838926, |
| "grad_norm": 1.6548361778259277, |
| "learning_rate": 1.7530576839055453e-06, |
| "loss": 0.072, |
| "mean_token_accuracy": 0.9755393981933593, |
| "num_tokens": 14855886.0, |
| "step": 6820 |
| }, |
| { |
| "epoch": 18.311409395973154, |
| "grad_norm": 1.5816959142684937, |
| "learning_rate": 1.7398358212932132e-06, |
| "loss": 0.0779, |
| "mean_token_accuracy": 0.9722195774316787, |
| "num_tokens": 14879050.0, |
| "step": 6830 |
| }, |
| { |
| "epoch": 18.338255033557047, |
| "grad_norm": 1.3728562593460083, |
| "learning_rate": 1.7266534980205524e-06, |
| "loss": 0.0788, |
| "mean_token_accuracy": 0.9737544000148773, |
| "num_tokens": 14901117.0, |
| "step": 6840 |
| }, |
| { |
| "epoch": 18.36510067114094, |
| "grad_norm": 2.1892294883728027, |
| "learning_rate": 1.7135108739630573e-06, |
| "loss": 0.0858, |
| "mean_token_accuracy": 0.9720165610313416, |
| "num_tokens": 14922262.0, |
| "step": 6850 |
| }, |
| { |
| "epoch": 18.391946308724833, |
| "grad_norm": 1.304505705833435, |
| "learning_rate": 1.7004081085147483e-06, |
| "loss": 0.096, |
| "mean_token_accuracy": 0.9711766183376312, |
| "num_tokens": 14942032.0, |
| "step": 6860 |
| }, |
| { |
| "epoch": 18.418791946308726, |
| "grad_norm": 1.7754546403884888, |
| "learning_rate": 1.6873453605862405e-06, |
| "loss": 0.0738, |
| "mean_token_accuracy": 0.9774259269237519, |
| "num_tokens": 14964280.0, |
| "step": 6870 |
| }, |
| { |
| "epoch": 18.44563758389262, |
| "grad_norm": 3.5867326259613037, |
| "learning_rate": 1.6743227886028152e-06, |
| "loss": 0.0756, |
| "mean_token_accuracy": 0.9743267685174942, |
| "num_tokens": 14987755.0, |
| "step": 6880 |
| }, |
| { |
| "epoch": 18.47248322147651, |
| "grad_norm": 1.8171364068984985, |
| "learning_rate": 1.6613405505024987e-06, |
| "loss": 0.075, |
| "mean_token_accuracy": 0.974465224146843, |
| "num_tokens": 15010023.0, |
| "step": 6890 |
| }, |
| { |
| "epoch": 18.4993288590604, |
| "grad_norm": 2.2199792861938477, |
| "learning_rate": 1.6483988037341497e-06, |
| "loss": 0.0853, |
| "mean_token_accuracy": 0.9714995324611664, |
| "num_tokens": 15031292.0, |
| "step": 6900 |
| }, |
| { |
| "epoch": 18.526174496644295, |
| "grad_norm": 1.3972761631011963, |
| "learning_rate": 1.6354977052555393e-06, |
| "loss": 0.0983, |
| "mean_token_accuracy": 0.9719545841217041, |
| "num_tokens": 15051069.0, |
| "step": 6910 |
| }, |
| { |
| "epoch": 18.553020134228188, |
| "grad_norm": 1.4796215295791626, |
| "learning_rate": 1.622637411531468e-06, |
| "loss": 0.0687, |
| "mean_token_accuracy": 0.9775502204895019, |
| "num_tokens": 15073310.0, |
| "step": 6920 |
| }, |
| { |
| "epoch": 18.57986577181208, |
| "grad_norm": 1.5705747604370117, |
| "learning_rate": 1.6098180785318424e-06, |
| "loss": 0.074, |
| "mean_token_accuracy": 0.9737792462110519, |
| "num_tokens": 15096715.0, |
| "step": 6930 |
| }, |
| { |
| "epoch": 18.606711409395974, |
| "grad_norm": 1.4334312677383423, |
| "learning_rate": 1.5970398617298078e-06, |
| "loss": 0.0838, |
| "mean_token_accuracy": 0.9727539092302322, |
| "num_tokens": 15119210.0, |
| "step": 6940 |
| }, |
| { |
| "epoch": 18.633557046979867, |
| "grad_norm": 1.7822301387786865, |
| "learning_rate": 1.584302916099842e-06, |
| "loss": 0.0761, |
| "mean_token_accuracy": 0.9743141323328018, |
| "num_tokens": 15140845.0, |
| "step": 6950 |
| }, |
| { |
| "epoch": 18.66040268456376, |
| "grad_norm": 1.785094141960144, |
| "learning_rate": 1.5716073961158907e-06, |
| "loss": 0.0968, |
| "mean_token_accuracy": 0.9698344320058823, |
| "num_tokens": 15160876.0, |
| "step": 6960 |
| }, |
| { |
| "epoch": 18.68724832214765, |
| "grad_norm": 1.5996724367141724, |
| "learning_rate": 1.5589534557494868e-06, |
| "loss": 0.0751, |
| "mean_token_accuracy": 0.9763054817914962, |
| "num_tokens": 15183031.0, |
| "step": 6970 |
| }, |
| { |
| "epoch": 18.714093959731542, |
| "grad_norm": 2.5582118034362793, |
| "learning_rate": 1.5463412484678858e-06, |
| "loss": 0.0789, |
| "mean_token_accuracy": 0.9721554785966873, |
| "num_tokens": 15206301.0, |
| "step": 6980 |
| }, |
| { |
| "epoch": 18.740939597315435, |
| "grad_norm": 1.7245941162109375, |
| "learning_rate": 1.5337709272322015e-06, |
| "loss": 0.0811, |
| "mean_token_accuracy": 0.9717311263084412, |
| "num_tokens": 15228690.0, |
| "step": 6990 |
| }, |
| { |
| "epoch": 18.76778523489933, |
| "grad_norm": 2.050199270248413, |
| "learning_rate": 1.5212426444955569e-06, |
| "loss": 0.0835, |
| "mean_token_accuracy": 0.9718193680047988, |
| "num_tokens": 15250081.0, |
| "step": 7000 |
| }, |
| { |
| "epoch": 18.79463087248322, |
| "grad_norm": 2.7452526092529297, |
| "learning_rate": 1.5087565522012226e-06, |
| "loss": 0.0967, |
| "mean_token_accuracy": 0.9709857195615769, |
| "num_tokens": 15269934.0, |
| "step": 7010 |
| }, |
| { |
| "epoch": 18.821476510067114, |
| "grad_norm": 1.5559078454971313, |
| "learning_rate": 1.496312801780795e-06, |
| "loss": 0.0709, |
| "mean_token_accuracy": 0.976904445886612, |
| "num_tokens": 15292176.0, |
| "step": 7020 |
| }, |
| { |
| "epoch": 18.848322147651007, |
| "grad_norm": 3.355046272277832, |
| "learning_rate": 1.4839115441523355e-06, |
| "loss": 0.0761, |
| "mean_token_accuracy": 0.9721087634563446, |
| "num_tokens": 15315512.0, |
| "step": 7030 |
| }, |
| { |
| "epoch": 18.8751677852349, |
| "grad_norm": 1.4108015298843384, |
| "learning_rate": 1.47155292971856e-06, |
| "loss": 0.0818, |
| "mean_token_accuracy": 0.9717017352581024, |
| "num_tokens": 15337909.0, |
| "step": 7040 |
| }, |
| { |
| "epoch": 18.902013422818793, |
| "grad_norm": 2.1846816539764404, |
| "learning_rate": 1.459237108365003e-06, |
| "loss": 0.0779, |
| "mean_token_accuracy": 0.9738930046558381, |
| "num_tokens": 15359470.0, |
| "step": 7050 |
| }, |
| { |
| "epoch": 18.928859060402683, |
| "grad_norm": 1.6539279222488403, |
| "learning_rate": 1.4469642294582048e-06, |
| "loss": 0.0991, |
| "mean_token_accuracy": 0.9687478452920913, |
| "num_tokens": 15379489.0, |
| "step": 7060 |
| }, |
| { |
| "epoch": 18.955704697986576, |
| "grad_norm": 1.7294431924819946, |
| "learning_rate": 1.434734441843899e-06, |
| "loss": 0.082, |
| "mean_token_accuracy": 0.9736314594745636, |
| "num_tokens": 15401287.0, |
| "step": 7070 |
| }, |
| { |
| "epoch": 18.98255033557047, |
| "grad_norm": 2.668654441833496, |
| "learning_rate": 1.4225478938452064e-06, |
| "loss": 0.0805, |
| "mean_token_accuracy": 0.9734222680330277, |
| "num_tokens": 15423058.0, |
| "step": 7080 |
| }, |
| { |
| "epoch": 19.00805369127517, |
| "grad_norm": 1.6047618389129639, |
| "learning_rate": 1.4104047332608379e-06, |
| "loss": 0.0876, |
| "mean_token_accuracy": 0.9735144627721686, |
| "num_tokens": 15442432.0, |
| "step": 7090 |
| }, |
| { |
| "epoch": 19.034899328859062, |
| "grad_norm": 1.4699209928512573, |
| "learning_rate": 1.3983051073632996e-06, |
| "loss": 0.0695, |
| "mean_token_accuracy": 0.9758471369743347, |
| "num_tokens": 15466193.0, |
| "step": 7100 |
| }, |
| { |
| "epoch": 19.06174496644295, |
| "grad_norm": 1.302131175994873, |
| "learning_rate": 1.3862491628971097e-06, |
| "loss": 0.0739, |
| "mean_token_accuracy": 0.9741963982582093, |
| "num_tokens": 15488932.0, |
| "step": 7110 |
| }, |
| { |
| "epoch": 19.088590604026844, |
| "grad_norm": 1.373273491859436, |
| "learning_rate": 1.3742370460770144e-06, |
| "loss": 0.0685, |
| "mean_token_accuracy": 0.9762805610895157, |
| "num_tokens": 15510761.0, |
| "step": 7120 |
| }, |
| { |
| "epoch": 19.115436241610738, |
| "grad_norm": 1.9326364994049072, |
| "learning_rate": 1.3622689025862219e-06, |
| "loss": 0.0929, |
| "mean_token_accuracy": 0.9708808869123459, |
| "num_tokens": 15531273.0, |
| "step": 7130 |
| }, |
| { |
| "epoch": 19.14228187919463, |
| "grad_norm": 2.295323371887207, |
| "learning_rate": 1.3503448775746226e-06, |
| "loss": 0.0754, |
| "mean_token_accuracy": 0.976592805981636, |
| "num_tokens": 15552169.0, |
| "step": 7140 |
| }, |
| { |
| "epoch": 19.169127516778524, |
| "grad_norm": 2.1047043800354004, |
| "learning_rate": 1.3384651156570483e-06, |
| "loss": 0.0684, |
| "mean_token_accuracy": 0.9758545637130738, |
| "num_tokens": 15575849.0, |
| "step": 7150 |
| }, |
| { |
| "epoch": 19.195973154362417, |
| "grad_norm": 1.449154019355774, |
| "learning_rate": 1.3266297609114965e-06, |
| "loss": 0.0751, |
| "mean_token_accuracy": 0.9727135717868804, |
| "num_tokens": 15598502.0, |
| "step": 7160 |
| }, |
| { |
| "epoch": 19.22281879194631, |
| "grad_norm": 1.5876628160476685, |
| "learning_rate": 1.3148389568774022e-06, |
| "loss": 0.0707, |
| "mean_token_accuracy": 0.9776216924190522, |
| "num_tokens": 15620289.0, |
| "step": 7170 |
| }, |
| { |
| "epoch": 19.249664429530203, |
| "grad_norm": 1.8609250783920288, |
| "learning_rate": 1.3030928465538822e-06, |
| "loss": 0.0948, |
| "mean_token_accuracy": 0.9705761075019836, |
| "num_tokens": 15640772.0, |
| "step": 7180 |
| }, |
| { |
| "epoch": 19.276510067114096, |
| "grad_norm": 2.3664748668670654, |
| "learning_rate": 1.291391572398009e-06, |
| "loss": 0.0746, |
| "mean_token_accuracy": 0.9772118955850602, |
| "num_tokens": 15661593.0, |
| "step": 7190 |
| }, |
| { |
| "epoch": 19.303355704697985, |
| "grad_norm": 2.159987449645996, |
| "learning_rate": 1.279735276323083e-06, |
| "loss": 0.073, |
| "mean_token_accuracy": 0.9758386135101318, |
| "num_tokens": 15685231.0, |
| "step": 7200 |
| }, |
| { |
| "epoch": 19.330201342281878, |
| "grad_norm": 4.445925712585449, |
| "learning_rate": 1.2681240996969085e-06, |
| "loss": 0.0794, |
| "mean_token_accuracy": 0.9732168436050415, |
| "num_tokens": 15707769.0, |
| "step": 7210 |
| }, |
| { |
| "epoch": 19.35704697986577, |
| "grad_norm": 1.9557926654815674, |
| "learning_rate": 1.2565581833400753e-06, |
| "loss": 0.0734, |
| "mean_token_accuracy": 0.975311490893364, |
| "num_tokens": 15729487.0, |
| "step": 7220 |
| }, |
| { |
| "epoch": 19.383892617449664, |
| "grad_norm": 1.8642995357513428, |
| "learning_rate": 1.2450376675242658e-06, |
| "loss": 0.1031, |
| "mean_token_accuracy": 0.9692795485258102, |
| "num_tokens": 15749855.0, |
| "step": 7230 |
| }, |
| { |
| "epoch": 19.410738255033557, |
| "grad_norm": 3.3018038272857666, |
| "learning_rate": 1.233562691970533e-06, |
| "loss": 0.0789, |
| "mean_token_accuracy": 0.9755080968141556, |
| "num_tokens": 15770677.0, |
| "step": 7240 |
| }, |
| { |
| "epoch": 19.43758389261745, |
| "grad_norm": 3.031343460083008, |
| "learning_rate": 1.2221333958476261e-06, |
| "loss": 0.0717, |
| "mean_token_accuracy": 0.9755002796649933, |
| "num_tokens": 15794295.0, |
| "step": 7250 |
| }, |
| { |
| "epoch": 19.464429530201343, |
| "grad_norm": 1.6727511882781982, |
| "learning_rate": 1.2107499177702852e-06, |
| "loss": 0.0779, |
| "mean_token_accuracy": 0.9729755282402038, |
| "num_tokens": 15816964.0, |
| "step": 7260 |
| }, |
| { |
| "epoch": 19.491275167785236, |
| "grad_norm": 2.095463991165161, |
| "learning_rate": 1.1994123957975722e-06, |
| "loss": 0.0734, |
| "mean_token_accuracy": 0.9745409727096558, |
| "num_tokens": 15838814.0, |
| "step": 7270 |
| }, |
| { |
| "epoch": 19.51812080536913, |
| "grad_norm": 1.4033610820770264, |
| "learning_rate": 1.1881209674311934e-06, |
| "loss": 0.093, |
| "mean_token_accuracy": 0.9723419904708862, |
| "num_tokens": 15859106.0, |
| "step": 7280 |
| }, |
| { |
| "epoch": 19.54496644295302, |
| "grad_norm": 2.11256742477417, |
| "learning_rate": 1.1768757696138278e-06, |
| "loss": 0.0767, |
| "mean_token_accuracy": 0.9770674705505371, |
| "num_tokens": 15879928.0, |
| "step": 7290 |
| }, |
| { |
| "epoch": 19.571812080536912, |
| "grad_norm": 2.040146827697754, |
| "learning_rate": 1.1656769387274714e-06, |
| "loss": 0.0718, |
| "mean_token_accuracy": 0.9746215373277665, |
| "num_tokens": 15903526.0, |
| "step": 7300 |
| }, |
| { |
| "epoch": 19.598657718120805, |
| "grad_norm": 2.3293981552124023, |
| "learning_rate": 1.1545246105917807e-06, |
| "loss": 0.0793, |
| "mean_token_accuracy": 0.9727816581726074, |
| "num_tokens": 15925985.0, |
| "step": 7310 |
| }, |
| { |
| "epoch": 19.625503355704698, |
| "grad_norm": 2.2348577976226807, |
| "learning_rate": 1.143418920462425e-06, |
| "loss": 0.0796, |
| "mean_token_accuracy": 0.9728397846221923, |
| "num_tokens": 15947589.0, |
| "step": 7320 |
| }, |
| { |
| "epoch": 19.65234899328859, |
| "grad_norm": 1.796433448791504, |
| "learning_rate": 1.132360003029449e-06, |
| "loss": 0.1048, |
| "mean_token_accuracy": 0.9691227197647094, |
| "num_tokens": 15967874.0, |
| "step": 7330 |
| }, |
| { |
| "epoch": 19.679194630872484, |
| "grad_norm": 2.2440576553344727, |
| "learning_rate": 1.1213479924156346e-06, |
| "loss": 0.0776, |
| "mean_token_accuracy": 0.9765941977500916, |
| "num_tokens": 15988720.0, |
| "step": 7340 |
| }, |
| { |
| "epoch": 19.706040268456377, |
| "grad_norm": 2.9480559825897217, |
| "learning_rate": 1.1103830221748774e-06, |
| "loss": 0.0759, |
| "mean_token_accuracy": 0.973601347208023, |
| "num_tokens": 16012428.0, |
| "step": 7350 |
| }, |
| { |
| "epoch": 19.73288590604027, |
| "grad_norm": 1.705248236656189, |
| "learning_rate": 1.0994652252905695e-06, |
| "loss": 0.0825, |
| "mean_token_accuracy": 0.9710620373487473, |
| "num_tokens": 16035074.0, |
| "step": 7360 |
| }, |
| { |
| "epoch": 19.75973154362416, |
| "grad_norm": 1.7874031066894531, |
| "learning_rate": 1.0885947341739768e-06, |
| "loss": 0.068, |
| "mean_token_accuracy": 0.9776312798261643, |
| "num_tokens": 16056867.0, |
| "step": 7370 |
| }, |
| { |
| "epoch": 19.786577181208052, |
| "grad_norm": 1.9907939434051514, |
| "learning_rate": 1.0777716806626488e-06, |
| "loss": 0.0924, |
| "mean_token_accuracy": 0.9716785818338394, |
| "num_tokens": 16077235.0, |
| "step": 7380 |
| }, |
| { |
| "epoch": 19.813422818791945, |
| "grad_norm": 2.154221773147583, |
| "learning_rate": 1.0669961960188008e-06, |
| "loss": 0.0786, |
| "mean_token_accuracy": 0.9756328999996186, |
| "num_tokens": 16098064.0, |
| "step": 7390 |
| }, |
| { |
| "epoch": 19.84026845637584, |
| "grad_norm": 2.000872850418091, |
| "learning_rate": 1.0562684109277426e-06, |
| "loss": 0.0728, |
| "mean_token_accuracy": 0.9748990356922149, |
| "num_tokens": 16121764.0, |
| "step": 7400 |
| }, |
| { |
| "epoch": 19.86711409395973, |
| "grad_norm": 1.618493676185608, |
| "learning_rate": 1.0455884554962725e-06, |
| "loss": 0.0764, |
| "mean_token_accuracy": 0.9751930832862854, |
| "num_tokens": 16144396.0, |
| "step": 7410 |
| }, |
| { |
| "epoch": 19.893959731543625, |
| "grad_norm": 2.0999059677124023, |
| "learning_rate": 1.0349564592511162e-06, |
| "loss": 0.0724, |
| "mean_token_accuracy": 0.9754237473011017, |
| "num_tokens": 16166124.0, |
| "step": 7420 |
| }, |
| { |
| "epoch": 19.920805369127518, |
| "grad_norm": 1.7069863080978394, |
| "learning_rate": 1.024372551137348e-06, |
| "loss": 0.0942, |
| "mean_token_accuracy": 0.9713965833187104, |
| "num_tokens": 16186295.0, |
| "step": 7430 |
| }, |
| { |
| "epoch": 19.94765100671141, |
| "grad_norm": 1.8244096040725708, |
| "learning_rate": 1.0138368595168291e-06, |
| "loss": 0.0781, |
| "mean_token_accuracy": 0.9758277833461761, |
| "num_tokens": 16206865.0, |
| "step": 7440 |
| }, |
| { |
| "epoch": 19.974496644295304, |
| "grad_norm": 2.282172441482544, |
| "learning_rate": 1.0033495121666442e-06, |
| "loss": 0.0764, |
| "mean_token_accuracy": 0.9733882695436478, |
| "num_tokens": 16229364.0, |
| "step": 7450 |
| }, |
| { |
| "epoch": 20.0, |
| "grad_norm": 2.2477612495422363, |
| "learning_rate": 9.929106362775681e-07, |
| "loss": 0.0866, |
| "mean_token_accuracy": 0.9732856374037894, |
| "num_tokens": 16247380.0, |
| "step": 7460 |
| }, |
| { |
| "epoch": 20.026845637583893, |
| "grad_norm": 1.7040324211120605, |
| "learning_rate": 9.825203584525022e-07, |
| "loss": 0.0688, |
| "mean_token_accuracy": 0.9761479705572128, |
| "num_tokens": 16271513.0, |
| "step": 7470 |
| }, |
| { |
| "epoch": 20.053691275167786, |
| "grad_norm": 1.6158870458602905, |
| "learning_rate": 9.721788047049586e-07, |
| "loss": 0.0757, |
| "mean_token_accuracy": 0.9738033145666123, |
| "num_tokens": 16294453.0, |
| "step": 7480 |
| }, |
| { |
| "epoch": 20.08053691275168, |
| "grad_norm": 1.5813090801239014, |
| "learning_rate": 9.618861004575154e-07, |
| "loss": 0.0676, |
| "mean_token_accuracy": 0.9767954677343369, |
| "num_tokens": 16316505.0, |
| "step": 7490 |
| }, |
| { |
| "epoch": 20.107382550335572, |
| "grad_norm": 2.368957042694092, |
| "learning_rate": 9.516423705403066e-07, |
| "loss": 0.0876, |
| "mean_token_accuracy": 0.9725225150585175, |
| "num_tokens": 16337346.0, |
| "step": 7500 |
| }, |
| { |
| "epoch": 20.13422818791946, |
| "grad_norm": 0.9680088758468628, |
| "learning_rate": 9.414477391895044e-07, |
| "loss": 0.0766, |
| "mean_token_accuracy": 0.9777394503355026, |
| "num_tokens": 16356618.0, |
| "step": 7510 |
| }, |
| { |
| "epoch": 20.161073825503355, |
| "grad_norm": 1.9878125190734863, |
| "learning_rate": 9.313023300458118e-07, |
| "loss": 0.0697, |
| "mean_token_accuracy": 0.9754358917474747, |
| "num_tokens": 16380670.0, |
| "step": 7520 |
| }, |
| { |
| "epoch": 20.187919463087248, |
| "grad_norm": 1.7701399326324463, |
| "learning_rate": 9.212062661529641e-07, |
| "loss": 0.0697, |
| "mean_token_accuracy": 0.975850623846054, |
| "num_tokens": 16403537.0, |
| "step": 7530 |
| }, |
| { |
| "epoch": 20.21476510067114, |
| "grad_norm": 1.4228025674819946, |
| "learning_rate": 9.11159669956237e-07, |
| "loss": 0.0704, |
| "mean_token_accuracy": 0.9756285309791565, |
| "num_tokens": 16425451.0, |
| "step": 7540 |
| }, |
| { |
| "epoch": 20.241610738255034, |
| "grad_norm": 2.2829296588897705, |
| "learning_rate": 9.011626633009596e-07, |
| "loss": 0.0976, |
| "mean_token_accuracy": 0.9698988974094391, |
| "num_tokens": 16446094.0, |
| "step": 7550 |
| }, |
| { |
| "epoch": 20.268456375838927, |
| "grad_norm": 1.0587600469589233, |
| "learning_rate": 8.912153674310386e-07, |
| "loss": 0.0794, |
| "mean_token_accuracy": 0.9759812623262405, |
| "num_tokens": 16465369.0, |
| "step": 7560 |
| }, |
| { |
| "epoch": 20.29530201342282, |
| "grad_norm": 1.7993144989013672, |
| "learning_rate": 8.813179029874874e-07, |
| "loss": 0.0684, |
| "mean_token_accuracy": 0.9770451784133911, |
| "num_tokens": 16489501.0, |
| "step": 7570 |
| }, |
| { |
| "epoch": 20.322147651006713, |
| "grad_norm": 1.5915813446044922, |
| "learning_rate": 8.714703900069638e-07, |
| "loss": 0.0816, |
| "mean_token_accuracy": 0.971697872877121, |
| "num_tokens": 16512298.0, |
| "step": 7580 |
| }, |
| { |
| "epoch": 20.348993288590606, |
| "grad_norm": 1.3499330282211304, |
| "learning_rate": 8.616729479203123e-07, |
| "loss": 0.0674, |
| "mean_token_accuracy": 0.9774249851703644, |
| "num_tokens": 16534247.0, |
| "step": 7590 |
| }, |
| { |
| "epoch": 20.375838926174495, |
| "grad_norm": 3.021082878112793, |
| "learning_rate": 8.51925695551113e-07, |
| "loss": 0.0877, |
| "mean_token_accuracy": 0.9728184163570404, |
| "num_tokens": 16555124.0, |
| "step": 7600 |
| }, |
| { |
| "epoch": 20.40268456375839, |
| "grad_norm": 1.2314085960388184, |
| "learning_rate": 8.422287511142524e-07, |
| "loss": 0.0852, |
| "mean_token_accuracy": 0.9761320263147354, |
| "num_tokens": 16574493.0, |
| "step": 7610 |
| }, |
| { |
| "epoch": 20.42953020134228, |
| "grad_norm": 2.3866217136383057, |
| "learning_rate": 8.325822322144728e-07, |
| "loss": 0.0707, |
| "mean_token_accuracy": 0.9761977344751358, |
| "num_tokens": 16598519.0, |
| "step": 7620 |
| }, |
| { |
| "epoch": 20.456375838926174, |
| "grad_norm": 1.8169933557510376, |
| "learning_rate": 8.229862558449592e-07, |
| "loss": 0.076, |
| "mean_token_accuracy": 0.9738662779331207, |
| "num_tokens": 16621364.0, |
| "step": 7630 |
| }, |
| { |
| "epoch": 20.483221476510067, |
| "grad_norm": 1.4940258264541626, |
| "learning_rate": 8.134409383859149e-07, |
| "loss": 0.0715, |
| "mean_token_accuracy": 0.9758548170328141, |
| "num_tokens": 16643312.0, |
| "step": 7640 |
| }, |
| { |
| "epoch": 20.51006711409396, |
| "grad_norm": 2.932368516921997, |
| "learning_rate": 8.039463956031501e-07, |
| "loss": 0.0867, |
| "mean_token_accuracy": 0.9723454862833023, |
| "num_tokens": 16664147.0, |
| "step": 7650 |
| }, |
| { |
| "epoch": 20.536912751677853, |
| "grad_norm": 1.3653275966644287, |
| "learning_rate": 7.945027426466801e-07, |
| "loss": 0.0782, |
| "mean_token_accuracy": 0.9777654260396957, |
| "num_tokens": 16683356.0, |
| "step": 7660 |
| }, |
| { |
| "epoch": 20.563758389261746, |
| "grad_norm": 2.0917046070098877, |
| "learning_rate": 7.851100940493273e-07, |
| "loss": 0.066, |
| "mean_token_accuracy": 0.9783705800771714, |
| "num_tokens": 16707545.0, |
| "step": 7670 |
| }, |
| { |
| "epoch": 20.59060402684564, |
| "grad_norm": 2.3530590534210205, |
| "learning_rate": 7.757685637253271e-07, |
| "loss": 0.0744, |
| "mean_token_accuracy": 0.973720371723175, |
| "num_tokens": 16730426.0, |
| "step": 7680 |
| }, |
| { |
| "epoch": 20.61744966442953, |
| "grad_norm": 1.8383070230484009, |
| "learning_rate": 7.664782649689611e-07, |
| "loss": 0.071, |
| "mean_token_accuracy": 0.9763635188341141, |
| "num_tokens": 16752377.0, |
| "step": 7690 |
| }, |
| { |
| "epoch": 20.644295302013422, |
| "grad_norm": 2.579986095428467, |
| "learning_rate": 7.572393104531622e-07, |
| "loss": 0.0929, |
| "mean_token_accuracy": 0.9709441542625428, |
| "num_tokens": 16773192.0, |
| "step": 7700 |
| }, |
| { |
| "epoch": 20.671140939597315, |
| "grad_norm": 1.4985390901565552, |
| "learning_rate": 7.480518122281711e-07, |
| "loss": 0.0785, |
| "mean_token_accuracy": 0.9765917301177979, |
| "num_tokens": 16792508.0, |
| "step": 7710 |
| }, |
| { |
| "epoch": 20.697986577181208, |
| "grad_norm": 1.8128042221069336, |
| "learning_rate": 7.389158817201541e-07, |
| "loss": 0.0685, |
| "mean_token_accuracy": 0.9761598974466323, |
| "num_tokens": 16816671.0, |
| "step": 7720 |
| }, |
| { |
| "epoch": 20.7248322147651, |
| "grad_norm": 1.72760009765625, |
| "learning_rate": 7.298316297298713e-07, |
| "loss": 0.0728, |
| "mean_token_accuracy": 0.9747570604085922, |
| "num_tokens": 16839673.0, |
| "step": 7730 |
| }, |
| { |
| "epoch": 20.751677852348994, |
| "grad_norm": 1.655516505241394, |
| "learning_rate": 7.207991664313202e-07, |
| "loss": 0.0712, |
| "mean_token_accuracy": 0.9755067259073258, |
| "num_tokens": 16861795.0, |
| "step": 7740 |
| }, |
| { |
| "epoch": 20.778523489932887, |
| "grad_norm": 2.935474395751953, |
| "learning_rate": 7.118186013704065e-07, |
| "loss": 0.0822, |
| "mean_token_accuracy": 0.973547288775444, |
| "num_tokens": 16882823.0, |
| "step": 7750 |
| }, |
| { |
| "epoch": 20.80536912751678, |
| "grad_norm": 1.1629414558410645, |
| "learning_rate": 7.028900434636116e-07, |
| "loss": 0.0847, |
| "mean_token_accuracy": 0.9749088943004608, |
| "num_tokens": 16902242.0, |
| "step": 7760 |
| }, |
| { |
| "epoch": 20.83221476510067, |
| "grad_norm": 2.0850579738616943, |
| "learning_rate": 6.940136009966735e-07, |
| "loss": 0.0659, |
| "mean_token_accuracy": 0.9775604158639908, |
| "num_tokens": 16926364.0, |
| "step": 7770 |
| }, |
| { |
| "epoch": 20.859060402684563, |
| "grad_norm": 1.6004384756088257, |
| "learning_rate": 6.851893816232729e-07, |
| "loss": 0.0757, |
| "mean_token_accuracy": 0.974211847782135, |
| "num_tokens": 16949390.0, |
| "step": 7780 |
| }, |
| { |
| "epoch": 20.885906040268456, |
| "grad_norm": 1.4713903665542603, |
| "learning_rate": 6.764174923637279e-07, |
| "loss": 0.0704, |
| "mean_token_accuracy": 0.9770765691995621, |
| "num_tokens": 16971409.0, |
| "step": 7790 |
| }, |
| { |
| "epoch": 20.91275167785235, |
| "grad_norm": 2.419337272644043, |
| "learning_rate": 6.676980396036953e-07, |
| "loss": 0.0895, |
| "mean_token_accuracy": 0.9723514080047607, |
| "num_tokens": 16992049.0, |
| "step": 7800 |
| }, |
| { |
| "epoch": 20.93959731543624, |
| "grad_norm": 1.4086570739746094, |
| "learning_rate": 6.590311290928825e-07, |
| "loss": 0.0836, |
| "mean_token_accuracy": 0.9762147784233093, |
| "num_tokens": 17011358.0, |
| "step": 7810 |
| }, |
| { |
| "epoch": 20.966442953020135, |
| "grad_norm": 1.5443168878555298, |
| "learning_rate": 6.504168659437627e-07, |
| "loss": 0.0738, |
| "mean_token_accuracy": 0.9730790197849274, |
| "num_tokens": 17034767.0, |
| "step": 7820 |
| }, |
| { |
| "epoch": 20.993288590604028, |
| "grad_norm": 1.7910735607147217, |
| "learning_rate": 6.418553546302964e-07, |
| "loss": 0.0856, |
| "mean_token_accuracy": 0.9733808606863021, |
| "num_tokens": 17055742.0, |
| "step": 7830 |
| }, |
| { |
| "epoch": 21.018791946308724, |
| "grad_norm": 1.4867401123046875, |
| "learning_rate": 6.333466989866787e-07, |
| "loss": 0.0695, |
| "mean_token_accuracy": 0.9786910979371322, |
| "num_tokens": 17076765.0, |
| "step": 7840 |
| }, |
| { |
| "epoch": 21.045637583892617, |
| "grad_norm": 2.1171696186065674, |
| "learning_rate": 6.248910022060595e-07, |
| "loss": 0.0726, |
| "mean_token_accuracy": 0.9751413524150848, |
| "num_tokens": 17099992.0, |
| "step": 7850 |
| }, |
| { |
| "epoch": 21.07248322147651, |
| "grad_norm": 1.8590487241744995, |
| "learning_rate": 6.16488366839309e-07, |
| "loss": 0.0683, |
| "mean_token_accuracy": 0.9762388318777084, |
| "num_tokens": 17122206.0, |
| "step": 7860 |
| }, |
| { |
| "epoch": 21.099328859060403, |
| "grad_norm": 2.488386631011963, |
| "learning_rate": 6.08138894793765e-07, |
| "loss": 0.0813, |
| "mean_token_accuracy": 0.9725316137075424, |
| "num_tokens": 17143334.0, |
| "step": 7870 |
| }, |
| { |
| "epoch": 21.126174496644296, |
| "grad_norm": 1.9426281452178955, |
| "learning_rate": 5.998426873320001e-07, |
| "loss": 0.0815, |
| "mean_token_accuracy": 0.9752875089645385, |
| "num_tokens": 17162878.0, |
| "step": 7880 |
| }, |
| { |
| "epoch": 21.15302013422819, |
| "grad_norm": 1.9060635566711426, |
| "learning_rate": 5.915998450705923e-07, |
| "loss": 0.0685, |
| "mean_token_accuracy": 0.9771264225244523, |
| "num_tokens": 17185525.0, |
| "step": 7890 |
| }, |
| { |
| "epoch": 21.179865771812082, |
| "grad_norm": 2.2453320026397705, |
| "learning_rate": 5.834104679789077e-07, |
| "loss": 0.0737, |
| "mean_token_accuracy": 0.9742690682411194, |
| "num_tokens": 17208605.0, |
| "step": 7900 |
| }, |
| { |
| "epoch": 21.20671140939597, |
| "grad_norm": 1.5044374465942383, |
| "learning_rate": 5.752746553778798e-07, |
| "loss": 0.0682, |
| "mean_token_accuracy": 0.9773426532745362, |
| "num_tokens": 17230705.0, |
| "step": 7910 |
| }, |
| { |
| "epoch": 21.233557046979865, |
| "grad_norm": 2.717085123062134, |
| "learning_rate": 5.671925059388195e-07, |
| "loss": 0.0772, |
| "mean_token_accuracy": 0.9750842124223709, |
| "num_tokens": 17251855.0, |
| "step": 7920 |
| }, |
| { |
| "epoch": 21.260402684563758, |
| "grad_norm": 1.5443522930145264, |
| "learning_rate": 5.591641176822005e-07, |
| "loss": 0.0874, |
| "mean_token_accuracy": 0.9740733593702317, |
| "num_tokens": 17271579.0, |
| "step": 7930 |
| }, |
| { |
| "epoch": 21.28724832214765, |
| "grad_norm": 1.8659101724624634, |
| "learning_rate": 5.511895879764883e-07, |
| "loss": 0.07, |
| "mean_token_accuracy": 0.9775551408529282, |
| "num_tokens": 17294183.0, |
| "step": 7940 |
| }, |
| { |
| "epoch": 21.314093959731544, |
| "grad_norm": 1.8989914655685425, |
| "learning_rate": 5.432690135369445e-07, |
| "loss": 0.0796, |
| "mean_token_accuracy": 0.9734296709299087, |
| "num_tokens": 17317241.0, |
| "step": 7950 |
| }, |
| { |
| "epoch": 21.340939597315437, |
| "grad_norm": 1.711384892463684, |
| "learning_rate": 5.354024904244632e-07, |
| "loss": 0.072, |
| "mean_token_accuracy": 0.9772316753864289, |
| "num_tokens": 17339366.0, |
| "step": 7960 |
| }, |
| { |
| "epoch": 21.36778523489933, |
| "grad_norm": 2.298988103866577, |
| "learning_rate": 5.275901140444012e-07, |
| "loss": 0.0757, |
| "mean_token_accuracy": 0.9755023777484894, |
| "num_tokens": 17360584.0, |
| "step": 7970 |
| }, |
| { |
| "epoch": 21.394630872483223, |
| "grad_norm": 1.5071649551391602, |
| "learning_rate": 5.198319791454237e-07, |
| "loss": 0.0942, |
| "mean_token_accuracy": 0.9728712201118469, |
| "num_tokens": 17380386.0, |
| "step": 7980 |
| }, |
| { |
| "epoch": 21.421476510067116, |
| "grad_norm": 1.6493000984191895, |
| "learning_rate": 5.121281798183547e-07, |
| "loss": 0.0671, |
| "mean_token_accuracy": 0.9769768297672272, |
| "num_tokens": 17403090.0, |
| "step": 7990 |
| }, |
| { |
| "epoch": 21.448322147651005, |
| "grad_norm": 1.9484785795211792, |
| "learning_rate": 5.044788094950332e-07, |
| "loss": 0.0721, |
| "mean_token_accuracy": 0.9745801717042923, |
| "num_tokens": 17426401.0, |
| "step": 8000 |
| }, |
| { |
| "epoch": 21.4751677852349, |
| "grad_norm": 4.09341287612915, |
| "learning_rate": 4.968839609471837e-07, |
| "loss": 0.0701, |
| "mean_token_accuracy": 0.9758915185928345, |
| "num_tokens": 17448699.0, |
| "step": 8010 |
| }, |
| { |
| "epoch": 21.50201342281879, |
| "grad_norm": 2.4515931606292725, |
| "learning_rate": 4.893437262852885e-07, |
| "loss": 0.0729, |
| "mean_token_accuracy": 0.9776129275560379, |
| "num_tokens": 17470206.0, |
| "step": 8020 |
| }, |
| { |
| "epoch": 21.528859060402684, |
| "grad_norm": 1.707498550415039, |
| "learning_rate": 4.818581969574743e-07, |
| "loss": 0.0947, |
| "mean_token_accuracy": 0.9731051385402679, |
| "num_tokens": 17490055.0, |
| "step": 8030 |
| }, |
| { |
| "epoch": 21.555704697986577, |
| "grad_norm": 1.5686323642730713, |
| "learning_rate": 4.7442746374839363e-07, |
| "loss": 0.0622, |
| "mean_token_accuracy": 0.979521569609642, |
| "num_tokens": 17512695.0, |
| "step": 8040 |
| }, |
| { |
| "epoch": 21.58255033557047, |
| "grad_norm": 2.5754616260528564, |
| "learning_rate": 4.6705161677814024e-07, |
| "loss": 0.0717, |
| "mean_token_accuracy": 0.9750470906496048, |
| "num_tokens": 17535874.0, |
| "step": 8050 |
| }, |
| { |
| "epoch": 21.609395973154363, |
| "grad_norm": 1.9909005165100098, |
| "learning_rate": 4.597307455011363e-07, |
| "loss": 0.0722, |
| "mean_token_accuracy": 0.9758717834949493, |
| "num_tokens": 17558072.0, |
| "step": 8060 |
| }, |
| { |
| "epoch": 21.636241610738256, |
| "grad_norm": 2.4641635417938232, |
| "learning_rate": 4.524649387050667e-07, |
| "loss": 0.0812, |
| "mean_token_accuracy": 0.9734772562980651, |
| "num_tokens": 17579394.0, |
| "step": 8070 |
| }, |
| { |
| "epoch": 21.663087248322146, |
| "grad_norm": 1.7697465419769287, |
| "learning_rate": 4.4525428450978627e-07, |
| "loss": 0.0872, |
| "mean_token_accuracy": 0.973640301823616, |
| "num_tokens": 17599105.0, |
| "step": 8080 |
| }, |
| { |
| "epoch": 21.68993288590604, |
| "grad_norm": 4.2104268074035645, |
| "learning_rate": 4.380988703662614e-07, |
| "loss": 0.0662, |
| "mean_token_accuracy": 0.9779145032167434, |
| "num_tokens": 17621804.0, |
| "step": 8090 |
| }, |
| { |
| "epoch": 21.716778523489932, |
| "grad_norm": 2.5988218784332275, |
| "learning_rate": 4.309987830555057e-07, |
| "loss": 0.0647, |
| "mean_token_accuracy": 0.9763138085603714, |
| "num_tokens": 17645036.0, |
| "step": 8100 |
| }, |
| { |
| "epoch": 21.743624161073825, |
| "grad_norm": 1.6398255825042725, |
| "learning_rate": 4.239541086875265e-07, |
| "loss": 0.0715, |
| "mean_token_accuracy": 0.9762193262577057, |
| "num_tokens": 17667306.0, |
| "step": 8110 |
| }, |
| { |
| "epoch": 21.770469798657718, |
| "grad_norm": 2.159721612930298, |
| "learning_rate": 4.1696493270028284e-07, |
| "loss": 0.078, |
| "mean_token_accuracy": 0.9745160311460495, |
| "num_tokens": 17688460.0, |
| "step": 8120 |
| }, |
| { |
| "epoch": 21.79731543624161, |
| "grad_norm": 1.4078185558319092, |
| "learning_rate": 4.1003133985864864e-07, |
| "loss": 0.0805, |
| "mean_token_accuracy": 0.976104524731636, |
| "num_tokens": 17708085.0, |
| "step": 8130 |
| }, |
| { |
| "epoch": 21.824161073825504, |
| "grad_norm": 1.8352998495101929, |
| "learning_rate": 4.031534142533816e-07, |
| "loss": 0.0706, |
| "mean_token_accuracy": 0.9771550267934799, |
| "num_tokens": 17730815.0, |
| "step": 8140 |
| }, |
| { |
| "epoch": 21.851006711409397, |
| "grad_norm": 2.1755294799804688, |
| "learning_rate": 3.9633123930011065e-07, |
| "loss": 0.0707, |
| "mean_token_accuracy": 0.9745010644197464, |
| "num_tokens": 17754175.0, |
| "step": 8150 |
| }, |
| { |
| "epoch": 21.87785234899329, |
| "grad_norm": 1.4615411758422852, |
| "learning_rate": 3.895648977383143e-07, |
| "loss": 0.0751, |
| "mean_token_accuracy": 0.9746832549571991, |
| "num_tokens": 17776562.0, |
| "step": 8160 |
| }, |
| { |
| "epoch": 21.90469798657718, |
| "grad_norm": 3.1907434463500977, |
| "learning_rate": 3.828544716303284e-07, |
| "loss": 0.0698, |
| "mean_token_accuracy": 0.9775834113359452, |
| "num_tokens": 17797911.0, |
| "step": 8170 |
| }, |
| { |
| "epoch": 21.931543624161073, |
| "grad_norm": 1.4641789197921753, |
| "learning_rate": 3.76200042360339e-07, |
| "loss": 0.0877, |
| "mean_token_accuracy": 0.9736798137426377, |
| "num_tokens": 17817668.0, |
| "step": 8180 |
| }, |
| { |
| "epoch": 21.958389261744966, |
| "grad_norm": 1.6593701839447021, |
| "learning_rate": 3.6960169063340543e-07, |
| "loss": 0.0706, |
| "mean_token_accuracy": 0.9777141779661178, |
| "num_tokens": 17840084.0, |
| "step": 8190 |
| }, |
| { |
| "epoch": 21.98523489932886, |
| "grad_norm": 2.6766629219055176, |
| "learning_rate": 3.6305949647447545e-07, |
| "loss": 0.0797, |
| "mean_token_accuracy": 0.9745386809110641, |
| "num_tokens": 17862053.0, |
| "step": 8200 |
| }, |
| { |
| "epoch": 22.01073825503356, |
| "grad_norm": 2.444695234298706, |
| "learning_rate": 3.5657353922741834e-07, |
| "loss": 0.0739, |
| "mean_token_accuracy": 0.9770089576118871, |
| "num_tokens": 17881976.0, |
| "step": 8210 |
| }, |
| { |
| "epoch": 22.037583892617448, |
| "grad_norm": 1.9715152978897095, |
| "learning_rate": 3.501438975540583e-07, |
| "loss": 0.0623, |
| "mean_token_accuracy": 0.9785895735025406, |
| "num_tokens": 17905589.0, |
| "step": 8220 |
| }, |
| { |
| "epoch": 22.06442953020134, |
| "grad_norm": 1.3546748161315918, |
| "learning_rate": 3.437706494332266e-07, |
| "loss": 0.0713, |
| "mean_token_accuracy": 0.9752832442522049, |
| "num_tokens": 17928115.0, |
| "step": 8230 |
| }, |
| { |
| "epoch": 22.091275167785234, |
| "grad_norm": 2.851863384246826, |
| "learning_rate": 3.374538721598086e-07, |
| "loss": 0.0662, |
| "mean_token_accuracy": 0.9777217179536819, |
| "num_tokens": 17949802.0, |
| "step": 8240 |
| }, |
| { |
| "epoch": 22.118120805369127, |
| "grad_norm": 1.7194266319274902, |
| "learning_rate": 3.311936423438128e-07, |
| "loss": 0.0856, |
| "mean_token_accuracy": 0.9752306431531906, |
| "num_tokens": 17969952.0, |
| "step": 8250 |
| }, |
| { |
| "epoch": 22.14496644295302, |
| "grad_norm": 2.0014491081237793, |
| "learning_rate": 3.249900359094388e-07, |
| "loss": 0.0763, |
| "mean_token_accuracy": 0.9760718226432801, |
| "num_tokens": 17991224.0, |
| "step": 8260 |
| }, |
| { |
| "epoch": 22.171812080536913, |
| "grad_norm": 2.113339900970459, |
| "learning_rate": 3.188431280941529e-07, |
| "loss": 0.0676, |
| "mean_token_accuracy": 0.9763350129127503, |
| "num_tokens": 18014820.0, |
| "step": 8270 |
| }, |
| { |
| "epoch": 22.198657718120806, |
| "grad_norm": 1.4311920404434204, |
| "learning_rate": 3.1275299344778576e-07, |
| "loss": 0.0727, |
| "mean_token_accuracy": 0.9759140908718109, |
| "num_tokens": 18037322.0, |
| "step": 8280 |
| }, |
| { |
| "epoch": 22.2255033557047, |
| "grad_norm": 2.7011280059814453, |
| "learning_rate": 3.067197058316157e-07, |
| "loss": 0.0708, |
| "mean_token_accuracy": 0.9770938664674759, |
| "num_tokens": 18058945.0, |
| "step": 8290 |
| }, |
| { |
| "epoch": 22.252348993288592, |
| "grad_norm": 1.7201358079910278, |
| "learning_rate": 3.007433384174835e-07, |
| "loss": 0.0952, |
| "mean_token_accuracy": 0.9714390218257904, |
| "num_tokens": 18079153.0, |
| "step": 8300 |
| }, |
| { |
| "epoch": 22.27919463087248, |
| "grad_norm": 1.7672648429870605, |
| "learning_rate": 2.948239636868977e-07, |
| "loss": 0.0691, |
| "mean_token_accuracy": 0.9796588033437729, |
| "num_tokens": 18100388.0, |
| "step": 8310 |
| }, |
| { |
| "epoch": 22.306040268456375, |
| "grad_norm": 2.4666810035705566, |
| "learning_rate": 2.889616534301598e-07, |
| "loss": 0.0715, |
| "mean_token_accuracy": 0.9756601005792618, |
| "num_tokens": 18123969.0, |
| "step": 8320 |
| }, |
| { |
| "epoch": 22.332885906040268, |
| "grad_norm": 1.4548521041870117, |
| "learning_rate": 2.831564787454916e-07, |
| "loss": 0.0739, |
| "mean_token_accuracy": 0.9751757919788361, |
| "num_tokens": 18146448.0, |
| "step": 8330 |
| }, |
| { |
| "epoch": 22.35973154362416, |
| "grad_norm": 2.3725903034210205, |
| "learning_rate": 2.774085100381735e-07, |
| "loss": 0.0694, |
| "mean_token_accuracy": 0.9762961208820343, |
| "num_tokens": 18168098.0, |
| "step": 8340 |
| }, |
| { |
| "epoch": 22.386577181208054, |
| "grad_norm": 1.802985668182373, |
| "learning_rate": 2.717178170196916e-07, |
| "loss": 0.0924, |
| "mean_token_accuracy": 0.9725849449634552, |
| "num_tokens": 18188314.0, |
| "step": 8350 |
| }, |
| { |
| "epoch": 22.413422818791947, |
| "grad_norm": 2.313931465148926, |
| "learning_rate": 2.660844687068903e-07, |
| "loss": 0.0682, |
| "mean_token_accuracy": 0.979394719004631, |
| "num_tokens": 18209585.0, |
| "step": 8360 |
| }, |
| { |
| "epoch": 22.44026845637584, |
| "grad_norm": 2.165256977081299, |
| "learning_rate": 2.6050853342113437e-07, |
| "loss": 0.0687, |
| "mean_token_accuracy": 0.9766799122095108, |
| "num_tokens": 18233015.0, |
| "step": 8370 |
| }, |
| { |
| "epoch": 22.467114093959733, |
| "grad_norm": 1.488602876663208, |
| "learning_rate": 2.549900787874876e-07, |
| "loss": 0.0748, |
| "mean_token_accuracy": 0.9740068465471268, |
| "num_tokens": 18255469.0, |
| "step": 8380 |
| }, |
| { |
| "epoch": 22.493959731543626, |
| "grad_norm": 2.5867998600006104, |
| "learning_rate": 2.4952917173387993e-07, |
| "loss": 0.0716, |
| "mean_token_accuracy": 0.9767491906881333, |
| "num_tokens": 18276911.0, |
| "step": 8390 |
| }, |
| { |
| "epoch": 22.520805369127515, |
| "grad_norm": 1.6677324771881104, |
| "learning_rate": 2.4412587849031e-07, |
| "loss": 0.0911, |
| "mean_token_accuracy": 0.9713105499744416, |
| "num_tokens": 18296904.0, |
| "step": 8400 |
| }, |
| { |
| "epoch": 22.54765100671141, |
| "grad_norm": 3.195547342300415, |
| "learning_rate": 2.3878026458803047e-07, |
| "loss": 0.0669, |
| "mean_token_accuracy": 0.9800841093063355, |
| "num_tokens": 18318172.0, |
| "step": 8410 |
| }, |
| { |
| "epoch": 22.5744966442953, |
| "grad_norm": 1.8589850664138794, |
| "learning_rate": 2.3349239485875918e-07, |
| "loss": 0.0679, |
| "mean_token_accuracy": 0.975395730137825, |
| "num_tokens": 18341711.0, |
| "step": 8420 |
| }, |
| { |
| "epoch": 22.601342281879194, |
| "grad_norm": 1.5052769184112549, |
| "learning_rate": 2.282623334338907e-07, |
| "loss": 0.0706, |
| "mean_token_accuracy": 0.974920055270195, |
| "num_tokens": 18364282.0, |
| "step": 8430 |
| }, |
| { |
| "epoch": 22.628187919463087, |
| "grad_norm": 2.210737943649292, |
| "learning_rate": 2.2309014374372106e-07, |
| "loss": 0.0699, |
| "mean_token_accuracy": 0.9770695507526398, |
| "num_tokens": 18385999.0, |
| "step": 8440 |
| }, |
| { |
| "epoch": 22.65503355704698, |
| "grad_norm": 1.5828299522399902, |
| "learning_rate": 2.1797588851667494e-07, |
| "loss": 0.0906, |
| "mean_token_accuracy": 0.9725321441888809, |
| "num_tokens": 18406305.0, |
| "step": 8450 |
| }, |
| { |
| "epoch": 22.681879194630874, |
| "grad_norm": 1.828018307685852, |
| "learning_rate": 2.129196297785474e-07, |
| "loss": 0.0656, |
| "mean_token_accuracy": 0.979384246468544, |
| "num_tokens": 18427591.0, |
| "step": 8460 |
| }, |
| { |
| "epoch": 22.708724832214767, |
| "grad_norm": 1.981468915939331, |
| "learning_rate": 2.079214288517506e-07, |
| "loss": 0.0674, |
| "mean_token_accuracy": 0.9769616097211837, |
| "num_tokens": 18451157.0, |
| "step": 8470 |
| }, |
| { |
| "epoch": 22.735570469798656, |
| "grad_norm": 2.4667210578918457, |
| "learning_rate": 2.029813463545699e-07, |
| "loss": 0.0742, |
| "mean_token_accuracy": 0.9744071394205094, |
| "num_tokens": 18473755.0, |
| "step": 8480 |
| }, |
| { |
| "epoch": 22.76241610738255, |
| "grad_norm": 2.268716812133789, |
| "learning_rate": 1.980994422004312e-07, |
| "loss": 0.0686, |
| "mean_token_accuracy": 0.9764893770217895, |
| "num_tokens": 18495455.0, |
| "step": 8490 |
| }, |
| { |
| "epoch": 22.789261744966442, |
| "grad_norm": 1.5990172624588013, |
| "learning_rate": 1.9327577559716815e-07, |
| "loss": 0.0869, |
| "mean_token_accuracy": 0.9736561328172684, |
| "num_tokens": 18515761.0, |
| "step": 8500 |
| }, |
| { |
| "epoch": 22.816107382550335, |
| "grad_norm": 1.634722352027893, |
| "learning_rate": 1.8851040504631325e-07, |
| "loss": 0.0707, |
| "mean_token_accuracy": 0.9781961172819138, |
| "num_tokens": 18537018.0, |
| "step": 8510 |
| }, |
| { |
| "epoch": 22.842953020134228, |
| "grad_norm": 2.427337408065796, |
| "learning_rate": 1.8380338834237842e-07, |
| "loss": 0.0695, |
| "mean_token_accuracy": 0.9761110007762909, |
| "num_tokens": 18560550.0, |
| "step": 8520 |
| }, |
| { |
| "epoch": 22.86979865771812, |
| "grad_norm": 1.6152406930923462, |
| "learning_rate": 1.79154782572164e-07, |
| "loss": 0.0746, |
| "mean_token_accuracy": 0.9747624099254608, |
| "num_tokens": 18583052.0, |
| "step": 8530 |
| }, |
| { |
| "epoch": 22.896644295302014, |
| "grad_norm": 1.99691641330719, |
| "learning_rate": 1.7456464411405527e-07, |
| "loss": 0.0706, |
| "mean_token_accuracy": 0.9768070250749588, |
| "num_tokens": 18604669.0, |
| "step": 8540 |
| }, |
| { |
| "epoch": 22.923489932885907, |
| "grad_norm": 2.081984043121338, |
| "learning_rate": 1.7003302863735028e-07, |
| "loss": 0.0929, |
| "mean_token_accuracy": 0.9729080408811569, |
| "num_tokens": 18624806.0, |
| "step": 8550 |
| }, |
| { |
| "epoch": 22.9503355704698, |
| "grad_norm": 2.1356115341186523, |
| "learning_rate": 1.655599911015754e-07, |
| "loss": 0.068, |
| "mean_token_accuracy": 0.9790314078330994, |
| "num_tokens": 18645962.0, |
| "step": 8560 |
| }, |
| { |
| "epoch": 22.97718120805369, |
| "grad_norm": 1.4790443181991577, |
| "learning_rate": 1.6114558575582418e-07, |
| "loss": 0.0679, |
| "mean_token_accuracy": 0.9775549441576004, |
| "num_tokens": 18668370.0, |
| "step": 8570 |
| }, |
| { |
| "epoch": 23.00268456375839, |
| "grad_norm": 1.7534834146499634, |
| "learning_rate": 1.5678986613809788e-07, |
| "loss": 0.0758, |
| "mean_token_accuracy": 0.9766696377804405, |
| "num_tokens": 18686993.0, |
| "step": 8580 |
| }, |
| { |
| "epoch": 23.029530201342283, |
| "grad_norm": 1.7369558811187744, |
| "learning_rate": 1.52492885074656e-07, |
| "loss": 0.0661, |
| "mean_token_accuracy": 0.9780718445777893, |
| "num_tokens": 18710832.0, |
| "step": 8590 |
| }, |
| { |
| "epoch": 23.056375838926176, |
| "grad_norm": 1.6902610063552856, |
| "learning_rate": 1.4825469467937336e-07, |
| "loss": 0.08, |
| "mean_token_accuracy": 0.9725459694862366, |
| "num_tokens": 18733644.0, |
| "step": 8600 |
| }, |
| { |
| "epoch": 23.08322147651007, |
| "grad_norm": 1.5242305994033813, |
| "learning_rate": 1.4407534635311415e-07, |
| "loss": 0.0672, |
| "mean_token_accuracy": 0.9776864409446716, |
| "num_tokens": 18755633.0, |
| "step": 8610 |
| }, |
| { |
| "epoch": 23.110067114093958, |
| "grad_norm": 2.102566719055176, |
| "learning_rate": 1.3995489078310055e-07, |
| "loss": 0.0855, |
| "mean_token_accuracy": 0.9734326243400574, |
| "num_tokens": 18776371.0, |
| "step": 8620 |
| }, |
| { |
| "epoch": 23.13691275167785, |
| "grad_norm": 1.5999020338058472, |
| "learning_rate": 1.358933779423066e-07, |
| "loss": 0.0732, |
| "mean_token_accuracy": 0.9789162427186966, |
| "num_tokens": 18796212.0, |
| "step": 8630 |
| }, |
| { |
| "epoch": 23.163758389261744, |
| "grad_norm": 1.7957699298858643, |
| "learning_rate": 1.3189085708884387e-07, |
| "loss": 0.0654, |
| "mean_token_accuracy": 0.976372754573822, |
| "num_tokens": 18819968.0, |
| "step": 8640 |
| }, |
| { |
| "epoch": 23.190604026845637, |
| "grad_norm": 1.6005198955535889, |
| "learning_rate": 1.2794737676536993e-07, |
| "loss": 0.0713, |
| "mean_token_accuracy": 0.9749877661466598, |
| "num_tokens": 18842576.0, |
| "step": 8650 |
| }, |
| { |
| "epoch": 23.21744966442953, |
| "grad_norm": 1.7131704092025757, |
| "learning_rate": 1.24062984798497e-07, |
| "loss": 0.0654, |
| "mean_token_accuracy": 0.978054803609848, |
| "num_tokens": 18864395.0, |
| "step": 8660 |
| }, |
| { |
| "epoch": 23.244295302013423, |
| "grad_norm": 2.195054769515991, |
| "learning_rate": 1.2023772829821202e-07, |
| "loss": 0.0976, |
| "mean_token_accuracy": 0.9702393293380738, |
| "num_tokens": 18884905.0, |
| "step": 8670 |
| }, |
| { |
| "epoch": 23.271140939597316, |
| "grad_norm": 1.513555645942688, |
| "learning_rate": 1.164716536573074e-07, |
| "loss": 0.0731, |
| "mean_token_accuracy": 0.9793914705514908, |
| "num_tokens": 18904747.0, |
| "step": 8680 |
| }, |
| { |
| "epoch": 23.29798657718121, |
| "grad_norm": 1.324312686920166, |
| "learning_rate": 1.1276480655081412e-07, |
| "loss": 0.0649, |
| "mean_token_accuracy": 0.9773990035057067, |
| "num_tokens": 18928841.0, |
| "step": 8690 |
| }, |
| { |
| "epoch": 23.324832214765102, |
| "grad_norm": 4.074679374694824, |
| "learning_rate": 1.091172319354522e-07, |
| "loss": 0.0689, |
| "mean_token_accuracy": 0.9761066138744354, |
| "num_tokens": 18951836.0, |
| "step": 8700 |
| }, |
| { |
| "epoch": 23.351677852348992, |
| "grad_norm": 1.2226976156234741, |
| "learning_rate": 1.0552897404908391e-07, |
| "loss": 0.0659, |
| "mean_token_accuracy": 0.9781163841485977, |
| "num_tokens": 18973843.0, |
| "step": 8710 |
| }, |
| { |
| "epoch": 23.378523489932885, |
| "grad_norm": 4.224064350128174, |
| "learning_rate": 1.0200007641017583e-07, |
| "loss": 0.083, |
| "mean_token_accuracy": 0.9745501130819321, |
| "num_tokens": 18994538.0, |
| "step": 8720 |
| }, |
| { |
| "epoch": 23.405369127516778, |
| "grad_norm": 2.0320780277252197, |
| "learning_rate": 9.853058181727215e-08, |
| "loss": 0.0797, |
| "mean_token_accuracy": 0.9776176422834396, |
| "num_tokens": 19014345.0, |
| "step": 8730 |
| }, |
| { |
| "epoch": 23.43221476510067, |
| "grad_norm": 1.7470709085464478, |
| "learning_rate": 9.512053234847774e-08, |
| "loss": 0.0678, |
| "mean_token_accuracy": 0.9759983509778977, |
| "num_tokens": 19038219.0, |
| "step": 8740 |
| }, |
| { |
| "epoch": 23.459060402684564, |
| "grad_norm": 1.6720378398895264, |
| "learning_rate": 9.176996936094195e-08, |
| "loss": 0.0749, |
| "mean_token_accuracy": 0.9732575833797454, |
| "num_tokens": 19061030.0, |
| "step": 8750 |
| }, |
| { |
| "epoch": 23.485906040268457, |
| "grad_norm": 1.4237157106399536, |
| "learning_rate": 8.847893349036518e-08, |
| "loss": 0.0657, |
| "mean_token_accuracy": 0.9781557589769363, |
| "num_tokens": 19083034.0, |
| "step": 8760 |
| }, |
| { |
| "epoch": 23.51275167785235, |
| "grad_norm": 2.249169111251831, |
| "learning_rate": 8.52474646504986e-08, |
| "loss": 0.0853, |
| "mean_token_accuracy": 0.9733679562807083, |
| "num_tokens": 19103836.0, |
| "step": 8770 |
| }, |
| { |
| "epoch": 23.539597315436243, |
| "grad_norm": 1.4655739068984985, |
| "learning_rate": 8.207560203266462e-08, |
| "loss": 0.0762, |
| "mean_token_accuracy": 0.9779812008142471, |
| "num_tokens": 19123721.0, |
| "step": 8780 |
| }, |
| { |
| "epoch": 23.566442953020136, |
| "grad_norm": 2.0239083766937256, |
| "learning_rate": 7.896338410527948e-08, |
| "loss": 0.0662, |
| "mean_token_accuracy": 0.976813143491745, |
| "num_tokens": 19147587.0, |
| "step": 8790 |
| }, |
| { |
| "epoch": 23.593288590604026, |
| "grad_norm": 1.9883896112442017, |
| "learning_rate": 7.591084861338749e-08, |
| "loss": 0.0681, |
| "mean_token_accuracy": 0.9767774403095245, |
| "num_tokens": 19170369.0, |
| "step": 8800 |
| }, |
| { |
| "epoch": 23.62013422818792, |
| "grad_norm": 1.4335416555404663, |
| "learning_rate": 7.291803257820307e-08, |
| "loss": 0.0706, |
| "mean_token_accuracy": 0.9769880920648575, |
| "num_tokens": 19192301.0, |
| "step": 8810 |
| }, |
| { |
| "epoch": 23.64697986577181, |
| "grad_norm": 1.5386704206466675, |
| "learning_rate": 6.998497229666334e-08, |
| "loss": 0.0836, |
| "mean_token_accuracy": 0.9734211921691894, |
| "num_tokens": 19212985.0, |
| "step": 8820 |
| }, |
| { |
| "epoch": 23.673825503355705, |
| "grad_norm": 1.4760531187057495, |
| "learning_rate": 6.711170334098294e-08, |
| "loss": 0.0671, |
| "mean_token_accuracy": 0.979541563987732, |
| "num_tokens": 19232801.0, |
| "step": 8830 |
| }, |
| { |
| "epoch": 23.700671140939598, |
| "grad_norm": 1.768637776374817, |
| "learning_rate": 6.429826055822985e-08, |
| "loss": 0.0632, |
| "mean_token_accuracy": 0.9791298300027848, |
| "num_tokens": 19256797.0, |
| "step": 8840 |
| }, |
| { |
| "epoch": 23.72751677852349, |
| "grad_norm": 1.8322697877883911, |
| "learning_rate": 6.154467806989639e-08, |
| "loss": 0.0691, |
| "mean_token_accuracy": 0.9751640826463699, |
| "num_tokens": 19279549.0, |
| "step": 8850 |
| }, |
| { |
| "epoch": 23.754362416107384, |
| "grad_norm": 1.681655764579773, |
| "learning_rate": 5.885098927148947e-08, |
| "loss": 0.0626, |
| "mean_token_accuracy": 0.9791503757238388, |
| "num_tokens": 19301365.0, |
| "step": 8860 |
| }, |
| { |
| "epoch": 23.781208053691277, |
| "grad_norm": 2.793856620788574, |
| "learning_rate": 5.6217226832122605e-08, |
| "loss": 0.0892, |
| "mean_token_accuracy": 0.9728416323661804, |
| "num_tokens": 19321863.0, |
| "step": 8870 |
| }, |
| { |
| "epoch": 23.808053691275166, |
| "grad_norm": 1.3198052644729614, |
| "learning_rate": 5.364342269412237e-08, |
| "loss": 0.0747, |
| "mean_token_accuracy": 0.9781924426555634, |
| "num_tokens": 19341690.0, |
| "step": 8880 |
| }, |
| { |
| "epoch": 23.83489932885906, |
| "grad_norm": 1.6709011793136597, |
| "learning_rate": 5.112960807263978e-08, |
| "loss": 0.07, |
| "mean_token_accuracy": 0.9769271492958069, |
| "num_tokens": 19365725.0, |
| "step": 8890 |
| }, |
| { |
| "epoch": 23.861744966442952, |
| "grad_norm": 1.718581199645996, |
| "learning_rate": 4.867581345527117e-08, |
| "loss": 0.0698, |
| "mean_token_accuracy": 0.9760184645652771, |
| "num_tokens": 19388641.0, |
| "step": 8900 |
| }, |
| { |
| "epoch": 23.888590604026845, |
| "grad_norm": 1.6949131488800049, |
| "learning_rate": 4.62820686016896e-08, |
| "loss": 0.0695, |
| "mean_token_accuracy": 0.9765161216259003, |
| "num_tokens": 19410558.0, |
| "step": 8910 |
| }, |
| { |
| "epoch": 23.915436241610738, |
| "grad_norm": 2.4173476696014404, |
| "learning_rate": 4.3948402543282366e-08, |
| "loss": 0.0886, |
| "mean_token_accuracy": 0.9729428887367249, |
| "num_tokens": 19431305.0, |
| "step": 8920 |
| }, |
| { |
| "epoch": 23.94228187919463, |
| "grad_norm": 1.3827898502349854, |
| "learning_rate": 4.167484358280016e-08, |
| "loss": 0.0721, |
| "mean_token_accuracy": 0.9788724452257156, |
| "num_tokens": 19451159.0, |
| "step": 8930 |
| }, |
| { |
| "epoch": 23.969127516778524, |
| "grad_norm": 1.6722241640090942, |
| "learning_rate": 3.946141929401459e-08, |
| "loss": 0.0733, |
| "mean_token_accuracy": 0.9754825055599212, |
| "num_tokens": 19474310.0, |
| "step": 8940 |
| }, |
| { |
| "epoch": 23.995973154362417, |
| "grad_norm": 1.3889379501342773, |
| "learning_rate": 3.730815652138231e-08, |
| "loss": 0.0809, |
| "mean_token_accuracy": 0.9765669792890549, |
| "num_tokens": 19494756.0, |
| "step": 8950 |
| }, |
| { |
| "epoch": 24.021476510067114, |
| "grad_norm": 1.9823448657989502, |
| "learning_rate": 3.521508137971807e-08, |
| "loss": 0.0587, |
| "mean_token_accuracy": 0.9801463171055442, |
| "num_tokens": 19516280.0, |
| "step": 8960 |
| }, |
| { |
| "epoch": 24.048322147651007, |
| "grad_norm": 1.6457539796829224, |
| "learning_rate": 3.3182219253882766e-08, |
| "loss": 0.0644, |
| "mean_token_accuracy": 0.9775006264448166, |
| "num_tokens": 19539394.0, |
| "step": 8970 |
| }, |
| { |
| "epoch": 24.0751677852349, |
| "grad_norm": 1.7060420513153076, |
| "learning_rate": 3.120959479846919e-08, |
| "loss": 0.0658, |
| "mean_token_accuracy": 0.9774825513362885, |
| "num_tokens": 19561443.0, |
| "step": 8980 |
| }, |
| { |
| "epoch": 24.102013422818793, |
| "grad_norm": 2.1901304721832275, |
| "learning_rate": 2.9297231937510107e-08, |
| "loss": 0.0833, |
| "mean_token_accuracy": 0.9740053981542587, |
| "num_tokens": 19582429.0, |
| "step": 8990 |
| }, |
| { |
| "epoch": 24.128859060402686, |
| "grad_norm": 1.383569598197937, |
| "learning_rate": 2.7445153864180674e-08, |
| "loss": 0.0806, |
| "mean_token_accuracy": 0.9769510418176651, |
| "num_tokens": 19601915.0, |
| "step": 9000 |
| }, |
| { |
| "epoch": 24.15570469798658, |
| "grad_norm": 4.088254451751709, |
| "learning_rate": 2.5653383040524228e-08, |
| "loss": 0.0658, |
| "mean_token_accuracy": 0.9784015566110611, |
| "num_tokens": 19625008.0, |
| "step": 9010 |
| }, |
| { |
| "epoch": 24.18255033557047, |
| "grad_norm": 1.6868436336517334, |
| "learning_rate": 2.392194119717417e-08, |
| "loss": 0.071, |
| "mean_token_accuracy": 0.9756916046142579, |
| "num_tokens": 19648138.0, |
| "step": 9020 |
| }, |
| { |
| "epoch": 24.20939597315436, |
| "grad_norm": 1.6547763347625732, |
| "learning_rate": 2.225084933309363e-08, |
| "loss": 0.067, |
| "mean_token_accuracy": 0.976289376616478, |
| "num_tokens": 19670309.0, |
| "step": 9030 |
| }, |
| { |
| "epoch": 24.236241610738254, |
| "grad_norm": 2.064862012863159, |
| "learning_rate": 2.064012771532009e-08, |
| "loss": 0.0829, |
| "mean_token_accuracy": 0.9720039278268814, |
| "num_tokens": 19691223.0, |
| "step": 9040 |
| }, |
| { |
| "epoch": 24.263087248322147, |
| "grad_norm": 1.1820263862609863, |
| "learning_rate": 1.9089795878718953e-08, |
| "loss": 0.083, |
| "mean_token_accuracy": 0.9768513649702072, |
| "num_tokens": 19710783.0, |
| "step": 9050 |
| }, |
| { |
| "epoch": 24.28993288590604, |
| "grad_norm": 3.2465732097625732, |
| "learning_rate": 1.7599872625747583e-08, |
| "loss": 0.0646, |
| "mean_token_accuracy": 0.9789623349905014, |
| "num_tokens": 19734062.0, |
| "step": 9060 |
| }, |
| { |
| "epoch": 24.316778523489933, |
| "grad_norm": 1.9160860776901245, |
| "learning_rate": 1.6170376026226065e-08, |
| "loss": 0.0712, |
| "mean_token_accuracy": 0.975100401043892, |
| "num_tokens": 19757347.0, |
| "step": 9070 |
| }, |
| { |
| "epoch": 24.343624161073826, |
| "grad_norm": 1.6455693244934082, |
| "learning_rate": 1.4801323417119595e-08, |
| "loss": 0.0684, |
| "mean_token_accuracy": 0.9768495559692383, |
| "num_tokens": 19779529.0, |
| "step": 9080 |
| }, |
| { |
| "epoch": 24.37046979865772, |
| "grad_norm": 3.2963931560516357, |
| "learning_rate": 1.3492731402326431e-08, |
| "loss": 0.086, |
| "mean_token_accuracy": 0.9731739908456802, |
| "num_tokens": 19800407.0, |
| "step": 9090 |
| }, |
| { |
| "epoch": 24.397315436241612, |
| "grad_norm": 1.495845913887024, |
| "learning_rate": 1.2244615852479158e-08, |
| "loss": 0.0828, |
| "mean_token_accuracy": 0.9758796036243439, |
| "num_tokens": 19819948.0, |
| "step": 9100 |
| }, |
| { |
| "epoch": 24.424161073825502, |
| "grad_norm": 2.0350308418273926, |
| "learning_rate": 1.1056991904748182e-08, |
| "loss": 0.07, |
| "mean_token_accuracy": 0.9763268619775772, |
| "num_tokens": 19843153.0, |
| "step": 9110 |
| }, |
| { |
| "epoch": 24.451006711409395, |
| "grad_norm": 1.525877833366394, |
| "learning_rate": 9.929873962661873e-09, |
| "loss": 0.0722, |
| "mean_token_accuracy": 0.9746161639690399, |
| "num_tokens": 19866266.0, |
| "step": 9120 |
| }, |
| { |
| "epoch": 24.477852348993288, |
| "grad_norm": 1.7038556337356567, |
| "learning_rate": 8.86327569593115e-09, |
| "loss": 0.0665, |
| "mean_token_accuracy": 0.9768165737390518, |
| "num_tokens": 19888403.0, |
| "step": 9130 |
| }, |
| { |
| "epoch": 24.50469798657718, |
| "grad_norm": 4.7443366050720215, |
| "learning_rate": 7.857210040281838e-09, |
| "loss": 0.0799, |
| "mean_token_accuracy": 0.9739553958177567, |
| "num_tokens": 19909460.0, |
| "step": 9140 |
| }, |
| { |
| "epoch": 24.531543624161074, |
| "grad_norm": 1.6406482458114624, |
| "learning_rate": 6.9116891972986766e-09, |
| "loss": 0.0787, |
| "mean_token_accuracy": 0.9759374588727951, |
| "num_tokens": 19928999.0, |
| "step": 9150 |
| }, |
| { |
| "epoch": 24.558389261744967, |
| "grad_norm": 1.7716859579086304, |
| "learning_rate": 6.026724634279335e-09, |
| "loss": 0.0651, |
| "mean_token_accuracy": 0.978615865111351, |
| "num_tokens": 19952230.0, |
| "step": 9160 |
| }, |
| { |
| "epoch": 24.58523489932886, |
| "grad_norm": 1.912937045097351, |
| "learning_rate": 5.20232708409174e-09, |
| "loss": 0.0694, |
| "mean_token_accuracy": 0.9758041888475418, |
| "num_tokens": 19975383.0, |
| "step": 9170 |
| }, |
| { |
| "epoch": 24.612080536912753, |
| "grad_norm": 1.484447717666626, |
| "learning_rate": 4.438506545046961e-09, |
| "loss": 0.0655, |
| "mean_token_accuracy": 0.9781613409519195, |
| "num_tokens": 19997602.0, |
| "step": 9180 |
| }, |
| { |
| "epoch": 24.638926174496646, |
| "grad_norm": 2.091848134994507, |
| "learning_rate": 3.73527228077708e-09, |
| "loss": 0.0741, |
| "mean_token_accuracy": 0.9758224546909332, |
| "num_tokens": 20018904.0, |
| "step": 9190 |
| }, |
| { |
| "epoch": 24.665771812080536, |
| "grad_norm": 1.6126073598861694, |
| "learning_rate": 3.0926328201213996e-09, |
| "loss": 0.0821, |
| "mean_token_accuracy": 0.9765258640050888, |
| "num_tokens": 20038577.0, |
| "step": 9200 |
| }, |
| { |
| "epoch": 24.69261744966443, |
| "grad_norm": 2.2654337882995605, |
| "learning_rate": 2.510595957025408e-09, |
| "loss": 0.0664, |
| "mean_token_accuracy": 0.9781230628490448, |
| "num_tokens": 20061623.0, |
| "step": 9210 |
| }, |
| { |
| "epoch": 24.71946308724832, |
| "grad_norm": 1.9814746379852295, |
| "learning_rate": 1.9891687504436373e-09, |
| "loss": 0.0701, |
| "mean_token_accuracy": 0.9765848129987716, |
| "num_tokens": 20084607.0, |
| "step": 9220 |
| }, |
| { |
| "epoch": 24.746308724832215, |
| "grad_norm": 1.474151372909546, |
| "learning_rate": 1.5283575242569514e-09, |
| "loss": 0.0732, |
| "mean_token_accuracy": 0.9761769473552704, |
| "num_tokens": 20106707.0, |
| "step": 9230 |
| }, |
| { |
| "epoch": 24.773154362416108, |
| "grad_norm": 3.660369634628296, |
| "learning_rate": 1.1281678671931639e-09, |
| "loss": 0.0765, |
| "mean_token_accuracy": 0.9770137190818786, |
| "num_tokens": 20127881.0, |
| "step": 9240 |
| }, |
| { |
| "epoch": 24.8, |
| "grad_norm": 1.50706946849823, |
| "learning_rate": 7.886046327609809e-10, |
| "loss": 0.0816, |
| "mean_token_accuracy": 0.9761880010366439, |
| "num_tokens": 20147458.0, |
| "step": 9250 |
| }, |
| { |
| "epoch": 24.826845637583894, |
| "grad_norm": 1.9096148014068604, |
| "learning_rate": 5.096719391900484e-10, |
| "loss": 0.0637, |
| "mean_token_accuracy": 0.9798777759075165, |
| "num_tokens": 20170537.0, |
| "step": 9260 |
| }, |
| { |
| "epoch": 24.853691275167787, |
| "grad_norm": 4.045505523681641, |
| "learning_rate": 2.9137316938265826e-10, |
| "loss": 0.0722, |
| "mean_token_accuracy": 0.9747423082590103, |
| "num_tokens": 20193545.0, |
| "step": 9270 |
| }, |
| { |
| "epoch": 24.880536912751676, |
| "grad_norm": 1.777500867843628, |
| "learning_rate": 1.337109708704487e-10, |
| "loss": 0.071, |
| "mean_token_accuracy": 0.9765003561973572, |
| "num_tokens": 20215713.0, |
| "step": 9280 |
| }, |
| { |
| "epoch": 24.90738255033557, |
| "grad_norm": 2.0628652572631836, |
| "learning_rate": 3.6687255783873775e-11, |
| "loss": 0.0722, |
| "mean_token_accuracy": 0.9766353726387024, |
| "num_tokens": 20237151.0, |
| "step": 9290 |
| }, |
| { |
| "epoch": 24.934228187919462, |
| "grad_norm": 1.4208660125732422, |
| "learning_rate": 3.0320082888835036e-13, |
| "loss": 0.087, |
| "mean_token_accuracy": 0.9745760560035706, |
| "num_tokens": 20256982.0, |
| "step": 9300 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 9300, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 25, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4.462878792121057e+17, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |