| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.849759401176261, |
| "eval_steps": 100, |
| "global_step": 2000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.014257708073427196, |
| "grad_norm": 80.31977081298828, |
| "learning_rate": 4.265402843601896e-07, |
| "loss": 1.7215, |
| "mean_token_accuracy": 0.4386619367171079, |
| "num_tokens": 81920.0, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.02851541614685439, |
| "grad_norm": 5.226085186004639, |
| "learning_rate": 9.004739336492892e-07, |
| "loss": 0.4476, |
| "mean_token_accuracy": 0.7448018610477447, |
| "num_tokens": 163840.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.04277312422028159, |
| "grad_norm": 1.4328534603118896, |
| "learning_rate": 1.3744075829383887e-06, |
| "loss": 0.293, |
| "mean_token_accuracy": 0.7609466724097729, |
| "num_tokens": 245760.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.05703083229370878, |
| "grad_norm": 1.468935251235962, |
| "learning_rate": 1.8483412322274883e-06, |
| "loss": 0.2499, |
| "mean_token_accuracy": 0.7796599786728621, |
| "num_tokens": 327680.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.07128854036713599, |
| "grad_norm": 1.6373568773269653, |
| "learning_rate": 2.322274881516588e-06, |
| "loss": 0.225, |
| "mean_token_accuracy": 0.7888087052851915, |
| "num_tokens": 409600.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.08554624844056317, |
| "grad_norm": 1.3717502355575562, |
| "learning_rate": 2.7962085308056874e-06, |
| "loss": 0.2301, |
| "mean_token_accuracy": 0.7796599812805652, |
| "num_tokens": 491520.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.09980395651399038, |
| "grad_norm": 1.4575324058532715, |
| "learning_rate": 3.2701421800947867e-06, |
| "loss": 0.2165, |
| "mean_token_accuracy": 0.7822773970663548, |
| "num_tokens": 573440.0, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.11406166458741757, |
| "grad_norm": 1.7066882848739624, |
| "learning_rate": 3.7440758293838865e-06, |
| "loss": 0.2484, |
| "mean_token_accuracy": 0.7653864935040474, |
| "num_tokens": 655360.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.12831937266084478, |
| "grad_norm": 1.3280051946640015, |
| "learning_rate": 4.218009478672986e-06, |
| "loss": 0.2092, |
| "mean_token_accuracy": 0.8033390413969755, |
| "num_tokens": 737280.0, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.14257708073427197, |
| "grad_norm": 1.1487761735916138, |
| "learning_rate": 4.691943127962086e-06, |
| "loss": 0.2303, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.14257708073427197, |
| "eval_loss": 0.4363424479961395, |
| "eval_mean_token_accuracy": 0.9056754631873889, |
| "eval_num_tokens": 819200.0, |
| "eval_runtime": 41.3394, |
| "eval_samples_per_second": 30.165, |
| "eval_steps_per_second": 1.887, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.15683478880769916, |
| "grad_norm": 1.2161787748336792, |
| "learning_rate": 5.165876777251185e-06, |
| "loss": 0.1977, |
| "mean_token_accuracy": 0.7886374732479453, |
| "num_tokens": 901120.0, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.17109249688112635, |
| "grad_norm": 1.2622631788253784, |
| "learning_rate": 5.639810426540285e-06, |
| "loss": 0.272, |
| "mean_token_accuracy": 0.7660469669848681, |
| "num_tokens": 983040.0, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.18535020495455357, |
| "grad_norm": 1.6876330375671387, |
| "learning_rate": 6.1137440758293845e-06, |
| "loss": 0.2425, |
| "mean_token_accuracy": 0.7557363010942936, |
| "num_tokens": 1064960.0, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.19960791302798075, |
| "grad_norm": 1.7104541063308716, |
| "learning_rate": 6.587677725118484e-06, |
| "loss": 0.2536, |
| "mean_token_accuracy": 0.7589774951338768, |
| "num_tokens": 1146880.0, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.21386562110140794, |
| "grad_norm": 1.7957197427749634, |
| "learning_rate": 7.061611374407583e-06, |
| "loss": 0.2307, |
| "mean_token_accuracy": 0.7757460869848728, |
| "num_tokens": 1228800.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.22812332917483513, |
| "grad_norm": 1.3486217260360718, |
| "learning_rate": 7.535545023696683e-06, |
| "loss": 0.2096, |
| "mean_token_accuracy": 0.8065435424447059, |
| "num_tokens": 1310720.0, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.24238103724826235, |
| "grad_norm": 1.7759106159210205, |
| "learning_rate": 8.009478672985783e-06, |
| "loss": 0.2394, |
| "mean_token_accuracy": 0.7681506846100092, |
| "num_tokens": 1392640.0, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.25663874532168957, |
| "grad_norm": 1.4754517078399658, |
| "learning_rate": 8.483412322274883e-06, |
| "loss": 0.2284, |
| "mean_token_accuracy": 0.7852617405354977, |
| "num_tokens": 1474560.0, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.27089645339511675, |
| "grad_norm": 1.4712748527526855, |
| "learning_rate": 8.957345971563981e-06, |
| "loss": 0.1902, |
| "mean_token_accuracy": 0.7985200572758913, |
| "num_tokens": 1556480.0, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.28515416146854394, |
| "grad_norm": 1.8678069114685059, |
| "learning_rate": 9.431279620853082e-06, |
| "loss": 0.2482, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.28515416146854394, |
| "eval_loss": 0.4305071532726288, |
| "eval_mean_token_accuracy": 0.9062988207890437, |
| "eval_num_tokens": 1638400.0, |
| "eval_runtime": 41.308, |
| "eval_samples_per_second": 30.188, |
| "eval_steps_per_second": 1.888, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.29941186954197113, |
| "grad_norm": 1.591244101524353, |
| "learning_rate": 9.905213270142182e-06, |
| "loss": 0.219, |
| "mean_token_accuracy": 0.7843933456577361, |
| "num_tokens": 1720320.0, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.3136695776153983, |
| "grad_norm": 2.2734687328338623, |
| "learning_rate": 9.95778364116095e-06, |
| "loss": 0.2099, |
| "mean_token_accuracy": 0.7872798424214125, |
| "num_tokens": 1802240.0, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.3279272856888255, |
| "grad_norm": 1.6370080709457397, |
| "learning_rate": 9.905013192612138e-06, |
| "loss": 0.192, |
| "mean_token_accuracy": 0.7989359103143215, |
| "num_tokens": 1884160.0, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.3421849937622527, |
| "grad_norm": 1.4441734552383423, |
| "learning_rate": 9.852242744063325e-06, |
| "loss": 0.2143, |
| "mean_token_accuracy": 0.808133564144373, |
| "num_tokens": 1966080.0, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.3564427018356799, |
| "grad_norm": 1.1315490007400513, |
| "learning_rate": 9.799472295514513e-06, |
| "loss": 0.1918, |
| "mean_token_accuracy": 0.7888331696391105, |
| "num_tokens": 2048000.0, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.37070040990910713, |
| "grad_norm": 0.9603747725486755, |
| "learning_rate": 9.7467018469657e-06, |
| "loss": 0.1924, |
| "mean_token_accuracy": 0.8039628200232982, |
| "num_tokens": 2129920.0, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.3849581179825343, |
| "grad_norm": 1.666515827178955, |
| "learning_rate": 9.693931398416887e-06, |
| "loss": 0.1782, |
| "mean_token_accuracy": 0.803094419836998, |
| "num_tokens": 2211840.0, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.3992158260559615, |
| "grad_norm": 1.8663837909698486, |
| "learning_rate": 9.641160949868074e-06, |
| "loss": 0.2374, |
| "mean_token_accuracy": 0.7656678084284068, |
| "num_tokens": 2293760.0, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.4134735341293887, |
| "grad_norm": 1.5346232652664185, |
| "learning_rate": 9.588390501319263e-06, |
| "loss": 0.2, |
| "mean_token_accuracy": 0.7928816072642804, |
| "num_tokens": 2375680.0, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.4277312422028159, |
| "grad_norm": 1.631354570388794, |
| "learning_rate": 9.53562005277045e-06, |
| "loss": 0.2238, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.4277312422028159, |
| "eval_loss": 0.42917218804359436, |
| "eval_mean_token_accuracy": 0.9063309125411205, |
| "eval_num_tokens": 2457600.0, |
| "eval_runtime": 41.3603, |
| "eval_samples_per_second": 30.15, |
| "eval_steps_per_second": 1.886, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.4419889502762431, |
| "grad_norm": 1.4803297519683838, |
| "learning_rate": 9.482849604221636e-06, |
| "loss": 0.2441, |
| "mean_token_accuracy": 0.7797455977648496, |
| "num_tokens": 2539520.0, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.45624665834967026, |
| "grad_norm": 1.4234380722045898, |
| "learning_rate": 9.430079155672825e-06, |
| "loss": 0.2552, |
| "mean_token_accuracy": 0.7678449124097824, |
| "num_tokens": 2621440.0, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.4705043664230975, |
| "grad_norm": 1.5810368061065674, |
| "learning_rate": 9.37730870712401e-06, |
| "loss": 0.225, |
| "mean_token_accuracy": 0.7840631131082774, |
| "num_tokens": 2703360.0, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.4847620744965247, |
| "grad_norm": 1.540651559829712, |
| "learning_rate": 9.324538258575199e-06, |
| "loss": 0.1879, |
| "mean_token_accuracy": 0.7923556782305241, |
| "num_tokens": 2785280.0, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.4990197825699519, |
| "grad_norm": 1.7700861692428589, |
| "learning_rate": 9.271767810026386e-06, |
| "loss": 0.228, |
| "mean_token_accuracy": 0.7943493168801069, |
| "num_tokens": 2867200.0, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.5132774906433791, |
| "grad_norm": 1.2500240802764893, |
| "learning_rate": 9.218997361477573e-06, |
| "loss": 0.2557, |
| "mean_token_accuracy": 0.7765655562281608, |
| "num_tokens": 2949120.0, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.5275351987168063, |
| "grad_norm": 1.4669880867004395, |
| "learning_rate": 9.166226912928761e-06, |
| "loss": 0.2077, |
| "mean_token_accuracy": 0.7948140885680914, |
| "num_tokens": 3031040.0, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.5417929067902335, |
| "grad_norm": 1.4171335697174072, |
| "learning_rate": 9.113456464379948e-06, |
| "loss": 0.2263, |
| "mean_token_accuracy": 0.7792441301047802, |
| "num_tokens": 3112960.0, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.5560506148636607, |
| "grad_norm": 1.6694875955581665, |
| "learning_rate": 9.060686015831135e-06, |
| "loss": 0.1963, |
| "mean_token_accuracy": 0.8029109600931406, |
| "num_tokens": 3194880.0, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.5703083229370879, |
| "grad_norm": 1.242436170578003, |
| "learning_rate": 9.007915567282322e-06, |
| "loss": 0.2046, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.5703083229370879, |
| "eval_loss": 0.4270094931125641, |
| "eval_mean_token_accuracy": 0.9065381089846293, |
| "eval_num_tokens": 3276800.0, |
| "eval_runtime": 41.4607, |
| "eval_samples_per_second": 30.077, |
| "eval_steps_per_second": 1.881, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.5845660310105151, |
| "grad_norm": 1.7970621585845947, |
| "learning_rate": 8.95514511873351e-06, |
| "loss": 0.2489, |
| "mean_token_accuracy": 0.7863931017927825, |
| "num_tokens": 3358720.0, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.5988237390839423, |
| "grad_norm": 1.1633917093276978, |
| "learning_rate": 8.902374670184698e-06, |
| "loss": 0.2433, |
| "mean_token_accuracy": 0.7638698622584343, |
| "num_tokens": 3440640.0, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.6130814471573695, |
| "grad_norm": 1.1505024433135986, |
| "learning_rate": 8.849604221635884e-06, |
| "loss": 0.2055, |
| "mean_token_accuracy": 0.7901785716414451, |
| "num_tokens": 3522560.0, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.6273391552307966, |
| "grad_norm": 1.0602478981018066, |
| "learning_rate": 8.796833773087073e-06, |
| "loss": 0.2245, |
| "mean_token_accuracy": 0.7706947140395641, |
| "num_tokens": 3604480.0, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.6415968633042238, |
| "grad_norm": 1.772160291671753, |
| "learning_rate": 8.744063324538258e-06, |
| "loss": 0.1874, |
| "mean_token_accuracy": 0.8206702545285225, |
| "num_tokens": 3686400.0, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.655854571377651, |
| "grad_norm": 1.3542793989181519, |
| "learning_rate": 8.691292875989447e-06, |
| "loss": 0.1848, |
| "mean_token_accuracy": 0.804293054714799, |
| "num_tokens": 3768320.0, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.6701122794510782, |
| "grad_norm": 1.0653384923934937, |
| "learning_rate": 8.638522427440634e-06, |
| "loss": 0.235, |
| "mean_token_accuracy": 0.7650929525494575, |
| "num_tokens": 3850240.0, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.6843699875245054, |
| "grad_norm": 1.6029295921325684, |
| "learning_rate": 8.58575197889182e-06, |
| "loss": 0.1925, |
| "mean_token_accuracy": 0.8006971623748541, |
| "num_tokens": 3932160.0, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.6986276955979326, |
| "grad_norm": 1.4445295333862305, |
| "learning_rate": 8.53298153034301e-06, |
| "loss": 0.2081, |
| "mean_token_accuracy": 0.7856042090803385, |
| "num_tokens": 4014080.0, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.7128854036713598, |
| "grad_norm": 1.2714146375656128, |
| "learning_rate": 8.480211081794196e-06, |
| "loss": 0.1993, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.7128854036713598, |
| "eval_loss": 0.4256138503551483, |
| "eval_mean_token_accuracy": 0.9067788590223361, |
| "eval_num_tokens": 4096000.0, |
| "eval_runtime": 41.1978, |
| "eval_samples_per_second": 30.269, |
| "eval_steps_per_second": 1.893, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.7271431117447871, |
| "grad_norm": 1.3898621797561646, |
| "learning_rate": 8.427440633245383e-06, |
| "loss": 0.2309, |
| "mean_token_accuracy": 0.7859711354598403, |
| "num_tokens": 4177920.0, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.7414008198182143, |
| "grad_norm": 1.7414770126342773, |
| "learning_rate": 8.37467018469657e-06, |
| "loss": 0.184, |
| "mean_token_accuracy": 0.8112646777182817, |
| "num_tokens": 4259840.0, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.7556585278916415, |
| "grad_norm": 1.3587582111358643, |
| "learning_rate": 8.321899736147759e-06, |
| "loss": 0.1971, |
| "mean_token_accuracy": 0.8050636004656553, |
| "num_tokens": 4341760.0, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.7699162359650686, |
| "grad_norm": 1.324808120727539, |
| "learning_rate": 8.269129287598946e-06, |
| "loss": 0.2105, |
| "mean_token_accuracy": 0.7933341484516859, |
| "num_tokens": 4423680.0, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.7841739440384958, |
| "grad_norm": 1.2644426822662354, |
| "learning_rate": 8.216358839050133e-06, |
| "loss": 0.2382, |
| "mean_token_accuracy": 0.7831457942724228, |
| "num_tokens": 4505600.0, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.798431652111923, |
| "grad_norm": 1.3145424127578735, |
| "learning_rate": 8.16358839050132e-06, |
| "loss": 0.2081, |
| "mean_token_accuracy": 0.7934319969266653, |
| "num_tokens": 4587520.0, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.8126893601853502, |
| "grad_norm": 1.205351710319519, |
| "learning_rate": 8.110817941952506e-06, |
| "loss": 0.2078, |
| "mean_token_accuracy": 0.7768101781606674, |
| "num_tokens": 4669440.0, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.8269470682587774, |
| "grad_norm": 1.2244083881378174, |
| "learning_rate": 8.058047493403695e-06, |
| "loss": 0.2068, |
| "mean_token_accuracy": 0.7885518573224545, |
| "num_tokens": 4751360.0, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.8412047763322046, |
| "grad_norm": 1.6164205074310303, |
| "learning_rate": 8.005277044854882e-06, |
| "loss": 0.1852, |
| "mean_token_accuracy": 0.7931751444935798, |
| "num_tokens": 4833280.0, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.8554624844056318, |
| "grad_norm": 1.507039189338684, |
| "learning_rate": 7.952506596306069e-06, |
| "loss": 0.2409, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.8554624844056318, |
| "eval_loss": 0.42402541637420654, |
| "eval_mean_token_accuracy": 0.9070860980412899, |
| "eval_num_tokens": 4915200.0, |
| "eval_runtime": 41.2209, |
| "eval_samples_per_second": 30.252, |
| "eval_steps_per_second": 1.892, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.869720192479059, |
| "grad_norm": 1.43611478805542, |
| "learning_rate": 7.899736147757256e-06, |
| "loss": 0.1694, |
| "mean_token_accuracy": 0.7737157551571727, |
| "num_tokens": 4997120.0, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.8839779005524862, |
| "grad_norm": 1.1565167903900146, |
| "learning_rate": 7.846965699208444e-06, |
| "loss": 0.1831, |
| "mean_token_accuracy": 0.8027274951338768, |
| "num_tokens": 5079040.0, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.8982356086259133, |
| "grad_norm": 1.421459674835205, |
| "learning_rate": 7.794195250659631e-06, |
| "loss": 0.1821, |
| "mean_token_accuracy": 0.8165117412805557, |
| "num_tokens": 5160960.0, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.9124933166993405, |
| "grad_norm": 1.2855138778686523, |
| "learning_rate": 7.741424802110818e-06, |
| "loss": 0.1795, |
| "mean_token_accuracy": 0.8006971597671508, |
| "num_tokens": 5242880.0, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.9267510247727678, |
| "grad_norm": 1.2338076829910278, |
| "learning_rate": 7.688654353562007e-06, |
| "loss": 0.1988, |
| "mean_token_accuracy": 0.7919031333178281, |
| "num_tokens": 5324800.0, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.941008732846195, |
| "grad_norm": 1.3217254877090454, |
| "learning_rate": 7.635883905013192e-06, |
| "loss": 0.2178, |
| "mean_token_accuracy": 0.7840019542723894, |
| "num_tokens": 5406720.0, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.9552664409196222, |
| "grad_norm": 1.0036381483078003, |
| "learning_rate": 7.583113456464381e-06, |
| "loss": 0.2076, |
| "mean_token_accuracy": 0.80275196172297, |
| "num_tokens": 5488640.0, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.9695241489930494, |
| "grad_norm": 1.1931681632995605, |
| "learning_rate": 7.5303430079155685e-06, |
| "loss": 0.1649, |
| "mean_token_accuracy": 0.8191291578114033, |
| "num_tokens": 5570560.0, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.9837818570664766, |
| "grad_norm": 1.2504717111587524, |
| "learning_rate": 7.4775725593667545e-06, |
| "loss": 0.214, |
| "mean_token_accuracy": 0.7973703525960445, |
| "num_tokens": 5652480.0, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.9980395651399038, |
| "grad_norm": 1.1769760847091675, |
| "learning_rate": 7.424802110817942e-06, |
| "loss": 0.1853, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.9980395651399038, |
| "eval_loss": 0.42192551493644714, |
| "eval_mean_token_accuracy": 0.9072852402161329, |
| "eval_num_tokens": 5734400.0, |
| "eval_runtime": 41.2475, |
| "eval_samples_per_second": 30.232, |
| "eval_steps_per_second": 1.891, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.0114061664587417, |
| "grad_norm": 1.4164894819259644, |
| "learning_rate": 7.37203166226913e-06, |
| "loss": 0.1644, |
| "mean_token_accuracy": 0.7994192252236028, |
| "num_tokens": 5810688.0, |
| "step": 710 |
| }, |
| { |
| "epoch": 1.025663874532169, |
| "grad_norm": 1.4920251369476318, |
| "learning_rate": 7.319261213720317e-06, |
| "loss": 0.2075, |
| "mean_token_accuracy": 0.771135026961565, |
| "num_tokens": 5892608.0, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.039921582605596, |
| "grad_norm": 1.4728736877441406, |
| "learning_rate": 7.266490765171505e-06, |
| "loss": 0.2103, |
| "mean_token_accuracy": 0.7777886509895324, |
| "num_tokens": 5974528.0, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.0541792906790233, |
| "grad_norm": 1.3879398107528687, |
| "learning_rate": 7.2137203166226925e-06, |
| "loss": 0.1873, |
| "mean_token_accuracy": 0.810420742072165, |
| "num_tokens": 6056448.0, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.0684369987524505, |
| "grad_norm": 1.2428431510925293, |
| "learning_rate": 7.160949868073879e-06, |
| "loss": 0.179, |
| "mean_token_accuracy": 0.8042074333876371, |
| "num_tokens": 6138368.0, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.0826947068258777, |
| "grad_norm": 1.2117047309875488, |
| "learning_rate": 7.108179419525066e-06, |
| "loss": 0.1715, |
| "mean_token_accuracy": 0.7988258298486471, |
| "num_tokens": 6220288.0, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.0969524148993048, |
| "grad_norm": 1.0141360759735107, |
| "learning_rate": 7.055408970976254e-06, |
| "loss": 0.1622, |
| "mean_token_accuracy": 0.7959515653550625, |
| "num_tokens": 6302208.0, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.1112101229727323, |
| "grad_norm": 1.3603819608688354, |
| "learning_rate": 7.002638522427441e-06, |
| "loss": 0.1843, |
| "mean_token_accuracy": 0.7814090006053448, |
| "num_tokens": 6384128.0, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.1254678310461594, |
| "grad_norm": 1.2628884315490723, |
| "learning_rate": 6.949868073878628e-06, |
| "loss": 0.1975, |
| "mean_token_accuracy": 0.7898238770663738, |
| "num_tokens": 6466048.0, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.1397255391195866, |
| "grad_norm": 1.3789145946502686, |
| "learning_rate": 6.897097625329816e-06, |
| "loss": 0.188, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.1397255391195866, |
| "eval_loss": 0.42303529381752014, |
| "eval_mean_token_accuracy": 0.9073893580681238, |
| "eval_num_tokens": 6547968.0, |
| "eval_runtime": 41.1713, |
| "eval_samples_per_second": 30.288, |
| "eval_steps_per_second": 1.895, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.1539832471930138, |
| "grad_norm": 1.688471794128418, |
| "learning_rate": 6.844327176781003e-06, |
| "loss": 0.1963, |
| "mean_token_accuracy": 0.7926553322002292, |
| "num_tokens": 6629888.0, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.168240955266441, |
| "grad_norm": 1.4517184495925903, |
| "learning_rate": 6.7915567282321904e-06, |
| "loss": 0.2068, |
| "mean_token_accuracy": 0.7929794482886792, |
| "num_tokens": 6711808.0, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.1824986633398682, |
| "grad_norm": 1.250712275505066, |
| "learning_rate": 6.738786279683378e-06, |
| "loss": 0.2249, |
| "mean_token_accuracy": 0.7757338518276811, |
| "num_tokens": 6793728.0, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.1967563714132954, |
| "grad_norm": 1.5452476739883423, |
| "learning_rate": 6.686015831134564e-06, |
| "loss": 0.1976, |
| "mean_token_accuracy": 0.7758683927357197, |
| "num_tokens": 6875648.0, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.2110140794867226, |
| "grad_norm": 1.7196730375289917, |
| "learning_rate": 6.633245382585752e-06, |
| "loss": 0.1876, |
| "mean_token_accuracy": 0.7862769093364477, |
| "num_tokens": 6957568.0, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.2252717875601498, |
| "grad_norm": 1.0415942668914795, |
| "learning_rate": 6.58047493403694e-06, |
| "loss": 0.1789, |
| "mean_token_accuracy": 0.8055528394877911, |
| "num_tokens": 7039488.0, |
| "step": 860 |
| }, |
| { |
| "epoch": 1.239529495633577, |
| "grad_norm": 1.075972080230713, |
| "learning_rate": 6.527704485488127e-06, |
| "loss": 0.1741, |
| "mean_token_accuracy": 0.788759783655405, |
| "num_tokens": 7121408.0, |
| "step": 870 |
| }, |
| { |
| "epoch": 1.2537872037070041, |
| "grad_norm": 1.3330209255218506, |
| "learning_rate": 6.4749340369393145e-06, |
| "loss": 0.1803, |
| "mean_token_accuracy": 0.792331212386489, |
| "num_tokens": 7203328.0, |
| "step": 880 |
| }, |
| { |
| "epoch": 1.2680449117804313, |
| "grad_norm": 1.4352701902389526, |
| "learning_rate": 6.422163588390502e-06, |
| "loss": 0.201, |
| "mean_token_accuracy": 0.7732142828404903, |
| "num_tokens": 7285248.0, |
| "step": 890 |
| }, |
| { |
| "epoch": 1.2823026198538585, |
| "grad_norm": 1.189164638519287, |
| "learning_rate": 6.36939313984169e-06, |
| "loss": 0.1709, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.2823026198538585, |
| "eval_loss": 0.4221397936344147, |
| "eval_mean_token_accuracy": 0.9075833811209753, |
| "eval_num_tokens": 7367168.0, |
| "eval_runtime": 41.1163, |
| "eval_samples_per_second": 30.329, |
| "eval_steps_per_second": 1.897, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.2965603279272857, |
| "grad_norm": 1.0476454496383667, |
| "learning_rate": 6.316622691292876e-06, |
| "loss": 0.1622, |
| "mean_token_accuracy": 0.8043847857043147, |
| "num_tokens": 7449088.0, |
| "step": 910 |
| }, |
| { |
| "epoch": 1.310818036000713, |
| "grad_norm": 1.4576106071472168, |
| "learning_rate": 6.263852242744064e-06, |
| "loss": 0.1771, |
| "mean_token_accuracy": 0.7940190762281418, |
| "num_tokens": 7531008.0, |
| "step": 920 |
| }, |
| { |
| "epoch": 1.32507574407414, |
| "grad_norm": 1.3801617622375488, |
| "learning_rate": 6.211081794195252e-06, |
| "loss": 0.1683, |
| "mean_token_accuracy": 0.8146893348544836, |
| "num_tokens": 7612928.0, |
| "step": 930 |
| }, |
| { |
| "epoch": 1.3393334521475673, |
| "grad_norm": 1.3410853147506714, |
| "learning_rate": 6.1583113456464385e-06, |
| "loss": 0.1822, |
| "mean_token_accuracy": 0.7949853200465441, |
| "num_tokens": 7694848.0, |
| "step": 940 |
| }, |
| { |
| "epoch": 1.3535911602209945, |
| "grad_norm": 1.3625820875167847, |
| "learning_rate": 6.105540897097626e-06, |
| "loss": 0.1851, |
| "mean_token_accuracy": 0.7908635035157203, |
| "num_tokens": 7776768.0, |
| "step": 950 |
| }, |
| { |
| "epoch": 1.3678488682944216, |
| "grad_norm": 1.2172579765319824, |
| "learning_rate": 6.052770448548814e-06, |
| "loss": 0.1819, |
| "mean_token_accuracy": 0.7890900187194347, |
| "num_tokens": 7858688.0, |
| "step": 960 |
| }, |
| { |
| "epoch": 1.3821065763678488, |
| "grad_norm": 1.2454630136489868, |
| "learning_rate": 6e-06, |
| "loss": 0.1972, |
| "mean_token_accuracy": 0.791903131455183, |
| "num_tokens": 7940608.0, |
| "step": 970 |
| }, |
| { |
| "epoch": 1.396364284441276, |
| "grad_norm": 1.0972909927368164, |
| "learning_rate": 5.947229551451188e-06, |
| "loss": 0.1745, |
| "mean_token_accuracy": 0.795584636926651, |
| "num_tokens": 8022528.0, |
| "step": 980 |
| }, |
| { |
| "epoch": 1.4106219925147032, |
| "grad_norm": 1.3013101816177368, |
| "learning_rate": 5.894459102902376e-06, |
| "loss": 0.2051, |
| "mean_token_accuracy": 0.7778742641210556, |
| "num_tokens": 8104448.0, |
| "step": 990 |
| }, |
| { |
| "epoch": 1.4248797005881304, |
| "grad_norm": 1.4143636226654053, |
| "learning_rate": 5.841688654353563e-06, |
| "loss": 0.228, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.4248797005881304, |
| "eval_loss": 0.4206378161907196, |
| "eval_mean_token_accuracy": 0.9077556622334015, |
| "eval_num_tokens": 8186368.0, |
| "eval_runtime": 41.158, |
| "eval_samples_per_second": 30.298, |
| "eval_steps_per_second": 1.895, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.4391374086615576, |
| "grad_norm": 1.8749654293060303, |
| "learning_rate": 5.7889182058047495e-06, |
| "loss": 0.1719, |
| "mean_token_accuracy": 0.7847602725028991, |
| "num_tokens": 8268288.0, |
| "step": 1010 |
| }, |
| { |
| "epoch": 1.4533951167349848, |
| "grad_norm": 1.5693446397781372, |
| "learning_rate": 5.736147757255937e-06, |
| "loss": 0.187, |
| "mean_token_accuracy": 0.7941780813038349, |
| "num_tokens": 8350208.0, |
| "step": 1020 |
| }, |
| { |
| "epoch": 1.467652824808412, |
| "grad_norm": 1.1207451820373535, |
| "learning_rate": 5.683377308707124e-06, |
| "loss": 0.1705, |
| "mean_token_accuracy": 0.7942636970430612, |
| "num_tokens": 8432128.0, |
| "step": 1030 |
| }, |
| { |
| "epoch": 1.4819105328818392, |
| "grad_norm": 1.1814815998077393, |
| "learning_rate": 5.630606860158312e-06, |
| "loss": 0.1723, |
| "mean_token_accuracy": 0.8012964788824319, |
| "num_tokens": 8514048.0, |
| "step": 1040 |
| }, |
| { |
| "epoch": 1.4961682409552663, |
| "grad_norm": 1.3927719593048096, |
| "learning_rate": 5.5778364116095e-06, |
| "loss": 0.2018, |
| "mean_token_accuracy": 0.7847480427473783, |
| "num_tokens": 8595968.0, |
| "step": 1050 |
| }, |
| { |
| "epoch": 1.5104259490286935, |
| "grad_norm": 1.4986419677734375, |
| "learning_rate": 5.525065963060686e-06, |
| "loss": 0.1803, |
| "mean_token_accuracy": 0.7783757321536541, |
| "num_tokens": 8677888.0, |
| "step": 1060 |
| }, |
| { |
| "epoch": 1.5246836571021207, |
| "grad_norm": 1.8012514114379883, |
| "learning_rate": 5.472295514511874e-06, |
| "loss": 0.1863, |
| "mean_token_accuracy": 0.8155821930617094, |
| "num_tokens": 8759808.0, |
| "step": 1070 |
| }, |
| { |
| "epoch": 1.538941365175548, |
| "grad_norm": 1.2655534744262695, |
| "learning_rate": 5.419525065963061e-06, |
| "loss": 0.1967, |
| "mean_token_accuracy": 0.7537671256810426, |
| "num_tokens": 8841728.0, |
| "step": 1080 |
| }, |
| { |
| "epoch": 1.553199073248975, |
| "grad_norm": 1.3260008096694946, |
| "learning_rate": 5.366754617414248e-06, |
| "loss": 0.199, |
| "mean_token_accuracy": 0.7909124296158552, |
| "num_tokens": 8923648.0, |
| "step": 1090 |
| }, |
| { |
| "epoch": 1.5674567813224023, |
| "grad_norm": 1.490972876548767, |
| "learning_rate": 5.313984168865436e-06, |
| "loss": 0.1835, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.5674567813224023, |
| "eval_loss": 0.4212629497051239, |
| "eval_mean_token_accuracy": 0.9076559314360986, |
| "eval_num_tokens": 9005568.0, |
| "eval_runtime": 41.1762, |
| "eval_samples_per_second": 30.284, |
| "eval_steps_per_second": 1.894, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.5817144893958295, |
| "grad_norm": 1.5151809453964233, |
| "learning_rate": 5.261213720316624e-06, |
| "loss": 0.1791, |
| "mean_token_accuracy": 0.8009601265192032, |
| "num_tokens": 9087488.0, |
| "step": 1110 |
| }, |
| { |
| "epoch": 1.5959721974692567, |
| "grad_norm": 1.244946837425232, |
| "learning_rate": 5.20844327176781e-06, |
| "loss": 0.1692, |
| "mean_token_accuracy": 0.8105185899883509, |
| "num_tokens": 9169408.0, |
| "step": 1120 |
| }, |
| { |
| "epoch": 1.6102299055426839, |
| "grad_norm": 1.328723669052124, |
| "learning_rate": 5.155672823218998e-06, |
| "loss": 0.19, |
| "mean_token_accuracy": 0.7765777885913849, |
| "num_tokens": 9251328.0, |
| "step": 1130 |
| }, |
| { |
| "epoch": 1.624487613616111, |
| "grad_norm": 1.1754485368728638, |
| "learning_rate": 5.102902374670185e-06, |
| "loss": 0.166, |
| "mean_token_accuracy": 0.8148972604423761, |
| "num_tokens": 9333248.0, |
| "step": 1140 |
| }, |
| { |
| "epoch": 1.6387453216895385, |
| "grad_norm": 1.2302050590515137, |
| "learning_rate": 5.050131926121372e-06, |
| "loss": 0.2001, |
| "mean_token_accuracy": 0.7948018629103899, |
| "num_tokens": 9415168.0, |
| "step": 1150 |
| }, |
| { |
| "epoch": 1.6530030297629656, |
| "grad_norm": 1.1300264596939087, |
| "learning_rate": 4.99736147757256e-06, |
| "loss": 0.1678, |
| "mean_token_accuracy": 0.8071673218160867, |
| "num_tokens": 9497088.0, |
| "step": 1160 |
| }, |
| { |
| "epoch": 1.6672607378363928, |
| "grad_norm": 1.0087612867355347, |
| "learning_rate": 4.944591029023747e-06, |
| "loss": 0.1486, |
| "mean_token_accuracy": 0.8172700595110655, |
| "num_tokens": 9579008.0, |
| "step": 1170 |
| }, |
| { |
| "epoch": 1.68151844590982, |
| "grad_norm": 1.4350190162658691, |
| "learning_rate": 4.891820580474935e-06, |
| "loss": 0.186, |
| "mean_token_accuracy": 0.8002446169033647, |
| "num_tokens": 9660928.0, |
| "step": 1180 |
| }, |
| { |
| "epoch": 1.6957761539832472, |
| "grad_norm": 1.69225013256073, |
| "learning_rate": 4.839050131926122e-06, |
| "loss": 0.1788, |
| "mean_token_accuracy": 0.7850782759487629, |
| "num_tokens": 9742848.0, |
| "step": 1190 |
| }, |
| { |
| "epoch": 1.7100338620566744, |
| "grad_norm": 1.1749773025512695, |
| "learning_rate": 4.786279683377309e-06, |
| "loss": 0.1869, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.7100338620566744, |
| "eval_loss": 0.4190373420715332, |
| "eval_mean_token_accuracy": 0.9080827587690109, |
| "eval_num_tokens": 9824768.0, |
| "eval_runtime": 41.1505, |
| "eval_samples_per_second": 30.303, |
| "eval_steps_per_second": 1.895, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.7242915701301016, |
| "grad_norm": 1.0468658208847046, |
| "learning_rate": 4.733509234828496e-06, |
| "loss": 0.1812, |
| "mean_token_accuracy": 0.8027397247031332, |
| "num_tokens": 9906688.0, |
| "step": 1210 |
| }, |
| { |
| "epoch": 1.7385492782035288, |
| "grad_norm": 1.4152122735977173, |
| "learning_rate": 4.680738786279684e-06, |
| "loss": 0.1816, |
| "mean_token_accuracy": 0.7942759301513433, |
| "num_tokens": 9988608.0, |
| "step": 1220 |
| }, |
| { |
| "epoch": 1.752806986276956, |
| "grad_norm": 1.4354445934295654, |
| "learning_rate": 4.627968337730871e-06, |
| "loss": 0.2029, |
| "mean_token_accuracy": 0.7708537172526121, |
| "num_tokens": 10070528.0, |
| "step": 1230 |
| }, |
| { |
| "epoch": 1.7670646943503832, |
| "grad_norm": 1.2206268310546875, |
| "learning_rate": 4.575197889182059e-06, |
| "loss": 0.2083, |
| "mean_token_accuracy": 0.7663405101746321, |
| "num_tokens": 10152448.0, |
| "step": 1240 |
| }, |
| { |
| "epoch": 1.7813224024238103, |
| "grad_norm": 1.3205043077468872, |
| "learning_rate": 4.522427440633246e-06, |
| "loss": 0.1929, |
| "mean_token_accuracy": 0.7970156516879797, |
| "num_tokens": 10234368.0, |
| "step": 1250 |
| }, |
| { |
| "epoch": 1.7955801104972375, |
| "grad_norm": 1.1927738189697266, |
| "learning_rate": 4.469656992084433e-06, |
| "loss": 0.1527, |
| "mean_token_accuracy": 0.8137230888009072, |
| "num_tokens": 10316288.0, |
| "step": 1260 |
| }, |
| { |
| "epoch": 1.8098378185706647, |
| "grad_norm": 1.4011446237564087, |
| "learning_rate": 4.4168865435356204e-06, |
| "loss": 0.1938, |
| "mean_token_accuracy": 0.7995963796973229, |
| "num_tokens": 10398208.0, |
| "step": 1270 |
| }, |
| { |
| "epoch": 1.824095526644092, |
| "grad_norm": 1.3237054347991943, |
| "learning_rate": 4.364116094986807e-06, |
| "loss": 0.166, |
| "mean_token_accuracy": 0.794483856856823, |
| "num_tokens": 10480128.0, |
| "step": 1280 |
| }, |
| { |
| "epoch": 1.8383532347175193, |
| "grad_norm": 1.1077933311462402, |
| "learning_rate": 4.311345646437995e-06, |
| "loss": 0.1773, |
| "mean_token_accuracy": 0.7855797458440066, |
| "num_tokens": 10562048.0, |
| "step": 1290 |
| }, |
| { |
| "epoch": 1.8526109427909465, |
| "grad_norm": 1.12204909324646, |
| "learning_rate": 4.258575197889183e-06, |
| "loss": 0.1679, |
| "step": 1300 |
| }, |
| { |
| "epoch": 1.8526109427909465, |
| "eval_loss": 0.41882508993148804, |
| "eval_mean_token_accuracy": 0.9083497478411748, |
| "eval_num_tokens": 10643968.0, |
| "eval_runtime": 41.3617, |
| "eval_samples_per_second": 30.149, |
| "eval_steps_per_second": 1.886, |
| "step": 1300 |
| }, |
| { |
| "epoch": 1.8668686508643737, |
| "grad_norm": 1.453183650970459, |
| "learning_rate": 4.20580474934037e-06, |
| "loss": 0.1681, |
| "mean_token_accuracy": 0.8072040120139718, |
| "num_tokens": 10725888.0, |
| "step": 1310 |
| }, |
| { |
| "epoch": 1.881126358937801, |
| "grad_norm": 1.229581356048584, |
| "learning_rate": 4.153034300791557e-06, |
| "loss": 0.2113, |
| "mean_token_accuracy": 0.7945083156228065, |
| "num_tokens": 10807808.0, |
| "step": 1320 |
| }, |
| { |
| "epoch": 1.895384067011228, |
| "grad_norm": 1.193543553352356, |
| "learning_rate": 4.1002638522427445e-06, |
| "loss": 0.1821, |
| "mean_token_accuracy": 0.7945450108498335, |
| "num_tokens": 10889728.0, |
| "step": 1330 |
| }, |
| { |
| "epoch": 1.9096417750846553, |
| "grad_norm": 1.3757144212722778, |
| "learning_rate": 4.047493403693931e-06, |
| "loss": 0.198, |
| "mean_token_accuracy": 0.7713062632828951, |
| "num_tokens": 10971648.0, |
| "step": 1340 |
| }, |
| { |
| "epoch": 1.9238994831580825, |
| "grad_norm": 0.997105062007904, |
| "learning_rate": 3.994722955145119e-06, |
| "loss": 0.1507, |
| "mean_token_accuracy": 0.8220768094062805, |
| "num_tokens": 11053568.0, |
| "step": 1350 |
| }, |
| { |
| "epoch": 1.9381571912315096, |
| "grad_norm": 1.5941100120544434, |
| "learning_rate": 3.941952506596307e-06, |
| "loss": 0.1748, |
| "mean_token_accuracy": 0.7959882594645024, |
| "num_tokens": 11135488.0, |
| "step": 1360 |
| }, |
| { |
| "epoch": 1.9524148993049368, |
| "grad_norm": 1.271546721458435, |
| "learning_rate": 3.889182058047494e-06, |
| "loss": 0.1581, |
| "mean_token_accuracy": 0.8203400176018476, |
| "num_tokens": 11217408.0, |
| "step": 1370 |
| }, |
| { |
| "epoch": 1.966672607378364, |
| "grad_norm": 1.631945252418518, |
| "learning_rate": 3.836411609498681e-06, |
| "loss": 0.2008, |
| "mean_token_accuracy": 0.7798923674970866, |
| "num_tokens": 11299328.0, |
| "step": 1380 |
| }, |
| { |
| "epoch": 1.9809303154517912, |
| "grad_norm": 1.08231520652771, |
| "learning_rate": 3.7836411609498686e-06, |
| "loss": 0.1557, |
| "mean_token_accuracy": 0.7915484357625246, |
| "num_tokens": 11381248.0, |
| "step": 1390 |
| }, |
| { |
| "epoch": 1.9951880235252184, |
| "grad_norm": 1.283751130104065, |
| "learning_rate": 3.730870712401056e-06, |
| "loss": 0.1926, |
| "step": 1400 |
| }, |
| { |
| "epoch": 1.9951880235252184, |
| "eval_loss": 0.417749285697937, |
| "eval_mean_token_accuracy": 0.908511886994044, |
| "eval_num_tokens": 11463168.0, |
| "eval_runtime": 41.1617, |
| "eval_samples_per_second": 30.295, |
| "eval_steps_per_second": 1.895, |
| "step": 1400 |
| }, |
| { |
| "epoch": 2.0085546248440562, |
| "grad_norm": 1.2342406511306763, |
| "learning_rate": 3.678100263852243e-06, |
| "loss": 0.1856, |
| "mean_token_accuracy": 0.7944889839618436, |
| "num_tokens": 11539456.0, |
| "step": 1410 |
| }, |
| { |
| "epoch": 2.0228123329174834, |
| "grad_norm": 1.6273552179336548, |
| "learning_rate": 3.6253298153034306e-06, |
| "loss": 0.1697, |
| "mean_token_accuracy": 0.7902274955064058, |
| "num_tokens": 11621376.0, |
| "step": 1420 |
| }, |
| { |
| "epoch": 2.0370700409909106, |
| "grad_norm": 0.9989307522773743, |
| "learning_rate": 3.5725593667546175e-06, |
| "loss": 0.195, |
| "mean_token_accuracy": 0.7881115455180406, |
| "num_tokens": 11703296.0, |
| "step": 1430 |
| }, |
| { |
| "epoch": 2.051327749064338, |
| "grad_norm": 1.885137915611267, |
| "learning_rate": 3.519788918205805e-06, |
| "loss": 0.1554, |
| "mean_token_accuracy": 0.8048434421420098, |
| "num_tokens": 11785216.0, |
| "step": 1440 |
| }, |
| { |
| "epoch": 2.065585457137765, |
| "grad_norm": 1.6031672954559326, |
| "learning_rate": 3.4670184696569926e-06, |
| "loss": 0.1584, |
| "mean_token_accuracy": 0.794055774062872, |
| "num_tokens": 11867136.0, |
| "step": 1450 |
| }, |
| { |
| "epoch": 2.079843165211192, |
| "grad_norm": 1.0749027729034424, |
| "learning_rate": 3.4142480211081795e-06, |
| "loss": 0.1678, |
| "mean_token_accuracy": 0.8086961843073368, |
| "num_tokens": 11949056.0, |
| "step": 1460 |
| }, |
| { |
| "epoch": 2.0941008732846194, |
| "grad_norm": 1.521986484527588, |
| "learning_rate": 3.361477572559367e-06, |
| "loss": 0.1919, |
| "mean_token_accuracy": 0.7812744613736868, |
| "num_tokens": 12030976.0, |
| "step": 1470 |
| }, |
| { |
| "epoch": 2.1083585813580465, |
| "grad_norm": 1.4633926153182983, |
| "learning_rate": 3.3087071240105546e-06, |
| "loss": 0.148, |
| "mean_token_accuracy": 0.8082069471478462, |
| "num_tokens": 12112896.0, |
| "step": 1480 |
| }, |
| { |
| "epoch": 2.1226162894314737, |
| "grad_norm": 1.4096753597259521, |
| "learning_rate": 3.2559366754617416e-06, |
| "loss": 0.2094, |
| "mean_token_accuracy": 0.7721991192549467, |
| "num_tokens": 12194816.0, |
| "step": 1490 |
| }, |
| { |
| "epoch": 2.136873997504901, |
| "grad_norm": 1.5162105560302734, |
| "learning_rate": 3.203166226912929e-06, |
| "loss": 0.1665, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.136873997504901, |
| "eval_loss": 0.4229465126991272, |
| "eval_mean_token_accuracy": 0.9079398543406756, |
| "eval_num_tokens": 12276736.0, |
| "eval_runtime": 41.2908, |
| "eval_samples_per_second": 30.2, |
| "eval_steps_per_second": 1.889, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.151131705578328, |
| "grad_norm": 1.1753822565078735, |
| "learning_rate": 3.1503957783641167e-06, |
| "loss": 0.1709, |
| "mean_token_accuracy": 0.7948324346914888, |
| "num_tokens": 12358656.0, |
| "step": 1510 |
| }, |
| { |
| "epoch": 2.1653894136517553, |
| "grad_norm": 1.3583290576934814, |
| "learning_rate": 3.0976253298153036e-06, |
| "loss": 0.1516, |
| "mean_token_accuracy": 0.7987769071012736, |
| "num_tokens": 12440576.0, |
| "step": 1520 |
| }, |
| { |
| "epoch": 2.1796471217251825, |
| "grad_norm": 1.6773642301559448, |
| "learning_rate": 3.044854881266491e-06, |
| "loss": 0.1582, |
| "mean_token_accuracy": 0.8161937419325114, |
| "num_tokens": 12522496.0, |
| "step": 1530 |
| }, |
| { |
| "epoch": 2.1939048297986097, |
| "grad_norm": 1.700421929359436, |
| "learning_rate": 2.9920844327176783e-06, |
| "loss": 0.1651, |
| "mean_token_accuracy": 0.7837084148079156, |
| "num_tokens": 12604416.0, |
| "step": 1540 |
| }, |
| { |
| "epoch": 2.208162537872037, |
| "grad_norm": 1.278611183166504, |
| "learning_rate": 2.9393139841688656e-06, |
| "loss": 0.1459, |
| "mean_token_accuracy": 0.8016634039580822, |
| "num_tokens": 12686336.0, |
| "step": 1550 |
| }, |
| { |
| "epoch": 2.2224202459454645, |
| "grad_norm": 1.3623602390289307, |
| "learning_rate": 2.8865435356200525e-06, |
| "loss": 0.1754, |
| "mean_token_accuracy": 0.7952054802328348, |
| "num_tokens": 12768256.0, |
| "step": 1560 |
| }, |
| { |
| "epoch": 2.2366779540188917, |
| "grad_norm": 1.1797006130218506, |
| "learning_rate": 2.8337730870712403e-06, |
| "loss": 0.1854, |
| "mean_token_accuracy": 0.7857632093131542, |
| "num_tokens": 12850176.0, |
| "step": 1570 |
| }, |
| { |
| "epoch": 2.250935662092319, |
| "grad_norm": 1.2017779350280762, |
| "learning_rate": 2.7810026385224277e-06, |
| "loss": 0.1482, |
| "mean_token_accuracy": 0.8103106629103423, |
| "num_tokens": 12932096.0, |
| "step": 1580 |
| }, |
| { |
| "epoch": 2.265193370165746, |
| "grad_norm": 1.1322146654129028, |
| "learning_rate": 2.7282321899736154e-06, |
| "loss": 0.1539, |
| "mean_token_accuracy": 0.8084882564842701, |
| "num_tokens": 13014016.0, |
| "step": 1590 |
| }, |
| { |
| "epoch": 2.2794510782391733, |
| "grad_norm": 1.2803654670715332, |
| "learning_rate": 2.6754617414248023e-06, |
| "loss": 0.1495, |
| "step": 1600 |
| }, |
| { |
| "epoch": 2.2794510782391733, |
| "eval_loss": 0.423663467168808, |
| "eval_mean_token_accuracy": 0.9079369283639468, |
| "eval_num_tokens": 13095936.0, |
| "eval_runtime": 41.2866, |
| "eval_samples_per_second": 30.203, |
| "eval_steps_per_second": 1.889, |
| "step": 1600 |
| }, |
| { |
| "epoch": 2.2937087863126004, |
| "grad_norm": 1.110379934310913, |
| "learning_rate": 2.6226912928759897e-06, |
| "loss": 0.157, |
| "mean_token_accuracy": 0.8001467704772949, |
| "num_tokens": 13177856.0, |
| "step": 1610 |
| }, |
| { |
| "epoch": 2.3079664943860276, |
| "grad_norm": 1.2236034870147705, |
| "learning_rate": 2.5699208443271775e-06, |
| "loss": 0.1566, |
| "mean_token_accuracy": 0.807118396833539, |
| "num_tokens": 13259776.0, |
| "step": 1620 |
| }, |
| { |
| "epoch": 2.322224202459455, |
| "grad_norm": 1.439042329788208, |
| "learning_rate": 2.5171503957783644e-06, |
| "loss": 0.1979, |
| "mean_token_accuracy": 0.7804794508963824, |
| "num_tokens": 13341696.0, |
| "step": 1630 |
| }, |
| { |
| "epoch": 2.336481910532882, |
| "grad_norm": 1.3598966598510742, |
| "learning_rate": 2.4643799472295517e-06, |
| "loss": 0.1514, |
| "mean_token_accuracy": 0.8212695695459843, |
| "num_tokens": 13423616.0, |
| "step": 1640 |
| }, |
| { |
| "epoch": 2.350739618606309, |
| "grad_norm": 1.401573896408081, |
| "learning_rate": 2.411609498680739e-06, |
| "loss": 0.1588, |
| "mean_token_accuracy": 0.8089774928987026, |
| "num_tokens": 13505536.0, |
| "step": 1650 |
| }, |
| { |
| "epoch": 2.3649973266797364, |
| "grad_norm": 1.6068435907363892, |
| "learning_rate": 2.3588390501319264e-06, |
| "loss": 0.1647, |
| "mean_token_accuracy": 0.8134907066822052, |
| "num_tokens": 13587456.0, |
| "step": 1660 |
| }, |
| { |
| "epoch": 2.3792550347531636, |
| "grad_norm": 1.2568259239196777, |
| "learning_rate": 2.3060686015831133e-06, |
| "loss": 0.1664, |
| "mean_token_accuracy": 0.7954256378114224, |
| "num_tokens": 13669376.0, |
| "step": 1670 |
| }, |
| { |
| "epoch": 2.3935127428265908, |
| "grad_norm": 1.6980928182601929, |
| "learning_rate": 2.253298153034301e-06, |
| "loss": 0.1707, |
| "mean_token_accuracy": 0.7994985327124595, |
| "num_tokens": 13751296.0, |
| "step": 1680 |
| }, |
| { |
| "epoch": 2.407770450900018, |
| "grad_norm": 1.6247879266738892, |
| "learning_rate": 2.2005277044854884e-06, |
| "loss": 0.1579, |
| "mean_token_accuracy": 0.7971624247729778, |
| "num_tokens": 13833216.0, |
| "step": 1690 |
| }, |
| { |
| "epoch": 2.422028158973445, |
| "grad_norm": 1.6872649192810059, |
| "learning_rate": 2.1477572559366753e-06, |
| "loss": 0.1703, |
| "step": 1700 |
| }, |
| { |
| "epoch": 2.422028158973445, |
| "eval_loss": 0.4227621853351593, |
| "eval_mean_token_accuracy": 0.9080967650963709, |
| "eval_num_tokens": 13915136.0, |
| "eval_runtime": 41.2123, |
| "eval_samples_per_second": 30.258, |
| "eval_steps_per_second": 1.893, |
| "step": 1700 |
| }, |
| { |
| "epoch": 2.4362858670468723, |
| "grad_norm": 1.6167148351669312, |
| "learning_rate": 2.094986807387863e-06, |
| "loss": 0.1801, |
| "mean_token_accuracy": 0.7794765178114176, |
| "num_tokens": 13997056.0, |
| "step": 1710 |
| }, |
| { |
| "epoch": 2.4505435751202995, |
| "grad_norm": 1.2795140743255615, |
| "learning_rate": 2.0422163588390505e-06, |
| "loss": 0.1466, |
| "mean_token_accuracy": 0.8015288673341274, |
| "num_tokens": 14078976.0, |
| "step": 1720 |
| }, |
| { |
| "epoch": 2.4648012831937267, |
| "grad_norm": 1.2836272716522217, |
| "learning_rate": 1.989445910290238e-06, |
| "loss": 0.1587, |
| "mean_token_accuracy": 0.7941046960651874, |
| "num_tokens": 14160896.0, |
| "step": 1730 |
| }, |
| { |
| "epoch": 2.479058991267154, |
| "grad_norm": 1.1510287523269653, |
| "learning_rate": 1.9366754617414247e-06, |
| "loss": 0.1807, |
| "mean_token_accuracy": 0.7942025430500508, |
| "num_tokens": 14242816.0, |
| "step": 1740 |
| }, |
| { |
| "epoch": 2.493316699340581, |
| "grad_norm": 1.2959060668945312, |
| "learning_rate": 1.8839050131926123e-06, |
| "loss": 0.187, |
| "mean_token_accuracy": 0.7789016582071782, |
| "num_tokens": 14324736.0, |
| "step": 1750 |
| }, |
| { |
| "epoch": 2.5075744074140083, |
| "grad_norm": 1.0948452949523926, |
| "learning_rate": 1.8311345646437998e-06, |
| "loss": 0.1995, |
| "mean_token_accuracy": 0.761827296577394, |
| "num_tokens": 14406656.0, |
| "step": 1760 |
| }, |
| { |
| "epoch": 2.5218321154874355, |
| "grad_norm": 1.3183213472366333, |
| "learning_rate": 1.778364116094987e-06, |
| "loss": 0.1709, |
| "mean_token_accuracy": 0.7887353241443634, |
| "num_tokens": 14488576.0, |
| "step": 1770 |
| }, |
| { |
| "epoch": 2.5360898235608627, |
| "grad_norm": 1.2092057466506958, |
| "learning_rate": 1.7255936675461743e-06, |
| "loss": 0.1325, |
| "mean_token_accuracy": 0.8213796474039554, |
| "num_tokens": 14570496.0, |
| "step": 1780 |
| }, |
| { |
| "epoch": 2.55034753163429, |
| "grad_norm": 1.418562889099121, |
| "learning_rate": 1.6728232189973616e-06, |
| "loss": 0.1827, |
| "mean_token_accuracy": 0.7853595890104771, |
| "num_tokens": 14652416.0, |
| "step": 1790 |
| }, |
| { |
| "epoch": 2.564605239707717, |
| "grad_norm": 1.0960406064987183, |
| "learning_rate": 1.6200527704485488e-06, |
| "loss": 0.1758, |
| "step": 1800 |
| }, |
| { |
| "epoch": 2.564605239707717, |
| "eval_loss": 0.4227621257305145, |
| "eval_mean_token_accuracy": 0.9082627732020158, |
| "eval_num_tokens": 14734336.0, |
| "eval_runtime": 41.1309, |
| "eval_samples_per_second": 30.318, |
| "eval_steps_per_second": 1.896, |
| "step": 1800 |
| }, |
| { |
| "epoch": 2.578862947781144, |
| "grad_norm": 1.5267870426177979, |
| "learning_rate": 1.5672823218997363e-06, |
| "loss": 0.1732, |
| "mean_token_accuracy": 0.7900256833992898, |
| "num_tokens": 14816256.0, |
| "step": 1810 |
| }, |
| { |
| "epoch": 2.5931206558545714, |
| "grad_norm": 2.303779125213623, |
| "learning_rate": 1.5145118733509237e-06, |
| "loss": 0.1717, |
| "mean_token_accuracy": 0.8003057725727558, |
| "num_tokens": 14898176.0, |
| "step": 1820 |
| }, |
| { |
| "epoch": 2.6073783639279986, |
| "grad_norm": 1.3814704418182373, |
| "learning_rate": 1.4617414248021108e-06, |
| "loss": 0.1691, |
| "mean_token_accuracy": 0.8011741682887077, |
| "num_tokens": 14980096.0, |
| "step": 1830 |
| }, |
| { |
| "epoch": 2.621636072001426, |
| "grad_norm": 1.4888346195220947, |
| "learning_rate": 1.4089709762532984e-06, |
| "loss": 0.1665, |
| "mean_token_accuracy": 0.7911203544586897, |
| "num_tokens": 15062016.0, |
| "step": 1840 |
| }, |
| { |
| "epoch": 2.635893780074853, |
| "grad_norm": 1.7252527475357056, |
| "learning_rate": 1.3562005277044857e-06, |
| "loss": 0.1462, |
| "mean_token_accuracy": 0.8204623281955719, |
| "num_tokens": 15143936.0, |
| "step": 1850 |
| }, |
| { |
| "epoch": 2.65015148814828, |
| "grad_norm": 1.3731549978256226, |
| "learning_rate": 1.3034300791556728e-06, |
| "loss": 0.1469, |
| "mean_token_accuracy": 0.8153620343655348, |
| "num_tokens": 15225856.0, |
| "step": 1860 |
| }, |
| { |
| "epoch": 2.6644091962217074, |
| "grad_norm": 1.1390541791915894, |
| "learning_rate": 1.2506596306068602e-06, |
| "loss": 0.1511, |
| "mean_token_accuracy": 0.7933586105704308, |
| "num_tokens": 15307776.0, |
| "step": 1870 |
| }, |
| { |
| "epoch": 2.6786669042951345, |
| "grad_norm": 1.3843096494674683, |
| "learning_rate": 1.1978891820580475e-06, |
| "loss": 0.1743, |
| "mean_token_accuracy": 0.7874510768800974, |
| "num_tokens": 15389696.0, |
| "step": 1880 |
| }, |
| { |
| "epoch": 2.6929246123685617, |
| "grad_norm": 1.4261775016784668, |
| "learning_rate": 1.1451187335092349e-06, |
| "loss": 0.1775, |
| "mean_token_accuracy": 0.7992783728986979, |
| "num_tokens": 15471616.0, |
| "step": 1890 |
| }, |
| { |
| "epoch": 2.707182320441989, |
| "grad_norm": 1.4358237981796265, |
| "learning_rate": 1.0923482849604222e-06, |
| "loss": 0.1488, |
| "step": 1900 |
| }, |
| { |
| "epoch": 2.707182320441989, |
| "eval_loss": 0.4216897487640381, |
| "eval_mean_token_accuracy": 0.9083614570972247, |
| "eval_num_tokens": 15553536.0, |
| "eval_runtime": 41.1549, |
| "eval_samples_per_second": 30.3, |
| "eval_steps_per_second": 1.895, |
| "step": 1900 |
| }, |
| { |
| "epoch": 2.721440028515416, |
| "grad_norm": 1.4193668365478516, |
| "learning_rate": 1.0395778364116096e-06, |
| "loss": 0.1432, |
| "mean_token_accuracy": 0.8027458423748612, |
| "num_tokens": 15635456.0, |
| "step": 1910 |
| }, |
| { |
| "epoch": 2.7356977365888433, |
| "grad_norm": 1.3984283208847046, |
| "learning_rate": 9.86807387862797e-07, |
| "loss": 0.1751, |
| "mean_token_accuracy": 0.7997309185564518, |
| "num_tokens": 15717376.0, |
| "step": 1920 |
| }, |
| { |
| "epoch": 2.7499554446622705, |
| "grad_norm": 1.2041066884994507, |
| "learning_rate": 9.340369393139842e-07, |
| "loss": 0.2063, |
| "mean_token_accuracy": 0.770768103376031, |
| "num_tokens": 15799296.0, |
| "step": 1930 |
| }, |
| { |
| "epoch": 2.7642131527356977, |
| "grad_norm": 1.4668165445327759, |
| "learning_rate": 8.812664907651716e-07, |
| "loss": 0.1496, |
| "mean_token_accuracy": 0.7937133066356182, |
| "num_tokens": 15881216.0, |
| "step": 1940 |
| }, |
| { |
| "epoch": 2.778470860809125, |
| "grad_norm": 1.1798230409622192, |
| "learning_rate": 8.284960422163589e-07, |
| "loss": 0.1696, |
| "mean_token_accuracy": 0.7978228941559792, |
| "num_tokens": 15963136.0, |
| "step": 1950 |
| }, |
| { |
| "epoch": 2.792728568882552, |
| "grad_norm": 1.4253802299499512, |
| "learning_rate": 7.757255936675462e-07, |
| "loss": 0.1602, |
| "mean_token_accuracy": 0.8014432441443204, |
| "num_tokens": 16045056.0, |
| "step": 1960 |
| }, |
| { |
| "epoch": 2.8069862769559792, |
| "grad_norm": 1.3596400022506714, |
| "learning_rate": 7.229551451187336e-07, |
| "loss": 0.1672, |
| "mean_token_accuracy": 0.808916338160634, |
| "num_tokens": 16126976.0, |
| "step": 1970 |
| }, |
| { |
| "epoch": 2.8212439850294064, |
| "grad_norm": 1.4225387573242188, |
| "learning_rate": 6.701846965699208e-07, |
| "loss": 0.1767, |
| "mean_token_accuracy": 0.7800391383469105, |
| "num_tokens": 16208896.0, |
| "step": 1980 |
| }, |
| { |
| "epoch": 2.8355016931028336, |
| "grad_norm": 1.8448420763015747, |
| "learning_rate": 6.174142480211082e-07, |
| "loss": 0.1846, |
| "mean_token_accuracy": 0.792747063189745, |
| "num_tokens": 16290816.0, |
| "step": 1990 |
| }, |
| { |
| "epoch": 2.849759401176261, |
| "grad_norm": 1.4115536212921143, |
| "learning_rate": 5.646437994722955e-07, |
| "loss": 0.1398, |
| "step": 2000 |
| }, |
| { |
| "epoch": 2.849759401176261, |
| "eval_loss": 0.42159923911094666, |
| "eval_mean_token_accuracy": 0.9084225067725549, |
| "eval_num_tokens": 16372736.0, |
| "eval_runtime": 41.1801, |
| "eval_samples_per_second": 30.282, |
| "eval_steps_per_second": 1.894, |
| "step": 2000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 2106, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4.32699442420777e+16, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|