{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.9992003259049143, "eval_steps": 500, "global_step": 16568, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012070552378653229, "grad_norm": 1.5390625, "learning_rate": 7.543753771876886e-07, "loss": 2.6793, "mean_token_accuracy": 0.5008124655112625, "num_input_tokens_seen": 24135392, "num_tokens": 10166805.0, "step": 50 }, { "epoch": 0.024141104757306457, "grad_norm": 1.578125, "learning_rate": 1.5087507543753772e-06, "loss": 2.683, "mean_token_accuracy": 0.5005724773928523, "num_input_tokens_seen": 48153152, "num_tokens": 20365412.0, "step": 100 }, { "epoch": 0.03621165713595968, "grad_norm": 4.25, "learning_rate": 2.2631261315630656e-06, "loss": 2.6827, "mean_token_accuracy": 0.5006869393214584, "num_input_tokens_seen": 72337344, "num_tokens": 30424121.0, "step": 150 }, { "epoch": 0.048282209514612914, "grad_norm": 1.6640625, "learning_rate": 3.0175015087507544e-06, "loss": 2.6854, "mean_token_accuracy": 0.49996432337909935, "num_input_tokens_seen": 96540720, "num_tokens": 40650709.0, "step": 200 }, { "epoch": 0.06035276189326614, "grad_norm": 1.671875, "learning_rate": 3.771876885938443e-06, "loss": 2.6589, "mean_token_accuracy": 0.5033125403895974, "num_input_tokens_seen": 120664016, "num_tokens": 50830609.0, "step": 250 }, { "epoch": 0.07242331427191936, "grad_norm": 1.4375, "learning_rate": 4.526252263126131e-06, "loss": 2.6724, "mean_token_accuracy": 0.5016581188514828, "num_input_tokens_seen": 144779584, "num_tokens": 60898431.0, "step": 300 }, { "epoch": 0.0844938666505726, "grad_norm": 1.484375, "learning_rate": 5.280627640313821e-06, "loss": 2.6515, "mean_token_accuracy": 0.5036601684615016, "num_input_tokens_seen": 168931312, "num_tokens": 71195944.0, "step": 350 }, { "epoch": 0.09656441902922583, "grad_norm": 1.40625, "learning_rate": 6.035003017501509e-06, "loss": 2.6474, "mean_token_accuracy": 0.5041869094967842, "num_input_tokens_seen": 192934592, "num_tokens": 81381172.0, "step": 400 }, { "epoch": 0.10863497140787905, "grad_norm": 1.3359375, "learning_rate": 6.789378394689197e-06, "loss": 2.6561, "mean_token_accuracy": 0.5037067172303796, "num_input_tokens_seen": 217010448, "num_tokens": 91569720.0, "step": 450 }, { "epoch": 0.12070552378653228, "grad_norm": 1.2421875, "learning_rate": 7.543753771876886e-06, "loss": 2.6454, "num_input_tokens_seen": 241172672, "step": 500 }, { "epoch": 0.12070552378653228, "eval_loss": 2.550887107849121, "eval_mean_token_accuracy": 0.5239145074365208, "eval_num_tokens": 101801812.0, "eval_runtime": 125.6328, "eval_samples_per_second": 85.264, "eval_steps_per_second": 21.316, "num_input_tokens_seen": 241172672, "step": 500 }, { "epoch": 0.1327760761651855, "grad_norm": 1.234375, "learning_rate": 8.298129149064575e-06, "loss": 2.6413, "mean_token_accuracy": 0.5040716730989516, "num_input_tokens_seen": 265089232, "num_tokens": 111840270.0, "step": 550 }, { "epoch": 0.14484662854383873, "grad_norm": 1.0859375, "learning_rate": 9.052504526252262e-06, "loss": 2.6279, "mean_token_accuracy": 0.5059382322058081, "num_input_tokens_seen": 289296624, "num_tokens": 121987982.0, "step": 600 }, { "epoch": 0.15691718092249196, "grad_norm": 0.93359375, "learning_rate": 9.806879903439953e-06, "loss": 2.6104, "mean_token_accuracy": 0.5081973234936595, "num_input_tokens_seen": 313326128, "num_tokens": 132144416.0, "step": 650 }, { "epoch": 0.1689877333011452, "grad_norm": 0.8828125, "learning_rate": 1.0561255280627642e-05, "loss": 2.611, "mean_token_accuracy": 0.5080171688646078, "num_input_tokens_seen": 337553504, "num_tokens": 142378323.0, "step": 700 }, { "epoch": 0.18105828567979843, "grad_norm": 0.76171875, "learning_rate": 1.1315630657815329e-05, "loss": 2.6039, "mean_token_accuracy": 0.5081416912004352, "num_input_tokens_seen": 361961104, "num_tokens": 152623392.0, "step": 750 }, { "epoch": 0.19312883805845166, "grad_norm": 0.7421875, "learning_rate": 1.2070006035003018e-05, "loss": 2.5936, "mean_token_accuracy": 0.5093595118448139, "num_input_tokens_seen": 386105904, "num_tokens": 162827766.0, "step": 800 }, { "epoch": 0.2051993904371049, "grad_norm": 0.86328125, "learning_rate": 1.2824381412190706e-05, "loss": 2.5887, "mean_token_accuracy": 0.5100815147906542, "num_input_tokens_seen": 410248672, "num_tokens": 172979899.0, "step": 850 }, { "epoch": 0.2172699428157581, "grad_norm": 0.81640625, "learning_rate": 1.3578756789378394e-05, "loss": 2.5903, "mean_token_accuracy": 0.5090210309624672, "num_input_tokens_seen": 434357040, "num_tokens": 183138615.0, "step": 900 }, { "epoch": 0.22934049519441133, "grad_norm": 0.69921875, "learning_rate": 1.4333132166566086e-05, "loss": 2.5816, "mean_token_accuracy": 0.5105240726843476, "num_input_tokens_seen": 458609136, "num_tokens": 193253896.0, "step": 950 }, { "epoch": 0.24141104757306456, "grad_norm": 0.64453125, "learning_rate": 1.5087507543753773e-05, "loss": 2.558, "num_input_tokens_seen": 482701824, "step": 1000 }, { "epoch": 0.24141104757306456, "eval_loss": 2.4748051166534424, "eval_mean_token_accuracy": 0.529798322766908, "eval_num_tokens": 203431197.0, "eval_runtime": 125.5514, "eval_samples_per_second": 85.32, "eval_steps_per_second": 21.33, "num_input_tokens_seen": 482701824, "step": 1000 }, { "epoch": 0.2534815999517178, "grad_norm": 0.62890625, "learning_rate": 1.584188292094146e-05, "loss": 2.557, "mean_token_accuracy": 0.5126943441852927, "num_input_tokens_seen": 506798000, "num_tokens": 213604239.0, "step": 1050 }, { "epoch": 0.265552152330371, "grad_norm": 0.578125, "learning_rate": 1.659625829812915e-05, "loss": 2.5533, "mean_token_accuracy": 0.5126326360553503, "num_input_tokens_seen": 530786496, "num_tokens": 223786306.0, "step": 1100 }, { "epoch": 0.2776227047090242, "grad_norm": 0.56640625, "learning_rate": 1.7350633675316838e-05, "loss": 2.5528, "mean_token_accuracy": 0.5130741761997342, "num_input_tokens_seen": 554798768, "num_tokens": 233857737.0, "step": 1150 }, { "epoch": 0.28969325708767746, "grad_norm": 0.5234375, "learning_rate": 1.8105009052504525e-05, "loss": 2.5433, "mean_token_accuracy": 0.5140750538557768, "num_input_tokens_seen": 578877696, "num_tokens": 243994162.0, "step": 1200 }, { "epoch": 0.3017638094663307, "grad_norm": 0.671875, "learning_rate": 1.8859384429692215e-05, "loss": 2.5414, "mean_token_accuracy": 0.5151798555627465, "num_input_tokens_seen": 603141184, "num_tokens": 254196491.0, "step": 1250 }, { "epoch": 0.3138343618449839, "grad_norm": 0.59375, "learning_rate": 1.9613759806879906e-05, "loss": 2.5383, "mean_token_accuracy": 0.5148227337375283, "num_input_tokens_seen": 627221936, "num_tokens": 264309463.0, "step": 1300 }, { "epoch": 0.32590491422363715, "grad_norm": 0.498046875, "learning_rate": 2.0368135184067593e-05, "loss": 2.5292, "mean_token_accuracy": 0.5156710411980748, "num_input_tokens_seen": 651386848, "num_tokens": 274462595.0, "step": 1350 }, { "epoch": 0.3379754666022904, "grad_norm": 0.51171875, "learning_rate": 2.1122510561255283e-05, "loss": 2.523, "mean_token_accuracy": 0.5161588852107525, "num_input_tokens_seen": 675579984, "num_tokens": 284636254.0, "step": 1400 }, { "epoch": 0.3500460189809436, "grad_norm": 6.375, "learning_rate": 2.187688593844297e-05, "loss": 2.528, "mean_token_accuracy": 0.5156370849534869, "num_input_tokens_seen": 699751424, "num_tokens": 294907617.0, "step": 1450 }, { "epoch": 0.36211657135959685, "grad_norm": 0.68359375, "learning_rate": 2.2631261315630658e-05, "loss": 2.5232, "num_input_tokens_seen": 723883888, "step": 1500 }, { "epoch": 0.36211657135959685, "eval_loss": 2.42679500579834, "eval_mean_token_accuracy": 0.5353444569074368, "eval_num_tokens": 305087605.0, "eval_runtime": 125.481, "eval_samples_per_second": 85.367, "eval_steps_per_second": 21.342, "num_input_tokens_seen": 723883888, "step": 1500 }, { "epoch": 0.3741871237382501, "grad_norm": 0.455078125, "learning_rate": 2.3385636692818348e-05, "loss": 2.5178, "mean_token_accuracy": 0.5161234551295638, "num_input_tokens_seen": 748106064, "num_tokens": 315240015.0, "step": 1550 }, { "epoch": 0.3862576761169033, "grad_norm": 0.5078125, "learning_rate": 2.4140012070006035e-05, "loss": 2.5157, "mean_token_accuracy": 0.5167792574688792, "num_input_tokens_seen": 772242320, "num_tokens": 325393577.0, "step": 1600 }, { "epoch": 0.39832822849555655, "grad_norm": 0.54296875, "learning_rate": 2.4894387447193726e-05, "loss": 2.505, "mean_token_accuracy": 0.5181788290664554, "num_input_tokens_seen": 796536976, "num_tokens": 335636146.0, "step": 1650 }, { "epoch": 0.4103987808742098, "grad_norm": 0.50390625, "learning_rate": 2.5648762824381413e-05, "loss": 2.5034, "mean_token_accuracy": 0.5191374982148409, "num_input_tokens_seen": 820727792, "num_tokens": 345721073.0, "step": 1700 }, { "epoch": 0.422469333252863, "grad_norm": 0.40234375, "learning_rate": 2.64031382015691e-05, "loss": 2.5049, "mean_token_accuracy": 0.5170740441232919, "num_input_tokens_seen": 844900704, "num_tokens": 355941787.0, "step": 1750 }, { "epoch": 0.4345398856315162, "grad_norm": 0.46484375, "learning_rate": 2.7157513578756787e-05, "loss": 2.4944, "mean_token_accuracy": 0.5191365649551153, "num_input_tokens_seen": 868882304, "num_tokens": 366140276.0, "step": 1800 }, { "epoch": 0.4466104380101694, "grad_norm": 0.4609375, "learning_rate": 2.791188895594448e-05, "loss": 2.4852, "mean_token_accuracy": 0.519771606773138, "num_input_tokens_seen": 893201120, "num_tokens": 376397263.0, "step": 1850 }, { "epoch": 0.45868099038882265, "grad_norm": 0.42578125, "learning_rate": 2.866626433313217e-05, "loss": 2.4242, "mean_token_accuracy": 0.5218987537547946, "num_input_tokens_seen": 917474880, "num_tokens": 386613084.0, "step": 1900 }, { "epoch": 0.4707515427674759, "grad_norm": 0.486328125, "learning_rate": 2.942063971031986e-05, "loss": 2.4118, "mean_token_accuracy": 0.5205260647833347, "num_input_tokens_seen": 941694304, "num_tokens": 396813412.0, "step": 1950 }, { "epoch": 0.4828220951461291, "grad_norm": 0.421875, "learning_rate": 3.0175015087507546e-05, "loss": 2.3856, "num_input_tokens_seen": 965894272, "step": 2000 }, { "epoch": 0.4828220951461291, "eval_loss": 2.281926155090332, "eval_mean_token_accuracy": 0.5452906315098778, "eval_num_tokens": 407043305.0, "eval_runtime": 126.3646, "eval_samples_per_second": 84.771, "eval_steps_per_second": 21.193, "num_input_tokens_seen": 965894272, "step": 2000 }, { "epoch": 0.49489264752478235, "grad_norm": 0.55859375, "learning_rate": 3.092939046469523e-05, "loss": 2.3729, "mean_token_accuracy": 0.5244628588855267, "num_input_tokens_seen": 990100416, "num_tokens": 417281294.0, "step": 2050 }, { "epoch": 0.5069631999034356, "grad_norm": 0.46484375, "learning_rate": 3.168376584188292e-05, "loss": 2.365, "mean_token_accuracy": 0.5261798568814993, "num_input_tokens_seen": 1014101792, "num_tokens": 427442066.0, "step": 2100 }, { "epoch": 0.5190337522820888, "grad_norm": 0.4296875, "learning_rate": 3.2438141219070614e-05, "loss": 2.3679, "mean_token_accuracy": 0.5253909171745181, "num_input_tokens_seen": 1038463120, "num_tokens": 437736421.0, "step": 2150 }, { "epoch": 0.531104304660742, "grad_norm": 0.404296875, "learning_rate": 3.31925165962583e-05, "loss": 2.3664, "mean_token_accuracy": 0.5255175845324993, "num_input_tokens_seen": 1062548608, "num_tokens": 447839969.0, "step": 2200 }, { "epoch": 0.5431748570393953, "grad_norm": 0.376953125, "learning_rate": 3.394689197344599e-05, "loss": 2.3566, "mean_token_accuracy": 0.5265485693141818, "num_input_tokens_seen": 1086859296, "num_tokens": 458069337.0, "step": 2250 }, { "epoch": 0.5552454094180485, "grad_norm": 0.376953125, "learning_rate": 3.4701267350633675e-05, "loss": 2.3424, "mean_token_accuracy": 0.529197344481945, "num_input_tokens_seen": 1110954480, "num_tokens": 468219132.0, "step": 2300 }, { "epoch": 0.5673159617967017, "grad_norm": 0.376953125, "learning_rate": 3.545564272782136e-05, "loss": 2.3344, "mean_token_accuracy": 0.5297643894702196, "num_input_tokens_seen": 1135145632, "num_tokens": 478383651.0, "step": 2350 }, { "epoch": 0.5793865141753549, "grad_norm": 0.3984375, "learning_rate": 3.621001810500905e-05, "loss": 2.3403, "mean_token_accuracy": 0.5286615265905857, "num_input_tokens_seen": 1159223072, "num_tokens": 488558846.0, "step": 2400 }, { "epoch": 0.5914570665540082, "grad_norm": 0.38671875, "learning_rate": 3.696439348219674e-05, "loss": 2.3366, "mean_token_accuracy": 0.5282911998406052, "num_input_tokens_seen": 1183346592, "num_tokens": 498693280.0, "step": 2450 }, { "epoch": 0.6035276189326614, "grad_norm": 0.36328125, "learning_rate": 3.771876885938443e-05, "loss": 2.3361, "num_input_tokens_seen": 1207430512, "step": 2500 }, { "epoch": 0.6035276189326614, "eval_loss": 2.2252955436706543, "eval_mean_token_accuracy": 0.5513953688186527, "eval_num_tokens": 508899754.0, "eval_runtime": 125.696, "eval_samples_per_second": 85.221, "eval_steps_per_second": 21.305, "num_input_tokens_seen": 1207430512, "step": 2500 }, { "epoch": 0.6155981713113147, "grad_norm": 0.33984375, "learning_rate": 3.8473144236572124e-05, "loss": 2.3191, "mean_token_accuracy": 0.5302580918744206, "num_input_tokens_seen": 1231510064, "num_tokens": 519053973.0, "step": 2550 }, { "epoch": 0.6276687236899678, "grad_norm": 0.40234375, "learning_rate": 3.922751961375981e-05, "loss": 2.3184, "mean_token_accuracy": 0.5312636430934071, "num_input_tokens_seen": 1255661376, "num_tokens": 529292647.0, "step": 2600 }, { "epoch": 0.6397392760686211, "grad_norm": 0.361328125, "learning_rate": 3.99818949909475e-05, "loss": 2.3149, "mean_token_accuracy": 0.5317899576947093, "num_input_tokens_seen": 1279915488, "num_tokens": 539406430.0, "step": 2650 }, { "epoch": 0.6518098284472743, "grad_norm": 0.37890625, "learning_rate": 4.0736270368135186e-05, "loss": 2.3226, "mean_token_accuracy": 0.530639638863504, "num_input_tokens_seen": 1303974000, "num_tokens": 549560793.0, "step": 2700 }, { "epoch": 0.6638803808259276, "grad_norm": 0.3046875, "learning_rate": 4.149064574532287e-05, "loss": 2.3027, "mean_token_accuracy": 0.5339162700995803, "num_input_tokens_seen": 1328099056, "num_tokens": 559731643.0, "step": 2750 }, { "epoch": 0.6759509332045808, "grad_norm": 0.353515625, "learning_rate": 4.224502112251057e-05, "loss": 2.3041, "mean_token_accuracy": 0.5334529640898108, "num_input_tokens_seen": 1352139632, "num_tokens": 569834852.0, "step": 2800 }, { "epoch": 0.688021485583234, "grad_norm": 0.333984375, "learning_rate": 4.2999396499698254e-05, "loss": 2.2978, "mean_token_accuracy": 0.5343039923906326, "num_input_tokens_seen": 1376232240, "num_tokens": 579992251.0, "step": 2850 }, { "epoch": 0.7000920379618872, "grad_norm": 0.330078125, "learning_rate": 4.375377187688594e-05, "loss": 2.2834, "mean_token_accuracy": 0.5361328301951289, "num_input_tokens_seen": 1400431104, "num_tokens": 590199795.0, "step": 2900 }, { "epoch": 0.7121625903405404, "grad_norm": 0.337890625, "learning_rate": 4.450814725407363e-05, "loss": 2.301, "mean_token_accuracy": 0.5332023718208074, "num_input_tokens_seen": 1424615616, "num_tokens": 600407039.0, "step": 2950 }, { "epoch": 0.7242331427191937, "grad_norm": 0.291015625, "learning_rate": 4.5262522631261315e-05, "loss": 2.2846, "num_input_tokens_seen": 1448897616, "step": 3000 }, { "epoch": 0.7242331427191937, "eval_loss": 2.174436569213867, "eval_mean_token_accuracy": 0.5580886014145288, "eval_num_tokens": 610496575.0, "eval_runtime": 125.6585, "eval_samples_per_second": 85.247, "eval_steps_per_second": 21.312, "num_input_tokens_seen": 1448897616, "step": 3000 }, { "epoch": 0.7363036950978469, "grad_norm": 0.298828125, "learning_rate": 4.6016898008449e-05, "loss": 2.2866, "mean_token_accuracy": 0.535948946569115, "num_input_tokens_seen": 1472853872, "num_tokens": 620644723.0, "step": 3050 }, { "epoch": 0.7483742474765002, "grad_norm": 0.296875, "learning_rate": 4.6771273385636696e-05, "loss": 2.2713, "mean_token_accuracy": 0.5381996771320701, "num_input_tokens_seen": 1497112768, "num_tokens": 630831881.0, "step": 3100 }, { "epoch": 0.7604447998551533, "grad_norm": 0.318359375, "learning_rate": 4.752564876282438e-05, "loss": 2.2659, "mean_token_accuracy": 0.5379517000168562, "num_input_tokens_seen": 1521326656, "num_tokens": 640973141.0, "step": 3150 }, { "epoch": 0.7725153522338066, "grad_norm": 0.27734375, "learning_rate": 4.828002414001207e-05, "loss": 2.2534, "mean_token_accuracy": 0.5404072028771043, "num_input_tokens_seen": 1545505712, "num_tokens": 651205945.0, "step": 3200 }, { "epoch": 0.7845859046124598, "grad_norm": 0.267578125, "learning_rate": 4.903439951719976e-05, "loss": 2.2662, "mean_token_accuracy": 0.5395008590817452, "num_input_tokens_seen": 1569597408, "num_tokens": 661422296.0, "step": 3250 }, { "epoch": 0.7966564569911131, "grad_norm": 0.306640625, "learning_rate": 4.978877489438745e-05, "loss": 2.2429, "mean_token_accuracy": 0.5423036898300052, "num_input_tokens_seen": 1593795440, "num_tokens": 671551427.0, "step": 3300 }, { "epoch": 0.8087270093697663, "grad_norm": 0.29296875, "learning_rate": 4.9864191942055236e-05, "loss": 2.234, "mean_token_accuracy": 0.5444167210906744, "num_input_tokens_seen": 1618048688, "num_tokens": 681790219.0, "step": 3350 }, { "epoch": 0.8207975617484196, "grad_norm": 0.306640625, "learning_rate": 4.967556963935416e-05, "loss": 2.2466, "mean_token_accuracy": 0.5417763916775584, "num_input_tokens_seen": 1642159776, "num_tokens": 691978554.0, "step": 3400 }, { "epoch": 0.8328681141270727, "grad_norm": 0.30859375, "learning_rate": 4.9486947336653086e-05, "loss": 2.2358, "mean_token_accuracy": 0.5441526301577687, "num_input_tokens_seen": 1666410544, "num_tokens": 702184762.0, "step": 3450 }, { "epoch": 0.844938666505726, "grad_norm": 0.2578125, "learning_rate": 4.929832503395201e-05, "loss": 2.2274, "num_input_tokens_seen": 1690571888, "step": 3500 }, { "epoch": 0.844938666505726, "eval_loss": 2.110778331756592, "eval_mean_token_accuracy": 0.5686311067219009, "eval_num_tokens": 712362296.0, "eval_runtime": 127.3714, "eval_samples_per_second": 84.1, "eval_steps_per_second": 21.025, "num_input_tokens_seen": 1690571888, "step": 3500 }, { "epoch": 0.8570092188843792, "grad_norm": 0.283203125, "learning_rate": 4.9109702731250944e-05, "loss": 2.2322, "mean_token_accuracy": 0.5444289642199874, "num_input_tokens_seen": 1714787776, "num_tokens": 722682302.0, "step": 3550 }, { "epoch": 0.8690797712630324, "grad_norm": 0.3203125, "learning_rate": 4.8921080428549876e-05, "loss": 2.2194, "mean_token_accuracy": 0.5459688815101981, "num_input_tokens_seen": 1739009552, "num_tokens": 732823822.0, "step": 3600 }, { "epoch": 0.8811503236416857, "grad_norm": 0.267578125, "learning_rate": 4.87324581258488e-05, "loss": 2.2139, "mean_token_accuracy": 0.5467682545632124, "num_input_tokens_seen": 1763209840, "num_tokens": 743138600.0, "step": 3650 }, { "epoch": 0.8932208760203388, "grad_norm": 0.328125, "learning_rate": 4.854383582314773e-05, "loss": 2.204, "mean_token_accuracy": 0.5477216844260693, "num_input_tokens_seen": 1787295680, "num_tokens": 753284868.0, "step": 3700 }, { "epoch": 0.9052914283989921, "grad_norm": 0.306640625, "learning_rate": 4.835521352044666e-05, "loss": 2.186, "mean_token_accuracy": 0.5463542007282376, "num_input_tokens_seen": 1811501840, "num_tokens": 763533047.0, "step": 3750 }, { "epoch": 0.9173619807776453, "grad_norm": 0.294921875, "learning_rate": 4.816659121774559e-05, "loss": 2.1705, "mean_token_accuracy": 0.5472249809652567, "num_input_tokens_seen": 1835579680, "num_tokens": 773772552.0, "step": 3800 }, { "epoch": 0.9294325331562986, "grad_norm": 0.2578125, "learning_rate": 4.797796891504452e-05, "loss": 2.1472, "mean_token_accuracy": 0.5502070318907499, "num_input_tokens_seen": 1859762928, "num_tokens": 783996309.0, "step": 3850 }, { "epoch": 0.9415030855349518, "grad_norm": 0.30078125, "learning_rate": 4.778934661234345e-05, "loss": 2.1494, "mean_token_accuracy": 0.548779489658773, "num_input_tokens_seen": 1883948656, "num_tokens": 794252324.0, "step": 3900 }, { "epoch": 0.953573637913605, "grad_norm": 0.29296875, "learning_rate": 4.760072430964237e-05, "loss": 2.1484, "mean_token_accuracy": 0.5490481401607394, "num_input_tokens_seen": 1908219840, "num_tokens": 804538128.0, "step": 3950 }, { "epoch": 0.9656441902922582, "grad_norm": 0.291015625, "learning_rate": 4.7412102006941305e-05, "loss": 2.1447, "num_input_tokens_seen": 1932223680, "step": 4000 }, { "epoch": 0.9656441902922582, "eval_loss": 2.0152089595794678, "eval_mean_token_accuracy": 0.5734592423989222, "eval_num_tokens": 814660681.0, "eval_runtime": 126.585, "eval_samples_per_second": 84.623, "eval_steps_per_second": 21.156, "num_input_tokens_seen": 1932223680, "step": 4000 }, { "epoch": 0.9777147426709115, "grad_norm": 0.298828125, "learning_rate": 4.722347970424023e-05, "loss": 2.1476, "mean_token_accuracy": 0.5491000188142061, "num_input_tokens_seen": 1956345120, "num_tokens": 824824558.0, "step": 4050 }, { "epoch": 0.9897852950495647, "grad_norm": 0.2890625, "learning_rate": 4.703485740153916e-05, "loss": 2.1336, "mean_token_accuracy": 0.5505272497236728, "num_input_tokens_seen": 1980539728, "num_tokens": 835004823.0, "step": 4100 }, { "epoch": 1.0016898773330114, "grad_norm": 0.2890625, "learning_rate": 4.684623509883809e-05, "loss": 2.1376, "mean_token_accuracy": 0.5500758526367531, "num_input_tokens_seen": 2004388912, "num_tokens": 844972763.0, "step": 4150 }, { "epoch": 1.0137604297116647, "grad_norm": 0.275390625, "learning_rate": 4.665761279613702e-05, "loss": 2.1349, "mean_token_accuracy": 0.5500743924826383, "num_input_tokens_seen": 2028622064, "num_tokens": 855126584.0, "step": 4200 }, { "epoch": 1.025830982090318, "grad_norm": 0.283203125, "learning_rate": 4.646899049343595e-05, "loss": 2.1248, "mean_token_accuracy": 0.5514456473290921, "num_input_tokens_seen": 2052718336, "num_tokens": 865332386.0, "step": 4250 }, { "epoch": 1.037901534468971, "grad_norm": 0.28125, "learning_rate": 4.6280368190734876e-05, "loss": 2.1088, "mean_token_accuracy": 0.5532256289571523, "num_input_tokens_seen": 2076571680, "num_tokens": 875448332.0, "step": 4300 }, { "epoch": 1.0499720868476243, "grad_norm": 0.326171875, "learning_rate": 4.60917458880338e-05, "loss": 2.1184, "mean_token_accuracy": 0.5509732039645314, "num_input_tokens_seen": 2100726912, "num_tokens": 885623694.0, "step": 4350 }, { "epoch": 1.0620426392262776, "grad_norm": 0.310546875, "learning_rate": 4.590312358533273e-05, "loss": 2.1324, "mean_token_accuracy": 0.5498137963563203, "num_input_tokens_seen": 2124980016, "num_tokens": 895774422.0, "step": 4400 }, { "epoch": 1.074113191604931, "grad_norm": 0.32421875, "learning_rate": 4.571450128263166e-05, "loss": 2.1195, "mean_token_accuracy": 0.551388250514865, "num_input_tokens_seen": 2149237504, "num_tokens": 905968753.0, "step": 4450 }, { "epoch": 1.086183743983584, "grad_norm": 0.296875, "learning_rate": 4.552587897993059e-05, "loss": 2.1195, "num_input_tokens_seen": 2173337456, "step": 4500 }, { "epoch": 1.086183743983584, "eval_loss": 1.989871859550476, "eval_mean_token_accuracy": 0.5754866465826013, "eval_num_tokens": 916079112.0, "eval_runtime": 128.4454, "eval_samples_per_second": 83.397, "eval_steps_per_second": 20.849, "num_input_tokens_seen": 2173337456, "step": 4500 }, { "epoch": 1.0982542963622373, "grad_norm": 0.287109375, "learning_rate": 4.5337256677229516e-05, "loss": 2.1218, "mean_token_accuracy": 0.5514280049689114, "num_input_tokens_seen": 2197505712, "num_tokens": 926254107.0, "step": 4550 }, { "epoch": 1.1103248487408905, "grad_norm": 0.291015625, "learning_rate": 4.514863437452845e-05, "loss": 2.1132, "mean_token_accuracy": 0.5515907733514905, "num_input_tokens_seen": 2221716688, "num_tokens": 936449430.0, "step": 4600 }, { "epoch": 1.1223954011195438, "grad_norm": 0.296875, "learning_rate": 4.4960012071827373e-05, "loss": 2.1142, "mean_token_accuracy": 0.5520639397203922, "num_input_tokens_seen": 2245565536, "num_tokens": 946528658.0, "step": 4650 }, { "epoch": 1.134465953498197, "grad_norm": 0.2734375, "learning_rate": 4.4771389769126305e-05, "loss": 2.1275, "mean_token_accuracy": 0.5497148666903376, "num_input_tokens_seen": 2269696864, "num_tokens": 956594209.0, "step": 4700 }, { "epoch": 1.1465365058768502, "grad_norm": 0.279296875, "learning_rate": 4.458276746642524e-05, "loss": 2.1065, "mean_token_accuracy": 0.5532364987954498, "num_input_tokens_seen": 2293845360, "num_tokens": 966701814.0, "step": 4750 }, { "epoch": 1.1586070582555035, "grad_norm": 0.259765625, "learning_rate": 4.439414516372416e-05, "loss": 2.1133, "mean_token_accuracy": 0.5517958915606141, "num_input_tokens_seen": 2318062016, "num_tokens": 976956133.0, "step": 4800 }, { "epoch": 1.1706776106341565, "grad_norm": 0.314453125, "learning_rate": 4.420552286102309e-05, "loss": 2.1083, "mean_token_accuracy": 0.5527382261306047, "num_input_tokens_seen": 2342152464, "num_tokens": 987113621.0, "step": 4850 }, { "epoch": 1.1827481630128098, "grad_norm": 0.26953125, "learning_rate": 4.401690055832201e-05, "loss": 2.1084, "mean_token_accuracy": 0.5531642048805953, "num_input_tokens_seen": 2366342016, "num_tokens": 997304128.0, "step": 4900 }, { "epoch": 1.1948187153914631, "grad_norm": 0.263671875, "learning_rate": 4.3828278255620945e-05, "loss": 2.1129, "mean_token_accuracy": 0.5526808862015605, "num_input_tokens_seen": 2390580560, "num_tokens": 1007600120.0, "step": 4950 }, { "epoch": 1.2068892677701164, "grad_norm": 0.271484375, "learning_rate": 4.363965595291988e-05, "loss": 2.1136, "num_input_tokens_seen": 2414871648, "step": 5000 }, { "epoch": 1.2068892677701164, "eval_loss": 1.9823503494262695, "eval_mean_token_accuracy": 0.5763351263685668, "eval_num_tokens": 1017920689.0, "eval_runtime": 131.1681, "eval_samples_per_second": 81.666, "eval_steps_per_second": 20.417, "num_input_tokens_seen": 2414871648, "step": 5000 }, { "epoch": 1.2189598201487695, "grad_norm": 0.25, "learning_rate": 4.34510336502188e-05, "loss": 2.108, "mean_token_accuracy": 0.5514175926893949, "num_input_tokens_seen": 2438963872, "num_tokens": 1028143121.0, "step": 5050 }, { "epoch": 1.2310303725274228, "grad_norm": 0.2421875, "learning_rate": 4.3262411347517734e-05, "loss": 2.1066, "mean_token_accuracy": 0.5526730781793594, "num_input_tokens_seen": 2463130960, "num_tokens": 1038274786.0, "step": 5100 }, { "epoch": 1.243100924906076, "grad_norm": 0.2353515625, "learning_rate": 4.307378904481666e-05, "loss": 2.1011, "mean_token_accuracy": 0.5543517142161727, "num_input_tokens_seen": 2487402736, "num_tokens": 1048479252.0, "step": 5150 }, { "epoch": 1.2551714772847293, "grad_norm": 0.265625, "learning_rate": 4.288516674211559e-05, "loss": 2.1021, "mean_token_accuracy": 0.5538267828151584, "num_input_tokens_seen": 2511451728, "num_tokens": 1058650745.0, "step": 5200 }, { "epoch": 1.2672420296633824, "grad_norm": 0.30859375, "learning_rate": 4.2696544439414524e-05, "loss": 2.0863, "mean_token_accuracy": 0.5557815081253648, "num_input_tokens_seen": 2535548592, "num_tokens": 1068882104.0, "step": 5250 }, { "epoch": 1.2793125820420357, "grad_norm": 0.306640625, "learning_rate": 4.250792213671345e-05, "loss": 2.1063, "mean_token_accuracy": 0.5531226889789105, "num_input_tokens_seen": 2559719664, "num_tokens": 1079065265.0, "step": 5300 }, { "epoch": 1.291383134420689, "grad_norm": 0.263671875, "learning_rate": 4.2319299834012374e-05, "loss": 2.1104, "mean_token_accuracy": 0.5524419481307268, "num_input_tokens_seen": 2584073280, "num_tokens": 1089341978.0, "step": 5350 }, { "epoch": 1.303453686799342, "grad_norm": 0.244140625, "learning_rate": 4.21306775313113e-05, "loss": 2.1044, "mean_token_accuracy": 0.5532321387529373, "num_input_tokens_seen": 2608296624, "num_tokens": 1099642346.0, "step": 5400 }, { "epoch": 1.3155242391779953, "grad_norm": 0.2412109375, "learning_rate": 4.194205522861023e-05, "loss": 2.1115, "mean_token_accuracy": 0.5528482471778989, "num_input_tokens_seen": 2632421856, "num_tokens": 1109721280.0, "step": 5450 }, { "epoch": 1.3275947915566486, "grad_norm": 0.2275390625, "learning_rate": 4.1753432925909163e-05, "loss": 2.1009, "num_input_tokens_seen": 2656567344, "step": 5500 }, { "epoch": 1.3275947915566486, "eval_loss": 1.9779127836227417, "eval_mean_token_accuracy": 0.5769637392206456, "eval_num_tokens": 1119903809.0, "eval_runtime": 131.3767, "eval_samples_per_second": 81.537, "eval_steps_per_second": 20.384, "num_input_tokens_seen": 2656567344, "step": 5500 }, { "epoch": 1.339665343935302, "grad_norm": 0.26171875, "learning_rate": 4.156481062320809e-05, "loss": 2.1059, "mean_token_accuracy": 0.5533521883934737, "num_input_tokens_seen": 2680728000, "num_tokens": 1130022087.0, "step": 5550 }, { "epoch": 1.3517358963139552, "grad_norm": 0.25390625, "learning_rate": 4.137618832050702e-05, "loss": 2.0992, "mean_token_accuracy": 0.5542617355659604, "num_input_tokens_seen": 2704833792, "num_tokens": 1140249458.0, "step": 5600 }, { "epoch": 1.3638064486926083, "grad_norm": 0.267578125, "learning_rate": 4.1187566017805946e-05, "loss": 2.0977, "mean_token_accuracy": 0.5540939109772444, "num_input_tokens_seen": 2729074544, "num_tokens": 1150474886.0, "step": 5650 }, { "epoch": 1.3758770010712615, "grad_norm": 0.294921875, "learning_rate": 4.099894371510488e-05, "loss": 2.0995, "mean_token_accuracy": 0.553785107024014, "num_input_tokens_seen": 2753196608, "num_tokens": 1160634529.0, "step": 5700 }, { "epoch": 1.3879475534499148, "grad_norm": 0.26171875, "learning_rate": 4.081032141240381e-05, "loss": 2.1066, "mean_token_accuracy": 0.5523933649063111, "num_input_tokens_seen": 2777300400, "num_tokens": 1170864545.0, "step": 5750 }, { "epoch": 1.400018105828568, "grad_norm": 0.291015625, "learning_rate": 4.0621699109702735e-05, "loss": 2.1023, "mean_token_accuracy": 0.5536971531435847, "num_input_tokens_seen": 2801426672, "num_tokens": 1181051604.0, "step": 5800 }, { "epoch": 1.4120886582072212, "grad_norm": 0.267578125, "learning_rate": 4.043307680700166e-05, "loss": 2.1042, "mean_token_accuracy": 0.5537538637593389, "num_input_tokens_seen": 2825621648, "num_tokens": 1191210311.0, "step": 5850 }, { "epoch": 1.4241592105858745, "grad_norm": 0.29296875, "learning_rate": 4.0244454504300586e-05, "loss": 2.1221, "mean_token_accuracy": 0.5503192816674709, "num_input_tokens_seen": 2849863744, "num_tokens": 1201379955.0, "step": 5900 }, { "epoch": 1.4362297629645275, "grad_norm": 0.30859375, "learning_rate": 4.005583220159952e-05, "loss": 2.0984, "mean_token_accuracy": 0.5546167600527405, "num_input_tokens_seen": 2874100544, "num_tokens": 1211495441.0, "step": 5950 }, { "epoch": 1.4483003153431808, "grad_norm": 0.267578125, "learning_rate": 3.986720989889845e-05, "loss": 2.0976, "num_input_tokens_seen": 2898379392, "step": 6000 }, { "epoch": 1.4483003153431808, "eval_loss": 1.9750181436538696, "eval_mean_token_accuracy": 0.5774352134075336, "eval_num_tokens": 1221766201.0, "eval_runtime": 130.8087, "eval_samples_per_second": 81.891, "eval_steps_per_second": 20.473, "num_input_tokens_seen": 2898379392, "step": 6000 }, { "epoch": 1.460370867721834, "grad_norm": 0.2734375, "learning_rate": 3.9678587596197375e-05, "loss": 2.1105, "mean_token_accuracy": 0.5535502586700022, "num_input_tokens_seen": 2922428832, "num_tokens": 1231890670.0, "step": 6050 }, { "epoch": 1.4724414201004874, "grad_norm": 0.2412109375, "learning_rate": 3.948996529349631e-05, "loss": 2.0925, "mean_token_accuracy": 0.5552064320072532, "num_input_tokens_seen": 2946628480, "num_tokens": 1242047017.0, "step": 6100 }, { "epoch": 1.4845119724791407, "grad_norm": 0.2392578125, "learning_rate": 3.930134299079523e-05, "loss": 2.0991, "mean_token_accuracy": 0.5542361034452915, "num_input_tokens_seen": 2970772896, "num_tokens": 1252270839.0, "step": 6150 }, { "epoch": 1.4965825248577938, "grad_norm": 0.248046875, "learning_rate": 3.9112720688094164e-05, "loss": 2.0975, "mean_token_accuracy": 0.553666141666472, "num_input_tokens_seen": 2995178928, "num_tokens": 1262575010.0, "step": 6200 }, { "epoch": 1.508653077236447, "grad_norm": 0.27734375, "learning_rate": 3.8924098385393096e-05, "loss": 2.1016, "mean_token_accuracy": 0.5535378622636199, "num_input_tokens_seen": 3019494064, "num_tokens": 1272716330.0, "step": 6250 }, { "epoch": 1.5207236296151003, "grad_norm": 0.251953125, "learning_rate": 3.873547608269202e-05, "loss": 2.1071, "mean_token_accuracy": 0.5533343946188688, "num_input_tokens_seen": 3043731184, "num_tokens": 1282959106.0, "step": 6300 }, { "epoch": 1.5327941819937534, "grad_norm": 0.267578125, "learning_rate": 3.854685377999095e-05, "loss": 2.1057, "mean_token_accuracy": 0.5532900895178318, "num_input_tokens_seen": 3067867536, "num_tokens": 1293166946.0, "step": 6350 }, { "epoch": 1.5448647343724067, "grad_norm": 0.32421875, "learning_rate": 3.835823147728987e-05, "loss": 2.106, "mean_token_accuracy": 0.5535468808189035, "num_input_tokens_seen": 3092176096, "num_tokens": 1303381117.0, "step": 6400 }, { "epoch": 1.55693528675106, "grad_norm": 0.248046875, "learning_rate": 3.8169609174588804e-05, "loss": 2.0989, "mean_token_accuracy": 0.5537711648643017, "num_input_tokens_seen": 3116219440, "num_tokens": 1313584312.0, "step": 6450 }, { "epoch": 1.569005839129713, "grad_norm": 0.2490234375, "learning_rate": 3.7980986871887736e-05, "loss": 2.1121, "num_input_tokens_seen": 3140306656, "step": 6500 }, { "epoch": 1.569005839129713, "eval_loss": 1.972907304763794, "eval_mean_token_accuracy": 0.5776848248619922, "eval_num_tokens": 1323681372.0, "eval_runtime": 129.9521, "eval_samples_per_second": 82.43, "eval_steps_per_second": 20.608, "num_input_tokens_seen": 3140306656, "step": 6500 }, { "epoch": 1.5810763915083665, "grad_norm": 0.255859375, "learning_rate": 3.779236456918666e-05, "loss": 2.1022, "mean_token_accuracy": 0.5528610655851662, "num_input_tokens_seen": 3164451568, "num_tokens": 1333870621.0, "step": 6550 }, { "epoch": 1.5931469438870196, "grad_norm": 0.2578125, "learning_rate": 3.760374226648559e-05, "loss": 2.0995, "mean_token_accuracy": 0.5544479803740978, "num_input_tokens_seen": 3188556560, "num_tokens": 1344040900.0, "step": 6600 }, { "epoch": 1.605217496265673, "grad_norm": 0.263671875, "learning_rate": 3.741511996378452e-05, "loss": 2.1007, "mean_token_accuracy": 0.5533806948363781, "num_input_tokens_seen": 3212749536, "num_tokens": 1354214136.0, "step": 6650 }, { "epoch": 1.6172880486443262, "grad_norm": 0.314453125, "learning_rate": 3.722649766108345e-05, "loss": 2.1034, "mean_token_accuracy": 0.5535299601778388, "num_input_tokens_seen": 3236971856, "num_tokens": 1364392553.0, "step": 6700 }, { "epoch": 1.6293586010229792, "grad_norm": 0.255859375, "learning_rate": 3.7037875358382376e-05, "loss": 2.1039, "mean_token_accuracy": 0.5538195591047406, "num_input_tokens_seen": 3261221664, "num_tokens": 1374597748.0, "step": 6750 }, { "epoch": 1.6414291534016325, "grad_norm": 0.2431640625, "learning_rate": 3.68492530556813e-05, "loss": 2.0999, "mean_token_accuracy": 0.5533070769160986, "num_input_tokens_seen": 3285325344, "num_tokens": 1384799126.0, "step": 6800 }, { "epoch": 1.6534997057802858, "grad_norm": 0.294921875, "learning_rate": 3.666063075298023e-05, "loss": 2.0969, "mean_token_accuracy": 0.5540298366174102, "num_input_tokens_seen": 3309447808, "num_tokens": 1394991541.0, "step": 6850 }, { "epoch": 1.6655702581589389, "grad_norm": 0.28125, "learning_rate": 3.647200845027916e-05, "loss": 2.0982, "mean_token_accuracy": 0.5548456938192249, "num_input_tokens_seen": 3333810384, "num_tokens": 1405193552.0, "step": 6900 }, { "epoch": 1.6776408105375922, "grad_norm": 0.27734375, "learning_rate": 3.628338614757809e-05, "loss": 2.0998, "mean_token_accuracy": 0.5537503241375089, "num_input_tokens_seen": 3357917184, "num_tokens": 1415265616.0, "step": 6950 }, { "epoch": 1.6897113629162455, "grad_norm": 0.236328125, "learning_rate": 3.609476384487702e-05, "loss": 2.0829, "num_input_tokens_seen": 3382174368, "step": 7000 }, { "epoch": 1.6897113629162455, "eval_loss": 1.971500039100647, "eval_mean_token_accuracy": 0.5780662869320956, "eval_num_tokens": 1425356084.0, "eval_runtime": 130.448, "eval_samples_per_second": 82.117, "eval_steps_per_second": 20.529, "num_input_tokens_seen": 3382174368, "step": 7000 }, { "epoch": 1.7017819152948985, "grad_norm": 0.2421875, "learning_rate": 3.590614154217595e-05, "loss": 2.0955, "mean_token_accuracy": 0.55618498865515, "num_input_tokens_seen": 3406371872, "num_tokens": 1435550428.0, "step": 7050 }, { "epoch": 1.713852467673552, "grad_norm": 0.275390625, "learning_rate": 3.571751923947488e-05, "loss": 2.0919, "mean_token_accuracy": 0.5553148340806365, "num_input_tokens_seen": 3430521536, "num_tokens": 1445655664.0, "step": 7100 }, { "epoch": 1.725923020052205, "grad_norm": 0.28125, "learning_rate": 3.5528896936773805e-05, "loss": 2.0906, "mean_token_accuracy": 0.5547146466746926, "num_input_tokens_seen": 3454573728, "num_tokens": 1455866347.0, "step": 7150 }, { "epoch": 1.7379935724308584, "grad_norm": 0.265625, "learning_rate": 3.534027463407274e-05, "loss": 2.098, "mean_token_accuracy": 0.5544127273187042, "num_input_tokens_seen": 3478641648, "num_tokens": 1465996080.0, "step": 7200 }, { "epoch": 1.7500641248095117, "grad_norm": 0.275390625, "learning_rate": 3.515165233137166e-05, "loss": 2.1039, "mean_token_accuracy": 0.5536015385761857, "num_input_tokens_seen": 3502632176, "num_tokens": 1476133852.0, "step": 7250 }, { "epoch": 1.7621346771881647, "grad_norm": 0.2470703125, "learning_rate": 3.496303002867059e-05, "loss": 2.0974, "mean_token_accuracy": 0.5540728243440389, "num_input_tokens_seen": 3526775024, "num_tokens": 1486346988.0, "step": 7300 }, { "epoch": 1.774205229566818, "grad_norm": 0.24609375, "learning_rate": 3.477440772596952e-05, "loss": 2.0986, "mean_token_accuracy": 0.5539619905874134, "num_input_tokens_seen": 3551057232, "num_tokens": 1496540907.0, "step": 7350 }, { "epoch": 1.7862757819454713, "grad_norm": 0.2734375, "learning_rate": 3.4585785423268445e-05, "loss": 2.097, "mean_token_accuracy": 0.5543709811195732, "num_input_tokens_seen": 3575145120, "num_tokens": 1506733776.0, "step": 7400 }, { "epoch": 1.7983463343241244, "grad_norm": 0.263671875, "learning_rate": 3.4397163120567377e-05, "loss": 2.1054, "mean_token_accuracy": 0.5529748990386725, "num_input_tokens_seen": 3599324048, "num_tokens": 1516881364.0, "step": 7450 }, { "epoch": 1.810416886702778, "grad_norm": 0.26171875, "learning_rate": 3.420854081786631e-05, "loss": 2.107, "num_input_tokens_seen": 3623572960, "step": 7500 }, { "epoch": 1.810416886702778, "eval_loss": 1.9704335927963257, "eval_mean_token_accuracy": 0.5782234972207915, "eval_num_tokens": 1527213100.0, "eval_runtime": 130.0921, "eval_samples_per_second": 82.342, "eval_steps_per_second": 20.585, "num_input_tokens_seen": 3623572960, "step": 7500 }, { "epoch": 1.822487439081431, "grad_norm": 0.236328125, "learning_rate": 3.4019918515165234e-05, "loss": 2.0859, "mean_token_accuracy": 0.5543704128451645, "num_input_tokens_seen": 3647842624, "num_tokens": 1537348409.0, "step": 7550 }, { "epoch": 1.8345579914600842, "grad_norm": 0.275390625, "learning_rate": 3.3831296212464166e-05, "loss": 2.0964, "mean_token_accuracy": 0.5545977150648832, "num_input_tokens_seen": 3671838176, "num_tokens": 1547405691.0, "step": 7600 }, { "epoch": 1.8466285438387375, "grad_norm": 0.271484375, "learning_rate": 3.364267390976309e-05, "loss": 2.1049, "mean_token_accuracy": 0.5531406961008907, "num_input_tokens_seen": 3696051376, "num_tokens": 1557614207.0, "step": 7650 }, { "epoch": 1.8586990962173906, "grad_norm": 0.26171875, "learning_rate": 3.345405160706202e-05, "loss": 2.0976, "mean_token_accuracy": 0.5547824421525002, "num_input_tokens_seen": 3720242912, "num_tokens": 1567813391.0, "step": 7700 }, { "epoch": 1.8707696485960439, "grad_norm": 0.26171875, "learning_rate": 3.326542930436095e-05, "loss": 2.0941, "mean_token_accuracy": 0.5552040388435125, "num_input_tokens_seen": 3744303744, "num_tokens": 1577952759.0, "step": 7750 }, { "epoch": 1.8828402009746972, "grad_norm": 0.2734375, "learning_rate": 3.3076807001659874e-05, "loss": 2.107, "mean_token_accuracy": 0.5526882111281156, "num_input_tokens_seen": 3768596640, "num_tokens": 1588260331.0, "step": 7800 }, { "epoch": 1.8949107533533502, "grad_norm": 0.2890625, "learning_rate": 3.2888184698958806e-05, "loss": 2.0974, "mean_token_accuracy": 0.5548465251550079, "num_input_tokens_seen": 3792634928, "num_tokens": 1598444722.0, "step": 7850 }, { "epoch": 1.9069813057320035, "grad_norm": 0.251953125, "learning_rate": 3.269956239625773e-05, "loss": 2.0957, "mean_token_accuracy": 0.554438531845808, "num_input_tokens_seen": 3816849872, "num_tokens": 1608557410.0, "step": 7900 }, { "epoch": 1.9190518581106568, "grad_norm": 0.2490234375, "learning_rate": 3.251094009355666e-05, "loss": 2.108, "mean_token_accuracy": 0.5528679783269763, "num_input_tokens_seen": 3840903600, "num_tokens": 1618721583.0, "step": 7950 }, { "epoch": 1.9311224104893099, "grad_norm": 0.263671875, "learning_rate": 3.2322317790855595e-05, "loss": 2.0908, "num_input_tokens_seen": 3864935104, "step": 8000 }, { "epoch": 1.9311224104893099, "eval_loss": 1.9698705673217773, "eval_mean_token_accuracy": 0.5783014823866923, "eval_num_tokens": 1628824365.0, "eval_runtime": 130.0192, "eval_samples_per_second": 82.388, "eval_steps_per_second": 20.597, "num_input_tokens_seen": 3864935104, "step": 8000 }, { "epoch": 1.9431929628679634, "grad_norm": 0.26171875, "learning_rate": 3.213369548815452e-05, "loss": 2.096, "mean_token_accuracy": 0.5548218312300741, "num_input_tokens_seen": 3889193776, "num_tokens": 1638986755.0, "step": 8050 }, { "epoch": 1.9552635152466165, "grad_norm": 0.28125, "learning_rate": 3.194507318545345e-05, "loss": 2.0821, "mean_token_accuracy": 0.5565256755426526, "num_input_tokens_seen": 3913273664, "num_tokens": 1649209062.0, "step": 8100 }, { "epoch": 1.9673340676252697, "grad_norm": 0.267578125, "learning_rate": 3.175645088275238e-05, "loss": 2.1026, "mean_token_accuracy": 0.5532021636888385, "num_input_tokens_seen": 3937597616, "num_tokens": 1659435713.0, "step": 8150 }, { "epoch": 1.979404620003923, "grad_norm": 0.255859375, "learning_rate": 3.156782858005131e-05, "loss": 2.0984, "mean_token_accuracy": 0.554893646761775, "num_input_tokens_seen": 3961803792, "num_tokens": 1669582457.0, "step": 8200 }, { "epoch": 1.991475172382576, "grad_norm": 0.25, "learning_rate": 3.1379206277350235e-05, "loss": 2.0818, "mean_token_accuracy": 0.5563116483017803, "num_input_tokens_seen": 3985940144, "num_tokens": 1679783022.0, "step": 8250 }, { "epoch": 2.003379754666023, "grad_norm": 0.279296875, "learning_rate": 3.119058397464916e-05, "loss": 2.0986, "mean_token_accuracy": 0.5548089014411124, "num_input_tokens_seen": 4009775217, "num_tokens": 1689934301.0, "step": 8300 }, { "epoch": 2.0154503070446763, "grad_norm": 0.2431640625, "learning_rate": 3.100196167194809e-05, "loss": 2.101, "mean_token_accuracy": 0.554502502605319, "num_input_tokens_seen": 4033881233, "num_tokens": 1700106136.0, "step": 8350 }, { "epoch": 2.0275208594233294, "grad_norm": 0.27734375, "learning_rate": 3.081333936924702e-05, "loss": 2.0952, "mean_token_accuracy": 0.5547482476383447, "num_input_tokens_seen": 4058061905, "num_tokens": 1710288179.0, "step": 8400 }, { "epoch": 2.0395914118019824, "grad_norm": 0.251953125, "learning_rate": 3.062471706654595e-05, "loss": 2.0997, "mean_token_accuracy": 0.5538438270241022, "num_input_tokens_seen": 4082118001, "num_tokens": 1720402394.0, "step": 8450 }, { "epoch": 2.051661964180636, "grad_norm": 0.27734375, "learning_rate": 3.043609476384488e-05, "loss": 2.0904, "num_input_tokens_seen": 4106175169, "step": 8500 }, { "epoch": 2.051661964180636, "eval_loss": 1.969247817993164, "eval_mean_token_accuracy": 0.5783566591497497, "eval_num_tokens": 1730551722.0, "eval_runtime": 130.525, "eval_samples_per_second": 82.069, "eval_steps_per_second": 20.517, "num_input_tokens_seen": 4106175169, "step": 8500 }, { "epoch": 2.063732516559289, "grad_norm": 0.244140625, "learning_rate": 3.0247472461143806e-05, "loss": 2.1045, "mean_token_accuracy": 0.5544156692735851, "num_input_tokens_seen": 4130212065, "num_tokens": 1740781983.0, "step": 8550 }, { "epoch": 2.075803068937942, "grad_norm": 0.28125, "learning_rate": 3.0058850158442735e-05, "loss": 2.0968, "mean_token_accuracy": 0.5541778185963631, "num_input_tokens_seen": 4154342289, "num_tokens": 1750886868.0, "step": 8600 }, { "epoch": 2.0878736213165956, "grad_norm": 0.2392578125, "learning_rate": 2.9870227855741667e-05, "loss": 2.0825, "mean_token_accuracy": 0.5571553486213088, "num_input_tokens_seen": 4178622977, "num_tokens": 1761024406.0, "step": 8650 }, { "epoch": 2.0999441736952487, "grad_norm": 0.2490234375, "learning_rate": 2.9681605553040592e-05, "loss": 2.0936, "mean_token_accuracy": 0.554418184608221, "num_input_tokens_seen": 4202833969, "num_tokens": 1771241652.0, "step": 8700 }, { "epoch": 2.112014726073902, "grad_norm": 0.244140625, "learning_rate": 2.9492983250339524e-05, "loss": 2.0959, "mean_token_accuracy": 0.5550215977802873, "num_input_tokens_seen": 4226915233, "num_tokens": 1781349215.0, "step": 8750 }, { "epoch": 2.1240852784525552, "grad_norm": 0.267578125, "learning_rate": 2.930436094763845e-05, "loss": 2.1008, "mean_token_accuracy": 0.553907332457602, "num_input_tokens_seen": 4251137105, "num_tokens": 1791502343.0, "step": 8800 }, { "epoch": 2.1361558308312083, "grad_norm": 0.263671875, "learning_rate": 2.9115738644937378e-05, "loss": 2.0903, "mean_token_accuracy": 0.5557647632434964, "num_input_tokens_seen": 4275224433, "num_tokens": 1801620139.0, "step": 8850 }, { "epoch": 2.148226383209862, "grad_norm": 0.2275390625, "learning_rate": 2.892711634223631e-05, "loss": 2.105, "mean_token_accuracy": 0.5533201249688864, "num_input_tokens_seen": 4299490545, "num_tokens": 1811829692.0, "step": 8900 }, { "epoch": 2.160296935588515, "grad_norm": 0.251953125, "learning_rate": 2.8738494039535235e-05, "loss": 2.0771, "mean_token_accuracy": 0.5572306806966663, "num_input_tokens_seen": 4323603905, "num_tokens": 1822011165.0, "step": 8950 }, { "epoch": 2.172367487967168, "grad_norm": 0.248046875, "learning_rate": 2.8549871736834167e-05, "loss": 2.0766, "num_input_tokens_seen": 4347894913, "step": 9000 }, { "epoch": 2.172367487967168, "eval_loss": 1.968759536743164, "eval_mean_token_accuracy": 0.5784085558767724, "eval_num_tokens": 1832187809.0, "eval_runtime": 130.1161, "eval_samples_per_second": 82.326, "eval_steps_per_second": 20.582, "num_input_tokens_seen": 4347894913, "step": 9000 }, { "epoch": 2.1844380403458215, "grad_norm": 0.2734375, "learning_rate": 2.8361249434133093e-05, "loss": 2.0951, "mean_token_accuracy": 0.5561914920061827, "num_input_tokens_seen": 4372093409, "num_tokens": 1842403609.0, "step": 9050 }, { "epoch": 2.1965085927244745, "grad_norm": 0.275390625, "learning_rate": 2.817262713143202e-05, "loss": 2.0915, "mean_token_accuracy": 0.5557398213073611, "num_input_tokens_seen": 4396072241, "num_tokens": 1852448709.0, "step": 9100 }, { "epoch": 2.2085791451031276, "grad_norm": 0.2392578125, "learning_rate": 2.7984004828730953e-05, "loss": 2.0965, "mean_token_accuracy": 0.5542204293608666, "num_input_tokens_seen": 4420308385, "num_tokens": 1862702843.0, "step": 9150 }, { "epoch": 2.220649697481781, "grad_norm": 0.2578125, "learning_rate": 2.779538252602988e-05, "loss": 2.0873, "mean_token_accuracy": 0.555770318582654, "num_input_tokens_seen": 4444408305, "num_tokens": 1872813360.0, "step": 9200 }, { "epoch": 2.232720249860434, "grad_norm": 0.248046875, "learning_rate": 2.760676022332881e-05, "loss": 2.0984, "mean_token_accuracy": 0.5543450859189033, "num_input_tokens_seen": 4468586049, "num_tokens": 1883034727.0, "step": 9250 }, { "epoch": 2.2447908022390877, "grad_norm": 0.26171875, "learning_rate": 2.7418137920627736e-05, "loss": 2.0913, "mean_token_accuracy": 0.5554680547490716, "num_input_tokens_seen": 4492717489, "num_tokens": 1893259660.0, "step": 9300 }, { "epoch": 2.2568613546177407, "grad_norm": 0.3046875, "learning_rate": 2.7229515617926664e-05, "loss": 2.0976, "mean_token_accuracy": 0.5547211924567819, "num_input_tokens_seen": 4516832449, "num_tokens": 1903351453.0, "step": 9350 }, { "epoch": 2.268931906996394, "grad_norm": 0.240234375, "learning_rate": 2.7040893315225596e-05, "loss": 2.095, "mean_token_accuracy": 0.5545766900852322, "num_input_tokens_seen": 4540881473, "num_tokens": 1913462038.0, "step": 9400 }, { "epoch": 2.2810024593750473, "grad_norm": 0.2412109375, "learning_rate": 2.685227101252452e-05, "loss": 2.1047, "mean_token_accuracy": 0.5530835852399468, "num_input_tokens_seen": 4565196353, "num_tokens": 1923836730.0, "step": 9450 }, { "epoch": 2.2930730117537004, "grad_norm": 0.25390625, "learning_rate": 2.6663648709823454e-05, "loss": 2.1036, "num_input_tokens_seen": 4589393665, "step": 9500 }, { "epoch": 2.2930730117537004, "eval_loss": 1.9684821367263794, "eval_mean_token_accuracy": 0.5784456487953707, "eval_num_tokens": 1933999749.0, "eval_runtime": 130.3401, "eval_samples_per_second": 82.185, "eval_steps_per_second": 20.546, "num_input_tokens_seen": 4589393665, "step": 9500 }, { "epoch": 2.3051435641323534, "grad_norm": 0.2373046875, "learning_rate": 2.647502640712238e-05, "loss": 2.1091, "mean_token_accuracy": 0.5529859235696495, "num_input_tokens_seen": 4613609921, "num_tokens": 1944210855.0, "step": 9550 }, { "epoch": 2.317214116511007, "grad_norm": 0.2490234375, "learning_rate": 2.6286404104421307e-05, "loss": 2.0976, "mean_token_accuracy": 0.554888856895268, "num_input_tokens_seen": 4637474321, "num_tokens": 1954258079.0, "step": 9600 }, { "epoch": 2.32928466888966, "grad_norm": 0.25, "learning_rate": 2.609778180172024e-05, "loss": 2.1061, "mean_token_accuracy": 0.5531089297309518, "num_input_tokens_seen": 4661687841, "num_tokens": 1964447306.0, "step": 9650 }, { "epoch": 2.341355221268313, "grad_norm": 0.283203125, "learning_rate": 2.5909159499019165e-05, "loss": 2.0972, "mean_token_accuracy": 0.5547297456115484, "num_input_tokens_seen": 4685886657, "num_tokens": 1974672380.0, "step": 9700 }, { "epoch": 2.3534257736469666, "grad_norm": 0.275390625, "learning_rate": 2.5720537196318097e-05, "loss": 2.0874, "mean_token_accuracy": 0.556226581223309, "num_input_tokens_seen": 4710004273, "num_tokens": 1984832310.0, "step": 9750 }, { "epoch": 2.3654963260256197, "grad_norm": 0.2392578125, "learning_rate": 2.5531914893617022e-05, "loss": 2.096, "mean_token_accuracy": 0.5547980547696352, "num_input_tokens_seen": 4734271009, "num_tokens": 1995090784.0, "step": 9800 }, { "epoch": 2.377566878404273, "grad_norm": 0.28515625, "learning_rate": 2.534329259091595e-05, "loss": 2.0871, "mean_token_accuracy": 0.5552258058264852, "num_input_tokens_seen": 4758291265, "num_tokens": 2005240317.0, "step": 9850 }, { "epoch": 2.3896374307829262, "grad_norm": 0.2470703125, "learning_rate": 2.5154670288214883e-05, "loss": 2.0865, "mean_token_accuracy": 0.5557247434183955, "num_input_tokens_seen": 4782472097, "num_tokens": 2015507708.0, "step": 9900 }, { "epoch": 2.4017079831615793, "grad_norm": 0.2421875, "learning_rate": 2.4966047985513808e-05, "loss": 2.1074, "mean_token_accuracy": 0.5527091028168798, "num_input_tokens_seen": 4806608113, "num_tokens": 2025820931.0, "step": 9950 }, { "epoch": 2.413778535540233, "grad_norm": 0.2421875, "learning_rate": 2.477742568281274e-05, "loss": 2.1001, "num_input_tokens_seen": 4830743425, "step": 10000 }, { "epoch": 2.413778535540233, "eval_loss": 1.9683291912078857, "eval_mean_token_accuracy": 0.5784874623550952, "eval_num_tokens": 2035904188.0, "eval_runtime": 130.7093, "eval_samples_per_second": 81.953, "eval_steps_per_second": 20.488, "num_input_tokens_seen": 4830743425, "step": 10000 }, { "epoch": 2.425849087918886, "grad_norm": 0.263671875, "learning_rate": 2.4588803380111665e-05, "loss": 2.102, "mean_token_accuracy": 0.5541372266598046, "num_input_tokens_seen": 4855099809, "num_tokens": 2046084283.0, "step": 10050 }, { "epoch": 2.437919640297539, "grad_norm": 0.26171875, "learning_rate": 2.4400181077410594e-05, "loss": 2.0991, "mean_token_accuracy": 0.5542299181595445, "num_input_tokens_seen": 4879214129, "num_tokens": 2056326330.0, "step": 10100 }, { "epoch": 2.4499901926761924, "grad_norm": 0.25390625, "learning_rate": 2.4211558774709522e-05, "loss": 2.0834, "mean_token_accuracy": 0.5564426334574819, "num_input_tokens_seen": 4903399553, "num_tokens": 2066490013.0, "step": 10150 }, { "epoch": 2.4620607450548455, "grad_norm": 0.263671875, "learning_rate": 2.402293647200845e-05, "loss": 2.098, "mean_token_accuracy": 0.5545364746823906, "num_input_tokens_seen": 4927492609, "num_tokens": 2076526539.0, "step": 10200 }, { "epoch": 2.474131297433499, "grad_norm": 0.23828125, "learning_rate": 2.383431416930738e-05, "loss": 2.0885, "mean_token_accuracy": 0.555601441822946, "num_input_tokens_seen": 4951732929, "num_tokens": 2086768431.0, "step": 10250 }, { "epoch": 2.486201849812152, "grad_norm": 0.255859375, "learning_rate": 2.3645691866606308e-05, "loss": 2.0909, "mean_token_accuracy": 0.5558399046584964, "num_input_tokens_seen": 4975948097, "num_tokens": 2096961030.0, "step": 10300 }, { "epoch": 2.498272402190805, "grad_norm": 0.326171875, "learning_rate": 2.3457069563905237e-05, "loss": 2.0906, "mean_token_accuracy": 0.5556136939302087, "num_input_tokens_seen": 5000143905, "num_tokens": 2107303887.0, "step": 10350 }, { "epoch": 2.5103429545694587, "grad_norm": 0.267578125, "learning_rate": 2.3268447261204166e-05, "loss": 2.0976, "mean_token_accuracy": 0.5541230865567922, "num_input_tokens_seen": 5024212113, "num_tokens": 2117576166.0, "step": 10400 }, { "epoch": 2.5224135069481117, "grad_norm": 0.29296875, "learning_rate": 2.3079824958503094e-05, "loss": 2.0935, "mean_token_accuracy": 0.5555445018038153, "num_input_tokens_seen": 5048313681, "num_tokens": 2127734721.0, "step": 10450 }, { "epoch": 2.534484059326765, "grad_norm": 0.2421875, "learning_rate": 2.2891202655802023e-05, "loss": 2.0982, "num_input_tokens_seen": 5072508817, "step": 10500 }, { "epoch": 2.534484059326765, "eval_loss": 1.9683516025543213, "eval_mean_token_accuracy": 0.5784807712440619, "eval_num_tokens": 2137987548.0, "eval_runtime": 130.4075, "eval_samples_per_second": 82.143, "eval_steps_per_second": 20.536, "num_input_tokens_seen": 5072508817, "step": 10500 }, { "epoch": 2.5465546117054183, "grad_norm": 0.267578125, "learning_rate": 2.270258035310095e-05, "loss": 2.0924, "mean_token_accuracy": 0.5551841219887137, "num_input_tokens_seen": 5096586577, "num_tokens": 2148155987.0, "step": 10550 }, { "epoch": 2.5586251640840714, "grad_norm": 0.2734375, "learning_rate": 2.251395805039988e-05, "loss": 2.0982, "mean_token_accuracy": 0.5541262343525887, "num_input_tokens_seen": 5120875729, "num_tokens": 2158352820.0, "step": 10600 }, { "epoch": 2.5706957164627244, "grad_norm": 0.251953125, "learning_rate": 2.232533574769881e-05, "loss": 2.0908, "mean_token_accuracy": 0.5560182608664036, "num_input_tokens_seen": 5145050353, "num_tokens": 2168407807.0, "step": 10650 }, { "epoch": 2.582766268841378, "grad_norm": 0.2734375, "learning_rate": 2.2136713444997737e-05, "loss": 2.0958, "mean_token_accuracy": 0.5551287305355072, "num_input_tokens_seen": 5169266849, "num_tokens": 2178592858.0, "step": 10700 }, { "epoch": 2.594836821220031, "grad_norm": 0.2451171875, "learning_rate": 2.1948091142296666e-05, "loss": 2.0904, "mean_token_accuracy": 0.5559819753468037, "num_input_tokens_seen": 5193472705, "num_tokens": 2188792925.0, "step": 10750 }, { "epoch": 2.606907373598684, "grad_norm": 0.2578125, "learning_rate": 2.1759468839595595e-05, "loss": 2.1003, "mean_token_accuracy": 0.5538398388028145, "num_input_tokens_seen": 5217541665, "num_tokens": 2199063266.0, "step": 10800 }, { "epoch": 2.6189779259773376, "grad_norm": 0.2578125, "learning_rate": 2.1570846536894523e-05, "loss": 2.0996, "mean_token_accuracy": 0.5539121518284083, "num_input_tokens_seen": 5241669153, "num_tokens": 2209236507.0, "step": 10850 }, { "epoch": 2.6310484783559906, "grad_norm": 0.2412109375, "learning_rate": 2.1382224234193452e-05, "loss": 2.0898, "mean_token_accuracy": 0.5560731103271246, "num_input_tokens_seen": 5265851553, "num_tokens": 2219375503.0, "step": 10900 }, { "epoch": 2.643119030734644, "grad_norm": 0.255859375, "learning_rate": 2.119360193149238e-05, "loss": 2.0887, "mean_token_accuracy": 0.5559511515125632, "num_input_tokens_seen": 5290120305, "num_tokens": 2229631896.0, "step": 10950 }, { "epoch": 2.6551895831132972, "grad_norm": 0.267578125, "learning_rate": 2.100497962879131e-05, "loss": 2.0941, "num_input_tokens_seen": 5314253297, "step": 11000 }, { "epoch": 2.6551895831132972, "eval_loss": 1.9683243036270142, "eval_mean_token_accuracy": 0.5784822298106727, "eval_num_tokens": 2239778564.0, "eval_runtime": 131.1903, "eval_samples_per_second": 81.652, "eval_steps_per_second": 20.413, "num_input_tokens_seen": 5314253297, "step": 11000 }, { "epoch": 2.6672601354919503, "grad_norm": 0.275390625, "learning_rate": 2.0816357326090238e-05, "loss": 2.0981, "mean_token_accuracy": 0.5546219968609511, "num_input_tokens_seen": 5338466017, "num_tokens": 2249988177.0, "step": 11050 }, { "epoch": 2.679330687870604, "grad_norm": 0.388671875, "learning_rate": 2.0627735023389166e-05, "loss": 2.0921, "mean_token_accuracy": 0.5557056156918406, "num_input_tokens_seen": 5362616753, "num_tokens": 2260143864.0, "step": 11100 }, { "epoch": 2.691401240249257, "grad_norm": 0.296875, "learning_rate": 2.0439112720688095e-05, "loss": 2.0957, "mean_token_accuracy": 0.555001782849431, "num_input_tokens_seen": 5386751937, "num_tokens": 2270269015.0, "step": 11150 }, { "epoch": 2.7034717926279104, "grad_norm": 0.2734375, "learning_rate": 2.0250490417987024e-05, "loss": 2.0879, "mean_token_accuracy": 0.5558207688108087, "num_input_tokens_seen": 5410821777, "num_tokens": 2280359409.0, "step": 11200 }, { "epoch": 2.7155423450065634, "grad_norm": 0.255859375, "learning_rate": 2.0061868115285952e-05, "loss": 2.0862, "mean_token_accuracy": 0.5561293217167258, "num_input_tokens_seen": 5435109553, "num_tokens": 2290676732.0, "step": 11250 }, { "epoch": 2.7276128973852165, "grad_norm": 0.2890625, "learning_rate": 1.987324581258488e-05, "loss": 2.096, "mean_token_accuracy": 0.5545283930376173, "num_input_tokens_seen": 5459182481, "num_tokens": 2300830078.0, "step": 11300 }, { "epoch": 2.73968344976387, "grad_norm": 0.271484375, "learning_rate": 1.968462350988381e-05, "loss": 2.0944, "mean_token_accuracy": 0.5546667322888971, "num_input_tokens_seen": 5483343233, "num_tokens": 2310996379.0, "step": 11350 }, { "epoch": 2.751754002142523, "grad_norm": 0.2373046875, "learning_rate": 1.9496001207182738e-05, "loss": 2.0962, "mean_token_accuracy": 0.5551789667457342, "num_input_tokens_seen": 5507547265, "num_tokens": 2321208987.0, "step": 11400 }, { "epoch": 2.763824554521176, "grad_norm": 0.25390625, "learning_rate": 1.9307378904481667e-05, "loss": 2.0824, "mean_token_accuracy": 0.5572770998999477, "num_input_tokens_seen": 5531617505, "num_tokens": 2331304919.0, "step": 11450 }, { "epoch": 2.7758951068998297, "grad_norm": 0.2578125, "learning_rate": 1.9118756601780595e-05, "loss": 2.1095, "num_input_tokens_seen": 5555689025, "step": 11500 }, { "epoch": 2.7758951068998297, "eval_loss": 1.9681649208068848, "eval_mean_token_accuracy": 0.5785138376329798, "eval_num_tokens": 2341489447.0, "eval_runtime": 130.3192, "eval_samples_per_second": 82.198, "eval_steps_per_second": 20.55, "num_input_tokens_seen": 5555689025, "step": 11500 }, { "epoch": 2.7879656592784827, "grad_norm": 0.275390625, "learning_rate": 1.8930134299079524e-05, "loss": 2.0913, "mean_token_accuracy": 0.553754635732621, "num_input_tokens_seen": 5579808081, "num_tokens": 2351725815.0, "step": 11550 }, { "epoch": 2.800036211657136, "grad_norm": 0.27734375, "learning_rate": 1.8741511996378453e-05, "loss": 2.0961, "mean_token_accuracy": 0.5546263293549418, "num_input_tokens_seen": 5604043089, "num_tokens": 2362015471.0, "step": 11600 }, { "epoch": 2.8121067640357893, "grad_norm": 0.267578125, "learning_rate": 1.855288969367738e-05, "loss": 2.0887, "mean_token_accuracy": 0.555464554913342, "num_input_tokens_seen": 5628213761, "num_tokens": 2372168279.0, "step": 11650 }, { "epoch": 2.8241773164144424, "grad_norm": 0.263671875, "learning_rate": 1.836426739097631e-05, "loss": 2.1049, "mean_token_accuracy": 0.5529934700578452, "num_input_tokens_seen": 5652297233, "num_tokens": 2382280922.0, "step": 11700 }, { "epoch": 2.8362478687930954, "grad_norm": 0.27734375, "learning_rate": 1.817564508827524e-05, "loss": 2.1025, "mean_token_accuracy": 0.5528503654524684, "num_input_tokens_seen": 5676338817, "num_tokens": 2392486600.0, "step": 11750 }, { "epoch": 2.848318421171749, "grad_norm": 0.25, "learning_rate": 1.7987022785574167e-05, "loss": 2.1018, "mean_token_accuracy": 0.5537810071185231, "num_input_tokens_seen": 5700543889, "num_tokens": 2402663993.0, "step": 11800 }, { "epoch": 2.860388973550402, "grad_norm": 0.267578125, "learning_rate": 1.7798400482873096e-05, "loss": 2.0891, "mean_token_accuracy": 0.5560761171206832, "num_input_tokens_seen": 5724764289, "num_tokens": 2412816351.0, "step": 11850 }, { "epoch": 2.872459525929055, "grad_norm": 0.24609375, "learning_rate": 1.7609778180172024e-05, "loss": 2.0974, "mean_token_accuracy": 0.5541906878352165, "num_input_tokens_seen": 5749009809, "num_tokens": 2423083966.0, "step": 11900 }, { "epoch": 2.8845300783077086, "grad_norm": 0.265625, "learning_rate": 1.7421155877470953e-05, "loss": 2.0996, "mean_token_accuracy": 0.5539322036504746, "num_input_tokens_seen": 5773128241, "num_tokens": 2433212460.0, "step": 11950 }, { "epoch": 2.8966006306863616, "grad_norm": 0.26171875, "learning_rate": 1.723253357476988e-05, "loss": 2.0986, "num_input_tokens_seen": 5797304337, "step": 12000 }, { "epoch": 2.8966006306863616, "eval_loss": 1.9681628942489624, "eval_mean_token_accuracy": 0.5785647708146406, "eval_num_tokens": 2443385171.0, "eval_runtime": 130.5866, "eval_samples_per_second": 82.03, "eval_steps_per_second": 20.507, "num_input_tokens_seen": 5797304337, "step": 12000 }, { "epoch": 2.908671183065015, "grad_norm": 0.2578125, "learning_rate": 1.704391127206881e-05, "loss": 2.0958, "mean_token_accuracy": 0.5546299646422267, "num_input_tokens_seen": 5821442385, "num_tokens": 2453453845.0, "step": 12050 }, { "epoch": 2.920741735443668, "grad_norm": 0.26171875, "learning_rate": 1.685528896936774e-05, "loss": 2.0926, "mean_token_accuracy": 0.5549974143505096, "num_input_tokens_seen": 5845686961, "num_tokens": 2463776050.0, "step": 12100 }, { "epoch": 2.9328122878223217, "grad_norm": 0.263671875, "learning_rate": 1.6666666666666667e-05, "loss": 2.1015, "mean_token_accuracy": 0.5541527543962002, "num_input_tokens_seen": 5869745137, "num_tokens": 2473828195.0, "step": 12150 }, { "epoch": 2.944882840200975, "grad_norm": 0.26953125, "learning_rate": 1.6478044363965596e-05, "loss": 2.1041, "mean_token_accuracy": 0.5541104365140199, "num_input_tokens_seen": 5893803025, "num_tokens": 2483915340.0, "step": 12200 }, { "epoch": 2.956953392579628, "grad_norm": 0.2333984375, "learning_rate": 1.6289422061264525e-05, "loss": 2.0922, "mean_token_accuracy": 0.5555301706120371, "num_input_tokens_seen": 5918068641, "num_tokens": 2494208052.0, "step": 12250 }, { "epoch": 2.9690239449582814, "grad_norm": 0.2490234375, "learning_rate": 1.6100799758563453e-05, "loss": 2.0938, "mean_token_accuracy": 0.5548020200431347, "num_input_tokens_seen": 5942257041, "num_tokens": 2504393372.0, "step": 12300 }, { "epoch": 2.9810944973369344, "grad_norm": 0.2890625, "learning_rate": 1.5912177455862382e-05, "loss": 2.0843, "mean_token_accuracy": 0.5566597804427147, "num_input_tokens_seen": 5966422081, "num_tokens": 2514627798.0, "step": 12350 }, { "epoch": 2.9931650497155875, "grad_norm": 0.2734375, "learning_rate": 1.572355515316131e-05, "loss": 2.0886, "mean_token_accuracy": 0.5566082544624805, "num_input_tokens_seen": 5990574321, "num_tokens": 2524805007.0, "step": 12400 }, { "epoch": 3.005069631999034, "grad_norm": 0.26171875, "learning_rate": 1.553493285046024e-05, "loss": 2.1001, "mean_token_accuracy": 0.5549402527407246, "num_input_tokens_seen": 6014380145, "num_tokens": 2534738802.0, "step": 12450 }, { "epoch": 3.0171401843776877, "grad_norm": 0.2314453125, "learning_rate": 1.5346310547759168e-05, "loss": 2.092, "num_input_tokens_seen": 6038556753, "step": 12500 }, { "epoch": 3.0171401843776877, "eval_loss": 1.9681233167648315, "eval_mean_token_accuracy": 0.5784891846355349, "eval_num_tokens": 2544886437.0, "eval_runtime": 130.6689, "eval_samples_per_second": 81.978, "eval_steps_per_second": 20.495, "num_input_tokens_seen": 6038556753, "step": 12500 }, { "epoch": 3.029210736756341, "grad_norm": 0.25390625, "learning_rate": 1.5157688245058096e-05, "loss": 2.0925, "mean_token_accuracy": 0.5550393326207995, "num_input_tokens_seen": 6062857617, "num_tokens": 2555112151.0, "step": 12550 }, { "epoch": 3.041281289134994, "grad_norm": 0.38671875, "learning_rate": 1.4969065942357025e-05, "loss": 2.0957, "mean_token_accuracy": 0.5551863227039575, "num_input_tokens_seen": 6087077841, "num_tokens": 2565388515.0, "step": 12600 }, { "epoch": 3.0533518415136474, "grad_norm": 0.279296875, "learning_rate": 1.4780443639655952e-05, "loss": 2.0858, "mean_token_accuracy": 0.5563259933143854, "num_input_tokens_seen": 6111161617, "num_tokens": 2575504513.0, "step": 12650 }, { "epoch": 3.0654223938923004, "grad_norm": 0.25, "learning_rate": 1.4591821336954884e-05, "loss": 2.101, "mean_token_accuracy": 0.5549140437319875, "num_input_tokens_seen": 6135170369, "num_tokens": 2585570498.0, "step": 12700 }, { "epoch": 3.077492946270954, "grad_norm": 0.263671875, "learning_rate": 1.4403199034253811e-05, "loss": 2.0935, "mean_token_accuracy": 0.5543564364686608, "num_input_tokens_seen": 6159397985, "num_tokens": 2595740107.0, "step": 12750 }, { "epoch": 3.089563498649607, "grad_norm": 0.265625, "learning_rate": 1.421457673155274e-05, "loss": 2.0928, "mean_token_accuracy": 0.5548016136884689, "num_input_tokens_seen": 6183511137, "num_tokens": 2605900153.0, "step": 12800 }, { "epoch": 3.10163405102826, "grad_norm": 0.2890625, "learning_rate": 1.4025954428851668e-05, "loss": 2.0862, "mean_token_accuracy": 0.5555924268066883, "num_input_tokens_seen": 6207630993, "num_tokens": 2616105592.0, "step": 12850 }, { "epoch": 3.1137046034069136, "grad_norm": 0.248046875, "learning_rate": 1.3837332126150595e-05, "loss": 2.0938, "mean_token_accuracy": 0.554584386125207, "num_input_tokens_seen": 6231763217, "num_tokens": 2626268256.0, "step": 12900 }, { "epoch": 3.1257751557855666, "grad_norm": 0.251953125, "learning_rate": 1.3648709823449527e-05, "loss": 2.1042, "mean_token_accuracy": 0.553115917481482, "num_input_tokens_seen": 6255995041, "num_tokens": 2636461653.0, "step": 12950 }, { "epoch": 3.1378457081642197, "grad_norm": 0.25390625, "learning_rate": 1.3460087520748454e-05, "loss": 2.0952, "num_input_tokens_seen": 6280158129, "step": 13000 }, { "epoch": 3.1378457081642197, "eval_loss": 1.9681209325790405, "eval_mean_token_accuracy": 0.5785721040555485, "eval_num_tokens": 2646712354.0, "eval_runtime": 130.3881, "eval_samples_per_second": 82.155, "eval_steps_per_second": 20.539, "num_input_tokens_seen": 6280158129, "step": 13000 }, { "epoch": 3.149916260542873, "grad_norm": 0.25390625, "learning_rate": 1.3271465218047383e-05, "loss": 2.0974, "mean_token_accuracy": 0.5548031070828437, "num_input_tokens_seen": 6304365713, "num_tokens": 2656912031.0, "step": 13050 }, { "epoch": 3.1619868129215263, "grad_norm": 0.24609375, "learning_rate": 1.3082842915346311e-05, "loss": 2.0981, "mean_token_accuracy": 0.5543636172637343, "num_input_tokens_seen": 6328561217, "num_tokens": 2667181848.0, "step": 13100 }, { "epoch": 3.1740573653001793, "grad_norm": 0.236328125, "learning_rate": 1.2894220612645238e-05, "loss": 2.093, "mean_token_accuracy": 0.5551713344082236, "num_input_tokens_seen": 6352657569, "num_tokens": 2677374089.0, "step": 13150 }, { "epoch": 3.186127917678833, "grad_norm": 0.267578125, "learning_rate": 1.2705598309944169e-05, "loss": 2.084, "mean_token_accuracy": 0.5568741805478931, "num_input_tokens_seen": 6376750801, "num_tokens": 2687517529.0, "step": 13200 }, { "epoch": 3.198198470057486, "grad_norm": 0.2578125, "learning_rate": 1.2516976007243097e-05, "loss": 2.0985, "mean_token_accuracy": 0.5545465455949307, "num_input_tokens_seen": 6400738145, "num_tokens": 2697615714.0, "step": 13250 }, { "epoch": 3.2102690224361394, "grad_norm": 0.2451171875, "learning_rate": 1.2328353704542026e-05, "loss": 2.0969, "mean_token_accuracy": 0.5544571406021714, "num_input_tokens_seen": 6424909057, "num_tokens": 2707784293.0, "step": 13300 }, { "epoch": 3.2223395748147925, "grad_norm": 0.302734375, "learning_rate": 1.2139731401840953e-05, "loss": 2.0932, "mean_token_accuracy": 0.5548350306227803, "num_input_tokens_seen": 6449111825, "num_tokens": 2717984302.0, "step": 13350 }, { "epoch": 3.2344101271934456, "grad_norm": 0.228515625, "learning_rate": 1.1951109099139883e-05, "loss": 2.1012, "mean_token_accuracy": 0.5535725425183773, "num_input_tokens_seen": 6473257953, "num_tokens": 2728233467.0, "step": 13400 }, { "epoch": 3.246480679572099, "grad_norm": 0.2578125, "learning_rate": 1.1762486796438812e-05, "loss": 2.0985, "mean_token_accuracy": 0.5541856496781111, "num_input_tokens_seen": 6497464865, "num_tokens": 2738326366.0, "step": 13450 }, { "epoch": 3.258551231950752, "grad_norm": 0.2412109375, "learning_rate": 1.157386449373774e-05, "loss": 2.0911, "num_input_tokens_seen": 6521634753, "step": 13500 }, { "epoch": 3.258551231950752, "eval_loss": 1.9680596590042114, "eval_mean_token_accuracy": 0.5785238554199033, "eval_num_tokens": 2748403907.0, "eval_runtime": 130.2372, "eval_samples_per_second": 82.25, "eval_steps_per_second": 20.562, "num_input_tokens_seen": 6521634753, "step": 13500 }, { "epoch": 3.270621784329405, "grad_norm": 0.251953125, "learning_rate": 1.1385242191036669e-05, "loss": 2.0844, "mean_token_accuracy": 0.5562084444984794, "num_input_tokens_seen": 6545823777, "num_tokens": 2758638062.0, "step": 13550 }, { "epoch": 3.2826923367080587, "grad_norm": 0.24609375, "learning_rate": 1.1196619888335598e-05, "loss": 2.089, "mean_token_accuracy": 0.5565486250445246, "num_input_tokens_seen": 6569949777, "num_tokens": 2768698376.0, "step": 13600 }, { "epoch": 3.2947628890867118, "grad_norm": 0.2431640625, "learning_rate": 1.1007997585634526e-05, "loss": 2.0915, "mean_token_accuracy": 0.5548499751463533, "num_input_tokens_seen": 6593997425, "num_tokens": 2778806953.0, "step": 13650 }, { "epoch": 3.306833441465365, "grad_norm": 0.330078125, "learning_rate": 1.0819375282933455e-05, "loss": 2.0875, "mean_token_accuracy": 0.5560770154371858, "num_input_tokens_seen": 6618153121, "num_tokens": 2789046249.0, "step": 13700 }, { "epoch": 3.3189039938440184, "grad_norm": 0.26171875, "learning_rate": 1.0630752980232384e-05, "loss": 2.0974, "mean_token_accuracy": 0.5540758088976144, "num_input_tokens_seen": 6642228561, "num_tokens": 2799134100.0, "step": 13750 }, { "epoch": 3.3309745462226714, "grad_norm": 0.2578125, "learning_rate": 1.0442130677531312e-05, "loss": 2.0837, "mean_token_accuracy": 0.5564264697581529, "num_input_tokens_seen": 6666487089, "num_tokens": 2809333203.0, "step": 13800 }, { "epoch": 3.343045098601325, "grad_norm": 0.271484375, "learning_rate": 1.025350837483024e-05, "loss": 2.0804, "mean_token_accuracy": 0.5564664682373405, "num_input_tokens_seen": 6690592209, "num_tokens": 2819507621.0, "step": 13850 }, { "epoch": 3.355115650979978, "grad_norm": 0.2578125, "learning_rate": 1.006488607212917e-05, "loss": 2.0875, "mean_token_accuracy": 0.5563617146387696, "num_input_tokens_seen": 6714782033, "num_tokens": 2829715451.0, "step": 13900 }, { "epoch": 3.367186203358631, "grad_norm": 0.26171875, "learning_rate": 9.876263769428096e-06, "loss": 2.1015, "mean_token_accuracy": 0.5533242063969374, "num_input_tokens_seen": 6738954721, "num_tokens": 2839876349.0, "step": 13950 }, { "epoch": 3.3792567557372846, "grad_norm": 0.2578125, "learning_rate": 9.687641466727027e-06, "loss": 2.1018, "num_input_tokens_seen": 6763271617, "step": 14000 }, { "epoch": 3.3792567557372846, "eval_loss": 1.9681081771850586, "eval_mean_token_accuracy": 0.5785279828634967, "eval_num_tokens": 2850147053.0, "eval_runtime": 131.6179, "eval_samples_per_second": 81.387, "eval_steps_per_second": 20.347, "num_input_tokens_seen": 6763271617, "step": 14000 }, { "epoch": 3.3913273081159376, "grad_norm": 0.25, "learning_rate": 9.499019164025955e-06, "loss": 2.0975, "mean_token_accuracy": 0.5536782286874949, "num_input_tokens_seen": 6787391841, "num_tokens": 2860344977.0, "step": 14050 }, { "epoch": 3.4033978604945907, "grad_norm": 0.25, "learning_rate": 9.310396861324884e-06, "loss": 2.1022, "mean_token_accuracy": 0.5538700968772173, "num_input_tokens_seen": 6811630961, "num_tokens": 2870526506.0, "step": 14100 }, { "epoch": 3.415468412873244, "grad_norm": 0.2431640625, "learning_rate": 9.121774558623813e-06, "loss": 2.0934, "mean_token_accuracy": 0.5550757900252938, "num_input_tokens_seen": 6835811825, "num_tokens": 2880722898.0, "step": 14150 }, { "epoch": 3.4275389652518973, "grad_norm": 0.2578125, "learning_rate": 8.93315225592274e-06, "loss": 2.0875, "mean_token_accuracy": 0.5558257311582565, "num_input_tokens_seen": 6859918049, "num_tokens": 2890925546.0, "step": 14200 }, { "epoch": 3.439609517630551, "grad_norm": 0.2294921875, "learning_rate": 8.74452995322167e-06, "loss": 2.0969, "mean_token_accuracy": 0.5544555878639221, "num_input_tokens_seen": 6883973409, "num_tokens": 2901007248.0, "step": 14250 }, { "epoch": 3.451680070009204, "grad_norm": 0.25390625, "learning_rate": 8.555907650520598e-06, "loss": 2.0987, "mean_token_accuracy": 0.5544828617200256, "num_input_tokens_seen": 6908263985, "num_tokens": 2911355124.0, "step": 14300 }, { "epoch": 3.463750622387857, "grad_norm": 0.271484375, "learning_rate": 8.367285347819527e-06, "loss": 2.0889, "mean_token_accuracy": 0.5557316156104207, "num_input_tokens_seen": 6932344993, "num_tokens": 2921442830.0, "step": 14350 }, { "epoch": 3.4758211747665104, "grad_norm": 0.255859375, "learning_rate": 8.178663045118456e-06, "loss": 2.0979, "mean_token_accuracy": 0.5547628674656153, "num_input_tokens_seen": 6956417041, "num_tokens": 2931461417.0, "step": 14400 }, { "epoch": 3.4878917271451635, "grad_norm": 0.234375, "learning_rate": 7.990040742417383e-06, "loss": 2.1005, "mean_token_accuracy": 0.5539160283654928, "num_input_tokens_seen": 6980421889, "num_tokens": 2941531928.0, "step": 14450 }, { "epoch": 3.4999622795238166, "grad_norm": 0.275390625, "learning_rate": 7.801418439716313e-06, "loss": 2.1017, "num_input_tokens_seen": 7004552193, "step": 14500 }, { "epoch": 3.4999622795238166, "eval_loss": 1.9681284427642822, "eval_mean_token_accuracy": 0.5785388401566912, "eval_num_tokens": 2951727207.0, "eval_runtime": 131.2276, "eval_samples_per_second": 81.629, "eval_steps_per_second": 20.407, "num_input_tokens_seen": 7004552193, "step": 14500 }, { "epoch": 3.51203283190247, "grad_norm": 0.267578125, "learning_rate": 7.612796137015241e-06, "loss": 2.09, "mean_token_accuracy": 0.5543626462481916, "num_input_tokens_seen": 7028775953, "num_tokens": 2961945579.0, "step": 14550 }, { "epoch": 3.524103384281123, "grad_norm": 0.26171875, "learning_rate": 7.42417383431417e-06, "loss": 2.0978, "mean_token_accuracy": 0.5544422981515527, "num_input_tokens_seen": 7052883457, "num_tokens": 2972173798.0, "step": 14600 }, { "epoch": 3.536173936659776, "grad_norm": 0.251953125, "learning_rate": 7.235551531613098e-06, "loss": 2.0915, "mean_token_accuracy": 0.5559014651551842, "num_input_tokens_seen": 7077135185, "num_tokens": 2982315453.0, "step": 14650 }, { "epoch": 3.5482444890384297, "grad_norm": 0.310546875, "learning_rate": 7.0469292289120274e-06, "loss": 2.0932, "mean_token_accuracy": 0.5552764968574047, "num_input_tokens_seen": 7101260305, "num_tokens": 2992557355.0, "step": 14700 }, { "epoch": 3.5603150414170828, "grad_norm": 0.25390625, "learning_rate": 6.858306926210955e-06, "loss": 2.0959, "mean_token_accuracy": 0.555088207796216, "num_input_tokens_seen": 7125198545, "num_tokens": 3002657117.0, "step": 14750 }, { "epoch": 3.572385593795736, "grad_norm": 0.2314453125, "learning_rate": 6.669684623509884e-06, "loss": 2.0933, "mean_token_accuracy": 0.5554985254630447, "num_input_tokens_seen": 7149297905, "num_tokens": 3012818977.0, "step": 14800 }, { "epoch": 3.5844561461743893, "grad_norm": 0.23828125, "learning_rate": 6.481062320808813e-06, "loss": 2.0901, "mean_token_accuracy": 0.5556722393259406, "num_input_tokens_seen": 7173408417, "num_tokens": 3022993500.0, "step": 14850 }, { "epoch": 3.5965266985530424, "grad_norm": 0.279296875, "learning_rate": 6.292440018107741e-06, "loss": 2.0862, "mean_token_accuracy": 0.5560053834319114, "num_input_tokens_seen": 7197689201, "num_tokens": 3033239485.0, "step": 14900 }, { "epoch": 3.608597250931696, "grad_norm": 0.265625, "learning_rate": 6.10381771540667e-06, "loss": 2.093, "mean_token_accuracy": 0.5550377672165632, "num_input_tokens_seen": 7221805553, "num_tokens": 3043356374.0, "step": 14950 }, { "epoch": 3.620667803310349, "grad_norm": 0.24609375, "learning_rate": 5.915195412705598e-06, "loss": 2.0994, "num_input_tokens_seen": 7245951473, "step": 15000 }, { "epoch": 3.620667803310349, "eval_loss": 1.9680702686309814, "eval_mean_token_accuracy": 0.5785124528710748, "eval_num_tokens": 3053564564.0, "eval_runtime": 130.6855, "eval_samples_per_second": 81.968, "eval_steps_per_second": 20.492, "num_input_tokens_seen": 7245951473, "step": 15000 }, { "epoch": 3.632738355689002, "grad_norm": 0.248046875, "learning_rate": 5.726573110004527e-06, "loss": 2.0923, "mean_token_accuracy": 0.554327048882842, "num_input_tokens_seen": 7269914417, "num_tokens": 3063722766.0, "step": 15050 }, { "epoch": 3.6448089080676556, "grad_norm": 0.26953125, "learning_rate": 5.5379508073034565e-06, "loss": 2.0861, "mean_token_accuracy": 0.5561711810901762, "num_input_tokens_seen": 7293956449, "num_tokens": 3073837103.0, "step": 15100 }, { "epoch": 3.6568794604463086, "grad_norm": 0.255859375, "learning_rate": 5.349328504602384e-06, "loss": 2.0949, "mean_token_accuracy": 0.5547518468275666, "num_input_tokens_seen": 7318063713, "num_tokens": 3083990662.0, "step": 15150 }, { "epoch": 3.668950012824962, "grad_norm": 0.26171875, "learning_rate": 5.160706201901313e-06, "loss": 2.0874, "mean_token_accuracy": 0.555564073510468, "num_input_tokens_seen": 7342169889, "num_tokens": 3094140237.0, "step": 15200 }, { "epoch": 3.681020565203615, "grad_norm": 0.298828125, "learning_rate": 4.9720838992002415e-06, "loss": 2.1014, "mean_token_accuracy": 0.5540463343262673, "num_input_tokens_seen": 7366510321, "num_tokens": 3104349542.0, "step": 15250 }, { "epoch": 3.6930911175822683, "grad_norm": 0.349609375, "learning_rate": 4.78346159649917e-06, "loss": 2.1128, "mean_token_accuracy": 0.5523605942726135, "num_input_tokens_seen": 7390741489, "num_tokens": 3114615388.0, "step": 15300 }, { "epoch": 3.7051616699609218, "grad_norm": 0.265625, "learning_rate": 4.594839293798099e-06, "loss": 2.0836, "mean_token_accuracy": 0.5556057692691684, "num_input_tokens_seen": 7414960897, "num_tokens": 3124852831.0, "step": 15350 }, { "epoch": 3.717232222339575, "grad_norm": 0.234375, "learning_rate": 4.406216991097027e-06, "loss": 2.0832, "mean_token_accuracy": 0.557135313116014, "num_input_tokens_seen": 7439141473, "num_tokens": 3135118289.0, "step": 15400 }, { "epoch": 3.729302774718228, "grad_norm": 0.3671875, "learning_rate": 4.217594688395956e-06, "loss": 2.0993, "mean_token_accuracy": 0.554307484254241, "num_input_tokens_seen": 7463114529, "num_tokens": 3145345547.0, "step": 15450 }, { "epoch": 3.7413733270968814, "grad_norm": 0.27734375, "learning_rate": 4.028972385694885e-06, "loss": 2.0916, "num_input_tokens_seen": 7487502401, "step": 15500 }, { "epoch": 3.7413733270968814, "eval_loss": 1.9680771827697754, "eval_mean_token_accuracy": 0.5784905481975056, "eval_num_tokens": 3155655029.0, "eval_runtime": 131.7036, "eval_samples_per_second": 81.334, "eval_steps_per_second": 20.334, "num_input_tokens_seen": 7487502401, "step": 15500 }, { "epoch": 3.7534438794755345, "grad_norm": 0.28125, "learning_rate": 3.840350082993813e-06, "loss": 2.0919, "mean_token_accuracy": 0.5555992320179939, "num_input_tokens_seen": 7511609537, "num_tokens": 3165956280.0, "step": 15550 }, { "epoch": 3.7655144318541875, "grad_norm": 0.265625, "learning_rate": 3.6517277802927423e-06, "loss": 2.096, "mean_token_accuracy": 0.5547948920354248, "num_input_tokens_seen": 7535873665, "num_tokens": 3176126007.0, "step": 15600 }, { "epoch": 3.777584984232841, "grad_norm": 0.25390625, "learning_rate": 3.463105477591671e-06, "loss": 2.0963, "mean_token_accuracy": 0.5549294283241033, "num_input_tokens_seen": 7560110225, "num_tokens": 3186367919.0, "step": 15650 }, { "epoch": 3.789655536611494, "grad_norm": 0.28125, "learning_rate": 3.274483174890599e-06, "loss": 2.0986, "mean_token_accuracy": 0.5547161266207695, "num_input_tokens_seen": 7584157249, "num_tokens": 3196441494.0, "step": 15700 }, { "epoch": 3.801726088990147, "grad_norm": 0.2470703125, "learning_rate": 3.0858608721895278e-06, "loss": 2.0952, "mean_token_accuracy": 0.554626210257411, "num_input_tokens_seen": 7608282737, "num_tokens": 3206560665.0, "step": 15750 }, { "epoch": 3.8137966413688007, "grad_norm": 0.265625, "learning_rate": 2.8972385694884564e-06, "loss": 2.0874, "mean_token_accuracy": 0.5568051477894187, "num_input_tokens_seen": 7632420385, "num_tokens": 3216638197.0, "step": 15800 }, { "epoch": 3.8258671937474538, "grad_norm": 0.357421875, "learning_rate": 2.708616266787385e-06, "loss": 2.0935, "mean_token_accuracy": 0.5554833044111729, "num_input_tokens_seen": 7656546049, "num_tokens": 3226721685.0, "step": 15850 }, { "epoch": 3.837937746126107, "grad_norm": 0.216796875, "learning_rate": 2.5199939640863136e-06, "loss": 2.1087, "mean_token_accuracy": 0.5525853624939918, "num_input_tokens_seen": 7680650689, "num_tokens": 3236963270.0, "step": 15900 }, { "epoch": 3.8500082985047603, "grad_norm": 0.267578125, "learning_rate": 2.3313716613852423e-06, "loss": 2.0952, "mean_token_accuracy": 0.5550886183232069, "num_input_tokens_seen": 7704888257, "num_tokens": 3247161827.0, "step": 15950 }, { "epoch": 3.8620788508834134, "grad_norm": 0.2333984375, "learning_rate": 2.142749358684171e-06, "loss": 2.0914, "num_input_tokens_seen": 7729128049, "step": 16000 }, { "epoch": 3.8620788508834134, "eval_loss": 1.9680593013763428, "eval_mean_token_accuracy": 0.5785026788667034, "eval_num_tokens": 3257367290.0, "eval_runtime": 130.8705, "eval_samples_per_second": 81.852, "eval_steps_per_second": 20.463, "num_input_tokens_seen": 7729128049, "step": 16000 }, { "epoch": 3.874149403262067, "grad_norm": 0.26953125, "learning_rate": 1.9541270559830995e-06, "loss": 2.0974, "mean_token_accuracy": 0.5552676925435662, "num_input_tokens_seen": 7753351249, "num_tokens": 3267712597.0, "step": 16050 }, { "epoch": 3.88621995564072, "grad_norm": 0.27734375, "learning_rate": 1.7655047532820282e-06, "loss": 2.0956, "mean_token_accuracy": 0.5543999705091118, "num_input_tokens_seen": 7777499137, "num_tokens": 3277902761.0, "step": 16100 }, { "epoch": 3.8982905080193735, "grad_norm": 0.25, "learning_rate": 1.576882450580957e-06, "loss": 2.0949, "mean_token_accuracy": 0.5548696434870363, "num_input_tokens_seen": 7801633089, "num_tokens": 3288082221.0, "step": 16150 }, { "epoch": 3.9103610603980266, "grad_norm": 0.24609375, "learning_rate": 1.3882601478798854e-06, "loss": 2.0986, "mean_token_accuracy": 0.5540741100907326, "num_input_tokens_seen": 7825738481, "num_tokens": 3298341216.0, "step": 16200 }, { "epoch": 3.9224316127766796, "grad_norm": 0.263671875, "learning_rate": 1.199637845178814e-06, "loss": 2.0917, "mean_token_accuracy": 0.556021711602807, "num_input_tokens_seen": 7849877409, "num_tokens": 3308431697.0, "step": 16250 }, { "epoch": 3.934502165155333, "grad_norm": 0.255859375, "learning_rate": 1.0110155424777427e-06, "loss": 2.0983, "mean_token_accuracy": 0.5542013296857476, "num_input_tokens_seen": 7873955105, "num_tokens": 3318516095.0, "step": 16300 }, { "epoch": 3.946572717533986, "grad_norm": 0.2490234375, "learning_rate": 8.223932397766712e-07, "loss": 2.1049, "mean_token_accuracy": 0.553213356398046, "num_input_tokens_seen": 7898153793, "num_tokens": 3328721862.0, "step": 16350 }, { "epoch": 3.9586432699126393, "grad_norm": 0.2734375, "learning_rate": 6.337709370755999e-07, "loss": 2.0981, "mean_token_accuracy": 0.5537924468889832, "num_input_tokens_seen": 7922266993, "num_tokens": 3338965333.0, "step": 16400 }, { "epoch": 3.9707138222912928, "grad_norm": 0.2431640625, "learning_rate": 4.4514863437452844e-07, "loss": 2.0952, "mean_token_accuracy": 0.5555017331615091, "num_input_tokens_seen": 7946376977, "num_tokens": 3349129194.0, "step": 16450 }, { "epoch": 3.982784374669946, "grad_norm": 0.279296875, "learning_rate": 2.565263316734571e-07, "loss": 2.0973, "num_input_tokens_seen": 7970557393, "step": 16500 }, { "epoch": 3.982784374669946, "eval_loss": 1.9680447578430176, "eval_mean_token_accuracy": 0.5785172289325018, "eval_num_tokens": 3359357384.0, "eval_runtime": 131.3028, "eval_samples_per_second": 81.582, "eval_steps_per_second": 20.396, "num_input_tokens_seen": 7970557393, "step": 16500 }, { "epoch": 3.994854927048599, "grad_norm": 0.2392578125, "learning_rate": 6.79040289723857e-08, "loss": 2.0972, "mean_token_accuracy": 0.5545667923986912, "num_input_tokens_seen": 7994787313, "num_tokens": 3369677528.0, "step": 16550 } ], "logging_steps": 50, "max_steps": 16568, "num_input_tokens_seen": 8003589233, "num_train_epochs": 4, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.141038234858414e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }