smallm_70_instruct / last-checkpoint /trainer_state.json
Azrail's picture
Training in progress, step 16568, checkpoint
30a1120 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.9992003259049143,
"eval_steps": 500,
"global_step": 16568,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.012070552378653229,
"grad_norm": 1.5390625,
"learning_rate": 7.543753771876886e-07,
"loss": 2.6793,
"mean_token_accuracy": 0.5008124655112625,
"num_input_tokens_seen": 24135392,
"num_tokens": 10166805.0,
"step": 50
},
{
"epoch": 0.024141104757306457,
"grad_norm": 1.578125,
"learning_rate": 1.5087507543753772e-06,
"loss": 2.683,
"mean_token_accuracy": 0.5005724773928523,
"num_input_tokens_seen": 48153152,
"num_tokens": 20365412.0,
"step": 100
},
{
"epoch": 0.03621165713595968,
"grad_norm": 4.25,
"learning_rate": 2.2631261315630656e-06,
"loss": 2.6827,
"mean_token_accuracy": 0.5006869393214584,
"num_input_tokens_seen": 72337344,
"num_tokens": 30424121.0,
"step": 150
},
{
"epoch": 0.048282209514612914,
"grad_norm": 1.6640625,
"learning_rate": 3.0175015087507544e-06,
"loss": 2.6854,
"mean_token_accuracy": 0.49996432337909935,
"num_input_tokens_seen": 96540720,
"num_tokens": 40650709.0,
"step": 200
},
{
"epoch": 0.06035276189326614,
"grad_norm": 1.671875,
"learning_rate": 3.771876885938443e-06,
"loss": 2.6589,
"mean_token_accuracy": 0.5033125403895974,
"num_input_tokens_seen": 120664016,
"num_tokens": 50830609.0,
"step": 250
},
{
"epoch": 0.07242331427191936,
"grad_norm": 1.4375,
"learning_rate": 4.526252263126131e-06,
"loss": 2.6724,
"mean_token_accuracy": 0.5016581188514828,
"num_input_tokens_seen": 144779584,
"num_tokens": 60898431.0,
"step": 300
},
{
"epoch": 0.0844938666505726,
"grad_norm": 1.484375,
"learning_rate": 5.280627640313821e-06,
"loss": 2.6515,
"mean_token_accuracy": 0.5036601684615016,
"num_input_tokens_seen": 168931312,
"num_tokens": 71195944.0,
"step": 350
},
{
"epoch": 0.09656441902922583,
"grad_norm": 1.40625,
"learning_rate": 6.035003017501509e-06,
"loss": 2.6474,
"mean_token_accuracy": 0.5041869094967842,
"num_input_tokens_seen": 192934592,
"num_tokens": 81381172.0,
"step": 400
},
{
"epoch": 0.10863497140787905,
"grad_norm": 1.3359375,
"learning_rate": 6.789378394689197e-06,
"loss": 2.6561,
"mean_token_accuracy": 0.5037067172303796,
"num_input_tokens_seen": 217010448,
"num_tokens": 91569720.0,
"step": 450
},
{
"epoch": 0.12070552378653228,
"grad_norm": 1.2421875,
"learning_rate": 7.543753771876886e-06,
"loss": 2.6454,
"num_input_tokens_seen": 241172672,
"step": 500
},
{
"epoch": 0.12070552378653228,
"eval_loss": 2.550887107849121,
"eval_mean_token_accuracy": 0.5239145074365208,
"eval_num_tokens": 101801812.0,
"eval_runtime": 125.6328,
"eval_samples_per_second": 85.264,
"eval_steps_per_second": 21.316,
"num_input_tokens_seen": 241172672,
"step": 500
},
{
"epoch": 0.1327760761651855,
"grad_norm": 1.234375,
"learning_rate": 8.298129149064575e-06,
"loss": 2.6413,
"mean_token_accuracy": 0.5040716730989516,
"num_input_tokens_seen": 265089232,
"num_tokens": 111840270.0,
"step": 550
},
{
"epoch": 0.14484662854383873,
"grad_norm": 1.0859375,
"learning_rate": 9.052504526252262e-06,
"loss": 2.6279,
"mean_token_accuracy": 0.5059382322058081,
"num_input_tokens_seen": 289296624,
"num_tokens": 121987982.0,
"step": 600
},
{
"epoch": 0.15691718092249196,
"grad_norm": 0.93359375,
"learning_rate": 9.806879903439953e-06,
"loss": 2.6104,
"mean_token_accuracy": 0.5081973234936595,
"num_input_tokens_seen": 313326128,
"num_tokens": 132144416.0,
"step": 650
},
{
"epoch": 0.1689877333011452,
"grad_norm": 0.8828125,
"learning_rate": 1.0561255280627642e-05,
"loss": 2.611,
"mean_token_accuracy": 0.5080171688646078,
"num_input_tokens_seen": 337553504,
"num_tokens": 142378323.0,
"step": 700
},
{
"epoch": 0.18105828567979843,
"grad_norm": 0.76171875,
"learning_rate": 1.1315630657815329e-05,
"loss": 2.6039,
"mean_token_accuracy": 0.5081416912004352,
"num_input_tokens_seen": 361961104,
"num_tokens": 152623392.0,
"step": 750
},
{
"epoch": 0.19312883805845166,
"grad_norm": 0.7421875,
"learning_rate": 1.2070006035003018e-05,
"loss": 2.5936,
"mean_token_accuracy": 0.5093595118448139,
"num_input_tokens_seen": 386105904,
"num_tokens": 162827766.0,
"step": 800
},
{
"epoch": 0.2051993904371049,
"grad_norm": 0.86328125,
"learning_rate": 1.2824381412190706e-05,
"loss": 2.5887,
"mean_token_accuracy": 0.5100815147906542,
"num_input_tokens_seen": 410248672,
"num_tokens": 172979899.0,
"step": 850
},
{
"epoch": 0.2172699428157581,
"grad_norm": 0.81640625,
"learning_rate": 1.3578756789378394e-05,
"loss": 2.5903,
"mean_token_accuracy": 0.5090210309624672,
"num_input_tokens_seen": 434357040,
"num_tokens": 183138615.0,
"step": 900
},
{
"epoch": 0.22934049519441133,
"grad_norm": 0.69921875,
"learning_rate": 1.4333132166566086e-05,
"loss": 2.5816,
"mean_token_accuracy": 0.5105240726843476,
"num_input_tokens_seen": 458609136,
"num_tokens": 193253896.0,
"step": 950
},
{
"epoch": 0.24141104757306456,
"grad_norm": 0.64453125,
"learning_rate": 1.5087507543753773e-05,
"loss": 2.558,
"num_input_tokens_seen": 482701824,
"step": 1000
},
{
"epoch": 0.24141104757306456,
"eval_loss": 2.4748051166534424,
"eval_mean_token_accuracy": 0.529798322766908,
"eval_num_tokens": 203431197.0,
"eval_runtime": 125.5514,
"eval_samples_per_second": 85.32,
"eval_steps_per_second": 21.33,
"num_input_tokens_seen": 482701824,
"step": 1000
},
{
"epoch": 0.2534815999517178,
"grad_norm": 0.62890625,
"learning_rate": 1.584188292094146e-05,
"loss": 2.557,
"mean_token_accuracy": 0.5126943441852927,
"num_input_tokens_seen": 506798000,
"num_tokens": 213604239.0,
"step": 1050
},
{
"epoch": 0.265552152330371,
"grad_norm": 0.578125,
"learning_rate": 1.659625829812915e-05,
"loss": 2.5533,
"mean_token_accuracy": 0.5126326360553503,
"num_input_tokens_seen": 530786496,
"num_tokens": 223786306.0,
"step": 1100
},
{
"epoch": 0.2776227047090242,
"grad_norm": 0.56640625,
"learning_rate": 1.7350633675316838e-05,
"loss": 2.5528,
"mean_token_accuracy": 0.5130741761997342,
"num_input_tokens_seen": 554798768,
"num_tokens": 233857737.0,
"step": 1150
},
{
"epoch": 0.28969325708767746,
"grad_norm": 0.5234375,
"learning_rate": 1.8105009052504525e-05,
"loss": 2.5433,
"mean_token_accuracy": 0.5140750538557768,
"num_input_tokens_seen": 578877696,
"num_tokens": 243994162.0,
"step": 1200
},
{
"epoch": 0.3017638094663307,
"grad_norm": 0.671875,
"learning_rate": 1.8859384429692215e-05,
"loss": 2.5414,
"mean_token_accuracy": 0.5151798555627465,
"num_input_tokens_seen": 603141184,
"num_tokens": 254196491.0,
"step": 1250
},
{
"epoch": 0.3138343618449839,
"grad_norm": 0.59375,
"learning_rate": 1.9613759806879906e-05,
"loss": 2.5383,
"mean_token_accuracy": 0.5148227337375283,
"num_input_tokens_seen": 627221936,
"num_tokens": 264309463.0,
"step": 1300
},
{
"epoch": 0.32590491422363715,
"grad_norm": 0.498046875,
"learning_rate": 2.0368135184067593e-05,
"loss": 2.5292,
"mean_token_accuracy": 0.5156710411980748,
"num_input_tokens_seen": 651386848,
"num_tokens": 274462595.0,
"step": 1350
},
{
"epoch": 0.3379754666022904,
"grad_norm": 0.51171875,
"learning_rate": 2.1122510561255283e-05,
"loss": 2.523,
"mean_token_accuracy": 0.5161588852107525,
"num_input_tokens_seen": 675579984,
"num_tokens": 284636254.0,
"step": 1400
},
{
"epoch": 0.3500460189809436,
"grad_norm": 6.375,
"learning_rate": 2.187688593844297e-05,
"loss": 2.528,
"mean_token_accuracy": 0.5156370849534869,
"num_input_tokens_seen": 699751424,
"num_tokens": 294907617.0,
"step": 1450
},
{
"epoch": 0.36211657135959685,
"grad_norm": 0.68359375,
"learning_rate": 2.2631261315630658e-05,
"loss": 2.5232,
"num_input_tokens_seen": 723883888,
"step": 1500
},
{
"epoch": 0.36211657135959685,
"eval_loss": 2.42679500579834,
"eval_mean_token_accuracy": 0.5353444569074368,
"eval_num_tokens": 305087605.0,
"eval_runtime": 125.481,
"eval_samples_per_second": 85.367,
"eval_steps_per_second": 21.342,
"num_input_tokens_seen": 723883888,
"step": 1500
},
{
"epoch": 0.3741871237382501,
"grad_norm": 0.455078125,
"learning_rate": 2.3385636692818348e-05,
"loss": 2.5178,
"mean_token_accuracy": 0.5161234551295638,
"num_input_tokens_seen": 748106064,
"num_tokens": 315240015.0,
"step": 1550
},
{
"epoch": 0.3862576761169033,
"grad_norm": 0.5078125,
"learning_rate": 2.4140012070006035e-05,
"loss": 2.5157,
"mean_token_accuracy": 0.5167792574688792,
"num_input_tokens_seen": 772242320,
"num_tokens": 325393577.0,
"step": 1600
},
{
"epoch": 0.39832822849555655,
"grad_norm": 0.54296875,
"learning_rate": 2.4894387447193726e-05,
"loss": 2.505,
"mean_token_accuracy": 0.5181788290664554,
"num_input_tokens_seen": 796536976,
"num_tokens": 335636146.0,
"step": 1650
},
{
"epoch": 0.4103987808742098,
"grad_norm": 0.50390625,
"learning_rate": 2.5648762824381413e-05,
"loss": 2.5034,
"mean_token_accuracy": 0.5191374982148409,
"num_input_tokens_seen": 820727792,
"num_tokens": 345721073.0,
"step": 1700
},
{
"epoch": 0.422469333252863,
"grad_norm": 0.40234375,
"learning_rate": 2.64031382015691e-05,
"loss": 2.5049,
"mean_token_accuracy": 0.5170740441232919,
"num_input_tokens_seen": 844900704,
"num_tokens": 355941787.0,
"step": 1750
},
{
"epoch": 0.4345398856315162,
"grad_norm": 0.46484375,
"learning_rate": 2.7157513578756787e-05,
"loss": 2.4944,
"mean_token_accuracy": 0.5191365649551153,
"num_input_tokens_seen": 868882304,
"num_tokens": 366140276.0,
"step": 1800
},
{
"epoch": 0.4466104380101694,
"grad_norm": 0.4609375,
"learning_rate": 2.791188895594448e-05,
"loss": 2.4852,
"mean_token_accuracy": 0.519771606773138,
"num_input_tokens_seen": 893201120,
"num_tokens": 376397263.0,
"step": 1850
},
{
"epoch": 0.45868099038882265,
"grad_norm": 0.42578125,
"learning_rate": 2.866626433313217e-05,
"loss": 2.4242,
"mean_token_accuracy": 0.5218987537547946,
"num_input_tokens_seen": 917474880,
"num_tokens": 386613084.0,
"step": 1900
},
{
"epoch": 0.4707515427674759,
"grad_norm": 0.486328125,
"learning_rate": 2.942063971031986e-05,
"loss": 2.4118,
"mean_token_accuracy": 0.5205260647833347,
"num_input_tokens_seen": 941694304,
"num_tokens": 396813412.0,
"step": 1950
},
{
"epoch": 0.4828220951461291,
"grad_norm": 0.421875,
"learning_rate": 3.0175015087507546e-05,
"loss": 2.3856,
"num_input_tokens_seen": 965894272,
"step": 2000
},
{
"epoch": 0.4828220951461291,
"eval_loss": 2.281926155090332,
"eval_mean_token_accuracy": 0.5452906315098778,
"eval_num_tokens": 407043305.0,
"eval_runtime": 126.3646,
"eval_samples_per_second": 84.771,
"eval_steps_per_second": 21.193,
"num_input_tokens_seen": 965894272,
"step": 2000
},
{
"epoch": 0.49489264752478235,
"grad_norm": 0.55859375,
"learning_rate": 3.092939046469523e-05,
"loss": 2.3729,
"mean_token_accuracy": 0.5244628588855267,
"num_input_tokens_seen": 990100416,
"num_tokens": 417281294.0,
"step": 2050
},
{
"epoch": 0.5069631999034356,
"grad_norm": 0.46484375,
"learning_rate": 3.168376584188292e-05,
"loss": 2.365,
"mean_token_accuracy": 0.5261798568814993,
"num_input_tokens_seen": 1014101792,
"num_tokens": 427442066.0,
"step": 2100
},
{
"epoch": 0.5190337522820888,
"grad_norm": 0.4296875,
"learning_rate": 3.2438141219070614e-05,
"loss": 2.3679,
"mean_token_accuracy": 0.5253909171745181,
"num_input_tokens_seen": 1038463120,
"num_tokens": 437736421.0,
"step": 2150
},
{
"epoch": 0.531104304660742,
"grad_norm": 0.404296875,
"learning_rate": 3.31925165962583e-05,
"loss": 2.3664,
"mean_token_accuracy": 0.5255175845324993,
"num_input_tokens_seen": 1062548608,
"num_tokens": 447839969.0,
"step": 2200
},
{
"epoch": 0.5431748570393953,
"grad_norm": 0.376953125,
"learning_rate": 3.394689197344599e-05,
"loss": 2.3566,
"mean_token_accuracy": 0.5265485693141818,
"num_input_tokens_seen": 1086859296,
"num_tokens": 458069337.0,
"step": 2250
},
{
"epoch": 0.5552454094180485,
"grad_norm": 0.376953125,
"learning_rate": 3.4701267350633675e-05,
"loss": 2.3424,
"mean_token_accuracy": 0.529197344481945,
"num_input_tokens_seen": 1110954480,
"num_tokens": 468219132.0,
"step": 2300
},
{
"epoch": 0.5673159617967017,
"grad_norm": 0.376953125,
"learning_rate": 3.545564272782136e-05,
"loss": 2.3344,
"mean_token_accuracy": 0.5297643894702196,
"num_input_tokens_seen": 1135145632,
"num_tokens": 478383651.0,
"step": 2350
},
{
"epoch": 0.5793865141753549,
"grad_norm": 0.3984375,
"learning_rate": 3.621001810500905e-05,
"loss": 2.3403,
"mean_token_accuracy": 0.5286615265905857,
"num_input_tokens_seen": 1159223072,
"num_tokens": 488558846.0,
"step": 2400
},
{
"epoch": 0.5914570665540082,
"grad_norm": 0.38671875,
"learning_rate": 3.696439348219674e-05,
"loss": 2.3366,
"mean_token_accuracy": 0.5282911998406052,
"num_input_tokens_seen": 1183346592,
"num_tokens": 498693280.0,
"step": 2450
},
{
"epoch": 0.6035276189326614,
"grad_norm": 0.36328125,
"learning_rate": 3.771876885938443e-05,
"loss": 2.3361,
"num_input_tokens_seen": 1207430512,
"step": 2500
},
{
"epoch": 0.6035276189326614,
"eval_loss": 2.2252955436706543,
"eval_mean_token_accuracy": 0.5513953688186527,
"eval_num_tokens": 508899754.0,
"eval_runtime": 125.696,
"eval_samples_per_second": 85.221,
"eval_steps_per_second": 21.305,
"num_input_tokens_seen": 1207430512,
"step": 2500
},
{
"epoch": 0.6155981713113147,
"grad_norm": 0.33984375,
"learning_rate": 3.8473144236572124e-05,
"loss": 2.3191,
"mean_token_accuracy": 0.5302580918744206,
"num_input_tokens_seen": 1231510064,
"num_tokens": 519053973.0,
"step": 2550
},
{
"epoch": 0.6276687236899678,
"grad_norm": 0.40234375,
"learning_rate": 3.922751961375981e-05,
"loss": 2.3184,
"mean_token_accuracy": 0.5312636430934071,
"num_input_tokens_seen": 1255661376,
"num_tokens": 529292647.0,
"step": 2600
},
{
"epoch": 0.6397392760686211,
"grad_norm": 0.361328125,
"learning_rate": 3.99818949909475e-05,
"loss": 2.3149,
"mean_token_accuracy": 0.5317899576947093,
"num_input_tokens_seen": 1279915488,
"num_tokens": 539406430.0,
"step": 2650
},
{
"epoch": 0.6518098284472743,
"grad_norm": 0.37890625,
"learning_rate": 4.0736270368135186e-05,
"loss": 2.3226,
"mean_token_accuracy": 0.530639638863504,
"num_input_tokens_seen": 1303974000,
"num_tokens": 549560793.0,
"step": 2700
},
{
"epoch": 0.6638803808259276,
"grad_norm": 0.3046875,
"learning_rate": 4.149064574532287e-05,
"loss": 2.3027,
"mean_token_accuracy": 0.5339162700995803,
"num_input_tokens_seen": 1328099056,
"num_tokens": 559731643.0,
"step": 2750
},
{
"epoch": 0.6759509332045808,
"grad_norm": 0.353515625,
"learning_rate": 4.224502112251057e-05,
"loss": 2.3041,
"mean_token_accuracy": 0.5334529640898108,
"num_input_tokens_seen": 1352139632,
"num_tokens": 569834852.0,
"step": 2800
},
{
"epoch": 0.688021485583234,
"grad_norm": 0.333984375,
"learning_rate": 4.2999396499698254e-05,
"loss": 2.2978,
"mean_token_accuracy": 0.5343039923906326,
"num_input_tokens_seen": 1376232240,
"num_tokens": 579992251.0,
"step": 2850
},
{
"epoch": 0.7000920379618872,
"grad_norm": 0.330078125,
"learning_rate": 4.375377187688594e-05,
"loss": 2.2834,
"mean_token_accuracy": 0.5361328301951289,
"num_input_tokens_seen": 1400431104,
"num_tokens": 590199795.0,
"step": 2900
},
{
"epoch": 0.7121625903405404,
"grad_norm": 0.337890625,
"learning_rate": 4.450814725407363e-05,
"loss": 2.301,
"mean_token_accuracy": 0.5332023718208074,
"num_input_tokens_seen": 1424615616,
"num_tokens": 600407039.0,
"step": 2950
},
{
"epoch": 0.7242331427191937,
"grad_norm": 0.291015625,
"learning_rate": 4.5262522631261315e-05,
"loss": 2.2846,
"num_input_tokens_seen": 1448897616,
"step": 3000
},
{
"epoch": 0.7242331427191937,
"eval_loss": 2.174436569213867,
"eval_mean_token_accuracy": 0.5580886014145288,
"eval_num_tokens": 610496575.0,
"eval_runtime": 125.6585,
"eval_samples_per_second": 85.247,
"eval_steps_per_second": 21.312,
"num_input_tokens_seen": 1448897616,
"step": 3000
},
{
"epoch": 0.7363036950978469,
"grad_norm": 0.298828125,
"learning_rate": 4.6016898008449e-05,
"loss": 2.2866,
"mean_token_accuracy": 0.535948946569115,
"num_input_tokens_seen": 1472853872,
"num_tokens": 620644723.0,
"step": 3050
},
{
"epoch": 0.7483742474765002,
"grad_norm": 0.296875,
"learning_rate": 4.6771273385636696e-05,
"loss": 2.2713,
"mean_token_accuracy": 0.5381996771320701,
"num_input_tokens_seen": 1497112768,
"num_tokens": 630831881.0,
"step": 3100
},
{
"epoch": 0.7604447998551533,
"grad_norm": 0.318359375,
"learning_rate": 4.752564876282438e-05,
"loss": 2.2659,
"mean_token_accuracy": 0.5379517000168562,
"num_input_tokens_seen": 1521326656,
"num_tokens": 640973141.0,
"step": 3150
},
{
"epoch": 0.7725153522338066,
"grad_norm": 0.27734375,
"learning_rate": 4.828002414001207e-05,
"loss": 2.2534,
"mean_token_accuracy": 0.5404072028771043,
"num_input_tokens_seen": 1545505712,
"num_tokens": 651205945.0,
"step": 3200
},
{
"epoch": 0.7845859046124598,
"grad_norm": 0.267578125,
"learning_rate": 4.903439951719976e-05,
"loss": 2.2662,
"mean_token_accuracy": 0.5395008590817452,
"num_input_tokens_seen": 1569597408,
"num_tokens": 661422296.0,
"step": 3250
},
{
"epoch": 0.7966564569911131,
"grad_norm": 0.306640625,
"learning_rate": 4.978877489438745e-05,
"loss": 2.2429,
"mean_token_accuracy": 0.5423036898300052,
"num_input_tokens_seen": 1593795440,
"num_tokens": 671551427.0,
"step": 3300
},
{
"epoch": 0.8087270093697663,
"grad_norm": 0.29296875,
"learning_rate": 4.9864191942055236e-05,
"loss": 2.234,
"mean_token_accuracy": 0.5444167210906744,
"num_input_tokens_seen": 1618048688,
"num_tokens": 681790219.0,
"step": 3350
},
{
"epoch": 0.8207975617484196,
"grad_norm": 0.306640625,
"learning_rate": 4.967556963935416e-05,
"loss": 2.2466,
"mean_token_accuracy": 0.5417763916775584,
"num_input_tokens_seen": 1642159776,
"num_tokens": 691978554.0,
"step": 3400
},
{
"epoch": 0.8328681141270727,
"grad_norm": 0.30859375,
"learning_rate": 4.9486947336653086e-05,
"loss": 2.2358,
"mean_token_accuracy": 0.5441526301577687,
"num_input_tokens_seen": 1666410544,
"num_tokens": 702184762.0,
"step": 3450
},
{
"epoch": 0.844938666505726,
"grad_norm": 0.2578125,
"learning_rate": 4.929832503395201e-05,
"loss": 2.2274,
"num_input_tokens_seen": 1690571888,
"step": 3500
},
{
"epoch": 0.844938666505726,
"eval_loss": 2.110778331756592,
"eval_mean_token_accuracy": 0.5686311067219009,
"eval_num_tokens": 712362296.0,
"eval_runtime": 127.3714,
"eval_samples_per_second": 84.1,
"eval_steps_per_second": 21.025,
"num_input_tokens_seen": 1690571888,
"step": 3500
},
{
"epoch": 0.8570092188843792,
"grad_norm": 0.283203125,
"learning_rate": 4.9109702731250944e-05,
"loss": 2.2322,
"mean_token_accuracy": 0.5444289642199874,
"num_input_tokens_seen": 1714787776,
"num_tokens": 722682302.0,
"step": 3550
},
{
"epoch": 0.8690797712630324,
"grad_norm": 0.3203125,
"learning_rate": 4.8921080428549876e-05,
"loss": 2.2194,
"mean_token_accuracy": 0.5459688815101981,
"num_input_tokens_seen": 1739009552,
"num_tokens": 732823822.0,
"step": 3600
},
{
"epoch": 0.8811503236416857,
"grad_norm": 0.267578125,
"learning_rate": 4.87324581258488e-05,
"loss": 2.2139,
"mean_token_accuracy": 0.5467682545632124,
"num_input_tokens_seen": 1763209840,
"num_tokens": 743138600.0,
"step": 3650
},
{
"epoch": 0.8932208760203388,
"grad_norm": 0.328125,
"learning_rate": 4.854383582314773e-05,
"loss": 2.204,
"mean_token_accuracy": 0.5477216844260693,
"num_input_tokens_seen": 1787295680,
"num_tokens": 753284868.0,
"step": 3700
},
{
"epoch": 0.9052914283989921,
"grad_norm": 0.306640625,
"learning_rate": 4.835521352044666e-05,
"loss": 2.186,
"mean_token_accuracy": 0.5463542007282376,
"num_input_tokens_seen": 1811501840,
"num_tokens": 763533047.0,
"step": 3750
},
{
"epoch": 0.9173619807776453,
"grad_norm": 0.294921875,
"learning_rate": 4.816659121774559e-05,
"loss": 2.1705,
"mean_token_accuracy": 0.5472249809652567,
"num_input_tokens_seen": 1835579680,
"num_tokens": 773772552.0,
"step": 3800
},
{
"epoch": 0.9294325331562986,
"grad_norm": 0.2578125,
"learning_rate": 4.797796891504452e-05,
"loss": 2.1472,
"mean_token_accuracy": 0.5502070318907499,
"num_input_tokens_seen": 1859762928,
"num_tokens": 783996309.0,
"step": 3850
},
{
"epoch": 0.9415030855349518,
"grad_norm": 0.30078125,
"learning_rate": 4.778934661234345e-05,
"loss": 2.1494,
"mean_token_accuracy": 0.548779489658773,
"num_input_tokens_seen": 1883948656,
"num_tokens": 794252324.0,
"step": 3900
},
{
"epoch": 0.953573637913605,
"grad_norm": 0.29296875,
"learning_rate": 4.760072430964237e-05,
"loss": 2.1484,
"mean_token_accuracy": 0.5490481401607394,
"num_input_tokens_seen": 1908219840,
"num_tokens": 804538128.0,
"step": 3950
},
{
"epoch": 0.9656441902922582,
"grad_norm": 0.291015625,
"learning_rate": 4.7412102006941305e-05,
"loss": 2.1447,
"num_input_tokens_seen": 1932223680,
"step": 4000
},
{
"epoch": 0.9656441902922582,
"eval_loss": 2.0152089595794678,
"eval_mean_token_accuracy": 0.5734592423989222,
"eval_num_tokens": 814660681.0,
"eval_runtime": 126.585,
"eval_samples_per_second": 84.623,
"eval_steps_per_second": 21.156,
"num_input_tokens_seen": 1932223680,
"step": 4000
},
{
"epoch": 0.9777147426709115,
"grad_norm": 0.298828125,
"learning_rate": 4.722347970424023e-05,
"loss": 2.1476,
"mean_token_accuracy": 0.5491000188142061,
"num_input_tokens_seen": 1956345120,
"num_tokens": 824824558.0,
"step": 4050
},
{
"epoch": 0.9897852950495647,
"grad_norm": 0.2890625,
"learning_rate": 4.703485740153916e-05,
"loss": 2.1336,
"mean_token_accuracy": 0.5505272497236728,
"num_input_tokens_seen": 1980539728,
"num_tokens": 835004823.0,
"step": 4100
},
{
"epoch": 1.0016898773330114,
"grad_norm": 0.2890625,
"learning_rate": 4.684623509883809e-05,
"loss": 2.1376,
"mean_token_accuracy": 0.5500758526367531,
"num_input_tokens_seen": 2004388912,
"num_tokens": 844972763.0,
"step": 4150
},
{
"epoch": 1.0137604297116647,
"grad_norm": 0.275390625,
"learning_rate": 4.665761279613702e-05,
"loss": 2.1349,
"mean_token_accuracy": 0.5500743924826383,
"num_input_tokens_seen": 2028622064,
"num_tokens": 855126584.0,
"step": 4200
},
{
"epoch": 1.025830982090318,
"grad_norm": 0.283203125,
"learning_rate": 4.646899049343595e-05,
"loss": 2.1248,
"mean_token_accuracy": 0.5514456473290921,
"num_input_tokens_seen": 2052718336,
"num_tokens": 865332386.0,
"step": 4250
},
{
"epoch": 1.037901534468971,
"grad_norm": 0.28125,
"learning_rate": 4.6280368190734876e-05,
"loss": 2.1088,
"mean_token_accuracy": 0.5532256289571523,
"num_input_tokens_seen": 2076571680,
"num_tokens": 875448332.0,
"step": 4300
},
{
"epoch": 1.0499720868476243,
"grad_norm": 0.326171875,
"learning_rate": 4.60917458880338e-05,
"loss": 2.1184,
"mean_token_accuracy": 0.5509732039645314,
"num_input_tokens_seen": 2100726912,
"num_tokens": 885623694.0,
"step": 4350
},
{
"epoch": 1.0620426392262776,
"grad_norm": 0.310546875,
"learning_rate": 4.590312358533273e-05,
"loss": 2.1324,
"mean_token_accuracy": 0.5498137963563203,
"num_input_tokens_seen": 2124980016,
"num_tokens": 895774422.0,
"step": 4400
},
{
"epoch": 1.074113191604931,
"grad_norm": 0.32421875,
"learning_rate": 4.571450128263166e-05,
"loss": 2.1195,
"mean_token_accuracy": 0.551388250514865,
"num_input_tokens_seen": 2149237504,
"num_tokens": 905968753.0,
"step": 4450
},
{
"epoch": 1.086183743983584,
"grad_norm": 0.296875,
"learning_rate": 4.552587897993059e-05,
"loss": 2.1195,
"num_input_tokens_seen": 2173337456,
"step": 4500
},
{
"epoch": 1.086183743983584,
"eval_loss": 1.989871859550476,
"eval_mean_token_accuracy": 0.5754866465826013,
"eval_num_tokens": 916079112.0,
"eval_runtime": 128.4454,
"eval_samples_per_second": 83.397,
"eval_steps_per_second": 20.849,
"num_input_tokens_seen": 2173337456,
"step": 4500
},
{
"epoch": 1.0982542963622373,
"grad_norm": 0.287109375,
"learning_rate": 4.5337256677229516e-05,
"loss": 2.1218,
"mean_token_accuracy": 0.5514280049689114,
"num_input_tokens_seen": 2197505712,
"num_tokens": 926254107.0,
"step": 4550
},
{
"epoch": 1.1103248487408905,
"grad_norm": 0.291015625,
"learning_rate": 4.514863437452845e-05,
"loss": 2.1132,
"mean_token_accuracy": 0.5515907733514905,
"num_input_tokens_seen": 2221716688,
"num_tokens": 936449430.0,
"step": 4600
},
{
"epoch": 1.1223954011195438,
"grad_norm": 0.296875,
"learning_rate": 4.4960012071827373e-05,
"loss": 2.1142,
"mean_token_accuracy": 0.5520639397203922,
"num_input_tokens_seen": 2245565536,
"num_tokens": 946528658.0,
"step": 4650
},
{
"epoch": 1.134465953498197,
"grad_norm": 0.2734375,
"learning_rate": 4.4771389769126305e-05,
"loss": 2.1275,
"mean_token_accuracy": 0.5497148666903376,
"num_input_tokens_seen": 2269696864,
"num_tokens": 956594209.0,
"step": 4700
},
{
"epoch": 1.1465365058768502,
"grad_norm": 0.279296875,
"learning_rate": 4.458276746642524e-05,
"loss": 2.1065,
"mean_token_accuracy": 0.5532364987954498,
"num_input_tokens_seen": 2293845360,
"num_tokens": 966701814.0,
"step": 4750
},
{
"epoch": 1.1586070582555035,
"grad_norm": 0.259765625,
"learning_rate": 4.439414516372416e-05,
"loss": 2.1133,
"mean_token_accuracy": 0.5517958915606141,
"num_input_tokens_seen": 2318062016,
"num_tokens": 976956133.0,
"step": 4800
},
{
"epoch": 1.1706776106341565,
"grad_norm": 0.314453125,
"learning_rate": 4.420552286102309e-05,
"loss": 2.1083,
"mean_token_accuracy": 0.5527382261306047,
"num_input_tokens_seen": 2342152464,
"num_tokens": 987113621.0,
"step": 4850
},
{
"epoch": 1.1827481630128098,
"grad_norm": 0.26953125,
"learning_rate": 4.401690055832201e-05,
"loss": 2.1084,
"mean_token_accuracy": 0.5531642048805953,
"num_input_tokens_seen": 2366342016,
"num_tokens": 997304128.0,
"step": 4900
},
{
"epoch": 1.1948187153914631,
"grad_norm": 0.263671875,
"learning_rate": 4.3828278255620945e-05,
"loss": 2.1129,
"mean_token_accuracy": 0.5526808862015605,
"num_input_tokens_seen": 2390580560,
"num_tokens": 1007600120.0,
"step": 4950
},
{
"epoch": 1.2068892677701164,
"grad_norm": 0.271484375,
"learning_rate": 4.363965595291988e-05,
"loss": 2.1136,
"num_input_tokens_seen": 2414871648,
"step": 5000
},
{
"epoch": 1.2068892677701164,
"eval_loss": 1.9823503494262695,
"eval_mean_token_accuracy": 0.5763351263685668,
"eval_num_tokens": 1017920689.0,
"eval_runtime": 131.1681,
"eval_samples_per_second": 81.666,
"eval_steps_per_second": 20.417,
"num_input_tokens_seen": 2414871648,
"step": 5000
},
{
"epoch": 1.2189598201487695,
"grad_norm": 0.25,
"learning_rate": 4.34510336502188e-05,
"loss": 2.108,
"mean_token_accuracy": 0.5514175926893949,
"num_input_tokens_seen": 2438963872,
"num_tokens": 1028143121.0,
"step": 5050
},
{
"epoch": 1.2310303725274228,
"grad_norm": 0.2421875,
"learning_rate": 4.3262411347517734e-05,
"loss": 2.1066,
"mean_token_accuracy": 0.5526730781793594,
"num_input_tokens_seen": 2463130960,
"num_tokens": 1038274786.0,
"step": 5100
},
{
"epoch": 1.243100924906076,
"grad_norm": 0.2353515625,
"learning_rate": 4.307378904481666e-05,
"loss": 2.1011,
"mean_token_accuracy": 0.5543517142161727,
"num_input_tokens_seen": 2487402736,
"num_tokens": 1048479252.0,
"step": 5150
},
{
"epoch": 1.2551714772847293,
"grad_norm": 0.265625,
"learning_rate": 4.288516674211559e-05,
"loss": 2.1021,
"mean_token_accuracy": 0.5538267828151584,
"num_input_tokens_seen": 2511451728,
"num_tokens": 1058650745.0,
"step": 5200
},
{
"epoch": 1.2672420296633824,
"grad_norm": 0.30859375,
"learning_rate": 4.2696544439414524e-05,
"loss": 2.0863,
"mean_token_accuracy": 0.5557815081253648,
"num_input_tokens_seen": 2535548592,
"num_tokens": 1068882104.0,
"step": 5250
},
{
"epoch": 1.2793125820420357,
"grad_norm": 0.306640625,
"learning_rate": 4.250792213671345e-05,
"loss": 2.1063,
"mean_token_accuracy": 0.5531226889789105,
"num_input_tokens_seen": 2559719664,
"num_tokens": 1079065265.0,
"step": 5300
},
{
"epoch": 1.291383134420689,
"grad_norm": 0.263671875,
"learning_rate": 4.2319299834012374e-05,
"loss": 2.1104,
"mean_token_accuracy": 0.5524419481307268,
"num_input_tokens_seen": 2584073280,
"num_tokens": 1089341978.0,
"step": 5350
},
{
"epoch": 1.303453686799342,
"grad_norm": 0.244140625,
"learning_rate": 4.21306775313113e-05,
"loss": 2.1044,
"mean_token_accuracy": 0.5532321387529373,
"num_input_tokens_seen": 2608296624,
"num_tokens": 1099642346.0,
"step": 5400
},
{
"epoch": 1.3155242391779953,
"grad_norm": 0.2412109375,
"learning_rate": 4.194205522861023e-05,
"loss": 2.1115,
"mean_token_accuracy": 0.5528482471778989,
"num_input_tokens_seen": 2632421856,
"num_tokens": 1109721280.0,
"step": 5450
},
{
"epoch": 1.3275947915566486,
"grad_norm": 0.2275390625,
"learning_rate": 4.1753432925909163e-05,
"loss": 2.1009,
"num_input_tokens_seen": 2656567344,
"step": 5500
},
{
"epoch": 1.3275947915566486,
"eval_loss": 1.9779127836227417,
"eval_mean_token_accuracy": 0.5769637392206456,
"eval_num_tokens": 1119903809.0,
"eval_runtime": 131.3767,
"eval_samples_per_second": 81.537,
"eval_steps_per_second": 20.384,
"num_input_tokens_seen": 2656567344,
"step": 5500
},
{
"epoch": 1.339665343935302,
"grad_norm": 0.26171875,
"learning_rate": 4.156481062320809e-05,
"loss": 2.1059,
"mean_token_accuracy": 0.5533521883934737,
"num_input_tokens_seen": 2680728000,
"num_tokens": 1130022087.0,
"step": 5550
},
{
"epoch": 1.3517358963139552,
"grad_norm": 0.25390625,
"learning_rate": 4.137618832050702e-05,
"loss": 2.0992,
"mean_token_accuracy": 0.5542617355659604,
"num_input_tokens_seen": 2704833792,
"num_tokens": 1140249458.0,
"step": 5600
},
{
"epoch": 1.3638064486926083,
"grad_norm": 0.267578125,
"learning_rate": 4.1187566017805946e-05,
"loss": 2.0977,
"mean_token_accuracy": 0.5540939109772444,
"num_input_tokens_seen": 2729074544,
"num_tokens": 1150474886.0,
"step": 5650
},
{
"epoch": 1.3758770010712615,
"grad_norm": 0.294921875,
"learning_rate": 4.099894371510488e-05,
"loss": 2.0995,
"mean_token_accuracy": 0.553785107024014,
"num_input_tokens_seen": 2753196608,
"num_tokens": 1160634529.0,
"step": 5700
},
{
"epoch": 1.3879475534499148,
"grad_norm": 0.26171875,
"learning_rate": 4.081032141240381e-05,
"loss": 2.1066,
"mean_token_accuracy": 0.5523933649063111,
"num_input_tokens_seen": 2777300400,
"num_tokens": 1170864545.0,
"step": 5750
},
{
"epoch": 1.400018105828568,
"grad_norm": 0.291015625,
"learning_rate": 4.0621699109702735e-05,
"loss": 2.1023,
"mean_token_accuracy": 0.5536971531435847,
"num_input_tokens_seen": 2801426672,
"num_tokens": 1181051604.0,
"step": 5800
},
{
"epoch": 1.4120886582072212,
"grad_norm": 0.267578125,
"learning_rate": 4.043307680700166e-05,
"loss": 2.1042,
"mean_token_accuracy": 0.5537538637593389,
"num_input_tokens_seen": 2825621648,
"num_tokens": 1191210311.0,
"step": 5850
},
{
"epoch": 1.4241592105858745,
"grad_norm": 0.29296875,
"learning_rate": 4.0244454504300586e-05,
"loss": 2.1221,
"mean_token_accuracy": 0.5503192816674709,
"num_input_tokens_seen": 2849863744,
"num_tokens": 1201379955.0,
"step": 5900
},
{
"epoch": 1.4362297629645275,
"grad_norm": 0.30859375,
"learning_rate": 4.005583220159952e-05,
"loss": 2.0984,
"mean_token_accuracy": 0.5546167600527405,
"num_input_tokens_seen": 2874100544,
"num_tokens": 1211495441.0,
"step": 5950
},
{
"epoch": 1.4483003153431808,
"grad_norm": 0.267578125,
"learning_rate": 3.986720989889845e-05,
"loss": 2.0976,
"num_input_tokens_seen": 2898379392,
"step": 6000
},
{
"epoch": 1.4483003153431808,
"eval_loss": 1.9750181436538696,
"eval_mean_token_accuracy": 0.5774352134075336,
"eval_num_tokens": 1221766201.0,
"eval_runtime": 130.8087,
"eval_samples_per_second": 81.891,
"eval_steps_per_second": 20.473,
"num_input_tokens_seen": 2898379392,
"step": 6000
},
{
"epoch": 1.460370867721834,
"grad_norm": 0.2734375,
"learning_rate": 3.9678587596197375e-05,
"loss": 2.1105,
"mean_token_accuracy": 0.5535502586700022,
"num_input_tokens_seen": 2922428832,
"num_tokens": 1231890670.0,
"step": 6050
},
{
"epoch": 1.4724414201004874,
"grad_norm": 0.2412109375,
"learning_rate": 3.948996529349631e-05,
"loss": 2.0925,
"mean_token_accuracy": 0.5552064320072532,
"num_input_tokens_seen": 2946628480,
"num_tokens": 1242047017.0,
"step": 6100
},
{
"epoch": 1.4845119724791407,
"grad_norm": 0.2392578125,
"learning_rate": 3.930134299079523e-05,
"loss": 2.0991,
"mean_token_accuracy": 0.5542361034452915,
"num_input_tokens_seen": 2970772896,
"num_tokens": 1252270839.0,
"step": 6150
},
{
"epoch": 1.4965825248577938,
"grad_norm": 0.248046875,
"learning_rate": 3.9112720688094164e-05,
"loss": 2.0975,
"mean_token_accuracy": 0.553666141666472,
"num_input_tokens_seen": 2995178928,
"num_tokens": 1262575010.0,
"step": 6200
},
{
"epoch": 1.508653077236447,
"grad_norm": 0.27734375,
"learning_rate": 3.8924098385393096e-05,
"loss": 2.1016,
"mean_token_accuracy": 0.5535378622636199,
"num_input_tokens_seen": 3019494064,
"num_tokens": 1272716330.0,
"step": 6250
},
{
"epoch": 1.5207236296151003,
"grad_norm": 0.251953125,
"learning_rate": 3.873547608269202e-05,
"loss": 2.1071,
"mean_token_accuracy": 0.5533343946188688,
"num_input_tokens_seen": 3043731184,
"num_tokens": 1282959106.0,
"step": 6300
},
{
"epoch": 1.5327941819937534,
"grad_norm": 0.267578125,
"learning_rate": 3.854685377999095e-05,
"loss": 2.1057,
"mean_token_accuracy": 0.5532900895178318,
"num_input_tokens_seen": 3067867536,
"num_tokens": 1293166946.0,
"step": 6350
},
{
"epoch": 1.5448647343724067,
"grad_norm": 0.32421875,
"learning_rate": 3.835823147728987e-05,
"loss": 2.106,
"mean_token_accuracy": 0.5535468808189035,
"num_input_tokens_seen": 3092176096,
"num_tokens": 1303381117.0,
"step": 6400
},
{
"epoch": 1.55693528675106,
"grad_norm": 0.248046875,
"learning_rate": 3.8169609174588804e-05,
"loss": 2.0989,
"mean_token_accuracy": 0.5537711648643017,
"num_input_tokens_seen": 3116219440,
"num_tokens": 1313584312.0,
"step": 6450
},
{
"epoch": 1.569005839129713,
"grad_norm": 0.2490234375,
"learning_rate": 3.7980986871887736e-05,
"loss": 2.1121,
"num_input_tokens_seen": 3140306656,
"step": 6500
},
{
"epoch": 1.569005839129713,
"eval_loss": 1.972907304763794,
"eval_mean_token_accuracy": 0.5776848248619922,
"eval_num_tokens": 1323681372.0,
"eval_runtime": 129.9521,
"eval_samples_per_second": 82.43,
"eval_steps_per_second": 20.608,
"num_input_tokens_seen": 3140306656,
"step": 6500
},
{
"epoch": 1.5810763915083665,
"grad_norm": 0.255859375,
"learning_rate": 3.779236456918666e-05,
"loss": 2.1022,
"mean_token_accuracy": 0.5528610655851662,
"num_input_tokens_seen": 3164451568,
"num_tokens": 1333870621.0,
"step": 6550
},
{
"epoch": 1.5931469438870196,
"grad_norm": 0.2578125,
"learning_rate": 3.760374226648559e-05,
"loss": 2.0995,
"mean_token_accuracy": 0.5544479803740978,
"num_input_tokens_seen": 3188556560,
"num_tokens": 1344040900.0,
"step": 6600
},
{
"epoch": 1.605217496265673,
"grad_norm": 0.263671875,
"learning_rate": 3.741511996378452e-05,
"loss": 2.1007,
"mean_token_accuracy": 0.5533806948363781,
"num_input_tokens_seen": 3212749536,
"num_tokens": 1354214136.0,
"step": 6650
},
{
"epoch": 1.6172880486443262,
"grad_norm": 0.314453125,
"learning_rate": 3.722649766108345e-05,
"loss": 2.1034,
"mean_token_accuracy": 0.5535299601778388,
"num_input_tokens_seen": 3236971856,
"num_tokens": 1364392553.0,
"step": 6700
},
{
"epoch": 1.6293586010229792,
"grad_norm": 0.255859375,
"learning_rate": 3.7037875358382376e-05,
"loss": 2.1039,
"mean_token_accuracy": 0.5538195591047406,
"num_input_tokens_seen": 3261221664,
"num_tokens": 1374597748.0,
"step": 6750
},
{
"epoch": 1.6414291534016325,
"grad_norm": 0.2431640625,
"learning_rate": 3.68492530556813e-05,
"loss": 2.0999,
"mean_token_accuracy": 0.5533070769160986,
"num_input_tokens_seen": 3285325344,
"num_tokens": 1384799126.0,
"step": 6800
},
{
"epoch": 1.6534997057802858,
"grad_norm": 0.294921875,
"learning_rate": 3.666063075298023e-05,
"loss": 2.0969,
"mean_token_accuracy": 0.5540298366174102,
"num_input_tokens_seen": 3309447808,
"num_tokens": 1394991541.0,
"step": 6850
},
{
"epoch": 1.6655702581589389,
"grad_norm": 0.28125,
"learning_rate": 3.647200845027916e-05,
"loss": 2.0982,
"mean_token_accuracy": 0.5548456938192249,
"num_input_tokens_seen": 3333810384,
"num_tokens": 1405193552.0,
"step": 6900
},
{
"epoch": 1.6776408105375922,
"grad_norm": 0.27734375,
"learning_rate": 3.628338614757809e-05,
"loss": 2.0998,
"mean_token_accuracy": 0.5537503241375089,
"num_input_tokens_seen": 3357917184,
"num_tokens": 1415265616.0,
"step": 6950
},
{
"epoch": 1.6897113629162455,
"grad_norm": 0.236328125,
"learning_rate": 3.609476384487702e-05,
"loss": 2.0829,
"num_input_tokens_seen": 3382174368,
"step": 7000
},
{
"epoch": 1.6897113629162455,
"eval_loss": 1.971500039100647,
"eval_mean_token_accuracy": 0.5780662869320956,
"eval_num_tokens": 1425356084.0,
"eval_runtime": 130.448,
"eval_samples_per_second": 82.117,
"eval_steps_per_second": 20.529,
"num_input_tokens_seen": 3382174368,
"step": 7000
},
{
"epoch": 1.7017819152948985,
"grad_norm": 0.2421875,
"learning_rate": 3.590614154217595e-05,
"loss": 2.0955,
"mean_token_accuracy": 0.55618498865515,
"num_input_tokens_seen": 3406371872,
"num_tokens": 1435550428.0,
"step": 7050
},
{
"epoch": 1.713852467673552,
"grad_norm": 0.275390625,
"learning_rate": 3.571751923947488e-05,
"loss": 2.0919,
"mean_token_accuracy": 0.5553148340806365,
"num_input_tokens_seen": 3430521536,
"num_tokens": 1445655664.0,
"step": 7100
},
{
"epoch": 1.725923020052205,
"grad_norm": 0.28125,
"learning_rate": 3.5528896936773805e-05,
"loss": 2.0906,
"mean_token_accuracy": 0.5547146466746926,
"num_input_tokens_seen": 3454573728,
"num_tokens": 1455866347.0,
"step": 7150
},
{
"epoch": 1.7379935724308584,
"grad_norm": 0.265625,
"learning_rate": 3.534027463407274e-05,
"loss": 2.098,
"mean_token_accuracy": 0.5544127273187042,
"num_input_tokens_seen": 3478641648,
"num_tokens": 1465996080.0,
"step": 7200
},
{
"epoch": 1.7500641248095117,
"grad_norm": 0.275390625,
"learning_rate": 3.515165233137166e-05,
"loss": 2.1039,
"mean_token_accuracy": 0.5536015385761857,
"num_input_tokens_seen": 3502632176,
"num_tokens": 1476133852.0,
"step": 7250
},
{
"epoch": 1.7621346771881647,
"grad_norm": 0.2470703125,
"learning_rate": 3.496303002867059e-05,
"loss": 2.0974,
"mean_token_accuracy": 0.5540728243440389,
"num_input_tokens_seen": 3526775024,
"num_tokens": 1486346988.0,
"step": 7300
},
{
"epoch": 1.774205229566818,
"grad_norm": 0.24609375,
"learning_rate": 3.477440772596952e-05,
"loss": 2.0986,
"mean_token_accuracy": 0.5539619905874134,
"num_input_tokens_seen": 3551057232,
"num_tokens": 1496540907.0,
"step": 7350
},
{
"epoch": 1.7862757819454713,
"grad_norm": 0.2734375,
"learning_rate": 3.4585785423268445e-05,
"loss": 2.097,
"mean_token_accuracy": 0.5543709811195732,
"num_input_tokens_seen": 3575145120,
"num_tokens": 1506733776.0,
"step": 7400
},
{
"epoch": 1.7983463343241244,
"grad_norm": 0.263671875,
"learning_rate": 3.4397163120567377e-05,
"loss": 2.1054,
"mean_token_accuracy": 0.5529748990386725,
"num_input_tokens_seen": 3599324048,
"num_tokens": 1516881364.0,
"step": 7450
},
{
"epoch": 1.810416886702778,
"grad_norm": 0.26171875,
"learning_rate": 3.420854081786631e-05,
"loss": 2.107,
"num_input_tokens_seen": 3623572960,
"step": 7500
},
{
"epoch": 1.810416886702778,
"eval_loss": 1.9704335927963257,
"eval_mean_token_accuracy": 0.5782234972207915,
"eval_num_tokens": 1527213100.0,
"eval_runtime": 130.0921,
"eval_samples_per_second": 82.342,
"eval_steps_per_second": 20.585,
"num_input_tokens_seen": 3623572960,
"step": 7500
},
{
"epoch": 1.822487439081431,
"grad_norm": 0.236328125,
"learning_rate": 3.4019918515165234e-05,
"loss": 2.0859,
"mean_token_accuracy": 0.5543704128451645,
"num_input_tokens_seen": 3647842624,
"num_tokens": 1537348409.0,
"step": 7550
},
{
"epoch": 1.8345579914600842,
"grad_norm": 0.275390625,
"learning_rate": 3.3831296212464166e-05,
"loss": 2.0964,
"mean_token_accuracy": 0.5545977150648832,
"num_input_tokens_seen": 3671838176,
"num_tokens": 1547405691.0,
"step": 7600
},
{
"epoch": 1.8466285438387375,
"grad_norm": 0.271484375,
"learning_rate": 3.364267390976309e-05,
"loss": 2.1049,
"mean_token_accuracy": 0.5531406961008907,
"num_input_tokens_seen": 3696051376,
"num_tokens": 1557614207.0,
"step": 7650
},
{
"epoch": 1.8586990962173906,
"grad_norm": 0.26171875,
"learning_rate": 3.345405160706202e-05,
"loss": 2.0976,
"mean_token_accuracy": 0.5547824421525002,
"num_input_tokens_seen": 3720242912,
"num_tokens": 1567813391.0,
"step": 7700
},
{
"epoch": 1.8707696485960439,
"grad_norm": 0.26171875,
"learning_rate": 3.326542930436095e-05,
"loss": 2.0941,
"mean_token_accuracy": 0.5552040388435125,
"num_input_tokens_seen": 3744303744,
"num_tokens": 1577952759.0,
"step": 7750
},
{
"epoch": 1.8828402009746972,
"grad_norm": 0.2734375,
"learning_rate": 3.3076807001659874e-05,
"loss": 2.107,
"mean_token_accuracy": 0.5526882111281156,
"num_input_tokens_seen": 3768596640,
"num_tokens": 1588260331.0,
"step": 7800
},
{
"epoch": 1.8949107533533502,
"grad_norm": 0.2890625,
"learning_rate": 3.2888184698958806e-05,
"loss": 2.0974,
"mean_token_accuracy": 0.5548465251550079,
"num_input_tokens_seen": 3792634928,
"num_tokens": 1598444722.0,
"step": 7850
},
{
"epoch": 1.9069813057320035,
"grad_norm": 0.251953125,
"learning_rate": 3.269956239625773e-05,
"loss": 2.0957,
"mean_token_accuracy": 0.554438531845808,
"num_input_tokens_seen": 3816849872,
"num_tokens": 1608557410.0,
"step": 7900
},
{
"epoch": 1.9190518581106568,
"grad_norm": 0.2490234375,
"learning_rate": 3.251094009355666e-05,
"loss": 2.108,
"mean_token_accuracy": 0.5528679783269763,
"num_input_tokens_seen": 3840903600,
"num_tokens": 1618721583.0,
"step": 7950
},
{
"epoch": 1.9311224104893099,
"grad_norm": 0.263671875,
"learning_rate": 3.2322317790855595e-05,
"loss": 2.0908,
"num_input_tokens_seen": 3864935104,
"step": 8000
},
{
"epoch": 1.9311224104893099,
"eval_loss": 1.9698705673217773,
"eval_mean_token_accuracy": 0.5783014823866923,
"eval_num_tokens": 1628824365.0,
"eval_runtime": 130.0192,
"eval_samples_per_second": 82.388,
"eval_steps_per_second": 20.597,
"num_input_tokens_seen": 3864935104,
"step": 8000
},
{
"epoch": 1.9431929628679634,
"grad_norm": 0.26171875,
"learning_rate": 3.213369548815452e-05,
"loss": 2.096,
"mean_token_accuracy": 0.5548218312300741,
"num_input_tokens_seen": 3889193776,
"num_tokens": 1638986755.0,
"step": 8050
},
{
"epoch": 1.9552635152466165,
"grad_norm": 0.28125,
"learning_rate": 3.194507318545345e-05,
"loss": 2.0821,
"mean_token_accuracy": 0.5565256755426526,
"num_input_tokens_seen": 3913273664,
"num_tokens": 1649209062.0,
"step": 8100
},
{
"epoch": 1.9673340676252697,
"grad_norm": 0.267578125,
"learning_rate": 3.175645088275238e-05,
"loss": 2.1026,
"mean_token_accuracy": 0.5532021636888385,
"num_input_tokens_seen": 3937597616,
"num_tokens": 1659435713.0,
"step": 8150
},
{
"epoch": 1.979404620003923,
"grad_norm": 0.255859375,
"learning_rate": 3.156782858005131e-05,
"loss": 2.0984,
"mean_token_accuracy": 0.554893646761775,
"num_input_tokens_seen": 3961803792,
"num_tokens": 1669582457.0,
"step": 8200
},
{
"epoch": 1.991475172382576,
"grad_norm": 0.25,
"learning_rate": 3.1379206277350235e-05,
"loss": 2.0818,
"mean_token_accuracy": 0.5563116483017803,
"num_input_tokens_seen": 3985940144,
"num_tokens": 1679783022.0,
"step": 8250
},
{
"epoch": 2.003379754666023,
"grad_norm": 0.279296875,
"learning_rate": 3.119058397464916e-05,
"loss": 2.0986,
"mean_token_accuracy": 0.5548089014411124,
"num_input_tokens_seen": 4009775217,
"num_tokens": 1689934301.0,
"step": 8300
},
{
"epoch": 2.0154503070446763,
"grad_norm": 0.2431640625,
"learning_rate": 3.100196167194809e-05,
"loss": 2.101,
"mean_token_accuracy": 0.554502502605319,
"num_input_tokens_seen": 4033881233,
"num_tokens": 1700106136.0,
"step": 8350
},
{
"epoch": 2.0275208594233294,
"grad_norm": 0.27734375,
"learning_rate": 3.081333936924702e-05,
"loss": 2.0952,
"mean_token_accuracy": 0.5547482476383447,
"num_input_tokens_seen": 4058061905,
"num_tokens": 1710288179.0,
"step": 8400
},
{
"epoch": 2.0395914118019824,
"grad_norm": 0.251953125,
"learning_rate": 3.062471706654595e-05,
"loss": 2.0997,
"mean_token_accuracy": 0.5538438270241022,
"num_input_tokens_seen": 4082118001,
"num_tokens": 1720402394.0,
"step": 8450
},
{
"epoch": 2.051661964180636,
"grad_norm": 0.27734375,
"learning_rate": 3.043609476384488e-05,
"loss": 2.0904,
"num_input_tokens_seen": 4106175169,
"step": 8500
},
{
"epoch": 2.051661964180636,
"eval_loss": 1.969247817993164,
"eval_mean_token_accuracy": 0.5783566591497497,
"eval_num_tokens": 1730551722.0,
"eval_runtime": 130.525,
"eval_samples_per_second": 82.069,
"eval_steps_per_second": 20.517,
"num_input_tokens_seen": 4106175169,
"step": 8500
},
{
"epoch": 2.063732516559289,
"grad_norm": 0.244140625,
"learning_rate": 3.0247472461143806e-05,
"loss": 2.1045,
"mean_token_accuracy": 0.5544156692735851,
"num_input_tokens_seen": 4130212065,
"num_tokens": 1740781983.0,
"step": 8550
},
{
"epoch": 2.075803068937942,
"grad_norm": 0.28125,
"learning_rate": 3.0058850158442735e-05,
"loss": 2.0968,
"mean_token_accuracy": 0.5541778185963631,
"num_input_tokens_seen": 4154342289,
"num_tokens": 1750886868.0,
"step": 8600
},
{
"epoch": 2.0878736213165956,
"grad_norm": 0.2392578125,
"learning_rate": 2.9870227855741667e-05,
"loss": 2.0825,
"mean_token_accuracy": 0.5571553486213088,
"num_input_tokens_seen": 4178622977,
"num_tokens": 1761024406.0,
"step": 8650
},
{
"epoch": 2.0999441736952487,
"grad_norm": 0.2490234375,
"learning_rate": 2.9681605553040592e-05,
"loss": 2.0936,
"mean_token_accuracy": 0.554418184608221,
"num_input_tokens_seen": 4202833969,
"num_tokens": 1771241652.0,
"step": 8700
},
{
"epoch": 2.112014726073902,
"grad_norm": 0.244140625,
"learning_rate": 2.9492983250339524e-05,
"loss": 2.0959,
"mean_token_accuracy": 0.5550215977802873,
"num_input_tokens_seen": 4226915233,
"num_tokens": 1781349215.0,
"step": 8750
},
{
"epoch": 2.1240852784525552,
"grad_norm": 0.267578125,
"learning_rate": 2.930436094763845e-05,
"loss": 2.1008,
"mean_token_accuracy": 0.553907332457602,
"num_input_tokens_seen": 4251137105,
"num_tokens": 1791502343.0,
"step": 8800
},
{
"epoch": 2.1361558308312083,
"grad_norm": 0.263671875,
"learning_rate": 2.9115738644937378e-05,
"loss": 2.0903,
"mean_token_accuracy": 0.5557647632434964,
"num_input_tokens_seen": 4275224433,
"num_tokens": 1801620139.0,
"step": 8850
},
{
"epoch": 2.148226383209862,
"grad_norm": 0.2275390625,
"learning_rate": 2.892711634223631e-05,
"loss": 2.105,
"mean_token_accuracy": 0.5533201249688864,
"num_input_tokens_seen": 4299490545,
"num_tokens": 1811829692.0,
"step": 8900
},
{
"epoch": 2.160296935588515,
"grad_norm": 0.251953125,
"learning_rate": 2.8738494039535235e-05,
"loss": 2.0771,
"mean_token_accuracy": 0.5572306806966663,
"num_input_tokens_seen": 4323603905,
"num_tokens": 1822011165.0,
"step": 8950
},
{
"epoch": 2.172367487967168,
"grad_norm": 0.248046875,
"learning_rate": 2.8549871736834167e-05,
"loss": 2.0766,
"num_input_tokens_seen": 4347894913,
"step": 9000
},
{
"epoch": 2.172367487967168,
"eval_loss": 1.968759536743164,
"eval_mean_token_accuracy": 0.5784085558767724,
"eval_num_tokens": 1832187809.0,
"eval_runtime": 130.1161,
"eval_samples_per_second": 82.326,
"eval_steps_per_second": 20.582,
"num_input_tokens_seen": 4347894913,
"step": 9000
},
{
"epoch": 2.1844380403458215,
"grad_norm": 0.2734375,
"learning_rate": 2.8361249434133093e-05,
"loss": 2.0951,
"mean_token_accuracy": 0.5561914920061827,
"num_input_tokens_seen": 4372093409,
"num_tokens": 1842403609.0,
"step": 9050
},
{
"epoch": 2.1965085927244745,
"grad_norm": 0.275390625,
"learning_rate": 2.817262713143202e-05,
"loss": 2.0915,
"mean_token_accuracy": 0.5557398213073611,
"num_input_tokens_seen": 4396072241,
"num_tokens": 1852448709.0,
"step": 9100
},
{
"epoch": 2.2085791451031276,
"grad_norm": 0.2392578125,
"learning_rate": 2.7984004828730953e-05,
"loss": 2.0965,
"mean_token_accuracy": 0.5542204293608666,
"num_input_tokens_seen": 4420308385,
"num_tokens": 1862702843.0,
"step": 9150
},
{
"epoch": 2.220649697481781,
"grad_norm": 0.2578125,
"learning_rate": 2.779538252602988e-05,
"loss": 2.0873,
"mean_token_accuracy": 0.555770318582654,
"num_input_tokens_seen": 4444408305,
"num_tokens": 1872813360.0,
"step": 9200
},
{
"epoch": 2.232720249860434,
"grad_norm": 0.248046875,
"learning_rate": 2.760676022332881e-05,
"loss": 2.0984,
"mean_token_accuracy": 0.5543450859189033,
"num_input_tokens_seen": 4468586049,
"num_tokens": 1883034727.0,
"step": 9250
},
{
"epoch": 2.2447908022390877,
"grad_norm": 0.26171875,
"learning_rate": 2.7418137920627736e-05,
"loss": 2.0913,
"mean_token_accuracy": 0.5554680547490716,
"num_input_tokens_seen": 4492717489,
"num_tokens": 1893259660.0,
"step": 9300
},
{
"epoch": 2.2568613546177407,
"grad_norm": 0.3046875,
"learning_rate": 2.7229515617926664e-05,
"loss": 2.0976,
"mean_token_accuracy": 0.5547211924567819,
"num_input_tokens_seen": 4516832449,
"num_tokens": 1903351453.0,
"step": 9350
},
{
"epoch": 2.268931906996394,
"grad_norm": 0.240234375,
"learning_rate": 2.7040893315225596e-05,
"loss": 2.095,
"mean_token_accuracy": 0.5545766900852322,
"num_input_tokens_seen": 4540881473,
"num_tokens": 1913462038.0,
"step": 9400
},
{
"epoch": 2.2810024593750473,
"grad_norm": 0.2412109375,
"learning_rate": 2.685227101252452e-05,
"loss": 2.1047,
"mean_token_accuracy": 0.5530835852399468,
"num_input_tokens_seen": 4565196353,
"num_tokens": 1923836730.0,
"step": 9450
},
{
"epoch": 2.2930730117537004,
"grad_norm": 0.25390625,
"learning_rate": 2.6663648709823454e-05,
"loss": 2.1036,
"num_input_tokens_seen": 4589393665,
"step": 9500
},
{
"epoch": 2.2930730117537004,
"eval_loss": 1.9684821367263794,
"eval_mean_token_accuracy": 0.5784456487953707,
"eval_num_tokens": 1933999749.0,
"eval_runtime": 130.3401,
"eval_samples_per_second": 82.185,
"eval_steps_per_second": 20.546,
"num_input_tokens_seen": 4589393665,
"step": 9500
},
{
"epoch": 2.3051435641323534,
"grad_norm": 0.2373046875,
"learning_rate": 2.647502640712238e-05,
"loss": 2.1091,
"mean_token_accuracy": 0.5529859235696495,
"num_input_tokens_seen": 4613609921,
"num_tokens": 1944210855.0,
"step": 9550
},
{
"epoch": 2.317214116511007,
"grad_norm": 0.2490234375,
"learning_rate": 2.6286404104421307e-05,
"loss": 2.0976,
"mean_token_accuracy": 0.554888856895268,
"num_input_tokens_seen": 4637474321,
"num_tokens": 1954258079.0,
"step": 9600
},
{
"epoch": 2.32928466888966,
"grad_norm": 0.25,
"learning_rate": 2.609778180172024e-05,
"loss": 2.1061,
"mean_token_accuracy": 0.5531089297309518,
"num_input_tokens_seen": 4661687841,
"num_tokens": 1964447306.0,
"step": 9650
},
{
"epoch": 2.341355221268313,
"grad_norm": 0.283203125,
"learning_rate": 2.5909159499019165e-05,
"loss": 2.0972,
"mean_token_accuracy": 0.5547297456115484,
"num_input_tokens_seen": 4685886657,
"num_tokens": 1974672380.0,
"step": 9700
},
{
"epoch": 2.3534257736469666,
"grad_norm": 0.275390625,
"learning_rate": 2.5720537196318097e-05,
"loss": 2.0874,
"mean_token_accuracy": 0.556226581223309,
"num_input_tokens_seen": 4710004273,
"num_tokens": 1984832310.0,
"step": 9750
},
{
"epoch": 2.3654963260256197,
"grad_norm": 0.2392578125,
"learning_rate": 2.5531914893617022e-05,
"loss": 2.096,
"mean_token_accuracy": 0.5547980547696352,
"num_input_tokens_seen": 4734271009,
"num_tokens": 1995090784.0,
"step": 9800
},
{
"epoch": 2.377566878404273,
"grad_norm": 0.28515625,
"learning_rate": 2.534329259091595e-05,
"loss": 2.0871,
"mean_token_accuracy": 0.5552258058264852,
"num_input_tokens_seen": 4758291265,
"num_tokens": 2005240317.0,
"step": 9850
},
{
"epoch": 2.3896374307829262,
"grad_norm": 0.2470703125,
"learning_rate": 2.5154670288214883e-05,
"loss": 2.0865,
"mean_token_accuracy": 0.5557247434183955,
"num_input_tokens_seen": 4782472097,
"num_tokens": 2015507708.0,
"step": 9900
},
{
"epoch": 2.4017079831615793,
"grad_norm": 0.2421875,
"learning_rate": 2.4966047985513808e-05,
"loss": 2.1074,
"mean_token_accuracy": 0.5527091028168798,
"num_input_tokens_seen": 4806608113,
"num_tokens": 2025820931.0,
"step": 9950
},
{
"epoch": 2.413778535540233,
"grad_norm": 0.2421875,
"learning_rate": 2.477742568281274e-05,
"loss": 2.1001,
"num_input_tokens_seen": 4830743425,
"step": 10000
},
{
"epoch": 2.413778535540233,
"eval_loss": 1.9683291912078857,
"eval_mean_token_accuracy": 0.5784874623550952,
"eval_num_tokens": 2035904188.0,
"eval_runtime": 130.7093,
"eval_samples_per_second": 81.953,
"eval_steps_per_second": 20.488,
"num_input_tokens_seen": 4830743425,
"step": 10000
},
{
"epoch": 2.425849087918886,
"grad_norm": 0.263671875,
"learning_rate": 2.4588803380111665e-05,
"loss": 2.102,
"mean_token_accuracy": 0.5541372266598046,
"num_input_tokens_seen": 4855099809,
"num_tokens": 2046084283.0,
"step": 10050
},
{
"epoch": 2.437919640297539,
"grad_norm": 0.26171875,
"learning_rate": 2.4400181077410594e-05,
"loss": 2.0991,
"mean_token_accuracy": 0.5542299181595445,
"num_input_tokens_seen": 4879214129,
"num_tokens": 2056326330.0,
"step": 10100
},
{
"epoch": 2.4499901926761924,
"grad_norm": 0.25390625,
"learning_rate": 2.4211558774709522e-05,
"loss": 2.0834,
"mean_token_accuracy": 0.5564426334574819,
"num_input_tokens_seen": 4903399553,
"num_tokens": 2066490013.0,
"step": 10150
},
{
"epoch": 2.4620607450548455,
"grad_norm": 0.263671875,
"learning_rate": 2.402293647200845e-05,
"loss": 2.098,
"mean_token_accuracy": 0.5545364746823906,
"num_input_tokens_seen": 4927492609,
"num_tokens": 2076526539.0,
"step": 10200
},
{
"epoch": 2.474131297433499,
"grad_norm": 0.23828125,
"learning_rate": 2.383431416930738e-05,
"loss": 2.0885,
"mean_token_accuracy": 0.555601441822946,
"num_input_tokens_seen": 4951732929,
"num_tokens": 2086768431.0,
"step": 10250
},
{
"epoch": 2.486201849812152,
"grad_norm": 0.255859375,
"learning_rate": 2.3645691866606308e-05,
"loss": 2.0909,
"mean_token_accuracy": 0.5558399046584964,
"num_input_tokens_seen": 4975948097,
"num_tokens": 2096961030.0,
"step": 10300
},
{
"epoch": 2.498272402190805,
"grad_norm": 0.326171875,
"learning_rate": 2.3457069563905237e-05,
"loss": 2.0906,
"mean_token_accuracy": 0.5556136939302087,
"num_input_tokens_seen": 5000143905,
"num_tokens": 2107303887.0,
"step": 10350
},
{
"epoch": 2.5103429545694587,
"grad_norm": 0.267578125,
"learning_rate": 2.3268447261204166e-05,
"loss": 2.0976,
"mean_token_accuracy": 0.5541230865567922,
"num_input_tokens_seen": 5024212113,
"num_tokens": 2117576166.0,
"step": 10400
},
{
"epoch": 2.5224135069481117,
"grad_norm": 0.29296875,
"learning_rate": 2.3079824958503094e-05,
"loss": 2.0935,
"mean_token_accuracy": 0.5555445018038153,
"num_input_tokens_seen": 5048313681,
"num_tokens": 2127734721.0,
"step": 10450
},
{
"epoch": 2.534484059326765,
"grad_norm": 0.2421875,
"learning_rate": 2.2891202655802023e-05,
"loss": 2.0982,
"num_input_tokens_seen": 5072508817,
"step": 10500
},
{
"epoch": 2.534484059326765,
"eval_loss": 1.9683516025543213,
"eval_mean_token_accuracy": 0.5784807712440619,
"eval_num_tokens": 2137987548.0,
"eval_runtime": 130.4075,
"eval_samples_per_second": 82.143,
"eval_steps_per_second": 20.536,
"num_input_tokens_seen": 5072508817,
"step": 10500
},
{
"epoch": 2.5465546117054183,
"grad_norm": 0.267578125,
"learning_rate": 2.270258035310095e-05,
"loss": 2.0924,
"mean_token_accuracy": 0.5551841219887137,
"num_input_tokens_seen": 5096586577,
"num_tokens": 2148155987.0,
"step": 10550
},
{
"epoch": 2.5586251640840714,
"grad_norm": 0.2734375,
"learning_rate": 2.251395805039988e-05,
"loss": 2.0982,
"mean_token_accuracy": 0.5541262343525887,
"num_input_tokens_seen": 5120875729,
"num_tokens": 2158352820.0,
"step": 10600
},
{
"epoch": 2.5706957164627244,
"grad_norm": 0.251953125,
"learning_rate": 2.232533574769881e-05,
"loss": 2.0908,
"mean_token_accuracy": 0.5560182608664036,
"num_input_tokens_seen": 5145050353,
"num_tokens": 2168407807.0,
"step": 10650
},
{
"epoch": 2.582766268841378,
"grad_norm": 0.2734375,
"learning_rate": 2.2136713444997737e-05,
"loss": 2.0958,
"mean_token_accuracy": 0.5551287305355072,
"num_input_tokens_seen": 5169266849,
"num_tokens": 2178592858.0,
"step": 10700
},
{
"epoch": 2.594836821220031,
"grad_norm": 0.2451171875,
"learning_rate": 2.1948091142296666e-05,
"loss": 2.0904,
"mean_token_accuracy": 0.5559819753468037,
"num_input_tokens_seen": 5193472705,
"num_tokens": 2188792925.0,
"step": 10750
},
{
"epoch": 2.606907373598684,
"grad_norm": 0.2578125,
"learning_rate": 2.1759468839595595e-05,
"loss": 2.1003,
"mean_token_accuracy": 0.5538398388028145,
"num_input_tokens_seen": 5217541665,
"num_tokens": 2199063266.0,
"step": 10800
},
{
"epoch": 2.6189779259773376,
"grad_norm": 0.2578125,
"learning_rate": 2.1570846536894523e-05,
"loss": 2.0996,
"mean_token_accuracy": 0.5539121518284083,
"num_input_tokens_seen": 5241669153,
"num_tokens": 2209236507.0,
"step": 10850
},
{
"epoch": 2.6310484783559906,
"grad_norm": 0.2412109375,
"learning_rate": 2.1382224234193452e-05,
"loss": 2.0898,
"mean_token_accuracy": 0.5560731103271246,
"num_input_tokens_seen": 5265851553,
"num_tokens": 2219375503.0,
"step": 10900
},
{
"epoch": 2.643119030734644,
"grad_norm": 0.255859375,
"learning_rate": 2.119360193149238e-05,
"loss": 2.0887,
"mean_token_accuracy": 0.5559511515125632,
"num_input_tokens_seen": 5290120305,
"num_tokens": 2229631896.0,
"step": 10950
},
{
"epoch": 2.6551895831132972,
"grad_norm": 0.267578125,
"learning_rate": 2.100497962879131e-05,
"loss": 2.0941,
"num_input_tokens_seen": 5314253297,
"step": 11000
},
{
"epoch": 2.6551895831132972,
"eval_loss": 1.9683243036270142,
"eval_mean_token_accuracy": 0.5784822298106727,
"eval_num_tokens": 2239778564.0,
"eval_runtime": 131.1903,
"eval_samples_per_second": 81.652,
"eval_steps_per_second": 20.413,
"num_input_tokens_seen": 5314253297,
"step": 11000
},
{
"epoch": 2.6672601354919503,
"grad_norm": 0.275390625,
"learning_rate": 2.0816357326090238e-05,
"loss": 2.0981,
"mean_token_accuracy": 0.5546219968609511,
"num_input_tokens_seen": 5338466017,
"num_tokens": 2249988177.0,
"step": 11050
},
{
"epoch": 2.679330687870604,
"grad_norm": 0.388671875,
"learning_rate": 2.0627735023389166e-05,
"loss": 2.0921,
"mean_token_accuracy": 0.5557056156918406,
"num_input_tokens_seen": 5362616753,
"num_tokens": 2260143864.0,
"step": 11100
},
{
"epoch": 2.691401240249257,
"grad_norm": 0.296875,
"learning_rate": 2.0439112720688095e-05,
"loss": 2.0957,
"mean_token_accuracy": 0.555001782849431,
"num_input_tokens_seen": 5386751937,
"num_tokens": 2270269015.0,
"step": 11150
},
{
"epoch": 2.7034717926279104,
"grad_norm": 0.2734375,
"learning_rate": 2.0250490417987024e-05,
"loss": 2.0879,
"mean_token_accuracy": 0.5558207688108087,
"num_input_tokens_seen": 5410821777,
"num_tokens": 2280359409.0,
"step": 11200
},
{
"epoch": 2.7155423450065634,
"grad_norm": 0.255859375,
"learning_rate": 2.0061868115285952e-05,
"loss": 2.0862,
"mean_token_accuracy": 0.5561293217167258,
"num_input_tokens_seen": 5435109553,
"num_tokens": 2290676732.0,
"step": 11250
},
{
"epoch": 2.7276128973852165,
"grad_norm": 0.2890625,
"learning_rate": 1.987324581258488e-05,
"loss": 2.096,
"mean_token_accuracy": 0.5545283930376173,
"num_input_tokens_seen": 5459182481,
"num_tokens": 2300830078.0,
"step": 11300
},
{
"epoch": 2.73968344976387,
"grad_norm": 0.271484375,
"learning_rate": 1.968462350988381e-05,
"loss": 2.0944,
"mean_token_accuracy": 0.5546667322888971,
"num_input_tokens_seen": 5483343233,
"num_tokens": 2310996379.0,
"step": 11350
},
{
"epoch": 2.751754002142523,
"grad_norm": 0.2373046875,
"learning_rate": 1.9496001207182738e-05,
"loss": 2.0962,
"mean_token_accuracy": 0.5551789667457342,
"num_input_tokens_seen": 5507547265,
"num_tokens": 2321208987.0,
"step": 11400
},
{
"epoch": 2.763824554521176,
"grad_norm": 0.25390625,
"learning_rate": 1.9307378904481667e-05,
"loss": 2.0824,
"mean_token_accuracy": 0.5572770998999477,
"num_input_tokens_seen": 5531617505,
"num_tokens": 2331304919.0,
"step": 11450
},
{
"epoch": 2.7758951068998297,
"grad_norm": 0.2578125,
"learning_rate": 1.9118756601780595e-05,
"loss": 2.1095,
"num_input_tokens_seen": 5555689025,
"step": 11500
},
{
"epoch": 2.7758951068998297,
"eval_loss": 1.9681649208068848,
"eval_mean_token_accuracy": 0.5785138376329798,
"eval_num_tokens": 2341489447.0,
"eval_runtime": 130.3192,
"eval_samples_per_second": 82.198,
"eval_steps_per_second": 20.55,
"num_input_tokens_seen": 5555689025,
"step": 11500
},
{
"epoch": 2.7879656592784827,
"grad_norm": 0.275390625,
"learning_rate": 1.8930134299079524e-05,
"loss": 2.0913,
"mean_token_accuracy": 0.553754635732621,
"num_input_tokens_seen": 5579808081,
"num_tokens": 2351725815.0,
"step": 11550
},
{
"epoch": 2.800036211657136,
"grad_norm": 0.27734375,
"learning_rate": 1.8741511996378453e-05,
"loss": 2.0961,
"mean_token_accuracy": 0.5546263293549418,
"num_input_tokens_seen": 5604043089,
"num_tokens": 2362015471.0,
"step": 11600
},
{
"epoch": 2.8121067640357893,
"grad_norm": 0.267578125,
"learning_rate": 1.855288969367738e-05,
"loss": 2.0887,
"mean_token_accuracy": 0.555464554913342,
"num_input_tokens_seen": 5628213761,
"num_tokens": 2372168279.0,
"step": 11650
},
{
"epoch": 2.8241773164144424,
"grad_norm": 0.263671875,
"learning_rate": 1.836426739097631e-05,
"loss": 2.1049,
"mean_token_accuracy": 0.5529934700578452,
"num_input_tokens_seen": 5652297233,
"num_tokens": 2382280922.0,
"step": 11700
},
{
"epoch": 2.8362478687930954,
"grad_norm": 0.27734375,
"learning_rate": 1.817564508827524e-05,
"loss": 2.1025,
"mean_token_accuracy": 0.5528503654524684,
"num_input_tokens_seen": 5676338817,
"num_tokens": 2392486600.0,
"step": 11750
},
{
"epoch": 2.848318421171749,
"grad_norm": 0.25,
"learning_rate": 1.7987022785574167e-05,
"loss": 2.1018,
"mean_token_accuracy": 0.5537810071185231,
"num_input_tokens_seen": 5700543889,
"num_tokens": 2402663993.0,
"step": 11800
},
{
"epoch": 2.860388973550402,
"grad_norm": 0.267578125,
"learning_rate": 1.7798400482873096e-05,
"loss": 2.0891,
"mean_token_accuracy": 0.5560761171206832,
"num_input_tokens_seen": 5724764289,
"num_tokens": 2412816351.0,
"step": 11850
},
{
"epoch": 2.872459525929055,
"grad_norm": 0.24609375,
"learning_rate": 1.7609778180172024e-05,
"loss": 2.0974,
"mean_token_accuracy": 0.5541906878352165,
"num_input_tokens_seen": 5749009809,
"num_tokens": 2423083966.0,
"step": 11900
},
{
"epoch": 2.8845300783077086,
"grad_norm": 0.265625,
"learning_rate": 1.7421155877470953e-05,
"loss": 2.0996,
"mean_token_accuracy": 0.5539322036504746,
"num_input_tokens_seen": 5773128241,
"num_tokens": 2433212460.0,
"step": 11950
},
{
"epoch": 2.8966006306863616,
"grad_norm": 0.26171875,
"learning_rate": 1.723253357476988e-05,
"loss": 2.0986,
"num_input_tokens_seen": 5797304337,
"step": 12000
},
{
"epoch": 2.8966006306863616,
"eval_loss": 1.9681628942489624,
"eval_mean_token_accuracy": 0.5785647708146406,
"eval_num_tokens": 2443385171.0,
"eval_runtime": 130.5866,
"eval_samples_per_second": 82.03,
"eval_steps_per_second": 20.507,
"num_input_tokens_seen": 5797304337,
"step": 12000
},
{
"epoch": 2.908671183065015,
"grad_norm": 0.2578125,
"learning_rate": 1.704391127206881e-05,
"loss": 2.0958,
"mean_token_accuracy": 0.5546299646422267,
"num_input_tokens_seen": 5821442385,
"num_tokens": 2453453845.0,
"step": 12050
},
{
"epoch": 2.920741735443668,
"grad_norm": 0.26171875,
"learning_rate": 1.685528896936774e-05,
"loss": 2.0926,
"mean_token_accuracy": 0.5549974143505096,
"num_input_tokens_seen": 5845686961,
"num_tokens": 2463776050.0,
"step": 12100
},
{
"epoch": 2.9328122878223217,
"grad_norm": 0.263671875,
"learning_rate": 1.6666666666666667e-05,
"loss": 2.1015,
"mean_token_accuracy": 0.5541527543962002,
"num_input_tokens_seen": 5869745137,
"num_tokens": 2473828195.0,
"step": 12150
},
{
"epoch": 2.944882840200975,
"grad_norm": 0.26953125,
"learning_rate": 1.6478044363965596e-05,
"loss": 2.1041,
"mean_token_accuracy": 0.5541104365140199,
"num_input_tokens_seen": 5893803025,
"num_tokens": 2483915340.0,
"step": 12200
},
{
"epoch": 2.956953392579628,
"grad_norm": 0.2333984375,
"learning_rate": 1.6289422061264525e-05,
"loss": 2.0922,
"mean_token_accuracy": 0.5555301706120371,
"num_input_tokens_seen": 5918068641,
"num_tokens": 2494208052.0,
"step": 12250
},
{
"epoch": 2.9690239449582814,
"grad_norm": 0.2490234375,
"learning_rate": 1.6100799758563453e-05,
"loss": 2.0938,
"mean_token_accuracy": 0.5548020200431347,
"num_input_tokens_seen": 5942257041,
"num_tokens": 2504393372.0,
"step": 12300
},
{
"epoch": 2.9810944973369344,
"grad_norm": 0.2890625,
"learning_rate": 1.5912177455862382e-05,
"loss": 2.0843,
"mean_token_accuracy": 0.5566597804427147,
"num_input_tokens_seen": 5966422081,
"num_tokens": 2514627798.0,
"step": 12350
},
{
"epoch": 2.9931650497155875,
"grad_norm": 0.2734375,
"learning_rate": 1.572355515316131e-05,
"loss": 2.0886,
"mean_token_accuracy": 0.5566082544624805,
"num_input_tokens_seen": 5990574321,
"num_tokens": 2524805007.0,
"step": 12400
},
{
"epoch": 3.005069631999034,
"grad_norm": 0.26171875,
"learning_rate": 1.553493285046024e-05,
"loss": 2.1001,
"mean_token_accuracy": 0.5549402527407246,
"num_input_tokens_seen": 6014380145,
"num_tokens": 2534738802.0,
"step": 12450
},
{
"epoch": 3.0171401843776877,
"grad_norm": 0.2314453125,
"learning_rate": 1.5346310547759168e-05,
"loss": 2.092,
"num_input_tokens_seen": 6038556753,
"step": 12500
},
{
"epoch": 3.0171401843776877,
"eval_loss": 1.9681233167648315,
"eval_mean_token_accuracy": 0.5784891846355349,
"eval_num_tokens": 2544886437.0,
"eval_runtime": 130.6689,
"eval_samples_per_second": 81.978,
"eval_steps_per_second": 20.495,
"num_input_tokens_seen": 6038556753,
"step": 12500
},
{
"epoch": 3.029210736756341,
"grad_norm": 0.25390625,
"learning_rate": 1.5157688245058096e-05,
"loss": 2.0925,
"mean_token_accuracy": 0.5550393326207995,
"num_input_tokens_seen": 6062857617,
"num_tokens": 2555112151.0,
"step": 12550
},
{
"epoch": 3.041281289134994,
"grad_norm": 0.38671875,
"learning_rate": 1.4969065942357025e-05,
"loss": 2.0957,
"mean_token_accuracy": 0.5551863227039575,
"num_input_tokens_seen": 6087077841,
"num_tokens": 2565388515.0,
"step": 12600
},
{
"epoch": 3.0533518415136474,
"grad_norm": 0.279296875,
"learning_rate": 1.4780443639655952e-05,
"loss": 2.0858,
"mean_token_accuracy": 0.5563259933143854,
"num_input_tokens_seen": 6111161617,
"num_tokens": 2575504513.0,
"step": 12650
},
{
"epoch": 3.0654223938923004,
"grad_norm": 0.25,
"learning_rate": 1.4591821336954884e-05,
"loss": 2.101,
"mean_token_accuracy": 0.5549140437319875,
"num_input_tokens_seen": 6135170369,
"num_tokens": 2585570498.0,
"step": 12700
},
{
"epoch": 3.077492946270954,
"grad_norm": 0.263671875,
"learning_rate": 1.4403199034253811e-05,
"loss": 2.0935,
"mean_token_accuracy": 0.5543564364686608,
"num_input_tokens_seen": 6159397985,
"num_tokens": 2595740107.0,
"step": 12750
},
{
"epoch": 3.089563498649607,
"grad_norm": 0.265625,
"learning_rate": 1.421457673155274e-05,
"loss": 2.0928,
"mean_token_accuracy": 0.5548016136884689,
"num_input_tokens_seen": 6183511137,
"num_tokens": 2605900153.0,
"step": 12800
},
{
"epoch": 3.10163405102826,
"grad_norm": 0.2890625,
"learning_rate": 1.4025954428851668e-05,
"loss": 2.0862,
"mean_token_accuracy": 0.5555924268066883,
"num_input_tokens_seen": 6207630993,
"num_tokens": 2616105592.0,
"step": 12850
},
{
"epoch": 3.1137046034069136,
"grad_norm": 0.248046875,
"learning_rate": 1.3837332126150595e-05,
"loss": 2.0938,
"mean_token_accuracy": 0.554584386125207,
"num_input_tokens_seen": 6231763217,
"num_tokens": 2626268256.0,
"step": 12900
},
{
"epoch": 3.1257751557855666,
"grad_norm": 0.251953125,
"learning_rate": 1.3648709823449527e-05,
"loss": 2.1042,
"mean_token_accuracy": 0.553115917481482,
"num_input_tokens_seen": 6255995041,
"num_tokens": 2636461653.0,
"step": 12950
},
{
"epoch": 3.1378457081642197,
"grad_norm": 0.25390625,
"learning_rate": 1.3460087520748454e-05,
"loss": 2.0952,
"num_input_tokens_seen": 6280158129,
"step": 13000
},
{
"epoch": 3.1378457081642197,
"eval_loss": 1.9681209325790405,
"eval_mean_token_accuracy": 0.5785721040555485,
"eval_num_tokens": 2646712354.0,
"eval_runtime": 130.3881,
"eval_samples_per_second": 82.155,
"eval_steps_per_second": 20.539,
"num_input_tokens_seen": 6280158129,
"step": 13000
},
{
"epoch": 3.149916260542873,
"grad_norm": 0.25390625,
"learning_rate": 1.3271465218047383e-05,
"loss": 2.0974,
"mean_token_accuracy": 0.5548031070828437,
"num_input_tokens_seen": 6304365713,
"num_tokens": 2656912031.0,
"step": 13050
},
{
"epoch": 3.1619868129215263,
"grad_norm": 0.24609375,
"learning_rate": 1.3082842915346311e-05,
"loss": 2.0981,
"mean_token_accuracy": 0.5543636172637343,
"num_input_tokens_seen": 6328561217,
"num_tokens": 2667181848.0,
"step": 13100
},
{
"epoch": 3.1740573653001793,
"grad_norm": 0.236328125,
"learning_rate": 1.2894220612645238e-05,
"loss": 2.093,
"mean_token_accuracy": 0.5551713344082236,
"num_input_tokens_seen": 6352657569,
"num_tokens": 2677374089.0,
"step": 13150
},
{
"epoch": 3.186127917678833,
"grad_norm": 0.267578125,
"learning_rate": 1.2705598309944169e-05,
"loss": 2.084,
"mean_token_accuracy": 0.5568741805478931,
"num_input_tokens_seen": 6376750801,
"num_tokens": 2687517529.0,
"step": 13200
},
{
"epoch": 3.198198470057486,
"grad_norm": 0.2578125,
"learning_rate": 1.2516976007243097e-05,
"loss": 2.0985,
"mean_token_accuracy": 0.5545465455949307,
"num_input_tokens_seen": 6400738145,
"num_tokens": 2697615714.0,
"step": 13250
},
{
"epoch": 3.2102690224361394,
"grad_norm": 0.2451171875,
"learning_rate": 1.2328353704542026e-05,
"loss": 2.0969,
"mean_token_accuracy": 0.5544571406021714,
"num_input_tokens_seen": 6424909057,
"num_tokens": 2707784293.0,
"step": 13300
},
{
"epoch": 3.2223395748147925,
"grad_norm": 0.302734375,
"learning_rate": 1.2139731401840953e-05,
"loss": 2.0932,
"mean_token_accuracy": 0.5548350306227803,
"num_input_tokens_seen": 6449111825,
"num_tokens": 2717984302.0,
"step": 13350
},
{
"epoch": 3.2344101271934456,
"grad_norm": 0.228515625,
"learning_rate": 1.1951109099139883e-05,
"loss": 2.1012,
"mean_token_accuracy": 0.5535725425183773,
"num_input_tokens_seen": 6473257953,
"num_tokens": 2728233467.0,
"step": 13400
},
{
"epoch": 3.246480679572099,
"grad_norm": 0.2578125,
"learning_rate": 1.1762486796438812e-05,
"loss": 2.0985,
"mean_token_accuracy": 0.5541856496781111,
"num_input_tokens_seen": 6497464865,
"num_tokens": 2738326366.0,
"step": 13450
},
{
"epoch": 3.258551231950752,
"grad_norm": 0.2412109375,
"learning_rate": 1.157386449373774e-05,
"loss": 2.0911,
"num_input_tokens_seen": 6521634753,
"step": 13500
},
{
"epoch": 3.258551231950752,
"eval_loss": 1.9680596590042114,
"eval_mean_token_accuracy": 0.5785238554199033,
"eval_num_tokens": 2748403907.0,
"eval_runtime": 130.2372,
"eval_samples_per_second": 82.25,
"eval_steps_per_second": 20.562,
"num_input_tokens_seen": 6521634753,
"step": 13500
},
{
"epoch": 3.270621784329405,
"grad_norm": 0.251953125,
"learning_rate": 1.1385242191036669e-05,
"loss": 2.0844,
"mean_token_accuracy": 0.5562084444984794,
"num_input_tokens_seen": 6545823777,
"num_tokens": 2758638062.0,
"step": 13550
},
{
"epoch": 3.2826923367080587,
"grad_norm": 0.24609375,
"learning_rate": 1.1196619888335598e-05,
"loss": 2.089,
"mean_token_accuracy": 0.5565486250445246,
"num_input_tokens_seen": 6569949777,
"num_tokens": 2768698376.0,
"step": 13600
},
{
"epoch": 3.2947628890867118,
"grad_norm": 0.2431640625,
"learning_rate": 1.1007997585634526e-05,
"loss": 2.0915,
"mean_token_accuracy": 0.5548499751463533,
"num_input_tokens_seen": 6593997425,
"num_tokens": 2778806953.0,
"step": 13650
},
{
"epoch": 3.306833441465365,
"grad_norm": 0.330078125,
"learning_rate": 1.0819375282933455e-05,
"loss": 2.0875,
"mean_token_accuracy": 0.5560770154371858,
"num_input_tokens_seen": 6618153121,
"num_tokens": 2789046249.0,
"step": 13700
},
{
"epoch": 3.3189039938440184,
"grad_norm": 0.26171875,
"learning_rate": 1.0630752980232384e-05,
"loss": 2.0974,
"mean_token_accuracy": 0.5540758088976144,
"num_input_tokens_seen": 6642228561,
"num_tokens": 2799134100.0,
"step": 13750
},
{
"epoch": 3.3309745462226714,
"grad_norm": 0.2578125,
"learning_rate": 1.0442130677531312e-05,
"loss": 2.0837,
"mean_token_accuracy": 0.5564264697581529,
"num_input_tokens_seen": 6666487089,
"num_tokens": 2809333203.0,
"step": 13800
},
{
"epoch": 3.343045098601325,
"grad_norm": 0.271484375,
"learning_rate": 1.025350837483024e-05,
"loss": 2.0804,
"mean_token_accuracy": 0.5564664682373405,
"num_input_tokens_seen": 6690592209,
"num_tokens": 2819507621.0,
"step": 13850
},
{
"epoch": 3.355115650979978,
"grad_norm": 0.2578125,
"learning_rate": 1.006488607212917e-05,
"loss": 2.0875,
"mean_token_accuracy": 0.5563617146387696,
"num_input_tokens_seen": 6714782033,
"num_tokens": 2829715451.0,
"step": 13900
},
{
"epoch": 3.367186203358631,
"grad_norm": 0.26171875,
"learning_rate": 9.876263769428096e-06,
"loss": 2.1015,
"mean_token_accuracy": 0.5533242063969374,
"num_input_tokens_seen": 6738954721,
"num_tokens": 2839876349.0,
"step": 13950
},
{
"epoch": 3.3792567557372846,
"grad_norm": 0.2578125,
"learning_rate": 9.687641466727027e-06,
"loss": 2.1018,
"num_input_tokens_seen": 6763271617,
"step": 14000
},
{
"epoch": 3.3792567557372846,
"eval_loss": 1.9681081771850586,
"eval_mean_token_accuracy": 0.5785279828634967,
"eval_num_tokens": 2850147053.0,
"eval_runtime": 131.6179,
"eval_samples_per_second": 81.387,
"eval_steps_per_second": 20.347,
"num_input_tokens_seen": 6763271617,
"step": 14000
},
{
"epoch": 3.3913273081159376,
"grad_norm": 0.25,
"learning_rate": 9.499019164025955e-06,
"loss": 2.0975,
"mean_token_accuracy": 0.5536782286874949,
"num_input_tokens_seen": 6787391841,
"num_tokens": 2860344977.0,
"step": 14050
},
{
"epoch": 3.4033978604945907,
"grad_norm": 0.25,
"learning_rate": 9.310396861324884e-06,
"loss": 2.1022,
"mean_token_accuracy": 0.5538700968772173,
"num_input_tokens_seen": 6811630961,
"num_tokens": 2870526506.0,
"step": 14100
},
{
"epoch": 3.415468412873244,
"grad_norm": 0.2431640625,
"learning_rate": 9.121774558623813e-06,
"loss": 2.0934,
"mean_token_accuracy": 0.5550757900252938,
"num_input_tokens_seen": 6835811825,
"num_tokens": 2880722898.0,
"step": 14150
},
{
"epoch": 3.4275389652518973,
"grad_norm": 0.2578125,
"learning_rate": 8.93315225592274e-06,
"loss": 2.0875,
"mean_token_accuracy": 0.5558257311582565,
"num_input_tokens_seen": 6859918049,
"num_tokens": 2890925546.0,
"step": 14200
},
{
"epoch": 3.439609517630551,
"grad_norm": 0.2294921875,
"learning_rate": 8.74452995322167e-06,
"loss": 2.0969,
"mean_token_accuracy": 0.5544555878639221,
"num_input_tokens_seen": 6883973409,
"num_tokens": 2901007248.0,
"step": 14250
},
{
"epoch": 3.451680070009204,
"grad_norm": 0.25390625,
"learning_rate": 8.555907650520598e-06,
"loss": 2.0987,
"mean_token_accuracy": 0.5544828617200256,
"num_input_tokens_seen": 6908263985,
"num_tokens": 2911355124.0,
"step": 14300
},
{
"epoch": 3.463750622387857,
"grad_norm": 0.271484375,
"learning_rate": 8.367285347819527e-06,
"loss": 2.0889,
"mean_token_accuracy": 0.5557316156104207,
"num_input_tokens_seen": 6932344993,
"num_tokens": 2921442830.0,
"step": 14350
},
{
"epoch": 3.4758211747665104,
"grad_norm": 0.255859375,
"learning_rate": 8.178663045118456e-06,
"loss": 2.0979,
"mean_token_accuracy": 0.5547628674656153,
"num_input_tokens_seen": 6956417041,
"num_tokens": 2931461417.0,
"step": 14400
},
{
"epoch": 3.4878917271451635,
"grad_norm": 0.234375,
"learning_rate": 7.990040742417383e-06,
"loss": 2.1005,
"mean_token_accuracy": 0.5539160283654928,
"num_input_tokens_seen": 6980421889,
"num_tokens": 2941531928.0,
"step": 14450
},
{
"epoch": 3.4999622795238166,
"grad_norm": 0.275390625,
"learning_rate": 7.801418439716313e-06,
"loss": 2.1017,
"num_input_tokens_seen": 7004552193,
"step": 14500
},
{
"epoch": 3.4999622795238166,
"eval_loss": 1.9681284427642822,
"eval_mean_token_accuracy": 0.5785388401566912,
"eval_num_tokens": 2951727207.0,
"eval_runtime": 131.2276,
"eval_samples_per_second": 81.629,
"eval_steps_per_second": 20.407,
"num_input_tokens_seen": 7004552193,
"step": 14500
},
{
"epoch": 3.51203283190247,
"grad_norm": 0.267578125,
"learning_rate": 7.612796137015241e-06,
"loss": 2.09,
"mean_token_accuracy": 0.5543626462481916,
"num_input_tokens_seen": 7028775953,
"num_tokens": 2961945579.0,
"step": 14550
},
{
"epoch": 3.524103384281123,
"grad_norm": 0.26171875,
"learning_rate": 7.42417383431417e-06,
"loss": 2.0978,
"mean_token_accuracy": 0.5544422981515527,
"num_input_tokens_seen": 7052883457,
"num_tokens": 2972173798.0,
"step": 14600
},
{
"epoch": 3.536173936659776,
"grad_norm": 0.251953125,
"learning_rate": 7.235551531613098e-06,
"loss": 2.0915,
"mean_token_accuracy": 0.5559014651551842,
"num_input_tokens_seen": 7077135185,
"num_tokens": 2982315453.0,
"step": 14650
},
{
"epoch": 3.5482444890384297,
"grad_norm": 0.310546875,
"learning_rate": 7.0469292289120274e-06,
"loss": 2.0932,
"mean_token_accuracy": 0.5552764968574047,
"num_input_tokens_seen": 7101260305,
"num_tokens": 2992557355.0,
"step": 14700
},
{
"epoch": 3.5603150414170828,
"grad_norm": 0.25390625,
"learning_rate": 6.858306926210955e-06,
"loss": 2.0959,
"mean_token_accuracy": 0.555088207796216,
"num_input_tokens_seen": 7125198545,
"num_tokens": 3002657117.0,
"step": 14750
},
{
"epoch": 3.572385593795736,
"grad_norm": 0.2314453125,
"learning_rate": 6.669684623509884e-06,
"loss": 2.0933,
"mean_token_accuracy": 0.5554985254630447,
"num_input_tokens_seen": 7149297905,
"num_tokens": 3012818977.0,
"step": 14800
},
{
"epoch": 3.5844561461743893,
"grad_norm": 0.23828125,
"learning_rate": 6.481062320808813e-06,
"loss": 2.0901,
"mean_token_accuracy": 0.5556722393259406,
"num_input_tokens_seen": 7173408417,
"num_tokens": 3022993500.0,
"step": 14850
},
{
"epoch": 3.5965266985530424,
"grad_norm": 0.279296875,
"learning_rate": 6.292440018107741e-06,
"loss": 2.0862,
"mean_token_accuracy": 0.5560053834319114,
"num_input_tokens_seen": 7197689201,
"num_tokens": 3033239485.0,
"step": 14900
},
{
"epoch": 3.608597250931696,
"grad_norm": 0.265625,
"learning_rate": 6.10381771540667e-06,
"loss": 2.093,
"mean_token_accuracy": 0.5550377672165632,
"num_input_tokens_seen": 7221805553,
"num_tokens": 3043356374.0,
"step": 14950
},
{
"epoch": 3.620667803310349,
"grad_norm": 0.24609375,
"learning_rate": 5.915195412705598e-06,
"loss": 2.0994,
"num_input_tokens_seen": 7245951473,
"step": 15000
},
{
"epoch": 3.620667803310349,
"eval_loss": 1.9680702686309814,
"eval_mean_token_accuracy": 0.5785124528710748,
"eval_num_tokens": 3053564564.0,
"eval_runtime": 130.6855,
"eval_samples_per_second": 81.968,
"eval_steps_per_second": 20.492,
"num_input_tokens_seen": 7245951473,
"step": 15000
},
{
"epoch": 3.632738355689002,
"grad_norm": 0.248046875,
"learning_rate": 5.726573110004527e-06,
"loss": 2.0923,
"mean_token_accuracy": 0.554327048882842,
"num_input_tokens_seen": 7269914417,
"num_tokens": 3063722766.0,
"step": 15050
},
{
"epoch": 3.6448089080676556,
"grad_norm": 0.26953125,
"learning_rate": 5.5379508073034565e-06,
"loss": 2.0861,
"mean_token_accuracy": 0.5561711810901762,
"num_input_tokens_seen": 7293956449,
"num_tokens": 3073837103.0,
"step": 15100
},
{
"epoch": 3.6568794604463086,
"grad_norm": 0.255859375,
"learning_rate": 5.349328504602384e-06,
"loss": 2.0949,
"mean_token_accuracy": 0.5547518468275666,
"num_input_tokens_seen": 7318063713,
"num_tokens": 3083990662.0,
"step": 15150
},
{
"epoch": 3.668950012824962,
"grad_norm": 0.26171875,
"learning_rate": 5.160706201901313e-06,
"loss": 2.0874,
"mean_token_accuracy": 0.555564073510468,
"num_input_tokens_seen": 7342169889,
"num_tokens": 3094140237.0,
"step": 15200
},
{
"epoch": 3.681020565203615,
"grad_norm": 0.298828125,
"learning_rate": 4.9720838992002415e-06,
"loss": 2.1014,
"mean_token_accuracy": 0.5540463343262673,
"num_input_tokens_seen": 7366510321,
"num_tokens": 3104349542.0,
"step": 15250
},
{
"epoch": 3.6930911175822683,
"grad_norm": 0.349609375,
"learning_rate": 4.78346159649917e-06,
"loss": 2.1128,
"mean_token_accuracy": 0.5523605942726135,
"num_input_tokens_seen": 7390741489,
"num_tokens": 3114615388.0,
"step": 15300
},
{
"epoch": 3.7051616699609218,
"grad_norm": 0.265625,
"learning_rate": 4.594839293798099e-06,
"loss": 2.0836,
"mean_token_accuracy": 0.5556057692691684,
"num_input_tokens_seen": 7414960897,
"num_tokens": 3124852831.0,
"step": 15350
},
{
"epoch": 3.717232222339575,
"grad_norm": 0.234375,
"learning_rate": 4.406216991097027e-06,
"loss": 2.0832,
"mean_token_accuracy": 0.557135313116014,
"num_input_tokens_seen": 7439141473,
"num_tokens": 3135118289.0,
"step": 15400
},
{
"epoch": 3.729302774718228,
"grad_norm": 0.3671875,
"learning_rate": 4.217594688395956e-06,
"loss": 2.0993,
"mean_token_accuracy": 0.554307484254241,
"num_input_tokens_seen": 7463114529,
"num_tokens": 3145345547.0,
"step": 15450
},
{
"epoch": 3.7413733270968814,
"grad_norm": 0.27734375,
"learning_rate": 4.028972385694885e-06,
"loss": 2.0916,
"num_input_tokens_seen": 7487502401,
"step": 15500
},
{
"epoch": 3.7413733270968814,
"eval_loss": 1.9680771827697754,
"eval_mean_token_accuracy": 0.5784905481975056,
"eval_num_tokens": 3155655029.0,
"eval_runtime": 131.7036,
"eval_samples_per_second": 81.334,
"eval_steps_per_second": 20.334,
"num_input_tokens_seen": 7487502401,
"step": 15500
},
{
"epoch": 3.7534438794755345,
"grad_norm": 0.28125,
"learning_rate": 3.840350082993813e-06,
"loss": 2.0919,
"mean_token_accuracy": 0.5555992320179939,
"num_input_tokens_seen": 7511609537,
"num_tokens": 3165956280.0,
"step": 15550
},
{
"epoch": 3.7655144318541875,
"grad_norm": 0.265625,
"learning_rate": 3.6517277802927423e-06,
"loss": 2.096,
"mean_token_accuracy": 0.5547948920354248,
"num_input_tokens_seen": 7535873665,
"num_tokens": 3176126007.0,
"step": 15600
},
{
"epoch": 3.777584984232841,
"grad_norm": 0.25390625,
"learning_rate": 3.463105477591671e-06,
"loss": 2.0963,
"mean_token_accuracy": 0.5549294283241033,
"num_input_tokens_seen": 7560110225,
"num_tokens": 3186367919.0,
"step": 15650
},
{
"epoch": 3.789655536611494,
"grad_norm": 0.28125,
"learning_rate": 3.274483174890599e-06,
"loss": 2.0986,
"mean_token_accuracy": 0.5547161266207695,
"num_input_tokens_seen": 7584157249,
"num_tokens": 3196441494.0,
"step": 15700
},
{
"epoch": 3.801726088990147,
"grad_norm": 0.2470703125,
"learning_rate": 3.0858608721895278e-06,
"loss": 2.0952,
"mean_token_accuracy": 0.554626210257411,
"num_input_tokens_seen": 7608282737,
"num_tokens": 3206560665.0,
"step": 15750
},
{
"epoch": 3.8137966413688007,
"grad_norm": 0.265625,
"learning_rate": 2.8972385694884564e-06,
"loss": 2.0874,
"mean_token_accuracy": 0.5568051477894187,
"num_input_tokens_seen": 7632420385,
"num_tokens": 3216638197.0,
"step": 15800
},
{
"epoch": 3.8258671937474538,
"grad_norm": 0.357421875,
"learning_rate": 2.708616266787385e-06,
"loss": 2.0935,
"mean_token_accuracy": 0.5554833044111729,
"num_input_tokens_seen": 7656546049,
"num_tokens": 3226721685.0,
"step": 15850
},
{
"epoch": 3.837937746126107,
"grad_norm": 0.216796875,
"learning_rate": 2.5199939640863136e-06,
"loss": 2.1087,
"mean_token_accuracy": 0.5525853624939918,
"num_input_tokens_seen": 7680650689,
"num_tokens": 3236963270.0,
"step": 15900
},
{
"epoch": 3.8500082985047603,
"grad_norm": 0.267578125,
"learning_rate": 2.3313716613852423e-06,
"loss": 2.0952,
"mean_token_accuracy": 0.5550886183232069,
"num_input_tokens_seen": 7704888257,
"num_tokens": 3247161827.0,
"step": 15950
},
{
"epoch": 3.8620788508834134,
"grad_norm": 0.2333984375,
"learning_rate": 2.142749358684171e-06,
"loss": 2.0914,
"num_input_tokens_seen": 7729128049,
"step": 16000
},
{
"epoch": 3.8620788508834134,
"eval_loss": 1.9680593013763428,
"eval_mean_token_accuracy": 0.5785026788667034,
"eval_num_tokens": 3257367290.0,
"eval_runtime": 130.8705,
"eval_samples_per_second": 81.852,
"eval_steps_per_second": 20.463,
"num_input_tokens_seen": 7729128049,
"step": 16000
},
{
"epoch": 3.874149403262067,
"grad_norm": 0.26953125,
"learning_rate": 1.9541270559830995e-06,
"loss": 2.0974,
"mean_token_accuracy": 0.5552676925435662,
"num_input_tokens_seen": 7753351249,
"num_tokens": 3267712597.0,
"step": 16050
},
{
"epoch": 3.88621995564072,
"grad_norm": 0.27734375,
"learning_rate": 1.7655047532820282e-06,
"loss": 2.0956,
"mean_token_accuracy": 0.5543999705091118,
"num_input_tokens_seen": 7777499137,
"num_tokens": 3277902761.0,
"step": 16100
},
{
"epoch": 3.8982905080193735,
"grad_norm": 0.25,
"learning_rate": 1.576882450580957e-06,
"loss": 2.0949,
"mean_token_accuracy": 0.5548696434870363,
"num_input_tokens_seen": 7801633089,
"num_tokens": 3288082221.0,
"step": 16150
},
{
"epoch": 3.9103610603980266,
"grad_norm": 0.24609375,
"learning_rate": 1.3882601478798854e-06,
"loss": 2.0986,
"mean_token_accuracy": 0.5540741100907326,
"num_input_tokens_seen": 7825738481,
"num_tokens": 3298341216.0,
"step": 16200
},
{
"epoch": 3.9224316127766796,
"grad_norm": 0.263671875,
"learning_rate": 1.199637845178814e-06,
"loss": 2.0917,
"mean_token_accuracy": 0.556021711602807,
"num_input_tokens_seen": 7849877409,
"num_tokens": 3308431697.0,
"step": 16250
},
{
"epoch": 3.934502165155333,
"grad_norm": 0.255859375,
"learning_rate": 1.0110155424777427e-06,
"loss": 2.0983,
"mean_token_accuracy": 0.5542013296857476,
"num_input_tokens_seen": 7873955105,
"num_tokens": 3318516095.0,
"step": 16300
},
{
"epoch": 3.946572717533986,
"grad_norm": 0.2490234375,
"learning_rate": 8.223932397766712e-07,
"loss": 2.1049,
"mean_token_accuracy": 0.553213356398046,
"num_input_tokens_seen": 7898153793,
"num_tokens": 3328721862.0,
"step": 16350
},
{
"epoch": 3.9586432699126393,
"grad_norm": 0.2734375,
"learning_rate": 6.337709370755999e-07,
"loss": 2.0981,
"mean_token_accuracy": 0.5537924468889832,
"num_input_tokens_seen": 7922266993,
"num_tokens": 3338965333.0,
"step": 16400
},
{
"epoch": 3.9707138222912928,
"grad_norm": 0.2431640625,
"learning_rate": 4.4514863437452844e-07,
"loss": 2.0952,
"mean_token_accuracy": 0.5555017331615091,
"num_input_tokens_seen": 7946376977,
"num_tokens": 3349129194.0,
"step": 16450
},
{
"epoch": 3.982784374669946,
"grad_norm": 0.279296875,
"learning_rate": 2.565263316734571e-07,
"loss": 2.0973,
"num_input_tokens_seen": 7970557393,
"step": 16500
},
{
"epoch": 3.982784374669946,
"eval_loss": 1.9680447578430176,
"eval_mean_token_accuracy": 0.5785172289325018,
"eval_num_tokens": 3359357384.0,
"eval_runtime": 131.3028,
"eval_samples_per_second": 81.582,
"eval_steps_per_second": 20.396,
"num_input_tokens_seen": 7970557393,
"step": 16500
},
{
"epoch": 3.994854927048599,
"grad_norm": 0.2392578125,
"learning_rate": 6.79040289723857e-08,
"loss": 2.0972,
"mean_token_accuracy": 0.5545667923986912,
"num_input_tokens_seen": 7994787313,
"num_tokens": 3369677528.0,
"step": 16550
}
],
"logging_steps": 50,
"max_steps": 16568,
"num_input_tokens_seen": 8003589233,
"num_train_epochs": 4,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.141038234858414e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}