{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.29329096908224367,
  "eval_steps": 500,
  "global_step": 1500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "entropy": 1.2905268957838416,
      "epoch": 0.0031284370035439325,
      "grad_norm": 0.43689030408859253,
      "learning_rate": 9.999787808528638e-05,
      "loss": 1.7852,
      "mean_token_accuracy": 0.6027710330672562,
      "num_tokens": 524288.0,
      "step": 16
    },
    {
      "entropy": 1.3333816397935152,
      "epoch": 0.006256874007087865,
      "grad_norm": 0.5030925273895264,
      "learning_rate": 9.99909372761763e-05,
      "loss": 1.2521,
      "mean_token_accuracy": 0.6930539021268487,
      "num_tokens": 1048576.0,
      "step": 32
    },
    {
      "entropy": 0.8592402129434049,
      "epoch": 0.009385311010631798,
      "grad_norm": 0.17196309566497803,
      "learning_rate": 9.99791688121494e-05,
      "loss": 0.8735,
      "mean_token_accuracy": 0.789072047919035,
      "num_tokens": 1572864.0,
      "step": 48
    },
    {
      "entropy": 0.8306971751153469,
      "epoch": 0.01251374801417573,
      "grad_norm": 0.18978707492351532,
      "learning_rate": 9.996257382969333e-05,
      "loss": 0.8334,
      "mean_token_accuracy": 0.7984222243539989,
      "num_tokens": 2096964.0,
      "step": 64
    },
    {
      "entropy": 0.7896183831617236,
      "epoch": 0.015642185017719662,
      "grad_norm": 0.13939695060253143,
      "learning_rate": 9.994115393139555e-05,
      "loss": 0.7922,
      "mean_token_accuracy": 0.8059685812331736,
      "num_tokens": 2621252.0,
      "step": 80
    },
    {
      "entropy": 0.7537235538475215,
      "epoch": 0.018770622021263596,
      "grad_norm": 0.14827103912830353,
      "learning_rate": 9.991491118578856e-05,
      "loss": 0.7564,
      "mean_token_accuracy": 0.8114575678482652,
      "num_tokens": 3145540.0,
      "step": 96
    },
    {
      "entropy": 0.7533294446766376,
      "epoch": 0.02189905902480753,
      "grad_norm": 0.14420101046562195,
      "learning_rate": 9.988384812715006e-05,
      "loss": 0.7549,
      "mean_token_accuracy": 0.8126546451821923,
      "num_tokens": 3669828.0,
      "step": 112
    },
    {
      "entropy": 0.735235239379108,
      "epoch": 0.02502749602835146,
      "grad_norm": 0.15388034284114838,
      "learning_rate": 9.984796775525836e-05,
      "loss": 0.7378,
      "mean_token_accuracy": 0.813876539003104,
      "num_tokens": 4194116.0,
      "step": 128
    },
    {
      "entropy": 0.745816265232861,
      "epoch": 0.028155933031895394,
      "grad_norm": 0.16290415823459625,
      "learning_rate": 9.980727353510257e-05,
      "loss": 0.7469,
      "mean_token_accuracy": 0.8129773042164743,
      "num_tokens": 4718404.0,
      "step": 144
    },
    {
      "entropy": 0.7049960081931204,
      "epoch": 0.031284370035439324,
      "grad_norm": 0.16859838366508484,
      "learning_rate": 9.976176939654804e-05,
      "loss": 0.7068,
      "mean_token_accuracy": 0.8205063017085195,
      "num_tokens": 5242421.0,
      "step": 160
    },
    {
      "entropy": 0.7164251236245036,
      "epoch": 0.03441280703898326,
      "grad_norm": 0.18480005860328674,
      "learning_rate": 9.971145973395684e-05,
      "loss": 0.7162,
      "mean_token_accuracy": 0.8197130090557039,
      "num_tokens": 5766709.0,
      "step": 176
    },
    {
      "entropy": 0.7345189340412617,
      "epoch": 0.03754124404252719,
      "grad_norm": 0.20077235996723175,
      "learning_rate": 9.965634940576338e-05,
      "loss": 0.7373,
      "mean_token_accuracy": 0.8166410801932216,
      "num_tokens": 6290997.0,
      "step": 192
    },
    {
      "entropy": 0.6868870840407908,
      "epoch": 0.040669681046071125,
      "grad_norm": 0.19620533287525177,
      "learning_rate": 9.959644373400523e-05,
      "loss": 0.6877,
      "mean_token_accuracy": 0.8243126338347793,
      "num_tokens": 6814838.0,
      "step": 208
    },
    {
      "entropy": 0.7020489743445069,
      "epoch": 0.04379811804961506,
      "grad_norm": 0.21126843988895416,
      "learning_rate": 9.953174850380918e-05,
      "loss": 0.7008,
      "mean_token_accuracy": 0.8215000284835696,
      "num_tokens": 7339126.0,
      "step": 224
    },
    {
      "entropy": 0.6887436110991985,
      "epoch": 0.046926555053158986,
      "grad_norm": 0.20873498916625977,
      "learning_rate": 9.946226996283258e-05,
      "loss": 0.6878,
      "mean_token_accuracy": 0.825064530596137,
      "num_tokens": 7863414.0,
      "step": 240
    },
    {
      "entropy": 0.6778992149047554,
      "epoch": 0.05005499205670292,
      "grad_norm": 0.20576342940330505,
      "learning_rate": 9.938801482065998e-05,
      "loss": 0.6807,
      "mean_token_accuracy": 0.8253566385246813,
      "num_tokens": 8387702.0,
      "step": 256
    },
    {
      "entropy": 0.6773952257353812,
      "epoch": 0.05318342906024685,
      "grad_norm": 0.2139802724123001,
      "learning_rate": 9.930899024815517e-05,
      "loss": 0.6734,
      "mean_token_accuracy": 0.8275064108893275,
      "num_tokens": 8911990.0,
      "step": 272
    },
    {
      "entropy": 0.6875678761862218,
      "epoch": 0.05631186606379079,
      "grad_norm": 0.22833411395549774,
      "learning_rate": 9.922520387676868e-05,
      "loss": 0.6866,
      "mean_token_accuracy": 0.8259179475717247,
      "num_tokens": 9436278.0,
      "step": 288
    },
    {
      "entropy": 0.6587484122719616,
      "epoch": 0.05944030306733472,
      "grad_norm": 0.23073996603488922,
      "learning_rate": 9.91366637978009e-05,
      "loss": 0.6573,
      "mean_token_accuracy": 0.8297159126959741,
      "num_tokens": 9960272.0,
      "step": 304
    },
    {
      "entropy": 0.6715414742939174,
      "epoch": 0.06256874007087865,
      "grad_norm": 0.2512345612049103,
      "learning_rate": 9.904337856162053e-05,
      "loss": 0.6645,
      "mean_token_accuracy": 0.8285545711405575,
      "num_tokens": 10484560.0,
      "step": 320
    },
    {
      "entropy": 0.6463351501151919,
      "epoch": 0.06569717707442259,
      "grad_norm": 0.2298312485218048,
      "learning_rate": 9.894535717683902e-05,
      "loss": 0.6429,
      "mean_token_accuracy": 0.8320847055874765,
      "num_tokens": 11008848.0,
      "step": 336
    },
    {
      "entropy": 0.6667697406373918,
      "epoch": 0.06882561407796652,
      "grad_norm": 0.22004173696041107,
      "learning_rate": 9.884260910944053e-05,
      "loss": 0.6681,
      "mean_token_accuracy": 0.8278767997398973,
      "num_tokens": 11533136.0,
      "step": 352
    },
    {
      "entropy": 0.6179311077576131,
      "epoch": 0.07195405108151044,
      "grad_norm": 0.22624558210372925,
      "learning_rate": 9.873514428186778e-05,
      "loss": 0.6173,
      "mean_token_accuracy": 0.8369016530923545,
      "num_tokens": 12057424.0,
      "step": 368
    },
    {
      "entropy": 0.6683201459236443,
      "epoch": 0.07508248808505438,
      "grad_norm": 0.24455475807189941,
      "learning_rate": 9.862297307206392e-05,
      "loss": 0.667,
      "mean_token_accuracy": 0.828629030380398,
      "num_tokens": 12581712.0,
      "step": 384
    },
    {
      "entropy": 0.6426783930510283,
      "epoch": 0.07821092508859831,
      "grad_norm": 0.23066706955432892,
      "learning_rate": 9.850610631247019e-05,
      "loss": 0.6415,
      "mean_token_accuracy": 0.8333791512995958,
      "num_tokens": 13106000.0,
      "step": 400
    },
    {
      "entropy": 0.6531911985948682,
      "epoch": 0.08133936209214225,
      "grad_norm": 0.24164645373821259,
      "learning_rate": 9.838455528897998e-05,
      "loss": 0.6551,
      "mean_token_accuracy": 0.8303816900588572,
      "num_tokens": 13630288.0,
      "step": 416
    },
    {
      "entropy": 0.6194327082484961,
      "epoch": 0.08446779909568618,
      "grad_norm": 0.23268474638462067,
      "learning_rate": 9.82583317398488e-05,
      "loss": 0.6175,
      "mean_token_accuracy": 0.8367584650404751,
      "num_tokens": 14154576.0,
      "step": 432
    },
    {
      "entropy": 0.6329398893285543,
      "epoch": 0.08759623609923012,
      "grad_norm": 0.24408580362796783,
      "learning_rate": 9.81274478545608e-05,
      "loss": 0.6325,
      "mean_token_accuracy": 0.8347576032392681,
      "num_tokens": 14678864.0,
      "step": 448
    },
    {
      "entropy": 0.6229353230446577,
      "epoch": 0.09072467310277404,
      "grad_norm": 0.24314868450164795,
      "learning_rate": 9.79919162726516e-05,
      "loss": 0.6222,
      "mean_token_accuracy": 0.8371938765048981,
      "num_tokens": 15203026.0,
      "step": 464
    },
    {
      "entropy": 0.6139514590613544,
      "epoch": 0.09385311010631797,
      "grad_norm": 0.25224220752716064,
      "learning_rate": 9.785175008248768e-05,
      "loss": 0.614,
      "mean_token_accuracy": 0.838007087353617,
      "num_tokens": 15727314.0,
      "step": 480
    },
    {
      "entropy": 0.6109699255321175,
      "epoch": 0.09698154710986191,
      "grad_norm": 0.24364836513996124,
      "learning_rate": 9.770696282000244e-05,
      "loss": 0.6106,
      "mean_token_accuracy": 0.8386371252126992,
      "num_tokens": 16251602.0,
      "step": 496
    },
    {
      "entropy": 0.6176430773921311,
      "epoch": 0.10010998411340584,
      "grad_norm": 0.2621923089027405,
      "learning_rate": 9.755756846738902e-05,
      "loss": 0.615,
      "mean_token_accuracy": 0.8376702214591205,
      "num_tokens": 16775637.0,
      "step": 512
    },
    {
      "entropy": 0.6160387259442359,
      "epoch": 0.10323842111694978,
      "grad_norm": 0.2385210245847702,
      "learning_rate": 9.740358145174998e-05,
      "loss": 0.6184,
      "mean_token_accuracy": 0.8375125988386571,
      "num_tokens": 17299925.0,
      "step": 528
    },
    {
      "entropy": 0.6175491204485297,
      "epoch": 0.1063668581204937,
      "grad_norm": 0.2511584460735321,
      "learning_rate": 9.724501664370418e-05,
      "loss": 0.617,
      "mean_token_accuracy": 0.8368921047076583,
      "num_tokens": 17824213.0,
      "step": 544
    },
    {
      "entropy": 0.6241217039059848,
      "epoch": 0.10949529512403763,
      "grad_norm": 0.25787243247032166,
      "learning_rate": 9.708188935595059e-05,
      "loss": 0.6254,
      "mean_token_accuracy": 0.8372699371539056,
      "num_tokens": 18348187.0,
      "step": 560
    },
    {
      "entropy": 0.6261336030438542,
      "epoch": 0.11262373212758157,
      "grad_norm": 0.2410293072462082,
      "learning_rate": 9.691421534178966e-05,
      "loss": 0.6246,
      "mean_token_accuracy": 0.8367050038650632,
      "num_tokens": 18872475.0,
      "step": 576
    },
    {
      "entropy": 0.6158532982226461,
      "epoch": 0.1157521691311255,
      "grad_norm": 0.25348979234695435,
      "learning_rate": 9.674201079360188e-05,
      "loss": 0.6152,
      "mean_token_accuracy": 0.8399296645075083,
      "num_tokens": 19396763.0,
      "step": 592
    },
    {
      "entropy": 0.601322092814371,
      "epoch": 0.11888060613466944,
      "grad_norm": 0.2700308859348297,
      "learning_rate": 9.656529234128418e-05,
      "loss": 0.6021,
      "mean_token_accuracy": 0.8411092776805162,
      "num_tokens": 19920930.0,
      "step": 608
    },
    {
      "entropy": 0.6039648232981563,
      "epoch": 0.12200904313821337,
      "grad_norm": 0.24755984544754028,
      "learning_rate": 9.638407705064392e-05,
      "loss": 0.6039,
      "mean_token_accuracy": 0.8405463420785964,
      "num_tokens": 20445218.0,
      "step": 624
    },
    {
      "entropy": 0.608759083552286,
      "epoch": 0.1251374801417573,
      "grad_norm": 0.26475900411605835,
      "learning_rate": 9.619838242175083e-05,
      "loss": 0.6077,
      "mean_token_accuracy": 0.8407429889775813,
      "num_tokens": 20969506.0,
      "step": 640
    },
    {
      "entropy": 0.5959294943604618,
      "epoch": 0.12826591714530122,
      "grad_norm": 0.24157044291496277,
      "learning_rate": 9.600822638724705e-05,
      "loss": 0.5964,
      "mean_token_accuracy": 0.8421691716648638,
      "num_tokens": 21493794.0,
      "step": 656
    },
    {
      "entropy": 0.5929773084353656,
      "epoch": 0.13139435414884518,
      "grad_norm": 0.2490786910057068,
      "learning_rate": 9.581362731061536e-05,
      "loss": 0.5924,
      "mean_token_accuracy": 0.8422665409743786,
      "num_tokens": 22018082.0,
      "step": 672
    },
    {
      "entropy": 0.5856014562305063,
      "epoch": 0.1345227911523891,
      "grad_norm": 0.26487597823143005,
      "learning_rate": 9.561460398440577e-05,
      "loss": 0.5845,
      "mean_token_accuracy": 0.8448229790665209,
      "num_tokens": 22542370.0,
      "step": 688
    },
    {
      "entropy": 0.5972939203493297,
      "epoch": 0.13765122815593303,
      "grad_norm": 0.2536788582801819,
      "learning_rate": 9.54111756284207e-05,
      "loss": 0.5979,
      "mean_token_accuracy": 0.8417689246125519,
      "num_tokens": 23066518.0,
      "step": 704
    },
    {
      "entropy": 0.5793864431325346,
      "epoch": 0.14077966515947696,
      "grad_norm": 0.2498546540737152,
      "learning_rate": 9.520336188785905e-05,
      "loss": 0.5797,
      "mean_token_accuracy": 0.8449356239289045,
      "num_tokens": 23590806.0,
      "step": 720
    },
    {
      "entropy": 0.5852908829692751,
      "epoch": 0.14390810216302088,
      "grad_norm": 0.2533164322376251,
      "learning_rate": 9.499118283141887e-05,
      "loss": 0.5851,
      "mean_token_accuracy": 0.8440230167470872,
      "num_tokens": 24115094.0,
      "step": 736
    },
    {
      "entropy": 0.5853700931183994,
      "epoch": 0.14703653916656484,
      "grad_norm": 0.2594294250011444,
      "learning_rate": 9.477465894935939e-05,
      "loss": 0.5833,
      "mean_token_accuracy": 0.8439973699860275,
      "num_tokens": 24638793.0,
      "step": 752
    },
    {
      "entropy": 0.5820769551210105,
      "epoch": 0.15016497617010877,
      "grad_norm": 0.26996445655822754,
      "learning_rate": 9.455381115152234e-05,
      "loss": 0.5813,
      "mean_token_accuracy": 0.84427694324404,
      "num_tokens": 25163081.0,
      "step": 768
    },
    {
      "entropy": 0.5865267035551369,
      "epoch": 0.1532934131736527,
      "grad_norm": 0.2632192075252533,
      "learning_rate": 9.432866076531248e-05,
      "loss": 0.5865,
      "mean_token_accuracy": 0.843591536860913,
      "num_tokens": 25687369.0,
      "step": 784
    },
    {
      "entropy": 0.5832636400591582,
      "epoch": 0.15642185017719662,
      "grad_norm": 0.2748562693595886,
      "learning_rate": 9.409922953363824e-05,
      "loss": 0.5814,
      "mean_token_accuracy": 0.8445194149389863,
      "num_tokens": 26211657.0,
      "step": 800
    },
    {
      "entropy": 0.5827851279173046,
      "epoch": 0.15955028718074055,
      "grad_norm": 0.27480757236480713,
      "learning_rate": 9.386553961281179e-05,
      "loss": 0.5829,
      "mean_token_accuracy": 0.8449146216735244,
      "num_tokens": 26735945.0,
      "step": 816
    },
    {
      "entropy": 0.5730462830979377,
      "epoch": 0.1626787241842845,
      "grad_norm": 0.26577237248420715,
      "learning_rate": 9.362761357040956e-05,
      "loss": 0.5748,
      "mean_token_accuracy": 0.8454892951995134,
      "num_tokens": 27260233.0,
      "step": 832
    },
    {
      "entropy": 0.5673604859039187,
      "epoch": 0.16580716118782843,
      "grad_norm": 0.27280473709106445,
      "learning_rate": 9.338547438309269e-05,
      "loss": 0.5659,
      "mean_token_accuracy": 0.8469823002815247,
      "num_tokens": 27784521.0,
      "step": 848
    },
    {
      "entropy": 0.5737447079736739,
      "epoch": 0.16893559819137235,
      "grad_norm": 0.243574321269989,
      "learning_rate": 9.313914543438835e-05,
      "loss": 0.5735,
      "mean_token_accuracy": 0.8462529699318111,
      "num_tokens": 28308797.0,
      "step": 864
    },
    {
      "entropy": 0.5753675031010062,
      "epoch": 0.17206403519491628,
      "grad_norm": 0.2583043575286865,
      "learning_rate": 9.288865051243142e-05,
      "loss": 0.5747,
      "mean_token_accuracy": 0.8464267165400088,
      "num_tokens": 28833085.0,
      "step": 880
    },
    {
      "entropy": 0.5841653808020055,
      "epoch": 0.17519247219846024,
      "grad_norm": 0.2513315677642822,
      "learning_rate": 9.263401380766739e-05,
      "loss": 0.5837,
      "mean_token_accuracy": 0.8444525892846286,
      "num_tokens": 29357373.0,
      "step": 896
    },
    {
      "entropy": 0.5843619098886847,
      "epoch": 0.17832090920200416,
      "grad_norm": 0.2498634308576584,
      "learning_rate": 9.237525991051615e-05,
      "loss": 0.5848,
      "mean_token_accuracy": 0.8453418002463877,
      "num_tokens": 29881380.0,
      "step": 912
    },
    {
      "entropy": 0.5886711834464222,
      "epoch": 0.1814493462055481,
      "grad_norm": 0.25676679611206055,
      "learning_rate": 9.211241380899739e-05,
      "loss": 0.589,
      "mean_token_accuracy": 0.8431676919572055,
      "num_tokens": 30405668.0,
      "step": 928
    },
    {
      "entropy": 0.5667355505283922,
      "epoch": 0.18457778320909202,
      "grad_norm": 0.2631304860115051,
      "learning_rate": 9.184550088631741e-05,
      "loss": 0.5636,
      "mean_token_accuracy": 0.8479502676054835,
      "num_tokens": 30929956.0,
      "step": 944
    },
    {
      "entropy": 0.5632404731586576,
      "epoch": 0.18770622021263594,
      "grad_norm": 0.24389183521270752,
      "learning_rate": 9.157454691841789e-05,
      "loss": 0.5652,
      "mean_token_accuracy": 0.8481321800500154,
      "num_tokens": 31453452.0,
      "step": 960
    },
    {
      "entropy": 0.5674644499085844,
      "epoch": 0.1908346572161799,
      "grad_norm": 0.25306281447410583,
      "learning_rate": 9.129957807148666e-05,
      "loss": 0.5651,
      "mean_token_accuracy": 0.8487979606725276,
      "num_tokens": 31977740.0,
      "step": 976
    },
    {
      "entropy": 0.5516653840895742,
      "epoch": 0.19396309421972383,
      "grad_norm": 0.26283109188079834,
      "learning_rate": 9.102062089943086e-05,
      "loss": 0.5535,
      "mean_token_accuracy": 0.8500809515826404,
      "num_tokens": 32502028.0,
      "step": 992
    },
    {
      "entropy": 0.561382147250697,
      "epoch": 0.19709153122326775,
      "grad_norm": 0.25755682587623596,
      "learning_rate": 9.07377023413126e-05,
      "loss": 0.5586,
      "mean_token_accuracy": 0.8493707147426903,
      "num_tokens": 33025959.0,
      "step": 1008
    },
    {
      "entropy": 0.5676966737955809,
      "epoch": 0.20021996822681168,
      "grad_norm": 0.25775137543678284,
      "learning_rate": 9.045084971874738e-05,
      "loss": 0.5671,
      "mean_token_accuracy": 0.847740254830569,
      "num_tokens": 33550247.0,
      "step": 1024
    },
    {
      "entropy": 0.5602117348462343,
      "epoch": 0.2033484052303556,
      "grad_norm": 0.25713664293289185,
      "learning_rate": 9.016009073326571e-05,
      "loss": 0.5619,
      "mean_token_accuracy": 0.8491683504544199,
      "num_tokens": 34074535.0,
      "step": 1040
    },
    {
      "entropy": 0.58018215931952,
      "epoch": 0.20647684223389956,
      "grad_norm": 0.2585735619068146,
      "learning_rate": 8.986545346363792e-05,
      "loss": 0.5792,
      "mean_token_accuracy": 0.8453449127264321,
      "num_tokens": 34598649.0,
      "step": 1056
    },
    {
      "entropy": 0.5575782191008329,
      "epoch": 0.2096052792374435,
      "grad_norm": 0.2617577612400055,
      "learning_rate": 8.956696636316255e-05,
      "loss": 0.5558,
      "mean_token_accuracy": 0.8505983497016132,
      "num_tokens": 35122937.0,
      "step": 1072
    },
    {
      "entropy": 0.5580868402030319,
      "epoch": 0.2127337162409874,
      "grad_norm": 0.2838793992996216,
      "learning_rate": 8.926465825691865e-05,
      "loss": 0.5585,
      "mean_token_accuracy": 0.8499568556435406,
      "num_tokens": 35647225.0,
      "step": 1088
    },
    {
      "entropy": 0.571673326427117,
      "epoch": 0.21586215324453134,
      "grad_norm": 0.26587942242622375,
      "learning_rate": 8.895855833898207e-05,
      "loss": 0.5705,
      "mean_token_accuracy": 0.8471988807432353,
      "num_tokens": 36171427.0,
      "step": 1104
    },
    {
      "entropy": 0.568238423904404,
      "epoch": 0.21899059024807527,
      "grad_norm": 0.24594295024871826,
      "learning_rate": 8.864869616960625e-05,
      "loss": 0.5682,
      "mean_token_accuracy": 0.8479999089613557,
      "num_tokens": 36695715.0,
      "step": 1120
    },
    {
      "entropy": 0.5731002090033144,
      "epoch": 0.22211902725161922,
      "grad_norm": 0.25455793738365173,
      "learning_rate": 8.833510167236747e-05,
      "loss": 0.5732,
      "mean_token_accuracy": 0.8478013505227864,
      "num_tokens": 37220003.0,
      "step": 1136
    },
    {
      "entropy": 0.5481538840103894,
      "epoch": 0.22524746425516315,
      "grad_norm": 0.26975589990615845,
      "learning_rate": 8.801780513127513e-05,
      "loss": 0.5475,
      "mean_token_accuracy": 0.8506444320082664,
      "num_tokens": 37743882.0,
      "step": 1152
    },
    {
      "entropy": 0.5527894860133529,
      "epoch": 0.22837590125870708,
      "grad_norm": 0.2522968053817749,
      "learning_rate": 8.769683718784734e-05,
      "loss": 0.5516,
      "mean_token_accuracy": 0.8505086144432425,
      "num_tokens": 38268170.0,
      "step": 1168
    },
    {
      "entropy": 0.5540862991474569,
      "epoch": 0.231504338262251,
      "grad_norm": 0.2609933912754059,
      "learning_rate": 8.737222883815164e-05,
      "loss": 0.5526,
      "mean_token_accuracy": 0.8506460795179009,
      "num_tokens": 38792458.0,
      "step": 1184
    },
    {
      "entropy": 0.555841225432232,
      "epoch": 0.23463277526579493,
      "grad_norm": 0.2725919187068939,
      "learning_rate": 8.704401142981184e-05,
      "loss": 0.5554,
      "mean_token_accuracy": 0.8496399251744151,
      "num_tokens": 39316746.0,
      "step": 1200
    },
    {
      "entropy": 0.5415672848466784,
      "epoch": 0.23776121226933888,
      "grad_norm": 0.2661166191101074,
      "learning_rate": 8.671221665898073e-05,
      "loss": 0.5435,
      "mean_token_accuracy": 0.8517133295536041,
      "num_tokens": 39841034.0,
      "step": 1216
    },
    {
      "entropy": 0.5415612279903144,
      "epoch": 0.2408896492728828,
      "grad_norm": 0.25321218371391296,
      "learning_rate": 8.637687656727913e-05,
      "loss": 0.5406,
      "mean_token_accuracy": 0.8533380702137947,
      "num_tokens": 40365322.0,
      "step": 1232
    },
    {
      "entropy": 0.5538674369454384,
      "epoch": 0.24401808627642674,
      "grad_norm": 0.2791917622089386,
      "learning_rate": 8.60380235387016e-05,
      "loss": 0.5518,
      "mean_token_accuracy": 0.8515777760185301,
      "num_tokens": 40889610.0,
      "step": 1248
    },
    {
      "entropy": 0.5463056627195328,
      "epoch": 0.24714652327997066,
      "grad_norm": 0.2911370098590851,
      "learning_rate": 8.569569029648923e-05,
      "loss": 0.5462,
      "mean_token_accuracy": 0.8518756083212793,
      "num_tokens": 41413898.0,
      "step": 1264
    },
    {
      "entropy": 0.5479311102535576,
      "epoch": 0.2502749602835146,
      "grad_norm": 0.26030269265174866,
      "learning_rate": 8.53499098999693e-05,
      "loss": 0.5497,
      "mean_token_accuracy": 0.8511806610040367,
      "num_tokens": 41938186.0,
      "step": 1280
    },
    {
      "entropy": 0.5538808973506093,
      "epoch": 0.2534033972870585,
      "grad_norm": 0.26965585350990295,
      "learning_rate": 8.500071574136295e-05,
      "loss": 0.5537,
      "mean_token_accuracy": 0.8513314896263182,
      "num_tokens": 42462474.0,
      "step": 1296
    },
    {
      "entropy": 0.5512072397395968,
      "epoch": 0.25653183429060245,
      "grad_norm": 0.24840131402015686,
      "learning_rate": 8.46481415425604e-05,
      "loss": 0.5487,
      "mean_token_accuracy": 0.8515388667583466,
      "num_tokens": 42986398.0,
      "step": 1312
    },
    {
      "entropy": 0.5752683402970433,
      "epoch": 0.2596602712941464,
      "grad_norm": 0.26552262902259827,
      "learning_rate": 8.429222135186427e-05,
      "loss": 0.5776,
      "mean_token_accuracy": 0.8456897586584091,
      "num_tokens": 43510686.0,
      "step": 1328
    },
    {
      "entropy": 0.5338181289844215,
      "epoch": 0.26278870829769035,
      "grad_norm": 0.2537406086921692,
      "learning_rate": 8.393298954070178e-05,
      "loss": 0.5323,
      "mean_token_accuracy": 0.8548826249316335,
      "num_tokens": 44034974.0,
      "step": 1344
    },
    {
      "entropy": 0.5521234918851405,
      "epoch": 0.2659171453012343,
      "grad_norm": 0.2752557098865509,
      "learning_rate": 8.357048080030522e-05,
      "loss": 0.5512,
      "mean_token_accuracy": 0.851849777624011,
      "num_tokens": 44559253.0,
      "step": 1360
    },
    {
      "entropy": 0.5413030001800507,
      "epoch": 0.2690455823047782,
      "grad_norm": 0.2753056585788727,
      "learning_rate": 8.320473013836196e-05,
      "loss": 0.5387,
      "mean_token_accuracy": 0.8531146934255958,
      "num_tokens": 45083541.0,
      "step": 1376
    },
    {
      "entropy": 0.5636138301342726,
      "epoch": 0.27217401930832213,
      "grad_norm": 0.2751758396625519,
      "learning_rate": 8.283577287563367e-05,
      "loss": 0.5662,
      "mean_token_accuracy": 0.8482500137761235,
      "num_tokens": 45607829.0,
      "step": 1392
    },
    {
      "entropy": 0.5462560928426683,
      "epoch": 0.27530245631186606,
      "grad_norm": 0.2786915898323059,
      "learning_rate": 8.246364464254539e-05,
      "loss": 0.5458,
      "mean_token_accuracy": 0.8512360248714685,
      "num_tokens": 46132117.0,
      "step": 1408
    },
    {
      "entropy": 0.5498062786646187,
      "epoch": 0.27843089331541,
      "grad_norm": 0.28462111949920654,
      "learning_rate": 8.20883813757447e-05,
      "loss": 0.5479,
      "mean_token_accuracy": 0.851142474450171,
      "num_tokens": 46656405.0,
      "step": 1424
    },
    {
      "entropy": 0.561028536176309,
      "epoch": 0.2815593303189539,
      "grad_norm": 0.28582215309143066,
      "learning_rate": 8.171001931463122e-05,
      "loss": 0.56,
      "mean_token_accuracy": 0.8496361062861979,
      "num_tokens": 47180693.0,
      "step": 1440
    },
    {
      "entropy": 0.5402022732887417,
      "epoch": 0.28468776732249784,
      "grad_norm": 0.25975897908210754,
      "learning_rate": 8.132859499785707e-05,
      "loss": 0.5393,
      "mean_token_accuracy": 0.8542964975349605,
      "num_tokens": 47704981.0,
      "step": 1456
    },
    {
      "entropy": 0.5573246807325631,
      "epoch": 0.28781620432604177,
      "grad_norm": 0.2716176211833954,
      "learning_rate": 8.094414525979822e-05,
      "loss": 0.56,
      "mean_token_accuracy": 0.8493000832386315,
      "num_tokens": 48229269.0,
      "step": 1472
    },
    {
      "entropy": 0.5363587085157633,
      "epoch": 0.29094464132958575,
      "grad_norm": 0.26990601420402527,
      "learning_rate": 8.055670722699736e-05,
      "loss": 0.5353,
      "mean_token_accuracy": 0.8545466028153896,
      "num_tokens": 48753557.0,
      "step": 1488
    }
  ],
  "logging_steps": 16,
  "max_steps": 5115,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 4.1754438561418445e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}