{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.29329096908224367, "eval_steps": 500, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.2905268957838416, "epoch": 0.0031284370035439325, "grad_norm": 0.43689030408859253, "learning_rate": 9.999787808528638e-05, "loss": 1.7852, "mean_token_accuracy": 0.6027710330672562, "num_tokens": 524288.0, "step": 16 }, { "entropy": 1.3333816397935152, "epoch": 0.006256874007087865, "grad_norm": 0.5030925273895264, "learning_rate": 9.99909372761763e-05, "loss": 1.2521, "mean_token_accuracy": 0.6930539021268487, "num_tokens": 1048576.0, "step": 32 }, { "entropy": 0.8592402129434049, "epoch": 0.009385311010631798, "grad_norm": 0.17196309566497803, "learning_rate": 9.99791688121494e-05, "loss": 0.8735, "mean_token_accuracy": 0.789072047919035, "num_tokens": 1572864.0, "step": 48 }, { "entropy": 0.8306971751153469, "epoch": 0.01251374801417573, "grad_norm": 0.18978707492351532, "learning_rate": 9.996257382969333e-05, "loss": 0.8334, "mean_token_accuracy": 0.7984222243539989, "num_tokens": 2096964.0, "step": 64 }, { "entropy": 0.7896183831617236, "epoch": 0.015642185017719662, "grad_norm": 0.13939695060253143, "learning_rate": 9.994115393139555e-05, "loss": 0.7922, "mean_token_accuracy": 0.8059685812331736, "num_tokens": 2621252.0, "step": 80 }, { "entropy": 0.7537235538475215, "epoch": 0.018770622021263596, "grad_norm": 0.14827103912830353, "learning_rate": 9.991491118578856e-05, "loss": 0.7564, "mean_token_accuracy": 0.8114575678482652, "num_tokens": 3145540.0, "step": 96 }, { "entropy": 0.7533294446766376, "epoch": 0.02189905902480753, "grad_norm": 0.14420101046562195, "learning_rate": 9.988384812715006e-05, "loss": 0.7549, "mean_token_accuracy": 0.8126546451821923, "num_tokens": 3669828.0, "step": 112 }, { "entropy": 0.735235239379108, "epoch": 0.02502749602835146, "grad_norm": 0.15388034284114838, "learning_rate": 9.984796775525836e-05, "loss": 0.7378, "mean_token_accuracy": 0.813876539003104, "num_tokens": 4194116.0, "step": 128 }, { "entropy": 0.745816265232861, "epoch": 0.028155933031895394, "grad_norm": 0.16290415823459625, "learning_rate": 9.980727353510257e-05, "loss": 0.7469, "mean_token_accuracy": 0.8129773042164743, "num_tokens": 4718404.0, "step": 144 }, { "entropy": 0.7049960081931204, "epoch": 0.031284370035439324, "grad_norm": 0.16859838366508484, "learning_rate": 9.976176939654804e-05, "loss": 0.7068, "mean_token_accuracy": 0.8205063017085195, "num_tokens": 5242421.0, "step": 160 }, { "entropy": 0.7164251236245036, "epoch": 0.03441280703898326, "grad_norm": 0.18480005860328674, "learning_rate": 9.971145973395684e-05, "loss": 0.7162, "mean_token_accuracy": 0.8197130090557039, "num_tokens": 5766709.0, "step": 176 }, { "entropy": 0.7345189340412617, "epoch": 0.03754124404252719, "grad_norm": 0.20077235996723175, "learning_rate": 9.965634940576338e-05, "loss": 0.7373, "mean_token_accuracy": 0.8166410801932216, "num_tokens": 6290997.0, "step": 192 }, { "entropy": 0.6868870840407908, "epoch": 0.040669681046071125, "grad_norm": 0.19620533287525177, "learning_rate": 9.959644373400523e-05, "loss": 0.6877, "mean_token_accuracy": 0.8243126338347793, "num_tokens": 6814838.0, "step": 208 }, { "entropy": 0.7020489743445069, "epoch": 0.04379811804961506, "grad_norm": 0.21126843988895416, "learning_rate": 9.953174850380918e-05, "loss": 0.7008, "mean_token_accuracy": 0.8215000284835696, "num_tokens": 7339126.0, "step": 224 }, { "entropy": 0.6887436110991985, "epoch": 0.046926555053158986, "grad_norm": 0.20873498916625977, "learning_rate": 9.946226996283258e-05, "loss": 0.6878, "mean_token_accuracy": 0.825064530596137, "num_tokens": 7863414.0, "step": 240 }, { "entropy": 0.6778992149047554, "epoch": 0.05005499205670292, "grad_norm": 0.20576342940330505, "learning_rate": 9.938801482065998e-05, "loss": 0.6807, "mean_token_accuracy": 0.8253566385246813, "num_tokens": 8387702.0, "step": 256 }, { "entropy": 0.6773952257353812, "epoch": 0.05318342906024685, "grad_norm": 0.2139802724123001, "learning_rate": 9.930899024815517e-05, "loss": 0.6734, "mean_token_accuracy": 0.8275064108893275, "num_tokens": 8911990.0, "step": 272 }, { "entropy": 0.6875678761862218, "epoch": 0.05631186606379079, "grad_norm": 0.22833411395549774, "learning_rate": 9.922520387676868e-05, "loss": 0.6866, "mean_token_accuracy": 0.8259179475717247, "num_tokens": 9436278.0, "step": 288 }, { "entropy": 0.6587484122719616, "epoch": 0.05944030306733472, "grad_norm": 0.23073996603488922, "learning_rate": 9.91366637978009e-05, "loss": 0.6573, "mean_token_accuracy": 0.8297159126959741, "num_tokens": 9960272.0, "step": 304 }, { "entropy": 0.6715414742939174, "epoch": 0.06256874007087865, "grad_norm": 0.2512345612049103, "learning_rate": 9.904337856162053e-05, "loss": 0.6645, "mean_token_accuracy": 0.8285545711405575, "num_tokens": 10484560.0, "step": 320 }, { "entropy": 0.6463351501151919, "epoch": 0.06569717707442259, "grad_norm": 0.2298312485218048, "learning_rate": 9.894535717683902e-05, "loss": 0.6429, "mean_token_accuracy": 0.8320847055874765, "num_tokens": 11008848.0, "step": 336 }, { "entropy": 0.6667697406373918, "epoch": 0.06882561407796652, "grad_norm": 0.22004173696041107, "learning_rate": 9.884260910944053e-05, "loss": 0.6681, "mean_token_accuracy": 0.8278767997398973, "num_tokens": 11533136.0, "step": 352 }, { "entropy": 0.6179311077576131, "epoch": 0.07195405108151044, "grad_norm": 0.22624558210372925, "learning_rate": 9.873514428186778e-05, "loss": 0.6173, "mean_token_accuracy": 0.8369016530923545, "num_tokens": 12057424.0, "step": 368 }, { "entropy": 0.6683201459236443, "epoch": 0.07508248808505438, "grad_norm": 0.24455475807189941, "learning_rate": 9.862297307206392e-05, "loss": 0.667, "mean_token_accuracy": 0.828629030380398, "num_tokens": 12581712.0, "step": 384 }, { "entropy": 0.6426783930510283, "epoch": 0.07821092508859831, "grad_norm": 0.23066706955432892, "learning_rate": 9.850610631247019e-05, "loss": 0.6415, "mean_token_accuracy": 0.8333791512995958, "num_tokens": 13106000.0, "step": 400 }, { "entropy": 0.6531911985948682, "epoch": 0.08133936209214225, "grad_norm": 0.24164645373821259, "learning_rate": 9.838455528897998e-05, "loss": 0.6551, "mean_token_accuracy": 0.8303816900588572, "num_tokens": 13630288.0, "step": 416 }, { "entropy": 0.6194327082484961, "epoch": 0.08446779909568618, "grad_norm": 0.23268474638462067, "learning_rate": 9.82583317398488e-05, "loss": 0.6175, "mean_token_accuracy": 0.8367584650404751, "num_tokens": 14154576.0, "step": 432 }, { "entropy": 0.6329398893285543, "epoch": 0.08759623609923012, "grad_norm": 0.24408580362796783, "learning_rate": 9.81274478545608e-05, "loss": 0.6325, "mean_token_accuracy": 0.8347576032392681, "num_tokens": 14678864.0, "step": 448 }, { "entropy": 0.6229353230446577, "epoch": 0.09072467310277404, "grad_norm": 0.24314868450164795, "learning_rate": 9.79919162726516e-05, "loss": 0.6222, "mean_token_accuracy": 0.8371938765048981, "num_tokens": 15203026.0, "step": 464 }, { "entropy": 0.6139514590613544, "epoch": 0.09385311010631797, "grad_norm": 0.25224220752716064, "learning_rate": 9.785175008248768e-05, "loss": 0.614, "mean_token_accuracy": 0.838007087353617, "num_tokens": 15727314.0, "step": 480 }, { "entropy": 0.6109699255321175, "epoch": 0.09698154710986191, "grad_norm": 0.24364836513996124, "learning_rate": 9.770696282000244e-05, "loss": 0.6106, "mean_token_accuracy": 0.8386371252126992, "num_tokens": 16251602.0, "step": 496 }, { "entropy": 0.6176430773921311, "epoch": 0.10010998411340584, "grad_norm": 0.2621923089027405, "learning_rate": 9.755756846738902e-05, "loss": 0.615, "mean_token_accuracy": 0.8376702214591205, "num_tokens": 16775637.0, "step": 512 }, { "entropy": 0.6160387259442359, "epoch": 0.10323842111694978, "grad_norm": 0.2385210245847702, "learning_rate": 9.740358145174998e-05, "loss": 0.6184, "mean_token_accuracy": 0.8375125988386571, "num_tokens": 17299925.0, "step": 528 }, { "entropy": 0.6175491204485297, "epoch": 0.1063668581204937, "grad_norm": 0.2511584460735321, "learning_rate": 9.724501664370418e-05, "loss": 0.617, "mean_token_accuracy": 0.8368921047076583, "num_tokens": 17824213.0, "step": 544 }, { "entropy": 0.6241217039059848, "epoch": 0.10949529512403763, "grad_norm": 0.25787243247032166, "learning_rate": 9.708188935595059e-05, "loss": 0.6254, "mean_token_accuracy": 0.8372699371539056, "num_tokens": 18348187.0, "step": 560 }, { "entropy": 0.6261336030438542, "epoch": 0.11262373212758157, "grad_norm": 0.2410293072462082, "learning_rate": 9.691421534178966e-05, "loss": 0.6246, "mean_token_accuracy": 0.8367050038650632, "num_tokens": 18872475.0, "step": 576 }, { "entropy": 0.6158532982226461, "epoch": 0.1157521691311255, "grad_norm": 0.25348979234695435, "learning_rate": 9.674201079360188e-05, "loss": 0.6152, "mean_token_accuracy": 0.8399296645075083, "num_tokens": 19396763.0, "step": 592 }, { "entropy": 0.601322092814371, "epoch": 0.11888060613466944, "grad_norm": 0.2700308859348297, "learning_rate": 9.656529234128418e-05, "loss": 0.6021, "mean_token_accuracy": 0.8411092776805162, "num_tokens": 19920930.0, "step": 608 }, { "entropy": 0.6039648232981563, "epoch": 0.12200904313821337, "grad_norm": 0.24755984544754028, "learning_rate": 9.638407705064392e-05, "loss": 0.6039, "mean_token_accuracy": 0.8405463420785964, "num_tokens": 20445218.0, "step": 624 }, { "entropy": 0.608759083552286, "epoch": 0.1251374801417573, "grad_norm": 0.26475900411605835, "learning_rate": 9.619838242175083e-05, "loss": 0.6077, "mean_token_accuracy": 0.8407429889775813, "num_tokens": 20969506.0, "step": 640 }, { "entropy": 0.5959294943604618, "epoch": 0.12826591714530122, "grad_norm": 0.24157044291496277, "learning_rate": 9.600822638724705e-05, "loss": 0.5964, "mean_token_accuracy": 0.8421691716648638, "num_tokens": 21493794.0, "step": 656 }, { "entropy": 0.5929773084353656, "epoch": 0.13139435414884518, "grad_norm": 0.2490786910057068, "learning_rate": 9.581362731061536e-05, "loss": 0.5924, "mean_token_accuracy": 0.8422665409743786, "num_tokens": 22018082.0, "step": 672 }, { "entropy": 0.5856014562305063, "epoch": 0.1345227911523891, "grad_norm": 0.26487597823143005, "learning_rate": 9.561460398440577e-05, "loss": 0.5845, "mean_token_accuracy": 0.8448229790665209, "num_tokens": 22542370.0, "step": 688 }, { "entropy": 0.5972939203493297, "epoch": 0.13765122815593303, "grad_norm": 0.2536788582801819, "learning_rate": 9.54111756284207e-05, "loss": 0.5979, "mean_token_accuracy": 0.8417689246125519, "num_tokens": 23066518.0, "step": 704 }, { "entropy": 0.5793864431325346, "epoch": 0.14077966515947696, "grad_norm": 0.2498546540737152, "learning_rate": 9.520336188785905e-05, "loss": 0.5797, "mean_token_accuracy": 0.8449356239289045, "num_tokens": 23590806.0, "step": 720 }, { "entropy": 0.5852908829692751, "epoch": 0.14390810216302088, "grad_norm": 0.2533164322376251, "learning_rate": 9.499118283141887e-05, "loss": 0.5851, "mean_token_accuracy": 0.8440230167470872, "num_tokens": 24115094.0, "step": 736 }, { "entropy": 0.5853700931183994, "epoch": 0.14703653916656484, "grad_norm": 0.2594294250011444, "learning_rate": 9.477465894935939e-05, "loss": 0.5833, "mean_token_accuracy": 0.8439973699860275, "num_tokens": 24638793.0, "step": 752 }, { "entropy": 0.5820769551210105, "epoch": 0.15016497617010877, "grad_norm": 0.26996445655822754, "learning_rate": 9.455381115152234e-05, "loss": 0.5813, "mean_token_accuracy": 0.84427694324404, "num_tokens": 25163081.0, "step": 768 }, { "entropy": 0.5865267035551369, "epoch": 0.1532934131736527, "grad_norm": 0.2632192075252533, "learning_rate": 9.432866076531248e-05, "loss": 0.5865, "mean_token_accuracy": 0.843591536860913, "num_tokens": 25687369.0, "step": 784 }, { "entropy": 0.5832636400591582, "epoch": 0.15642185017719662, "grad_norm": 0.2748562693595886, "learning_rate": 9.409922953363824e-05, "loss": 0.5814, "mean_token_accuracy": 0.8445194149389863, "num_tokens": 26211657.0, "step": 800 }, { "entropy": 0.5827851279173046, "epoch": 0.15955028718074055, "grad_norm": 0.27480757236480713, "learning_rate": 9.386553961281179e-05, "loss": 0.5829, "mean_token_accuracy": 0.8449146216735244, "num_tokens": 26735945.0, "step": 816 }, { "entropy": 0.5730462830979377, "epoch": 0.1626787241842845, "grad_norm": 0.26577237248420715, "learning_rate": 9.362761357040956e-05, "loss": 0.5748, "mean_token_accuracy": 0.8454892951995134, "num_tokens": 27260233.0, "step": 832 }, { "entropy": 0.5673604859039187, "epoch": 0.16580716118782843, "grad_norm": 0.27280473709106445, "learning_rate": 9.338547438309269e-05, "loss": 0.5659, "mean_token_accuracy": 0.8469823002815247, "num_tokens": 27784521.0, "step": 848 }, { "entropy": 0.5737447079736739, "epoch": 0.16893559819137235, "grad_norm": 0.243574321269989, "learning_rate": 9.313914543438835e-05, "loss": 0.5735, "mean_token_accuracy": 0.8462529699318111, "num_tokens": 28308797.0, "step": 864 }, { "entropy": 0.5753675031010062, "epoch": 0.17206403519491628, "grad_norm": 0.2583043575286865, "learning_rate": 9.288865051243142e-05, "loss": 0.5747, "mean_token_accuracy": 0.8464267165400088, "num_tokens": 28833085.0, "step": 880 }, { "entropy": 0.5841653808020055, "epoch": 0.17519247219846024, "grad_norm": 0.2513315677642822, "learning_rate": 9.263401380766739e-05, "loss": 0.5837, "mean_token_accuracy": 0.8444525892846286, "num_tokens": 29357373.0, "step": 896 }, { "entropy": 0.5843619098886847, "epoch": 0.17832090920200416, "grad_norm": 0.2498634308576584, "learning_rate": 9.237525991051615e-05, "loss": 0.5848, "mean_token_accuracy": 0.8453418002463877, "num_tokens": 29881380.0, "step": 912 }, { "entropy": 0.5886711834464222, "epoch": 0.1814493462055481, "grad_norm": 0.25676679611206055, "learning_rate": 9.211241380899739e-05, "loss": 0.589, "mean_token_accuracy": 0.8431676919572055, "num_tokens": 30405668.0, "step": 928 }, { "entropy": 0.5667355505283922, "epoch": 0.18457778320909202, "grad_norm": 0.2631304860115051, "learning_rate": 9.184550088631741e-05, "loss": 0.5636, "mean_token_accuracy": 0.8479502676054835, "num_tokens": 30929956.0, "step": 944 }, { "entropy": 0.5632404731586576, "epoch": 0.18770622021263594, "grad_norm": 0.24389183521270752, "learning_rate": 9.157454691841789e-05, "loss": 0.5652, "mean_token_accuracy": 0.8481321800500154, "num_tokens": 31453452.0, "step": 960 }, { "entropy": 0.5674644499085844, "epoch": 0.1908346572161799, "grad_norm": 0.25306281447410583, "learning_rate": 9.129957807148666e-05, "loss": 0.5651, "mean_token_accuracy": 0.8487979606725276, "num_tokens": 31977740.0, "step": 976 }, { "entropy": 0.5516653840895742, "epoch": 0.19396309421972383, "grad_norm": 0.26283109188079834, "learning_rate": 9.102062089943086e-05, "loss": 0.5535, "mean_token_accuracy": 0.8500809515826404, "num_tokens": 32502028.0, "step": 992 }, { "entropy": 0.561382147250697, "epoch": 0.19709153122326775, "grad_norm": 0.25755682587623596, "learning_rate": 9.07377023413126e-05, "loss": 0.5586, "mean_token_accuracy": 0.8493707147426903, "num_tokens": 33025959.0, "step": 1008 }, { "entropy": 0.5676966737955809, "epoch": 0.20021996822681168, "grad_norm": 0.25775137543678284, "learning_rate": 9.045084971874738e-05, "loss": 0.5671, "mean_token_accuracy": 0.847740254830569, "num_tokens": 33550247.0, "step": 1024 }, { "entropy": 0.5602117348462343, "epoch": 0.2033484052303556, "grad_norm": 0.25713664293289185, "learning_rate": 9.016009073326571e-05, "loss": 0.5619, "mean_token_accuracy": 0.8491683504544199, "num_tokens": 34074535.0, "step": 1040 }, { "entropy": 0.58018215931952, "epoch": 0.20647684223389956, "grad_norm": 0.2585735619068146, "learning_rate": 8.986545346363792e-05, "loss": 0.5792, "mean_token_accuracy": 0.8453449127264321, "num_tokens": 34598649.0, "step": 1056 }, { "entropy": 0.5575782191008329, "epoch": 0.2096052792374435, "grad_norm": 0.2617577612400055, "learning_rate": 8.956696636316255e-05, "loss": 0.5558, "mean_token_accuracy": 0.8505983497016132, "num_tokens": 35122937.0, "step": 1072 }, { "entropy": 0.5580868402030319, "epoch": 0.2127337162409874, "grad_norm": 0.2838793992996216, "learning_rate": 8.926465825691865e-05, "loss": 0.5585, "mean_token_accuracy": 0.8499568556435406, "num_tokens": 35647225.0, "step": 1088 }, { "entropy": 0.571673326427117, "epoch": 0.21586215324453134, "grad_norm": 0.26587942242622375, "learning_rate": 8.895855833898207e-05, "loss": 0.5705, "mean_token_accuracy": 0.8471988807432353, "num_tokens": 36171427.0, "step": 1104 }, { "entropy": 0.568238423904404, "epoch": 0.21899059024807527, "grad_norm": 0.24594295024871826, "learning_rate": 8.864869616960625e-05, "loss": 0.5682, "mean_token_accuracy": 0.8479999089613557, "num_tokens": 36695715.0, "step": 1120 }, { "entropy": 0.5731002090033144, "epoch": 0.22211902725161922, "grad_norm": 0.25455793738365173, "learning_rate": 8.833510167236747e-05, "loss": 0.5732, "mean_token_accuracy": 0.8478013505227864, "num_tokens": 37220003.0, "step": 1136 }, { "entropy": 0.5481538840103894, "epoch": 0.22524746425516315, "grad_norm": 0.26975589990615845, "learning_rate": 8.801780513127513e-05, "loss": 0.5475, "mean_token_accuracy": 0.8506444320082664, "num_tokens": 37743882.0, "step": 1152 }, { "entropy": 0.5527894860133529, "epoch": 0.22837590125870708, "grad_norm": 0.2522968053817749, "learning_rate": 8.769683718784734e-05, "loss": 0.5516, "mean_token_accuracy": 0.8505086144432425, "num_tokens": 38268170.0, "step": 1168 }, { "entropy": 0.5540862991474569, "epoch": 0.231504338262251, "grad_norm": 0.2609933912754059, "learning_rate": 8.737222883815164e-05, "loss": 0.5526, "mean_token_accuracy": 0.8506460795179009, "num_tokens": 38792458.0, "step": 1184 }, { "entropy": 0.555841225432232, "epoch": 0.23463277526579493, "grad_norm": 0.2725919187068939, "learning_rate": 8.704401142981184e-05, "loss": 0.5554, "mean_token_accuracy": 0.8496399251744151, "num_tokens": 39316746.0, "step": 1200 }, { "entropy": 0.5415672848466784, "epoch": 0.23776121226933888, "grad_norm": 0.2661166191101074, "learning_rate": 8.671221665898073e-05, "loss": 0.5435, "mean_token_accuracy": 0.8517133295536041, "num_tokens": 39841034.0, "step": 1216 }, { "entropy": 0.5415612279903144, "epoch": 0.2408896492728828, "grad_norm": 0.25321218371391296, "learning_rate": 8.637687656727913e-05, "loss": 0.5406, "mean_token_accuracy": 0.8533380702137947, "num_tokens": 40365322.0, "step": 1232 }, { "entropy": 0.5538674369454384, "epoch": 0.24401808627642674, "grad_norm": 0.2791917622089386, "learning_rate": 8.60380235387016e-05, "loss": 0.5518, "mean_token_accuracy": 0.8515777760185301, "num_tokens": 40889610.0, "step": 1248 }, { "entropy": 0.5463056627195328, "epoch": 0.24714652327997066, "grad_norm": 0.2911370098590851, "learning_rate": 8.569569029648923e-05, "loss": 0.5462, "mean_token_accuracy": 0.8518756083212793, "num_tokens": 41413898.0, "step": 1264 }, { "entropy": 0.5479311102535576, "epoch": 0.2502749602835146, "grad_norm": 0.26030269265174866, "learning_rate": 8.53499098999693e-05, "loss": 0.5497, "mean_token_accuracy": 0.8511806610040367, "num_tokens": 41938186.0, "step": 1280 }, { "entropy": 0.5538808973506093, "epoch": 0.2534033972870585, "grad_norm": 0.26965585350990295, "learning_rate": 8.500071574136295e-05, "loss": 0.5537, "mean_token_accuracy": 0.8513314896263182, "num_tokens": 42462474.0, "step": 1296 }, { "entropy": 0.5512072397395968, "epoch": 0.25653183429060245, "grad_norm": 0.24840131402015686, "learning_rate": 8.46481415425604e-05, "loss": 0.5487, "mean_token_accuracy": 0.8515388667583466, "num_tokens": 42986398.0, "step": 1312 }, { "entropy": 0.5752683402970433, "epoch": 0.2596602712941464, "grad_norm": 0.26552262902259827, "learning_rate": 8.429222135186427e-05, "loss": 0.5776, "mean_token_accuracy": 0.8456897586584091, "num_tokens": 43510686.0, "step": 1328 }, { "entropy": 0.5338181289844215, "epoch": 0.26278870829769035, "grad_norm": 0.2537406086921692, "learning_rate": 8.393298954070178e-05, "loss": 0.5323, "mean_token_accuracy": 0.8548826249316335, "num_tokens": 44034974.0, "step": 1344 }, { "entropy": 0.5521234918851405, "epoch": 0.2659171453012343, "grad_norm": 0.2752557098865509, "learning_rate": 8.357048080030522e-05, "loss": 0.5512, "mean_token_accuracy": 0.851849777624011, "num_tokens": 44559253.0, "step": 1360 }, { "entropy": 0.5413030001800507, "epoch": 0.2690455823047782, "grad_norm": 0.2753056585788727, "learning_rate": 8.320473013836196e-05, "loss": 0.5387, "mean_token_accuracy": 0.8531146934255958, "num_tokens": 45083541.0, "step": 1376 }, { "entropy": 0.5636138301342726, "epoch": 0.27217401930832213, "grad_norm": 0.2751758396625519, "learning_rate": 8.283577287563367e-05, "loss": 0.5662, "mean_token_accuracy": 0.8482500137761235, "num_tokens": 45607829.0, "step": 1392 }, { "entropy": 0.5462560928426683, "epoch": 0.27530245631186606, "grad_norm": 0.2786915898323059, "learning_rate": 8.246364464254539e-05, "loss": 0.5458, "mean_token_accuracy": 0.8512360248714685, "num_tokens": 46132117.0, "step": 1408 }, { "entropy": 0.5498062786646187, "epoch": 0.27843089331541, "grad_norm": 0.28462111949920654, "learning_rate": 8.20883813757447e-05, "loss": 0.5479, "mean_token_accuracy": 0.851142474450171, "num_tokens": 46656405.0, "step": 1424 }, { "entropy": 0.561028536176309, "epoch": 0.2815593303189539, "grad_norm": 0.28582215309143066, "learning_rate": 8.171001931463122e-05, "loss": 0.56, "mean_token_accuracy": 0.8496361062861979, "num_tokens": 47180693.0, "step": 1440 }, { "entropy": 0.5402022732887417, "epoch": 0.28468776732249784, "grad_norm": 0.25975897908210754, "learning_rate": 8.132859499785707e-05, "loss": 0.5393, "mean_token_accuracy": 0.8542964975349605, "num_tokens": 47704981.0, "step": 1456 }, { "entropy": 0.5573246807325631, "epoch": 0.28781620432604177, "grad_norm": 0.2716176211833954, "learning_rate": 8.094414525979822e-05, "loss": 0.56, "mean_token_accuracy": 0.8493000832386315, "num_tokens": 48229269.0, "step": 1472 }, { "entropy": 0.5363587085157633, "epoch": 0.29094464132958575, "grad_norm": 0.26990601420402527, "learning_rate": 8.055670722699736e-05, "loss": 0.5353, "mean_token_accuracy": 0.8545466028153896, "num_tokens": 48753557.0, "step": 1488 } ], "logging_steps": 16, "max_steps": 5115, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.1754438561418445e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }