diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13354 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9587999085893375, + "eval_steps": 500, + "global_step": 60000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.3703370141983031, + "epoch": 0.0016323332571577813, + "grad_norm": 37.0, + "learning_rate": 9.595300261096606e-08, + "loss": 0.5452, + "mean_token_accuracy": 0.8990191352367402, + "num_tokens": 7314577.0, + "step": 50 + }, + { + "entropy": 1.3720017457008362, + "epoch": 0.0032646665143155626, + "grad_norm": 41.5, + "learning_rate": 1.9386422976501306e-07, + "loss": 0.5496, + "mean_token_accuracy": 0.8973769438266754, + "num_tokens": 14588101.0, + "step": 100 + }, + { + "entropy": 1.3811770510673522, + "epoch": 0.004896999771473344, + "grad_norm": 30.25, + "learning_rate": 2.9177545691906004e-07, + "loss": 0.496, + "mean_token_accuracy": 0.9052803874015808, + "num_tokens": 22193383.0, + "step": 150 + }, + { + "entropy": 1.3709602856636047, + "epoch": 0.006529333028631125, + "grad_norm": 10.0, + "learning_rate": 3.896866840731071e-07, + "loss": 0.412, + "mean_token_accuracy": 0.9263910567760467, + "num_tokens": 29160848.0, + "step": 200 + }, + { + "entropy": 1.3785104417800904, + "epoch": 0.008161666285788906, + "grad_norm": 7.40625, + "learning_rate": 4.875979112271541e-07, + "loss": 0.3539, + "mean_token_accuracy": 0.9421093094348908, + "num_tokens": 35829209.0, + "step": 250 + }, + { + "entropy": 1.4016437172889709, + "epoch": 0.009793999542946689, + "grad_norm": 8.1875, + "learning_rate": 5.855091383812011e-07, + "loss": 0.3663, + "mean_token_accuracy": 0.9401827538013459, + "num_tokens": 42980520.0, + "step": 300 + }, + { + "entropy": 1.4059402322769166, + "epoch": 0.01142633280010447, + "grad_norm": 4.84375, + "learning_rate": 6.83420365535248e-07, + "loss": 0.3226, + "mean_token_accuracy": 0.9456453359127045, + "num_tokens": 49839839.0, + "step": 350 + }, + { + "entropy": 1.4296678924560546, + "epoch": 0.01305866605726225, + "grad_norm": 3.796875, + "learning_rate": 7.813315926892951e-07, + "loss": 0.3287, + "mean_token_accuracy": 0.9431667315959931, + "num_tokens": 57114933.0, + "step": 400 + }, + { + "entropy": 1.4395745277404786, + "epoch": 0.014690999314420031, + "grad_norm": 3.390625, + "learning_rate": 8.79242819843342e-07, + "loss": 0.3109, + "mean_token_accuracy": 0.9427984356880188, + "num_tokens": 64178392.0, + "step": 450 + }, + { + "entropy": 1.4591435599327087, + "epoch": 0.016323332571577812, + "grad_norm": 3.890625, + "learning_rate": 9.77154046997389e-07, + "loss": 0.3093, + "mean_token_accuracy": 0.9412086343765259, + "num_tokens": 71359680.0, + "step": 500 + }, + { + "epoch": 0.016323332571577812, + "eval_entropy": 1.4762075742085774, + "eval_loss": 0.324710875749588, + "eval_mean_token_accuracy": 0.9405147298177083, + "eval_num_tokens": 71359680.0, + "eval_runtime": 743.1027, + "eval_samples_per_second": 12.994, + "eval_steps_per_second": 0.102, + "step": 500 + }, + { + "entropy": 1.5086383247375488, + "epoch": 0.017955665828735593, + "grad_norm": 2.890625, + "learning_rate": 1.075065274151436e-06, + "loss": 0.2984, + "mean_token_accuracy": 0.9443651103973388, + "num_tokens": 78414153.0, + "step": 550 + }, + { + "entropy": 1.5364352035522462, + "epoch": 0.019587999085893378, + "grad_norm": 5.96875, + "learning_rate": 1.172976501305483e-06, + "loss": 0.2817, + "mean_token_accuracy": 0.9466875243186951, + "num_tokens": 85753500.0, + "step": 600 + }, + { + "entropy": 1.5489245629310608, + "epoch": 0.02122033234305116, + "grad_norm": 1.9453125, + "learning_rate": 1.27088772845953e-06, + "loss": 0.2808, + "mean_token_accuracy": 0.9457735085487365, + "num_tokens": 92871116.0, + "step": 650 + }, + { + "entropy": 1.5694326043128968, + "epoch": 0.02285266560020894, + "grad_norm": 2.03125, + "learning_rate": 1.368798955613577e-06, + "loss": 0.2981, + "mean_token_accuracy": 0.9427003371715545, + "num_tokens": 100022246.0, + "step": 700 + }, + { + "entropy": 1.5554496097564696, + "epoch": 0.02448499885736672, + "grad_norm": 2.71875, + "learning_rate": 1.466710182767624e-06, + "loss": 0.2684, + "mean_token_accuracy": 0.9482632505893708, + "num_tokens": 107214871.0, + "step": 750 + }, + { + "entropy": 1.5609500217437744, + "epoch": 0.0261173321145245, + "grad_norm": 4.25, + "learning_rate": 1.5646214099216712e-06, + "loss": 0.269, + "mean_token_accuracy": 0.9476964402198792, + "num_tokens": 114188698.0, + "step": 800 + }, + { + "entropy": 1.5485923647880555, + "epoch": 0.027749665371682282, + "grad_norm": 10.9375, + "learning_rate": 1.662532637075718e-06, + "loss": 0.2584, + "mean_token_accuracy": 0.9505482971668243, + "num_tokens": 121264967.0, + "step": 850 + }, + { + "entropy": 1.546497621536255, + "epoch": 0.029381998628840063, + "grad_norm": 1.8046875, + "learning_rate": 1.760443864229765e-06, + "loss": 0.2473, + "mean_token_accuracy": 0.9520090806484223, + "num_tokens": 127984569.0, + "step": 900 + }, + { + "entropy": 1.5344351577758788, + "epoch": 0.031014331885997844, + "grad_norm": 2.359375, + "learning_rate": 1.8583550913838121e-06, + "loss": 0.2728, + "mean_token_accuracy": 0.9471143686771393, + "num_tokens": 135471574.0, + "step": 950 + }, + { + "entropy": 1.5414926147460937, + "epoch": 0.032646665143155625, + "grad_norm": 1.765625, + "learning_rate": 1.956266318537859e-06, + "loss": 0.2573, + "mean_token_accuracy": 0.95045654296875, + "num_tokens": 142637684.0, + "step": 1000 + }, + { + "epoch": 0.032646665143155625, + "eval_entropy": 1.544297873179118, + "eval_loss": 0.2730070948600769, + "eval_mean_token_accuracy": 0.9475241343180338, + "eval_num_tokens": 142637684.0, + "eval_runtime": 746.6617, + "eval_samples_per_second": 12.932, + "eval_steps_per_second": 0.102, + "step": 1000 + }, + { + "entropy": 1.5506397199630737, + "epoch": 0.034278998400313405, + "grad_norm": 1.9140625, + "learning_rate": 2.054177545691906e-06, + "loss": 0.2524, + "mean_token_accuracy": 0.9507447695732116, + "num_tokens": 149507260.0, + "step": 1050 + }, + { + "entropy": 1.544835422039032, + "epoch": 0.035911331657471186, + "grad_norm": 1.703125, + "learning_rate": 2.152088772845953e-06, + "loss": 0.2477, + "mean_token_accuracy": 0.9511877942085266, + "num_tokens": 156809907.0, + "step": 1100 + }, + { + "entropy": 1.545062973499298, + "epoch": 0.03754366491462897, + "grad_norm": 1.7890625, + "learning_rate": 2.25e-06, + "loss": 0.2381, + "mean_token_accuracy": 0.9530003690719604, + "num_tokens": 163878508.0, + "step": 1150 + }, + { + "entropy": 1.5258307456970215, + "epoch": 0.039175998171786755, + "grad_norm": 1.4609375, + "learning_rate": 2.347911227154047e-06, + "loss": 0.2225, + "mean_token_accuracy": 0.9549448072910309, + "num_tokens": 170927568.0, + "step": 1200 + }, + { + "entropy": 1.5170037484169006, + "epoch": 0.040808331428944536, + "grad_norm": 1.375, + "learning_rate": 2.445822454308094e-06, + "loss": 0.2238, + "mean_token_accuracy": 0.9546501576900482, + "num_tokens": 178389889.0, + "step": 1250 + }, + { + "entropy": 1.5287164187431335, + "epoch": 0.04244066468610232, + "grad_norm": 1.921875, + "learning_rate": 2.543733681462141e-06, + "loss": 0.2471, + "mean_token_accuracy": 0.9508370912075043, + "num_tokens": 186069673.0, + "step": 1300 + }, + { + "entropy": 1.5002574920654297, + "epoch": 0.0440729979432601, + "grad_norm": 2.046875, + "learning_rate": 2.641644908616188e-06, + "loss": 0.2332, + "mean_token_accuracy": 0.953369448184967, + "num_tokens": 193396348.0, + "step": 1350 + }, + { + "entropy": 1.5036597728729248, + "epoch": 0.04570533120041788, + "grad_norm": 1.5078125, + "learning_rate": 2.739556135770235e-06, + "loss": 0.2245, + "mean_token_accuracy": 0.9541668140888214, + "num_tokens": 200593430.0, + "step": 1400 + }, + { + "entropy": 1.4917612624168397, + "epoch": 0.04733766445757566, + "grad_norm": 2.265625, + "learning_rate": 2.837467362924282e-06, + "loss": 0.2193, + "mean_token_accuracy": 0.9556196844577789, + "num_tokens": 207639383.0, + "step": 1450 + }, + { + "entropy": 1.4869768619537354, + "epoch": 0.04896999771473344, + "grad_norm": 2.46875, + "learning_rate": 2.935378590078329e-06, + "loss": 0.2195, + "mean_token_accuracy": 0.9560010933876038, + "num_tokens": 214438210.0, + "step": 1500 + }, + { + "epoch": 0.04896999771473344, + "eval_entropy": 1.4914683151245116, + "eval_loss": 0.24329085648059845, + "eval_mean_token_accuracy": 0.9516400265693664, + "eval_num_tokens": 214438210.0, + "eval_runtime": 743.0423, + "eval_samples_per_second": 12.995, + "eval_steps_per_second": 0.102, + "step": 1500 + }, + { + "entropy": 1.478922975063324, + "epoch": 0.05060233097189122, + "grad_norm": 1.5390625, + "learning_rate": 3.033289817232376e-06, + "loss": 0.2176, + "mean_token_accuracy": 0.9555383801460267, + "num_tokens": 221777722.0, + "step": 1550 + }, + { + "entropy": 1.4821615958213805, + "epoch": 0.052234664229049, + "grad_norm": 1.6328125, + "learning_rate": 3.131201044386423e-06, + "loss": 0.2317, + "mean_token_accuracy": 0.9527480769157409, + "num_tokens": 229386810.0, + "step": 1600 + }, + { + "entropy": 1.4608716344833375, + "epoch": 0.05386699748620678, + "grad_norm": 2.078125, + "learning_rate": 3.22911227154047e-06, + "loss": 0.2195, + "mean_token_accuracy": 0.9549958789348603, + "num_tokens": 237068902.0, + "step": 1650 + }, + { + "entropy": 1.4769957304000854, + "epoch": 0.055499330743364564, + "grad_norm": 1.765625, + "learning_rate": 3.327023498694517e-06, + "loss": 0.2305, + "mean_token_accuracy": 0.9542369735240936, + "num_tokens": 243773147.0, + "step": 1700 + }, + { + "entropy": 1.4610289931297302, + "epoch": 0.057131664000522345, + "grad_norm": 1.796875, + "learning_rate": 3.424934725848564e-06, + "loss": 0.2108, + "mean_token_accuracy": 0.9570872175693512, + "num_tokens": 250400651.0, + "step": 1750 + }, + { + "entropy": 1.4512082767486572, + "epoch": 0.058763997257680126, + "grad_norm": 1.6953125, + "learning_rate": 3.522845953002611e-06, + "loss": 0.2065, + "mean_token_accuracy": 0.9566819512844086, + "num_tokens": 257634063.0, + "step": 1800 + }, + { + "entropy": 1.4433103609085083, + "epoch": 0.060396330514837906, + "grad_norm": 1.515625, + "learning_rate": 3.6207571801566577e-06, + "loss": 0.2022, + "mean_token_accuracy": 0.9584151470661163, + "num_tokens": 264346086.0, + "step": 1850 + }, + { + "entropy": 1.441446192264557, + "epoch": 0.06202866377199569, + "grad_norm": 1.7578125, + "learning_rate": 3.7186684073107047e-06, + "loss": 0.2172, + "mean_token_accuracy": 0.9561198997497559, + "num_tokens": 271468769.0, + "step": 1900 + }, + { + "entropy": 1.4351094341278077, + "epoch": 0.06366099702915347, + "grad_norm": 1.484375, + "learning_rate": 3.816579634464752e-06, + "loss": 0.2056, + "mean_token_accuracy": 0.9582554578781128, + "num_tokens": 278724065.0, + "step": 1950 + }, + { + "entropy": 1.4327974796295166, + "epoch": 0.06529333028631125, + "grad_norm": 1.4453125, + "learning_rate": 3.914490861618799e-06, + "loss": 0.2062, + "mean_token_accuracy": 0.956677463054657, + "num_tokens": 285941177.0, + "step": 2000 + }, + { + "epoch": 0.06529333028631125, + "eval_entropy": 1.4438136545817057, + "eval_loss": 0.2242203801870346, + "eval_mean_token_accuracy": 0.9541537570953369, + "eval_num_tokens": 285941177.0, + "eval_runtime": 744.3872, + "eval_samples_per_second": 12.972, + "eval_steps_per_second": 0.102, + "step": 2000 + }, + { + "entropy": 1.444144995212555, + "epoch": 0.06692566354346903, + "grad_norm": 2.4375, + "learning_rate": 4.012402088772846e-06, + "loss": 0.1935, + "mean_token_accuracy": 0.9598511147499085, + "num_tokens": 292840917.0, + "step": 2050 + }, + { + "entropy": 1.4242860412597655, + "epoch": 0.06855799680062681, + "grad_norm": 2.0625, + "learning_rate": 4.1103133159268925e-06, + "loss": 0.1947, + "mean_token_accuracy": 0.9596167039871216, + "num_tokens": 300067261.0, + "step": 2100 + }, + { + "entropy": 1.4138830995559692, + "epoch": 0.07019033005778459, + "grad_norm": 1.7578125, + "learning_rate": 4.2082245430809395e-06, + "loss": 0.1999, + "mean_token_accuracy": 0.9582617676258087, + "num_tokens": 307304735.0, + "step": 2150 + }, + { + "entropy": 1.4222793292999267, + "epoch": 0.07182266331494237, + "grad_norm": 1.5703125, + "learning_rate": 4.3061357702349865e-06, + "loss": 0.1896, + "mean_token_accuracy": 0.9599699079990387, + "num_tokens": 314065537.0, + "step": 2200 + }, + { + "entropy": 1.4278799700737, + "epoch": 0.07345499657210015, + "grad_norm": 1.4453125, + "learning_rate": 4.4040469973890336e-06, + "loss": 0.1845, + "mean_token_accuracy": 0.960889185667038, + "num_tokens": 320938314.0, + "step": 2250 + }, + { + "entropy": 1.40878901720047, + "epoch": 0.07508732982925793, + "grad_norm": 2.03125, + "learning_rate": 4.501958224543081e-06, + "loss": 0.1917, + "mean_token_accuracy": 0.9593756330013276, + "num_tokens": 327763547.0, + "step": 2300 + }, + { + "entropy": 1.4114052319526673, + "epoch": 0.07671966308641573, + "grad_norm": 2.359375, + "learning_rate": 4.599869451697128e-06, + "loss": 0.1938, + "mean_token_accuracy": 0.9592675876617431, + "num_tokens": 335032376.0, + "step": 2350 + }, + { + "entropy": 1.4144614338874817, + "epoch": 0.07835199634357351, + "grad_norm": 1.4765625, + "learning_rate": 4.697780678851175e-06, + "loss": 0.1962, + "mean_token_accuracy": 0.958512544631958, + "num_tokens": 342636323.0, + "step": 2400 + }, + { + "entropy": 1.4119537329673768, + "epoch": 0.07998432960073129, + "grad_norm": 2.1875, + "learning_rate": 4.795691906005222e-06, + "loss": 0.1882, + "mean_token_accuracy": 0.9601361775398254, + "num_tokens": 349613670.0, + "step": 2450 + }, + { + "entropy": 1.4156457328796386, + "epoch": 0.08161666285788907, + "grad_norm": 1.953125, + "learning_rate": 4.893603133159269e-06, + "loss": 0.19, + "mean_token_accuracy": 0.9593257677555084, + "num_tokens": 356562537.0, + "step": 2500 + }, + { + "epoch": 0.08161666285788907, + "eval_entropy": 1.407395658493042, + "eval_loss": 0.20515179634094238, + "eval_mean_token_accuracy": 0.9569398101170857, + "eval_num_tokens": 356562537.0, + "eval_runtime": 748.9688, + "eval_samples_per_second": 12.892, + "eval_steps_per_second": 0.101, + "step": 2500 + }, + { + "entropy": 1.4004506325721742, + "epoch": 0.08324899611504685, + "grad_norm": 1.7578125, + "learning_rate": 4.991514360313316e-06, + "loss": 0.1797, + "mean_token_accuracy": 0.9611039471626281, + "num_tokens": 363822144.0, + "step": 2550 + }, + { + "entropy": 1.4148499584197998, + "epoch": 0.08488132937220463, + "grad_norm": 1.8984375, + "learning_rate": 5.089425587467363e-06, + "loss": 0.1868, + "mean_token_accuracy": 0.9603874254226684, + "num_tokens": 370830813.0, + "step": 2600 + }, + { + "entropy": 1.4092473483085632, + "epoch": 0.08651366262936241, + "grad_norm": 1.1640625, + "learning_rate": 5.18733681462141e-06, + "loss": 0.1729, + "mean_token_accuracy": 0.9630460107326507, + "num_tokens": 378019973.0, + "step": 2650 + }, + { + "entropy": 1.424285671710968, + "epoch": 0.0881459958865202, + "grad_norm": 2.4375, + "learning_rate": 5.285248041775457e-06, + "loss": 0.1829, + "mean_token_accuracy": 0.9613824605941772, + "num_tokens": 384991554.0, + "step": 2700 + }, + { + "entropy": 1.410207018852234, + "epoch": 0.08977832914367798, + "grad_norm": 1.96875, + "learning_rate": 5.383159268929505e-06, + "loss": 0.1632, + "mean_token_accuracy": 0.9650751757621765, + "num_tokens": 392065700.0, + "step": 2750 + }, + { + "entropy": 1.4243709874153136, + "epoch": 0.09141066240083576, + "grad_norm": 1.515625, + "learning_rate": 5.481070496083552e-06, + "loss": 0.1728, + "mean_token_accuracy": 0.9626587212085724, + "num_tokens": 399270239.0, + "step": 2800 + }, + { + "entropy": 1.413210895061493, + "epoch": 0.09304299565799354, + "grad_norm": 1.8203125, + "learning_rate": 5.578981723237598e-06, + "loss": 0.1791, + "mean_token_accuracy": 0.9606540656089783, + "num_tokens": 406647892.0, + "step": 2850 + }, + { + "entropy": 1.4289642930030824, + "epoch": 0.09467532891515132, + "grad_norm": 2.0, + "learning_rate": 5.676892950391645e-06, + "loss": 0.1853, + "mean_token_accuracy": 0.9598627412319183, + "num_tokens": 414138654.0, + "step": 2900 + }, + { + "entropy": 1.435717191696167, + "epoch": 0.0963076621723091, + "grad_norm": 1.078125, + "learning_rate": 5.774804177545692e-06, + "loss": 0.1768, + "mean_token_accuracy": 0.9624214172363281, + "num_tokens": 421217231.0, + "step": 2950 + }, + { + "entropy": 1.428927412033081, + "epoch": 0.09793999542946688, + "grad_norm": 1.7734375, + "learning_rate": 5.872715404699739e-06, + "loss": 0.1829, + "mean_token_accuracy": 0.9605919003486634, + "num_tokens": 428485940.0, + "step": 3000 + }, + { + "epoch": 0.09793999542946688, + "eval_entropy": 1.4376725546518963, + "eval_loss": 0.19405308365821838, + "eval_mean_token_accuracy": 0.958539453347524, + "eval_num_tokens": 428485940.0, + "eval_runtime": 754.0644, + "eval_samples_per_second": 12.805, + "eval_steps_per_second": 0.101, + "step": 3000 + }, + { + "entropy": 1.438420045375824, + "epoch": 0.09957232868662466, + "grad_norm": 1.6484375, + "learning_rate": 5.970626631853786e-06, + "loss": 0.175, + "mean_token_accuracy": 0.9624645209312439, + "num_tokens": 435253940.0, + "step": 3050 + }, + { + "entropy": 1.43366064786911, + "epoch": 0.10120466194378244, + "grad_norm": 1.484375, + "learning_rate": 5.9999946455996105e-06, + "loss": 0.1691, + "mean_token_accuracy": 0.9642149293422699, + "num_tokens": 441868038.0, + "step": 3100 + }, + { + "entropy": 1.4457367968559265, + "epoch": 0.10283699520094022, + "grad_norm": 1.3046875, + "learning_rate": 5.999968420011062e-06, + "loss": 0.1845, + "mean_token_accuracy": 0.9603316211700439, + "num_tokens": 449366573.0, + "step": 3150 + }, + { + "entropy": 1.4273823165893555, + "epoch": 0.104469328458098, + "grad_norm": 1.6015625, + "learning_rate": 5.999920339963868e-06, + "loss": 0.1727, + "mean_token_accuracy": 0.9620600152015686, + "num_tokens": 456397259.0, + "step": 3200 + }, + { + "entropy": 1.4280832529067993, + "epoch": 0.10610166171525579, + "grad_norm": 1.4609375, + "learning_rate": 5.999850405808289e-06, + "loss": 0.1706, + "mean_token_accuracy": 0.9628064668178559, + "num_tokens": 463197302.0, + "step": 3250 + }, + { + "entropy": 1.4216149377822875, + "epoch": 0.10773399497241357, + "grad_norm": 1.734375, + "learning_rate": 5.999758618053787e-06, + "loss": 0.1685, + "mean_token_accuracy": 0.9630639374256134, + "num_tokens": 469919649.0, + "step": 3300 + }, + { + "entropy": 1.4262100625038148, + "epoch": 0.10936632822957135, + "grad_norm": 1.21875, + "learning_rate": 5.999644977369027e-06, + "loss": 0.1735, + "mean_token_accuracy": 0.9620367133617401, + "num_tokens": 476875581.0, + "step": 3350 + }, + { + "entropy": 1.4126947259902953, + "epoch": 0.11099866148672913, + "grad_norm": 1.5625, + "learning_rate": 5.9995094845818684e-06, + "loss": 0.1697, + "mean_token_accuracy": 0.9630028975009918, + "num_tokens": 483984060.0, + "step": 3400 + }, + { + "entropy": 1.436975917816162, + "epoch": 0.11263099474388691, + "grad_norm": 1.703125, + "learning_rate": 5.999352140679363e-06, + "loss": 0.1789, + "mean_token_accuracy": 0.9613595926761627, + "num_tokens": 491263590.0, + "step": 3450 + }, + { + "entropy": 1.4022981858253478, + "epoch": 0.11426332800104469, + "grad_norm": 1.0, + "learning_rate": 5.999172946807744e-06, + "loss": 0.1677, + "mean_token_accuracy": 0.9631788098812103, + "num_tokens": 498685855.0, + "step": 3500 + }, + { + "epoch": 0.11426332800104469, + "eval_entropy": 1.4040173705418904, + "eval_loss": 0.18694917857646942, + "eval_mean_token_accuracy": 0.9596376585960388, + "eval_num_tokens": 498685855.0, + "eval_runtime": 746.2297, + "eval_samples_per_second": 12.94, + "eval_steps_per_second": 0.102, + "step": 3500 + }, + { + "entropy": 1.4095824003219604, + "epoch": 0.11589566125820247, + "grad_norm": 3.796875, + "learning_rate": 5.998971904272421e-06, + "loss": 0.174, + "mean_token_accuracy": 0.9623571968078614, + "num_tokens": 506263057.0, + "step": 3550 + }, + { + "entropy": 1.406613359451294, + "epoch": 0.11752799451536025, + "grad_norm": 1.375, + "learning_rate": 5.998749014537968e-06, + "loss": 0.1674, + "mean_token_accuracy": 0.9625373089313507, + "num_tokens": 513004369.0, + "step": 3600 + }, + { + "entropy": 1.405109441280365, + "epoch": 0.11916032777251803, + "grad_norm": 1.4140625, + "learning_rate": 5.998504279228114e-06, + "loss": 0.1655, + "mean_token_accuracy": 0.9644717895984649, + "num_tokens": 519807945.0, + "step": 3650 + }, + { + "entropy": 1.4100734496116638, + "epoch": 0.12079266102967581, + "grad_norm": 1.4375, + "learning_rate": 5.99823770012573e-06, + "loss": 0.1678, + "mean_token_accuracy": 0.9630844235420227, + "num_tokens": 526976842.0, + "step": 3700 + }, + { + "entropy": 1.407007110118866, + "epoch": 0.1224249942868336, + "grad_norm": 1.25, + "learning_rate": 5.997949279172815e-06, + "loss": 0.1655, + "mean_token_accuracy": 0.9636348211765289, + "num_tokens": 533987687.0, + "step": 3750 + }, + { + "entropy": 1.4068945956230163, + "epoch": 0.12405732754399137, + "grad_norm": 1.3125, + "learning_rate": 5.9976390184704885e-06, + "loss": 0.1701, + "mean_token_accuracy": 0.962993438243866, + "num_tokens": 540885515.0, + "step": 3800 + }, + { + "entropy": 1.4040190553665162, + "epoch": 0.12568966080114916, + "grad_norm": 1.671875, + "learning_rate": 5.997306920278967e-06, + "loss": 0.1736, + "mean_token_accuracy": 0.9614431858062744, + "num_tokens": 548248278.0, + "step": 3850 + }, + { + "entropy": 1.3793179154396058, + "epoch": 0.12732199405830694, + "grad_norm": 1.5, + "learning_rate": 5.99695298701755e-06, + "loss": 0.1484, + "mean_token_accuracy": 0.9667005109786987, + "num_tokens": 555190537.0, + "step": 3900 + }, + { + "entropy": 1.3872941756248474, + "epoch": 0.12895432731546472, + "grad_norm": 1.515625, + "learning_rate": 5.996577221264605e-06, + "loss": 0.1648, + "mean_token_accuracy": 0.9634542167186737, + "num_tokens": 562395377.0, + "step": 3950 + }, + { + "entropy": 1.3957655119895935, + "epoch": 0.1305866605726225, + "grad_norm": 1.6875, + "learning_rate": 5.9961796257575485e-06, + "loss": 0.1684, + "mean_token_accuracy": 0.9631939339637756, + "num_tokens": 569190032.0, + "step": 4000 + }, + { + "epoch": 0.1305866605726225, + "eval_entropy": 1.4019947210947672, + "eval_loss": 0.18201017379760742, + "eval_mean_token_accuracy": 0.960600491364797, + "eval_num_tokens": 569190032.0, + "eval_runtime": 751.8458, + "eval_samples_per_second": 12.843, + "eval_steps_per_second": 0.101, + "step": 4000 + }, + { + "entropy": 1.3909747576713563, + "epoch": 0.13221899382978028, + "grad_norm": 6.75, + "learning_rate": 5.99576020339282e-06, + "loss": 0.1614, + "mean_token_accuracy": 0.9640302300453186, + "num_tokens": 576222815.0, + "step": 4050 + }, + { + "entropy": 1.3952715015411377, + "epoch": 0.13385132708693806, + "grad_norm": 1.8515625, + "learning_rate": 5.995318957225869e-06, + "loss": 0.1572, + "mean_token_accuracy": 0.9658111941814422, + "num_tokens": 582993007.0, + "step": 4100 + }, + { + "entropy": 1.4083420133590698, + "epoch": 0.13548366034409584, + "grad_norm": 1.7265625, + "learning_rate": 5.994855890471128e-06, + "loss": 0.1634, + "mean_token_accuracy": 0.964879697561264, + "num_tokens": 589829579.0, + "step": 4150 + }, + { + "entropy": 1.4027349591255187, + "epoch": 0.13711599360125362, + "grad_norm": 1.65625, + "learning_rate": 5.9943710065019905e-06, + "loss": 0.1586, + "mean_token_accuracy": 0.9652115881443024, + "num_tokens": 596795182.0, + "step": 4200 + }, + { + "entropy": 1.4036769008636474, + "epoch": 0.1387483268584114, + "grad_norm": 1.5625, + "learning_rate": 5.993864308850785e-06, + "loss": 0.1644, + "mean_token_accuracy": 0.9636000382900238, + "num_tokens": 603861008.0, + "step": 4250 + }, + { + "entropy": 1.4230398559570312, + "epoch": 0.14038066011556918, + "grad_norm": 1.7578125, + "learning_rate": 5.9933358012087526e-06, + "loss": 0.1651, + "mean_token_accuracy": 0.9630668413639069, + "num_tokens": 611150442.0, + "step": 4300 + }, + { + "entropy": 1.4209274005889894, + "epoch": 0.14201299337272696, + "grad_norm": 1.296875, + "learning_rate": 5.992785487426016e-06, + "loss": 0.1613, + "mean_token_accuracy": 0.9643425738811493, + "num_tokens": 617582752.0, + "step": 4350 + }, + { + "entropy": 1.409316577911377, + "epoch": 0.14364532662988475, + "grad_norm": 1.09375, + "learning_rate": 5.992213371511554e-06, + "loss": 0.1606, + "mean_token_accuracy": 0.9648306250572205, + "num_tokens": 624322193.0, + "step": 4400 + }, + { + "entropy": 1.406817865371704, + "epoch": 0.14527765988704253, + "grad_norm": 1.765625, + "learning_rate": 5.991619457633171e-06, + "loss": 0.1659, + "mean_token_accuracy": 0.9639068651199341, + "num_tokens": 631733777.0, + "step": 4450 + }, + { + "entropy": 1.4001825642585755, + "epoch": 0.1469099931442003, + "grad_norm": 1.4140625, + "learning_rate": 5.991003750117468e-06, + "loss": 0.1601, + "mean_token_accuracy": 0.9647636806964874, + "num_tokens": 639594509.0, + "step": 4500 + }, + { + "epoch": 0.1469099931442003, + "eval_entropy": 1.4037580092748005, + "eval_loss": 0.17858904600143433, + "eval_mean_token_accuracy": 0.961095765431722, + "eval_num_tokens": 639594509.0, + "eval_runtime": 747.1098, + "eval_samples_per_second": 12.924, + "eval_steps_per_second": 0.102, + "step": 4500 + }, + { + "entropy": 1.3959466004371643, + "epoch": 0.1485423264013581, + "grad_norm": 1.515625, + "learning_rate": 5.990366253449812e-06, + "loss": 0.1605, + "mean_token_accuracy": 0.9646508944034576, + "num_tokens": 647057874.0, + "step": 4550 + }, + { + "entropy": 1.3969790387153624, + "epoch": 0.15017465965851587, + "grad_norm": 1.015625, + "learning_rate": 5.989706972274299e-06, + "loss": 0.1617, + "mean_token_accuracy": 0.9644351935386658, + "num_tokens": 654287916.0, + "step": 4600 + }, + { + "entropy": 1.3977243065834046, + "epoch": 0.15180699291567368, + "grad_norm": 1.8203125, + "learning_rate": 5.989025911393723e-06, + "loss": 0.1733, + "mean_token_accuracy": 0.9629066979885101, + "num_tokens": 661344302.0, + "step": 4650 + }, + { + "entropy": 1.4031280422210692, + "epoch": 0.15343932617283146, + "grad_norm": 1.78125, + "learning_rate": 5.988323075769544e-06, + "loss": 0.1478, + "mean_token_accuracy": 0.967905158996582, + "num_tokens": 667929168.0, + "step": 4700 + }, + { + "entropy": 1.4076430988311768, + "epoch": 0.15507165942998924, + "grad_norm": 0.9375, + "learning_rate": 5.987598470521845e-06, + "loss": 0.1585, + "mean_token_accuracy": 0.965575454235077, + "num_tokens": 675162945.0, + "step": 4750 + }, + { + "entropy": 1.4047666358947755, + "epoch": 0.15670399268714702, + "grad_norm": 1.1171875, + "learning_rate": 5.986852100929301e-06, + "loss": 0.1633, + "mean_token_accuracy": 0.9638245010375976, + "num_tokens": 682171614.0, + "step": 4800 + }, + { + "entropy": 1.4109639859199523, + "epoch": 0.1583363259443048, + "grad_norm": 1.5859375, + "learning_rate": 5.986083972429135e-06, + "loss": 0.1641, + "mean_token_accuracy": 0.964219799041748, + "num_tokens": 689397569.0, + "step": 4850 + }, + { + "entropy": 1.4028720164299011, + "epoch": 0.15996865920146258, + "grad_norm": 1.4375, + "learning_rate": 5.985294090617086e-06, + "loss": 0.1608, + "mean_token_accuracy": 0.9645184981822967, + "num_tokens": 696209610.0, + "step": 4900 + }, + { + "entropy": 1.3975288462638855, + "epoch": 0.16160099245862036, + "grad_norm": 1.578125, + "learning_rate": 5.98448246124736e-06, + "loss": 0.1551, + "mean_token_accuracy": 0.9647789108753204, + "num_tokens": 703107550.0, + "step": 4950 + }, + { + "entropy": 1.3928989100456237, + "epoch": 0.16323332571577814, + "grad_norm": 1.6015625, + "learning_rate": 5.983649090232592e-06, + "loss": 0.1592, + "mean_token_accuracy": 0.9648973512649536, + "num_tokens": 710156450.0, + "step": 5000 + }, + { + "epoch": 0.16323332571577814, + "eval_entropy": 1.4044678370157877, + "eval_loss": 0.1755974143743515, + "eval_mean_token_accuracy": 0.961607707341512, + "eval_num_tokens": 710156450.0, + "eval_runtime": 749.1287, + "eval_samples_per_second": 12.89, + "eval_steps_per_second": 0.101, + "step": 5000 + }, + { + "entropy": 1.4239799737930299, + "epoch": 0.16486565897293592, + "grad_norm": 1.234375, + "learning_rate": 5.982793983643805e-06, + "loss": 0.1637, + "mean_token_accuracy": 0.9640812170505524, + "num_tokens": 717234054.0, + "step": 5050 + }, + { + "entropy": 1.4224350619316102, + "epoch": 0.1664979922300937, + "grad_norm": 2.109375, + "learning_rate": 5.98191714771036e-06, + "loss": 0.1624, + "mean_token_accuracy": 0.9640639245510101, + "num_tokens": 724297107.0, + "step": 5100 + }, + { + "entropy": 1.4389384937286378, + "epoch": 0.1681303254872515, + "grad_norm": 1.1953125, + "learning_rate": 5.981018588819916e-06, + "loss": 0.1681, + "mean_token_accuracy": 0.9633942902088165, + "num_tokens": 731287758.0, + "step": 5150 + }, + { + "entropy": 1.427138111591339, + "epoch": 0.16976265874440927, + "grad_norm": 1.6015625, + "learning_rate": 5.980098313518383e-06, + "loss": 0.1669, + "mean_token_accuracy": 0.9635147547721863, + "num_tokens": 738353484.0, + "step": 5200 + }, + { + "entropy": 1.43255943775177, + "epoch": 0.17139499200156705, + "grad_norm": 1.6015625, + "learning_rate": 5.97915632850987e-06, + "loss": 0.1602, + "mean_token_accuracy": 0.9645324110984802, + "num_tokens": 745152314.0, + "step": 5250 + }, + { + "entropy": 1.4146737170219421, + "epoch": 0.17302732525872483, + "grad_norm": 1.375, + "learning_rate": 5.97819264065664e-06, + "loss": 0.1566, + "mean_token_accuracy": 0.9652618932723999, + "num_tokens": 752080428.0, + "step": 5300 + }, + { + "entropy": 1.41352454662323, + "epoch": 0.1746596585158826, + "grad_norm": 1.3515625, + "learning_rate": 5.977207256979058e-06, + "loss": 0.1472, + "mean_token_accuracy": 0.9666665005683899, + "num_tokens": 758654299.0, + "step": 5350 + }, + { + "entropy": 1.4163859128952025, + "epoch": 0.1762919917730404, + "grad_norm": 1.2578125, + "learning_rate": 5.976200184655544e-06, + "loss": 0.1646, + "mean_token_accuracy": 0.9645817792415619, + "num_tokens": 765762864.0, + "step": 5400 + }, + { + "entropy": 1.4168593525886535, + "epoch": 0.17792432503019817, + "grad_norm": 1.625, + "learning_rate": 5.9751714310225135e-06, + "loss": 0.1558, + "mean_token_accuracy": 0.9653662145137787, + "num_tokens": 772523965.0, + "step": 5450 + }, + { + "entropy": 1.4106249260902404, + "epoch": 0.17955665828735595, + "grad_norm": 1.46875, + "learning_rate": 5.974121003574331e-06, + "loss": 0.1605, + "mean_token_accuracy": 0.9646015942096711, + "num_tokens": 779873569.0, + "step": 5500 + }, + { + "epoch": 0.17955665828735595, + "eval_entropy": 1.4159186140696207, + "eval_loss": 0.17334794998168945, + "eval_mean_token_accuracy": 0.961771670182546, + "eval_num_tokens": 779873569.0, + "eval_runtime": 752.9376, + "eval_samples_per_second": 12.824, + "eval_steps_per_second": 0.101, + "step": 5500 + }, + { + "entropy": 1.4060875940322877, + "epoch": 0.18118899154451373, + "grad_norm": 1.8828125, + "learning_rate": 5.973048909963251e-06, + "loss": 0.166, + "mean_token_accuracy": 0.9633875727653504, + "num_tokens": 787666740.0, + "step": 5550 + }, + { + "entropy": 1.4012414264678954, + "epoch": 0.18282132480167151, + "grad_norm": 1.515625, + "learning_rate": 5.971955157999365e-06, + "loss": 0.1542, + "mean_token_accuracy": 0.9654585599899292, + "num_tokens": 794513657.0, + "step": 5600 + }, + { + "entropy": 1.3988840198516845, + "epoch": 0.1844536580588293, + "grad_norm": 1.6953125, + "learning_rate": 5.970839755650541e-06, + "loss": 0.1595, + "mean_token_accuracy": 0.9642168319225312, + "num_tokens": 801328321.0, + "step": 5650 + }, + { + "entropy": 1.4012188339233398, + "epoch": 0.18608599131598708, + "grad_norm": 1.109375, + "learning_rate": 5.969702711042371e-06, + "loss": 0.167, + "mean_token_accuracy": 0.9627443432807923, + "num_tokens": 808895137.0, + "step": 5700 + }, + { + "entropy": 1.4030301642417908, + "epoch": 0.18771832457314486, + "grad_norm": 1.40625, + "learning_rate": 5.968544032458105e-06, + "loss": 0.1518, + "mean_token_accuracy": 0.9666016948223114, + "num_tokens": 815564055.0, + "step": 5750 + }, + { + "entropy": 1.3949448704719543, + "epoch": 0.18935065783030264, + "grad_norm": 1.25, + "learning_rate": 5.967363728338598e-06, + "loss": 0.1542, + "mean_token_accuracy": 0.9651316654682159, + "num_tokens": 822772629.0, + "step": 5800 + }, + { + "entropy": 1.4053015112876892, + "epoch": 0.19098299108746042, + "grad_norm": 1.8125, + "learning_rate": 5.966161807282244e-06, + "loss": 0.1475, + "mean_token_accuracy": 0.9664002013206482, + "num_tokens": 829474695.0, + "step": 5850 + }, + { + "entropy": 1.3961756944656372, + "epoch": 0.1926153243446182, + "grad_norm": 1.2734375, + "learning_rate": 5.96493827804491e-06, + "loss": 0.1636, + "mean_token_accuracy": 0.9642155694961548, + "num_tokens": 836807189.0, + "step": 5900 + }, + { + "entropy": 1.39143967628479, + "epoch": 0.19424765760177598, + "grad_norm": 1.671875, + "learning_rate": 5.963693149539883e-06, + "loss": 0.1592, + "mean_token_accuracy": 0.9642041945457458, + "num_tokens": 843989373.0, + "step": 5950 + }, + { + "entropy": 1.395959141254425, + "epoch": 0.19587999085893376, + "grad_norm": 1.1484375, + "learning_rate": 5.962426430837792e-06, + "loss": 0.1613, + "mean_token_accuracy": 0.964136803150177, + "num_tokens": 851299727.0, + "step": 6000 + }, + { + "epoch": 0.19587999085893376, + "eval_entropy": 1.4013945213953654, + "eval_loss": 0.17208388447761536, + "eval_mean_token_accuracy": 0.9621260579427083, + "eval_num_tokens": 851299727.0, + "eval_runtime": 753.3633, + "eval_samples_per_second": 12.817, + "eval_steps_per_second": 0.101, + "step": 6000 + }, + { + "entropy": 1.4012908124923706, + "epoch": 0.19751232411609154, + "grad_norm": 0.0113525390625, + "learning_rate": 5.961138131166554e-06, + "loss": 0.1554, + "mean_token_accuracy": 0.9652227807044983, + "num_tokens": 858064092.0, + "step": 6050 + }, + { + "entropy": 1.388651340007782, + "epoch": 0.19914465737324932, + "grad_norm": 1.9375, + "learning_rate": 5.959828259911295e-06, + "loss": 0.1569, + "mean_token_accuracy": 0.9651960909366608, + "num_tokens": 865252118.0, + "step": 6100 + }, + { + "entropy": 1.4042778515815735, + "epoch": 0.2007769906304071, + "grad_norm": 1.375, + "learning_rate": 5.958496826614294e-06, + "loss": 0.1661, + "mean_token_accuracy": 0.9626063787937165, + "num_tokens": 872468561.0, + "step": 6150 + }, + { + "entropy": 1.401300666332245, + "epoch": 0.20240932388756488, + "grad_norm": 1.8515625, + "learning_rate": 5.957143840974904e-06, + "loss": 0.149, + "mean_token_accuracy": 0.9666438174247741, + "num_tokens": 879011998.0, + "step": 6200 + }, + { + "entropy": 1.4077980709075928, + "epoch": 0.20404165714472267, + "grad_norm": 1.6328125, + "learning_rate": 5.955769312849484e-06, + "loss": 0.1605, + "mean_token_accuracy": 0.9650888216495513, + "num_tokens": 886346540.0, + "step": 6250 + }, + { + "entropy": 1.394054229259491, + "epoch": 0.20567399040188045, + "grad_norm": 2.140625, + "learning_rate": 5.954373252251329e-06, + "loss": 0.1537, + "mean_token_accuracy": 0.9657756268978119, + "num_tokens": 893393322.0, + "step": 6300 + }, + { + "entropy": 1.3978914856910705, + "epoch": 0.20730632365903823, + "grad_norm": 1.5, + "learning_rate": 5.952955669350596e-06, + "loss": 0.1515, + "mean_token_accuracy": 0.9658310306072235, + "num_tokens": 899920970.0, + "step": 6350 + }, + { + "entropy": 1.4074292516708373, + "epoch": 0.208938656916196, + "grad_norm": 1.875, + "learning_rate": 5.95151657447423e-06, + "loss": 0.1568, + "mean_token_accuracy": 0.9654946303367615, + "num_tokens": 906895264.0, + "step": 6400 + }, + { + "entropy": 1.4000064754486083, + "epoch": 0.2105709901733538, + "grad_norm": 1.390625, + "learning_rate": 5.950055978105885e-06, + "loss": 0.1495, + "mean_token_accuracy": 0.9668013238906861, + "num_tokens": 913671071.0, + "step": 6450 + }, + { + "entropy": 1.4307255530357361, + "epoch": 0.21220332343051157, + "grad_norm": 1.015625, + "learning_rate": 5.948573890885859e-06, + "loss": 0.1663, + "mean_token_accuracy": 0.9624395740032196, + "num_tokens": 921099610.0, + "step": 6500 + }, + { + "epoch": 0.21220332343051157, + "eval_entropy": 1.4208130852381389, + "eval_loss": 0.16986840963363647, + "eval_mean_token_accuracy": 0.962581082979838, + "eval_num_tokens": 921099610.0, + "eval_runtime": 746.9082, + "eval_samples_per_second": 12.928, + "eval_steps_per_second": 0.102, + "step": 6500 + }, + { + "entropy": 1.409722430706024, + "epoch": 0.21383565668766935, + "grad_norm": 1.2109375, + "learning_rate": 5.947070323610999e-06, + "loss": 0.1449, + "mean_token_accuracy": 0.9680650508403779, + "num_tokens": 928163378.0, + "step": 6550 + }, + { + "entropy": 1.4334656882286072, + "epoch": 0.21546798994482713, + "grad_norm": 1.3515625, + "learning_rate": 5.945545287234639e-06, + "loss": 0.1563, + "mean_token_accuracy": 0.964540822505951, + "num_tokens": 934586868.0, + "step": 6600 + }, + { + "entropy": 1.4435390138626099, + "epoch": 0.2171003232019849, + "grad_norm": 1.4140625, + "learning_rate": 5.943998792866509e-06, + "loss": 0.15, + "mean_token_accuracy": 0.9669582867622375, + "num_tokens": 941380079.0, + "step": 6650 + }, + { + "entropy": 1.4167057871818542, + "epoch": 0.2187326564591427, + "grad_norm": 1.7578125, + "learning_rate": 5.942430851772662e-06, + "loss": 0.1627, + "mean_token_accuracy": 0.9633757710456848, + "num_tokens": 949348630.0, + "step": 6700 + }, + { + "entropy": 1.4309338593482972, + "epoch": 0.22036498971630047, + "grad_norm": 0.89453125, + "learning_rate": 5.9408414753753836e-06, + "loss": 0.1546, + "mean_token_accuracy": 0.9655015981197357, + "num_tokens": 956312502.0, + "step": 6750 + }, + { + "entropy": 1.4343366432189941, + "epoch": 0.22199732297345826, + "grad_norm": 1.5625, + "learning_rate": 5.939230675253119e-06, + "loss": 0.1489, + "mean_token_accuracy": 0.96628955245018, + "num_tokens": 963500996.0, + "step": 6800 + }, + { + "entropy": 1.4271987533569337, + "epoch": 0.22362965623061604, + "grad_norm": 1.6796875, + "learning_rate": 5.9375984631403785e-06, + "loss": 0.1616, + "mean_token_accuracy": 0.9645246016979218, + "num_tokens": 970915430.0, + "step": 6850 + }, + { + "entropy": 1.4355636262893676, + "epoch": 0.22526198948777382, + "grad_norm": 1.8984375, + "learning_rate": 5.935944850927657e-06, + "loss": 0.1533, + "mean_token_accuracy": 0.9660027372837067, + "num_tokens": 978036240.0, + "step": 6900 + }, + { + "entropy": 1.425994610786438, + "epoch": 0.2268943227449316, + "grad_norm": 1.203125, + "learning_rate": 5.934269850661349e-06, + "loss": 0.152, + "mean_token_accuracy": 0.9652906250953674, + "num_tokens": 985047502.0, + "step": 6950 + }, + { + "entropy": 1.4223777842521668, + "epoch": 0.22852665600208938, + "grad_norm": 1.171875, + "learning_rate": 5.932573474543658e-06, + "loss": 0.156, + "mean_token_accuracy": 0.9658907651901245, + "num_tokens": 992373619.0, + "step": 7000 + }, + { + "epoch": 0.22852665600208938, + "eval_entropy": 1.4304504505793254, + "eval_loss": 0.16900277137756348, + "eval_mean_token_accuracy": 0.9627823217709859, + "eval_num_tokens": 992373619.0, + "eval_runtime": 749.871, + "eval_samples_per_second": 12.877, + "eval_steps_per_second": 0.101, + "step": 7000 + }, + { + "entropy": 1.4272488355636597, + "epoch": 0.23015898925924716, + "grad_norm": 1.9765625, + "learning_rate": 5.930855734932506e-06, + "loss": 0.1454, + "mean_token_accuracy": 0.9678001618385315, + "num_tokens": 999289508.0, + "step": 7050 + }, + { + "entropy": 1.4248918747901917, + "epoch": 0.23179132251640494, + "grad_norm": 1.5625, + "learning_rate": 5.92911664434145e-06, + "loss": 0.1558, + "mean_token_accuracy": 0.9652420032024384, + "num_tokens": 1006528812.0, + "step": 7100 + }, + { + "entropy": 1.4263224506378174, + "epoch": 0.23342365577356272, + "grad_norm": 1.3359375, + "learning_rate": 5.927356215439584e-06, + "loss": 0.1494, + "mean_token_accuracy": 0.9668344402313233, + "num_tokens": 1013541923.0, + "step": 7150 + }, + { + "entropy": 1.4219363307952881, + "epoch": 0.2350559890307205, + "grad_norm": 1.640625, + "learning_rate": 5.92557446105145e-06, + "loss": 0.1486, + "mean_token_accuracy": 0.9674091839790344, + "num_tokens": 1020511787.0, + "step": 7200 + }, + { + "entropy": 1.4293452215194702, + "epoch": 0.23668832228787828, + "grad_norm": 1.2578125, + "learning_rate": 5.923771394156943e-06, + "loss": 0.158, + "mean_token_accuracy": 0.9649367642402649, + "num_tokens": 1027894747.0, + "step": 7250 + }, + { + "entropy": 1.4306922936439515, + "epoch": 0.23832065554503606, + "grad_norm": 1.3515625, + "learning_rate": 5.921947027891219e-06, + "loss": 0.1528, + "mean_token_accuracy": 0.9656593954563141, + "num_tokens": 1035110900.0, + "step": 7300 + }, + { + "entropy": 1.4252450680732727, + "epoch": 0.23995298880219385, + "grad_norm": 0.9921875, + "learning_rate": 5.9201013755445955e-06, + "loss": 0.1535, + "mean_token_accuracy": 0.966197533607483, + "num_tokens": 1042230443.0, + "step": 7350 + }, + { + "entropy": 1.4307914185523987, + "epoch": 0.24158532205935163, + "grad_norm": 1.21875, + "learning_rate": 5.91823445056246e-06, + "loss": 0.147, + "mean_token_accuracy": 0.9667865073680878, + "num_tokens": 1049389935.0, + "step": 7400 + }, + { + "entropy": 1.4161819124221802, + "epoch": 0.2432176553165094, + "grad_norm": 1.703125, + "learning_rate": 5.916346266545167e-06, + "loss": 0.1468, + "mean_token_accuracy": 0.96632697224617, + "num_tokens": 1056551888.0, + "step": 7450 + }, + { + "entropy": 1.4254303669929504, + "epoch": 0.2448499885736672, + "grad_norm": 1.4921875, + "learning_rate": 5.914436837247941e-06, + "loss": 0.1525, + "mean_token_accuracy": 0.965957795381546, + "num_tokens": 1063197615.0, + "step": 7500 + }, + { + "epoch": 0.2448499885736672, + "eval_entropy": 1.4276245164871215, + "eval_loss": 0.16787172853946686, + "eval_mean_token_accuracy": 0.9633070985476176, + "eval_num_tokens": 1063197615.0, + "eval_runtime": 751.5202, + "eval_samples_per_second": 12.849, + "eval_steps_per_second": 0.101, + "step": 7500 + }, + { + "entropy": 1.4108345437049865, + "epoch": 0.24648232183082497, + "grad_norm": 1.3828125, + "learning_rate": 5.912506176580776e-06, + "loss": 0.147, + "mean_token_accuracy": 0.9670144832134246, + "num_tokens": 1069874223.0, + "step": 7550 + }, + { + "entropy": 1.4212487936019897, + "epoch": 0.24811465508798275, + "grad_norm": 1.828125, + "learning_rate": 5.910554298608335e-06, + "loss": 0.1509, + "mean_token_accuracy": 0.96580601811409, + "num_tokens": 1076764997.0, + "step": 7600 + }, + { + "entropy": 1.4187248206138612, + "epoch": 0.24974698834514053, + "grad_norm": 1.8515625, + "learning_rate": 5.908581217549845e-06, + "loss": 0.1528, + "mean_token_accuracy": 0.9664638650417328, + "num_tokens": 1083894428.0, + "step": 7650 + }, + { + "entropy": 1.400410017967224, + "epoch": 0.2513793216022983, + "grad_norm": 0.94140625, + "learning_rate": 5.906586947778998e-06, + "loss": 0.1448, + "mean_token_accuracy": 0.9671175360679627, + "num_tokens": 1090645918.0, + "step": 7700 + }, + { + "entropy": 1.4053305006027221, + "epoch": 0.2530116548594561, + "grad_norm": 1.21875, + "learning_rate": 5.9045715038238436e-06, + "loss": 0.1509, + "mean_token_accuracy": 0.9659091722965241, + "num_tokens": 1097654372.0, + "step": 7750 + }, + { + "entropy": 1.3977803254127503, + "epoch": 0.2546439881166139, + "grad_norm": 1.25, + "learning_rate": 5.902534900366681e-06, + "loss": 0.1547, + "mean_token_accuracy": 0.9661977970600129, + "num_tokens": 1104996723.0, + "step": 7800 + }, + { + "entropy": 1.3928361082077025, + "epoch": 0.25627632137377165, + "grad_norm": 1.78125, + "learning_rate": 5.900477152243954e-06, + "loss": 0.1467, + "mean_token_accuracy": 0.9668631637096405, + "num_tokens": 1111808272.0, + "step": 7850 + }, + { + "entropy": 1.3904444289207458, + "epoch": 0.25790865463092943, + "grad_norm": 1.484375, + "learning_rate": 5.8983982744461446e-06, + "loss": 0.1523, + "mean_token_accuracy": 0.9658901369571686, + "num_tokens": 1118777348.0, + "step": 7900 + }, + { + "entropy": 1.4107188177108765, + "epoch": 0.2595409878880872, + "grad_norm": 1.25, + "learning_rate": 5.896298282117662e-06, + "loss": 0.1508, + "mean_token_accuracy": 0.9659655904769897, + "num_tokens": 1125395200.0, + "step": 7950 + }, + { + "entropy": 1.409360373020172, + "epoch": 0.261173321145245, + "grad_norm": 1.21875, + "learning_rate": 5.894177190556733e-06, + "loss": 0.1523, + "mean_token_accuracy": 0.9658745980262756, + "num_tokens": 1132194770.0, + "step": 8000 + }, + { + "epoch": 0.261173321145245, + "eval_entropy": 1.3953218412399293, + "eval_loss": 0.16749244928359985, + "eval_mean_token_accuracy": 0.9630522100130717, + "eval_num_tokens": 1132194770.0, + "eval_runtime": 749.5556, + "eval_samples_per_second": 12.882, + "eval_steps_per_second": 0.101, + "step": 8000 + }, + { + "entropy": 1.395931794643402, + "epoch": 0.2628056544024028, + "grad_norm": 2.015625, + "learning_rate": 5.892035015215289e-06, + "loss": 0.1475, + "mean_token_accuracy": 0.967307710647583, + "num_tokens": 1139223324.0, + "step": 8050 + }, + { + "entropy": 1.4096789264678955, + "epoch": 0.26443798765956056, + "grad_norm": 0.99609375, + "learning_rate": 5.889871771698854e-06, + "loss": 0.1512, + "mean_token_accuracy": 0.9665188312530517, + "num_tokens": 1146048693.0, + "step": 8100 + }, + { + "entropy": 1.3959095120429992, + "epoch": 0.26607032091671834, + "grad_norm": 0.98828125, + "learning_rate": 5.887687475766435e-06, + "loss": 0.1517, + "mean_token_accuracy": 0.9670138394832611, + "num_tokens": 1153416156.0, + "step": 8150 + }, + { + "entropy": 1.384766845703125, + "epoch": 0.2677026541738761, + "grad_norm": 1.484375, + "learning_rate": 5.8854821433303995e-06, + "loss": 0.1478, + "mean_token_accuracy": 0.9662396657466888, + "num_tokens": 1160327310.0, + "step": 8200 + }, + { + "entropy": 1.3720842933654784, + "epoch": 0.2693349874310339, + "grad_norm": 1.5546875, + "learning_rate": 5.883255790456365e-06, + "loss": 0.1369, + "mean_token_accuracy": 0.9690599977970124, + "num_tokens": 1166793829.0, + "step": 8250 + }, + { + "entropy": 1.3875324892997742, + "epoch": 0.2709673206881917, + "grad_norm": 1.0859375, + "learning_rate": 5.881008433363083e-06, + "loss": 0.1484, + "mean_token_accuracy": 0.9658425927162171, + "num_tokens": 1173770645.0, + "step": 8300 + }, + { + "entropy": 1.382853055000305, + "epoch": 0.27259965394534946, + "grad_norm": 1.6484375, + "learning_rate": 5.878740088422315e-06, + "loss": 0.1633, + "mean_token_accuracy": 0.9631330251693726, + "num_tokens": 1181126599.0, + "step": 8350 + }, + { + "entropy": 1.3965485191345215, + "epoch": 0.27423198720250724, + "grad_norm": 1.0390625, + "learning_rate": 5.87645077215872e-06, + "loss": 0.1498, + "mean_token_accuracy": 0.9665160596370697, + "num_tokens": 1188172115.0, + "step": 8400 + }, + { + "entropy": 1.3885521602630615, + "epoch": 0.275864320459665, + "grad_norm": 1.1953125, + "learning_rate": 5.874140501249728e-06, + "loss": 0.1468, + "mean_token_accuracy": 0.9669651210308075, + "num_tokens": 1195102960.0, + "step": 8450 + }, + { + "entropy": 1.3970652842521667, + "epoch": 0.2774966537168228, + "grad_norm": 2.328125, + "learning_rate": 5.8718092925254235e-06, + "loss": 0.1469, + "mean_token_accuracy": 0.9666703069210052, + "num_tokens": 1201982519.0, + "step": 8500 + }, + { + "epoch": 0.2774966537168228, + "eval_entropy": 1.390671566327413, + "eval_loss": 0.1665239781141281, + "eval_mean_token_accuracy": 0.9631963141759237, + "eval_num_tokens": 1201982519.0, + "eval_runtime": 751.0843, + "eval_samples_per_second": 12.856, + "eval_steps_per_second": 0.101, + "step": 8500 + }, + { + "entropy": 1.3743389058113098, + "epoch": 0.2791289869739806, + "grad_norm": 1.46875, + "learning_rate": 5.86945716296842e-06, + "loss": 0.1413, + "mean_token_accuracy": 0.9681530177593232, + "num_tokens": 1208731712.0, + "step": 8550 + }, + { + "entropy": 1.3886315321922302, + "epoch": 0.28076132023113837, + "grad_norm": 1.28125, + "learning_rate": 5.867084129713738e-06, + "loss": 0.1553, + "mean_token_accuracy": 0.9659830582141876, + "num_tokens": 1215816513.0, + "step": 8600 + }, + { + "entropy": 1.3884997010231017, + "epoch": 0.28239365348829615, + "grad_norm": 1.3515625, + "learning_rate": 5.864690210048677e-06, + "loss": 0.1499, + "mean_token_accuracy": 0.9667926502227783, + "num_tokens": 1222796740.0, + "step": 8650 + }, + { + "entropy": 1.3810113263130188, + "epoch": 0.28402598674545393, + "grad_norm": 1.921875, + "learning_rate": 5.862275421412695e-06, + "loss": 0.1428, + "mean_token_accuracy": 0.968780642747879, + "num_tokens": 1229478573.0, + "step": 8700 + }, + { + "entropy": 1.3753751254081725, + "epoch": 0.2856583200026117, + "grad_norm": 1.6953125, + "learning_rate": 5.859839781397276e-06, + "loss": 0.1552, + "mean_token_accuracy": 0.9648597013950347, + "num_tokens": 1236888916.0, + "step": 8750 + }, + { + "entropy": 1.3797388172149658, + "epoch": 0.2872906532597695, + "grad_norm": 1.03125, + "learning_rate": 5.857383307745805e-06, + "loss": 0.1555, + "mean_token_accuracy": 0.9654771292209625, + "num_tokens": 1243945893.0, + "step": 8800 + }, + { + "entropy": 1.37408056974411, + "epoch": 0.28892298651692727, + "grad_norm": 1.8828125, + "learning_rate": 5.854906018353436e-06, + "loss": 0.1531, + "mean_token_accuracy": 0.9655951869487762, + "num_tokens": 1250815278.0, + "step": 8850 + }, + { + "entropy": 1.3714010500907898, + "epoch": 0.29055531977408505, + "grad_norm": 1.0859375, + "learning_rate": 5.852407931266967e-06, + "loss": 0.1416, + "mean_token_accuracy": 0.967999415397644, + "num_tokens": 1257589618.0, + "step": 8900 + }, + { + "entropy": 1.370381121635437, + "epoch": 0.29218765303124283, + "grad_norm": 1.1953125, + "learning_rate": 5.849889064684703e-06, + "loss": 0.156, + "mean_token_accuracy": 0.965356330871582, + "num_tokens": 1264949457.0, + "step": 8950 + }, + { + "entropy": 1.3676560163497924, + "epoch": 0.2938199862884006, + "grad_norm": 1.5546875, + "learning_rate": 5.847349436956325e-06, + "loss": 0.1609, + "mean_token_accuracy": 0.9641006827354431, + "num_tokens": 1272234102.0, + "step": 9000 + }, + { + "epoch": 0.2938199862884006, + "eval_entropy": 1.38709463596344, + "eval_loss": 0.1650434136390686, + "eval_mean_token_accuracy": 0.9634115918477376, + "eval_num_tokens": 1272234102.0, + "eval_runtime": 751.7405, + "eval_samples_per_second": 12.845, + "eval_steps_per_second": 0.101, + "step": 9000 + }, + { + "entropy": 1.3809342765808106, + "epoch": 0.2954523195455584, + "grad_norm": 2.296875, + "learning_rate": 5.844789066582758e-06, + "loss": 0.1432, + "mean_token_accuracy": 0.9673550212383271, + "num_tokens": 1279098038.0, + "step": 9050 + }, + { + "entropy": 1.40355872631073, + "epoch": 0.2970846528027162, + "grad_norm": 1.4921875, + "learning_rate": 5.842207972216034e-06, + "loss": 0.1521, + "mean_token_accuracy": 0.9661613535881043, + "num_tokens": 1286173347.0, + "step": 9100 + }, + { + "entropy": 1.3974176049232483, + "epoch": 0.29871698605987396, + "grad_norm": 1.3125, + "learning_rate": 5.839606172659159e-06, + "loss": 0.1521, + "mean_token_accuracy": 0.9656483995914459, + "num_tokens": 1293330750.0, + "step": 9150 + }, + { + "entropy": 1.3879291534423828, + "epoch": 0.30034931931703174, + "grad_norm": 1.3671875, + "learning_rate": 5.8369836868659706e-06, + "loss": 0.1553, + "mean_token_accuracy": 0.9647518181800843, + "num_tokens": 1300327068.0, + "step": 9200 + }, + { + "entropy": 1.3752172994613647, + "epoch": 0.3019816525741896, + "grad_norm": 1.125, + "learning_rate": 5.8343405339410085e-06, + "loss": 0.1383, + "mean_token_accuracy": 0.9691135132312775, + "num_tokens": 1307021605.0, + "step": 9250 + }, + { + "entropy": 1.3903837966918946, + "epoch": 0.30361398583134735, + "grad_norm": 1.4609375, + "learning_rate": 5.831676733139364e-06, + "loss": 0.1458, + "mean_token_accuracy": 0.9677986741065979, + "num_tokens": 1314238674.0, + "step": 9300 + }, + { + "entropy": 1.3943523359298706, + "epoch": 0.30524631908850514, + "grad_norm": 1.0546875, + "learning_rate": 5.828992303866552e-06, + "loss": 0.158, + "mean_token_accuracy": 0.9646199941635132, + "num_tokens": 1321715116.0, + "step": 9350 + }, + { + "entropy": 1.4028909087181092, + "epoch": 0.3068786523456629, + "grad_norm": 1.515625, + "learning_rate": 5.82628726567836e-06, + "loss": 0.1615, + "mean_token_accuracy": 0.9638377511501313, + "num_tokens": 1328922385.0, + "step": 9400 + }, + { + "entropy": 1.405231008529663, + "epoch": 0.3085109856028207, + "grad_norm": 1.703125, + "learning_rate": 5.823561638280711e-06, + "loss": 0.1621, + "mean_token_accuracy": 0.9635949361324311, + "num_tokens": 1336385571.0, + "step": 9450 + }, + { + "entropy": 1.370991678237915, + "epoch": 0.3101433188599785, + "grad_norm": 1.6875, + "learning_rate": 5.82081544152952e-06, + "loss": 0.1515, + "mean_token_accuracy": 0.9661216616630555, + "num_tokens": 1343638497.0, + "step": 9500 + }, + { + "epoch": 0.3101433188599785, + "eval_entropy": 1.3891894817352295, + "eval_loss": 0.16506299376487732, + "eval_mean_token_accuracy": 0.9635340309143067, + "eval_num_tokens": 1343638497.0, + "eval_runtime": 749.2359, + "eval_samples_per_second": 12.888, + "eval_steps_per_second": 0.101, + "step": 9500 + }, + { + "entropy": 1.3911400294303895, + "epoch": 0.31177565211713626, + "grad_norm": 0.8984375, + "learning_rate": 5.818048695430541e-06, + "loss": 0.1496, + "mean_token_accuracy": 0.9663948690891266, + "num_tokens": 1350638403.0, + "step": 9550 + }, + { + "entropy": 1.3863462066650392, + "epoch": 0.31340798537429404, + "grad_norm": 1.8515625, + "learning_rate": 5.815261420139235e-06, + "loss": 0.1495, + "mean_token_accuracy": 0.9667435109615325, + "num_tokens": 1357942983.0, + "step": 9600 + }, + { + "entropy": 1.3836952257156372, + "epoch": 0.3150403186314518, + "grad_norm": 1.4375, + "learning_rate": 5.812453635960613e-06, + "loss": 0.136, + "mean_token_accuracy": 0.9696350061893463, + "num_tokens": 1364441123.0, + "step": 9650 + }, + { + "entropy": 1.386801562309265, + "epoch": 0.3166726518886096, + "grad_norm": 1.3828125, + "learning_rate": 5.809625363349091e-06, + "loss": 0.1537, + "mean_token_accuracy": 0.9660963475704193, + "num_tokens": 1371638128.0, + "step": 9700 + }, + { + "entropy": 1.4033276891708375, + "epoch": 0.3183049851457674, + "grad_norm": 1.6015625, + "learning_rate": 5.806776622908341e-06, + "loss": 0.1489, + "mean_token_accuracy": 0.9672618007659912, + "num_tokens": 1378797795.0, + "step": 9750 + }, + { + "entropy": 1.3945609354972839, + "epoch": 0.31993731840292516, + "grad_norm": 1.7109375, + "learning_rate": 5.8039074353911425e-06, + "loss": 0.1476, + "mean_token_accuracy": 0.9665350615978241, + "num_tokens": 1385958442.0, + "step": 9800 + }, + { + "entropy": 1.3897303318977356, + "epoch": 0.32156965166008294, + "grad_norm": 1.609375, + "learning_rate": 5.801017821699229e-06, + "loss": 0.1492, + "mean_token_accuracy": 0.9658246648311615, + "num_tokens": 1392915332.0, + "step": 9850 + }, + { + "entropy": 1.397285952568054, + "epoch": 0.3232019849172407, + "grad_norm": 1.4296875, + "learning_rate": 5.798107802883135e-06, + "loss": 0.1538, + "mean_token_accuracy": 0.9644203245639801, + "num_tokens": 1399970378.0, + "step": 9900 + }, + { + "entropy": 1.396610188484192, + "epoch": 0.3248343181743985, + "grad_norm": 1.15625, + "learning_rate": 5.795177400142047e-06, + "loss": 0.1399, + "mean_token_accuracy": 0.9683949732780457, + "num_tokens": 1406942412.0, + "step": 9950 + }, + { + "entropy": 1.3993594455718994, + "epoch": 0.3264666514315563, + "grad_norm": 1.2109375, + "learning_rate": 5.792226634823645e-06, + "loss": 0.166, + "mean_token_accuracy": 0.9635122084617614, + "num_tokens": 1414672963.0, + "step": 10000 + }, + { + "epoch": 0.3264666514315563, + "eval_entropy": 1.3897196292877196, + "eval_loss": 0.16665887832641602, + "eval_mean_token_accuracy": 0.9631382012367249, + "eval_num_tokens": 1414672963.0, + "eval_runtime": 743.4472, + "eval_samples_per_second": 12.988, + "eval_steps_per_second": 0.102, + "step": 10000 + }, + { + "entropy": 1.3787381386756896, + "epoch": 0.32809898468871407, + "grad_norm": 1.65625, + "learning_rate": 5.789255528423945e-06, + "loss": 0.1449, + "mean_token_accuracy": 0.9675530314445495, + "num_tokens": 1422037159.0, + "step": 10050 + }, + { + "entropy": 1.3835061955451966, + "epoch": 0.32973131794587185, + "grad_norm": 1.421875, + "learning_rate": 5.7862641025871535e-06, + "loss": 0.1493, + "mean_token_accuracy": 0.966714415550232, + "num_tokens": 1428834412.0, + "step": 10100 + }, + { + "entropy": 1.3994423723220826, + "epoch": 0.33136365120302963, + "grad_norm": 1.453125, + "learning_rate": 5.783252379105494e-06, + "loss": 0.1478, + "mean_token_accuracy": 0.9666897785663605, + "num_tokens": 1435825992.0, + "step": 10150 + }, + { + "entropy": 1.4079461932182311, + "epoch": 0.3329959844601874, + "grad_norm": 1.03125, + "learning_rate": 5.780220379919062e-06, + "loss": 0.1597, + "mean_token_accuracy": 0.9649293422698975, + "num_tokens": 1443057429.0, + "step": 10200 + }, + { + "entropy": 1.4087351322174073, + "epoch": 0.3346283177173452, + "grad_norm": 1.1875, + "learning_rate": 5.777168127115654e-06, + "loss": 0.1495, + "mean_token_accuracy": 0.9670829331874847, + "num_tokens": 1450143525.0, + "step": 10250 + }, + { + "entropy": 1.4217532348632813, + "epoch": 0.336260650974503, + "grad_norm": 1.1796875, + "learning_rate": 5.774095642930618e-06, + "loss": 0.1538, + "mean_token_accuracy": 0.9653639376163483, + "num_tokens": 1456853199.0, + "step": 10300 + }, + { + "entropy": 1.4182784628868104, + "epoch": 0.33789298423166075, + "grad_norm": 1.328125, + "learning_rate": 5.771002949746681e-06, + "loss": 0.1592, + "mean_token_accuracy": 0.9639875698089599, + "num_tokens": 1464260063.0, + "step": 10350 + }, + { + "entropy": 1.3954361629486085, + "epoch": 0.33952531748881853, + "grad_norm": 1.0234375, + "learning_rate": 5.76789007009379e-06, + "loss": 0.1444, + "mean_token_accuracy": 0.9671654045581818, + "num_tokens": 1471363389.0, + "step": 10400 + }, + { + "entropy": 1.3968884110450746, + "epoch": 0.3411576507459763, + "grad_norm": 2.15625, + "learning_rate": 5.7647570266489535e-06, + "loss": 0.1325, + "mean_token_accuracy": 0.9698529148101807, + "num_tokens": 1478273376.0, + "step": 10450 + }, + { + "entropy": 1.397156729698181, + "epoch": 0.3427899840031341, + "grad_norm": 1.0625, + "learning_rate": 5.7616038422360674e-06, + "loss": 0.1458, + "mean_token_accuracy": 0.9674423885345459, + "num_tokens": 1485177982.0, + "step": 10500 + }, + { + "epoch": 0.3427899840031341, + "eval_entropy": 1.4138343874613444, + "eval_loss": 0.1639399528503418, + "eval_mean_token_accuracy": 0.9635584902763367, + "eval_num_tokens": 1485177982.0, + "eval_runtime": 748.9668, + "eval_samples_per_second": 12.892, + "eval_steps_per_second": 0.101, + "step": 10500 + }, + { + "entropy": 1.4251120805740356, + "epoch": 0.3444223172602919, + "grad_norm": 1.703125, + "learning_rate": 5.758430539825751e-06, + "loss": 0.1423, + "mean_token_accuracy": 0.968468290567398, + "num_tokens": 1491740592.0, + "step": 10550 + }, + { + "entropy": 1.4330598759651183, + "epoch": 0.34605465051744966, + "grad_norm": 1.234375, + "learning_rate": 5.755237142535185e-06, + "loss": 0.1584, + "mean_token_accuracy": 0.9640131962299346, + "num_tokens": 1499230338.0, + "step": 10600 + }, + { + "entropy": 1.4190063452720643, + "epoch": 0.34768698377460744, + "grad_norm": 1.3671875, + "learning_rate": 5.752023673627936e-06, + "loss": 0.1549, + "mean_token_accuracy": 0.9651542448997498, + "num_tokens": 1506807165.0, + "step": 10650 + }, + { + "entropy": 1.398562343120575, + "epoch": 0.3493193170317652, + "grad_norm": 2.671875, + "learning_rate": 5.748790156513793e-06, + "loss": 0.1429, + "mean_token_accuracy": 0.9676708686351776, + "num_tokens": 1513566074.0, + "step": 10700 + }, + { + "entropy": 1.4046012330055238, + "epoch": 0.350951650288923, + "grad_norm": 1.8203125, + "learning_rate": 5.74553661474859e-06, + "loss": 0.1475, + "mean_token_accuracy": 0.9663785743713379, + "num_tokens": 1520732578.0, + "step": 10750 + }, + { + "entropy": 1.4011975502967835, + "epoch": 0.3525839835460808, + "grad_norm": 1.7109375, + "learning_rate": 5.742263072034044e-06, + "loss": 0.133, + "mean_token_accuracy": 0.9697633123397827, + "num_tokens": 1527341695.0, + "step": 10800 + }, + { + "entropy": 1.4020631575584412, + "epoch": 0.35421631680323856, + "grad_norm": 1.375, + "learning_rate": 5.738969552217573e-06, + "loss": 0.1529, + "mean_token_accuracy": 0.9653546392917634, + "num_tokens": 1534441768.0, + "step": 10850 + }, + { + "entropy": 1.4063316154479981, + "epoch": 0.35584865006039634, + "grad_norm": 1.1875, + "learning_rate": 5.735656079292128e-06, + "loss": 0.1541, + "mean_token_accuracy": 0.9655212807655335, + "num_tokens": 1542032259.0, + "step": 10900 + }, + { + "entropy": 1.3955928492546081, + "epoch": 0.3574809833175541, + "grad_norm": 1.296875, + "learning_rate": 5.732322677396013e-06, + "loss": 0.1379, + "mean_token_accuracy": 0.9682280993461609, + "num_tokens": 1549145530.0, + "step": 10950 + }, + { + "entropy": 1.417706184387207, + "epoch": 0.3591133165747119, + "grad_norm": 1.5078125, + "learning_rate": 5.728969370812716e-06, + "loss": 0.1502, + "mean_token_accuracy": 0.9673383378982544, + "num_tokens": 1556005225.0, + "step": 11000 + }, + { + "epoch": 0.3591133165747119, + "eval_entropy": 1.4156667073567708, + "eval_loss": 0.1644313782453537, + "eval_mean_token_accuracy": 0.9634674708048503, + "eval_num_tokens": 1556005225.0, + "eval_runtime": 749.0504, + "eval_samples_per_second": 12.891, + "eval_steps_per_second": 0.101, + "step": 11000 + }, + { + "entropy": 1.4165368866920471, + "epoch": 0.3607456498318697, + "grad_norm": 3.625, + "learning_rate": 5.725596183970729e-06, + "loss": 0.1491, + "mean_token_accuracy": 0.9657526600360871, + "num_tokens": 1563254781.0, + "step": 11050 + }, + { + "entropy": 1.4074536633491517, + "epoch": 0.36237798308902747, + "grad_norm": 1.5625, + "learning_rate": 5.722203141443365e-06, + "loss": 0.1452, + "mean_token_accuracy": 0.9683119165897369, + "num_tokens": 1569838865.0, + "step": 11100 + }, + { + "entropy": 1.4159915471076965, + "epoch": 0.36401031634618525, + "grad_norm": 1.265625, + "learning_rate": 5.718790267948591e-06, + "loss": 0.1505, + "mean_token_accuracy": 0.9666063642501831, + "num_tokens": 1576948931.0, + "step": 11150 + }, + { + "entropy": 1.4229880380630493, + "epoch": 0.36564264960334303, + "grad_norm": 1.7109375, + "learning_rate": 5.715357588348832e-06, + "loss": 0.1522, + "mean_token_accuracy": 0.9668408262729645, + "num_tokens": 1584164705.0, + "step": 11200 + }, + { + "entropy": 1.396066279411316, + "epoch": 0.3672749828605008, + "grad_norm": 2.09375, + "learning_rate": 5.711905127650807e-06, + "loss": 0.1373, + "mean_token_accuracy": 0.9693786513805389, + "num_tokens": 1591058327.0, + "step": 11250 + }, + { + "entropy": 1.4097445344924926, + "epoch": 0.3689073161176586, + "grad_norm": 1.9609375, + "learning_rate": 5.7084329110053294e-06, + "loss": 0.1486, + "mean_token_accuracy": 0.9671066462993622, + "num_tokens": 1598091112.0, + "step": 11300 + }, + { + "entropy": 1.4326444959640503, + "epoch": 0.37053964937481637, + "grad_norm": 1.1328125, + "learning_rate": 5.70494096370714e-06, + "loss": 0.1495, + "mean_token_accuracy": 0.9672650814056396, + "num_tokens": 1605344616.0, + "step": 11350 + }, + { + "entropy": 1.4539379978179932, + "epoch": 0.37217198263197415, + "grad_norm": 1.953125, + "learning_rate": 5.701429311194713e-06, + "loss": 0.1593, + "mean_token_accuracy": 0.9647262859344482, + "num_tokens": 1612587700.0, + "step": 11400 + }, + { + "entropy": 1.4599957299232482, + "epoch": 0.37380431588913193, + "grad_norm": 0.76953125, + "learning_rate": 5.6978979790500695e-06, + "loss": 0.1428, + "mean_token_accuracy": 0.96798752784729, + "num_tokens": 1619869251.0, + "step": 11450 + }, + { + "entropy": 1.456819851398468, + "epoch": 0.3754366491462897, + "grad_norm": 1.5703125, + "learning_rate": 5.694346992998601e-06, + "loss": 0.157, + "mean_token_accuracy": 0.9647147190570832, + "num_tokens": 1627103999.0, + "step": 11500 + }, + { + "epoch": 0.3754366491462897, + "eval_entropy": 1.4559897645314535, + "eval_loss": 0.16348470747470856, + "eval_mean_token_accuracy": 0.9634300549825032, + "eval_num_tokens": 1627103999.0, + "eval_runtime": 750.9869, + "eval_samples_per_second": 12.858, + "eval_steps_per_second": 0.101, + "step": 11500 + }, + { + "entropy": 1.445942795276642, + "epoch": 0.3770689824034475, + "grad_norm": 1.5625, + "learning_rate": 5.690776378908871e-06, + "loss": 0.1559, + "mean_token_accuracy": 0.9648426783084869, + "num_tokens": 1634135214.0, + "step": 11550 + }, + { + "entropy": 1.4337683749198913, + "epoch": 0.3787013156606053, + "grad_norm": 1.71875, + "learning_rate": 5.687186162792432e-06, + "loss": 0.1392, + "mean_token_accuracy": 0.9684048485755921, + "num_tokens": 1641123881.0, + "step": 11600 + }, + { + "entropy": 1.4271648550033569, + "epoch": 0.38033364891776306, + "grad_norm": 1.53125, + "learning_rate": 5.683576370803637e-06, + "loss": 0.1442, + "mean_token_accuracy": 0.9671970081329345, + "num_tokens": 1648087593.0, + "step": 11650 + }, + { + "entropy": 1.4187105298042297, + "epoch": 0.38196598217492084, + "grad_norm": 1.2578125, + "learning_rate": 5.679947029239446e-06, + "loss": 0.1558, + "mean_token_accuracy": 0.9652733910083771, + "num_tokens": 1655675590.0, + "step": 11700 + }, + { + "entropy": 1.4136518168449401, + "epoch": 0.3835983154320786, + "grad_norm": 1.2421875, + "learning_rate": 5.676298164539235e-06, + "loss": 0.1344, + "mean_token_accuracy": 0.9695696973800659, + "num_tokens": 1662994220.0, + "step": 11750 + }, + { + "entropy": 1.4156992936134338, + "epoch": 0.3852306486892364, + "grad_norm": 2.015625, + "learning_rate": 5.672629803284603e-06, + "loss": 0.1445, + "mean_token_accuracy": 0.967056336402893, + "num_tokens": 1670282239.0, + "step": 11800 + }, + { + "entropy": 1.381675865650177, + "epoch": 0.3868629819463942, + "grad_norm": 1.3046875, + "learning_rate": 5.668941972199185e-06, + "loss": 0.1318, + "mean_token_accuracy": 0.9707480573654175, + "num_tokens": 1677092297.0, + "step": 11850 + }, + { + "entropy": 1.413828341960907, + "epoch": 0.38849531520355196, + "grad_norm": 1.546875, + "learning_rate": 5.665234698148447e-06, + "loss": 0.1398, + "mean_token_accuracy": 0.9692868089675903, + "num_tokens": 1683596527.0, + "step": 11900 + }, + { + "entropy": 1.4255395197868348, + "epoch": 0.39012764846070974, + "grad_norm": 1.25, + "learning_rate": 5.661508008139494e-06, + "loss": 0.1428, + "mean_token_accuracy": 0.9684118723869324, + "num_tokens": 1690356557.0, + "step": 11950 + }, + { + "entropy": 1.421637599468231, + "epoch": 0.3917599817178675, + "grad_norm": 1.40625, + "learning_rate": 5.657761929320876e-06, + "loss": 0.1358, + "mean_token_accuracy": 0.9686246883869171, + "num_tokens": 1696780737.0, + "step": 12000 + }, + { + "epoch": 0.3917599817178675, + "eval_entropy": 1.4270846033096314, + "eval_loss": 0.16205987334251404, + "eval_mean_token_accuracy": 0.9638035273551941, + "eval_num_tokens": 1696780737.0, + "eval_runtime": 744.4564, + "eval_samples_per_second": 12.971, + "eval_steps_per_second": 0.102, + "step": 12000 + }, + { + "entropy": 1.4344228434562682, + "epoch": 0.3933923149750253, + "grad_norm": 1.46875, + "learning_rate": 5.65399648898239e-06, + "loss": 0.1479, + "mean_token_accuracy": 0.9664253509044647, + "num_tokens": 1703868095.0, + "step": 12050 + }, + { + "entropy": 1.40860848903656, + "epoch": 0.3950246482321831, + "grad_norm": 1.5703125, + "learning_rate": 5.650211714554876e-06, + "loss": 0.1593, + "mean_token_accuracy": 0.9646110832691193, + "num_tokens": 1711471948.0, + "step": 12100 + }, + { + "entropy": 1.420129976272583, + "epoch": 0.39665698148934087, + "grad_norm": 1.6484375, + "learning_rate": 5.6464076336100246e-06, + "loss": 0.1455, + "mean_token_accuracy": 0.9673341345787049, + "num_tokens": 1718673490.0, + "step": 12150 + }, + { + "entropy": 1.4015104007720947, + "epoch": 0.39828931474649865, + "grad_norm": 1.390625, + "learning_rate": 5.642584273860171e-06, + "loss": 0.1518, + "mean_token_accuracy": 0.966002242565155, + "num_tokens": 1726275054.0, + "step": 12200 + }, + { + "entropy": 1.4261247539520263, + "epoch": 0.3999216480036564, + "grad_norm": 1.3828125, + "learning_rate": 5.6387416631580936e-06, + "loss": 0.1417, + "mean_token_accuracy": 0.9675724005699158, + "num_tokens": 1733190804.0, + "step": 12250 + }, + { + "entropy": 1.4047169375419617, + "epoch": 0.4015539812608142, + "grad_norm": 1.6796875, + "learning_rate": 5.634879829496813e-06, + "loss": 0.149, + "mean_token_accuracy": 0.9663934874534607, + "num_tokens": 1740409762.0, + "step": 12300 + }, + { + "entropy": 1.4087544870376587, + "epoch": 0.403186314517972, + "grad_norm": 1.296875, + "learning_rate": 5.630998801009386e-06, + "loss": 0.1422, + "mean_token_accuracy": 0.9677630662918091, + "num_tokens": 1747425003.0, + "step": 12350 + }, + { + "entropy": 1.4205935049057006, + "epoch": 0.40481864777512977, + "grad_norm": 1.609375, + "learning_rate": 5.627098605968702e-06, + "loss": 0.1518, + "mean_token_accuracy": 0.9655173766613007, + "num_tokens": 1754787565.0, + "step": 12400 + }, + { + "entropy": 1.387464382648468, + "epoch": 0.40645098103228755, + "grad_norm": 1.375, + "learning_rate": 5.62317927278728e-06, + "loss": 0.1372, + "mean_token_accuracy": 0.968756947517395, + "num_tokens": 1761849063.0, + "step": 12450 + }, + { + "entropy": 1.3994211435317994, + "epoch": 0.40808331428944533, + "grad_norm": 2.0625, + "learning_rate": 5.619240830017051e-06, + "loss": 0.144, + "mean_token_accuracy": 0.9678842532634735, + "num_tokens": 1768859328.0, + "step": 12500 + }, + { + "epoch": 0.40808331428944533, + "eval_entropy": 1.405422296524048, + "eval_loss": 0.16121897101402283, + "eval_mean_token_accuracy": 0.9639042560259501, + "eval_num_tokens": 1768859328.0, + "eval_runtime": 749.481, + "eval_samples_per_second": 12.884, + "eval_steps_per_second": 0.101, + "step": 12500 + }, + { + "entropy": 1.419663178920746, + "epoch": 0.4097156475466031, + "grad_norm": 1.953125, + "learning_rate": 5.615283306349166e-06, + "loss": 0.1409, + "mean_token_accuracy": 0.9682516789436341, + "num_tokens": 1775694980.0, + "step": 12550 + }, + { + "entropy": 1.4085765600204467, + "epoch": 0.4113479808037609, + "grad_norm": 1.578125, + "learning_rate": 5.611306730613772e-06, + "loss": 0.1326, + "mean_token_accuracy": 0.9704544687271118, + "num_tokens": 1782348927.0, + "step": 12600 + }, + { + "entropy": 1.4163624548912048, + "epoch": 0.4129803140609187, + "grad_norm": 2.0, + "learning_rate": 5.607311131779812e-06, + "loss": 0.1501, + "mean_token_accuracy": 0.9661952567100525, + "num_tokens": 1789610698.0, + "step": 12650 + }, + { + "entropy": 1.4280620789527894, + "epoch": 0.41461264731807645, + "grad_norm": 1.890625, + "learning_rate": 5.603296538954808e-06, + "loss": 0.143, + "mean_token_accuracy": 0.9683597815036774, + "num_tokens": 1796316393.0, + "step": 12700 + }, + { + "entropy": 1.425929193496704, + "epoch": 0.41624498057523424, + "grad_norm": 2.296875, + "learning_rate": 5.599262981384652e-06, + "loss": 0.1477, + "mean_token_accuracy": 0.9663317048549652, + "num_tokens": 1803325180.0, + "step": 12750 + }, + { + "entropy": 1.4301575493812562, + "epoch": 0.417877313832392, + "grad_norm": 1.9765625, + "learning_rate": 5.595210488453392e-06, + "loss": 0.1557, + "mean_token_accuracy": 0.9648576879501343, + "num_tokens": 1810717661.0, + "step": 12800 + }, + { + "entropy": 1.405632803440094, + "epoch": 0.4195096470895498, + "grad_norm": 1.7109375, + "learning_rate": 5.591139089683021e-06, + "loss": 0.1397, + "mean_token_accuracy": 0.9687881779670715, + "num_tokens": 1817611883.0, + "step": 12850 + }, + { + "entropy": 1.3980163097381593, + "epoch": 0.4211419803467076, + "grad_norm": 2.078125, + "learning_rate": 5.587048814733253e-06, + "loss": 0.1319, + "mean_token_accuracy": 0.9697498416900635, + "num_tokens": 1824343282.0, + "step": 12900 + }, + { + "entropy": 1.3963736867904664, + "epoch": 0.42277431360386536, + "grad_norm": 1.859375, + "learning_rate": 5.582939693401319e-06, + "loss": 0.1378, + "mean_token_accuracy": 0.9693096280097961, + "num_tokens": 1831509722.0, + "step": 12950 + }, + { + "entropy": 1.4174233818054198, + "epoch": 0.42440664686102314, + "grad_norm": 1.5078125, + "learning_rate": 5.57881175562174e-06, + "loss": 0.1565, + "mean_token_accuracy": 0.9654737007617951, + "num_tokens": 1838857002.0, + "step": 13000 + }, + { + "epoch": 0.42440664686102314, + "eval_entropy": 1.4053240156173705, + "eval_loss": 0.1606142520904541, + "eval_mean_token_accuracy": 0.9640888079007467, + "eval_num_tokens": 1838857002.0, + "eval_runtime": 748.4765, + "eval_samples_per_second": 12.901, + "eval_steps_per_second": 0.102, + "step": 13000 + }, + { + "entropy": 1.4016055393218994, + "epoch": 0.4260389801181809, + "grad_norm": 2.09375, + "learning_rate": 5.574665031466116e-06, + "loss": 0.1338, + "mean_token_accuracy": 0.9696434116363526, + "num_tokens": 1845457705.0, + "step": 13050 + }, + { + "entropy": 1.4049336218833923, + "epoch": 0.4276713133753387, + "grad_norm": 1.5625, + "learning_rate": 5.570499551142902e-06, + "loss": 0.1466, + "mean_token_accuracy": 0.9665655505657196, + "num_tokens": 1852644618.0, + "step": 13100 + }, + { + "entropy": 1.4158594751358031, + "epoch": 0.4293036466324965, + "grad_norm": 1.546875, + "learning_rate": 5.566315344997188e-06, + "loss": 0.1405, + "mean_token_accuracy": 0.9682467067241669, + "num_tokens": 1859532727.0, + "step": 13150 + }, + { + "entropy": 1.406804118156433, + "epoch": 0.43093597988965426, + "grad_norm": 1.796875, + "learning_rate": 5.562112443510483e-06, + "loss": 0.1347, + "mean_token_accuracy": 0.969345440864563, + "num_tokens": 1866433523.0, + "step": 13200 + }, + { + "entropy": 1.414643156528473, + "epoch": 0.43256831314681204, + "grad_norm": 1.375, + "learning_rate": 5.557890877300489e-06, + "loss": 0.1455, + "mean_token_accuracy": 0.9673810720443725, + "num_tokens": 1873742139.0, + "step": 13250 + }, + { + "entropy": 1.4243081569671632, + "epoch": 0.4342006464039698, + "grad_norm": 1.1328125, + "learning_rate": 5.553650677120876e-06, + "loss": 0.14, + "mean_token_accuracy": 0.9683500599861145, + "num_tokens": 1881001857.0, + "step": 13300 + }, + { + "entropy": 1.4164328074455261, + "epoch": 0.4358329796611276, + "grad_norm": 2.3125, + "learning_rate": 5.549391873861064e-06, + "loss": 0.1485, + "mean_token_accuracy": 0.9669756269454957, + "num_tokens": 1888442521.0, + "step": 13350 + }, + { + "entropy": 1.4102159285545348, + "epoch": 0.4374653129182854, + "grad_norm": 1.953125, + "learning_rate": 5.545114498545991e-06, + "loss": 0.139, + "mean_token_accuracy": 0.9687536859512329, + "num_tokens": 1895774805.0, + "step": 13400 + }, + { + "entropy": 1.4052273893356324, + "epoch": 0.43909764617544317, + "grad_norm": 1.046875, + "learning_rate": 5.540818582335894e-06, + "loss": 0.1442, + "mean_token_accuracy": 0.9679582285881042, + "num_tokens": 1902866432.0, + "step": 13450 + }, + { + "entropy": 1.4166702628135681, + "epoch": 0.44072997943260095, + "grad_norm": 1.2890625, + "learning_rate": 5.536504156526077e-06, + "loss": 0.1481, + "mean_token_accuracy": 0.9664357197284699, + "num_tokens": 1910343947.0, + "step": 13500 + }, + { + "epoch": 0.44072997943260095, + "eval_entropy": 1.4095580498377482, + "eval_loss": 0.1602867692708969, + "eval_mean_token_accuracy": 0.9641507911682129, + "eval_num_tokens": 1910343947.0, + "eval_runtime": 748.7079, + "eval_samples_per_second": 12.897, + "eval_steps_per_second": 0.102, + "step": 13500 + }, + { + "entropy": 1.4171498656272887, + "epoch": 0.44236231268975873, + "grad_norm": 1.9375, + "learning_rate": 5.5321712525466815e-06, + "loss": 0.1444, + "mean_token_accuracy": 0.967768360376358, + "num_tokens": 1917115548.0, + "step": 13550 + }, + { + "entropy": 1.4215400004386902, + "epoch": 0.4439946459469165, + "grad_norm": 1.21875, + "learning_rate": 5.5278199019624665e-06, + "loss": 0.1558, + "mean_token_accuracy": 0.965044105052948, + "num_tokens": 1924701669.0, + "step": 13600 + }, + { + "entropy": 1.4144377851486205, + "epoch": 0.4456269792040743, + "grad_norm": 1.390625, + "learning_rate": 5.523450136472569e-06, + "loss": 0.1476, + "mean_token_accuracy": 0.9668766951560974, + "num_tokens": 1932051913.0, + "step": 13650 + }, + { + "entropy": 1.4168913602828979, + "epoch": 0.4472593124612321, + "grad_norm": 1.703125, + "learning_rate": 5.519061987910276e-06, + "loss": 0.1344, + "mean_token_accuracy": 0.9691276931762696, + "num_tokens": 1938674407.0, + "step": 13700 + }, + { + "entropy": 1.4047640895843505, + "epoch": 0.44889164571838985, + "grad_norm": 1.109375, + "learning_rate": 5.514655488242795e-06, + "loss": 0.1503, + "mean_token_accuracy": 0.9665549874305726, + "num_tokens": 1945577505.0, + "step": 13750 + }, + { + "entropy": 1.4135752677917481, + "epoch": 0.45052397897554763, + "grad_norm": 2.125, + "learning_rate": 5.510230669571018e-06, + "loss": 0.1492, + "mean_token_accuracy": 0.9664253723621369, + "num_tokens": 1952659833.0, + "step": 13800 + }, + { + "entropy": 1.3945325517654419, + "epoch": 0.4521563122327054, + "grad_norm": 2.109375, + "learning_rate": 5.505787564129291e-06, + "loss": 0.1376, + "mean_token_accuracy": 0.968870245218277, + "num_tokens": 1959360672.0, + "step": 13850 + }, + { + "entropy": 1.4091899871826172, + "epoch": 0.4537886454898632, + "grad_norm": 1.7109375, + "learning_rate": 5.5013262042851764e-06, + "loss": 0.1352, + "mean_token_accuracy": 0.9697363257408143, + "num_tokens": 1965923476.0, + "step": 13900 + }, + { + "entropy": 1.409596972465515, + "epoch": 0.455420978747021, + "grad_norm": 1.3359375, + "learning_rate": 5.4968466225392165e-06, + "loss": 0.147, + "mean_token_accuracy": 0.966924901008606, + "num_tokens": 1972776862.0, + "step": 13950 + }, + { + "entropy": 1.4293616366386415, + "epoch": 0.45705331200417876, + "grad_norm": 1.296875, + "learning_rate": 5.4923488515247e-06, + "loss": 0.1417, + "mean_token_accuracy": 0.9687818443775177, + "num_tokens": 1979187620.0, + "step": 14000 + }, + { + "epoch": 0.45705331200417876, + "eval_entropy": 1.4118124723434449, + "eval_loss": 0.16126905381679535, + "eval_mean_token_accuracy": 0.9638542596499126, + "eval_num_tokens": 1979187620.0, + "eval_runtime": 745.6037, + "eval_samples_per_second": 12.951, + "eval_steps_per_second": 0.102, + "step": 14000 + }, + { + "entropy": 1.415801317691803, + "epoch": 0.45868564526133654, + "grad_norm": 1.25, + "learning_rate": 5.487832924007422e-06, + "loss": 0.1483, + "mean_token_accuracy": 0.9666338682174682, + "num_tokens": 1986603476.0, + "step": 14050 + }, + { + "entropy": 1.4051340079307557, + "epoch": 0.4603179785184943, + "grad_norm": 1.078125, + "learning_rate": 5.4832988728854465e-06, + "loss": 0.1557, + "mean_token_accuracy": 0.964282066822052, + "num_tokens": 1994204498.0, + "step": 14100 + }, + { + "entropy": 1.415133674144745, + "epoch": 0.4619503117756521, + "grad_norm": 1.4140625, + "learning_rate": 5.478746731188865e-06, + "loss": 0.1397, + "mean_token_accuracy": 0.9680150365829467, + "num_tokens": 2000941510.0, + "step": 14150 + }, + { + "entropy": 1.4023677587509156, + "epoch": 0.4635826450328099, + "grad_norm": 1.8671875, + "learning_rate": 5.474176532079557e-06, + "loss": 0.139, + "mean_token_accuracy": 0.9682874810695649, + "num_tokens": 2007806068.0, + "step": 14200 + }, + { + "entropy": 1.4310323596000671, + "epoch": 0.46521497828996766, + "grad_norm": 1.390625, + "learning_rate": 5.46958830885095e-06, + "loss": 0.1513, + "mean_token_accuracy": 0.9669715762138367, + "num_tokens": 2014807742.0, + "step": 14250 + }, + { + "entropy": 1.4305420732498169, + "epoch": 0.46684731154712544, + "grad_norm": 1.09375, + "learning_rate": 5.464982094927772e-06, + "loss": 0.1486, + "mean_token_accuracy": 0.966749495267868, + "num_tokens": 2021683076.0, + "step": 14300 + }, + { + "entropy": 1.4090232229232789, + "epoch": 0.4684796448042832, + "grad_norm": 1.578125, + "learning_rate": 5.460357923865814e-06, + "loss": 0.1379, + "mean_token_accuracy": 0.9685410165786743, + "num_tokens": 2028415183.0, + "step": 14350 + }, + { + "entropy": 1.4059256029129028, + "epoch": 0.470111978061441, + "grad_norm": 1.2265625, + "learning_rate": 5.4557158293516845e-06, + "loss": 0.1368, + "mean_token_accuracy": 0.9691716039180756, + "num_tokens": 2035435433.0, + "step": 14400 + }, + { + "entropy": 1.415133411884308, + "epoch": 0.4717443113185988, + "grad_norm": 1.2265625, + "learning_rate": 5.451055845202559e-06, + "loss": 0.1558, + "mean_token_accuracy": 0.9652414166927338, + "num_tokens": 2042639097.0, + "step": 14450 + }, + { + "entropy": 1.4125458192825318, + "epoch": 0.47337664457575657, + "grad_norm": 1.5546875, + "learning_rate": 5.446378005365937e-06, + "loss": 0.1465, + "mean_token_accuracy": 0.9667794346809387, + "num_tokens": 2049839883.0, + "step": 14500 + }, + { + "epoch": 0.47337664457575657, + "eval_entropy": 1.4099786535898844, + "eval_loss": 0.16086123883724213, + "eval_mean_token_accuracy": 0.9639045866330465, + "eval_num_tokens": 2049839883.0, + "eval_runtime": 751.2639, + "eval_samples_per_second": 12.853, + "eval_steps_per_second": 0.101, + "step": 14500 + }, + { + "entropy": 1.4155534076690675, + "epoch": 0.47500897783291435, + "grad_norm": 0.97265625, + "learning_rate": 5.441682343919398e-06, + "loss": 0.1515, + "mean_token_accuracy": 0.9656119549274444, + "num_tokens": 2057493706.0, + "step": 14550 + }, + { + "entropy": 1.4232526874542237, + "epoch": 0.47664131109007213, + "grad_norm": 1.265625, + "learning_rate": 5.436968895070349e-06, + "loss": 0.1463, + "mean_token_accuracy": 0.9669792699813843, + "num_tokens": 2064599903.0, + "step": 14600 + }, + { + "entropy": 1.426460826396942, + "epoch": 0.4782736443472299, + "grad_norm": 1.34375, + "learning_rate": 5.432237693155773e-06, + "loss": 0.1551, + "mean_token_accuracy": 0.9656824862957001, + "num_tokens": 2072278266.0, + "step": 14650 + }, + { + "entropy": 1.447964243888855, + "epoch": 0.4799059776043877, + "grad_norm": 1.6328125, + "learning_rate": 5.427488772641989e-06, + "loss": 0.1505, + "mean_token_accuracy": 0.9661068272590637, + "num_tokens": 2079638913.0, + "step": 14700 + }, + { + "entropy": 1.443111469745636, + "epoch": 0.48153831086154547, + "grad_norm": 2.015625, + "learning_rate": 5.422722168124386e-06, + "loss": 0.1452, + "mean_token_accuracy": 0.9680136227607727, + "num_tokens": 2086742924.0, + "step": 14750 + }, + { + "entropy": 1.436655399799347, + "epoch": 0.48317064411870325, + "grad_norm": 1.515625, + "learning_rate": 5.417937914327187e-06, + "loss": 0.1466, + "mean_token_accuracy": 0.9671316587924957, + "num_tokens": 2094123207.0, + "step": 14800 + }, + { + "entropy": 1.4242712426185609, + "epoch": 0.48480297737586103, + "grad_norm": 1.7265625, + "learning_rate": 5.413136046103181e-06, + "loss": 0.1471, + "mean_token_accuracy": 0.9668898284435272, + "num_tokens": 2101338532.0, + "step": 14850 + }, + { + "entropy": 1.4219823241233827, + "epoch": 0.4864353106330188, + "grad_norm": 1.7734375, + "learning_rate": 5.408316598433483e-06, + "loss": 0.1387, + "mean_token_accuracy": 0.9691325139999389, + "num_tokens": 2107835134.0, + "step": 14900 + }, + { + "entropy": 1.4221902179718018, + "epoch": 0.4880676438901766, + "grad_norm": 1.6015625, + "learning_rate": 5.403479606427267e-06, + "loss": 0.1344, + "mean_token_accuracy": 0.9696690511703491, + "num_tokens": 2114891465.0, + "step": 14950 + }, + { + "entropy": 1.401020920276642, + "epoch": 0.4896999771473344, + "grad_norm": 1.2578125, + "learning_rate": 5.398625105321518e-06, + "loss": 0.143, + "mean_token_accuracy": 0.9679032492637635, + "num_tokens": 2121999447.0, + "step": 15000 + }, + { + "epoch": 0.4896999771473344, + "eval_entropy": 1.409274689356486, + "eval_loss": 0.1606949120759964, + "eval_mean_token_accuracy": 0.963873782157898, + "eval_num_tokens": 2121999447.0, + "eval_runtime": 749.6338, + "eval_samples_per_second": 12.881, + "eval_steps_per_second": 0.101, + "step": 15000 + }, + { + "entropy": 1.4075271391868591, + "epoch": 0.49133231040449216, + "grad_norm": 1.953125, + "learning_rate": 5.393753130480773e-06, + "loss": 0.1422, + "mean_token_accuracy": 0.967915655374527, + "num_tokens": 2129364587.0, + "step": 15050 + }, + { + "entropy": 1.39142733335495, + "epoch": 0.49296464366164994, + "grad_norm": 1.5, + "learning_rate": 5.388863717396865e-06, + "loss": 0.1378, + "mean_token_accuracy": 0.9694935536384582, + "num_tokens": 2135978375.0, + "step": 15100 + }, + { + "entropy": 1.4059196853637694, + "epoch": 0.4945969769188077, + "grad_norm": 2.109375, + "learning_rate": 5.383956901688659e-06, + "loss": 0.1495, + "mean_token_accuracy": 0.9660707736015319, + "num_tokens": 2143467430.0, + "step": 15150 + }, + { + "entropy": 1.3810991740226746, + "epoch": 0.4962293101759655, + "grad_norm": 0.59375, + "learning_rate": 5.3790327191017976e-06, + "loss": 0.1421, + "mean_token_accuracy": 0.9680761241912842, + "num_tokens": 2150524540.0, + "step": 15200 + }, + { + "entropy": 1.3994912171363831, + "epoch": 0.4978616434331233, + "grad_norm": 1.3046875, + "learning_rate": 5.374091205508442e-06, + "loss": 0.1508, + "mean_token_accuracy": 0.9663672626018525, + "num_tokens": 2157785212.0, + "step": 15250 + }, + { + "entropy": 1.405203297138214, + "epoch": 0.49949397669028106, + "grad_norm": 1.1875, + "learning_rate": 5.369132396907005e-06, + "loss": 0.1478, + "mean_token_accuracy": 0.9665706479549407, + "num_tokens": 2164998488.0, + "step": 15300 + }, + { + "entropy": 1.4063081932067871, + "epoch": 0.5011263099474389, + "grad_norm": 1.953125, + "learning_rate": 5.364156329421892e-06, + "loss": 0.1285, + "mean_token_accuracy": 0.9716930568218232, + "num_tokens": 2171612673.0, + "step": 15350 + }, + { + "entropy": 1.4068853855133057, + "epoch": 0.5027586432045966, + "grad_norm": 1.34375, + "learning_rate": 5.359163039303241e-06, + "loss": 0.1465, + "mean_token_accuracy": 0.9674324905872345, + "num_tokens": 2179025674.0, + "step": 15400 + }, + { + "entropy": 1.4135111689567565, + "epoch": 0.5043909764617545, + "grad_norm": 1.9453125, + "learning_rate": 5.354152562926649e-06, + "loss": 0.1471, + "mean_token_accuracy": 0.9673193073272706, + "num_tokens": 2186052673.0, + "step": 15450 + }, + { + "entropy": 1.4252868342399596, + "epoch": 0.5060233097189122, + "grad_norm": 1.71875, + "learning_rate": 5.349124936792918e-06, + "loss": 0.1441, + "mean_token_accuracy": 0.9673807907104492, + "num_tokens": 2193326163.0, + "step": 15500 + }, + { + "epoch": 0.5060233097189122, + "eval_entropy": 1.409689162572225, + "eval_loss": 0.16070087254047394, + "eval_mean_token_accuracy": 0.9643241794904073, + "eval_num_tokens": 2193326163.0, + "eval_runtime": 746.9801, + "eval_samples_per_second": 12.927, + "eval_steps_per_second": 0.102, + "step": 15500 + }, + { + "entropy": 1.4205117511749268, + "epoch": 0.50765564297607, + "grad_norm": 1.203125, + "learning_rate": 5.344080197527782e-06, + "loss": 0.1492, + "mean_token_accuracy": 0.9669366598129272, + "num_tokens": 2200727296.0, + "step": 15550 + }, + { + "entropy": 1.418469593524933, + "epoch": 0.5092879762332277, + "grad_norm": 1.3046875, + "learning_rate": 5.339018381881644e-06, + "loss": 0.1485, + "mean_token_accuracy": 0.9666687226295472, + "num_tokens": 2207977018.0, + "step": 15600 + }, + { + "entropy": 1.4072162437438964, + "epoch": 0.5109203094903856, + "grad_norm": 1.5078125, + "learning_rate": 5.333939526729307e-06, + "loss": 0.1393, + "mean_token_accuracy": 0.968210985660553, + "num_tokens": 2214966932.0, + "step": 15650 + }, + { + "entropy": 1.4076303386688231, + "epoch": 0.5125526427475433, + "grad_norm": 1.8984375, + "learning_rate": 5.3288436690697e-06, + "loss": 0.1411, + "mean_token_accuracy": 0.9681597316265106, + "num_tokens": 2222114305.0, + "step": 15700 + }, + { + "entropy": 1.4115266394615174, + "epoch": 0.5141849760047011, + "grad_norm": 1.453125, + "learning_rate": 5.323730846025621e-06, + "loss": 0.1436, + "mean_token_accuracy": 0.9676663541793823, + "num_tokens": 2229368804.0, + "step": 15750 + }, + { + "entropy": 1.3984081506729127, + "epoch": 0.5158173092618589, + "grad_norm": 1.40625, + "learning_rate": 5.3186010948434535e-06, + "loss": 0.1277, + "mean_token_accuracy": 0.9710724341869355, + "num_tokens": 2235855134.0, + "step": 15800 + }, + { + "entropy": 1.4158777141571044, + "epoch": 0.5174496425190167, + "grad_norm": 1.5546875, + "learning_rate": 5.313454452892903e-06, + "loss": 0.1412, + "mean_token_accuracy": 0.9679831528663635, + "num_tokens": 2242658034.0, + "step": 15850 + }, + { + "entropy": 1.4000438117980958, + "epoch": 0.5190819757761744, + "grad_norm": 1.6875, + "learning_rate": 5.3082909576667206e-06, + "loss": 0.1432, + "mean_token_accuracy": 0.9681457090377807, + "num_tokens": 2249786162.0, + "step": 15900 + }, + { + "entropy": 1.4153186726570128, + "epoch": 0.5207143090333323, + "grad_norm": 1.09375, + "learning_rate": 5.303110646780435e-06, + "loss": 0.1311, + "mean_token_accuracy": 0.9702495551109314, + "num_tokens": 2256415467.0, + "step": 15950 + }, + { + "entropy": 1.4182570719718932, + "epoch": 0.52234664229049, + "grad_norm": 1.4609375, + "learning_rate": 5.297913557972074e-06, + "loss": 0.1444, + "mean_token_accuracy": 0.9671289598941804, + "num_tokens": 2263118935.0, + "step": 16000 + }, + { + "epoch": 0.52234664229049, + "eval_entropy": 1.4267909447352092, + "eval_loss": 0.16035768389701843, + "eval_mean_token_accuracy": 0.9642941602071127, + "eval_num_tokens": 2263118935.0, + "eval_runtime": 746.9288, + "eval_samples_per_second": 12.928, + "eval_steps_per_second": 0.102, + "step": 16000 + }, + { + "entropy": 1.4244341516494752, + "epoch": 0.5239789755476478, + "grad_norm": 1.4375, + "learning_rate": 5.292699729101888e-06, + "loss": 0.1341, + "mean_token_accuracy": 0.9685980105400085, + "num_tokens": 2270004148.0, + "step": 16050 + }, + { + "entropy": 1.433496663570404, + "epoch": 0.5256113088048056, + "grad_norm": 1.1875, + "learning_rate": 5.2874691981520814e-06, + "loss": 0.1489, + "mean_token_accuracy": 0.9664645993709564, + "num_tokens": 2277001614.0, + "step": 16100 + }, + { + "entropy": 1.4589428210258484, + "epoch": 0.5272436420619634, + "grad_norm": 1.390625, + "learning_rate": 5.282222003226528e-06, + "loss": 0.1494, + "mean_token_accuracy": 0.9667097711563111, + "num_tokens": 2283969486.0, + "step": 16150 + }, + { + "entropy": 1.4311462545394897, + "epoch": 0.5288759753191211, + "grad_norm": 1.1796875, + "learning_rate": 5.276958182550499e-06, + "loss": 0.1498, + "mean_token_accuracy": 0.966657601594925, + "num_tokens": 2291187492.0, + "step": 16200 + }, + { + "entropy": 1.4505498099327088, + "epoch": 0.530508308576279, + "grad_norm": 1.640625, + "learning_rate": 5.271677774470383e-06, + "loss": 0.1432, + "mean_token_accuracy": 0.9675734841823578, + "num_tokens": 2298489089.0, + "step": 16250 + }, + { + "entropy": 1.4610102701187133, + "epoch": 0.5321406418334367, + "grad_norm": 1.53125, + "learning_rate": 5.2663808174534035e-06, + "loss": 0.1528, + "mean_token_accuracy": 0.9655195689201355, + "num_tokens": 2306159999.0, + "step": 16300 + }, + { + "entropy": 1.4418502616882325, + "epoch": 0.5337729750905945, + "grad_norm": 1.578125, + "learning_rate": 5.261067350087342e-06, + "loss": 0.1448, + "mean_token_accuracy": 0.9679492330551147, + "num_tokens": 2313017319.0, + "step": 16350 + }, + { + "entropy": 1.4529797434806824, + "epoch": 0.5354053083477522, + "grad_norm": 2.0, + "learning_rate": 5.255737411080258e-06, + "loss": 0.1421, + "mean_token_accuracy": 0.9680870652198792, + "num_tokens": 2319830047.0, + "step": 16400 + }, + { + "entropy": 1.4405932116508484, + "epoch": 0.5370376416049101, + "grad_norm": 1.6328125, + "learning_rate": 5.250391039260203e-06, + "loss": 0.1341, + "mean_token_accuracy": 0.9692844843864441, + "num_tokens": 2326652556.0, + "step": 16450 + }, + { + "entropy": 1.4391892647743225, + "epoch": 0.5386699748620678, + "grad_norm": 1.203125, + "learning_rate": 5.245028273574943e-06, + "loss": 0.1455, + "mean_token_accuracy": 0.9678148710727692, + "num_tokens": 2333819381.0, + "step": 16500 + }, + { + "epoch": 0.5386699748620678, + "eval_entropy": 1.4535431814193727, + "eval_loss": 0.15979354083538055, + "eval_mean_token_accuracy": 0.9644315036137899, + "eval_num_tokens": 2333819381.0, + "eval_runtime": 744.9612, + "eval_samples_per_second": 12.962, + "eval_steps_per_second": 0.102, + "step": 16500 + }, + { + "entropy": 1.4589093685150147, + "epoch": 0.5403023081192256, + "grad_norm": 1.546875, + "learning_rate": 5.239649153091669e-06, + "loss": 0.1366, + "mean_token_accuracy": 0.9689966702461242, + "num_tokens": 2340621485.0, + "step": 16550 + }, + { + "entropy": 1.4447486186027527, + "epoch": 0.5419346413763834, + "grad_norm": 2.34375, + "learning_rate": 5.234253716996714e-06, + "loss": 0.1407, + "mean_token_accuracy": 0.9680201160907745, + "num_tokens": 2347557447.0, + "step": 16600 + }, + { + "entropy": 1.4582811617851257, + "epoch": 0.5435669746335412, + "grad_norm": 1.2734375, + "learning_rate": 5.228842004595271e-06, + "loss": 0.1416, + "mean_token_accuracy": 0.9680859756469726, + "num_tokens": 2354654075.0, + "step": 16650 + }, + { + "entropy": 1.4596582174301147, + "epoch": 0.5451993078906989, + "grad_norm": 1.78125, + "learning_rate": 5.223414055311104e-06, + "loss": 0.1456, + "mean_token_accuracy": 0.9669416832923889, + "num_tokens": 2361479872.0, + "step": 16700 + }, + { + "entropy": 1.455612142086029, + "epoch": 0.5468316411478568, + "grad_norm": 1.9296875, + "learning_rate": 5.217969908686259e-06, + "loss": 0.1494, + "mean_token_accuracy": 0.9662820196151733, + "num_tokens": 2368784116.0, + "step": 16750 + }, + { + "entropy": 1.437941801548004, + "epoch": 0.5484639744050145, + "grad_norm": 1.390625, + "learning_rate": 5.2125096043807805e-06, + "loss": 0.1351, + "mean_token_accuracy": 0.969438636302948, + "num_tokens": 2375699232.0, + "step": 16800 + }, + { + "entropy": 1.4474022889137268, + "epoch": 0.5500963076621723, + "grad_norm": 1.5390625, + "learning_rate": 5.2070331821724175e-06, + "loss": 0.1496, + "mean_token_accuracy": 0.9666418421268463, + "num_tokens": 2382571188.0, + "step": 16850 + }, + { + "entropy": 1.443734383583069, + "epoch": 0.55172864091933, + "grad_norm": 1.359375, + "learning_rate": 5.201540681956339e-06, + "loss": 0.1417, + "mean_token_accuracy": 0.9685157811641694, + "num_tokens": 2389499925.0, + "step": 16900 + }, + { + "entropy": 1.4289966011047364, + "epoch": 0.5533609741764879, + "grad_norm": 1.5078125, + "learning_rate": 5.196032143744837e-06, + "loss": 0.1502, + "mean_token_accuracy": 0.9662255728244782, + "num_tokens": 2396898569.0, + "step": 16950 + }, + { + "entropy": 1.422741391658783, + "epoch": 0.5549933074336456, + "grad_norm": 2.109375, + "learning_rate": 5.190507607667043e-06, + "loss": 0.1362, + "mean_token_accuracy": 0.9694574820995331, + "num_tokens": 2403493744.0, + "step": 17000 + }, + { + "epoch": 0.5549933074336456, + "eval_entropy": 1.410568381945292, + "eval_loss": 0.1593320667743683, + "eval_mean_token_accuracy": 0.9647167531649271, + "eval_num_tokens": 2403493744.0, + "eval_runtime": 749.6267, + "eval_samples_per_second": 12.881, + "eval_steps_per_second": 0.101, + "step": 17000 + }, + { + "entropy": 1.4179514050483704, + "epoch": 0.5566256406908034, + "grad_norm": 1.5078125, + "learning_rate": 5.184967113968628e-06, + "loss": 0.1437, + "mean_token_accuracy": 0.9673176133632659, + "num_tokens": 2410291888.0, + "step": 17050 + }, + { + "entropy": 1.420968849658966, + "epoch": 0.5582579739479612, + "grad_norm": 1.875, + "learning_rate": 5.179410703011514e-06, + "loss": 0.1416, + "mean_token_accuracy": 0.9681920135021209, + "num_tokens": 2417511664.0, + "step": 17100 + }, + { + "entropy": 1.4380729961395264, + "epoch": 0.559890307205119, + "grad_norm": 1.984375, + "learning_rate": 5.173838415273578e-06, + "loss": 0.1381, + "mean_token_accuracy": 0.9696194708347321, + "num_tokens": 2424287780.0, + "step": 17150 + }, + { + "entropy": 1.418275089263916, + "epoch": 0.5615226404622767, + "grad_norm": 1.796875, + "learning_rate": 5.168250291348358e-06, + "loss": 0.1307, + "mean_token_accuracy": 0.9706324160099029, + "num_tokens": 2431053313.0, + "step": 17200 + }, + { + "entropy": 1.4160829043388368, + "epoch": 0.5631549737194346, + "grad_norm": 1.1328125, + "learning_rate": 5.162646371944757e-06, + "loss": 0.139, + "mean_token_accuracy": 0.9683985018730163, + "num_tokens": 2438106375.0, + "step": 17250 + }, + { + "entropy": 1.400640585422516, + "epoch": 0.5647873069765923, + "grad_norm": 1.7421875, + "learning_rate": 5.157026697886745e-06, + "loss": 0.1367, + "mean_token_accuracy": 0.9683994352817535, + "num_tokens": 2445056771.0, + "step": 17300 + }, + { + "entropy": 1.4130174493789673, + "epoch": 0.5664196402337501, + "grad_norm": 1.3125, + "learning_rate": 5.151391310113067e-06, + "loss": 0.1459, + "mean_token_accuracy": 0.9672868931293488, + "num_tokens": 2452756923.0, + "step": 17350 + }, + { + "entropy": 1.4374343585968017, + "epoch": 0.5680519734909079, + "grad_norm": 1.7890625, + "learning_rate": 5.145740249676937e-06, + "loss": 0.1405, + "mean_token_accuracy": 0.9680230569839477, + "num_tokens": 2459581470.0, + "step": 17400 + }, + { + "entropy": 1.4422858119010926, + "epoch": 0.5696843067480657, + "grad_norm": 1.6640625, + "learning_rate": 5.140073557745743e-06, + "loss": 0.1365, + "mean_token_accuracy": 0.9697402846813202, + "num_tokens": 2466422672.0, + "step": 17450 + }, + { + "entropy": 1.430338339805603, + "epoch": 0.5713166400052234, + "grad_norm": 1.1875, + "learning_rate": 5.134391275600748e-06, + "loss": 0.1361, + "mean_token_accuracy": 0.9687436437606811, + "num_tokens": 2473543628.0, + "step": 17500 + }, + { + "epoch": 0.5713166400052234, + "eval_entropy": 1.4239939387639364, + "eval_loss": 0.15963123738765717, + "eval_mean_token_accuracy": 0.9646872194608053, + "eval_num_tokens": 2473543628.0, + "eval_runtime": 746.9116, + "eval_samples_per_second": 12.928, + "eval_steps_per_second": 0.102, + "step": 17500 + }, + { + "entropy": 1.41823988199234, + "epoch": 0.5729489732623813, + "grad_norm": 2.1875, + "learning_rate": 5.12869344463679e-06, + "loss": 0.1349, + "mean_token_accuracy": 0.969758038520813, + "num_tokens": 2480227038.0, + "step": 17550 + }, + { + "entropy": 1.423335657119751, + "epoch": 0.574581306519539, + "grad_norm": 1.3125, + "learning_rate": 5.122980106361973e-06, + "loss": 0.1427, + "mean_token_accuracy": 0.9678147983551025, + "num_tokens": 2487475967.0, + "step": 17600 + }, + { + "entropy": 1.4069186902046205, + "epoch": 0.5762136397766968, + "grad_norm": 1.171875, + "learning_rate": 5.117251302397376e-06, + "loss": 0.138, + "mean_token_accuracy": 0.9696054446697235, + "num_tokens": 2494547025.0, + "step": 17650 + }, + { + "entropy": 1.4386369252204896, + "epoch": 0.5778459730338545, + "grad_norm": 1.90625, + "learning_rate": 5.111507074476741e-06, + "loss": 0.1677, + "mean_token_accuracy": 0.9627214801311493, + "num_tokens": 2502024527.0, + "step": 17700 + }, + { + "entropy": 1.4271745777130127, + "epoch": 0.5794783062910124, + "grad_norm": 2.078125, + "learning_rate": 5.105747464446171e-06, + "loss": 0.1462, + "mean_token_accuracy": 0.9673130023479461, + "num_tokens": 2509136097.0, + "step": 17750 + }, + { + "entropy": 1.4218875932693482, + "epoch": 0.5811106395481701, + "grad_norm": 2.078125, + "learning_rate": 5.099972514263828e-06, + "loss": 0.1425, + "mean_token_accuracy": 0.9676541352272033, + "num_tokens": 2516010870.0, + "step": 17800 + }, + { + "entropy": 1.4141360521316528, + "epoch": 0.5827429728053279, + "grad_norm": 1.171875, + "learning_rate": 5.094182265999625e-06, + "loss": 0.144, + "mean_token_accuracy": 0.9672036898136139, + "num_tokens": 2523293871.0, + "step": 17850 + }, + { + "entropy": 1.4077139925956725, + "epoch": 0.5843753060624857, + "grad_norm": 1.375, + "learning_rate": 5.0883767618349205e-06, + "loss": 0.1419, + "mean_token_accuracy": 0.9684661495685577, + "num_tokens": 2530147747.0, + "step": 17900 + }, + { + "entropy": 1.407555968761444, + "epoch": 0.5860076393196435, + "grad_norm": 1.5078125, + "learning_rate": 5.082556044062209e-06, + "loss": 0.1277, + "mean_token_accuracy": 0.9704045712947845, + "num_tokens": 2536714168.0, + "step": 17950 + }, + { + "entropy": 1.4110585117340089, + "epoch": 0.5876399725768012, + "grad_norm": 1.3125, + "learning_rate": 5.0767201550848155e-06, + "loss": 0.1402, + "mean_token_accuracy": 0.9680717265605927, + "num_tokens": 2543453566.0, + "step": 18000 + }, + { + "epoch": 0.5876399725768012, + "eval_entropy": 1.3970834159851073, + "eval_loss": 0.15939612686634064, + "eval_mean_token_accuracy": 0.9646043960253398, + "eval_num_tokens": 2543453566.0, + "eval_runtime": 745.3104, + "eval_samples_per_second": 12.956, + "eval_steps_per_second": 0.102, + "step": 18000 + }, + { + "entropy": 1.405302128791809, + "epoch": 0.5892723058339591, + "grad_norm": 1.8203125, + "learning_rate": 5.070869137416586e-06, + "loss": 0.1392, + "mean_token_accuracy": 0.9686900818347931, + "num_tokens": 2550316340.0, + "step": 18050 + }, + { + "entropy": 1.409914915561676, + "epoch": 0.5909046390911168, + "grad_norm": 1.0703125, + "learning_rate": 5.065003033681577e-06, + "loss": 0.1464, + "mean_token_accuracy": 0.9667869770526886, + "num_tokens": 2557419151.0, + "step": 18100 + }, + { + "entropy": 1.4183743119239807, + "epoch": 0.5925369723482746, + "grad_norm": 1.7890625, + "learning_rate": 5.059121886613746e-06, + "loss": 0.1463, + "mean_token_accuracy": 0.9668680250644683, + "num_tokens": 2564861132.0, + "step": 18150 + }, + { + "entropy": 1.4198586773872375, + "epoch": 0.5941693056054324, + "grad_norm": 1.515625, + "learning_rate": 5.053225739056638e-06, + "loss": 0.1395, + "mean_token_accuracy": 0.9689326965808869, + "num_tokens": 2571587295.0, + "step": 18200 + }, + { + "entropy": 1.4410333514213562, + "epoch": 0.5958016388625902, + "grad_norm": 2.09375, + "learning_rate": 5.047314633963077e-06, + "loss": 0.142, + "mean_token_accuracy": 0.9676812827587128, + "num_tokens": 2578976320.0, + "step": 18250 + }, + { + "entropy": 1.4181599235534668, + "epoch": 0.5974339721197479, + "grad_norm": 1.46875, + "learning_rate": 5.04138861439485e-06, + "loss": 0.1388, + "mean_token_accuracy": 0.968032683134079, + "num_tokens": 2585791943.0, + "step": 18300 + }, + { + "entropy": 1.4222650122642517, + "epoch": 0.5990663053769058, + "grad_norm": 2.0, + "learning_rate": 5.0354477235223945e-06, + "loss": 0.1479, + "mean_token_accuracy": 0.9662945425510406, + "num_tokens": 2592969827.0, + "step": 18350 + }, + { + "entropy": 1.4107950401306153, + "epoch": 0.6006986386340635, + "grad_norm": 1.6171875, + "learning_rate": 5.029492004624484e-06, + "loss": 0.1495, + "mean_token_accuracy": 0.9669906544685364, + "num_tokens": 2600453000.0, + "step": 18400 + }, + { + "entropy": 1.4090588116645812, + "epoch": 0.6023309718912213, + "grad_norm": 2.578125, + "learning_rate": 5.023521501087913e-06, + "loss": 0.142, + "mean_token_accuracy": 0.9677951109409332, + "num_tokens": 2607819604.0, + "step": 18450 + }, + { + "entropy": 1.4200125312805176, + "epoch": 0.6039633051483791, + "grad_norm": 1.6875, + "learning_rate": 5.017536256407179e-06, + "loss": 0.1471, + "mean_token_accuracy": 0.9666409981250763, + "num_tokens": 2615067666.0, + "step": 18500 + }, + { + "epoch": 0.6039633051483791, + "eval_entropy": 1.4210956970850626, + "eval_loss": 0.15948741137981415, + "eval_mean_token_accuracy": 0.9648204270998637, + "eval_num_tokens": 2615067666.0, + "eval_runtime": 748.5938, + "eval_samples_per_second": 12.899, + "eval_steps_per_second": 0.102, + "step": 18500 + }, + { + "entropy": 1.4314108753204347, + "epoch": 0.6055956384055369, + "grad_norm": 1.421875, + "learning_rate": 5.011536314184171e-06, + "loss": 0.1353, + "mean_token_accuracy": 0.9691914403438568, + "num_tokens": 2621774246.0, + "step": 18550 + }, + { + "entropy": 1.438691704273224, + "epoch": 0.6072279716626947, + "grad_norm": 1.53125, + "learning_rate": 5.0055217181278435e-06, + "loss": 0.152, + "mean_token_accuracy": 0.9660729610919953, + "num_tokens": 2628968865.0, + "step": 18600 + }, + { + "entropy": 1.4327943706512452, + "epoch": 0.6088603049198524, + "grad_norm": 0.9453125, + "learning_rate": 4.999492512053904e-06, + "loss": 0.1429, + "mean_token_accuracy": 0.9679563343524933, + "num_tokens": 2636298361.0, + "step": 18650 + }, + { + "entropy": 1.419541118144989, + "epoch": 0.6104926381770103, + "grad_norm": 1.6953125, + "learning_rate": 4.993448739884496e-06, + "loss": 0.1365, + "mean_token_accuracy": 0.9690783095359802, + "num_tokens": 2643149114.0, + "step": 18700 + }, + { + "entropy": 1.433673801422119, + "epoch": 0.612124971434168, + "grad_norm": 1.3203125, + "learning_rate": 4.98739044564787e-06, + "loss": 0.1387, + "mean_token_accuracy": 0.9684871184825897, + "num_tokens": 2649909312.0, + "step": 18750 + }, + { + "entropy": 1.41679913520813, + "epoch": 0.6137573046913258, + "grad_norm": 1.859375, + "learning_rate": 4.9813176734780714e-06, + "loss": 0.1351, + "mean_token_accuracy": 0.9691712772846222, + "num_tokens": 2656772858.0, + "step": 18800 + }, + { + "entropy": 1.4235585713386536, + "epoch": 0.6153896379484836, + "grad_norm": 1.3125, + "learning_rate": 4.975230467614616e-06, + "loss": 0.1413, + "mean_token_accuracy": 0.9680274319648743, + "num_tokens": 2664404838.0, + "step": 18850 + }, + { + "entropy": 1.452963993549347, + "epoch": 0.6170219712056414, + "grad_norm": 1.4765625, + "learning_rate": 4.969128872402166e-06, + "loss": 0.1479, + "mean_token_accuracy": 0.9667314755916595, + "num_tokens": 2671712943.0, + "step": 18900 + }, + { + "entropy": 1.4436225032806396, + "epoch": 0.6186543044627991, + "grad_norm": 1.578125, + "learning_rate": 4.96301293229021e-06, + "loss": 0.1499, + "mean_token_accuracy": 0.9664954626560212, + "num_tokens": 2678603623.0, + "step": 18950 + }, + { + "entropy": 1.4565304446220397, + "epoch": 0.620286637719957, + "grad_norm": 2.0625, + "learning_rate": 4.9568826918327375e-06, + "loss": 0.1481, + "mean_token_accuracy": 0.9671436643600464, + "num_tokens": 2685887315.0, + "step": 19000 + }, + { + "epoch": 0.620286637719957, + "eval_entropy": 1.4394246594111124, + "eval_loss": 0.15968339145183563, + "eval_mean_token_accuracy": 0.9645526838302613, + "eval_num_tokens": 2685887315.0, + "eval_runtime": 743.8859, + "eval_samples_per_second": 12.98, + "eval_steps_per_second": 0.102, + "step": 19000 + }, + { + "entropy": 1.4509250116348267, + "epoch": 0.6219189709771147, + "grad_norm": 1.5078125, + "learning_rate": 4.950738195687914e-06, + "loss": 0.1478, + "mean_token_accuracy": 0.9675949096679688, + "num_tokens": 2692939859.0, + "step": 19050 + }, + { + "entropy": 1.4424961733818054, + "epoch": 0.6235513042342725, + "grad_norm": 1.5859375, + "learning_rate": 4.944579488617754e-06, + "loss": 0.1342, + "mean_token_accuracy": 0.9699901640415192, + "num_tokens": 2699651700.0, + "step": 19100 + }, + { + "entropy": 1.43824138879776, + "epoch": 0.6251836374914302, + "grad_norm": 1.6953125, + "learning_rate": 4.938406615487804e-06, + "loss": 0.1498, + "mean_token_accuracy": 0.9661287260055542, + "num_tokens": 2707480098.0, + "step": 19150 + }, + { + "entropy": 1.4361098647117614, + "epoch": 0.6268159707485881, + "grad_norm": 1.671875, + "learning_rate": 4.9322196212668e-06, + "loss": 0.1563, + "mean_token_accuracy": 0.9649386501312256, + "num_tokens": 2715153146.0, + "step": 19200 + }, + { + "entropy": 1.4300830745697022, + "epoch": 0.6284483040057458, + "grad_norm": 1.15625, + "learning_rate": 4.9260185510263546e-06, + "loss": 0.1363, + "mean_token_accuracy": 0.9690042114257813, + "num_tokens": 2722322826.0, + "step": 19250 + }, + { + "entropy": 1.423244218826294, + "epoch": 0.6300806372629036, + "grad_norm": 2.140625, + "learning_rate": 4.919803449940621e-06, + "loss": 0.1253, + "mean_token_accuracy": 0.9720934724807739, + "num_tokens": 2728961876.0, + "step": 19300 + }, + { + "entropy": 1.4357132482528687, + "epoch": 0.6317129705200614, + "grad_norm": 1.2265625, + "learning_rate": 4.913574363285965e-06, + "loss": 0.1451, + "mean_token_accuracy": 0.967208684682846, + "num_tokens": 2735922821.0, + "step": 19350 + }, + { + "entropy": 1.4567182993888854, + "epoch": 0.6333453037772192, + "grad_norm": 1.1953125, + "learning_rate": 4.907331336440637e-06, + "loss": 0.1533, + "mean_token_accuracy": 0.9658544027805328, + "num_tokens": 2743184778.0, + "step": 19400 + }, + { + "entropy": 1.4356631135940552, + "epoch": 0.6349776370343769, + "grad_norm": 1.15625, + "learning_rate": 4.9010744148844414e-06, + "loss": 0.1392, + "mean_token_accuracy": 0.9687076592445374, + "num_tokens": 2750183248.0, + "step": 19450 + }, + { + "entropy": 1.4388001561164856, + "epoch": 0.6366099702915348, + "grad_norm": 1.4296875, + "learning_rate": 4.8948036441984e-06, + "loss": 0.1408, + "mean_token_accuracy": 0.9690019035339356, + "num_tokens": 2757183194.0, + "step": 19500 + }, + { + "epoch": 0.6366099702915348, + "eval_entropy": 1.4425183471043905, + "eval_loss": 0.15926624834537506, + "eval_mean_token_accuracy": 0.9642885835965475, + "eval_num_tokens": 2757183194.0, + "eval_runtime": 747.7266, + "eval_samples_per_second": 12.914, + "eval_steps_per_second": 0.102, + "step": 19500 + }, + { + "entropy": 1.4483740854263305, + "epoch": 0.6382423035486925, + "grad_norm": 1.59375, + "learning_rate": 4.888519070064427e-06, + "loss": 0.1424, + "mean_token_accuracy": 0.9682291400432587, + "num_tokens": 2764192867.0, + "step": 19550 + }, + { + "entropy": 1.4348323988914489, + "epoch": 0.6398746368058503, + "grad_norm": 1.9375, + "learning_rate": 4.882220738264994e-06, + "loss": 0.1485, + "mean_token_accuracy": 0.9667525351047516, + "num_tokens": 2771378704.0, + "step": 19600 + }, + { + "entropy": 1.431165406703949, + "epoch": 0.641506970063008, + "grad_norm": 2.03125, + "learning_rate": 4.875908694682793e-06, + "loss": 0.1387, + "mean_token_accuracy": 0.9686442255973816, + "num_tokens": 2778817459.0, + "step": 19650 + }, + { + "entropy": 1.4262248253822327, + "epoch": 0.6431393033201659, + "grad_norm": 1.9453125, + "learning_rate": 4.869582985300409e-06, + "loss": 0.1443, + "mean_token_accuracy": 0.9666674077510834, + "num_tokens": 2785809690.0, + "step": 19700 + }, + { + "entropy": 1.4198060846328735, + "epoch": 0.6447716365773236, + "grad_norm": 1.34375, + "learning_rate": 4.8632436561999754e-06, + "loss": 0.141, + "mean_token_accuracy": 0.9685878479480743, + "num_tokens": 2792972553.0, + "step": 19750 + }, + { + "entropy": 1.42301029920578, + "epoch": 0.6464039698344815, + "grad_norm": 1.234375, + "learning_rate": 4.85689075356285e-06, + "loss": 0.1323, + "mean_token_accuracy": 0.9700925529003144, + "num_tokens": 2799755524.0, + "step": 19800 + }, + { + "entropy": 1.407839720249176, + "epoch": 0.6480363030916392, + "grad_norm": 1.2578125, + "learning_rate": 4.850524323669266e-06, + "loss": 0.144, + "mean_token_accuracy": 0.9683354413509369, + "num_tokens": 2806934638.0, + "step": 19850 + }, + { + "entropy": 1.4161179232597352, + "epoch": 0.649668636348797, + "grad_norm": 1.5546875, + "learning_rate": 4.844144412898006e-06, + "loss": 0.1468, + "mean_token_accuracy": 0.9669712007045745, + "num_tokens": 2814052906.0, + "step": 19900 + }, + { + "entropy": 1.4124717712402344, + "epoch": 0.6513009696059547, + "grad_norm": 1.5390625, + "learning_rate": 4.83775106772606e-06, + "loss": 0.1368, + "mean_token_accuracy": 0.9689941215515137, + "num_tokens": 2820334438.0, + "step": 19950 + }, + { + "entropy": 1.4184478282928468, + "epoch": 0.6529333028631126, + "grad_norm": 2.0625, + "learning_rate": 4.8313443347282805e-06, + "loss": 0.1377, + "mean_token_accuracy": 0.9692059123516082, + "num_tokens": 2826961099.0, + "step": 20000 + }, + { + "epoch": 0.6529333028631126, + "eval_entropy": 1.4259551413853964, + "eval_loss": 0.15875579416751862, + "eval_mean_token_accuracy": 0.9646251447995504, + "eval_num_tokens": 2826961099.0, + "eval_runtime": 744.3635, + "eval_samples_per_second": 12.972, + "eval_steps_per_second": 0.102, + "step": 20000 + }, + { + "entropy": 1.4160712957382202, + "epoch": 0.6545656361202703, + "grad_norm": 1.484375, + "learning_rate": 4.824924260577056e-06, + "loss": 0.1377, + "mean_token_accuracy": 0.9691171681880951, + "num_tokens": 2834147273.0, + "step": 20050 + }, + { + "entropy": 1.4152118015289306, + "epoch": 0.6561979693774281, + "grad_norm": 1.109375, + "learning_rate": 4.818490892041959e-06, + "loss": 0.1347, + "mean_token_accuracy": 0.9702432310581207, + "num_tokens": 2840408479.0, + "step": 20100 + }, + { + "entropy": 1.4018067622184753, + "epoch": 0.6578303026345859, + "grad_norm": 1.1953125, + "learning_rate": 4.81204427598941e-06, + "loss": 0.1227, + "mean_token_accuracy": 0.9716095530986786, + "num_tokens": 2847100048.0, + "step": 20150 + }, + { + "entropy": 1.4273248219490051, + "epoch": 0.6594626358917437, + "grad_norm": 1.28125, + "learning_rate": 4.805584459382342e-06, + "loss": 0.144, + "mean_token_accuracy": 0.9675836896896363, + "num_tokens": 2853932358.0, + "step": 20200 + }, + { + "entropy": 1.431253423690796, + "epoch": 0.6610949691489014, + "grad_norm": 1.8671875, + "learning_rate": 4.799111489279844e-06, + "loss": 0.1403, + "mean_token_accuracy": 0.9687580478191375, + "num_tokens": 2860669435.0, + "step": 20250 + }, + { + "entropy": 1.4074326467514038, + "epoch": 0.6627273024060593, + "grad_norm": 1.375, + "learning_rate": 4.792625412836835e-06, + "loss": 0.136, + "mean_token_accuracy": 0.9695438253879547, + "num_tokens": 2867505726.0, + "step": 20300 + }, + { + "entropy": 1.4165804195404053, + "epoch": 0.664359635663217, + "grad_norm": 1.296875, + "learning_rate": 4.786126277303707e-06, + "loss": 0.1459, + "mean_token_accuracy": 0.9678076064586639, + "num_tokens": 2874388589.0, + "step": 20350 + }, + { + "entropy": 1.405945236682892, + "epoch": 0.6659919689203748, + "grad_norm": 1.7109375, + "learning_rate": 4.779614130025989e-06, + "loss": 0.1434, + "mean_token_accuracy": 0.9678366994857788, + "num_tokens": 2881624440.0, + "step": 20400 + }, + { + "entropy": 1.4089183855056762, + "epoch": 0.6676243021775325, + "grad_norm": 1.0546875, + "learning_rate": 4.7730890184439984e-06, + "loss": 0.1402, + "mean_token_accuracy": 0.9682434296607971, + "num_tokens": 2888818613.0, + "step": 20450 + }, + { + "entropy": 1.4186338901519775, + "epoch": 0.6692566354346904, + "grad_norm": 2.375, + "learning_rate": 4.766550990092494e-06, + "loss": 0.1501, + "mean_token_accuracy": 0.9658879733085632, + "num_tokens": 2895674916.0, + "step": 20500 + }, + { + "epoch": 0.6692566354346904, + "eval_entropy": 1.4077574555079142, + "eval_loss": 0.15949369966983795, + "eval_mean_token_accuracy": 0.9645699612299601, + "eval_num_tokens": 2895674916.0, + "eval_runtime": 751.7816, + "eval_samples_per_second": 12.844, + "eval_steps_per_second": 0.101, + "step": 20500 + }, + { + "entropy": 1.4104346489906312, + "epoch": 0.6708889686918481, + "grad_norm": 2.1875, + "learning_rate": 4.760000092600337e-06, + "loss": 0.1379, + "mean_token_accuracy": 0.969134624004364, + "num_tokens": 2902349294.0, + "step": 20550 + }, + { + "entropy": 1.4055168724060059, + "epoch": 0.672521301949006, + "grad_norm": 1.8359375, + "learning_rate": 4.7534363736901334e-06, + "loss": 0.1305, + "mean_token_accuracy": 0.9709035861492157, + "num_tokens": 2909212953.0, + "step": 20600 + }, + { + "entropy": 1.4179642391204834, + "epoch": 0.6741536352061637, + "grad_norm": 1.3515625, + "learning_rate": 4.746859881177895e-06, + "loss": 0.1462, + "mean_token_accuracy": 0.9668932211399078, + "num_tokens": 2916418343.0, + "step": 20650 + }, + { + "entropy": 1.3955682134628296, + "epoch": 0.6757859684633215, + "grad_norm": 2.03125, + "learning_rate": 4.7402706629726884e-06, + "loss": 0.1335, + "mean_token_accuracy": 0.9696724009513855, + "num_tokens": 2923371196.0, + "step": 20700 + }, + { + "entropy": 1.4021626067161561, + "epoch": 0.6774183017204792, + "grad_norm": 1.6875, + "learning_rate": 4.733668767076282e-06, + "loss": 0.1525, + "mean_token_accuracy": 0.9660227704048157, + "num_tokens": 2930542072.0, + "step": 20750 + }, + { + "entropy": 1.3913014960289, + "epoch": 0.6790506349776371, + "grad_norm": 1.453125, + "learning_rate": 4.727054241582805e-06, + "loss": 0.1393, + "mean_token_accuracy": 0.968789451122284, + "num_tokens": 2937522853.0, + "step": 20800 + }, + { + "entropy": 1.3855742335319519, + "epoch": 0.6806829682347948, + "grad_norm": 1.90625, + "learning_rate": 4.720427134678388e-06, + "loss": 0.1295, + "mean_token_accuracy": 0.9704008960723877, + "num_tokens": 2944112485.0, + "step": 20850 + }, + { + "entropy": 1.3942367219924927, + "epoch": 0.6823153014919526, + "grad_norm": 1.3828125, + "learning_rate": 4.713787494640818e-06, + "loss": 0.1414, + "mean_token_accuracy": 0.9685467886924743, + "num_tokens": 2951083570.0, + "step": 20900 + }, + { + "entropy": 1.3912032508850098, + "epoch": 0.6839476347491104, + "grad_norm": 1.4609375, + "learning_rate": 4.707135369839182e-06, + "loss": 0.1443, + "mean_token_accuracy": 0.9680955350399018, + "num_tokens": 2958496819.0, + "step": 20950 + }, + { + "entropy": 1.3916519379615784, + "epoch": 0.6855799680062682, + "grad_norm": 2.15625, + "learning_rate": 4.70047080873352e-06, + "loss": 0.1419, + "mean_token_accuracy": 0.9678064894676208, + "num_tokens": 2965544584.0, + "step": 21000 + }, + { + "epoch": 0.6855799680062682, + "eval_entropy": 1.3711633268992107, + "eval_loss": 0.15859009325504303, + "eval_mean_token_accuracy": 0.9645708012580871, + "eval_num_tokens": 2965544584.0, + "eval_runtime": 749.4291, + "eval_samples_per_second": 12.884, + "eval_steps_per_second": 0.101, + "step": 21000 + }, + { + "entropy": 1.3885661768913269, + "epoch": 0.6872123012634259, + "grad_norm": 1.546875, + "learning_rate": 4.693793859874469e-06, + "loss": 0.1389, + "mean_token_accuracy": 0.968884084224701, + "num_tokens": 2972445388.0, + "step": 21050 + }, + { + "entropy": 1.3977628707885743, + "epoch": 0.6888446345205838, + "grad_norm": 1.1796875, + "learning_rate": 4.687104571902907e-06, + "loss": 0.153, + "mean_token_accuracy": 0.9655460596084595, + "num_tokens": 2979970737.0, + "step": 21100 + }, + { + "entropy": 1.3903096175193788, + "epoch": 0.6904769677777415, + "grad_norm": 2.21875, + "learning_rate": 4.680402993549603e-06, + "loss": 0.1434, + "mean_token_accuracy": 0.9677474045753479, + "num_tokens": 2986677415.0, + "step": 21150 + }, + { + "entropy": 1.404201271533966, + "epoch": 0.6921093010348993, + "grad_norm": 1.7421875, + "learning_rate": 4.673689173634861e-06, + "loss": 0.1581, + "mean_token_accuracy": 0.9651612138748169, + "num_tokens": 2994298485.0, + "step": 21200 + }, + { + "entropy": 1.3858560705184937, + "epoch": 0.693741634292057, + "grad_norm": 1.828125, + "learning_rate": 4.666963161068162e-06, + "loss": 0.1459, + "mean_token_accuracy": 0.967083740234375, + "num_tokens": 3001141837.0, + "step": 21250 + }, + { + "entropy": 1.3804667377471924, + "epoch": 0.6953739675492149, + "grad_norm": 1.6796875, + "learning_rate": 4.660225004847808e-06, + "loss": 0.1464, + "mean_token_accuracy": 0.9659059000015259, + "num_tokens": 3008640955.0, + "step": 21300 + }, + { + "entropy": 1.3600660967826843, + "epoch": 0.6970063008063726, + "grad_norm": 1.6953125, + "learning_rate": 4.65347475406057e-06, + "loss": 0.1309, + "mean_token_accuracy": 0.9700274157524109, + "num_tokens": 3015031696.0, + "step": 21350 + }, + { + "entropy": 1.363333306312561, + "epoch": 0.6986386340635304, + "grad_norm": 1.4453125, + "learning_rate": 4.646712457881323e-06, + "loss": 0.1455, + "mean_token_accuracy": 0.9672285616397858, + "num_tokens": 3022491104.0, + "step": 21400 + }, + { + "entropy": 1.3792692565917968, + "epoch": 0.7002709673206882, + "grad_norm": 2.234375, + "learning_rate": 4.639938165572694e-06, + "loss": 0.1365, + "mean_token_accuracy": 0.9694170689582825, + "num_tokens": 3029347619.0, + "step": 21450 + }, + { + "entropy": 1.3687973999977112, + "epoch": 0.701903300577846, + "grad_norm": 1.171875, + "learning_rate": 4.6331519264847e-06, + "loss": 0.1425, + "mean_token_accuracy": 0.9671109592914582, + "num_tokens": 3036384532.0, + "step": 21500 + }, + { + "epoch": 0.701903300577846, + "eval_entropy": 1.377278790473938, + "eval_loss": 0.15810321271419525, + "eval_mean_token_accuracy": 0.9647063970565796, + "eval_num_tokens": 3036384532.0, + "eval_runtime": 747.7284, + "eval_samples_per_second": 12.914, + "eval_steps_per_second": 0.102, + "step": 21500 + }, + { + "entropy": 1.3783918046951293, + "epoch": 0.7035356338350037, + "grad_norm": 1.625, + "learning_rate": 4.626353790054387e-06, + "loss": 0.1487, + "mean_token_accuracy": 0.967153193950653, + "num_tokens": 3043409455.0, + "step": 21550 + }, + { + "entropy": 1.3809619188308715, + "epoch": 0.7051679670921616, + "grad_norm": 1.078125, + "learning_rate": 4.619543805805475e-06, + "loss": 0.145, + "mean_token_accuracy": 0.9672031688690186, + "num_tokens": 3050598353.0, + "step": 21600 + }, + { + "entropy": 1.3623022150993347, + "epoch": 0.7068003003493193, + "grad_norm": 1.5, + "learning_rate": 4.612722023347991e-06, + "loss": 0.1383, + "mean_token_accuracy": 0.9682861661911011, + "num_tokens": 3057976676.0, + "step": 21650 + }, + { + "entropy": 1.361666476726532, + "epoch": 0.7084326336064771, + "grad_norm": 1.4921875, + "learning_rate": 4.6058884923779135e-06, + "loss": 0.143, + "mean_token_accuracy": 0.9676965260505677, + "num_tokens": 3065507507.0, + "step": 21700 + }, + { + "entropy": 1.3833092284202575, + "epoch": 0.7100649668636348, + "grad_norm": 1.84375, + "learning_rate": 4.599043262676806e-06, + "loss": 0.1443, + "mean_token_accuracy": 0.9680466854572296, + "num_tokens": 3072550373.0, + "step": 21750 + }, + { + "entropy": 1.386215124130249, + "epoch": 0.7116973001207927, + "grad_norm": 2.015625, + "learning_rate": 4.592186384111457e-06, + "loss": 0.1457, + "mean_token_accuracy": 0.9672110736370086, + "num_tokens": 3079372014.0, + "step": 21800 + }, + { + "entropy": 1.3695436239242553, + "epoch": 0.7133296333779504, + "grad_norm": 1.6640625, + "learning_rate": 4.585317906633516e-06, + "loss": 0.1291, + "mean_token_accuracy": 0.9703351008892059, + "num_tokens": 3086062214.0, + "step": 21850 + }, + { + "entropy": 1.3959812355041503, + "epoch": 0.7149619666351082, + "grad_norm": 1.8046875, + "learning_rate": 4.578437880279126e-06, + "loss": 0.1269, + "mean_token_accuracy": 0.9709628915786743, + "num_tokens": 3092573548.0, + "step": 21900 + }, + { + "entropy": 1.3880902981758119, + "epoch": 0.716594299892266, + "grad_norm": 1.984375, + "learning_rate": 4.571546355168567e-06, + "loss": 0.1414, + "mean_token_accuracy": 0.9681686234474182, + "num_tokens": 3099550809.0, + "step": 21950 + }, + { + "entropy": 1.40864750623703, + "epoch": 0.7182266331494238, + "grad_norm": 1.1953125, + "learning_rate": 4.564643381505886e-06, + "loss": 0.1468, + "mean_token_accuracy": 0.9665729129314422, + "num_tokens": 3106864257.0, + "step": 22000 + }, + { + "epoch": 0.7182266331494238, + "eval_entropy": 1.3993131558100382, + "eval_loss": 0.15842117369174957, + "eval_mean_token_accuracy": 0.9647385891278585, + "eval_num_tokens": 3106864257.0, + "eval_runtime": 751.7197, + "eval_samples_per_second": 12.845, + "eval_steps_per_second": 0.101, + "step": 22000 + }, + { + "entropy": 1.3926187753677368, + "epoch": 0.7198589664065815, + "grad_norm": 1.5234375, + "learning_rate": 4.557729009578527e-06, + "loss": 0.14, + "mean_token_accuracy": 0.9683624911308288, + "num_tokens": 3113827777.0, + "step": 22050 + }, + { + "entropy": 1.3936706256866456, + "epoch": 0.7214912996637394, + "grad_norm": 1.828125, + "learning_rate": 4.550803289756973e-06, + "loss": 0.1333, + "mean_token_accuracy": 0.9700661396980286, + "num_tokens": 3120906575.0, + "step": 22100 + }, + { + "entropy": 1.3945861506462096, + "epoch": 0.7231236329208971, + "grad_norm": 1.1953125, + "learning_rate": 4.543866272494375e-06, + "loss": 0.1535, + "mean_token_accuracy": 0.9649194324016571, + "num_tokens": 3128776941.0, + "step": 22150 + }, + { + "entropy": 1.4023722219467163, + "epoch": 0.7247559661780549, + "grad_norm": 1.125, + "learning_rate": 4.536918008326183e-06, + "loss": 0.137, + "mean_token_accuracy": 0.9688715541362762, + "num_tokens": 3135678296.0, + "step": 22200 + }, + { + "entropy": 1.4066932153701783, + "epoch": 0.7263882994352127, + "grad_norm": 1.6015625, + "learning_rate": 4.529958547869781e-06, + "loss": 0.1373, + "mean_token_accuracy": 0.968953766822815, + "num_tokens": 3142368713.0, + "step": 22250 + }, + { + "entropy": 1.4128772020339966, + "epoch": 0.7280206326923705, + "grad_norm": 2.125, + "learning_rate": 4.5229879418241155e-06, + "loss": 0.1431, + "mean_token_accuracy": 0.9682337057590484, + "num_tokens": 3149212589.0, + "step": 22300 + }, + { + "entropy": 1.4088830590248107, + "epoch": 0.7296529659495282, + "grad_norm": 1.9609375, + "learning_rate": 4.516006240969329e-06, + "loss": 0.1481, + "mean_token_accuracy": 0.9669530403614044, + "num_tokens": 3156794748.0, + "step": 22350 + }, + { + "entropy": 1.41916588306427, + "epoch": 0.7312852992066861, + "grad_norm": 1.1484375, + "learning_rate": 4.509013496166387e-06, + "loss": 0.1429, + "mean_token_accuracy": 0.9679375386238098, + "num_tokens": 3164068091.0, + "step": 22400 + }, + { + "entropy": 1.408693754673004, + "epoch": 0.7329176324638438, + "grad_norm": 1.078125, + "learning_rate": 4.5020097583567104e-06, + "loss": 0.1324, + "mean_token_accuracy": 0.9701212620735169, + "num_tokens": 3170772040.0, + "step": 22450 + }, + { + "entropy": 1.4141771841049193, + "epoch": 0.7345499657210016, + "grad_norm": 1.484375, + "learning_rate": 4.4949950785618025e-06, + "loss": 0.1394, + "mean_token_accuracy": 0.9690138208866119, + "num_tokens": 3177787601.0, + "step": 22500 + }, + { + "epoch": 0.7345499657210016, + "eval_entropy": 1.405626532236735, + "eval_loss": 0.1584155559539795, + "eval_mean_token_accuracy": 0.9649723815917969, + "eval_num_tokens": 3177787601.0, + "eval_runtime": 749.0195, + "eval_samples_per_second": 12.892, + "eval_steps_per_second": 0.101, + "step": 22500 + }, + { + "entropy": 1.409652578830719, + "epoch": 0.7361822989781593, + "grad_norm": 1.515625, + "learning_rate": 4.4879695078828765e-06, + "loss": 0.1447, + "mean_token_accuracy": 0.9667559254169464, + "num_tokens": 3184597089.0, + "step": 22550 + }, + { + "entropy": 1.3969360828399657, + "epoch": 0.7378146322353172, + "grad_norm": 1.8203125, + "learning_rate": 4.480933097500489e-06, + "loss": 0.1261, + "mean_token_accuracy": 0.9719417309761047, + "num_tokens": 3191019437.0, + "step": 22600 + }, + { + "entropy": 1.4107865738868712, + "epoch": 0.7394469654924749, + "grad_norm": 2.28125, + "learning_rate": 4.473885898674155e-06, + "loss": 0.1444, + "mean_token_accuracy": 0.9674191176891327, + "num_tokens": 3198108407.0, + "step": 22650 + }, + { + "entropy": 1.4059437608718872, + "epoch": 0.7410792987496327, + "grad_norm": 1.71875, + "learning_rate": 4.4668279627419904e-06, + "loss": 0.1408, + "mean_token_accuracy": 0.9680229306221009, + "num_tokens": 3205399672.0, + "step": 22700 + }, + { + "entropy": 1.3878255271911621, + "epoch": 0.7427116320067905, + "grad_norm": 1.3671875, + "learning_rate": 4.459759341120323e-06, + "loss": 0.1355, + "mean_token_accuracy": 0.9695122539997101, + "num_tokens": 3212086664.0, + "step": 22750 + }, + { + "entropy": 1.3905278396606446, + "epoch": 0.7443439652639483, + "grad_norm": 1.2265625, + "learning_rate": 4.452680085303331e-06, + "loss": 0.1335, + "mean_token_accuracy": 0.9699708425998688, + "num_tokens": 3219061680.0, + "step": 22800 + }, + { + "entropy": 1.392570481300354, + "epoch": 0.745976298521106, + "grad_norm": 2.0, + "learning_rate": 4.445590246862656e-06, + "loss": 0.1348, + "mean_token_accuracy": 0.9696343839168549, + "num_tokens": 3225449142.0, + "step": 22850 + }, + { + "entropy": 1.3988615465164185, + "epoch": 0.7476086317782639, + "grad_norm": 1.4296875, + "learning_rate": 4.438489877447037e-06, + "loss": 0.153, + "mean_token_accuracy": 0.9651832461357117, + "num_tokens": 3232829699.0, + "step": 22900 + }, + { + "entropy": 1.3806819915771484, + "epoch": 0.7492409650354216, + "grad_norm": 1.5546875, + "learning_rate": 4.431379028781927e-06, + "loss": 0.1391, + "mean_token_accuracy": 0.9683671975135804, + "num_tokens": 3239549563.0, + "step": 22950 + }, + { + "entropy": 1.3997270607948302, + "epoch": 0.7508732982925794, + "grad_norm": 1.5703125, + "learning_rate": 4.424257752669122e-06, + "loss": 0.1504, + "mean_token_accuracy": 0.9661344397068024, + "num_tokens": 3246842815.0, + "step": 23000 + }, + { + "epoch": 0.7508732982925794, + "eval_entropy": 1.3884711440404256, + "eval_loss": 0.1580066680908203, + "eval_mean_token_accuracy": 0.9648262977600097, + "eval_num_tokens": 3246842815.0, + "eval_runtime": 751.2443, + "eval_samples_per_second": 12.853, + "eval_steps_per_second": 0.101, + "step": 23000 + }, + { + "entropy": 1.386196925640106, + "epoch": 0.7525056315497372, + "grad_norm": 1.7578125, + "learning_rate": 4.417126100986378e-06, + "loss": 0.1349, + "mean_token_accuracy": 0.9699360942840576, + "num_tokens": 3253700706.0, + "step": 23050 + }, + { + "entropy": 1.3982844924926758, + "epoch": 0.754137964806895, + "grad_norm": 2.484375, + "learning_rate": 4.409984125687039e-06, + "loss": 0.1469, + "mean_token_accuracy": 0.9673852014541626, + "num_tokens": 3260955234.0, + "step": 23100 + }, + { + "entropy": 1.4037910151481627, + "epoch": 0.7557702980640527, + "grad_norm": 1.5, + "learning_rate": 4.402831878799652e-06, + "loss": 0.1442, + "mean_token_accuracy": 0.9678021275997162, + "num_tokens": 3267932211.0, + "step": 23150 + }, + { + "entropy": 1.4121773409843446, + "epoch": 0.7574026313212106, + "grad_norm": 1.609375, + "learning_rate": 4.395669412427596e-06, + "loss": 0.1335, + "mean_token_accuracy": 0.9696631526947022, + "num_tokens": 3274869428.0, + "step": 23200 + }, + { + "entropy": 1.4119772076606751, + "epoch": 0.7590349645783683, + "grad_norm": 1.78125, + "learning_rate": 4.388496778748694e-06, + "loss": 0.1349, + "mean_token_accuracy": 0.969488970041275, + "num_tokens": 3281666389.0, + "step": 23250 + }, + { + "entropy": 1.3851684546470642, + "epoch": 0.7606672978355261, + "grad_norm": 1.078125, + "learning_rate": 4.381314030014837e-06, + "loss": 0.1419, + "mean_token_accuracy": 0.96839430809021, + "num_tokens": 3289075580.0, + "step": 23300 + }, + { + "entropy": 1.407478768825531, + "epoch": 0.7622996310926838, + "grad_norm": 2.046875, + "learning_rate": 4.374121218551606e-06, + "loss": 0.158, + "mean_token_accuracy": 0.9645157742500305, + "num_tokens": 3296401618.0, + "step": 23350 + }, + { + "entropy": 1.4000224781036377, + "epoch": 0.7639319643498417, + "grad_norm": 1.25, + "learning_rate": 4.366918396757886e-06, + "loss": 0.139, + "mean_token_accuracy": 0.9687736296653747, + "num_tokens": 3303532934.0, + "step": 23400 + }, + { + "entropy": 1.4132404017448426, + "epoch": 0.7655642976069994, + "grad_norm": 1.90625, + "learning_rate": 4.359705617105485e-06, + "loss": 0.1464, + "mean_token_accuracy": 0.9665714311599731, + "num_tokens": 3310772502.0, + "step": 23450 + }, + { + "entropy": 1.409326949119568, + "epoch": 0.7671966308641572, + "grad_norm": 1.0078125, + "learning_rate": 4.352482932138756e-06, + "loss": 0.1475, + "mean_token_accuracy": 0.9669658172130585, + "num_tokens": 3318251468.0, + "step": 23500 + }, + { + "epoch": 0.7671966308641572, + "eval_entropy": 1.3898699220021566, + "eval_loss": 0.15739725530147552, + "eval_mean_token_accuracy": 0.9649832367897033, + "eval_num_tokens": 3318251468.0, + "eval_runtime": 743.2592, + "eval_samples_per_second": 12.991, + "eval_steps_per_second": 0.102, + "step": 23500 + }, + { + "entropy": 1.3853822755813598, + "epoch": 0.768828964121315, + "grad_norm": 1.4375, + "learning_rate": 4.345250394474207e-06, + "loss": 0.1417, + "mean_token_accuracy": 0.968293867111206, + "num_tokens": 3325415082.0, + "step": 23550 + }, + { + "entropy": 1.3828247284889221, + "epoch": 0.7704612973784728, + "grad_norm": 1.3203125, + "learning_rate": 4.338008056800126e-06, + "loss": 0.1408, + "mean_token_accuracy": 0.9685234224796295, + "num_tokens": 3332121947.0, + "step": 23600 + }, + { + "entropy": 1.4080828213691712, + "epoch": 0.7720936306356305, + "grad_norm": 1.546875, + "learning_rate": 4.330755971876192e-06, + "loss": 0.1464, + "mean_token_accuracy": 0.9673770892620087, + "num_tokens": 3339236872.0, + "step": 23650 + }, + { + "entropy": 1.399913146495819, + "epoch": 0.7737259638927884, + "grad_norm": 1.90625, + "learning_rate": 4.3234941925330915e-06, + "loss": 0.144, + "mean_token_accuracy": 0.9670092570781708, + "num_tokens": 3346331383.0, + "step": 23700 + }, + { + "entropy": 1.391896095275879, + "epoch": 0.7753582971499461, + "grad_norm": 1.3125, + "learning_rate": 4.316222771672132e-06, + "loss": 0.1356, + "mean_token_accuracy": 0.9692227625846863, + "num_tokens": 3353234181.0, + "step": 23750 + }, + { + "entropy": 1.4092794895172118, + "epoch": 0.7769906304071039, + "grad_norm": 1.53125, + "learning_rate": 4.3089417622648605e-06, + "loss": 0.1451, + "mean_token_accuracy": 0.9670332086086273, + "num_tokens": 3360224816.0, + "step": 23800 + }, + { + "entropy": 1.4031621408462525, + "epoch": 0.7786229636642616, + "grad_norm": 1.0859375, + "learning_rate": 4.301651217352674e-06, + "loss": 0.1412, + "mean_token_accuracy": 0.9681469559669494, + "num_tokens": 3367296325.0, + "step": 23850 + }, + { + "entropy": 1.4203896260261535, + "epoch": 0.7802552969214195, + "grad_norm": 1.484375, + "learning_rate": 4.294351190046439e-06, + "loss": 0.1433, + "mean_token_accuracy": 0.9682426953315735, + "num_tokens": 3373747388.0, + "step": 23900 + }, + { + "entropy": 1.412433443069458, + "epoch": 0.7818876301785772, + "grad_norm": 1.7421875, + "learning_rate": 4.2870417335260925e-06, + "loss": 0.1459, + "mean_token_accuracy": 0.9671615362167358, + "num_tokens": 3380836126.0, + "step": 23950 + }, + { + "entropy": 1.4091150188446044, + "epoch": 0.783519963435735, + "grad_norm": 1.1953125, + "learning_rate": 4.2797229010402695e-06, + "loss": 0.1459, + "mean_token_accuracy": 0.9665989732742309, + "num_tokens": 3388112439.0, + "step": 24000 + }, + { + "epoch": 0.783519963435735, + "eval_entropy": 1.3812917121251425, + "eval_loss": 0.15769919753074646, + "eval_mean_token_accuracy": 0.964633092880249, + "eval_num_tokens": 3388112439.0, + "eval_runtime": 743.2853, + "eval_samples_per_second": 12.991, + "eval_steps_per_second": 0.102, + "step": 24000 + }, + { + "entropy": 1.3875314927101134, + "epoch": 0.7851522966928928, + "grad_norm": 1.515625, + "learning_rate": 4.272394745905904e-06, + "loss": 0.1394, + "mean_token_accuracy": 0.9680357229709625, + "num_tokens": 3395153920.0, + "step": 24050 + }, + { + "entropy": 1.3935593914985658, + "epoch": 0.7867846299500506, + "grad_norm": 1.609375, + "learning_rate": 4.265057321507848e-06, + "loss": 0.1223, + "mean_token_accuracy": 0.9719667887687683, + "num_tokens": 3401608036.0, + "step": 24100 + }, + { + "entropy": 1.3922600531578064, + "epoch": 0.7884169632072083, + "grad_norm": 1.140625, + "learning_rate": 4.257710681298474e-06, + "loss": 0.1431, + "mean_token_accuracy": 0.9678590965270996, + "num_tokens": 3409048396.0, + "step": 24150 + }, + { + "entropy": 1.404571294784546, + "epoch": 0.7900492964643662, + "grad_norm": 1.484375, + "learning_rate": 4.250354878797295e-06, + "loss": 0.1339, + "mean_token_accuracy": 0.969120637178421, + "num_tokens": 3415718906.0, + "step": 24200 + }, + { + "entropy": 1.4066750693321228, + "epoch": 0.7916816297215239, + "grad_norm": 1.6171875, + "learning_rate": 4.242989967590568e-06, + "loss": 0.1471, + "mean_token_accuracy": 0.9678963148593902, + "num_tokens": 3422879629.0, + "step": 24250 + }, + { + "entropy": 1.4055565428733825, + "epoch": 0.7933139629786817, + "grad_norm": 1.7734375, + "learning_rate": 4.235616001330909e-06, + "loss": 0.1386, + "mean_token_accuracy": 0.9684436011314392, + "num_tokens": 3430035861.0, + "step": 24300 + }, + { + "entropy": 1.3904715991020202, + "epoch": 0.7949462962358395, + "grad_norm": 1.6640625, + "learning_rate": 4.228233033736894e-06, + "loss": 0.1375, + "mean_token_accuracy": 0.9683421933650971, + "num_tokens": 3437370072.0, + "step": 24350 + }, + { + "entropy": 1.4026382374763489, + "epoch": 0.7965786294929973, + "grad_norm": 1.28125, + "learning_rate": 4.22084111859268e-06, + "loss": 0.1447, + "mean_token_accuracy": 0.967241278886795, + "num_tokens": 3444922290.0, + "step": 24400 + }, + { + "entropy": 1.4011790704727174, + "epoch": 0.798210962750155, + "grad_norm": 1.6484375, + "learning_rate": 4.213440309747597e-06, + "loss": 0.1355, + "mean_token_accuracy": 0.969396059513092, + "num_tokens": 3451653589.0, + "step": 24450 + }, + { + "entropy": 1.410584397315979, + "epoch": 0.7998432960073129, + "grad_norm": 1.3046875, + "learning_rate": 4.206030661115772e-06, + "loss": 0.1447, + "mean_token_accuracy": 0.9678661072254181, + "num_tokens": 3459121557.0, + "step": 24500 + }, + { + "epoch": 0.7998432960073129, + "eval_entropy": 1.3954780069986978, + "eval_loss": 0.15847522020339966, + "eval_mean_token_accuracy": 0.9648264590899149, + "eval_num_tokens": 3459121557.0, + "eval_runtime": 748.4306, + "eval_samples_per_second": 12.902, + "eval_steps_per_second": 0.102, + "step": 24500 + }, + { + "entropy": 1.4040296864509583, + "epoch": 0.8014756292644706, + "grad_norm": 1.1875, + "learning_rate": 4.198612226675727e-06, + "loss": 0.1397, + "mean_token_accuracy": 0.9695000052452087, + "num_tokens": 3466079227.0, + "step": 24550 + }, + { + "entropy": 1.4071758961677552, + "epoch": 0.8031079625216284, + "grad_norm": 1.296875, + "learning_rate": 4.191185060469987e-06, + "loss": 0.145, + "mean_token_accuracy": 0.9681662321090698, + "num_tokens": 3472782801.0, + "step": 24600 + }, + { + "entropy": 1.4046114492416382, + "epoch": 0.8047402957787863, + "grad_norm": 1.0859375, + "learning_rate": 4.183749216604685e-06, + "loss": 0.137, + "mean_token_accuracy": 0.9698592948913575, + "num_tokens": 3479515350.0, + "step": 24650 + }, + { + "entropy": 1.4098283767700195, + "epoch": 0.806372629035944, + "grad_norm": 1.875, + "learning_rate": 4.1763047492491746e-06, + "loss": 0.1386, + "mean_token_accuracy": 0.9687701988220215, + "num_tokens": 3486524754.0, + "step": 24700 + }, + { + "entropy": 1.39289204120636, + "epoch": 0.8080049622931018, + "grad_norm": 1.375, + "learning_rate": 4.1688517126356256e-06, + "loss": 0.1331, + "mean_token_accuracy": 0.9696247518062592, + "num_tokens": 3493517440.0, + "step": 24750 + }, + { + "entropy": 1.3983655071258545, + "epoch": 0.8096372955502595, + "grad_norm": 1.25, + "learning_rate": 4.161390161058637e-06, + "loss": 0.1455, + "mean_token_accuracy": 0.9675554573535919, + "num_tokens": 3500746527.0, + "step": 24800 + }, + { + "entropy": 1.4149045872688293, + "epoch": 0.8112696288074174, + "grad_norm": 0.490234375, + "learning_rate": 4.153920148874839e-06, + "loss": 0.1459, + "mean_token_accuracy": 0.9668138778209686, + "num_tokens": 3507701444.0, + "step": 24850 + }, + { + "entropy": 1.4177529954910277, + "epoch": 0.8129019620645751, + "grad_norm": 1.7109375, + "learning_rate": 4.146441730502496e-06, + "loss": 0.152, + "mean_token_accuracy": 0.9660572922229766, + "num_tokens": 3514490276.0, + "step": 24900 + }, + { + "entropy": 1.4248465538024901, + "epoch": 0.8145342953217329, + "grad_norm": 1.59375, + "learning_rate": 4.1389549604211064e-06, + "loss": 0.1481, + "mean_token_accuracy": 0.966586571931839, + "num_tokens": 3521708251.0, + "step": 24950 + }, + { + "entropy": 1.4028679990768433, + "epoch": 0.8161666285788907, + "grad_norm": 2.171875, + "learning_rate": 4.131459893171016e-06, + "loss": 0.1293, + "mean_token_accuracy": 0.9706797707080841, + "num_tokens": 3528390818.0, + "step": 25000 + }, + { + "epoch": 0.8161666285788907, + "eval_entropy": 1.4112112029393513, + "eval_loss": 0.15821218490600586, + "eval_mean_token_accuracy": 0.9647614455223084, + "eval_num_tokens": 3528390818.0, + "eval_runtime": 747.1945, + "eval_samples_per_second": 12.923, + "eval_steps_per_second": 0.102, + "step": 25000 + }, + { + "entropy": 1.4167933750152588, + "epoch": 0.8177989618360485, + "grad_norm": 1.765625, + "learning_rate": 4.1239565833530115e-06, + "loss": 0.1441, + "mean_token_accuracy": 0.9679395818710327, + "num_tokens": 3535142016.0, + "step": 25050 + }, + { + "entropy": 1.4142408227920533, + "epoch": 0.8194312950932062, + "grad_norm": 1.4140625, + "learning_rate": 4.116445085627926e-06, + "loss": 0.1437, + "mean_token_accuracy": 0.9675932359695435, + "num_tokens": 3541811208.0, + "step": 25100 + }, + { + "entropy": 1.4017269968986512, + "epoch": 0.8210636283503641, + "grad_norm": 1.078125, + "learning_rate": 4.108925454716242e-06, + "loss": 0.1388, + "mean_token_accuracy": 0.9687268888950348, + "num_tokens": 3549027064.0, + "step": 25150 + }, + { + "entropy": 1.4196626782417296, + "epoch": 0.8226959616075218, + "grad_norm": 2.0, + "learning_rate": 4.101397745397689e-06, + "loss": 0.1374, + "mean_token_accuracy": 0.9688492357730866, + "num_tokens": 3556231008.0, + "step": 25200 + }, + { + "entropy": 1.4409023642539978, + "epoch": 0.8243282948646796, + "grad_norm": 1.7578125, + "learning_rate": 4.093862012510847e-06, + "loss": 0.1528, + "mean_token_accuracy": 0.9662479484081268, + "num_tokens": 3563607911.0, + "step": 25250 + }, + { + "entropy": 1.3985059690475463, + "epoch": 0.8259606281218373, + "grad_norm": 1.1015625, + "learning_rate": 4.086318310952752e-06, + "loss": 0.1363, + "mean_token_accuracy": 0.969468570947647, + "num_tokens": 3570544476.0, + "step": 25300 + }, + { + "entropy": 1.395876476764679, + "epoch": 0.8275929613789952, + "grad_norm": 2.046875, + "learning_rate": 4.078766695678484e-06, + "loss": 0.141, + "mean_token_accuracy": 0.9681139755249023, + "num_tokens": 3577514806.0, + "step": 25350 + }, + { + "entropy": 1.3914256238937377, + "epoch": 0.8292252946361529, + "grad_norm": 1.3671875, + "learning_rate": 4.071207221700778e-06, + "loss": 0.1417, + "mean_token_accuracy": 0.9679916310310364, + "num_tokens": 3584710985.0, + "step": 25400 + }, + { + "entropy": 1.3804515194892883, + "epoch": 0.8308576278933107, + "grad_norm": 1.6875, + "learning_rate": 4.063639944089617e-06, + "loss": 0.1369, + "mean_token_accuracy": 0.9694907116889954, + "num_tokens": 3591780461.0, + "step": 25450 + }, + { + "entropy": 1.4110288119316101, + "epoch": 0.8324899611504685, + "grad_norm": 2.03125, + "learning_rate": 4.0560649179718345e-06, + "loss": 0.1454, + "mean_token_accuracy": 0.9680348300933838, + "num_tokens": 3598701756.0, + "step": 25500 + }, + { + "epoch": 0.8324899611504685, + "eval_entropy": 1.4156104850769042, + "eval_loss": 0.1576152741909027, + "eval_mean_token_accuracy": 0.9650608507792154, + "eval_num_tokens": 3598701756.0, + "eval_runtime": 749.1988, + "eval_samples_per_second": 12.888, + "eval_steps_per_second": 0.101, + "step": 25500 + }, + { + "entropy": 1.40669499874115, + "epoch": 0.8341222944076263, + "grad_norm": 1.3359375, + "learning_rate": 4.048482198530708e-06, + "loss": 0.1321, + "mean_token_accuracy": 0.9696508872509003, + "num_tokens": 3605779866.0, + "step": 25550 + }, + { + "entropy": 1.40333979845047, + "epoch": 0.835754627664784, + "grad_norm": 1.203125, + "learning_rate": 4.040891841005562e-06, + "loss": 0.1327, + "mean_token_accuracy": 0.9699014961719513, + "num_tokens": 3612587584.0, + "step": 25600 + }, + { + "entropy": 1.412612452507019, + "epoch": 0.8373869609219419, + "grad_norm": 1.2578125, + "learning_rate": 4.033293900691364e-06, + "loss": 0.1471, + "mean_token_accuracy": 0.9674671077728272, + "num_tokens": 3619996003.0, + "step": 25650 + }, + { + "entropy": 1.411652238368988, + "epoch": 0.8390192941790996, + "grad_norm": 1.7578125, + "learning_rate": 4.02568843293832e-06, + "loss": 0.1484, + "mean_token_accuracy": 0.9667781054973602, + "num_tokens": 3627359567.0, + "step": 25700 + }, + { + "entropy": 1.3998070549964905, + "epoch": 0.8406516274362574, + "grad_norm": 1.234375, + "learning_rate": 4.0180754931514745e-06, + "loss": 0.1456, + "mean_token_accuracy": 0.9677145159244538, + "num_tokens": 3634637886.0, + "step": 25750 + }, + { + "entropy": 1.3955191278457642, + "epoch": 0.8422839606934152, + "grad_norm": 1.7109375, + "learning_rate": 4.010455136790304e-06, + "loss": 0.1469, + "mean_token_accuracy": 0.9672901368141175, + "num_tokens": 3642177370.0, + "step": 25800 + }, + { + "entropy": 1.3989661598205567, + "epoch": 0.843916293950573, + "grad_norm": 1.2734375, + "learning_rate": 4.0028274193683124e-06, + "loss": 0.1401, + "mean_token_accuracy": 0.9687706243991852, + "num_tokens": 3649058563.0, + "step": 25850 + }, + { + "entropy": 1.4021561121940613, + "epoch": 0.8455486272077307, + "grad_norm": 1.9609375, + "learning_rate": 3.995192396452631e-06, + "loss": 0.1411, + "mean_token_accuracy": 0.9682626259326935, + "num_tokens": 3655854303.0, + "step": 25900 + }, + { + "entropy": 1.382779130935669, + "epoch": 0.8471809604648886, + "grad_norm": 2.25, + "learning_rate": 3.987550123663608e-06, + "loss": 0.1462, + "mean_token_accuracy": 0.9676120269298554, + "num_tokens": 3662624685.0, + "step": 25950 + }, + { + "entropy": 1.3848181200027465, + "epoch": 0.8488132937220463, + "grad_norm": 2.25, + "learning_rate": 3.97990065667441e-06, + "loss": 0.14, + "mean_token_accuracy": 0.9676021826267243, + "num_tokens": 3669341852.0, + "step": 26000 + }, + { + "epoch": 0.8488132937220463, + "eval_entropy": 1.3808231941858928, + "eval_loss": 0.15642417967319489, + "eval_mean_token_accuracy": 0.9652521824836731, + "eval_num_tokens": 3669341852.0, + "eval_runtime": 753.4968, + "eval_samples_per_second": 12.815, + "eval_steps_per_second": 0.101, + "step": 26000 + }, + { + "entropy": 1.388938684463501, + "epoch": 0.8504456269792041, + "grad_norm": 1.8984375, + "learning_rate": 3.972244051210608e-06, + "loss": 0.1499, + "mean_token_accuracy": 0.9663873422145843, + "num_tokens": 3676639962.0, + "step": 26050 + }, + { + "entropy": 1.3947418189048768, + "epoch": 0.8520779602363618, + "grad_norm": 1.2421875, + "learning_rate": 3.964580363049779e-06, + "loss": 0.1446, + "mean_token_accuracy": 0.9676153147220612, + "num_tokens": 3683543595.0, + "step": 26100 + }, + { + "entropy": 1.3950307440757752, + "epoch": 0.8537102934935197, + "grad_norm": 1.2109375, + "learning_rate": 3.956909648021096e-06, + "loss": 0.136, + "mean_token_accuracy": 0.9690661346912384, + "num_tokens": 3690286971.0, + "step": 26150 + }, + { + "entropy": 1.402916703224182, + "epoch": 0.8553426267506774, + "grad_norm": 2.09375, + "learning_rate": 3.949231962004922e-06, + "loss": 0.1541, + "mean_token_accuracy": 0.9650176286697387, + "num_tokens": 3697865663.0, + "step": 26200 + }, + { + "entropy": 1.408561556339264, + "epoch": 0.8569749600078352, + "grad_norm": 2.265625, + "learning_rate": 3.941547360932404e-06, + "loss": 0.1375, + "mean_token_accuracy": 0.9691826546192169, + "num_tokens": 3704856688.0, + "step": 26250 + }, + { + "entropy": 1.3971546220779418, + "epoch": 0.858607293264993, + "grad_norm": 1.6171875, + "learning_rate": 3.933855900785063e-06, + "loss": 0.1349, + "mean_token_accuracy": 0.9694166851043701, + "num_tokens": 3711350984.0, + "step": 26300 + }, + { + "entropy": 1.403090295791626, + "epoch": 0.8602396265221508, + "grad_norm": 1.34375, + "learning_rate": 3.926157637594387e-06, + "loss": 0.1427, + "mean_token_accuracy": 0.9675269103050232, + "num_tokens": 3718476089.0, + "step": 26350 + }, + { + "entropy": 1.3978805589675902, + "epoch": 0.8618719597793085, + "grad_norm": 1.890625, + "learning_rate": 3.918452627441425e-06, + "loss": 0.1342, + "mean_token_accuracy": 0.9688924837112427, + "num_tokens": 3725594290.0, + "step": 26400 + }, + { + "entropy": 1.4037820672988892, + "epoch": 0.8635042930364664, + "grad_norm": 2.109375, + "learning_rate": 3.910740926456376e-06, + "loss": 0.1356, + "mean_token_accuracy": 0.9694356083869934, + "num_tokens": 3731938265.0, + "step": 26450 + }, + { + "entropy": 1.4001559686660767, + "epoch": 0.8651366262936241, + "grad_norm": 1.8515625, + "learning_rate": 3.903022590818183e-06, + "loss": 0.1364, + "mean_token_accuracy": 0.9691901934146882, + "num_tokens": 3738816334.0, + "step": 26500 + }, + { + "epoch": 0.8651366262936241, + "eval_entropy": 1.3956389427185059, + "eval_loss": 0.1563226878643036, + "eval_mean_token_accuracy": 0.965215961933136, + "eval_num_tokens": 3738816334.0, + "eval_runtime": 749.9014, + "eval_samples_per_second": 12.876, + "eval_steps_per_second": 0.101, + "step": 26500 + }, + { + "entropy": 1.3984951615333556, + "epoch": 0.8667689595507819, + "grad_norm": 1.4765625, + "learning_rate": 3.895297676754119e-06, + "loss": 0.1413, + "mean_token_accuracy": 0.9679640185832977, + "num_tokens": 3745955192.0, + "step": 26550 + }, + { + "entropy": 1.3817971444129944, + "epoch": 0.8684012928079397, + "grad_norm": 1.390625, + "learning_rate": 3.887566240539381e-06, + "loss": 0.1445, + "mean_token_accuracy": 0.9670935535430908, + "num_tokens": 3753010501.0, + "step": 26600 + }, + { + "entropy": 1.3948706936836244, + "epoch": 0.8700336260650975, + "grad_norm": 1.5234375, + "learning_rate": 3.87982833849668e-06, + "loss": 0.1325, + "mean_token_accuracy": 0.9705722856521607, + "num_tokens": 3759798351.0, + "step": 26650 + }, + { + "entropy": 1.393795645236969, + "epoch": 0.8716659593222552, + "grad_norm": 1.0, + "learning_rate": 3.87208402699583e-06, + "loss": 0.1317, + "mean_token_accuracy": 0.9698431146144867, + "num_tokens": 3766781169.0, + "step": 26700 + }, + { + "entropy": 1.3873535466194153, + "epoch": 0.873298292579413, + "grad_norm": 1.9921875, + "learning_rate": 3.864333362453337e-06, + "loss": 0.1387, + "mean_token_accuracy": 0.9688050973415375, + "num_tokens": 3773957478.0, + "step": 26750 + }, + { + "entropy": 1.3917481398582459, + "epoch": 0.8749306258365708, + "grad_norm": 1.21875, + "learning_rate": 3.856576401331988e-06, + "loss": 0.1394, + "mean_token_accuracy": 0.9683778524398804, + "num_tokens": 3780762633.0, + "step": 26800 + }, + { + "entropy": 1.3908830904960632, + "epoch": 0.8765629590937286, + "grad_norm": 1.7265625, + "learning_rate": 3.848813200140437e-06, + "loss": 0.1547, + "mean_token_accuracy": 0.9653457498550415, + "num_tokens": 3788278563.0, + "step": 26850 + }, + { + "entropy": 1.4087229990959167, + "epoch": 0.8781952923508863, + "grad_norm": 1.90625, + "learning_rate": 3.841043815432803e-06, + "loss": 0.1401, + "mean_token_accuracy": 0.9690191769599914, + "num_tokens": 3794646188.0, + "step": 26900 + }, + { + "entropy": 1.3997572946548462, + "epoch": 0.8798276256080442, + "grad_norm": 1.359375, + "learning_rate": 3.833268303808244e-06, + "loss": 0.1366, + "mean_token_accuracy": 0.9689948236942292, + "num_tokens": 3801360008.0, + "step": 26950 + }, + { + "entropy": 1.3976869773864746, + "epoch": 0.8814599588652019, + "grad_norm": 1.3984375, + "learning_rate": 3.8254867219105575e-06, + "loss": 0.1307, + "mean_token_accuracy": 0.970016497373581, + "num_tokens": 3808022349.0, + "step": 27000 + }, + { + "epoch": 0.8814599588652019, + "eval_entropy": 1.3932032998402912, + "eval_loss": 0.15805409848690033, + "eval_mean_token_accuracy": 0.9647916714350383, + "eval_num_tokens": 3808022349.0, + "eval_runtime": 750.2779, + "eval_samples_per_second": 12.87, + "eval_steps_per_second": 0.101, + "step": 27000 + }, + { + "entropy": 1.406122510433197, + "epoch": 0.8830922921223597, + "grad_norm": 1.625, + "learning_rate": 3.8176991264277604e-06, + "loss": 0.1555, + "mean_token_accuracy": 0.9653628063201904, + "num_tokens": 3814977879.0, + "step": 27050 + }, + { + "entropy": 1.3947002291679382, + "epoch": 0.8847246253795175, + "grad_norm": 1.1328125, + "learning_rate": 3.809905574091676e-06, + "loss": 0.1439, + "mean_token_accuracy": 0.9674938654899597, + "num_tokens": 3822114331.0, + "step": 27100 + }, + { + "entropy": 1.3995588779449464, + "epoch": 0.8863569586366753, + "grad_norm": 1.2265625, + "learning_rate": 3.802106121677525e-06, + "loss": 0.1389, + "mean_token_accuracy": 0.9679549074172974, + "num_tokens": 3829036413.0, + "step": 27150 + }, + { + "entropy": 1.397332239151001, + "epoch": 0.887989291893833, + "grad_norm": 1.8046875, + "learning_rate": 3.7943008260035106e-06, + "loss": 0.1394, + "mean_token_accuracy": 0.9689336049556733, + "num_tokens": 3835541715.0, + "step": 27200 + }, + { + "entropy": 1.398532338142395, + "epoch": 0.8896216251509909, + "grad_norm": 1.015625, + "learning_rate": 3.7864897439304e-06, + "loss": 0.1328, + "mean_token_accuracy": 0.970375450849533, + "num_tokens": 3842178219.0, + "step": 27250 + }, + { + "entropy": 1.3905003619194032, + "epoch": 0.8912539584081486, + "grad_norm": 1.0234375, + "learning_rate": 3.778672932361116e-06, + "loss": 0.138, + "mean_token_accuracy": 0.9678446400165558, + "num_tokens": 3849481284.0, + "step": 27300 + }, + { + "entropy": 1.4009161067008973, + "epoch": 0.8928862916653064, + "grad_norm": 1.7734375, + "learning_rate": 3.7708504482403198e-06, + "loss": 0.1369, + "mean_token_accuracy": 0.9693617796897889, + "num_tokens": 3855937939.0, + "step": 27350 + }, + { + "entropy": 1.3991939783096314, + "epoch": 0.8945186249224641, + "grad_norm": 1.2109375, + "learning_rate": 3.7630223485539955e-06, + "loss": 0.1391, + "mean_token_accuracy": 0.9692135906219482, + "num_tokens": 3863063108.0, + "step": 27400 + }, + { + "entropy": 1.4059844970703126, + "epoch": 0.896150958179622, + "grad_norm": 1.3046875, + "learning_rate": 3.755188690329039e-06, + "loss": 0.1387, + "mean_token_accuracy": 0.9687328970432282, + "num_tokens": 3870522891.0, + "step": 27450 + }, + { + "entropy": 1.4190341973304748, + "epoch": 0.8977832914367797, + "grad_norm": 1.7734375, + "learning_rate": 3.747349530632837e-06, + "loss": 0.137, + "mean_token_accuracy": 0.9695413172245025, + "num_tokens": 3877480147.0, + "step": 27500 + }, + { + "epoch": 0.8977832914367797, + "eval_entropy": 1.407424882253011, + "eval_loss": 0.1556614637374878, + "eval_mean_token_accuracy": 0.9652939391136169, + "eval_num_tokens": 3877480147.0, + "eval_runtime": 746.039, + "eval_samples_per_second": 12.943, + "eval_steps_per_second": 0.102, + "step": 27500 + }, + { + "entropy": 1.4175709581375122, + "epoch": 0.8994156246939375, + "grad_norm": 1.265625, + "learning_rate": 3.7395049265728537e-06, + "loss": 0.1574, + "mean_token_accuracy": 0.964717469215393, + "num_tokens": 3884856009.0, + "step": 27550 + }, + { + "entropy": 1.4064431715011596, + "epoch": 0.9010479579510953, + "grad_norm": 1.3828125, + "learning_rate": 3.7316549352962154e-06, + "loss": 0.139, + "mean_token_accuracy": 0.9685567510128021, + "num_tokens": 3891927392.0, + "step": 27600 + }, + { + "entropy": 1.4132932043075561, + "epoch": 0.9026802912082531, + "grad_norm": 1.1328125, + "learning_rate": 3.7237996139892955e-06, + "loss": 0.1441, + "mean_token_accuracy": 0.9683604872226715, + "num_tokens": 3899174699.0, + "step": 27650 + }, + { + "entropy": 1.3990925669670105, + "epoch": 0.9043126244654108, + "grad_norm": 1.0703125, + "learning_rate": 3.7159390198772933e-06, + "loss": 0.1453, + "mean_token_accuracy": 0.9674279451370239, + "num_tokens": 3906648455.0, + "step": 27700 + }, + { + "entropy": 1.3980679297447205, + "epoch": 0.9059449577225687, + "grad_norm": 0.9453125, + "learning_rate": 3.7080732102238214e-06, + "loss": 0.13, + "mean_token_accuracy": 0.9701244246959686, + "num_tokens": 3913624677.0, + "step": 27750 + }, + { + "entropy": 1.4017365527153016, + "epoch": 0.9075772909797264, + "grad_norm": 1.6171875, + "learning_rate": 3.700202242330488e-06, + "loss": 0.1335, + "mean_token_accuracy": 0.9692477977275848, + "num_tokens": 3920103211.0, + "step": 27800 + }, + { + "entropy": 1.3995830225944519, + "epoch": 0.9092096242368842, + "grad_norm": 1.5078125, + "learning_rate": 3.6923261735364753e-06, + "loss": 0.1343, + "mean_token_accuracy": 0.9691221857070923, + "num_tokens": 3927208526.0, + "step": 27850 + }, + { + "entropy": 1.3959479594230653, + "epoch": 0.910841957494042, + "grad_norm": 1.2265625, + "learning_rate": 3.6844450612181293e-06, + "loss": 0.1442, + "mean_token_accuracy": 0.967134929895401, + "num_tokens": 3934542379.0, + "step": 27900 + }, + { + "entropy": 1.391806445121765, + "epoch": 0.9124742907511998, + "grad_norm": 1.203125, + "learning_rate": 3.6765589627885352e-06, + "loss": 0.1313, + "mean_token_accuracy": 0.9697517728805543, + "num_tokens": 3941546757.0, + "step": 27950 + }, + { + "entropy": 1.3797322702407837, + "epoch": 0.9141066240083575, + "grad_norm": 1.6953125, + "learning_rate": 3.6686679356971017e-06, + "loss": 0.1352, + "mean_token_accuracy": 0.9690727829933167, + "num_tokens": 3948336251.0, + "step": 28000 + }, + { + "epoch": 0.9141066240083575, + "eval_entropy": 1.3829597409566243, + "eval_loss": 0.15241551399230957, + "eval_mean_token_accuracy": 0.9657313092549642, + "eval_num_tokens": 3948336251.0, + "eval_runtime": 750.4693, + "eval_samples_per_second": 12.867, + "eval_steps_per_second": 0.101, + "step": 28000 + }, + { + "entropy": 1.3851616716384887, + "epoch": 0.9157389572655154, + "grad_norm": 1.1015625, + "learning_rate": 3.660772037429141e-06, + "loss": 0.1329, + "mean_token_accuracy": 0.9694375658035278, + "num_tokens": 3955743379.0, + "step": 28050 + }, + { + "entropy": 1.3846897101402282, + "epoch": 0.9173712905226731, + "grad_norm": 1.5390625, + "learning_rate": 3.652871325505453e-06, + "loss": 0.1396, + "mean_token_accuracy": 0.9685408413410187, + "num_tokens": 3962634196.0, + "step": 28100 + }, + { + "entropy": 1.3823327445983886, + "epoch": 0.9190036237798309, + "grad_norm": 1.484375, + "learning_rate": 3.6449658574819062e-06, + "loss": 0.1342, + "mean_token_accuracy": 0.9699604260921478, + "num_tokens": 3969575759.0, + "step": 28150 + }, + { + "entropy": 1.3885199642181396, + "epoch": 0.9206359570369886, + "grad_norm": 1.21875, + "learning_rate": 3.637055690949012e-06, + "loss": 0.1365, + "mean_token_accuracy": 0.9687310314178467, + "num_tokens": 3976309312.0, + "step": 28200 + }, + { + "entropy": 1.3812410712242127, + "epoch": 0.9222682902941465, + "grad_norm": 1.7734375, + "learning_rate": 3.629140883531515e-06, + "loss": 0.1322, + "mean_token_accuracy": 0.9697493410110474, + "num_tokens": 3983034311.0, + "step": 28250 + }, + { + "entropy": 1.3719140005111694, + "epoch": 0.9239006235513042, + "grad_norm": 2.671875, + "learning_rate": 3.6212214928879643e-06, + "loss": 0.1311, + "mean_token_accuracy": 0.9697664487361908, + "num_tokens": 3990038956.0, + "step": 28300 + }, + { + "entropy": 1.3883194899559022, + "epoch": 0.925532956808462, + "grad_norm": 1.734375, + "learning_rate": 3.6132975767103e-06, + "loss": 0.127, + "mean_token_accuracy": 0.9704679012298584, + "num_tokens": 3997170550.0, + "step": 28350 + }, + { + "entropy": 1.3769879937171936, + "epoch": 0.9271652900656198, + "grad_norm": 1.515625, + "learning_rate": 3.6053691927234304e-06, + "loss": 0.1318, + "mean_token_accuracy": 0.9701116299629211, + "num_tokens": 4003683105.0, + "step": 28400 + }, + { + "entropy": 1.36924959897995, + "epoch": 0.9287976233227776, + "grad_norm": 2.046875, + "learning_rate": 3.5974363986848077e-06, + "loss": 0.13, + "mean_token_accuracy": 0.9698223459720612, + "num_tokens": 4010540640.0, + "step": 28450 + }, + { + "entropy": 1.3812626338005065, + "epoch": 0.9304299565799353, + "grad_norm": 1.65625, + "learning_rate": 3.5894992523840146e-06, + "loss": 0.1395, + "mean_token_accuracy": 0.9686692810058594, + "num_tokens": 4017705171.0, + "step": 28500 + }, + { + "epoch": 0.9304299565799353, + "eval_entropy": 1.3810707855224609, + "eval_loss": 0.15061478316783905, + "eval_mean_token_accuracy": 0.9661375037829081, + "eval_num_tokens": 4017705171.0, + "eval_runtime": 742.3916, + "eval_samples_per_second": 13.007, + "eval_steps_per_second": 0.102, + "step": 28500 + }, + { + "entropy": 1.3842783665657044, + "epoch": 0.9320622898370932, + "grad_norm": 1.8671875, + "learning_rate": 3.581557811642338e-06, + "loss": 0.1411, + "mean_token_accuracy": 0.9674799299240112, + "num_tokens": 4025097590.0, + "step": 28550 + }, + { + "entropy": 1.374339952468872, + "epoch": 0.9336946230942509, + "grad_norm": 1.9453125, + "learning_rate": 3.57361213431235e-06, + "loss": 0.1421, + "mean_token_accuracy": 0.96745934009552, + "num_tokens": 4032859225.0, + "step": 28600 + }, + { + "entropy": 1.3793137764930725, + "epoch": 0.9353269563514087, + "grad_norm": 1.78125, + "learning_rate": 3.565662278277484e-06, + "loss": 0.1371, + "mean_token_accuracy": 0.9678510630130768, + "num_tokens": 4040008030.0, + "step": 28650 + }, + { + "entropy": 1.3788374090194702, + "epoch": 0.9369592896085664, + "grad_norm": 1.3515625, + "learning_rate": 3.5577083014516183e-06, + "loss": 0.1261, + "mean_token_accuracy": 0.9712175786495209, + "num_tokens": 4046560521.0, + "step": 28700 + }, + { + "entropy": 1.3812962436676026, + "epoch": 0.9385916228657243, + "grad_norm": 1.4765625, + "learning_rate": 3.549750261778648e-06, + "loss": 0.1378, + "mean_token_accuracy": 0.9689911651611328, + "num_tokens": 4053568855.0, + "step": 28750 + }, + { + "entropy": 1.384900779724121, + "epoch": 0.940223956122882, + "grad_norm": 3.3125, + "learning_rate": 3.5417882172320663e-06, + "loss": 0.1418, + "mean_token_accuracy": 0.9679834198951721, + "num_tokens": 4060892652.0, + "step": 28800 + }, + { + "entropy": 1.3779156827926635, + "epoch": 0.9418562893800398, + "grad_norm": 1.3515625, + "learning_rate": 3.5338222258145408e-06, + "loss": 0.1246, + "mean_token_accuracy": 0.9717523455619812, + "num_tokens": 4067536985.0, + "step": 28850 + }, + { + "entropy": 1.3658223152160645, + "epoch": 0.9434886226371976, + "grad_norm": 2.40625, + "learning_rate": 3.525852345557493e-06, + "loss": 0.1344, + "mean_token_accuracy": 0.9694867217540741, + "num_tokens": 4075107141.0, + "step": 28900 + }, + { + "entropy": 1.3954302740097047, + "epoch": 0.9451209558943554, + "grad_norm": 1.640625, + "learning_rate": 3.5178786345206746e-06, + "loss": 0.1341, + "mean_token_accuracy": 0.9694812285900116, + "num_tokens": 4082057215.0, + "step": 28950 + }, + { + "entropy": 1.3778202867507934, + "epoch": 0.9467532891515131, + "grad_norm": 1.984375, + "learning_rate": 3.509901150791742e-06, + "loss": 0.1394, + "mean_token_accuracy": 0.9685069918632507, + "num_tokens": 4089240268.0, + "step": 29000 + }, + { + "epoch": 0.9467532891515131, + "eval_entropy": 1.3786554765701293, + "eval_loss": 0.14917373657226562, + "eval_mean_token_accuracy": 0.9661656268437704, + "eval_num_tokens": 4089240268.0, + "eval_runtime": 744.7112, + "eval_samples_per_second": 12.966, + "eval_steps_per_second": 0.102, + "step": 29000 + }, + { + "entropy": 1.3767950320243836, + "epoch": 0.948385622408671, + "grad_norm": 1.9609375, + "learning_rate": 3.5019199524858355e-06, + "loss": 0.1245, + "mean_token_accuracy": 0.9718893337249755, + "num_tokens": 4096140756.0, + "step": 29050 + }, + { + "entropy": 1.375689399242401, + "epoch": 0.9500179556658287, + "grad_norm": 1.2890625, + "learning_rate": 3.493935097745158e-06, + "loss": 0.1242, + "mean_token_accuracy": 0.9714548885822296, + "num_tokens": 4102843838.0, + "step": 29100 + }, + { + "entropy": 1.386333782672882, + "epoch": 0.9516502889229865, + "grad_norm": 1.25, + "learning_rate": 3.4859466447385477e-06, + "loss": 0.1364, + "mean_token_accuracy": 0.9688875234127045, + "num_tokens": 4109572295.0, + "step": 29150 + }, + { + "entropy": 1.3935841035842895, + "epoch": 0.9532826221801443, + "grad_norm": 1.40625, + "learning_rate": 3.477954651661055e-06, + "loss": 0.1389, + "mean_token_accuracy": 0.9683682763576508, + "num_tokens": 4116355893.0, + "step": 29200 + }, + { + "entropy": 1.3900840377807617, + "epoch": 0.9549149554373021, + "grad_norm": 1.609375, + "learning_rate": 3.4699591767335203e-06, + "loss": 0.1461, + "mean_token_accuracy": 0.9672015142440796, + "num_tokens": 4123980186.0, + "step": 29250 + }, + { + "entropy": 1.3854497838020325, + "epoch": 0.9565472886944598, + "grad_norm": 1.84375, + "learning_rate": 3.4619602782021497e-06, + "loss": 0.127, + "mean_token_accuracy": 0.971345556974411, + "num_tokens": 4130458823.0, + "step": 29300 + }, + { + "entropy": 1.3904473185539246, + "epoch": 0.9581796219516177, + "grad_norm": 1.671875, + "learning_rate": 3.4539580143380884e-06, + "loss": 0.1345, + "mean_token_accuracy": 0.9702473485469818, + "num_tokens": 4137313489.0, + "step": 29350 + }, + { + "entropy": 1.393708050251007, + "epoch": 0.9598119552087754, + "grad_norm": 1.5625, + "learning_rate": 3.4459524434369967e-06, + "loss": 0.1393, + "mean_token_accuracy": 0.9680514478683472, + "num_tokens": 4144869499.0, + "step": 29400 + }, + { + "entropy": 1.392387228012085, + "epoch": 0.9614442884659332, + "grad_norm": 1.7265625, + "learning_rate": 3.437943623818631e-06, + "loss": 0.1244, + "mean_token_accuracy": 0.9713895416259766, + "num_tokens": 4151722923.0, + "step": 29450 + }, + { + "entropy": 1.3936436820030211, + "epoch": 0.9630766217230909, + "grad_norm": 1.296875, + "learning_rate": 3.4299316138264096e-06, + "loss": 0.1435, + "mean_token_accuracy": 0.9673401594161988, + "num_tokens": 4159401264.0, + "step": 29500 + }, + { + "epoch": 0.9630766217230909, + "eval_entropy": 1.377136646906535, + "eval_loss": 0.1480059176683426, + "eval_mean_token_accuracy": 0.9664394434293111, + "eval_num_tokens": 4159401264.0, + "eval_runtime": 746.3254, + "eval_samples_per_second": 12.938, + "eval_steps_per_second": 0.102, + "step": 29500 + }, + { + "entropy": 1.3720522713661194, + "epoch": 0.9647089549802488, + "grad_norm": 0.00274658203125, + "learning_rate": 3.4219164718269925e-06, + "loss": 0.1237, + "mean_token_accuracy": 0.9710344398021697, + "num_tokens": 4166633451.0, + "step": 29550 + }, + { + "entropy": 1.3591685533523559, + "epoch": 0.9663412882374065, + "grad_norm": 1.703125, + "learning_rate": 3.41389825620986e-06, + "loss": 0.1271, + "mean_token_accuracy": 0.9711633479595184, + "num_tokens": 4173854933.0, + "step": 29600 + }, + { + "entropy": 1.3691086530685426, + "epoch": 0.9679736214945643, + "grad_norm": 1.09375, + "learning_rate": 3.405877025386879e-06, + "loss": 0.1329, + "mean_token_accuracy": 0.969690408706665, + "num_tokens": 4180650471.0, + "step": 29650 + }, + { + "entropy": 1.3538868117332459, + "epoch": 0.9696059547517221, + "grad_norm": 1.484375, + "learning_rate": 3.397852837791885e-06, + "loss": 0.1193, + "mean_token_accuracy": 0.9719677448272706, + "num_tokens": 4187582242.0, + "step": 29700 + }, + { + "entropy": 1.3645092558860779, + "epoch": 0.9712382880088799, + "grad_norm": 1.8828125, + "learning_rate": 3.389825751880252e-06, + "loss": 0.1333, + "mean_token_accuracy": 0.9694900810718536, + "num_tokens": 4194210247.0, + "step": 29750 + }, + { + "entropy": 1.3627040433883666, + "epoch": 0.9728706212660376, + "grad_norm": 1.734375, + "learning_rate": 3.381795826128467e-06, + "loss": 0.1332, + "mean_token_accuracy": 0.9694800686836242, + "num_tokens": 4201506868.0, + "step": 29800 + }, + { + "entropy": 1.3740008974075317, + "epoch": 0.9745029545231955, + "grad_norm": 1.7734375, + "learning_rate": 3.373763119033706e-06, + "loss": 0.132, + "mean_token_accuracy": 0.969896445274353, + "num_tokens": 4208636691.0, + "step": 29850 + }, + { + "entropy": 1.3614243865013123, + "epoch": 0.9761352877803532, + "grad_norm": 1.1484375, + "learning_rate": 3.365727689113406e-06, + "loss": 0.1333, + "mean_token_accuracy": 0.9692292737960816, + "num_tokens": 4215943871.0, + "step": 29900 + }, + { + "entropy": 1.3712631511688231, + "epoch": 0.977767621037511, + "grad_norm": 2.671875, + "learning_rate": 3.3576895949048423e-06, + "loss": 0.1314, + "mean_token_accuracy": 0.970370488166809, + "num_tokens": 4222890357.0, + "step": 29950 + }, + { + "entropy": 1.3694654393196106, + "epoch": 0.9793999542946688, + "grad_norm": 1.5859375, + "learning_rate": 3.3496488949646945e-06, + "loss": 0.132, + "mean_token_accuracy": 0.9699479579925537, + "num_tokens": 4229911312.0, + "step": 30000 + }, + { + "epoch": 0.9793999542946688, + "eval_entropy": 1.3661458206176758, + "eval_loss": 0.14751291275024414, + "eval_mean_token_accuracy": 0.9666498748461405, + "eval_num_tokens": 4229911312.0, + "eval_runtime": 747.9793, + "eval_samples_per_second": 12.909, + "eval_steps_per_second": 0.102, + "step": 30000 + }, + { + "entropy": 1.3588277745246886, + "epoch": 0.9810322875518266, + "grad_norm": 1.5703125, + "learning_rate": 3.34160564786863e-06, + "loss": 0.1369, + "mean_token_accuracy": 0.968973708152771, + "num_tokens": 4237375692.0, + "step": 30050 + }, + { + "entropy": 1.3632358622550964, + "epoch": 0.9826646208089843, + "grad_norm": 1.46875, + "learning_rate": 3.3335599122108676e-06, + "loss": 0.1282, + "mean_token_accuracy": 0.9707362723350524, + "num_tokens": 4244434501.0, + "step": 30100 + }, + { + "entropy": 1.3589469051361085, + "epoch": 0.9842969540661421, + "grad_norm": 1.0859375, + "learning_rate": 3.3255117466037573e-06, + "loss": 0.133, + "mean_token_accuracy": 0.9700418126583099, + "num_tokens": 4251615278.0, + "step": 30150 + }, + { + "entropy": 1.3744734477996827, + "epoch": 0.9859292873232999, + "grad_norm": 1.53125, + "learning_rate": 3.3174612096773496e-06, + "loss": 0.1455, + "mean_token_accuracy": 0.9672206926345825, + "num_tokens": 4259009585.0, + "step": 30200 + }, + { + "entropy": 1.3715810680389404, + "epoch": 0.9875616205804577, + "grad_norm": 1.5, + "learning_rate": 3.3094083600789717e-06, + "loss": 0.1328, + "mean_token_accuracy": 0.969623521566391, + "num_tokens": 4266185168.0, + "step": 30250 + }, + { + "entropy": 1.3768983268737793, + "epoch": 0.9891939538376154, + "grad_norm": 1.1953125, + "learning_rate": 3.3013532564727965e-06, + "loss": 0.1399, + "mean_token_accuracy": 0.9680708968639373, + "num_tokens": 4273427348.0, + "step": 30300 + }, + { + "entropy": 1.3672343015670776, + "epoch": 0.9908262870947733, + "grad_norm": 1.2265625, + "learning_rate": 3.293295957539418e-06, + "loss": 0.1295, + "mean_token_accuracy": 0.9705338907241822, + "num_tokens": 4280054609.0, + "step": 30350 + }, + { + "entropy": 1.3644690942764282, + "epoch": 0.992458620351931, + "grad_norm": 1.7421875, + "learning_rate": 3.2852365219754234e-06, + "loss": 0.1298, + "mean_token_accuracy": 0.9702000212669373, + "num_tokens": 4287354435.0, + "step": 30400 + }, + { + "entropy": 1.3673407602310181, + "epoch": 0.9940909536090888, + "grad_norm": 2.046875, + "learning_rate": 3.2771750084929644e-06, + "loss": 0.1388, + "mean_token_accuracy": 0.9686336624622345, + "num_tokens": 4294494938.0, + "step": 30450 + }, + { + "entropy": 1.3702644801139832, + "epoch": 0.9957232868662466, + "grad_norm": 1.046875, + "learning_rate": 3.26911147581933e-06, + "loss": 0.1348, + "mean_token_accuracy": 0.9689600837230682, + "num_tokens": 4302306234.0, + "step": 30500 + }, + { + "epoch": 0.9957232868662466, + "eval_entropy": 1.3691168228785198, + "eval_loss": 0.14723782241344452, + "eval_mean_token_accuracy": 0.9666363048553467, + "eval_num_tokens": 4302306234.0, + "eval_runtime": 744.3054, + "eval_samples_per_second": 12.973, + "eval_steps_per_second": 0.102, + "step": 30500 + }, + { + "entropy": 1.3547360873222352, + "epoch": 0.9973556201234044, + "grad_norm": 2.03125, + "learning_rate": 3.2610459826965177e-06, + "loss": 0.1191, + "mean_token_accuracy": 0.9728741991519928, + "num_tokens": 4309003875.0, + "step": 30550 + }, + { + "entropy": 1.3677322697639465, + "epoch": 0.9989879533805621, + "grad_norm": 1.1640625, + "learning_rate": 3.2529785878808105e-06, + "loss": 0.1382, + "mean_token_accuracy": 0.9686129570007325, + "num_tokens": 4316663353.0, + "step": 30600 + }, + { + "entropy": 1.366257793903351, + "epoch": 1.0006202866377198, + "grad_norm": 1.078125, + "learning_rate": 3.244909350142341e-06, + "loss": 0.1326, + "mean_token_accuracy": 0.9687590861320495, + "num_tokens": 4324200010.0, + "step": 30650 + }, + { + "entropy": 1.3630602145195008, + "epoch": 1.0022526198948778, + "grad_norm": 1.546875, + "learning_rate": 3.2368383282646688e-06, + "loss": 0.1329, + "mean_token_accuracy": 0.9694711458683014, + "num_tokens": 4331478206.0, + "step": 30700 + }, + { + "entropy": 1.366960186958313, + "epoch": 1.0038849531520355, + "grad_norm": 1.5390625, + "learning_rate": 3.2287655810443514e-06, + "loss": 0.1382, + "mean_token_accuracy": 0.9686524891853332, + "num_tokens": 4338814315.0, + "step": 30750 + }, + { + "entropy": 1.3614837670326232, + "epoch": 1.0055172864091932, + "grad_norm": 1.234375, + "learning_rate": 3.220691167290514e-06, + "loss": 0.1387, + "mean_token_accuracy": 0.968714509010315, + "num_tokens": 4346125127.0, + "step": 30800 + }, + { + "entropy": 1.3573535728454589, + "epoch": 1.0071496196663512, + "grad_norm": 1.5390625, + "learning_rate": 3.2126151458244233e-06, + "loss": 0.1204, + "mean_token_accuracy": 0.9723792004585267, + "num_tokens": 4352939968.0, + "step": 30850 + }, + { + "entropy": 1.3520698595046996, + "epoch": 1.008781952923509, + "grad_norm": 1.875, + "learning_rate": 3.2045375754790577e-06, + "loss": 0.1231, + "mean_token_accuracy": 0.9714046669006348, + "num_tokens": 4359619212.0, + "step": 30900 + }, + { + "entropy": 1.3647415375709533, + "epoch": 1.0104142861806666, + "grad_norm": 1.578125, + "learning_rate": 3.196458515098679e-06, + "loss": 0.1372, + "mean_token_accuracy": 0.9691076791286468, + "num_tokens": 4366790518.0, + "step": 30950 + }, + { + "entropy": 1.3572459483146668, + "epoch": 1.0120466194378244, + "grad_norm": 1.5859375, + "learning_rate": 3.1883780235384036e-06, + "loss": 0.13, + "mean_token_accuracy": 0.9705017244815827, + "num_tokens": 4373881040.0, + "step": 31000 + }, + { + "epoch": 1.0120466194378244, + "eval_entropy": 1.3610133997599283, + "eval_loss": 0.1467311531305313, + "eval_mean_token_accuracy": 0.9667138489087422, + "eval_num_tokens": 4373881040.0, + "eval_runtime": 749.5562, + "eval_samples_per_second": 12.882, + "eval_steps_per_second": 0.101, + "step": 31000 + }, + { + "entropy": 1.3617827844619752, + "epoch": 1.0136789526949823, + "grad_norm": 2.078125, + "learning_rate": 3.180296159663773e-06, + "loss": 0.1256, + "mean_token_accuracy": 0.9706788539886475, + "num_tokens": 4380849198.0, + "step": 31050 + }, + { + "entropy": 1.35469162940979, + "epoch": 1.01531128595214, + "grad_norm": 1.140625, + "learning_rate": 3.1722129823503283e-06, + "loss": 0.1265, + "mean_token_accuracy": 0.9702699911594391, + "num_tokens": 4388053942.0, + "step": 31100 + }, + { + "entropy": 1.3688899064064026, + "epoch": 1.0169436192092978, + "grad_norm": 1.890625, + "learning_rate": 3.1641285504831776e-06, + "loss": 0.1379, + "mean_token_accuracy": 0.9682463228702545, + "num_tokens": 4395442873.0, + "step": 31150 + }, + { + "entropy": 1.3647472047805786, + "epoch": 1.0185759524664555, + "grad_norm": 1.4453125, + "learning_rate": 3.156042922956568e-06, + "loss": 0.1285, + "mean_token_accuracy": 0.9707048869132996, + "num_tokens": 4402556775.0, + "step": 31200 + }, + { + "entropy": 1.3655140924453735, + "epoch": 1.0202082857236134, + "grad_norm": 1.6015625, + "learning_rate": 3.1479561586734553e-06, + "loss": 0.1273, + "mean_token_accuracy": 0.9710086095333099, + "num_tokens": 4409440789.0, + "step": 31250 + }, + { + "entropy": 1.378363606929779, + "epoch": 1.0218406189807712, + "grad_norm": 1.8984375, + "learning_rate": 3.139868316545081e-06, + "loss": 0.1413, + "mean_token_accuracy": 0.9676133573055268, + "num_tokens": 4416910867.0, + "step": 31300 + }, + { + "entropy": 1.377373902797699, + "epoch": 1.023472952237929, + "grad_norm": 1.1171875, + "learning_rate": 3.131779455490534e-06, + "loss": 0.1353, + "mean_token_accuracy": 0.9695135807991028, + "num_tokens": 4424153945.0, + "step": 31350 + }, + { + "entropy": 1.3685814261436462, + "epoch": 1.0251052854950866, + "grad_norm": 1.5234375, + "learning_rate": 3.1236896344363276e-06, + "loss": 0.1261, + "mean_token_accuracy": 0.9713682627677918, + "num_tokens": 4431041238.0, + "step": 31400 + }, + { + "entropy": 1.3680060362815858, + "epoch": 1.0267376187522446, + "grad_norm": 1.3984375, + "learning_rate": 3.1155989123159693e-06, + "loss": 0.1308, + "mean_token_accuracy": 0.9699837076663971, + "num_tokens": 4438354536.0, + "step": 31450 + }, + { + "entropy": 1.3511241865158081, + "epoch": 1.0283699520094023, + "grad_norm": 1.4375, + "learning_rate": 3.1075073480695303e-06, + "loss": 0.12, + "mean_token_accuracy": 0.9721428179740905, + "num_tokens": 4445073229.0, + "step": 31500 + }, + { + "epoch": 1.0283699520094023, + "eval_entropy": 1.3678371334075927, + "eval_loss": 0.14646433293819427, + "eval_mean_token_accuracy": 0.9666456254323323, + "eval_num_tokens": 4445073229.0, + "eval_runtime": 748.9165, + "eval_samples_per_second": 12.893, + "eval_steps_per_second": 0.101, + "step": 31500 + }, + { + "entropy": 1.3680097246170044, + "epoch": 1.03000228526656, + "grad_norm": 2.1875, + "learning_rate": 3.099415000643216e-06, + "loss": 0.125, + "mean_token_accuracy": 0.9707134962081909, + "num_tokens": 4452143066.0, + "step": 31550 + }, + { + "entropy": 1.3657612824440002, + "epoch": 1.0316346185237177, + "grad_norm": 1.0703125, + "learning_rate": 3.0913219289889375e-06, + "loss": 0.1399, + "mean_token_accuracy": 0.9681135547161103, + "num_tokens": 4459565328.0, + "step": 31600 + }, + { + "entropy": 1.3669775104522706, + "epoch": 1.0332669517808757, + "grad_norm": 1.1953125, + "learning_rate": 3.083228192063883e-06, + "loss": 0.1296, + "mean_token_accuracy": 0.9709093308448792, + "num_tokens": 4466669853.0, + "step": 31650 + }, + { + "entropy": 1.373291413784027, + "epoch": 1.0348992850380334, + "grad_norm": 1.6171875, + "learning_rate": 3.0751338488300846e-06, + "loss": 0.1319, + "mean_token_accuracy": 0.969772047996521, + "num_tokens": 4473341935.0, + "step": 31700 + }, + { + "entropy": 1.3762017822265624, + "epoch": 1.0365316182951911, + "grad_norm": 1.859375, + "learning_rate": 3.0670389582539956e-06, + "loss": 0.138, + "mean_token_accuracy": 0.968094003200531, + "num_tokens": 4480827308.0, + "step": 31750 + }, + { + "entropy": 1.3670384407043457, + "epoch": 1.0381639515523489, + "grad_norm": 1.4375, + "learning_rate": 3.0589435793060506e-06, + "loss": 0.1257, + "mean_token_accuracy": 0.9709041547775269, + "num_tokens": 4487979209.0, + "step": 31800 + }, + { + "entropy": 1.3619010615348817, + "epoch": 1.0397962848095068, + "grad_norm": 1.9140625, + "learning_rate": 3.050847770960248e-06, + "loss": 0.1182, + "mean_token_accuracy": 0.9726350855827331, + "num_tokens": 4494810042.0, + "step": 31850 + }, + { + "entropy": 1.3843055248260498, + "epoch": 1.0414286180666645, + "grad_norm": 1.546875, + "learning_rate": 3.0427515921937097e-06, + "loss": 0.1398, + "mean_token_accuracy": 0.9685020220279693, + "num_tokens": 4502522702.0, + "step": 31900 + }, + { + "entropy": 1.3764786529541015, + "epoch": 1.0430609513238223, + "grad_norm": 1.1875, + "learning_rate": 3.034655101986258e-06, + "loss": 0.1399, + "mean_token_accuracy": 0.9681815671920776, + "num_tokens": 4509891561.0, + "step": 31950 + }, + { + "entropy": 1.3768756079673767, + "epoch": 1.04469328458098, + "grad_norm": 1.1328125, + "learning_rate": 3.026558359319985e-06, + "loss": 0.1378, + "mean_token_accuracy": 0.9689823544025421, + "num_tokens": 4517228622.0, + "step": 32000 + }, + { + "epoch": 1.04469328458098, + "eval_entropy": 1.3686085001627604, + "eval_loss": 0.14597955346107483, + "eval_mean_token_accuracy": 0.9667494138081868, + "eval_num_tokens": 4517228622.0, + "eval_runtime": 756.2048, + "eval_samples_per_second": 12.769, + "eval_steps_per_second": 0.101, + "step": 32000 + }, + { + "entropy": 1.3622459721565248, + "epoch": 1.046325617838138, + "grad_norm": 1.28125, + "learning_rate": 3.01846142317882e-06, + "loss": 0.1246, + "mean_token_accuracy": 0.971124712228775, + "num_tokens": 4524548751.0, + "step": 32050 + }, + { + "entropy": 1.360032732486725, + "epoch": 1.0479579510952957, + "grad_norm": 1.9296875, + "learning_rate": 3.0103643525481026e-06, + "loss": 0.1299, + "mean_token_accuracy": 0.970292786359787, + "num_tokens": 4531455869.0, + "step": 32100 + }, + { + "entropy": 1.3518124055862426, + "epoch": 1.0495902843524534, + "grad_norm": 3.109375, + "learning_rate": 3.0022672064141524e-06, + "loss": 0.1269, + "mean_token_accuracy": 0.971394385099411, + "num_tokens": 4538509160.0, + "step": 32150 + }, + { + "entropy": 1.3580170464515686, + "epoch": 1.051222617609611, + "grad_norm": 1.7734375, + "learning_rate": 2.9941700437638386e-06, + "loss": 0.1289, + "mean_token_accuracy": 0.9704779148101806, + "num_tokens": 4545863027.0, + "step": 32200 + }, + { + "entropy": 1.3661806869506836, + "epoch": 1.052854950866769, + "grad_norm": 1.3359375, + "learning_rate": 2.986072923584151e-06, + "loss": 0.1374, + "mean_token_accuracy": 0.9684454727172852, + "num_tokens": 4553360974.0, + "step": 32250 + }, + { + "entropy": 1.3564281272888183, + "epoch": 1.0544872841239268, + "grad_norm": 2.15625, + "learning_rate": 2.9779759048617704e-06, + "loss": 0.1416, + "mean_token_accuracy": 0.9682377851009369, + "num_tokens": 4560701368.0, + "step": 32300 + }, + { + "entropy": 1.3551061296463012, + "epoch": 1.0561196173810845, + "grad_norm": 2.484375, + "learning_rate": 2.9698790465826377e-06, + "loss": 0.1241, + "mean_token_accuracy": 0.9714620614051819, + "num_tokens": 4567111828.0, + "step": 32350 + }, + { + "entropy": 1.3527950978279113, + "epoch": 1.0577519506382422, + "grad_norm": 1.546875, + "learning_rate": 2.961782407731525e-06, + "loss": 0.1337, + "mean_token_accuracy": 0.9692088150978089, + "num_tokens": 4574301428.0, + "step": 32400 + }, + { + "entropy": 1.3572787022590638, + "epoch": 1.0593842838954002, + "grad_norm": 1.5625, + "learning_rate": 2.953686047291606e-06, + "loss": 0.1286, + "mean_token_accuracy": 0.9706799817085267, + "num_tokens": 4581456456.0, + "step": 32450 + }, + { + "entropy": 1.3538258695602416, + "epoch": 1.061016617152558, + "grad_norm": 1.9765625, + "learning_rate": 2.945590024244026e-06, + "loss": 0.1297, + "mean_token_accuracy": 0.9701016509532928, + "num_tokens": 4588137631.0, + "step": 32500 + }, + { + "epoch": 1.061016617152558, + "eval_entropy": 1.3580620272954305, + "eval_loss": 0.14589445292949677, + "eval_mean_token_accuracy": 0.9668857765197754, + "eval_num_tokens": 4588137631.0, + "eval_runtime": 748.9222, + "eval_samples_per_second": 12.893, + "eval_steps_per_second": 0.101, + "step": 32500 + }, + { + "entropy": 1.361629192829132, + "epoch": 1.0626489504097156, + "grad_norm": 2.0625, + "learning_rate": 2.9374943975674745e-06, + "loss": 0.138, + "mean_token_accuracy": 0.9691119182109833, + "num_tokens": 4595619130.0, + "step": 32550 + }, + { + "entropy": 1.3559014773368836, + "epoch": 1.0642812836668734, + "grad_norm": 1.171875, + "learning_rate": 2.92939922623775e-06, + "loss": 0.1239, + "mean_token_accuracy": 0.9716914188861847, + "num_tokens": 4602722754.0, + "step": 32600 + }, + { + "entropy": 1.3606642532348632, + "epoch": 1.0659136169240313, + "grad_norm": 1.4140625, + "learning_rate": 2.921304569227337e-06, + "loss": 0.1308, + "mean_token_accuracy": 0.9696120321750641, + "num_tokens": 4609681202.0, + "step": 32650 + }, + { + "entropy": 1.3540519714355468, + "epoch": 1.067545950181189, + "grad_norm": 1.265625, + "learning_rate": 2.913210485504971e-06, + "loss": 0.1191, + "mean_token_accuracy": 0.972172474861145, + "num_tokens": 4616745205.0, + "step": 32700 + }, + { + "entropy": 1.3715915560722352, + "epoch": 1.0691782834383468, + "grad_norm": 0.37109375, + "learning_rate": 2.9051170340352125e-06, + "loss": 0.1348, + "mean_token_accuracy": 0.9693544006347656, + "num_tokens": 4624203423.0, + "step": 32750 + }, + { + "entropy": 1.3620137906074523, + "epoch": 1.0708106166955045, + "grad_norm": 0.2353515625, + "learning_rate": 2.8970242737780152e-06, + "loss": 0.1319, + "mean_token_accuracy": 0.9693595457077027, + "num_tokens": 4631098385.0, + "step": 32800 + }, + { + "entropy": 1.3573533582687378, + "epoch": 1.0724429499526624, + "grad_norm": 1.5859375, + "learning_rate": 2.8889322636882975e-06, + "loss": 0.1149, + "mean_token_accuracy": 0.9735664069652558, + "num_tokens": 4637689978.0, + "step": 32850 + }, + { + "entropy": 1.3642809319496154, + "epoch": 1.0740752832098202, + "grad_norm": 1.8671875, + "learning_rate": 2.8808410627155142e-06, + "loss": 0.1288, + "mean_token_accuracy": 0.9694396567344665, + "num_tokens": 4644751687.0, + "step": 32900 + }, + { + "entropy": 1.34871666431427, + "epoch": 1.0757076164669779, + "grad_norm": 1.234375, + "learning_rate": 2.8727507298032246e-06, + "loss": 0.1298, + "mean_token_accuracy": 0.9703471696376801, + "num_tokens": 4651717972.0, + "step": 32950 + }, + { + "entropy": 1.3684996843338013, + "epoch": 1.0773399497241356, + "grad_norm": 1.390625, + "learning_rate": 2.864661323888664e-06, + "loss": 0.1341, + "mean_token_accuracy": 0.9694813418388367, + "num_tokens": 4659247374.0, + "step": 33000 + }, + { + "epoch": 1.0773399497241356, + "eval_entropy": 1.3517811473210652, + "eval_loss": 0.14623871445655823, + "eval_mean_token_accuracy": 0.9667705456415813, + "eval_num_tokens": 4659247374.0, + "eval_runtime": 756.2099, + "eval_samples_per_second": 12.769, + "eval_steps_per_second": 0.101, + "step": 33000 + }, + { + "entropy": 1.3631637930870055, + "epoch": 1.0789722829812936, + "grad_norm": 1.40625, + "learning_rate": 2.8565729039023154e-06, + "loss": 0.134, + "mean_token_accuracy": 0.9690206825733185, + "num_tokens": 4666487708.0, + "step": 33050 + }, + { + "entropy": 1.3550728225708009, + "epoch": 1.0806046162384513, + "grad_norm": 1.96875, + "learning_rate": 2.8484855287674787e-06, + "loss": 0.139, + "mean_token_accuracy": 0.9683072865009308, + "num_tokens": 4673679571.0, + "step": 33100 + }, + { + "entropy": 1.3468902921676635, + "epoch": 1.082236949495609, + "grad_norm": 0.95703125, + "learning_rate": 2.8403992573998416e-06, + "loss": 0.1287, + "mean_token_accuracy": 0.9701869285106659, + "num_tokens": 4680648568.0, + "step": 33150 + }, + { + "entropy": 1.3574704766273498, + "epoch": 1.0838692827527667, + "grad_norm": 2.0, + "learning_rate": 2.8323141487070544e-06, + "loss": 0.1252, + "mean_token_accuracy": 0.9709026992321015, + "num_tokens": 4687624851.0, + "step": 33200 + }, + { + "entropy": 1.3406636595726014, + "epoch": 1.0855016160099247, + "grad_norm": 1.8359375, + "learning_rate": 2.824230261588294e-06, + "loss": 0.123, + "mean_token_accuracy": 0.9712547302246094, + "num_tokens": 4694450430.0, + "step": 33250 + }, + { + "entropy": 1.3555320692062378, + "epoch": 1.0871339492670824, + "grad_norm": 1.890625, + "learning_rate": 2.816147654933839e-06, + "loss": 0.137, + "mean_token_accuracy": 0.9693058180809021, + "num_tokens": 4701897095.0, + "step": 33300 + }, + { + "entropy": 1.3499479746818543, + "epoch": 1.0887662825242401, + "grad_norm": 1.0703125, + "learning_rate": 2.8080663876246394e-06, + "loss": 0.1231, + "mean_token_accuracy": 0.9722630488872528, + "num_tokens": 4708881122.0, + "step": 33350 + }, + { + "entropy": 1.3455316138267517, + "epoch": 1.0903986157813979, + "grad_norm": 1.8515625, + "learning_rate": 2.79998651853189e-06, + "loss": 0.1199, + "mean_token_accuracy": 0.9716216671466827, + "num_tokens": 4716074845.0, + "step": 33400 + }, + { + "entropy": 1.359624376296997, + "epoch": 1.0920309490385558, + "grad_norm": 2.109375, + "learning_rate": 2.7919081065165985e-06, + "loss": 0.1257, + "mean_token_accuracy": 0.9711248898506164, + "num_tokens": 4723316565.0, + "step": 33450 + }, + { + "entropy": 1.3494191646575928, + "epoch": 1.0936632822957135, + "grad_norm": 1.53125, + "learning_rate": 2.7838312104291584e-06, + "loss": 0.1323, + "mean_token_accuracy": 0.9694008147716522, + "num_tokens": 4730650888.0, + "step": 33500 + }, + { + "epoch": 1.0936632822957135, + "eval_entropy": 1.353581156730652, + "eval_loss": 0.14585214853286743, + "eval_mean_token_accuracy": 0.9667929395039876, + "eval_num_tokens": 4730650888.0, + "eval_runtime": 754.4564, + "eval_samples_per_second": 12.799, + "eval_steps_per_second": 0.101, + "step": 33500 + }, + { + "entropy": 1.3704795885086059, + "epoch": 1.0952956155528712, + "grad_norm": 1.796875, + "learning_rate": 2.775755889108919e-06, + "loss": 0.134, + "mean_token_accuracy": 0.9687422275543213, + "num_tokens": 4737974169.0, + "step": 33550 + }, + { + "entropy": 1.3608631157875062, + "epoch": 1.096927948810029, + "grad_norm": 2.140625, + "learning_rate": 2.7676822013837588e-06, + "loss": 0.1299, + "mean_token_accuracy": 0.9702245342731476, + "num_tokens": 4745212589.0, + "step": 33600 + }, + { + "entropy": 1.3445592832565307, + "epoch": 1.098560282067187, + "grad_norm": 1.8984375, + "learning_rate": 2.7596102060696543e-06, + "loss": 0.123, + "mean_token_accuracy": 0.9716692876815796, + "num_tokens": 4752104456.0, + "step": 33650 + }, + { + "entropy": 1.3646861577033997, + "epoch": 1.1001926153243446, + "grad_norm": 2.078125, + "learning_rate": 2.7515399619702545e-06, + "loss": 0.1286, + "mean_token_accuracy": 0.9707652199268341, + "num_tokens": 4759026284.0, + "step": 33700 + }, + { + "entropy": 1.3495210075378419, + "epoch": 1.1018249485815024, + "grad_norm": 1.6015625, + "learning_rate": 2.7434715278764494e-06, + "loss": 0.1298, + "mean_token_accuracy": 0.9711257350444794, + "num_tokens": 4765987412.0, + "step": 33750 + }, + { + "entropy": 1.3825481986999513, + "epoch": 1.10345728183866, + "grad_norm": 1.03125, + "learning_rate": 2.735404962565945e-06, + "loss": 0.1417, + "mean_token_accuracy": 0.9679384648799896, + "num_tokens": 4773475530.0, + "step": 33800 + }, + { + "entropy": 1.3517045164108277, + "epoch": 1.105089615095818, + "grad_norm": 2.515625, + "learning_rate": 2.7273403248028325e-06, + "loss": 0.1183, + "mean_token_accuracy": 0.9723455941677094, + "num_tokens": 4780068847.0, + "step": 33850 + }, + { + "entropy": 1.3678963851928712, + "epoch": 1.1067219483529758, + "grad_norm": 1.3125, + "learning_rate": 2.7192776733371608e-06, + "loss": 0.1354, + "mean_token_accuracy": 0.9694834208488464, + "num_tokens": 4786993711.0, + "step": 33900 + }, + { + "entropy": 1.3571431303024293, + "epoch": 1.1083542816101335, + "grad_norm": 2.15625, + "learning_rate": 2.711217066904509e-06, + "loss": 0.1212, + "mean_token_accuracy": 0.9715266978740692, + "num_tokens": 4793644363.0, + "step": 33950 + }, + { + "entropy": 1.3623171138763428, + "epoch": 1.1099866148672912, + "grad_norm": 1.59375, + "learning_rate": 2.7031585642255596e-06, + "loss": 0.1279, + "mean_token_accuracy": 0.970818110704422, + "num_tokens": 4800772089.0, + "step": 34000 + }, + { + "epoch": 1.1099866148672912, + "eval_entropy": 1.3600939814249675, + "eval_loss": 0.14549146592617035, + "eval_mean_token_accuracy": 0.9668245681126912, + "eval_num_tokens": 4800772089.0, + "eval_runtime": 753.1399, + "eval_samples_per_second": 12.821, + "eval_steps_per_second": 0.101, + "step": 34000 + }, + { + "entropy": 1.3657611656188964, + "epoch": 1.1116189481244492, + "grad_norm": 1.375, + "learning_rate": 2.695102224005667e-06, + "loss": 0.1312, + "mean_token_accuracy": 0.9701169800758361, + "num_tokens": 4808026786.0, + "step": 34050 + }, + { + "entropy": 1.3706095337867736, + "epoch": 1.113251281381607, + "grad_norm": 0.002044677734375, + "learning_rate": 2.687048104934434e-06, + "loss": 0.1344, + "mean_token_accuracy": 0.9694833195209503, + "num_tokens": 4815351616.0, + "step": 34100 + }, + { + "entropy": 1.3567158889770508, + "epoch": 1.1148836146387646, + "grad_norm": 1.2578125, + "learning_rate": 2.6789962656852835e-06, + "loss": 0.1273, + "mean_token_accuracy": 0.9710471928119659, + "num_tokens": 4822489624.0, + "step": 34150 + }, + { + "entropy": 1.3655909848213197, + "epoch": 1.1165159478959223, + "grad_norm": 0.3984375, + "learning_rate": 2.6709467649150276e-06, + "loss": 0.138, + "mean_token_accuracy": 0.9684849452972412, + "num_tokens": 4830165631.0, + "step": 34200 + }, + { + "entropy": 1.3546900820732117, + "epoch": 1.1181482811530803, + "grad_norm": 1.796875, + "learning_rate": 2.662899661263445e-06, + "loss": 0.1259, + "mean_token_accuracy": 0.9711956691741943, + "num_tokens": 4836745329.0, + "step": 34250 + }, + { + "entropy": 1.361421148777008, + "epoch": 1.119780614410238, + "grad_norm": 2.46875, + "learning_rate": 2.654855013352849e-06, + "loss": 0.1297, + "mean_token_accuracy": 0.9703176605701447, + "num_tokens": 4843917385.0, + "step": 34300 + }, + { + "entropy": 1.3671817374229431, + "epoch": 1.1214129476673957, + "grad_norm": 2.34375, + "learning_rate": 2.646812879787668e-06, + "loss": 0.1262, + "mean_token_accuracy": 0.9710332584381104, + "num_tokens": 4850936707.0, + "step": 34350 + }, + { + "entropy": 1.368682358264923, + "epoch": 1.1230452809245535, + "grad_norm": 1.3046875, + "learning_rate": 2.6387733191540083e-06, + "loss": 0.1271, + "mean_token_accuracy": 0.9702812135219574, + "num_tokens": 4857774583.0, + "step": 34400 + }, + { + "entropy": 1.357949526309967, + "epoch": 1.1246776141817114, + "grad_norm": 1.03125, + "learning_rate": 2.6307363900192354e-06, + "loss": 0.1344, + "mean_token_accuracy": 0.9693097794055938, + "num_tokens": 4864886795.0, + "step": 34450 + }, + { + "entropy": 1.367612702846527, + "epoch": 1.1263099474388691, + "grad_norm": 2.21875, + "learning_rate": 2.6227021509315442e-06, + "loss": 0.1312, + "mean_token_accuracy": 0.9697531294822693, + "num_tokens": 4872140576.0, + "step": 34500 + }, + { + "epoch": 1.1263099474388691, + "eval_entropy": 1.3573125632603964, + "eval_loss": 0.14544960856437683, + "eval_mean_token_accuracy": 0.9669329651196797, + "eval_num_tokens": 4872140576.0, + "eval_runtime": 753.8369, + "eval_samples_per_second": 12.809, + "eval_steps_per_second": 0.101, + "step": 34500 + }, + { + "entropy": 1.3483564281463623, + "epoch": 1.1279422806960269, + "grad_norm": 1.5, + "learning_rate": 2.614670660419533e-06, + "loss": 0.1174, + "mean_token_accuracy": 0.9726065421104431, + "num_tokens": 4879225657.0, + "step": 34550 + }, + { + "entropy": 1.3606638669967652, + "epoch": 1.1295746139531846, + "grad_norm": 2.40625, + "learning_rate": 2.606641976991775e-06, + "loss": 0.1254, + "mean_token_accuracy": 0.9706631207466125, + "num_tokens": 4886242099.0, + "step": 34600 + }, + { + "entropy": 1.3547830367088318, + "epoch": 1.1312069472103425, + "grad_norm": 1.1953125, + "learning_rate": 2.5986161591363984e-06, + "loss": 0.1294, + "mean_token_accuracy": 0.9702156925201416, + "num_tokens": 4892983817.0, + "step": 34650 + }, + { + "entropy": 1.3496627926826477, + "epoch": 1.1328392804675003, + "grad_norm": 2.046875, + "learning_rate": 2.590593265320652e-06, + "loss": 0.1236, + "mean_token_accuracy": 0.9711934244632721, + "num_tokens": 4900048536.0, + "step": 34700 + }, + { + "entropy": 1.3515707707405091, + "epoch": 1.134471613724658, + "grad_norm": 1.9921875, + "learning_rate": 2.582573353990486e-06, + "loss": 0.1279, + "mean_token_accuracy": 0.9710315072536468, + "num_tokens": 4906893370.0, + "step": 34750 + }, + { + "entropy": 1.3535165977478028, + "epoch": 1.1361039469818157, + "grad_norm": 1.890625, + "learning_rate": 2.5745564835701206e-06, + "loss": 0.1204, + "mean_token_accuracy": 0.9720440351963043, + "num_tokens": 4913536928.0, + "step": 34800 + }, + { + "entropy": 1.3644448471069337, + "epoch": 1.1377362802389737, + "grad_norm": 1.8046875, + "learning_rate": 2.5665427124616256e-06, + "loss": 0.1317, + "mean_token_accuracy": 0.970216943025589, + "num_tokens": 4920499397.0, + "step": 34850 + }, + { + "entropy": 1.3579574704170227, + "epoch": 1.1393686134961314, + "grad_norm": 2.265625, + "learning_rate": 2.5585320990444923e-06, + "loss": 0.1277, + "mean_token_accuracy": 0.970561819076538, + "num_tokens": 4928000813.0, + "step": 34900 + }, + { + "entropy": 1.3763416075706483, + "epoch": 1.1410009467532891, + "grad_norm": 1.3046875, + "learning_rate": 2.550524701675208e-06, + "loss": 0.1359, + "mean_token_accuracy": 0.9684149813652039, + "num_tokens": 4935032419.0, + "step": 34950 + }, + { + "entropy": 1.3512770438194275, + "epoch": 1.1426332800104468, + "grad_norm": 1.2890625, + "learning_rate": 2.542520578686831e-06, + "loss": 0.1211, + "mean_token_accuracy": 0.971741670370102, + "num_tokens": 4941410711.0, + "step": 35000 + }, + { + "epoch": 1.1426332800104468, + "eval_entropy": 1.3627102088928222, + "eval_loss": 0.14536549150943756, + "eval_mean_token_accuracy": 0.9668985676765441, + "eval_num_tokens": 4941410711.0, + "eval_runtime": 752.233, + "eval_samples_per_second": 12.836, + "eval_steps_per_second": 0.101, + "step": 35000 + }, + { + "entropy": 1.3602009153366088, + "epoch": 1.1442656132676048, + "grad_norm": 1.1796875, + "learning_rate": 2.5345197883885677e-06, + "loss": 0.1279, + "mean_token_accuracy": 0.9709439516067505, + "num_tokens": 4948340334.0, + "step": 35050 + }, + { + "entropy": 1.3565789103507995, + "epoch": 1.1458979465247625, + "grad_norm": 1.3515625, + "learning_rate": 2.526522389065345e-06, + "loss": 0.1356, + "mean_token_accuracy": 0.9685303854942322, + "num_tokens": 4956282307.0, + "step": 35100 + }, + { + "entropy": 1.3583333039283751, + "epoch": 1.1475302797819202, + "grad_norm": 1.9296875, + "learning_rate": 2.518528438977387e-06, + "loss": 0.1214, + "mean_token_accuracy": 0.9718796277046203, + "num_tokens": 4963542094.0, + "step": 35150 + }, + { + "entropy": 1.35888774394989, + "epoch": 1.149162613039078, + "grad_norm": 1.84375, + "learning_rate": 2.51053799635979e-06, + "loss": 0.1341, + "mean_token_accuracy": 0.9693203794956208, + "num_tokens": 4970941813.0, + "step": 35200 + }, + { + "entropy": 1.3589514350891114, + "epoch": 1.150794946296236, + "grad_norm": 1.328125, + "learning_rate": 2.5025511194221e-06, + "loss": 0.1371, + "mean_token_accuracy": 0.9692372989654541, + "num_tokens": 4978422565.0, + "step": 35250 + }, + { + "entropy": 1.3571251654624938, + "epoch": 1.1524272795533936, + "grad_norm": 1.7734375, + "learning_rate": 2.494567866347887e-06, + "loss": 0.1301, + "mean_token_accuracy": 0.9706821513175964, + "num_tokens": 4985066771.0, + "step": 35300 + }, + { + "entropy": 1.3489887595176697, + "epoch": 1.1540596128105514, + "grad_norm": 1.328125, + "learning_rate": 2.4865882952943194e-06, + "loss": 0.1179, + "mean_token_accuracy": 0.9729493832588196, + "num_tokens": 4991808794.0, + "step": 35350 + }, + { + "entropy": 1.3580448365211486, + "epoch": 1.155691946067709, + "grad_norm": 1.5625, + "learning_rate": 2.478612464391746e-06, + "loss": 0.1267, + "mean_token_accuracy": 0.9707785761356353, + "num_tokens": 4998905781.0, + "step": 35400 + }, + { + "entropy": 1.35804701089859, + "epoch": 1.157324279324867, + "grad_norm": 1.8203125, + "learning_rate": 2.470640431743268e-06, + "loss": 0.1381, + "mean_token_accuracy": 0.9687949836254119, + "num_tokens": 5006205777.0, + "step": 35450 + }, + { + "entropy": 1.361290261745453, + "epoch": 1.1589566125820248, + "grad_norm": 1.4375, + "learning_rate": 2.4626722554243144e-06, + "loss": 0.1272, + "mean_token_accuracy": 0.9700055694580079, + "num_tokens": 5013371710.0, + "step": 35500 + }, + { + "epoch": 1.1589566125820248, + "eval_entropy": 1.359473959604899, + "eval_loss": 0.14513231813907623, + "eval_mean_token_accuracy": 0.9669572798411051, + "eval_num_tokens": 5013371710.0, + "eval_runtime": 750.7651, + "eval_samples_per_second": 12.862, + "eval_steps_per_second": 0.101, + "step": 35500 + }, + { + "entropy": 1.3512133407592772, + "epoch": 1.1605889458391825, + "grad_norm": 2.234375, + "learning_rate": 2.454707993482224e-06, + "loss": 0.1272, + "mean_token_accuracy": 0.9703232657909393, + "num_tokens": 5020188118.0, + "step": 35550 + }, + { + "entropy": 1.3578423738479615, + "epoch": 1.1622212790963404, + "grad_norm": 1.5, + "learning_rate": 2.446747703935818e-06, + "loss": 0.1222, + "mean_token_accuracy": 0.9719510304927826, + "num_tokens": 5026929276.0, + "step": 35600 + }, + { + "entropy": 1.3523945426940918, + "epoch": 1.1638536123534982, + "grad_norm": 1.3125, + "learning_rate": 2.4387914447749802e-06, + "loss": 0.131, + "mean_token_accuracy": 0.9701538634300232, + "num_tokens": 5034103966.0, + "step": 35650 + }, + { + "entropy": 1.3755294966697693, + "epoch": 1.1654859456106559, + "grad_norm": 1.359375, + "learning_rate": 2.4308392739602323e-06, + "loss": 0.138, + "mean_token_accuracy": 0.968312075138092, + "num_tokens": 5041450508.0, + "step": 35700 + }, + { + "entropy": 1.3669376826286317, + "epoch": 1.1671182788678136, + "grad_norm": 1.2109375, + "learning_rate": 2.4228912494223137e-06, + "loss": 0.1333, + "mean_token_accuracy": 0.9692367768287659, + "num_tokens": 5048332405.0, + "step": 35750 + }, + { + "entropy": 1.371087245941162, + "epoch": 1.1687506121249713, + "grad_norm": 2.375, + "learning_rate": 2.414947429061759e-06, + "loss": 0.1322, + "mean_token_accuracy": 0.9700936663150788, + "num_tokens": 5055257003.0, + "step": 35800 + }, + { + "entropy": 1.374721155166626, + "epoch": 1.1703829453821293, + "grad_norm": 1.828125, + "learning_rate": 2.4070078707484743e-06, + "loss": 0.1387, + "mean_token_accuracy": 0.9685248970985413, + "num_tokens": 5062113906.0, + "step": 35850 + }, + { + "entropy": 1.373306679725647, + "epoch": 1.172015278639287, + "grad_norm": 1.34375, + "learning_rate": 2.399072632321319e-06, + "loss": 0.1278, + "mean_token_accuracy": 0.9704372000694275, + "num_tokens": 5069153056.0, + "step": 35900 + }, + { + "entropy": 1.362773072719574, + "epoch": 1.1736476118964447, + "grad_norm": 3.0, + "learning_rate": 2.3911417715876806e-06, + "loss": 0.1175, + "mean_token_accuracy": 0.9727724301815033, + "num_tokens": 5075547365.0, + "step": 35950 + }, + { + "entropy": 1.361773042678833, + "epoch": 1.1752799451536027, + "grad_norm": 2.265625, + "learning_rate": 2.383215346323058e-06, + "loss": 0.1318, + "mean_token_accuracy": 0.9693261981010437, + "num_tokens": 5082553584.0, + "step": 36000 + }, + { + "epoch": 1.1752799451536027, + "eval_entropy": 1.3677510404586792, + "eval_loss": 0.1451091319322586, + "eval_mean_token_accuracy": 0.9669363768895467, + "eval_num_tokens": 5082553584.0, + "eval_runtime": 752.6099, + "eval_samples_per_second": 12.83, + "eval_steps_per_second": 0.101, + "step": 36000 + }, + { + "entropy": 1.367734661102295, + "epoch": 1.1769122784107604, + "grad_norm": 2.21875, + "learning_rate": 2.3752934142706355e-06, + "loss": 0.1292, + "mean_token_accuracy": 0.970736186504364, + "num_tokens": 5089460622.0, + "step": 36050 + }, + { + "entropy": 1.3702288055419922, + "epoch": 1.1785446116679181, + "grad_norm": 1.6796875, + "learning_rate": 2.3673760331408664e-06, + "loss": 0.1245, + "mean_token_accuracy": 0.9706618010997772, + "num_tokens": 5096477290.0, + "step": 36100 + }, + { + "entropy": 1.3666929292678833, + "epoch": 1.1801769449250759, + "grad_norm": 2.109375, + "learning_rate": 2.3594632606110514e-06, + "loss": 0.1348, + "mean_token_accuracy": 0.9692357456684113, + "num_tokens": 5103888303.0, + "step": 36150 + }, + { + "entropy": 1.3703182339668274, + "epoch": 1.1818092781822336, + "grad_norm": 1.484375, + "learning_rate": 2.351555154324916e-06, + "loss": 0.1352, + "mean_token_accuracy": 0.9687790739536285, + "num_tokens": 5111598482.0, + "step": 36200 + }, + { + "entropy": 1.3647811126708984, + "epoch": 1.1834416114393915, + "grad_norm": 2.46875, + "learning_rate": 2.3436517718921944e-06, + "loss": 0.123, + "mean_token_accuracy": 0.9715970456600189, + "num_tokens": 5118205523.0, + "step": 36250 + }, + { + "entropy": 1.3724281644821168, + "epoch": 1.1850739446965493, + "grad_norm": 2.09375, + "learning_rate": 2.3357531708882084e-06, + "loss": 0.1351, + "mean_token_accuracy": 0.9688549792766571, + "num_tokens": 5125390728.0, + "step": 36300 + }, + { + "entropy": 1.3688122749328613, + "epoch": 1.186706277953707, + "grad_norm": 1.3125, + "learning_rate": 2.3278594088534453e-06, + "loss": 0.1248, + "mean_token_accuracy": 0.9709884691238403, + "num_tokens": 5132430170.0, + "step": 36350 + }, + { + "entropy": 1.371947205066681, + "epoch": 1.188338611210865, + "grad_norm": 1.4140625, + "learning_rate": 2.319970543293144e-06, + "loss": 0.1299, + "mean_token_accuracy": 0.9705728948116302, + "num_tokens": 5139505776.0, + "step": 36400 + }, + { + "entropy": 1.3706382060050963, + "epoch": 1.1899709444680227, + "grad_norm": 1.34375, + "learning_rate": 2.3120866316768705e-06, + "loss": 0.1288, + "mean_token_accuracy": 0.9700265216827393, + "num_tokens": 5146615876.0, + "step": 36450 + }, + { + "entropy": 1.3712089610099794, + "epoch": 1.1916032777251804, + "grad_norm": 1.484375, + "learning_rate": 2.3042077314381025e-06, + "loss": 0.1227, + "mean_token_accuracy": 0.9712138116359711, + "num_tokens": 5153410420.0, + "step": 36500 + }, + { + "epoch": 1.1916032777251804, + "eval_entropy": 1.369815084139506, + "eval_loss": 0.14498043060302734, + "eval_mean_token_accuracy": 0.9671436421076457, + "eval_num_tokens": 5153410420.0, + "eval_runtime": 750.4787, + "eval_samples_per_second": 12.866, + "eval_steps_per_second": 0.101, + "step": 36500 + }, + { + "entropy": 1.3633040881156921, + "epoch": 1.193235610982338, + "grad_norm": 1.34375, + "learning_rate": 2.2963338999738103e-06, + "loss": 0.1321, + "mean_token_accuracy": 0.9702033531665802, + "num_tokens": 5160782174.0, + "step": 36550 + }, + { + "entropy": 1.3697287273406982, + "epoch": 1.1948679442394958, + "grad_norm": 1.625, + "learning_rate": 2.288465194644041e-06, + "loss": 0.1346, + "mean_token_accuracy": 0.9692202270030975, + "num_tokens": 5168040473.0, + "step": 36600 + }, + { + "entropy": 1.3875136041641236, + "epoch": 1.1965002774966538, + "grad_norm": 2.34375, + "learning_rate": 2.2806016727714953e-06, + "loss": 0.1355, + "mean_token_accuracy": 0.9688812565803527, + "num_tokens": 5175213322.0, + "step": 36650 + }, + { + "entropy": 1.3730654883384705, + "epoch": 1.1981326107538115, + "grad_norm": 1.078125, + "learning_rate": 2.272743391641114e-06, + "loss": 0.123, + "mean_token_accuracy": 0.9711851370334625, + "num_tokens": 5182005797.0, + "step": 36700 + }, + { + "entropy": 1.3739676403999328, + "epoch": 1.1997649440109692, + "grad_norm": 1.7734375, + "learning_rate": 2.2648904084996593e-06, + "loss": 0.1329, + "mean_token_accuracy": 0.9690020906925202, + "num_tokens": 5188915363.0, + "step": 36750 + }, + { + "entropy": 1.3796244549751282, + "epoch": 1.2013972772681272, + "grad_norm": 1.75, + "learning_rate": 2.2570427805553e-06, + "loss": 0.135, + "mean_token_accuracy": 0.969414986371994, + "num_tokens": 5196503416.0, + "step": 36800 + }, + { + "entropy": 1.3724525594711303, + "epoch": 1.203029610525285, + "grad_norm": 1.078125, + "learning_rate": 2.24920056497719e-06, + "loss": 0.1277, + "mean_token_accuracy": 0.9707681620121003, + "num_tokens": 5203249529.0, + "step": 36850 + }, + { + "entropy": 1.3727708411216737, + "epoch": 1.2046619437824426, + "grad_norm": 1.9296875, + "learning_rate": 2.2413638188950564e-06, + "loss": 0.1301, + "mean_token_accuracy": 0.9704027915000916, + "num_tokens": 5210234077.0, + "step": 36900 + }, + { + "entropy": 1.3519327425956726, + "epoch": 1.2062942770396003, + "grad_norm": 1.4140625, + "learning_rate": 2.2335325993987815e-06, + "loss": 0.1196, + "mean_token_accuracy": 0.9722448754310608, + "num_tokens": 5216853099.0, + "step": 36950 + }, + { + "entropy": 1.365940923690796, + "epoch": 1.207926610296758, + "grad_norm": 1.6328125, + "learning_rate": 2.2257069635379863e-06, + "loss": 0.1328, + "mean_token_accuracy": 0.9699783003330231, + "num_tokens": 5223781418.0, + "step": 37000 + }, + { + "epoch": 1.207926610296758, + "eval_entropy": 1.360196549097697, + "eval_loss": 0.1449788212776184, + "eval_mean_token_accuracy": 0.9670122480392456, + "eval_num_tokens": 5223781418.0, + "eval_runtime": 751.1493, + "eval_samples_per_second": 12.855, + "eval_steps_per_second": 0.101, + "step": 37000 + }, + { + "entropy": 1.360548312664032, + "epoch": 1.209558943553916, + "grad_norm": 1.7109375, + "learning_rate": 2.2178869683216164e-06, + "loss": 0.1262, + "mean_token_accuracy": 0.9713475477695465, + "num_tokens": 5230659746.0, + "step": 37050 + }, + { + "entropy": 1.357577109336853, + "epoch": 1.2111912768110737, + "grad_norm": 1.265625, + "learning_rate": 2.2100726707175246e-06, + "loss": 0.1313, + "mean_token_accuracy": 0.969699913263321, + "num_tokens": 5237707649.0, + "step": 37100 + }, + { + "entropy": 1.3651654267311095, + "epoch": 1.2128236100682315, + "grad_norm": 1.3359375, + "learning_rate": 2.202264127652059e-06, + "loss": 0.1295, + "mean_token_accuracy": 0.9710112833976745, + "num_tokens": 5244771475.0, + "step": 37150 + }, + { + "entropy": 1.3489612317085267, + "epoch": 1.2144559433253894, + "grad_norm": 2.140625, + "learning_rate": 2.1944613960096456e-06, + "loss": 0.1213, + "mean_token_accuracy": 0.9717336785793305, + "num_tokens": 5251743181.0, + "step": 37200 + }, + { + "entropy": 1.355490939617157, + "epoch": 1.2160882765825471, + "grad_norm": 1.6953125, + "learning_rate": 2.1866645326323743e-06, + "loss": 0.1271, + "mean_token_accuracy": 0.9707163834571838, + "num_tokens": 5258283419.0, + "step": 37250 + }, + { + "entropy": 1.3655565786361694, + "epoch": 1.2177206098397049, + "grad_norm": 1.1875, + "learning_rate": 2.1788735943195865e-06, + "loss": 0.1367, + "mean_token_accuracy": 0.9688389587402344, + "num_tokens": 5265823807.0, + "step": 37300 + }, + { + "entropy": 1.349189965724945, + "epoch": 1.2193529430968626, + "grad_norm": 1.3515625, + "learning_rate": 2.171088637827458e-06, + "loss": 0.1318, + "mean_token_accuracy": 0.9697683715820312, + "num_tokens": 5273206881.0, + "step": 37350 + }, + { + "entropy": 1.3544545078277588, + "epoch": 1.2209852763540203, + "grad_norm": 1.03125, + "learning_rate": 2.16330971986859e-06, + "loss": 0.1216, + "mean_token_accuracy": 0.9722874271869659, + "num_tokens": 5280217004.0, + "step": 37400 + }, + { + "entropy": 1.3607108736038207, + "epoch": 1.2226176096111783, + "grad_norm": 1.15625, + "learning_rate": 2.1555368971115926e-06, + "loss": 0.1353, + "mean_token_accuracy": 0.9696134865283966, + "num_tokens": 5287498948.0, + "step": 37450 + }, + { + "entropy": 1.3576147866249084, + "epoch": 1.224249942868336, + "grad_norm": 1.71875, + "learning_rate": 2.147770226180672e-06, + "loss": 0.1276, + "mean_token_accuracy": 0.9706488907337188, + "num_tokens": 5294869687.0, + "step": 37500 + }, + { + "epoch": 1.224249942868336, + "eval_entropy": 1.3614781061808268, + "eval_loss": 0.14473138749599457, + "eval_mean_token_accuracy": 0.9672889757156372, + "eval_num_tokens": 5294869687.0, + "eval_runtime": 751.3614, + "eval_samples_per_second": 12.851, + "eval_steps_per_second": 0.101, + "step": 37500 + }, + { + "entropy": 1.376727077960968, + "epoch": 1.2258822761254937, + "grad_norm": 1.265625, + "learning_rate": 2.1400097636552217e-06, + "loss": 0.1308, + "mean_token_accuracy": 0.9701304376125336, + "num_tokens": 5301938154.0, + "step": 37550 + }, + { + "entropy": 1.3636725115776063, + "epoch": 1.2275146093826517, + "grad_norm": 1.5703125, + "learning_rate": 2.1322555660694053e-06, + "loss": 0.1301, + "mean_token_accuracy": 0.9700271189212799, + "num_tokens": 5308968407.0, + "step": 37600 + }, + { + "entropy": 1.3606876850128173, + "epoch": 1.2291469426398094, + "grad_norm": 1.21875, + "learning_rate": 2.124507689911747e-06, + "loss": 0.1232, + "mean_token_accuracy": 0.9719198334217072, + "num_tokens": 5316027114.0, + "step": 37650 + }, + { + "entropy": 1.3608359265327454, + "epoch": 1.2307792758969671, + "grad_norm": 1.2421875, + "learning_rate": 2.1167661916247203e-06, + "loss": 0.1352, + "mean_token_accuracy": 0.9692904555797577, + "num_tokens": 5323480081.0, + "step": 37700 + }, + { + "entropy": 1.3596609687805177, + "epoch": 1.2324116091541248, + "grad_norm": 1.453125, + "learning_rate": 2.109031127604339e-06, + "loss": 0.1171, + "mean_token_accuracy": 0.9726130926609039, + "num_tokens": 5330170561.0, + "step": 37750 + }, + { + "entropy": 1.36385005235672, + "epoch": 1.2340439424112826, + "grad_norm": 1.1796875, + "learning_rate": 2.10130255419974e-06, + "loss": 0.1271, + "mean_token_accuracy": 0.9708395230770112, + "num_tokens": 5337270272.0, + "step": 37800 + }, + { + "entropy": 1.3592445206642152, + "epoch": 1.2356762756684405, + "grad_norm": 1.75, + "learning_rate": 2.0935805277127794e-06, + "loss": 0.1307, + "mean_token_accuracy": 0.9704990041255951, + "num_tokens": 5344402468.0, + "step": 37850 + }, + { + "entropy": 1.369277856349945, + "epoch": 1.2373086089255982, + "grad_norm": 1.4609375, + "learning_rate": 2.0858651043976183e-06, + "loss": 0.1373, + "mean_token_accuracy": 0.9686707425117492, + "num_tokens": 5351952870.0, + "step": 37900 + }, + { + "entropy": 1.3526391363143921, + "epoch": 1.238940942182756, + "grad_norm": 2.03125, + "learning_rate": 2.0781563404603153e-06, + "loss": 0.1266, + "mean_token_accuracy": 0.9708797204494476, + "num_tokens": 5359011853.0, + "step": 37950 + }, + { + "entropy": 1.358784899711609, + "epoch": 1.240573275439914, + "grad_norm": 1.109375, + "learning_rate": 2.0704542920584153e-06, + "loss": 0.1342, + "mean_token_accuracy": 0.9693182837963105, + "num_tokens": 5366436177.0, + "step": 38000 + }, + { + "epoch": 1.240573275439914, + "eval_entropy": 1.3544676637649535, + "eval_loss": 0.14443761110305786, + "eval_mean_token_accuracy": 0.9673179856936137, + "eval_num_tokens": 5366436177.0, + "eval_runtime": 752.6655, + "eval_samples_per_second": 12.829, + "eval_steps_per_second": 0.101, + "step": 38000 + }, + { + "entropy": 1.344338595867157, + "epoch": 1.2422056086970716, + "grad_norm": 1.8203125, + "learning_rate": 2.0627590153005426e-06, + "loss": 0.1146, + "mean_token_accuracy": 0.9736293482780457, + "num_tokens": 5373144912.0, + "step": 38050 + }, + { + "entropy": 1.3542405033111573, + "epoch": 1.2438379419542294, + "grad_norm": 2.109375, + "learning_rate": 2.0550705662459896e-06, + "loss": 0.1276, + "mean_token_accuracy": 0.9702631950378418, + "num_tokens": 5380308640.0, + "step": 38100 + }, + { + "entropy": 1.3496335220336915, + "epoch": 1.245470275211387, + "grad_norm": 2.015625, + "learning_rate": 2.047389000904309e-06, + "loss": 0.1233, + "mean_token_accuracy": 0.9710812640190124, + "num_tokens": 5386799272.0, + "step": 38150 + }, + { + "entropy": 1.3436848521232605, + "epoch": 1.2471026084685448, + "grad_norm": 1.078125, + "learning_rate": 2.0397143752349084e-06, + "loss": 0.1313, + "mean_token_accuracy": 0.9700379192829132, + "num_tokens": 5393957806.0, + "step": 38200 + }, + { + "entropy": 1.3535588788986206, + "epoch": 1.2487349417257028, + "grad_norm": 2.1875, + "learning_rate": 2.032046745146638e-06, + "loss": 0.1256, + "mean_token_accuracy": 0.9713840389251709, + "num_tokens": 5400892138.0, + "step": 38250 + }, + { + "entropy": 1.35460533618927, + "epoch": 1.2503672749828605, + "grad_norm": 2.046875, + "learning_rate": 2.0243861664973897e-06, + "loss": 0.1271, + "mean_token_accuracy": 0.9702360582351685, + "num_tokens": 5407837431.0, + "step": 38300 + }, + { + "entropy": 1.350141739845276, + "epoch": 1.2519996082400182, + "grad_norm": 1.8828125, + "learning_rate": 2.016732695093681e-06, + "loss": 0.1195, + "mean_token_accuracy": 0.9717020273208619, + "num_tokens": 5414369851.0, + "step": 38350 + }, + { + "entropy": 1.3572440361976623, + "epoch": 1.2536319414971762, + "grad_norm": 1.8828125, + "learning_rate": 2.009086386690259e-06, + "loss": 0.1321, + "mean_token_accuracy": 0.9700393569469452, + "num_tokens": 5421779114.0, + "step": 38400 + }, + { + "entropy": 1.3585280227661132, + "epoch": 1.2552642747543339, + "grad_norm": 0.002349853515625, + "learning_rate": 2.001447296989687e-06, + "loss": 0.1236, + "mean_token_accuracy": 0.9713066399097443, + "num_tokens": 5428733100.0, + "step": 38450 + }, + { + "entropy": 1.3632621765136719, + "epoch": 1.2568966080114916, + "grad_norm": 1.7734375, + "learning_rate": 1.993815481641939e-06, + "loss": 0.1282, + "mean_token_accuracy": 0.9696168947219849, + "num_tokens": 5435549385.0, + "step": 38500 + }, + { + "epoch": 1.2568966080114916, + "eval_entropy": 1.3491881497701008, + "eval_loss": 0.14436551928520203, + "eval_mean_token_accuracy": 0.9672579844792684, + "eval_num_tokens": 5435549385.0, + "eval_runtime": 753.3503, + "eval_samples_per_second": 12.817, + "eval_steps_per_second": 0.101, + "step": 38500 + }, + { + "entropy": 1.3552922701835632, + "epoch": 1.2585289412686493, + "grad_norm": 1.4375, + "learning_rate": 1.9861909962440006e-06, + "loss": 0.1325, + "mean_token_accuracy": 0.9696126735210419, + "num_tokens": 5442577839.0, + "step": 38550 + }, + { + "entropy": 1.3554336166381835, + "epoch": 1.260161274525807, + "grad_norm": 1.875, + "learning_rate": 1.978573896339455e-06, + "loss": 0.1203, + "mean_token_accuracy": 0.9720760262012482, + "num_tokens": 5449083615.0, + "step": 38600 + }, + { + "entropy": 1.3641015887260437, + "epoch": 1.261793607782965, + "grad_norm": 2.046875, + "learning_rate": 1.9709642374180845e-06, + "loss": 0.1304, + "mean_token_accuracy": 0.9704764342308044, + "num_tokens": 5456277556.0, + "step": 38650 + }, + { + "entropy": 1.3642727065086364, + "epoch": 1.2634259410401227, + "grad_norm": 1.984375, + "learning_rate": 1.9633620749154656e-06, + "loss": 0.129, + "mean_token_accuracy": 0.9703558135032654, + "num_tokens": 5462943399.0, + "step": 38700 + }, + { + "entropy": 1.3643340587615966, + "epoch": 1.2650582742972805, + "grad_norm": 1.984375, + "learning_rate": 1.9557674642125618e-06, + "loss": 0.1276, + "mean_token_accuracy": 0.9705350911617279, + "num_tokens": 5470083333.0, + "step": 38750 + }, + { + "entropy": 1.35529381275177, + "epoch": 1.2666906075544384, + "grad_norm": 1.328125, + "learning_rate": 1.9481804606353256e-06, + "loss": 0.1207, + "mean_token_accuracy": 0.9720316576957703, + "num_tokens": 5477264167.0, + "step": 38800 + }, + { + "entropy": 1.347254078388214, + "epoch": 1.2683229408115961, + "grad_norm": 1.2265625, + "learning_rate": 1.9406011194542896e-06, + "loss": 0.1256, + "mean_token_accuracy": 0.9704374611377716, + "num_tokens": 5484197390.0, + "step": 38850 + }, + { + "entropy": 1.3472388553619385, + "epoch": 1.2699552740687539, + "grad_norm": 1.78125, + "learning_rate": 1.933029495884169e-06, + "loss": 0.1166, + "mean_token_accuracy": 0.9729468870162964, + "num_tokens": 5490515143.0, + "step": 38900 + }, + { + "entropy": 1.3586375880241395, + "epoch": 1.2715876073259116, + "grad_norm": 2.609375, + "learning_rate": 1.925465645083455e-06, + "loss": 0.1342, + "mean_token_accuracy": 0.9688045310974122, + "num_tokens": 5497798023.0, + "step": 38950 + }, + { + "entropy": 1.3617107224464418, + "epoch": 1.2732199405830693, + "grad_norm": 1.859375, + "learning_rate": 1.9179096221540163e-06, + "loss": 0.1337, + "mean_token_accuracy": 0.969189600944519, + "num_tokens": 5504996504.0, + "step": 39000 + }, + { + "epoch": 1.2732199405830693, + "eval_entropy": 1.3547975174585978, + "eval_loss": 0.14426733553409576, + "eval_mean_token_accuracy": 0.9672306942939758, + "eval_num_tokens": 5504996504.0, + "eval_runtime": 747.051, + "eval_samples_per_second": 12.925, + "eval_steps_per_second": 0.102, + "step": 39000 + }, + { + "entropy": 1.3557351016998291, + "epoch": 1.2748522738402273, + "grad_norm": 1.3828125, + "learning_rate": 1.910361482140696e-06, + "loss": 0.1248, + "mean_token_accuracy": 0.9716817474365235, + "num_tokens": 5512011287.0, + "step": 39050 + }, + { + "entropy": 1.353394594192505, + "epoch": 1.276484607097385, + "grad_norm": 1.3203125, + "learning_rate": 1.90282128003091e-06, + "loss": 0.12, + "mean_token_accuracy": 0.9717885673046112, + "num_tokens": 5518924203.0, + "step": 39100 + }, + { + "entropy": 1.3534292221069335, + "epoch": 1.2781169403545427, + "grad_norm": 1.234375, + "learning_rate": 1.895289070754249e-06, + "loss": 0.131, + "mean_token_accuracy": 0.969032096862793, + "num_tokens": 5526036302.0, + "step": 39150 + }, + { + "entropy": 1.3387615489959717, + "epoch": 1.2797492736117007, + "grad_norm": 2.421875, + "learning_rate": 1.887764909182076e-06, + "loss": 0.1203, + "mean_token_accuracy": 0.9723163688182831, + "num_tokens": 5532560928.0, + "step": 39200 + }, + { + "entropy": 1.3516933727264404, + "epoch": 1.2813816068688584, + "grad_norm": 1.78125, + "learning_rate": 1.8802488501271259e-06, + "loss": 0.1325, + "mean_token_accuracy": 0.9697006630897522, + "num_tokens": 5539749961.0, + "step": 39250 + }, + { + "entropy": 1.358582181930542, + "epoch": 1.283013940126016, + "grad_norm": 1.2421875, + "learning_rate": 1.8727409483431112e-06, + "loss": 0.1276, + "mean_token_accuracy": 0.9706891429424286, + "num_tokens": 5546991771.0, + "step": 39300 + }, + { + "entropy": 1.3414494490623474, + "epoch": 1.2846462733831738, + "grad_norm": 1.40625, + "learning_rate": 1.8652412585243158e-06, + "loss": 0.1205, + "mean_token_accuracy": 0.9722245049476623, + "num_tokens": 5553461638.0, + "step": 39350 + }, + { + "entropy": 1.3569713354110717, + "epoch": 1.2862786066403316, + "grad_norm": 2.0625, + "learning_rate": 1.8577498353052025e-06, + "loss": 0.1257, + "mean_token_accuracy": 0.9703078508377075, + "num_tokens": 5560423404.0, + "step": 39400 + }, + { + "entropy": 1.3759264373779296, + "epoch": 1.2879109398974895, + "grad_norm": 1.5078125, + "learning_rate": 1.850266733260012e-06, + "loss": 0.1489, + "mean_token_accuracy": 0.9663536393642426, + "num_tokens": 5568028795.0, + "step": 39450 + }, + { + "entropy": 1.3500820732116698, + "epoch": 1.2895432731546472, + "grad_norm": 1.390625, + "learning_rate": 1.8427920069023658e-06, + "loss": 0.1198, + "mean_token_accuracy": 0.9720592284202576, + "num_tokens": 5574696411.0, + "step": 39500 + }, + { + "epoch": 1.2895432731546472, + "eval_entropy": 1.3591709502538045, + "eval_loss": 0.14416085183620453, + "eval_mean_token_accuracy": 0.9672661876678467, + "eval_num_tokens": 5574696411.0, + "eval_runtime": 754.3873, + "eval_samples_per_second": 12.8, + "eval_steps_per_second": 0.101, + "step": 39500 + }, + { + "entropy": 1.363064079284668, + "epoch": 1.291175606411805, + "grad_norm": 1.25, + "learning_rate": 1.8353257106848703e-06, + "loss": 0.1242, + "mean_token_accuracy": 0.9711923873424531, + "num_tokens": 5581679975.0, + "step": 39550 + }, + { + "entropy": 1.356317241191864, + "epoch": 1.292807939668963, + "grad_norm": 1.859375, + "learning_rate": 1.8278678989987178e-06, + "loss": 0.1335, + "mean_token_accuracy": 0.9687803375720978, + "num_tokens": 5589014993.0, + "step": 39600 + }, + { + "entropy": 1.3506816673278808, + "epoch": 1.2944402729261206, + "grad_norm": 1.3046875, + "learning_rate": 1.8204186261732938e-06, + "loss": 0.1313, + "mean_token_accuracy": 0.9694370079040527, + "num_tokens": 5595958511.0, + "step": 39650 + }, + { + "entropy": 1.3653843665122987, + "epoch": 1.2960726061832784, + "grad_norm": 1.78125, + "learning_rate": 1.8129779464757774e-06, + "loss": 0.1327, + "mean_token_accuracy": 0.9690759527683258, + "num_tokens": 5603031399.0, + "step": 39700 + }, + { + "entropy": 1.3516422724723816, + "epoch": 1.297704939440436, + "grad_norm": 2.484375, + "learning_rate": 1.8055459141107477e-06, + "loss": 0.1184, + "mean_token_accuracy": 0.9726696240901948, + "num_tokens": 5610046160.0, + "step": 39750 + }, + { + "entropy": 1.376489179134369, + "epoch": 1.2993372726975938, + "grad_norm": 1.484375, + "learning_rate": 1.7981225832197894e-06, + "loss": 0.1399, + "mean_token_accuracy": 0.9676700031757355, + "num_tokens": 5617399853.0, + "step": 39800 + }, + { + "entropy": 1.345590648651123, + "epoch": 1.3009696059547518, + "grad_norm": 1.46875, + "learning_rate": 1.7907080078810983e-06, + "loss": 0.1242, + "mean_token_accuracy": 0.9713241112232208, + "num_tokens": 5624175713.0, + "step": 39850 + }, + { + "entropy": 1.3524721622467042, + "epoch": 1.3026019392119095, + "grad_norm": 1.421875, + "learning_rate": 1.7833022421090858e-06, + "loss": 0.1226, + "mean_token_accuracy": 0.9718897187709808, + "num_tokens": 5631014007.0, + "step": 39900 + }, + { + "entropy": 1.3629893851280213, + "epoch": 1.3042342724690672, + "grad_norm": 1.09375, + "learning_rate": 1.7759053398539873e-06, + "loss": 0.1312, + "mean_token_accuracy": 0.970218130350113, + "num_tokens": 5638370611.0, + "step": 39950 + }, + { + "entropy": 1.3576352548599244, + "epoch": 1.3058666057262251, + "grad_norm": 1.2890625, + "learning_rate": 1.7685173550014671e-06, + "loss": 0.134, + "mean_token_accuracy": 0.9691074180603028, + "num_tokens": 5645778490.0, + "step": 40000 + }, + { + "epoch": 1.3058666057262251, + "eval_entropy": 1.355252981185913, + "eval_loss": 0.14380384981632233, + "eval_mean_token_accuracy": 0.9673471585909525, + "eval_num_tokens": 5645778490.0, + "eval_runtime": 745.8824, + "eval_samples_per_second": 12.946, + "eval_steps_per_second": 0.102, + "step": 40000 + }, + { + "entropy": 1.3658280086517334, + "epoch": 1.3074989389833829, + "grad_norm": 2.109375, + "learning_rate": 1.7611383413722303e-06, + "loss": 0.1408, + "mean_token_accuracy": 0.9673460054397583, + "num_tokens": 5653261891.0, + "step": 40050 + }, + { + "entropy": 1.3569263100624085, + "epoch": 1.3091312722405406, + "grad_norm": 1.3984375, + "learning_rate": 1.7537683527216242e-06, + "loss": 0.1303, + "mean_token_accuracy": 0.9697596049308777, + "num_tokens": 5660316880.0, + "step": 40100 + }, + { + "entropy": 1.3518936610221863, + "epoch": 1.3107636054976983, + "grad_norm": 1.484375, + "learning_rate": 1.7464074427392512e-06, + "loss": 0.1291, + "mean_token_accuracy": 0.9700168156623841, + "num_tokens": 5667680932.0, + "step": 40150 + }, + { + "entropy": 1.3594465851783752, + "epoch": 1.3123959387548563, + "grad_norm": 1.8984375, + "learning_rate": 1.7390556650485782e-06, + "loss": 0.13, + "mean_token_accuracy": 0.9701092898845672, + "num_tokens": 5674932027.0, + "step": 40200 + }, + { + "entropy": 1.3511109519004822, + "epoch": 1.314028272012014, + "grad_norm": 1.2890625, + "learning_rate": 1.7317130732065411e-06, + "loss": 0.1236, + "mean_token_accuracy": 0.9719385504722595, + "num_tokens": 5681669046.0, + "step": 40250 + }, + { + "entropy": 1.3469841027259826, + "epoch": 1.3156606052691717, + "grad_norm": 1.1015625, + "learning_rate": 1.7243797207031596e-06, + "loss": 0.1204, + "mean_token_accuracy": 0.9718147456645966, + "num_tokens": 5688464363.0, + "step": 40300 + }, + { + "entropy": 1.3597651433944702, + "epoch": 1.3172929385263297, + "grad_norm": 1.65625, + "learning_rate": 1.7170556609611477e-06, + "loss": 0.1321, + "mean_token_accuracy": 0.9700502359867096, + "num_tokens": 5695659707.0, + "step": 40350 + }, + { + "entropy": 1.359071056842804, + "epoch": 1.3189252717834874, + "grad_norm": 2.03125, + "learning_rate": 1.709740947335518e-06, + "loss": 0.1232, + "mean_token_accuracy": 0.9715420746803284, + "num_tokens": 5702611675.0, + "step": 40400 + }, + { + "entropy": 1.3486275053024293, + "epoch": 1.3205576050406451, + "grad_norm": 0.9609375, + "learning_rate": 1.7024356331132025e-06, + "loss": 0.1197, + "mean_token_accuracy": 0.9719861805438995, + "num_tokens": 5709648249.0, + "step": 40450 + }, + { + "entropy": 1.366611111164093, + "epoch": 1.3221899382978028, + "grad_norm": 1.5859375, + "learning_rate": 1.695139771512655e-06, + "loss": 0.1363, + "mean_token_accuracy": 0.96794402718544, + "num_tokens": 5716864381.0, + "step": 40500 + }, + { + "epoch": 1.3221899382978028, + "eval_entropy": 1.3547792641321819, + "eval_loss": 0.14348579943180084, + "eval_mean_token_accuracy": 0.9674063356717427, + "eval_num_tokens": 5716864381.0, + "eval_runtime": 751.715, + "eval_samples_per_second": 12.845, + "eval_steps_per_second": 0.101, + "step": 40500 + }, + { + "entropy": 1.3567886662483215, + "epoch": 1.3238222715549606, + "grad_norm": 1.59375, + "learning_rate": 1.687853415683473e-06, + "loss": 0.118, + "mean_token_accuracy": 0.9730365478992462, + "num_tokens": 5723690263.0, + "step": 40550 + }, + { + "entropy": 1.3625458979606628, + "epoch": 1.3254546048121185, + "grad_norm": 1.140625, + "learning_rate": 1.6805766187059998e-06, + "loss": 0.1359, + "mean_token_accuracy": 0.9688236701488495, + "num_tokens": 5731246956.0, + "step": 40600 + }, + { + "entropy": 1.3589171147346497, + "epoch": 1.3270869380692762, + "grad_norm": 1.4296875, + "learning_rate": 1.6733094335909486e-06, + "loss": 0.1366, + "mean_token_accuracy": 0.968874671459198, + "num_tokens": 5738659112.0, + "step": 40650 + }, + { + "entropy": 1.3442611718177795, + "epoch": 1.328719271326434, + "grad_norm": 1.25, + "learning_rate": 1.666051913279007e-06, + "loss": 0.1259, + "mean_token_accuracy": 0.9707275819778443, + "num_tokens": 5745995037.0, + "step": 40700 + }, + { + "entropy": 1.3413966584205628, + "epoch": 1.330351604583592, + "grad_norm": 1.6171875, + "learning_rate": 1.658804110640458e-06, + "loss": 0.1199, + "mean_token_accuracy": 0.972169394493103, + "num_tokens": 5752781216.0, + "step": 40750 + }, + { + "entropy": 1.3544580388069152, + "epoch": 1.3319839378407496, + "grad_norm": 1.421875, + "learning_rate": 1.6515660784747933e-06, + "loss": 0.1287, + "mean_token_accuracy": 0.9699071037769318, + "num_tokens": 5759681433.0, + "step": 40800 + }, + { + "entropy": 1.3634159827232362, + "epoch": 1.3336162710979074, + "grad_norm": 1.2421875, + "learning_rate": 1.6443378695103233e-06, + "loss": 0.1422, + "mean_token_accuracy": 0.9680401980876923, + "num_tokens": 5767127976.0, + "step": 40850 + }, + { + "entropy": 1.3604375696182252, + "epoch": 1.335248604355065, + "grad_norm": 1.40625, + "learning_rate": 1.6371195364038034e-06, + "loss": 0.1281, + "mean_token_accuracy": 0.9703556621074676, + "num_tokens": 5773885662.0, + "step": 40900 + }, + { + "entropy": 1.367766616344452, + "epoch": 1.3368809376122228, + "grad_norm": 1.3203125, + "learning_rate": 1.6299111317400382e-06, + "loss": 0.1311, + "mean_token_accuracy": 0.9696219575405121, + "num_tokens": 5780884429.0, + "step": 40950 + }, + { + "entropy": 1.3579600000381469, + "epoch": 1.3385132708693808, + "grad_norm": 1.5078125, + "learning_rate": 1.6227127080315103e-06, + "loss": 0.1271, + "mean_token_accuracy": 0.9701367461681366, + "num_tokens": 5787949031.0, + "step": 41000 + }, + { + "epoch": 1.3385132708693808, + "eval_entropy": 1.3570391607284547, + "eval_loss": 0.14324620366096497, + "eval_mean_token_accuracy": 0.9674420754114786, + "eval_num_tokens": 5787949031.0, + "eval_runtime": 754.7744, + "eval_samples_per_second": 12.793, + "eval_steps_per_second": 0.101, + "step": 41000 + }, + { + "entropy": 1.3567052793502807, + "epoch": 1.3401456041265385, + "grad_norm": 1.3671875, + "learning_rate": 1.6155243177179873e-06, + "loss": 0.125, + "mean_token_accuracy": 0.9710537779331208, + "num_tokens": 5795334113.0, + "step": 41050 + }, + { + "entropy": 1.3398216152191162, + "epoch": 1.3417779373836962, + "grad_norm": 0.91796875, + "learning_rate": 1.6083460131661477e-06, + "loss": 0.1133, + "mean_token_accuracy": 0.9733657622337342, + "num_tokens": 5802143914.0, + "step": 41100 + }, + { + "entropy": 1.3610263323783875, + "epoch": 1.3434102706408542, + "grad_norm": 1.265625, + "learning_rate": 1.6011778466691951e-06, + "loss": 0.1231, + "mean_token_accuracy": 0.9718475365638732, + "num_tokens": 5808928463.0, + "step": 41150 + }, + { + "entropy": 1.3601333618164062, + "epoch": 1.345042603898012, + "grad_norm": 1.234375, + "learning_rate": 1.5940198704464793e-06, + "loss": 0.1302, + "mean_token_accuracy": 0.969777444601059, + "num_tokens": 5816047556.0, + "step": 41200 + }, + { + "entropy": 1.36517019033432, + "epoch": 1.3466749371551696, + "grad_norm": 2.265625, + "learning_rate": 1.5868721366431148e-06, + "loss": 0.1311, + "mean_token_accuracy": 0.9697459650039673, + "num_tokens": 5823417847.0, + "step": 41250 + }, + { + "entropy": 1.3562990355491638, + "epoch": 1.3483072704123273, + "grad_norm": 1.8125, + "learning_rate": 1.5797346973295984e-06, + "loss": 0.1294, + "mean_token_accuracy": 0.9702081382274628, + "num_tokens": 5830494397.0, + "step": 41300 + }, + { + "entropy": 1.3457439398765565, + "epoch": 1.349939603669485, + "grad_norm": 1.40625, + "learning_rate": 1.5726076045014376e-06, + "loss": 0.1206, + "mean_token_accuracy": 0.9718204891681671, + "num_tokens": 5837675412.0, + "step": 41350 + }, + { + "entropy": 1.3478499841690064, + "epoch": 1.351571936926643, + "grad_norm": 1.1484375, + "learning_rate": 1.565490910078761e-06, + "loss": 0.1224, + "mean_token_accuracy": 0.971153804063797, + "num_tokens": 5844543024.0, + "step": 41400 + }, + { + "entropy": 1.3584490442276, + "epoch": 1.3532042701838007, + "grad_norm": 1.5703125, + "learning_rate": 1.5583846659059525e-06, + "loss": 0.12, + "mean_token_accuracy": 0.9715865242481232, + "num_tokens": 5851449870.0, + "step": 41450 + }, + { + "entropy": 1.3518057036399842, + "epoch": 1.3548366034409585, + "grad_norm": 1.265625, + "learning_rate": 1.5512889237512604e-06, + "loss": 0.13, + "mean_token_accuracy": 0.9702707767486572, + "num_tokens": 5858314021.0, + "step": 41500 + }, + { + "epoch": 1.3548366034409585, + "eval_entropy": 1.3525403213500977, + "eval_loss": 0.14299461245536804, + "eval_mean_token_accuracy": 0.9675085457166036, + "eval_num_tokens": 5858314021.0, + "eval_runtime": 751.2489, + "eval_samples_per_second": 12.853, + "eval_steps_per_second": 0.101, + "step": 41500 + }, + { + "entropy": 1.3594915199279785, + "epoch": 1.3564689366981164, + "grad_norm": 1.8671875, + "learning_rate": 1.5442037353064314e-06, + "loss": 0.1252, + "mean_token_accuracy": 0.97132617354393, + "num_tokens": 5865594325.0, + "step": 41550 + }, + { + "entropy": 1.352486503124237, + "epoch": 1.3581012699552741, + "grad_norm": 1.3125, + "learning_rate": 1.537129152186329e-06, + "loss": 0.1254, + "mean_token_accuracy": 0.9710965490341187, + "num_tokens": 5872751335.0, + "step": 41600 + }, + { + "entropy": 1.3547274160385132, + "epoch": 1.3597336032124319, + "grad_norm": 1.6015625, + "learning_rate": 1.530065225928555e-06, + "loss": 0.13, + "mean_token_accuracy": 0.9703376948833465, + "num_tokens": 5880128383.0, + "step": 41650 + }, + { + "entropy": 1.3553478455543517, + "epoch": 1.3613659364695896, + "grad_norm": 1.8828125, + "learning_rate": 1.5230120079930814e-06, + "loss": 0.1216, + "mean_token_accuracy": 0.9711664152145386, + "num_tokens": 5887029878.0, + "step": 41700 + }, + { + "entropy": 1.3534876823425293, + "epoch": 1.3629982697267473, + "grad_norm": 2.6875, + "learning_rate": 1.515969549761867e-06, + "loss": 0.1274, + "mean_token_accuracy": 0.9711965453624726, + "num_tokens": 5893747154.0, + "step": 41750 + }, + { + "entropy": 1.360564501285553, + "epoch": 1.3646306029839053, + "grad_norm": 1.4140625, + "learning_rate": 1.5089379025384912e-06, + "loss": 0.1351, + "mean_token_accuracy": 0.969272004365921, + "num_tokens": 5901094204.0, + "step": 41800 + }, + { + "entropy": 1.342657322883606, + "epoch": 1.366262936241063, + "grad_norm": 0.020263671875, + "learning_rate": 1.501917117547772e-06, + "loss": 0.1229, + "mean_token_accuracy": 0.9716498827934266, + "num_tokens": 5908182746.0, + "step": 41850 + }, + { + "entropy": 1.3452470707893371, + "epoch": 1.3678952694982207, + "grad_norm": 1.203125, + "learning_rate": 1.4949072459354022e-06, + "loss": 0.1212, + "mean_token_accuracy": 0.972344673871994, + "num_tokens": 5915128546.0, + "step": 41900 + }, + { + "entropy": 1.3409530735015869, + "epoch": 1.3695276027553787, + "grad_norm": 1.4765625, + "learning_rate": 1.4879083387675666e-06, + "loss": 0.1229, + "mean_token_accuracy": 0.9715113770961762, + "num_tokens": 5922010723.0, + "step": 41950 + }, + { + "entropy": 1.3670923542976379, + "epoch": 1.3711599360125364, + "grad_norm": 2.484375, + "learning_rate": 1.4809204470305788e-06, + "loss": 0.1351, + "mean_token_accuracy": 0.9693290328979492, + "num_tokens": 5929411699.0, + "step": 42000 + }, + { + "epoch": 1.3711599360125364, + "eval_entropy": 1.352181215286255, + "eval_loss": 0.14293250441551208, + "eval_mean_token_accuracy": 0.9675895547866822, + "eval_num_tokens": 5929411699.0, + "eval_runtime": 753.3298, + "eval_samples_per_second": 12.818, + "eval_steps_per_second": 0.101, + "step": 42000 + }, + { + "entropy": 1.3608778929710388, + "epoch": 1.372792269269694, + "grad_norm": 1.1328125, + "learning_rate": 1.4739436216305063e-06, + "loss": 0.1335, + "mean_token_accuracy": 0.9693701839447022, + "num_tokens": 5936747601.0, + "step": 42050 + }, + { + "entropy": 1.361914451122284, + "epoch": 1.3744246025268518, + "grad_norm": 1.734375, + "learning_rate": 1.4669779133927956e-06, + "loss": 0.1255, + "mean_token_accuracy": 0.9715308749675751, + "num_tokens": 5943992563.0, + "step": 42100 + }, + { + "entropy": 1.3531924724578857, + "epoch": 1.3760569357840096, + "grad_norm": 1.34375, + "learning_rate": 1.460023373061911e-06, + "loss": 0.1291, + "mean_token_accuracy": 0.9694987845420837, + "num_tokens": 5951045516.0, + "step": 42150 + }, + { + "entropy": 1.3581135630607606, + "epoch": 1.3776892690411675, + "grad_norm": 0.005950927734375, + "learning_rate": 1.4530800513009545e-06, + "loss": 0.1265, + "mean_token_accuracy": 0.9705501091480255, + "num_tokens": 5958106338.0, + "step": 42200 + }, + { + "entropy": 1.361065561771393, + "epoch": 1.3793216022983252, + "grad_norm": 2.1875, + "learning_rate": 1.4461479986913075e-06, + "loss": 0.1232, + "mean_token_accuracy": 0.9708436739444732, + "num_tokens": 5964787874.0, + "step": 42250 + }, + { + "entropy": 1.3638162088394166, + "epoch": 1.380953935555483, + "grad_norm": 2.03125, + "learning_rate": 1.43922726573225e-06, + "loss": 0.1297, + "mean_token_accuracy": 0.9698211419582367, + "num_tokens": 5972312102.0, + "step": 42300 + }, + { + "entropy": 1.3620294046401977, + "epoch": 1.382586268812641, + "grad_norm": 1.328125, + "learning_rate": 1.4323179028406086e-06, + "loss": 0.126, + "mean_token_accuracy": 0.9703358936309815, + "num_tokens": 5979432191.0, + "step": 42350 + }, + { + "entropy": 1.3516957235336304, + "epoch": 1.3842186020697986, + "grad_norm": 1.65625, + "learning_rate": 1.4254199603503709e-06, + "loss": 0.1185, + "mean_token_accuracy": 0.9726535677909851, + "num_tokens": 5986862863.0, + "step": 42400 + }, + { + "entropy": 1.3603333234786987, + "epoch": 1.3858509353269564, + "grad_norm": 2.390625, + "learning_rate": 1.4185334885123332e-06, + "loss": 0.1225, + "mean_token_accuracy": 0.9719485092163086, + "num_tokens": 5993969922.0, + "step": 42450 + }, + { + "entropy": 1.3303070521354676, + "epoch": 1.387483268584114, + "grad_norm": 1.3515625, + "learning_rate": 1.4116585374937304e-06, + "loss": 0.1134, + "mean_token_accuracy": 0.9734328532218933, + "num_tokens": 6000796242.0, + "step": 42500 + }, + { + "epoch": 1.387483268584114, + "eval_entropy": 1.3550557088851929, + "eval_loss": 0.14276918768882751, + "eval_mean_token_accuracy": 0.9676290774345397, + "eval_num_tokens": 6000796242.0, + "eval_runtime": 751.2401, + "eval_samples_per_second": 12.853, + "eval_steps_per_second": 0.101, + "step": 42500 + }, + { + "entropy": 1.3588794898986816, + "epoch": 1.3891156018412718, + "grad_norm": 1.1796875, + "learning_rate": 1.4047951573778641e-06, + "loss": 0.1206, + "mean_token_accuracy": 0.9720923590660095, + "num_tokens": 6007327659.0, + "step": 42550 + }, + { + "entropy": 1.3503399062156678, + "epoch": 1.3907479350984298, + "grad_norm": 1.484375, + "learning_rate": 1.3979433981637493e-06, + "loss": 0.1193, + "mean_token_accuracy": 0.9727861452102661, + "num_tokens": 6014078918.0, + "step": 42600 + }, + { + "entropy": 1.3634498286247254, + "epoch": 1.3923802683555875, + "grad_norm": 1.1171875, + "learning_rate": 1.3911033097657374e-06, + "loss": 0.1204, + "mean_token_accuracy": 0.9721335184574127, + "num_tokens": 6020712824.0, + "step": 42650 + }, + { + "entropy": 1.3603689241409302, + "epoch": 1.3940126016127452, + "grad_norm": 1.796875, + "learning_rate": 1.3842749420131663e-06, + "loss": 0.1297, + "mean_token_accuracy": 0.969935257434845, + "num_tokens": 6027950128.0, + "step": 42700 + }, + { + "entropy": 1.3577491450309753, + "epoch": 1.3956449348699032, + "grad_norm": 1.265625, + "learning_rate": 1.3774583446499835e-06, + "loss": 0.1383, + "mean_token_accuracy": 0.9680879211425781, + "num_tokens": 6035576325.0, + "step": 42750 + }, + { + "entropy": 1.349846076965332, + "epoch": 1.3972772681270609, + "grad_norm": 1.484375, + "learning_rate": 1.3706535673343945e-06, + "loss": 0.1289, + "mean_token_accuracy": 0.9708491718769073, + "num_tokens": 6042939083.0, + "step": 42800 + }, + { + "entropy": 1.3469208598136901, + "epoch": 1.3989096013842186, + "grad_norm": 1.828125, + "learning_rate": 1.3638606596384973e-06, + "loss": 0.1186, + "mean_token_accuracy": 0.9719974470138549, + "num_tokens": 6050053050.0, + "step": 42850 + }, + { + "entropy": 1.3655208253860474, + "epoch": 1.4005419346413763, + "grad_norm": 1.8359375, + "learning_rate": 1.3570796710479174e-06, + "loss": 0.13, + "mean_token_accuracy": 0.9699460983276367, + "num_tokens": 6057064095.0, + "step": 42900 + }, + { + "entropy": 1.351545627117157, + "epoch": 1.402174267898534, + "grad_norm": 2.28125, + "learning_rate": 1.3503106509614553e-06, + "loss": 0.1235, + "mean_token_accuracy": 0.9710482954978943, + "num_tokens": 6064091867.0, + "step": 42950 + }, + { + "entropy": 1.3588644528388978, + "epoch": 1.403806601155692, + "grad_norm": 1.8671875, + "learning_rate": 1.3435536486907172e-06, + "loss": 0.1234, + "mean_token_accuracy": 0.9709674298763276, + "num_tokens": 6071438514.0, + "step": 43000 + }, + { + "epoch": 1.403806601155692, + "eval_entropy": 1.3524145444234212, + "eval_loss": 0.14265932142734528, + "eval_mean_token_accuracy": 0.9675256490707398, + "eval_num_tokens": 6071438514.0, + "eval_runtime": 753.0235, + "eval_samples_per_second": 12.823, + "eval_steps_per_second": 0.101, + "step": 43000 + }, + { + "entropy": 1.3578597354888915, + "epoch": 1.4054389344128497, + "grad_norm": 0.01470947265625, + "learning_rate": 1.3368087134597663e-06, + "loss": 0.1238, + "mean_token_accuracy": 0.9716431427001954, + "num_tokens": 6078596404.0, + "step": 43050 + }, + { + "entropy": 1.3505292820930481, + "epoch": 1.4070712676700075, + "grad_norm": 1.3671875, + "learning_rate": 1.3300758944047536e-06, + "loss": 0.1321, + "mean_token_accuracy": 0.9695228207111358, + "num_tokens": 6086011992.0, + "step": 43100 + }, + { + "entropy": 1.3455975699424743, + "epoch": 1.4087036009271654, + "grad_norm": 1.6953125, + "learning_rate": 1.3233552405735694e-06, + "loss": 0.1156, + "mean_token_accuracy": 0.9727073276042938, + "num_tokens": 6092681553.0, + "step": 43150 + }, + { + "entropy": 1.3595034289360046, + "epoch": 1.4103359341843231, + "grad_norm": 1.75, + "learning_rate": 1.3166468009254766e-06, + "loss": 0.1196, + "mean_token_accuracy": 0.9722693479061126, + "num_tokens": 6099527516.0, + "step": 43200 + }, + { + "entropy": 1.3427305126190185, + "epoch": 1.4119682674414809, + "grad_norm": 2.359375, + "learning_rate": 1.309950624330764e-06, + "loss": 0.1194, + "mean_token_accuracy": 0.9728332602977753, + "num_tokens": 6106350241.0, + "step": 43250 + }, + { + "entropy": 1.3501028728485107, + "epoch": 1.4136006006986386, + "grad_norm": 1.40625, + "learning_rate": 1.3032667595703842e-06, + "loss": 0.1259, + "mean_token_accuracy": 0.9705328547954559, + "num_tokens": 6113268163.0, + "step": 43300 + }, + { + "entropy": 1.3498585319519043, + "epoch": 1.4152329339557963, + "grad_norm": 1.328125, + "learning_rate": 1.2965952553355958e-06, + "loss": 0.1277, + "mean_token_accuracy": 0.9707833099365234, + "num_tokens": 6120281186.0, + "step": 43350 + }, + { + "entropy": 1.35530499458313, + "epoch": 1.4168652672129542, + "grad_norm": 2.265625, + "learning_rate": 1.2899361602276175e-06, + "loss": 0.1265, + "mean_token_accuracy": 0.9698385155200958, + "num_tokens": 6127237591.0, + "step": 43400 + }, + { + "entropy": 1.3571084594726563, + "epoch": 1.418497600470112, + "grad_norm": 1.390625, + "learning_rate": 1.2832895227572622e-06, + "loss": 0.1271, + "mean_token_accuracy": 0.9701455044746399, + "num_tokens": 6134551254.0, + "step": 43450 + }, + { + "entropy": 1.3430089569091797, + "epoch": 1.4201299337272697, + "grad_norm": 1.9453125, + "learning_rate": 1.2766553913445993e-06, + "loss": 0.1201, + "mean_token_accuracy": 0.9717100954055786, + "num_tokens": 6141418599.0, + "step": 43500 + }, + { + "epoch": 1.4201299337272697, + "eval_entropy": 1.3463400856653849, + "eval_loss": 0.1425597369670868, + "eval_mean_token_accuracy": 0.9676672736803691, + "eval_num_tokens": 6141418599.0, + "eval_runtime": 753.679, + "eval_samples_per_second": 12.812, + "eval_steps_per_second": 0.101, + "step": 43500 + }, + { + "entropy": 1.3354617381095886, + "epoch": 1.4217622669844276, + "grad_norm": 1.6015625, + "learning_rate": 1.2700338143185843e-06, + "loss": 0.1162, + "mean_token_accuracy": 0.972969708442688, + "num_tokens": 6148322455.0, + "step": 43550 + }, + { + "entropy": 1.3403245830535888, + "epoch": 1.4233946002415854, + "grad_norm": 1.6328125, + "learning_rate": 1.2634248399167203e-06, + "loss": 0.1193, + "mean_token_accuracy": 0.9722915697097778, + "num_tokens": 6155533008.0, + "step": 43600 + }, + { + "entropy": 1.353358724117279, + "epoch": 1.425026933498743, + "grad_norm": 1.1953125, + "learning_rate": 1.2568285162846987e-06, + "loss": 0.1328, + "mean_token_accuracy": 0.9696524286270142, + "num_tokens": 6162774102.0, + "step": 43650 + }, + { + "entropy": 1.3451939988136292, + "epoch": 1.4266592667559008, + "grad_norm": 1.1875, + "learning_rate": 1.2502448914760533e-06, + "loss": 0.1138, + "mean_token_accuracy": 0.9730928063392639, + "num_tokens": 6169457314.0, + "step": 43700 + }, + { + "entropy": 1.3512369799613952, + "epoch": 1.4282916000130585, + "grad_norm": 2.09375, + "learning_rate": 1.2436740134518094e-06, + "loss": 0.1283, + "mean_token_accuracy": 0.9699138104915619, + "num_tokens": 6176504604.0, + "step": 43750 + }, + { + "entropy": 1.3418658518791198, + "epoch": 1.4299239332702165, + "grad_norm": 1.609375, + "learning_rate": 1.2371159300801284e-06, + "loss": 0.1169, + "mean_token_accuracy": 0.9729372990131379, + "num_tokens": 6183277441.0, + "step": 43800 + }, + { + "entropy": 1.3383120560646058, + "epoch": 1.4315562665273742, + "grad_norm": 1.4296875, + "learning_rate": 1.2305706891359698e-06, + "loss": 0.117, + "mean_token_accuracy": 0.9732610857486725, + "num_tokens": 6190472536.0, + "step": 43850 + }, + { + "entropy": 1.3513377356529235, + "epoch": 1.433188599784532, + "grad_norm": 1.328125, + "learning_rate": 1.2240383383007325e-06, + "loss": 0.135, + "mean_token_accuracy": 0.9690117561817169, + "num_tokens": 6197990663.0, + "step": 43900 + }, + { + "entropy": 1.3517193269729615, + "epoch": 1.43482093304169, + "grad_norm": 1.59375, + "learning_rate": 1.2175189251619168e-06, + "loss": 0.1177, + "mean_token_accuracy": 0.9728011786937714, + "num_tokens": 6204993365.0, + "step": 43950 + }, + { + "entropy": 1.3436356329917907, + "epoch": 1.4364532662988476, + "grad_norm": 1.03125, + "learning_rate": 1.2110124972127686e-06, + "loss": 0.1262, + "mean_token_accuracy": 0.9713046276569366, + "num_tokens": 6212573620.0, + "step": 44000 + }, + { + "epoch": 1.4364532662988476, + "eval_entropy": 1.3463455152511596, + "eval_loss": 0.14246320724487305, + "eval_mean_token_accuracy": 0.9677392840385437, + "eval_num_tokens": 6212573620.0, + "eval_runtime": 754.1482, + "eval_samples_per_second": 12.804, + "eval_steps_per_second": 0.101, + "step": 44000 + }, + { + "entropy": 1.3434760403633117, + "epoch": 1.4380855995560053, + "grad_norm": 2.40625, + "learning_rate": 1.2045191018519415e-06, + "loss": 0.1169, + "mean_token_accuracy": 0.9728634548187256, + "num_tokens": 6219647458.0, + "step": 44050 + }, + { + "entropy": 1.3590314435958861, + "epoch": 1.439717932813163, + "grad_norm": 1.0625, + "learning_rate": 1.1980387863831478e-06, + "loss": 0.126, + "mean_token_accuracy": 0.9703594601154327, + "num_tokens": 6226844151.0, + "step": 44100 + }, + { + "entropy": 1.3532491779327394, + "epoch": 1.4413502660703208, + "grad_norm": 1.4375, + "learning_rate": 1.1915715980148117e-06, + "loss": 0.1309, + "mean_token_accuracy": 0.9699800384044647, + "num_tokens": 6234263907.0, + "step": 44150 + }, + { + "entropy": 1.349997682571411, + "epoch": 1.4429825993274787, + "grad_norm": 1.7109375, + "learning_rate": 1.1851175838597306e-06, + "loss": 0.121, + "mean_token_accuracy": 0.9720881938934326, + "num_tokens": 6241048484.0, + "step": 44200 + }, + { + "entropy": 1.354639277458191, + "epoch": 1.4446149325846365, + "grad_norm": 1.6484375, + "learning_rate": 1.1786767909347268e-06, + "loss": 0.1375, + "mean_token_accuracy": 0.9680390894412995, + "num_tokens": 6248857975.0, + "step": 44250 + }, + { + "entropy": 1.3580044388771058, + "epoch": 1.4462472658417942, + "grad_norm": 2.15625, + "learning_rate": 1.1722492661603098e-06, + "loss": 0.126, + "mean_token_accuracy": 0.9711121428012848, + "num_tokens": 6255829230.0, + "step": 44300 + }, + { + "entropy": 1.3534194731712341, + "epoch": 1.4478795990989521, + "grad_norm": 1.9296875, + "learning_rate": 1.165835056360329e-06, + "loss": 0.121, + "mean_token_accuracy": 0.9716777575016021, + "num_tokens": 6262589815.0, + "step": 44350 + }, + { + "entropy": 1.3448237705230712, + "epoch": 1.4495119323561099, + "grad_norm": 1.625, + "learning_rate": 1.1594342082616386e-06, + "loss": 0.1269, + "mean_token_accuracy": 0.9700544607639313, + "num_tokens": 6269556274.0, + "step": 44400 + }, + { + "entropy": 1.3512716341018676, + "epoch": 1.4511442656132676, + "grad_norm": 1.671875, + "learning_rate": 1.1530467684937514e-06, + "loss": 0.1226, + "mean_token_accuracy": 0.9714391076564789, + "num_tokens": 6276345824.0, + "step": 44450 + }, + { + "entropy": 1.3386855101585389, + "epoch": 1.4527765988704253, + "grad_norm": 2.5, + "learning_rate": 1.146672783588504e-06, + "loss": 0.1132, + "mean_token_accuracy": 0.973590886592865, + "num_tokens": 6282741770.0, + "step": 44500 + }, + { + "epoch": 1.4527765988704253, + "eval_entropy": 1.3487922525405884, + "eval_loss": 0.14237141609191895, + "eval_mean_token_accuracy": 0.967642084757487, + "eval_num_tokens": 6282741770.0, + "eval_runtime": 750.7822, + "eval_samples_per_second": 12.861, + "eval_steps_per_second": 0.101, + "step": 44500 + }, + { + "entropy": 1.3471359300613404, + "epoch": 1.454408932127583, + "grad_norm": 0.82421875, + "learning_rate": 1.1403122999797162e-06, + "loss": 0.122, + "mean_token_accuracy": 0.9717613708972931, + "num_tokens": 6289684604.0, + "step": 44550 + }, + { + "entropy": 1.357615110874176, + "epoch": 1.456041265384741, + "grad_norm": 2.140625, + "learning_rate": 1.133965364002848e-06, + "loss": 0.1318, + "mean_token_accuracy": 0.9699806833267212, + "num_tokens": 6296718052.0, + "step": 44600 + }, + { + "entropy": 1.3517170500755311, + "epoch": 1.4576735986418987, + "grad_norm": 0.004852294921875, + "learning_rate": 1.1276320218946737e-06, + "loss": 0.1167, + "mean_token_accuracy": 0.9736652266979218, + "num_tokens": 6303023793.0, + "step": 44650 + }, + { + "entropy": 1.3588165473937988, + "epoch": 1.4593059318990564, + "grad_norm": 2.375, + "learning_rate": 1.1213123197929296e-06, + "loss": 0.1289, + "mean_token_accuracy": 0.9699364423751831, + "num_tokens": 6310502615.0, + "step": 44700 + }, + { + "entropy": 1.3603587317466737, + "epoch": 1.4609382651562144, + "grad_norm": 1.4375, + "learning_rate": 1.1150063037359927e-06, + "loss": 0.131, + "mean_token_accuracy": 0.9688486230373382, + "num_tokens": 6317956478.0, + "step": 44750 + }, + { + "entropy": 1.3491565418243407, + "epoch": 1.4625705984133721, + "grad_norm": 2.171875, + "learning_rate": 1.108714019662533e-06, + "loss": 0.1178, + "mean_token_accuracy": 0.9722418069839478, + "num_tokens": 6324906866.0, + "step": 44800 + }, + { + "entropy": 1.347842710018158, + "epoch": 1.4642029316705298, + "grad_norm": 2.0, + "learning_rate": 1.1024355134111894e-06, + "loss": 0.1182, + "mean_token_accuracy": 0.9722170174121857, + "num_tokens": 6331464077.0, + "step": 44850 + }, + { + "entropy": 1.3519015336036682, + "epoch": 1.4658352649276876, + "grad_norm": 1.296875, + "learning_rate": 1.096170830720226e-06, + "loss": 0.1297, + "mean_token_accuracy": 0.9708062732219696, + "num_tokens": 6338494070.0, + "step": 44900 + }, + { + "entropy": 1.3482877349853515, + "epoch": 1.4674675981848453, + "grad_norm": 1.9140625, + "learning_rate": 1.0899200172272073e-06, + "loss": 0.1274, + "mean_token_accuracy": 0.9707257854938507, + "num_tokens": 6345499457.0, + "step": 44950 + }, + { + "entropy": 1.3303207564353943, + "epoch": 1.4690999314420032, + "grad_norm": 1.1875, + "learning_rate": 1.0836831184686621e-06, + "loss": 0.1154, + "mean_token_accuracy": 0.972674525976181, + "num_tokens": 6352217211.0, + "step": 45000 + }, + { + "epoch": 1.4690999314420032, + "eval_entropy": 1.3471796067555746, + "eval_loss": 0.14247018098831177, + "eval_mean_token_accuracy": 0.9676760347684225, + "eval_num_tokens": 6352217211.0, + "eval_runtime": 746.9306, + "eval_samples_per_second": 12.928, + "eval_steps_per_second": 0.102, + "step": 45000 + }, + { + "entropy": 1.3490448307991028, + "epoch": 1.470732264699161, + "grad_norm": 1.8125, + "learning_rate": 1.0774601798797487e-06, + "loss": 0.1202, + "mean_token_accuracy": 0.971461181640625, + "num_tokens": 6359326989.0, + "step": 45050 + }, + { + "entropy": 1.3362992668151856, + "epoch": 1.4723645979563187, + "grad_norm": 1.4453125, + "learning_rate": 1.071251246793931e-06, + "loss": 0.1305, + "mean_token_accuracy": 0.9699250388145447, + "num_tokens": 6366511833.0, + "step": 45100 + }, + { + "entropy": 1.3491197919845581, + "epoch": 1.4739969312134766, + "grad_norm": 2.046875, + "learning_rate": 1.0650563644426402e-06, + "loss": 0.1287, + "mean_token_accuracy": 0.9701677405834198, + "num_tokens": 6373790440.0, + "step": 45150 + }, + { + "entropy": 1.3424671697616577, + "epoch": 1.4756292644706344, + "grad_norm": 1.9375, + "learning_rate": 1.0588755779549534e-06, + "loss": 0.1305, + "mean_token_accuracy": 0.9701132154464722, + "num_tokens": 6381354563.0, + "step": 45200 + }, + { + "entropy": 1.3442249703407287, + "epoch": 1.477261597727792, + "grad_norm": 1.40625, + "learning_rate": 1.0527089323572568e-06, + "loss": 0.1235, + "mean_token_accuracy": 0.9716306221485138, + "num_tokens": 6388718219.0, + "step": 45250 + }, + { + "entropy": 1.3491050267219544, + "epoch": 1.4788939309849498, + "grad_norm": 1.8515625, + "learning_rate": 1.0465564725729245e-06, + "loss": 0.1337, + "mean_token_accuracy": 0.9686619007587433, + "num_tokens": 6396204282.0, + "step": 45300 + }, + { + "entropy": 1.358343975543976, + "epoch": 1.4805262642421075, + "grad_norm": 1.328125, + "learning_rate": 1.040418243421989e-06, + "loss": 0.1324, + "mean_token_accuracy": 0.9696023035049438, + "num_tokens": 6403488343.0, + "step": 45350 + }, + { + "entropy": 1.3462382221221925, + "epoch": 1.4821585974992655, + "grad_norm": 1.2109375, + "learning_rate": 1.0342942896208105e-06, + "loss": 0.1263, + "mean_token_accuracy": 0.9713137638568878, + "num_tokens": 6410641119.0, + "step": 45400 + }, + { + "entropy": 1.3580231857299805, + "epoch": 1.4837909307564232, + "grad_norm": 1.96875, + "learning_rate": 1.028184655781759e-06, + "loss": 0.1288, + "mean_token_accuracy": 0.9706091582775116, + "num_tokens": 6418060885.0, + "step": 45450 + }, + { + "entropy": 1.3437149500846863, + "epoch": 1.485423264013581, + "grad_norm": 1.5703125, + "learning_rate": 1.0220893864128809e-06, + "loss": 0.1204, + "mean_token_accuracy": 0.9711262369155884, + "num_tokens": 6424982813.0, + "step": 45500 + }, + { + "epoch": 1.485423264013581, + "eval_entropy": 1.3490506156285604, + "eval_loss": 0.14239265024662018, + "eval_mean_token_accuracy": 0.9676458183924357, + "eval_num_tokens": 6424982813.0, + "eval_runtime": 753.7774, + "eval_samples_per_second": 12.81, + "eval_steps_per_second": 0.101, + "step": 45500 + }, + { + "entropy": 1.3549335837364196, + "epoch": 1.4870555972707389, + "grad_norm": 2.078125, + "learning_rate": 1.0160085259175834e-06, + "loss": 0.12, + "mean_token_accuracy": 0.9724775660037994, + "num_tokens": 6431726519.0, + "step": 45550 + }, + { + "entropy": 1.3539844870567321, + "epoch": 1.4886879305278966, + "grad_norm": 1.5703125, + "learning_rate": 1.0099421185943016e-06, + "loss": 0.1171, + "mean_token_accuracy": 0.9727089703083038, + "num_tokens": 6439000590.0, + "step": 45600 + }, + { + "entropy": 1.3505275821685792, + "epoch": 1.4903202637850543, + "grad_norm": 1.546875, + "learning_rate": 1.0038902086361862e-06, + "loss": 0.1234, + "mean_token_accuracy": 0.9712493371963501, + "num_tokens": 6446110346.0, + "step": 45650 + }, + { + "entropy": 1.3557030415534974, + "epoch": 1.491952597042212, + "grad_norm": 1.546875, + "learning_rate": 9.97852840130771e-07, + "loss": 0.1243, + "mean_token_accuracy": 0.9712071406841278, + "num_tokens": 6453417508.0, + "step": 45700 + }, + { + "entropy": 1.3410662007331848, + "epoch": 1.4935849302993698, + "grad_norm": 1.3671875, + "learning_rate": 9.918300570596596e-07, + "loss": 0.1175, + "mean_token_accuracy": 0.9729030966758728, + "num_tokens": 6460130049.0, + "step": 45750 + }, + { + "entropy": 1.3425062656402589, + "epoch": 1.4952172635565277, + "grad_norm": 1.6171875, + "learning_rate": 9.858219032982019e-07, + "loss": 0.1225, + "mean_token_accuracy": 0.9715787386894226, + "num_tokens": 6467242766.0, + "step": 45800 + }, + { + "entropy": 1.3404623532295228, + "epoch": 1.4968495968136855, + "grad_norm": 0.59375, + "learning_rate": 9.798284226151751e-07, + "loss": 0.1299, + "mean_token_accuracy": 0.9696118628978729, + "num_tokens": 6474698613.0, + "step": 45850 + }, + { + "entropy": 1.3416164851188659, + "epoch": 1.4984819300708432, + "grad_norm": 1.21875, + "learning_rate": 9.738496586724644e-07, + "loss": 0.1247, + "mean_token_accuracy": 0.9708961296081543, + "num_tokens": 6481733346.0, + "step": 45900 + }, + { + "entropy": 1.3436065912246704, + "epoch": 1.5001142633280011, + "grad_norm": 1.8203125, + "learning_rate": 9.678856550247433e-07, + "loss": 0.1197, + "mean_token_accuracy": 0.9722868132591248, + "num_tokens": 6488600216.0, + "step": 45950 + }, + { + "entropy": 1.3471774291992187, + "epoch": 1.5017465965851589, + "grad_norm": 2.28125, + "learning_rate": 9.619364551191615e-07, + "loss": 0.1234, + "mean_token_accuracy": 0.9715406239032746, + "num_tokens": 6495842641.0, + "step": 46000 + }, + { + "epoch": 1.5017465965851589, + "eval_entropy": 1.3393350235621135, + "eval_loss": 0.14240698516368866, + "eval_mean_token_accuracy": 0.967587119738261, + "eval_num_tokens": 6495842641.0, + "eval_runtime": 753.9166, + "eval_samples_per_second": 12.808, + "eval_steps_per_second": 0.101, + "step": 46000 + }, + { + "entropy": 1.3315513157844543, + "epoch": 1.5033789298423166, + "grad_norm": 1.96875, + "learning_rate": 9.560021022950201e-07, + "loss": 0.1152, + "mean_token_accuracy": 0.9735651075839996, + "num_tokens": 6502730531.0, + "step": 46050 + }, + { + "entropy": 1.3440596199035644, + "epoch": 1.5050112630994743, + "grad_norm": 1.6484375, + "learning_rate": 9.500826397834667e-07, + "loss": 0.1363, + "mean_token_accuracy": 0.9686136949062347, + "num_tokens": 6510207647.0, + "step": 46100 + }, + { + "entropy": 1.3446049523353576, + "epoch": 1.506643596356632, + "grad_norm": 1.2109375, + "learning_rate": 9.44178110707169e-07, + "loss": 0.1165, + "mean_token_accuracy": 0.972632863521576, + "num_tokens": 6517212379.0, + "step": 46150 + }, + { + "entropy": 1.3464979553222656, + "epoch": 1.50827592961379, + "grad_norm": 1.125, + "learning_rate": 9.382885580800094e-07, + "loss": 0.1341, + "mean_token_accuracy": 0.969519715309143, + "num_tokens": 6524843034.0, + "step": 46200 + }, + { + "entropy": 1.337069320678711, + "epoch": 1.5099082628709477, + "grad_norm": 1.3046875, + "learning_rate": 9.324140248067691e-07, + "loss": 0.123, + "mean_token_accuracy": 0.9713637149333954, + "num_tokens": 6531941894.0, + "step": 46250 + }, + { + "entropy": 1.3362834978103637, + "epoch": 1.5115405961281057, + "grad_norm": 1.59375, + "learning_rate": 9.265545536828111e-07, + "loss": 0.1154, + "mean_token_accuracy": 0.9725685477256775, + "num_tokens": 6538791279.0, + "step": 46300 + }, + { + "entropy": 1.338006112575531, + "epoch": 1.5131729293852634, + "grad_norm": 1.5, + "learning_rate": 9.207101873937768e-07, + "loss": 0.1259, + "mean_token_accuracy": 0.971262993812561, + "num_tokens": 6545902334.0, + "step": 46350 + }, + { + "entropy": 1.3359116435050964, + "epoch": 1.514805262642421, + "grad_norm": 3.1875, + "learning_rate": 9.14880968515266e-07, + "loss": 0.1162, + "mean_token_accuracy": 0.9728143513202667, + "num_tokens": 6553190103.0, + "step": 46400 + }, + { + "entropy": 1.3362672972679137, + "epoch": 1.5164375958995788, + "grad_norm": 1.7734375, + "learning_rate": 9.090669395125351e-07, + "loss": 0.1155, + "mean_token_accuracy": 0.9727682447433472, + "num_tokens": 6559814465.0, + "step": 46450 + }, + { + "entropy": 1.3332003378868102, + "epoch": 1.5180699291567366, + "grad_norm": 1.546875, + "learning_rate": 9.032681427401806e-07, + "loss": 0.1094, + "mean_token_accuracy": 0.9738853967189789, + "num_tokens": 6566382565.0, + "step": 46500 + }, + { + "epoch": 1.5180699291567366, + "eval_entropy": 1.3380318832397462, + "eval_loss": 0.14237698912620544, + "eval_mean_token_accuracy": 0.9675415523846944, + "eval_num_tokens": 6566382565.0, + "eval_runtime": 751.4412, + "eval_samples_per_second": 12.85, + "eval_steps_per_second": 0.101, + "step": 46500 + }, + { + "entropy": 1.3400082111358642, + "epoch": 1.5197022624138943, + "grad_norm": 2.140625, + "learning_rate": 8.974846204418361e-07, + "loss": 0.1245, + "mean_token_accuracy": 0.9713966703414917, + "num_tokens": 6573501735.0, + "step": 46550 + }, + { + "entropy": 1.3266588830947876, + "epoch": 1.5213345956710522, + "grad_norm": 1.2109375, + "learning_rate": 8.917164147498621e-07, + "loss": 0.1156, + "mean_token_accuracy": 0.9723888230323792, + "num_tokens": 6580331916.0, + "step": 46600 + }, + { + "entropy": 1.340979859828949, + "epoch": 1.52296692892821, + "grad_norm": 1.921875, + "learning_rate": 8.859635676850372e-07, + "loss": 0.1174, + "mean_token_accuracy": 0.9720415270328522, + "num_tokens": 6586882066.0, + "step": 46650 + }, + { + "entropy": 1.338642556667328, + "epoch": 1.524599262185368, + "grad_norm": 2.265625, + "learning_rate": 8.802261211562563e-07, + "loss": 0.1206, + "mean_token_accuracy": 0.9713714873790741, + "num_tokens": 6593693750.0, + "step": 46700 + }, + { + "entropy": 1.3478483366966247, + "epoch": 1.5262315954425256, + "grad_norm": 0.55078125, + "learning_rate": 8.745041169602207e-07, + "loss": 0.1278, + "mean_token_accuracy": 0.9710940301418305, + "num_tokens": 6601030060.0, + "step": 46750 + }, + { + "entropy": 1.3367830848693847, + "epoch": 1.5278639286996833, + "grad_norm": 2.28125, + "learning_rate": 8.687975967811393e-07, + "loss": 0.1235, + "mean_token_accuracy": 0.9711613404750824, + "num_tokens": 6607766556.0, + "step": 46800 + }, + { + "entropy": 1.3393477821350097, + "epoch": 1.529496261956841, + "grad_norm": 1.15625, + "learning_rate": 8.631066021904173e-07, + "loss": 0.1281, + "mean_token_accuracy": 0.9700063776969909, + "num_tokens": 6615264797.0, + "step": 46850 + }, + { + "entropy": 1.3377933168411256, + "epoch": 1.5311285952139988, + "grad_norm": 1.171875, + "learning_rate": 8.574311746463602e-07, + "loss": 0.1219, + "mean_token_accuracy": 0.9712197721004486, + "num_tokens": 6622625782.0, + "step": 46900 + }, + { + "entropy": 1.3495184230804442, + "epoch": 1.5327609284711565, + "grad_norm": 1.6640625, + "learning_rate": 8.517713554938698e-07, + "loss": 0.1291, + "mean_token_accuracy": 0.9702865195274353, + "num_tokens": 6629917201.0, + "step": 46950 + }, + { + "entropy": 1.337637755870819, + "epoch": 1.5343932617283145, + "grad_norm": 1.2265625, + "learning_rate": 8.461271859641413e-07, + "loss": 0.124, + "mean_token_accuracy": 0.9718515348434448, + "num_tokens": 6636921749.0, + "step": 47000 + }, + { + "epoch": 1.5343932617283145, + "eval_entropy": 1.3388334194819131, + "eval_loss": 0.14232522249221802, + "eval_mean_token_accuracy": 0.9674970960617065, + "eval_num_tokens": 6636921749.0, + "eval_runtime": 752.3993, + "eval_samples_per_second": 12.834, + "eval_steps_per_second": 0.101, + "step": 47000 + }, + { + "entropy": 1.3416547775268555, + "epoch": 1.5360255949854722, + "grad_norm": 1.90625, + "learning_rate": 8.404987071743628e-07, + "loss": 0.1152, + "mean_token_accuracy": 0.9724602663516998, + "num_tokens": 6643500789.0, + "step": 47050 + }, + { + "entropy": 1.3312445497512817, + "epoch": 1.5376579282426301, + "grad_norm": 1.765625, + "learning_rate": 8.348859601274191e-07, + "loss": 0.1141, + "mean_token_accuracy": 0.9736961400508881, + "num_tokens": 6650321316.0, + "step": 47100 + }, + { + "entropy": 1.3447378778457642, + "epoch": 1.5392902614997879, + "grad_norm": 1.265625, + "learning_rate": 8.292889857115906e-07, + "loss": 0.1251, + "mean_token_accuracy": 0.9713517308235169, + "num_tokens": 6657596511.0, + "step": 47150 + }, + { + "entropy": 1.3332865810394288, + "epoch": 1.5409225947569456, + "grad_norm": 1.28125, + "learning_rate": 8.237078247002536e-07, + "loss": 0.1144, + "mean_token_accuracy": 0.9731272792816162, + "num_tokens": 6664455846.0, + "step": 47200 + }, + { + "entropy": 1.3360036635398864, + "epoch": 1.5425549280141033, + "grad_norm": 1.578125, + "learning_rate": 8.181425177515887e-07, + "loss": 0.1181, + "mean_token_accuracy": 0.9728715085983276, + "num_tokens": 6671444402.0, + "step": 47250 + }, + { + "entropy": 1.3451998877525329, + "epoch": 1.544187261271261, + "grad_norm": 2.203125, + "learning_rate": 8.125931054082775e-07, + "loss": 0.1189, + "mean_token_accuracy": 0.9724408376216889, + "num_tokens": 6678449907.0, + "step": 47300 + }, + { + "entropy": 1.346355800628662, + "epoch": 1.5458195945284188, + "grad_norm": 1.2109375, + "learning_rate": 8.070596280972152e-07, + "loss": 0.1311, + "mean_token_accuracy": 0.9690172004699708, + "num_tokens": 6685797698.0, + "step": 47350 + }, + { + "entropy": 1.341564166545868, + "epoch": 1.5474519277855767, + "grad_norm": 1.890625, + "learning_rate": 8.01542126129208e-07, + "loss": 0.1231, + "mean_token_accuracy": 0.9712344872951507, + "num_tokens": 6692582052.0, + "step": 47400 + }, + { + "entropy": 1.3289701747894287, + "epoch": 1.5490842610427344, + "grad_norm": 2.0625, + "learning_rate": 7.960406396986855e-07, + "loss": 0.1137, + "mean_token_accuracy": 0.9735037076473236, + "num_tokens": 6699347585.0, + "step": 47450 + }, + { + "entropy": 1.3448949909210206, + "epoch": 1.5507165942998924, + "grad_norm": 1.875, + "learning_rate": 7.905552088834074e-07, + "loss": 0.1267, + "mean_token_accuracy": 0.9710251951217651, + "num_tokens": 6706378386.0, + "step": 47500 + }, + { + "epoch": 1.5507165942998924, + "eval_entropy": 1.338226900100708, + "eval_loss": 0.14231520891189575, + "eval_mean_token_accuracy": 0.9675397229194641, + "eval_num_tokens": 6706378386.0, + "eval_runtime": 753.0219, + "eval_samples_per_second": 12.823, + "eval_steps_per_second": 0.101, + "step": 47500 + }, + { + "entropy": 1.340337586402893, + "epoch": 1.5523489275570501, + "grad_norm": 1.4375, + "learning_rate": 7.850858736441654e-07, + "loss": 0.1177, + "mean_token_accuracy": 0.9723483467102051, + "num_tokens": 6713336578.0, + "step": 47550 + }, + { + "entropy": 1.3498352313041686, + "epoch": 1.5539812608142078, + "grad_norm": 1.7734375, + "learning_rate": 7.796326738245014e-07, + "loss": 0.1213, + "mean_token_accuracy": 0.9714156925678253, + "num_tokens": 6720699596.0, + "step": 47600 + }, + { + "entropy": 1.3324111270904542, + "epoch": 1.5556135940713656, + "grad_norm": 1.8046875, + "learning_rate": 7.741956491504081e-07, + "loss": 0.1174, + "mean_token_accuracy": 0.9724169254302979, + "num_tokens": 6727286916.0, + "step": 47650 + }, + { + "entropy": 1.3425724387168885, + "epoch": 1.5572459273285233, + "grad_norm": 1.5703125, + "learning_rate": 7.687748392300481e-07, + "loss": 0.1211, + "mean_token_accuracy": 0.9714575302600861, + "num_tokens": 6734228986.0, + "step": 47700 + }, + { + "entropy": 1.3485777735710145, + "epoch": 1.558878260585681, + "grad_norm": 1.734375, + "learning_rate": 7.633702835534574e-07, + "loss": 0.1245, + "mean_token_accuracy": 0.9718271470069886, + "num_tokens": 6741417194.0, + "step": 47750 + }, + { + "entropy": 1.3329823040962219, + "epoch": 1.560510593842839, + "grad_norm": 1.453125, + "learning_rate": 7.579820214922639e-07, + "loss": 0.1068, + "mean_token_accuracy": 0.9753419077396392, + "num_tokens": 6747898229.0, + "step": 47800 + }, + { + "entropy": 1.3319637727737428, + "epoch": 1.5621429270999967, + "grad_norm": 1.203125, + "learning_rate": 7.526100922993989e-07, + "loss": 0.1122, + "mean_token_accuracy": 0.9736012244224548, + "num_tokens": 6754984506.0, + "step": 47850 + }, + { + "entropy": 1.3472395992279054, + "epoch": 1.5637752603571546, + "grad_norm": 1.71875, + "learning_rate": 7.472545351088072e-07, + "loss": 0.1171, + "mean_token_accuracy": 0.9723641383647919, + "num_tokens": 6761747061.0, + "step": 47900 + }, + { + "entropy": 1.322529821395874, + "epoch": 1.5654075936143124, + "grad_norm": 1.703125, + "learning_rate": 7.419153889351687e-07, + "loss": 0.1112, + "mean_token_accuracy": 0.9733606302738189, + "num_tokens": 6768792123.0, + "step": 47950 + }, + { + "entropy": 1.338774642944336, + "epoch": 1.56703992687147, + "grad_norm": 1.203125, + "learning_rate": 7.365926926736079e-07, + "loss": 0.1298, + "mean_token_accuracy": 0.969396116733551, + "num_tokens": 6776660559.0, + "step": 48000 + }, + { + "epoch": 1.56703992687147, + "eval_entropy": 1.3360849984486898, + "eval_loss": 0.14228671789169312, + "eval_mean_token_accuracy": 0.9675799965858459, + "eval_num_tokens": 6776660559.0, + "eval_runtime": 744.7808, + "eval_samples_per_second": 12.965, + "eval_steps_per_second": 0.102, + "step": 48000 + }, + { + "entropy": 1.3329459977149964, + "epoch": 1.5686722601286278, + "grad_norm": 1.921875, + "learning_rate": 7.312864850994151e-07, + "loss": 0.1107, + "mean_token_accuracy": 0.9741031527519226, + "num_tokens": 6783181818.0, + "step": 48050 + }, + { + "entropy": 1.3349730682373047, + "epoch": 1.5703045933857855, + "grad_norm": 1.421875, + "learning_rate": 7.259968048677626e-07, + "loss": 0.1136, + "mean_token_accuracy": 0.9728020560741425, + "num_tokens": 6790226377.0, + "step": 48100 + }, + { + "entropy": 1.3363687252998353, + "epoch": 1.5719369266429433, + "grad_norm": 1.1484375, + "learning_rate": 7.207236905134222e-07, + "loss": 0.1159, + "mean_token_accuracy": 0.9732009255886078, + "num_tokens": 6797331847.0, + "step": 48150 + }, + { + "entropy": 1.3224567222595214, + "epoch": 1.5735692599001012, + "grad_norm": 2.375, + "learning_rate": 7.154671804504838e-07, + "loss": 0.1187, + "mean_token_accuracy": 0.9726608419418334, + "num_tokens": 6804197080.0, + "step": 48200 + }, + { + "entropy": 1.3282025313377381, + "epoch": 1.575201593157259, + "grad_norm": 1.4140625, + "learning_rate": 7.102273129720785e-07, + "loss": 0.1171, + "mean_token_accuracy": 0.9726303327083587, + "num_tokens": 6811554275.0, + "step": 48250 + }, + { + "entropy": 1.3339614725112916, + "epoch": 1.5768339264144169, + "grad_norm": 1.4765625, + "learning_rate": 7.050041262500963e-07, + "loss": 0.1211, + "mean_token_accuracy": 0.9716296088695526, + "num_tokens": 6818575585.0, + "step": 48300 + }, + { + "entropy": 1.3479553842544556, + "epoch": 1.5784662596715746, + "grad_norm": 1.25, + "learning_rate": 6.99797658334911e-07, + "loss": 0.1297, + "mean_token_accuracy": 0.9702737581729889, + "num_tokens": 6825774443.0, + "step": 48350 + }, + { + "entropy": 1.3463473081588746, + "epoch": 1.5800985929287323, + "grad_norm": 1.4296875, + "learning_rate": 6.946079471551018e-07, + "loss": 0.1289, + "mean_token_accuracy": 0.9703529167175293, + "num_tokens": 6833071654.0, + "step": 48400 + }, + { + "entropy": 1.3393766927719115, + "epoch": 1.58173092618589, + "grad_norm": 2.0, + "learning_rate": 6.894350305171747e-07, + "loss": 0.1196, + "mean_token_accuracy": 0.9719527661800385, + "num_tokens": 6840009616.0, + "step": 48450 + }, + { + "entropy": 1.333970193862915, + "epoch": 1.5833632594430478, + "grad_norm": 1.859375, + "learning_rate": 6.842789461052923e-07, + "loss": 0.1157, + "mean_token_accuracy": 0.9726070737838746, + "num_tokens": 6847179809.0, + "step": 48500 + }, + { + "epoch": 1.5833632594430478, + "eval_entropy": 1.3332714064915976, + "eval_loss": 0.14229924976825714, + "eval_mean_token_accuracy": 0.9675857615470886, + "eval_num_tokens": 6847179809.0, + "eval_runtime": 747.985, + "eval_samples_per_second": 12.909, + "eval_steps_per_second": 0.102, + "step": 48500 + }, + { + "entropy": 1.3204735660552978, + "epoch": 1.5849955927002055, + "grad_norm": 2.34375, + "learning_rate": 6.791397314809928e-07, + "loss": 0.107, + "mean_token_accuracy": 0.9743911874294281, + "num_tokens": 6853704094.0, + "step": 48550 + }, + { + "entropy": 1.3412930655479431, + "epoch": 1.5866279259573635, + "grad_norm": 2.0, + "learning_rate": 6.740174240829229e-07, + "loss": 0.1119, + "mean_token_accuracy": 0.973388170003891, + "num_tokens": 6860416510.0, + "step": 48600 + }, + { + "entropy": 1.3327165865898132, + "epoch": 1.5882602592145212, + "grad_norm": 1.7421875, + "learning_rate": 6.689120612265592e-07, + "loss": 0.1166, + "mean_token_accuracy": 0.9722766876220703, + "num_tokens": 6867304006.0, + "step": 48650 + }, + { + "entropy": 1.3274698781967162, + "epoch": 1.5898925924716791, + "grad_norm": 1.3984375, + "learning_rate": 6.638236801039406e-07, + "loss": 0.1179, + "mean_token_accuracy": 0.9724640393257141, + "num_tokens": 6874107973.0, + "step": 48700 + }, + { + "entropy": 1.3326347541809083, + "epoch": 1.5915249257288369, + "grad_norm": 1.671875, + "learning_rate": 6.587523177833969e-07, + "loss": 0.119, + "mean_token_accuracy": 0.9718975865840912, + "num_tokens": 6881463304.0, + "step": 48750 + }, + { + "entropy": 1.3314825320243835, + "epoch": 1.5931572589859946, + "grad_norm": 1.3828125, + "learning_rate": 6.536980112092748e-07, + "loss": 0.1296, + "mean_token_accuracy": 0.970567889213562, + "num_tokens": 6888936324.0, + "step": 48800 + }, + { + "entropy": 1.335792977809906, + "epoch": 1.5947895922431523, + "grad_norm": 1.3828125, + "learning_rate": 6.486607972016746e-07, + "loss": 0.1091, + "mean_token_accuracy": 0.9739424622058869, + "num_tokens": 6895517659.0, + "step": 48850 + }, + { + "entropy": 1.3427368450164794, + "epoch": 1.59642192550031, + "grad_norm": 1.765625, + "learning_rate": 6.436407124561761e-07, + "loss": 0.12, + "mean_token_accuracy": 0.9717347931861877, + "num_tokens": 6902937758.0, + "step": 48900 + }, + { + "entropy": 1.3364178919792176, + "epoch": 1.5980542587574678, + "grad_norm": 1.5, + "learning_rate": 6.386377935435774e-07, + "loss": 0.1177, + "mean_token_accuracy": 0.9717580342292785, + "num_tokens": 6909650064.0, + "step": 48950 + }, + { + "entropy": 1.3345691514015199, + "epoch": 1.5996865920146257, + "grad_norm": 1.4296875, + "learning_rate": 6.336520769096215e-07, + "loss": 0.1242, + "mean_token_accuracy": 0.9705976390838623, + "num_tokens": 6916954844.0, + "step": 49000 + }, + { + "epoch": 1.5996865920146257, + "eval_entropy": 1.3313024059931438, + "eval_loss": 0.14232422411441803, + "eval_mean_token_accuracy": 0.9676321744918823, + "eval_num_tokens": 6916954844.0, + "eval_runtime": 750.8283, + "eval_samples_per_second": 12.86, + "eval_steps_per_second": 0.101, + "step": 49000 + }, + { + "entropy": 1.3268843412399292, + "epoch": 1.6013189252717834, + "grad_norm": 1.9921875, + "learning_rate": 6.286835988747385e-07, + "loss": 0.12, + "mean_token_accuracy": 0.9719444465637207, + "num_tokens": 6924452826.0, + "step": 49050 + }, + { + "entropy": 1.3368324255943298, + "epoch": 1.6029512585289414, + "grad_norm": 1.9140625, + "learning_rate": 6.237323956337755e-07, + "loss": 0.1192, + "mean_token_accuracy": 0.9717724549770356, + "num_tokens": 6931690729.0, + "step": 49100 + }, + { + "entropy": 1.3325298118591309, + "epoch": 1.604583591786099, + "grad_norm": 1.4375, + "learning_rate": 6.18798503255733e-07, + "loss": 0.1192, + "mean_token_accuracy": 0.9713486981391907, + "num_tokens": 6938882010.0, + "step": 49150 + }, + { + "entropy": 1.339358766078949, + "epoch": 1.6062159250432568, + "grad_norm": 1.2578125, + "learning_rate": 6.138819576835056e-07, + "loss": 0.1148, + "mean_token_accuracy": 0.9726817321777343, + "num_tokens": 6945842260.0, + "step": 49200 + }, + { + "entropy": 1.343044672012329, + "epoch": 1.6078482583004146, + "grad_norm": 1.375, + "learning_rate": 6.089827947336176e-07, + "loss": 0.1264, + "mean_token_accuracy": 0.9707775366306305, + "num_tokens": 6953111267.0, + "step": 49250 + }, + { + "entropy": 1.3318463802337646, + "epoch": 1.6094805915575723, + "grad_norm": 1.078125, + "learning_rate": 6.041010500959636e-07, + "loss": 0.1129, + "mean_token_accuracy": 0.9740157461166382, + "num_tokens": 6960306994.0, + "step": 49300 + }, + { + "entropy": 1.329837028980255, + "epoch": 1.6111129248147302, + "grad_norm": 1.265625, + "learning_rate": 5.992367593335453e-07, + "loss": 0.1108, + "mean_token_accuracy": 0.9734898245334626, + "num_tokens": 6966883891.0, + "step": 49350 + }, + { + "entropy": 1.3326479887962341, + "epoch": 1.612745258071888, + "grad_norm": 1.4140625, + "learning_rate": 5.943899578822175e-07, + "loss": 0.1136, + "mean_token_accuracy": 0.9735347747802734, + "num_tokens": 6973945682.0, + "step": 49400 + }, + { + "entropy": 1.326375277042389, + "epoch": 1.614377591329046, + "grad_norm": 1.046875, + "learning_rate": 5.895606810504245e-07, + "loss": 0.1126, + "mean_token_accuracy": 0.9733131611347199, + "num_tokens": 6980960627.0, + "step": 49450 + }, + { + "entropy": 1.3266846776008605, + "epoch": 1.6160099245862036, + "grad_norm": 1.625, + "learning_rate": 5.847489640189483e-07, + "loss": 0.1138, + "mean_token_accuracy": 0.9726812386512756, + "num_tokens": 6987961577.0, + "step": 49500 + }, + { + "epoch": 1.6160099245862036, + "eval_entropy": 1.327593413988749, + "eval_loss": 0.1423512101173401, + "eval_mean_token_accuracy": 0.9675911315282186, + "eval_num_tokens": 6987961577.0, + "eval_runtime": 752.8917, + "eval_samples_per_second": 12.825, + "eval_steps_per_second": 0.101, + "step": 49500 + }, + { + "entropy": 1.3377934908866882, + "epoch": 1.6176422578433614, + "grad_norm": 1.21875, + "learning_rate": 5.799548418406465e-07, + "loss": 0.1259, + "mean_token_accuracy": 0.9708180844783783, + "num_tokens": 6995561320.0, + "step": 49550 + }, + { + "entropy": 1.329156894683838, + "epoch": 1.619274591100519, + "grad_norm": 0.10498046875, + "learning_rate": 5.751783494402026e-07, + "loss": 0.1231, + "mean_token_accuracy": 0.9713264811038971, + "num_tokens": 7002493076.0, + "step": 49600 + }, + { + "entropy": 1.3364929604530333, + "epoch": 1.6209069243576768, + "grad_norm": 1.9609375, + "learning_rate": 5.704195216138692e-07, + "loss": 0.1268, + "mean_token_accuracy": 0.9701859080791473, + "num_tokens": 7010022324.0, + "step": 49650 + }, + { + "entropy": 1.3322447371482848, + "epoch": 1.6225392576148345, + "grad_norm": 0.0128173828125, + "learning_rate": 5.656783930292111e-07, + "loss": 0.1099, + "mean_token_accuracy": 0.9747293889522552, + "num_tokens": 7016790835.0, + "step": 49700 + }, + { + "entropy": 1.3278017139434815, + "epoch": 1.6241715908719925, + "grad_norm": 1.296875, + "learning_rate": 5.609549982248599e-07, + "loss": 0.1191, + "mean_token_accuracy": 0.9724988090991974, + "num_tokens": 7023902551.0, + "step": 49750 + }, + { + "entropy": 1.3292198777198792, + "epoch": 1.6258039241291502, + "grad_norm": 1.2734375, + "learning_rate": 5.562493716102552e-07, + "loss": 0.1221, + "mean_token_accuracy": 0.9709643149375915, + "num_tokens": 7031377627.0, + "step": 49800 + }, + { + "entropy": 1.324635624885559, + "epoch": 1.6274362573863081, + "grad_norm": 1.515625, + "learning_rate": 5.515615474653998e-07, + "loss": 0.124, + "mean_token_accuracy": 0.9710781908035279, + "num_tokens": 7039286978.0, + "step": 49850 + }, + { + "entropy": 1.3296051907539368, + "epoch": 1.6290685906434659, + "grad_norm": 1.2734375, + "learning_rate": 5.46891559940605e-07, + "loss": 0.108, + "mean_token_accuracy": 0.9743620455265045, + "num_tokens": 7046170706.0, + "step": 49900 + }, + { + "entropy": 1.3330603170394897, + "epoch": 1.6307009239006236, + "grad_norm": 1.3828125, + "learning_rate": 5.422394430562457e-07, + "loss": 0.1062, + "mean_token_accuracy": 0.9752356350421906, + "num_tokens": 7052934305.0, + "step": 49950 + }, + { + "entropy": 1.3311968517303467, + "epoch": 1.6323332571577813, + "grad_norm": 1.1796875, + "learning_rate": 5.376052307025119e-07, + "loss": 0.1239, + "mean_token_accuracy": 0.9706245791912079, + "num_tokens": 7060043084.0, + "step": 50000 + }, + { + "epoch": 1.6323332571577813, + "eval_entropy": 1.3246519072850544, + "eval_loss": 0.14238578081130981, + "eval_mean_token_accuracy": 0.9675867708524069, + "eval_num_tokens": 7060043084.0, + "eval_runtime": 752.4961, + "eval_samples_per_second": 12.832, + "eval_steps_per_second": 0.101, + "step": 50000 + }, + { + "entropy": 1.3305332589149474, + "epoch": 1.633965590414939, + "grad_norm": 1.96875, + "learning_rate": 5.329889566391578e-07, + "loss": 0.1144, + "mean_token_accuracy": 0.9729759168624877, + "num_tokens": 7066947331.0, + "step": 50050 + }, + { + "entropy": 1.3186654925346375, + "epoch": 1.6355979236720968, + "grad_norm": 2.109375, + "learning_rate": 5.283906544952627e-07, + "loss": 0.1132, + "mean_token_accuracy": 0.9731574881076813, + "num_tokens": 7074008041.0, + "step": 50100 + }, + { + "entropy": 1.318843502998352, + "epoch": 1.6372302569292547, + "grad_norm": 1.5703125, + "learning_rate": 5.238103577689788e-07, + "loss": 0.1161, + "mean_token_accuracy": 0.9729295611381531, + "num_tokens": 7081071001.0, + "step": 50150 + }, + { + "entropy": 1.3307886505126953, + "epoch": 1.6388625901864124, + "grad_norm": 1.671875, + "learning_rate": 5.192480998272943e-07, + "loss": 0.1142, + "mean_token_accuracy": 0.972575945854187, + "num_tokens": 7088102191.0, + "step": 50200 + }, + { + "entropy": 1.335254201889038, + "epoch": 1.6404949234435704, + "grad_norm": 1.0390625, + "learning_rate": 5.147039139057831e-07, + "loss": 0.1271, + "mean_token_accuracy": 0.9703405356407165, + "num_tokens": 7095646828.0, + "step": 50250 + }, + { + "entropy": 1.3202842998504638, + "epoch": 1.6421272567007281, + "grad_norm": 2.125, + "learning_rate": 5.101778331083691e-07, + "loss": 0.1085, + "mean_token_accuracy": 0.9740463018417358, + "num_tokens": 7102740051.0, + "step": 50300 + }, + { + "entropy": 1.3307878541946412, + "epoch": 1.6437595899578858, + "grad_norm": 2.0625, + "learning_rate": 5.05669890407081e-07, + "loss": 0.1114, + "mean_token_accuracy": 0.9739047718048096, + "num_tokens": 7109640526.0, + "step": 50350 + }, + { + "entropy": 1.3166346144676209, + "epoch": 1.6453919232150436, + "grad_norm": 0.08984375, + "learning_rate": 5.011801186418147e-07, + "loss": 0.1082, + "mean_token_accuracy": 0.974234766960144, + "num_tokens": 7116612724.0, + "step": 50400 + }, + { + "entropy": 1.324273819923401, + "epoch": 1.6470242564722013, + "grad_norm": 1.75, + "learning_rate": 4.967085505200896e-07, + "loss": 0.1145, + "mean_token_accuracy": 0.9735607969760894, + "num_tokens": 7123679576.0, + "step": 50450 + }, + { + "entropy": 1.3159007930755615, + "epoch": 1.648656589729359, + "grad_norm": 2.140625, + "learning_rate": 4.922552186168168e-07, + "loss": 0.1175, + "mean_token_accuracy": 0.9724935472011567, + "num_tokens": 7130980530.0, + "step": 50500 + }, + { + "epoch": 1.648656589729359, + "eval_entropy": 1.324186561902364, + "eval_loss": 0.14239400625228882, + "eval_mean_token_accuracy": 0.9676336812973022, + "eval_num_tokens": 7130980530.0, + "eval_runtime": 748.1515, + "eval_samples_per_second": 12.906, + "eval_steps_per_second": 0.102, + "step": 50500 + }, + { + "entropy": 1.320852587223053, + "epoch": 1.650288922986517, + "grad_norm": 1.2421875, + "learning_rate": 4.878201553740573e-07, + "loss": 0.1148, + "mean_token_accuracy": 0.9720003747940064, + "num_tokens": 7137814687.0, + "step": 50550 + }, + { + "entropy": 1.3245435237884522, + "epoch": 1.6519212562436747, + "grad_norm": 2.34375, + "learning_rate": 4.834033931007857e-07, + "loss": 0.1079, + "mean_token_accuracy": 0.9746451807022095, + "num_tokens": 7144119513.0, + "step": 50600 + }, + { + "entropy": 1.317584047317505, + "epoch": 1.6535535895008326, + "grad_norm": 1.5390625, + "learning_rate": 4.790049639726581e-07, + "loss": 0.1097, + "mean_token_accuracy": 0.9739763534069061, + "num_tokens": 7150741274.0, + "step": 50650 + }, + { + "entropy": 1.3234117150306701, + "epoch": 1.6551859227579904, + "grad_norm": 0.9921875, + "learning_rate": 4.746249000317725e-07, + "loss": 0.1069, + "mean_token_accuracy": 0.9751713788509369, + "num_tokens": 7157786331.0, + "step": 50700 + }, + { + "entropy": 1.3205085873603821, + "epoch": 1.656818256015148, + "grad_norm": 1.7578125, + "learning_rate": 4.702632331864422e-07, + "loss": 0.1046, + "mean_token_accuracy": 0.9752000343799591, + "num_tokens": 7164501001.0, + "step": 50750 + }, + { + "entropy": 1.3175451397895812, + "epoch": 1.6584505892723058, + "grad_norm": 1.6171875, + "learning_rate": 4.6591999521095563e-07, + "loss": 0.0972, + "mean_token_accuracy": 0.9763833940029144, + "num_tokens": 7170583305.0, + "step": 50800 + }, + { + "entropy": 1.326640043258667, + "epoch": 1.6600829225294635, + "grad_norm": 1.40625, + "learning_rate": 4.6159521774535153e-07, + "loss": 0.1176, + "mean_token_accuracy": 0.9723085272312164, + "num_tokens": 7177641823.0, + "step": 50850 + }, + { + "entropy": 1.3212744474411011, + "epoch": 1.6617152557866213, + "grad_norm": 1.578125, + "learning_rate": 4.572889322951863e-07, + "loss": 0.1152, + "mean_token_accuracy": 0.972984424829483, + "num_tokens": 7184801732.0, + "step": 50900 + }, + { + "entropy": 1.317273302078247, + "epoch": 1.6633475890437792, + "grad_norm": 2.78125, + "learning_rate": 4.530011702313006e-07, + "loss": 0.1081, + "mean_token_accuracy": 0.9745338428020477, + "num_tokens": 7191485350.0, + "step": 50950 + }, + { + "entropy": 1.3148117685317993, + "epoch": 1.664979922300937, + "grad_norm": 1.4375, + "learning_rate": 4.487319627895976e-07, + "loss": 0.1132, + "mean_token_accuracy": 0.9734457182884216, + "num_tokens": 7198363082.0, + "step": 51000 + }, + { + "epoch": 1.664979922300937, + "eval_entropy": 1.3203301127751668, + "eval_loss": 0.14241376519203186, + "eval_mean_token_accuracy": 0.9676187674204508, + "eval_num_tokens": 7198363082.0, + "eval_runtime": 750.908, + "eval_samples_per_second": 12.859, + "eval_steps_per_second": 0.101, + "step": 51000 + }, + { + "entropy": 1.3135675048828126, + "epoch": 1.666612255558095, + "grad_norm": 0.0026092529296875, + "learning_rate": 4.4448134107080895e-07, + "loss": 0.1087, + "mean_token_accuracy": 0.9744446206092835, + "num_tokens": 7205394224.0, + "step": 51050 + }, + { + "entropy": 1.330612359046936, + "epoch": 1.6682445888152526, + "grad_norm": 2.40625, + "learning_rate": 4.4024933604027495e-07, + "loss": 0.118, + "mean_token_accuracy": 0.9718951296806335, + "num_tokens": 7212621966.0, + "step": 51100 + }, + { + "entropy": 1.3243393778800965, + "epoch": 1.6698769220724103, + "grad_norm": 2.34375, + "learning_rate": 4.360359785277107e-07, + "loss": 0.1136, + "mean_token_accuracy": 0.9730891573429108, + "num_tokens": 7219256110.0, + "step": 51150 + }, + { + "entropy": 1.321114592552185, + "epoch": 1.671509255329568, + "grad_norm": 1.6796875, + "learning_rate": 4.3184129922699e-07, + "loss": 0.1132, + "mean_token_accuracy": 0.9732060301303863, + "num_tokens": 7226320848.0, + "step": 51200 + }, + { + "entropy": 1.3259812951087953, + "epoch": 1.6731415885867258, + "grad_norm": 1.8984375, + "learning_rate": 4.276653286959168e-07, + "loss": 0.1046, + "mean_token_accuracy": 0.9755737960338593, + "num_tokens": 7233157988.0, + "step": 51250 + }, + { + "entropy": 1.3230745482444763, + "epoch": 1.6747739218438835, + "grad_norm": 2.09375, + "learning_rate": 4.2350809735600106e-07, + "loss": 0.112, + "mean_token_accuracy": 0.9729019176959991, + "num_tokens": 7240476512.0, + "step": 51300 + }, + { + "entropy": 1.3131797289848328, + "epoch": 1.6764062551010415, + "grad_norm": 1.171875, + "learning_rate": 4.1936963549224396e-07, + "loss": 0.1124, + "mean_token_accuracy": 0.9734921300411224, + "num_tokens": 7247352561.0, + "step": 51350 + }, + { + "entropy": 1.306911015510559, + "epoch": 1.6780385883581992, + "grad_norm": 1.25, + "learning_rate": 4.1524997325290903e-07, + "loss": 0.1107, + "mean_token_accuracy": 0.9738042771816253, + "num_tokens": 7254496106.0, + "step": 51400 + }, + { + "entropy": 1.3271653127670289, + "epoch": 1.6796709216153571, + "grad_norm": 0.00311279296875, + "learning_rate": 4.1114914064930875e-07, + "loss": 0.1095, + "mean_token_accuracy": 0.973537621498108, + "num_tokens": 7261315934.0, + "step": 51450 + }, + { + "entropy": 1.3219536185264587, + "epoch": 1.6813032548725149, + "grad_norm": 2.015625, + "learning_rate": 4.0706716755558326e-07, + "loss": 0.1049, + "mean_token_accuracy": 0.9747460389137268, + "num_tokens": 7268210775.0, + "step": 51500 + }, + { + "epoch": 1.6813032548725149, + "eval_entropy": 1.3198106654485067, + "eval_loss": 0.14244017004966736, + "eval_mean_token_accuracy": 0.9676569310824076, + "eval_num_tokens": 7268210775.0, + "eval_runtime": 753.6731, + "eval_samples_per_second": 12.812, + "eval_steps_per_second": 0.101, + "step": 51500 + }, + { + "entropy": 1.3197919082641603, + "epoch": 1.6829355881296726, + "grad_norm": 1.890625, + "learning_rate": 4.0300408370848365e-07, + "loss": 0.1043, + "mean_token_accuracy": 0.9753884637355804, + "num_tokens": 7274845487.0, + "step": 51550 + }, + { + "entropy": 1.3303123378753663, + "epoch": 1.6845679213868303, + "grad_norm": 1.3359375, + "learning_rate": 3.9895991870715264e-07, + "loss": 0.1181, + "mean_token_accuracy": 0.9722198081016541, + "num_tokens": 7282313836.0, + "step": 51600 + }, + { + "entropy": 1.3250636410713197, + "epoch": 1.686200254643988, + "grad_norm": 1.3203125, + "learning_rate": 3.9493470201291404e-07, + "loss": 0.109, + "mean_token_accuracy": 0.9739108157157897, + "num_tokens": 7289309163.0, + "step": 51650 + }, + { + "entropy": 1.3373154950141908, + "epoch": 1.6878325879011458, + "grad_norm": 1.171875, + "learning_rate": 3.909284629490526e-07, + "loss": 0.1186, + "mean_token_accuracy": 0.9716354882717133, + "num_tokens": 7296551436.0, + "step": 51700 + }, + { + "entropy": 1.3240708827972412, + "epoch": 1.6894649211583037, + "grad_norm": 1.1640625, + "learning_rate": 3.8694123070060473e-07, + "loss": 0.1103, + "mean_token_accuracy": 0.9735696887969971, + "num_tokens": 7303811515.0, + "step": 51750 + }, + { + "entropy": 1.3273040246963501, + "epoch": 1.6910972544154614, + "grad_norm": 1.9140625, + "learning_rate": 3.8297303431414455e-07, + "loss": 0.1176, + "mean_token_accuracy": 0.9723230707645416, + "num_tokens": 7310596062.0, + "step": 51800 + }, + { + "entropy": 1.3321073579788207, + "epoch": 1.6927295876726194, + "grad_norm": 1.59375, + "learning_rate": 3.7902390269756883e-07, + "loss": 0.1132, + "mean_token_accuracy": 0.9733636856079102, + "num_tokens": 7318010738.0, + "step": 51850 + }, + { + "entropy": 1.3346837186813354, + "epoch": 1.694361920929777, + "grad_norm": 1.4296875, + "learning_rate": 3.75093864619894e-07, + "loss": 0.118, + "mean_token_accuracy": 0.9715818917751312, + "num_tokens": 7325178822.0, + "step": 51900 + }, + { + "entropy": 1.327819790840149, + "epoch": 1.6959942541869348, + "grad_norm": 1.4609375, + "learning_rate": 3.7118294871103764e-07, + "loss": 0.1129, + "mean_token_accuracy": 0.9730360591411591, + "num_tokens": 7332489545.0, + "step": 51950 + }, + { + "entropy": 1.3169277691841126, + "epoch": 1.6976265874440926, + "grad_norm": 1.828125, + "learning_rate": 3.672911834616175e-07, + "loss": 0.1027, + "mean_token_accuracy": 0.9758607912063598, + "num_tokens": 7339208562.0, + "step": 52000 + }, + { + "epoch": 1.6976265874440926, + "eval_entropy": 1.3195398267110188, + "eval_loss": 0.1424235850572586, + "eval_mean_token_accuracy": 0.9676573673884074, + "eval_num_tokens": 7339208562.0, + "eval_runtime": 751.2637, + "eval_samples_per_second": 12.853, + "eval_steps_per_second": 0.101, + "step": 52000 + }, + { + "entropy": 1.3143286561965943, + "epoch": 1.6992589207012503, + "grad_norm": 1.6484375, + "learning_rate": 3.6341859722273907e-07, + "loss": 0.1075, + "mean_token_accuracy": 0.9739359652996064, + "num_tokens": 7346374510.0, + "step": 52050 + }, + { + "entropy": 1.323881621360779, + "epoch": 1.700891253958408, + "grad_norm": 1.4375, + "learning_rate": 3.5956521820579126e-07, + "loss": 0.0998, + "mean_token_accuracy": 0.9760903561115265, + "num_tokens": 7353032386.0, + "step": 52100 + }, + { + "entropy": 1.3202632975578308, + "epoch": 1.702523587215566, + "grad_norm": 1.640625, + "learning_rate": 3.5573107448224085e-07, + "loss": 0.118, + "mean_token_accuracy": 0.9715847325325012, + "num_tokens": 7360252648.0, + "step": 52150 + }, + { + "entropy": 1.330101602077484, + "epoch": 1.7041559204727237, + "grad_norm": 0.96484375, + "learning_rate": 3.519161939834264e-07, + "loss": 0.1136, + "mean_token_accuracy": 0.9737802124023438, + "num_tokens": 7367323821.0, + "step": 52200 + }, + { + "entropy": 1.3269778847694398, + "epoch": 1.7057882537298816, + "grad_norm": 0.78515625, + "learning_rate": 3.4812060450035723e-07, + "loss": 0.1091, + "mean_token_accuracy": 0.9739231073856354, + "num_tokens": 7374681816.0, + "step": 52250 + }, + { + "entropy": 1.3149812078475953, + "epoch": 1.7074205869870394, + "grad_norm": 1.3359375, + "learning_rate": 3.44344333683508e-07, + "loss": 0.1049, + "mean_token_accuracy": 0.9752970814704895, + "num_tokens": 7381861290.0, + "step": 52300 + }, + { + "entropy": 1.31980233669281, + "epoch": 1.709052920244197, + "grad_norm": 2.078125, + "learning_rate": 3.4058740904262077e-07, + "loss": 0.1137, + "mean_token_accuracy": 0.9733495855331421, + "num_tokens": 7389307478.0, + "step": 52350 + }, + { + "entropy": 1.3273230743408204, + "epoch": 1.7106852535013548, + "grad_norm": 1.8203125, + "learning_rate": 3.3684985794650025e-07, + "loss": 0.1119, + "mean_token_accuracy": 0.9732711517810821, + "num_tokens": 7396166622.0, + "step": 52400 + }, + { + "entropy": 1.3064412307739257, + "epoch": 1.7123175867585125, + "grad_norm": 1.6875, + "learning_rate": 3.3313170762281964e-07, + "loss": 0.1004, + "mean_token_accuracy": 0.9755911242961883, + "num_tokens": 7403084742.0, + "step": 52450 + }, + { + "entropy": 1.3238463592529297, + "epoch": 1.7139499200156703, + "grad_norm": 2.671875, + "learning_rate": 3.294329851579181e-07, + "loss": 0.101, + "mean_token_accuracy": 0.9754888868331909, + "num_tokens": 7409894593.0, + "step": 52500 + }, + { + "epoch": 1.7139499200156703, + "eval_entropy": 1.3191882546742757, + "eval_loss": 0.14246180653572083, + "eval_mean_token_accuracy": 0.9676294851303101, + "eval_num_tokens": 7409894593.0, + "eval_runtime": 749.671, + "eval_samples_per_second": 12.88, + "eval_steps_per_second": 0.101, + "step": 52500 + }, + { + "entropy": 1.3101821112632752, + "epoch": 1.7155822532728282, + "grad_norm": 1.1796875, + "learning_rate": 3.25753717496604e-07, + "loss": 0.0956, + "mean_token_accuracy": 0.9765710198879242, + "num_tokens": 7416484075.0, + "step": 52550 + }, + { + "entropy": 1.3191359090805053, + "epoch": 1.717214586529986, + "grad_norm": 1.359375, + "learning_rate": 3.220939314419614e-07, + "loss": 0.106, + "mean_token_accuracy": 0.9754664206504822, + "num_tokens": 7423584170.0, + "step": 52600 + }, + { + "entropy": 1.3270062279701234, + "epoch": 1.7188469197871439, + "grad_norm": 1.359375, + "learning_rate": 3.1845365365515136e-07, + "loss": 0.1058, + "mean_token_accuracy": 0.9742505669593811, + "num_tokens": 7430705466.0, + "step": 52650 + }, + { + "entropy": 1.308781328201294, + "epoch": 1.7204792530443016, + "grad_norm": 1.515625, + "learning_rate": 3.14832910655221e-07, + "loss": 0.101, + "mean_token_accuracy": 0.9755427074432373, + "num_tokens": 7437413037.0, + "step": 52700 + }, + { + "entropy": 1.3295643472671508, + "epoch": 1.7221115863014593, + "grad_norm": 1.3203125, + "learning_rate": 3.1123172881890593e-07, + "loss": 0.1149, + "mean_token_accuracy": 0.9725350320339203, + "num_tokens": 7445324755.0, + "step": 52750 + }, + { + "entropy": 1.306700224876404, + "epoch": 1.723743919558617, + "grad_norm": 1.734375, + "learning_rate": 3.076501343804432e-07, + "loss": 0.1028, + "mean_token_accuracy": 0.9747404766082763, + "num_tokens": 7452440110.0, + "step": 52800 + }, + { + "entropy": 1.320225818157196, + "epoch": 1.7253762528157748, + "grad_norm": 2.015625, + "learning_rate": 3.0408815343137576e-07, + "loss": 0.1097, + "mean_token_accuracy": 0.9736387383937836, + "num_tokens": 7459466767.0, + "step": 52850 + }, + { + "entropy": 1.319351954460144, + "epoch": 1.7270085860729325, + "grad_norm": 2.25, + "learning_rate": 3.005458119203661e-07, + "loss": 0.0996, + "mean_token_accuracy": 0.9759821879863739, + "num_tokens": 7466190850.0, + "step": 52900 + }, + { + "entropy": 1.3243736958503722, + "epoch": 1.7286409193300905, + "grad_norm": 1.0625, + "learning_rate": 2.970231356530037e-07, + "loss": 0.1178, + "mean_token_accuracy": 0.9724557065963745, + "num_tokens": 7473713575.0, + "step": 52950 + }, + { + "entropy": 1.3095801281929016, + "epoch": 1.7302732525872482, + "grad_norm": 2.3125, + "learning_rate": 2.935201502916196e-07, + "loss": 0.104, + "mean_token_accuracy": 0.975416682958603, + "num_tokens": 7480846261.0, + "step": 53000 + }, + { + "epoch": 1.7302732525872482, + "eval_entropy": 1.3154603624343872, + "eval_loss": 0.1425192952156067, + "eval_mean_token_accuracy": 0.9676680334409078, + "eval_num_tokens": 7480846261.0, + "eval_runtime": 749.6932, + "eval_samples_per_second": 12.88, + "eval_steps_per_second": 0.101, + "step": 53000 + }, + { + "entropy": 1.327847077846527, + "epoch": 1.7319055858444061, + "grad_norm": 1.890625, + "learning_rate": 2.9003688135509996e-07, + "loss": 0.1027, + "mean_token_accuracy": 0.9751829147338867, + "num_tokens": 7487902389.0, + "step": 53050 + }, + { + "entropy": 1.3144252371788026, + "epoch": 1.7335379191015639, + "grad_norm": 1.4140625, + "learning_rate": 2.86573354218696e-07, + "loss": 0.0943, + "mean_token_accuracy": 0.9771701264381408, + "num_tokens": 7494511451.0, + "step": 53100 + }, + { + "entropy": 1.317789807319641, + "epoch": 1.7351702523587216, + "grad_norm": 2.09375, + "learning_rate": 2.8312959411384496e-07, + "loss": 0.1057, + "mean_token_accuracy": 0.9748926043510437, + "num_tokens": 7501653002.0, + "step": 53150 + }, + { + "entropy": 1.3131379342079164, + "epoch": 1.7368025856158793, + "grad_norm": 1.5234375, + "learning_rate": 2.7970562612798003e-07, + "loss": 0.0964, + "mean_token_accuracy": 0.9761436748504638, + "num_tokens": 7508205530.0, + "step": 53200 + }, + { + "entropy": 1.3117465686798095, + "epoch": 1.738434918873037, + "grad_norm": 1.203125, + "learning_rate": 2.7630147520435454e-07, + "loss": 0.0973, + "mean_token_accuracy": 0.9763562536239624, + "num_tokens": 7514970855.0, + "step": 53250 + }, + { + "entropy": 1.3275910449028014, + "epoch": 1.7400672521301948, + "grad_norm": 1.34375, + "learning_rate": 2.729171661418536e-07, + "loss": 0.1058, + "mean_token_accuracy": 0.9749761927127838, + "num_tokens": 7522035646.0, + "step": 53300 + }, + { + "entropy": 1.3121574544906616, + "epoch": 1.7416995853873527, + "grad_norm": 1.109375, + "learning_rate": 2.695527235948176e-07, + "loss": 0.1045, + "mean_token_accuracy": 0.9752842879295349, + "num_tokens": 7529319921.0, + "step": 53350 + }, + { + "entropy": 1.3087862515449524, + "epoch": 1.7433319186445104, + "grad_norm": 1.21875, + "learning_rate": 2.662081720728621e-07, + "loss": 0.0898, + "mean_token_accuracy": 0.9782419979572297, + "num_tokens": 7535926578.0, + "step": 53400 + }, + { + "entropy": 1.3212140440940856, + "epoch": 1.7449642519016684, + "grad_norm": 1.4453125, + "learning_rate": 2.6288353594069716e-07, + "loss": 0.0961, + "mean_token_accuracy": 0.9764723992347717, + "num_tokens": 7542728374.0, + "step": 53450 + }, + { + "entropy": 1.3193033647537231, + "epoch": 1.746596585158826, + "grad_norm": 1.265625, + "learning_rate": 2.595788394179528e-07, + "loss": 0.1046, + "mean_token_accuracy": 0.9745003497600555, + "num_tokens": 7549497199.0, + "step": 53500 + }, + { + "epoch": 1.746596585158826, + "eval_entropy": 1.316735652287801, + "eval_loss": 0.14255692064762115, + "eval_mean_token_accuracy": 0.9676073582967123, + "eval_num_tokens": 7549497199.0, + "eval_runtime": 749.254, + "eval_samples_per_second": 12.887, + "eval_steps_per_second": 0.101, + "step": 53500 + }, + { + "entropy": 1.3119576716423034, + "epoch": 1.7482289184159838, + "grad_norm": 1.3046875, + "learning_rate": 2.562941065789989e-07, + "loss": 0.1075, + "mean_token_accuracy": 0.9740070915222168, + "num_tokens": 7556760300.0, + "step": 53550 + }, + { + "entropy": 1.323666477203369, + "epoch": 1.7498612516731415, + "grad_norm": 1.40625, + "learning_rate": 2.530293613527752e-07, + "loss": 0.1049, + "mean_token_accuracy": 0.9741265332698822, + "num_tokens": 7563873501.0, + "step": 53600 + }, + { + "entropy": 1.313588421344757, + "epoch": 1.7514935849302993, + "grad_norm": 2.484375, + "learning_rate": 2.497846275226101e-07, + "loss": 0.0999, + "mean_token_accuracy": 0.976412239074707, + "num_tokens": 7570815590.0, + "step": 53650 + }, + { + "entropy": 1.310178370475769, + "epoch": 1.753125918187457, + "grad_norm": 1.15625, + "learning_rate": 2.4655992872605383e-07, + "loss": 0.0919, + "mean_token_accuracy": 0.9780379617214203, + "num_tokens": 7577519323.0, + "step": 53700 + }, + { + "entropy": 1.3194669818878173, + "epoch": 1.754758251444615, + "grad_norm": 1.6328125, + "learning_rate": 2.43355288454702e-07, + "loss": 0.1098, + "mean_token_accuracy": 0.973793808221817, + "num_tokens": 7584868782.0, + "step": 53750 + }, + { + "entropy": 1.3247113370895385, + "epoch": 1.7563905847017727, + "grad_norm": 1.2265625, + "learning_rate": 2.401707300540279e-07, + "loss": 0.0981, + "mean_token_accuracy": 0.9766256093978882, + "num_tokens": 7591651694.0, + "step": 53800 + }, + { + "entropy": 1.320894329547882, + "epoch": 1.7580229179589306, + "grad_norm": 1.8359375, + "learning_rate": 2.3700627672320707e-07, + "loss": 0.0977, + "mean_token_accuracy": 0.9763943207263946, + "num_tokens": 7598567378.0, + "step": 53850 + }, + { + "entropy": 1.3126529788970946, + "epoch": 1.7596552512160883, + "grad_norm": 1.375, + "learning_rate": 2.338619515149546e-07, + "loss": 0.0914, + "mean_token_accuracy": 0.9772222137451172, + "num_tokens": 7605566065.0, + "step": 53900 + }, + { + "entropy": 1.3226768159866333, + "epoch": 1.761287584473246, + "grad_norm": 1.609375, + "learning_rate": 2.307377773353535e-07, + "loss": 0.1152, + "mean_token_accuracy": 0.9727129638195038, + "num_tokens": 7613562049.0, + "step": 53950 + }, + { + "entropy": 1.3229237818717956, + "epoch": 1.7629199177304038, + "grad_norm": 1.3203125, + "learning_rate": 2.2763377694368827e-07, + "loss": 0.1008, + "mean_token_accuracy": 0.975539722442627, + "num_tokens": 7620524140.0, + "step": 54000 + }, + { + "epoch": 1.7629199177304038, + "eval_entropy": 1.3142534939448038, + "eval_loss": 0.14254117012023926, + "eval_mean_token_accuracy": 0.9676929664611816, + "eval_num_tokens": 7620524140.0, + "eval_runtime": 752.4204, + "eval_samples_per_second": 12.833, + "eval_steps_per_second": 0.101, + "step": 54000 + }, + { + "entropy": 1.3087509632110597, + "epoch": 1.7645522509875615, + "grad_norm": 1.171875, + "learning_rate": 2.2454997295227985e-07, + "loss": 0.101, + "mean_token_accuracy": 0.9754433751106262, + "num_tokens": 7627617937.0, + "step": 54050 + }, + { + "entropy": 1.3296643924713134, + "epoch": 1.7661845842447192, + "grad_norm": 2.046875, + "learning_rate": 2.2148638782631969e-07, + "loss": 0.1022, + "mean_token_accuracy": 0.9750396251678467, + "num_tokens": 7634624283.0, + "step": 54100 + }, + { + "entropy": 1.3130810499191283, + "epoch": 1.7678169175018772, + "grad_norm": 1.2734375, + "learning_rate": 2.1844304388370862e-07, + "loss": 0.0997, + "mean_token_accuracy": 0.9761820614337922, + "num_tokens": 7642188094.0, + "step": 54150 + }, + { + "entropy": 1.3113509464263915, + "epoch": 1.769449250759035, + "grad_norm": 1.1796875, + "learning_rate": 2.154199632948901e-07, + "loss": 0.0925, + "mean_token_accuracy": 0.9778699886798858, + "num_tokens": 7648970433.0, + "step": 54200 + }, + { + "entropy": 1.321754252910614, + "epoch": 1.7710815840161929, + "grad_norm": 1.09375, + "learning_rate": 2.124171680826934e-07, + "loss": 0.1044, + "mean_token_accuracy": 0.974656708240509, + "num_tokens": 7656027769.0, + "step": 54250 + }, + { + "entropy": 1.3192690873146058, + "epoch": 1.7727139172733506, + "grad_norm": 1.578125, + "learning_rate": 2.094346801221706e-07, + "loss": 0.0962, + "mean_token_accuracy": 0.9762521004676818, + "num_tokens": 7662871105.0, + "step": 54300 + }, + { + "entropy": 1.3231295156478882, + "epoch": 1.7743462505305083, + "grad_norm": 1.40625, + "learning_rate": 2.0647252114043548e-07, + "loss": 0.1029, + "mean_token_accuracy": 0.975200617313385, + "num_tokens": 7670265436.0, + "step": 54350 + }, + { + "entropy": 1.3153988409042359, + "epoch": 1.775978583787666, + "grad_norm": 1.21875, + "learning_rate": 2.0353071271651024e-07, + "loss": 0.0908, + "mean_token_accuracy": 0.9769929325580597, + "num_tokens": 7677021371.0, + "step": 54400 + }, + { + "entropy": 1.3150772476196289, + "epoch": 1.7776109170448238, + "grad_norm": 1.046875, + "learning_rate": 2.006092762811631e-07, + "loss": 0.0999, + "mean_token_accuracy": 0.9757753646373749, + "num_tokens": 7684035466.0, + "step": 54450 + }, + { + "entropy": 1.3111911249160766, + "epoch": 1.7792432503019815, + "grad_norm": 1.65625, + "learning_rate": 1.9770823311675622e-07, + "loss": 0.0956, + "mean_token_accuracy": 0.9768190658092499, + "num_tokens": 7690811924.0, + "step": 54500 + }, + { + "epoch": 1.7792432503019815, + "eval_entropy": 1.3134044408798218, + "eval_loss": 0.14260686933994293, + "eval_mean_token_accuracy": 0.9676902786890665, + "eval_num_tokens": 7690811924.0, + "eval_runtime": 751.7328, + "eval_samples_per_second": 12.845, + "eval_steps_per_second": 0.101, + "step": 54500 + }, + { + "entropy": 1.3163812851905823, + "epoch": 1.7808755835591394, + "grad_norm": 1.2578125, + "learning_rate": 1.948276043570867e-07, + "loss": 0.0971, + "mean_token_accuracy": 0.9768479645252228, + "num_tokens": 7697685594.0, + "step": 54550 + }, + { + "entropy": 1.3316512179374695, + "epoch": 1.7825079168162972, + "grad_norm": 1.1484375, + "learning_rate": 1.9196741098723714e-07, + "loss": 0.1016, + "mean_token_accuracy": 0.9754971957206726, + "num_tokens": 7704672835.0, + "step": 54600 + }, + { + "entropy": 1.3212800335884094, + "epoch": 1.7841402500734551, + "grad_norm": 1.296875, + "learning_rate": 1.8912767384341967e-07, + "loss": 0.0971, + "mean_token_accuracy": 0.9769463586807251, + "num_tokens": 7712059362.0, + "step": 54650 + }, + { + "entropy": 1.3257331156730652, + "epoch": 1.7857725833306128, + "grad_norm": 1.4140625, + "learning_rate": 1.863084136128239e-07, + "loss": 0.0931, + "mean_token_accuracy": 0.9771769893169403, + "num_tokens": 7719025676.0, + "step": 54700 + }, + { + "entropy": 1.3107067704200746, + "epoch": 1.7874049165877706, + "grad_norm": 1.375, + "learning_rate": 1.8350965083346883e-07, + "loss": 0.0811, + "mean_token_accuracy": 0.9800481641292572, + "num_tokens": 7725468150.0, + "step": 54750 + }, + { + "entropy": 1.3124605083465577, + "epoch": 1.7890372498449283, + "grad_norm": 2.0, + "learning_rate": 1.807314058940498e-07, + "loss": 0.0905, + "mean_token_accuracy": 0.9778806746006012, + "num_tokens": 7732699148.0, + "step": 54800 + }, + { + "entropy": 1.3118835282325745, + "epoch": 1.790669583102086, + "grad_norm": 1.28125, + "learning_rate": 1.7797369903379447e-07, + "loss": 0.0885, + "mean_token_accuracy": 0.9785989081859588, + "num_tokens": 7739580787.0, + "step": 54850 + }, + { + "entropy": 1.3218691396713256, + "epoch": 1.7923019163592437, + "grad_norm": 1.328125, + "learning_rate": 1.7523655034230913e-07, + "loss": 0.1019, + "mean_token_accuracy": 0.9758154857158661, + "num_tokens": 7746871274.0, + "step": 54900 + }, + { + "entropy": 1.3150616145133973, + "epoch": 1.7939342496164017, + "grad_norm": 1.2265625, + "learning_rate": 1.7251997975944023e-07, + "loss": 0.0885, + "mean_token_accuracy": 0.9779963111877441, + "num_tokens": 7754114830.0, + "step": 54950 + }, + { + "entropy": 1.3193181252479553, + "epoch": 1.7955665828735594, + "grad_norm": 1.6796875, + "learning_rate": 1.698240070751208e-07, + "loss": 0.0954, + "mean_token_accuracy": 0.9768703639507293, + "num_tokens": 7761534891.0, + "step": 55000 + }, + { + "epoch": 1.7955665828735594, + "eval_entropy": 1.3136341873804729, + "eval_loss": 0.14263209700584412, + "eval_mean_token_accuracy": 0.9676380705833435, + "eval_num_tokens": 7761534891.0, + "eval_runtime": 751.8782, + "eval_samples_per_second": 12.843, + "eval_steps_per_second": 0.101, + "step": 55000 + }, + { + "entropy": 1.3227564001083374, + "epoch": 1.7971989161307174, + "grad_norm": 1.359375, + "learning_rate": 1.6714865192923357e-07, + "loss": 0.0931, + "mean_token_accuracy": 0.9776843535900116, + "num_tokens": 7768739013.0, + "step": 55050 + }, + { + "entropy": 1.3159151887893676, + "epoch": 1.798831249387875, + "grad_norm": 1.75, + "learning_rate": 1.644939338114617e-07, + "loss": 0.0891, + "mean_token_accuracy": 0.9785628998279572, + "num_tokens": 7775714510.0, + "step": 55100 + }, + { + "entropy": 1.3172850131988525, + "epoch": 1.8004635826450328, + "grad_norm": 1.5, + "learning_rate": 1.618598720611517e-07, + "loss": 0.0929, + "mean_token_accuracy": 0.9780389821529388, + "num_tokens": 7782939417.0, + "step": 55150 + }, + { + "entropy": 1.3124612760543823, + "epoch": 1.8020959159021905, + "grad_norm": 1.453125, + "learning_rate": 1.5924648586717106e-07, + "loss": 0.0903, + "mean_token_accuracy": 0.9784395337104798, + "num_tokens": 7790076964.0, + "step": 55200 + }, + { + "entropy": 1.3138836860656737, + "epoch": 1.8037282491593483, + "grad_norm": 1.4140625, + "learning_rate": 1.566537942677657e-07, + "loss": 0.0906, + "mean_token_accuracy": 0.9779064965248108, + "num_tokens": 7796643475.0, + "step": 55250 + }, + { + "entropy": 1.3136559057235717, + "epoch": 1.805360582416506, + "grad_norm": 0.00165557861328125, + "learning_rate": 1.5408181615042594e-07, + "loss": 0.0843, + "mean_token_accuracy": 0.9791945159435272, + "num_tokens": 7803352885.0, + "step": 55300 + }, + { + "entropy": 1.3052284288406373, + "epoch": 1.806992915673664, + "grad_norm": 1.265625, + "learning_rate": 1.5153057025174432e-07, + "loss": 0.09, + "mean_token_accuracy": 0.9779746580123901, + "num_tokens": 7810594549.0, + "step": 55350 + }, + { + "entropy": 1.3102794551849366, + "epoch": 1.8086252489308219, + "grad_norm": 1.2265625, + "learning_rate": 1.4900007515728365e-07, + "loss": 0.0844, + "mean_token_accuracy": 0.9790527272224426, + "num_tokens": 7817507513.0, + "step": 55400 + }, + { + "entropy": 1.324124116897583, + "epoch": 1.8102575821879796, + "grad_norm": 1.1953125, + "learning_rate": 1.4649034930143722e-07, + "loss": 0.094, + "mean_token_accuracy": 0.9774300479888915, + "num_tokens": 7824764989.0, + "step": 55450 + }, + { + "entropy": 1.3147008061408996, + "epoch": 1.8118899154451373, + "grad_norm": 1.9765625, + "learning_rate": 1.440014109672978e-07, + "loss": 0.0914, + "mean_token_accuracy": 0.9776774108409881, + "num_tokens": 7831265405.0, + "step": 55500 + }, + { + "epoch": 1.8118899154451373, + "eval_entropy": 1.3136727301279705, + "eval_loss": 0.1426331102848053, + "eval_mean_token_accuracy": 0.9676420497894287, + "eval_num_tokens": 7831265405.0, + "eval_runtime": 749.6902, + "eval_samples_per_second": 12.88, + "eval_steps_per_second": 0.101, + "step": 55500 + }, + { + "entropy": 1.3262977242469787, + "epoch": 1.813522248702295, + "grad_norm": 1.6875, + "learning_rate": 1.415332782865235e-07, + "loss": 0.0996, + "mean_token_accuracy": 0.976102020740509, + "num_tokens": 7838532553.0, + "step": 55550 + }, + { + "entropy": 1.314659128189087, + "epoch": 1.8151545819594528, + "grad_norm": 0.00151824951171875, + "learning_rate": 1.3908596923920348e-07, + "loss": 0.0876, + "mean_token_accuracy": 0.9786652231216431, + "num_tokens": 7845416264.0, + "step": 55600 + }, + { + "entropy": 1.2995626902580262, + "epoch": 1.8167869152166105, + "grad_norm": 1.203125, + "learning_rate": 1.3665950165373177e-07, + "loss": 0.078, + "mean_token_accuracy": 0.9805066454410553, + "num_tokens": 7852143397.0, + "step": 55650 + }, + { + "entropy": 1.3230929231643678, + "epoch": 1.8184192484737685, + "grad_norm": 1.21875, + "learning_rate": 1.3425389320667126e-07, + "loss": 0.0948, + "mean_token_accuracy": 0.9774747908115387, + "num_tokens": 7859183732.0, + "step": 55700 + }, + { + "entropy": 1.3118236589431762, + "epoch": 1.8200515817309262, + "grad_norm": 1.078125, + "learning_rate": 1.3186916142263138e-07, + "loss": 0.0852, + "mean_token_accuracy": 0.9793214762210846, + "num_tokens": 7865800782.0, + "step": 55750 + }, + { + "entropy": 1.3139012956619263, + "epoch": 1.8216839149880841, + "grad_norm": 0.921875, + "learning_rate": 1.295053236741346e-07, + "loss": 0.0858, + "mean_token_accuracy": 0.9790966403484345, + "num_tokens": 7873054112.0, + "step": 55800 + }, + { + "entropy": 1.3215832591056824, + "epoch": 1.8233162482452419, + "grad_norm": 1.5625, + "learning_rate": 1.2716239718149404e-07, + "loss": 0.0915, + "mean_token_accuracy": 0.9782492446899415, + "num_tokens": 7880572440.0, + "step": 55850 + }, + { + "entropy": 1.3250614070892335, + "epoch": 1.8249485815023996, + "grad_norm": 1.421875, + "learning_rate": 1.248403990126864e-07, + "loss": 0.0836, + "mean_token_accuracy": 0.9802043890953064, + "num_tokens": 7887372576.0, + "step": 55900 + }, + { + "entropy": 1.3125244760513306, + "epoch": 1.8265809147595573, + "grad_norm": 1.6953125, + "learning_rate": 1.2253934608322704e-07, + "loss": 0.0877, + "mean_token_accuracy": 0.9786297881603241, + "num_tokens": 7894499299.0, + "step": 55950 + }, + { + "entropy": 1.310175290107727, + "epoch": 1.828213248016715, + "grad_norm": 1.2578125, + "learning_rate": 1.2025925515604797e-07, + "loss": 0.0863, + "mean_token_accuracy": 0.9788778626918793, + "num_tokens": 7901474987.0, + "step": 56000 + }, + { + "epoch": 1.828213248016715, + "eval_entropy": 1.3131318473815918, + "eval_loss": 0.14263643324375153, + "eval_mean_token_accuracy": 0.9676548767089844, + "eval_num_tokens": 7901474987.0, + "eval_runtime": 752.1988, + "eval_samples_per_second": 12.837, + "eval_steps_per_second": 0.101, + "step": 56000 + }, + { + "entropy": 1.307514934539795, + "epoch": 1.8298455812738728, + "grad_norm": 1.0546875, + "learning_rate": 1.1800014284137439e-07, + "loss": 0.0798, + "mean_token_accuracy": 0.9802592170238494, + "num_tokens": 7908470639.0, + "step": 56050 + }, + { + "entropy": 1.3173511362075805, + "epoch": 1.8314779145310307, + "grad_norm": 0.287109375, + "learning_rate": 1.157620255966061e-07, + "loss": 0.0857, + "mean_token_accuracy": 0.9793569481372834, + "num_tokens": 7915474847.0, + "step": 56100 + }, + { + "entropy": 1.326115915775299, + "epoch": 1.8331102477881884, + "grad_norm": 1.40625, + "learning_rate": 1.1354491972619418e-07, + "loss": 0.0882, + "mean_token_accuracy": 0.9790770506858826, + "num_tokens": 7922729639.0, + "step": 56150 + }, + { + "entropy": 1.3074156522750855, + "epoch": 1.8347425810453464, + "grad_norm": 1.125, + "learning_rate": 1.1134884138152556e-07, + "loss": 0.0747, + "mean_token_accuracy": 0.9815619790554047, + "num_tokens": 7929688118.0, + "step": 56200 + }, + { + "entropy": 1.3235062193870544, + "epoch": 1.836374914302504, + "grad_norm": 1.2578125, + "learning_rate": 1.0917380656080234e-07, + "loss": 0.083, + "mean_token_accuracy": 0.9802336692810059, + "num_tokens": 7936632242.0, + "step": 56250 + }, + { + "entropy": 1.320460605621338, + "epoch": 1.8380072475596618, + "grad_norm": 1.453125, + "learning_rate": 1.0701983110892821e-07, + "loss": 0.0871, + "mean_token_accuracy": 0.9788316774368286, + "num_tokens": 7944029771.0, + "step": 56300 + }, + { + "entropy": 1.3198864316940309, + "epoch": 1.8396395808168196, + "grad_norm": 1.671875, + "learning_rate": 1.0488693071738998e-07, + "loss": 0.0914, + "mean_token_accuracy": 0.9782080149650574, + "num_tokens": 7951324936.0, + "step": 56350 + }, + { + "entropy": 1.3099779963493348, + "epoch": 1.8412719140739773, + "grad_norm": 1.5625, + "learning_rate": 1.0277512092414621e-07, + "loss": 0.0808, + "mean_token_accuracy": 0.9805028080940247, + "num_tokens": 7958613468.0, + "step": 56400 + }, + { + "entropy": 1.3207287693023682, + "epoch": 1.842904247331135, + "grad_norm": 2.03125, + "learning_rate": 1.0068441711351239e-07, + "loss": 0.0848, + "mean_token_accuracy": 0.9797763514518738, + "num_tokens": 7965649951.0, + "step": 56450 + }, + { + "entropy": 1.3234837770462036, + "epoch": 1.844536580588293, + "grad_norm": 1.7265625, + "learning_rate": 9.861483451604803e-08, + "loss": 0.0848, + "mean_token_accuracy": 0.9798565399646759, + "num_tokens": 7972824474.0, + "step": 56500 + }, + { + "epoch": 1.844536580588293, + "eval_entropy": 1.3131659841537475, + "eval_loss": 0.142597496509552, + "eval_mean_token_accuracy": 0.9676955684026083, + "eval_num_tokens": 7972824474.0, + "eval_runtime": 754.1624, + "eval_samples_per_second": 12.804, + "eval_steps_per_second": 0.101, + "step": 56500 + }, + { + "entropy": 1.3255647730827331, + "epoch": 1.8461689138454507, + "grad_norm": 1.4609375, + "learning_rate": 9.656638820844832e-08, + "loss": 0.0797, + "mean_token_accuracy": 0.9803315377235413, + "num_tokens": 7979650165.0, + "step": 56550 + }, + { + "entropy": 1.3177052664756774, + "epoch": 1.8478012471026086, + "grad_norm": 1.203125, + "learning_rate": 9.453909311343168e-08, + "loss": 0.082, + "mean_token_accuracy": 0.9800474560260772, + "num_tokens": 7986279045.0, + "step": 56600 + }, + { + "entropy": 1.3207802820205687, + "epoch": 1.8494335803597663, + "grad_norm": 1.4765625, + "learning_rate": 9.253296399963306e-08, + "loss": 0.0844, + "mean_token_accuracy": 0.9793656885623931, + "num_tokens": 7993389141.0, + "step": 56650 + }, + { + "entropy": 1.313080358505249, + "epoch": 1.851065913616924, + "grad_norm": 1.015625, + "learning_rate": 9.054801548149383e-08, + "loss": 0.0835, + "mean_token_accuracy": 0.9803290486335754, + "num_tokens": 8000349967.0, + "step": 56700 + }, + { + "entropy": 1.316315357685089, + "epoch": 1.8526982468740818, + "grad_norm": 1.8125, + "learning_rate": 8.85842620191587e-08, + "loss": 0.0823, + "mean_token_accuracy": 0.9800036442279816, + "num_tokens": 8007394269.0, + "step": 56750 + }, + { + "entropy": 1.323743233680725, + "epoch": 1.8543305801312395, + "grad_norm": 1.5390625, + "learning_rate": 8.664171791836828e-08, + "loss": 0.0783, + "mean_token_accuracy": 0.9807388544082641, + "num_tokens": 8014348208.0, + "step": 56800 + }, + { + "entropy": 1.3166941332817077, + "epoch": 1.8559629133883973, + "grad_norm": 1.9296875, + "learning_rate": 8.472039733035375e-08, + "loss": 0.0887, + "mean_token_accuracy": 0.9791383862495422, + "num_tokens": 8021975439.0, + "step": 56850 + }, + { + "entropy": 1.3115915489196777, + "epoch": 1.8575952466455552, + "grad_norm": 1.265625, + "learning_rate": 8.282031425173697e-08, + "loss": 0.0742, + "mean_token_accuracy": 0.9814891684055328, + "num_tokens": 8028607289.0, + "step": 56900 + }, + { + "entropy": 1.3096832203865052, + "epoch": 1.859227579902713, + "grad_norm": 1.609375, + "learning_rate": 8.094148252442557e-08, + "loss": 0.0806, + "mean_token_accuracy": 0.9808643221855163, + "num_tokens": 8035350719.0, + "step": 56950 + }, + { + "entropy": 1.312803740501404, + "epoch": 1.8608599131598709, + "grad_norm": 0.953125, + "learning_rate": 7.908391583551399e-08, + "loss": 0.08, + "mean_token_accuracy": 0.9804242384433747, + "num_tokens": 8042661144.0, + "step": 57000 + }, + { + "epoch": 1.8608599131598709, + "eval_entropy": 1.3134743928909303, + "eval_loss": 0.14260423183441162, + "eval_mean_token_accuracy": 0.9677057147026062, + "eval_num_tokens": 8042661144.0, + "eval_runtime": 752.8525, + "eval_samples_per_second": 12.826, + "eval_steps_per_second": 0.101, + "step": 57000 + }, + { + "entropy": 1.3151041221618653, + "epoch": 1.8624922464170286, + "grad_norm": 1.6875, + "learning_rate": 7.724762771718264e-08, + "loss": 0.0703, + "mean_token_accuracy": 0.9828312218189239, + "num_tokens": 8049061325.0, + "step": 57050 + }, + { + "entropy": 1.3201626539230347, + "epoch": 1.8641245796741863, + "grad_norm": 1.3203125, + "learning_rate": 7.543263154660018e-08, + "loss": 0.0794, + "mean_token_accuracy": 0.9805750000476837, + "num_tokens": 8055861953.0, + "step": 57100 + }, + { + "entropy": 1.3100008058547974, + "epoch": 1.865756912931344, + "grad_norm": 1.1328125, + "learning_rate": 7.363894054582543e-08, + "loss": 0.0796, + "mean_token_accuracy": 0.9810529005527496, + "num_tokens": 8062705627.0, + "step": 57150 + }, + { + "entropy": 1.3101088619232177, + "epoch": 1.8673892461885018, + "grad_norm": 1.3671875, + "learning_rate": 7.186656778171064e-08, + "loss": 0.0848, + "mean_token_accuracy": 0.9795117557048798, + "num_tokens": 8069712418.0, + "step": 57200 + }, + { + "entropy": 1.3087351298332215, + "epoch": 1.8690215794456595, + "grad_norm": 1.3828125, + "learning_rate": 7.011552616580763e-08, + "loss": 0.0784, + "mean_token_accuracy": 0.9811785018444061, + "num_tokens": 8076520985.0, + "step": 57250 + }, + { + "entropy": 1.314774408340454, + "epoch": 1.8706539127028174, + "grad_norm": 1.265625, + "learning_rate": 6.838582845427322e-08, + "loss": 0.0829, + "mean_token_accuracy": 0.9801759088039398, + "num_tokens": 8083425224.0, + "step": 57300 + }, + { + "entropy": 1.3010089206695556, + "epoch": 1.8722862459599752, + "grad_norm": 1.140625, + "learning_rate": 6.667748724777589e-08, + "loss": 0.0837, + "mean_token_accuracy": 0.9797486662864685, + "num_tokens": 8090860003.0, + "step": 57350 + }, + { + "entropy": 1.3112692332267761, + "epoch": 1.8739185792171331, + "grad_norm": 1.890625, + "learning_rate": 6.499051499140363e-08, + "loss": 0.0872, + "mean_token_accuracy": 0.9786810195446014, + "num_tokens": 8097627069.0, + "step": 57400 + }, + { + "entropy": 1.3242556214332581, + "epoch": 1.8755509124742908, + "grad_norm": 1.234375, + "learning_rate": 6.332492397457457e-08, + "loss": 0.1031, + "mean_token_accuracy": 0.9754553985595703, + "num_tokens": 8104939412.0, + "step": 57450 + }, + { + "entropy": 1.3048901081085205, + "epoch": 1.8771832457314486, + "grad_norm": 1.1875, + "learning_rate": 6.168072633094578e-08, + "loss": 0.0942, + "mean_token_accuracy": 0.9768707299232483, + "num_tokens": 8111717916.0, + "step": 57500 + }, + { + "epoch": 1.8771832457314486, + "eval_entropy": 1.3135956350962321, + "eval_loss": 0.14263209700584412, + "eval_mean_token_accuracy": 0.9676501870155334, + "eval_num_tokens": 8111717916.0, + "eval_runtime": 751.2808, + "eval_samples_per_second": 12.853, + "eval_steps_per_second": 0.101, + "step": 57500 + }, + { + "entropy": 1.323448977470398, + "epoch": 1.8788155789886063, + "grad_norm": 1.2421875, + "learning_rate": 6.00579340383277e-08, + "loss": 0.1047, + "mean_token_accuracy": 0.9757449758052826, + "num_tokens": 8118451505.0, + "step": 57550 + }, + { + "entropy": 1.3023139286041259, + "epoch": 1.880447912245764, + "grad_norm": 1.046875, + "learning_rate": 5.845655891859247e-08, + "loss": 0.097, + "mean_token_accuracy": 0.9766959726810456, + "num_tokens": 8125132870.0, + "step": 57600 + }, + { + "entropy": 1.3242397212982178, + "epoch": 1.8820802455029217, + "grad_norm": 2.328125, + "learning_rate": 5.68766126375927e-08, + "loss": 0.1246, + "mean_token_accuracy": 0.9708089303970336, + "num_tokens": 8131883790.0, + "step": 57650 + }, + { + "entropy": 1.3141312503814697, + "epoch": 1.8837125787600797, + "grad_norm": 1.53125, + "learning_rate": 5.5318106705072535e-08, + "loss": 0.1395, + "mean_token_accuracy": 0.9681181204319, + "num_tokens": 8138880080.0, + "step": 57700 + }, + { + "entropy": 1.3103543734550476, + "epoch": 1.8853449120172374, + "grad_norm": 1.6953125, + "learning_rate": 5.378105247458609e-08, + "loss": 0.1249, + "mean_token_accuracy": 0.9709859442710876, + "num_tokens": 8145787855.0, + "step": 57750 + }, + { + "entropy": 1.3131978607177734, + "epoch": 1.8869772452743954, + "grad_norm": 1.953125, + "learning_rate": 5.226546114341413e-08, + "loss": 0.1242, + "mean_token_accuracy": 0.971501134634018, + "num_tokens": 8152787087.0, + "step": 57800 + }, + { + "entropy": 1.3190133213996886, + "epoch": 1.888609578531553, + "grad_norm": 1.640625, + "learning_rate": 5.077134375248183e-08, + "loss": 0.1295, + "mean_token_accuracy": 0.9703145730495453, + "num_tokens": 8159391177.0, + "step": 57850 + }, + { + "entropy": 1.306349711418152, + "epoch": 1.8902419117887108, + "grad_norm": 1.2109375, + "learning_rate": 4.9298711186279824e-08, + "loss": 0.1169, + "mean_token_accuracy": 0.9731926989555358, + "num_tokens": 8166052317.0, + "step": 57900 + }, + { + "entropy": 1.312576003074646, + "epoch": 1.8918742450458685, + "grad_norm": 1.796875, + "learning_rate": 4.784757417278296e-08, + "loss": 0.124, + "mean_token_accuracy": 0.9711933457851409, + "num_tokens": 8173047293.0, + "step": 57950 + }, + { + "entropy": 1.316941294670105, + "epoch": 1.8935065783030263, + "grad_norm": 1.4375, + "learning_rate": 4.641794328337434e-08, + "loss": 0.1289, + "mean_token_accuracy": 0.9705963468551636, + "num_tokens": 8180073089.0, + "step": 58000 + }, + { + "epoch": 1.8935065783030263, + "eval_entropy": 1.3130744123458862, + "eval_loss": 0.142622709274292, + "eval_mean_token_accuracy": 0.9676458064715068, + "eval_num_tokens": 8180073089.0, + "eval_runtime": 754.6584, + "eval_samples_per_second": 12.795, + "eval_steps_per_second": 0.101, + "step": 58000 + }, + { + "entropy": 1.3161793375015258, + "epoch": 1.895138911560184, + "grad_norm": 1.8515625, + "learning_rate": 4.5009828932766395e-08, + "loss": 0.1247, + "mean_token_accuracy": 0.9718296456336976, + "num_tokens": 8187129264.0, + "step": 58050 + }, + { + "entropy": 1.3136934351921081, + "epoch": 1.896771244817342, + "grad_norm": 1.53125, + "learning_rate": 4.362324137892626e-08, + "loss": 0.1221, + "mean_token_accuracy": 0.971994297504425, + "num_tokens": 8194219348.0, + "step": 58100 + }, + { + "entropy": 1.3273291397094726, + "epoch": 1.8984035780744997, + "grad_norm": 2.234375, + "learning_rate": 4.225819072300019e-08, + "loss": 0.1321, + "mean_token_accuracy": 0.9701541066169739, + "num_tokens": 8201245704.0, + "step": 58150 + }, + { + "entropy": 1.3125488138198853, + "epoch": 1.9000359113316576, + "grad_norm": 1.3984375, + "learning_rate": 4.091468690924061e-08, + "loss": 0.1336, + "mean_token_accuracy": 0.9690061569213867, + "num_tokens": 8208779023.0, + "step": 58200 + }, + { + "entropy": 1.3206189322471618, + "epoch": 1.9016682445888153, + "grad_norm": 2.171875, + "learning_rate": 3.9592739724933494e-08, + "loss": 0.1326, + "mean_token_accuracy": 0.9700911176204682, + "num_tokens": 8215688508.0, + "step": 58250 + }, + { + "entropy": 1.3118561697006226, + "epoch": 1.903300577845973, + "grad_norm": 2.328125, + "learning_rate": 3.8292358800326774e-08, + "loss": 0.1341, + "mean_token_accuracy": 0.9699221074581146, + "num_tokens": 8222957220.0, + "step": 58300 + }, + { + "entropy": 1.3096631002426147, + "epoch": 1.9049329111031308, + "grad_norm": 1.328125, + "learning_rate": 3.70135536085604e-08, + "loss": 0.1289, + "mean_token_accuracy": 0.970008145570755, + "num_tokens": 8230419197.0, + "step": 58350 + }, + { + "entropy": 1.3086332321166991, + "epoch": 1.9065652443602885, + "grad_norm": 1.9453125, + "learning_rate": 3.57563334655977e-08, + "loss": 0.1196, + "mean_token_accuracy": 0.9730131483078003, + "num_tokens": 8237250702.0, + "step": 58400 + }, + { + "entropy": 1.31609708070755, + "epoch": 1.9081975776174462, + "grad_norm": 1.5546875, + "learning_rate": 3.4520707530157125e-08, + "loss": 0.1276, + "mean_token_accuracy": 0.9696814298629761, + "num_tokens": 8244072641.0, + "step": 58450 + }, + { + "entropy": 1.3143381929397584, + "epoch": 1.9098299108746042, + "grad_norm": 1.875, + "learning_rate": 3.330668480364496e-08, + "loss": 0.1288, + "mean_token_accuracy": 0.9699802708625793, + "num_tokens": 8251417973.0, + "step": 58500 + }, + { + "epoch": 1.9098299108746042, + "eval_entropy": 1.313671735127767, + "eval_loss": 0.14263801276683807, + "eval_mean_token_accuracy": 0.9676080171267192, + "eval_num_tokens": 8251417973.0, + "eval_runtime": 754.9973, + "eval_samples_per_second": 12.789, + "eval_steps_per_second": 0.101, + "step": 58500 + }, + { + "entropy": 1.3198811602592468, + "epoch": 1.911462244131762, + "grad_norm": 1.140625, + "learning_rate": 3.2114274130091383e-08, + "loss": 0.124, + "mean_token_accuracy": 0.9716233015060425, + "num_tokens": 8258494800.0, + "step": 58550 + }, + { + "entropy": 1.3157025051116944, + "epoch": 1.9130945773889199, + "grad_norm": 1.4765625, + "learning_rate": 3.0943484196083836e-08, + "loss": 0.1216, + "mean_token_accuracy": 0.9713974285125733, + "num_tokens": 8265439804.0, + "step": 58600 + }, + { + "entropy": 1.313891351222992, + "epoch": 1.9147269106460776, + "grad_norm": 2.171875, + "learning_rate": 2.979432353070577e-08, + "loss": 0.1213, + "mean_token_accuracy": 0.9722525453567505, + "num_tokens": 8272196336.0, + "step": 58650 + }, + { + "entropy": 1.3181121969223022, + "epoch": 1.9163592439032353, + "grad_norm": 1.5625, + "learning_rate": 2.8666800505473655e-08, + "loss": 0.1292, + "mean_token_accuracy": 0.9704732525348664, + "num_tokens": 8279415276.0, + "step": 58700 + }, + { + "entropy": 1.312175862789154, + "epoch": 1.917991577160393, + "grad_norm": 2.140625, + "learning_rate": 2.75609233342754e-08, + "loss": 0.124, + "mean_token_accuracy": 0.9711616253852844, + "num_tokens": 8286266573.0, + "step": 58750 + }, + { + "entropy": 1.3171151232719422, + "epoch": 1.9196239104175508, + "grad_norm": 2.40625, + "learning_rate": 2.6476700073311376e-08, + "loss": 0.1278, + "mean_token_accuracy": 0.9706109237670898, + "num_tokens": 8293368650.0, + "step": 58800 + }, + { + "entropy": 1.3135269594192505, + "epoch": 1.9212562436747085, + "grad_norm": 1.46875, + "learning_rate": 2.5414138621035477e-08, + "loss": 0.1273, + "mean_token_accuracy": 0.9706173431873322, + "num_tokens": 8300206080.0, + "step": 58850 + }, + { + "entropy": 1.3087952733039856, + "epoch": 1.9228885769318664, + "grad_norm": 1.609375, + "learning_rate": 2.437324671809782e-08, + "loss": 0.1256, + "mean_token_accuracy": 0.9707586574554443, + "num_tokens": 8307284608.0, + "step": 58900 + }, + { + "entropy": 1.310805425643921, + "epoch": 1.9245209101890242, + "grad_norm": 1.5, + "learning_rate": 2.3354031947288136e-08, + "loss": 0.1192, + "mean_token_accuracy": 0.9720681381225585, + "num_tokens": 8314051370.0, + "step": 58950 + }, + { + "entropy": 1.315222783088684, + "epoch": 1.926153243446182, + "grad_norm": 1.921875, + "learning_rate": 2.2356501733479806e-08, + "loss": 0.1181, + "mean_token_accuracy": 0.9725264024734497, + "num_tokens": 8320964631.0, + "step": 59000 + }, + { + "epoch": 1.926153243446182, + "eval_entropy": 1.313348093032837, + "eval_loss": 0.14262661337852478, + "eval_mean_token_accuracy": 0.9675622606277465, + "eval_num_tokens": 8320964631.0, + "eval_runtime": 750.7844, + "eval_samples_per_second": 12.861, + "eval_steps_per_second": 0.101, + "step": 59000 + }, + { + "entropy": 1.3111545324325562, + "epoch": 1.9277855767033398, + "grad_norm": 1.984375, + "learning_rate": 2.1380663343577246e-08, + "loss": 0.1199, + "mean_token_accuracy": 0.9722956836223602, + "num_tokens": 8327540250.0, + "step": 59050 + }, + { + "entropy": 1.3120374631881715, + "epoch": 1.9294179099604976, + "grad_norm": 1.390625, + "learning_rate": 2.04265238864616e-08, + "loss": 0.1307, + "mean_token_accuracy": 0.9701423704624176, + "num_tokens": 8334721588.0, + "step": 59100 + }, + { + "entropy": 1.3150414967536925, + "epoch": 1.9310502432176553, + "grad_norm": 1.2421875, + "learning_rate": 1.949409031294014e-08, + "loss": 0.1274, + "mean_token_accuracy": 0.9704896664619446, + "num_tokens": 8341662054.0, + "step": 59150 + }, + { + "entropy": 1.3131565976142883, + "epoch": 1.932682576474813, + "grad_norm": 2.265625, + "learning_rate": 1.8583369415694608e-08, + "loss": 0.1277, + "mean_token_accuracy": 0.9700733041763305, + "num_tokens": 8349277940.0, + "step": 59200 + }, + { + "entropy": 1.3129038047790527, + "epoch": 1.9343149097319707, + "grad_norm": 2.0625, + "learning_rate": 1.769436782923195e-08, + "loss": 0.137, + "mean_token_accuracy": 0.9688089847564697, + "num_tokens": 8356789493.0, + "step": 59250 + }, + { + "entropy": 1.3096733212471008, + "epoch": 1.9359472429891287, + "grad_norm": 2.578125, + "learning_rate": 1.6827092029836678e-08, + "loss": 0.1235, + "mean_token_accuracy": 0.9711651515960693, + "num_tokens": 8363715543.0, + "step": 59300 + }, + { + "entropy": 1.3143090605735779, + "epoch": 1.9375795762462864, + "grad_norm": 2.1875, + "learning_rate": 1.59815483355229e-08, + "loss": 0.1225, + "mean_token_accuracy": 0.9719608640670776, + "num_tokens": 8370411365.0, + "step": 59350 + }, + { + "entropy": 1.3088870167732238, + "epoch": 1.9392119095034444, + "grad_norm": 2.453125, + "learning_rate": 1.5157742905989037e-08, + "loss": 0.1246, + "mean_token_accuracy": 0.9717032659053803, + "num_tokens": 8377216822.0, + "step": 59400 + }, + { + "entropy": 1.318067877292633, + "epoch": 1.940844242760602, + "grad_norm": 0.578125, + "learning_rate": 1.4355681742571847e-08, + "loss": 0.1247, + "mean_token_accuracy": 0.9717764687538147, + "num_tokens": 8384607784.0, + "step": 59450 + }, + { + "entropy": 1.3116914916038513, + "epoch": 1.9424765760177598, + "grad_norm": 2.28125, + "learning_rate": 1.357537068820347e-08, + "loss": 0.1276, + "mean_token_accuracy": 0.970702486038208, + "num_tokens": 8391872892.0, + "step": 59500 + }, + { + "epoch": 1.9424765760177598, + "eval_entropy": 1.313501017888387, + "eval_loss": 0.1426248997449875, + "eval_mean_token_accuracy": 0.9675657065709432, + "eval_num_tokens": 8391872892.0, + "eval_runtime": 746.0898, + "eval_samples_per_second": 12.942, + "eval_steps_per_second": 0.102, + "step": 59500 + }, + { + "entropy": 1.310592589378357, + "epoch": 1.9441089092749175, + "grad_norm": 1.6953125, + "learning_rate": 1.2816815427369455e-08, + "loss": 0.1318, + "mean_token_accuracy": 0.969595000743866, + "num_tokens": 8399268788.0, + "step": 59550 + }, + { + "entropy": 1.3104493141174316, + "epoch": 1.9457412425320753, + "grad_norm": 1.6640625, + "learning_rate": 1.208002148606613e-08, + "loss": 0.114, + "mean_token_accuracy": 0.9738408529758453, + "num_tokens": 8405775548.0, + "step": 59600 + }, + { + "entropy": 1.3134746599197387, + "epoch": 1.947373575789233, + "grad_norm": 1.2734375, + "learning_rate": 1.1364994231760295e-08, + "loss": 0.129, + "mean_token_accuracy": 0.9705848026275635, + "num_tokens": 8413019964.0, + "step": 59650 + }, + { + "entropy": 1.3184413361549376, + "epoch": 1.949005909046391, + "grad_norm": 1.84375, + "learning_rate": 1.0671738873351932e-08, + "loss": 0.1169, + "mean_token_accuracy": 0.9727689456939698, + "num_tokens": 8420004432.0, + "step": 59700 + }, + { + "entropy": 1.317722589969635, + "epoch": 1.9506382423035487, + "grad_norm": 1.390625, + "learning_rate": 1.0000260461134225e-08, + "loss": 0.1233, + "mean_token_accuracy": 0.9717906093597413, + "num_tokens": 8426576444.0, + "step": 59750 + }, + { + "entropy": 1.315042221546173, + "epoch": 1.9522705755607066, + "grad_norm": 2.625, + "learning_rate": 9.35056388675759e-09, + "loss": 0.1258, + "mean_token_accuracy": 0.9710904705524445, + "num_tokens": 8433326272.0, + "step": 59800 + }, + { + "entropy": 1.3289856863021852, + "epoch": 1.9539029088178643, + "grad_norm": 1.921875, + "learning_rate": 8.722653883194375e-09, + "loss": 0.1389, + "mean_token_accuracy": 0.9682004976272583, + "num_tokens": 8440652691.0, + "step": 59850 + }, + { + "entropy": 1.3113100409507752, + "epoch": 1.955535242075022, + "grad_norm": 1.9921875, + "learning_rate": 8.116535024703554e-09, + "loss": 0.1263, + "mean_token_accuracy": 0.9709288036823273, + "num_tokens": 8447487171.0, + "step": 59900 + }, + { + "entropy": 1.3235667037963867, + "epoch": 1.9571675753321798, + "grad_norm": 2.109375, + "learning_rate": 7.53221172679841e-09, + "loss": 0.1217, + "mean_token_accuracy": 0.972466766834259, + "num_tokens": 8454238666.0, + "step": 59950 + }, + { + "entropy": 1.3157719588279724, + "epoch": 1.9587999085893375, + "grad_norm": 1.65625, + "learning_rate": 6.969688246213246e-09, + "loss": 0.1267, + "mean_token_accuracy": 0.971341325044632, + "num_tokens": 8461254971.0, + "step": 60000 + }, + { + "epoch": 1.9587999085893375, + "eval_entropy": 1.3126351674397787, + "eval_loss": 0.14264898002147675, + "eval_mean_token_accuracy": 0.9676442178090413, + "eval_num_tokens": 8461254971.0, + "eval_runtime": 749.6809, + "eval_samples_per_second": 12.88, + "eval_steps_per_second": 0.101, + "step": 60000 + } + ], + "logging_steps": 50, + "max_steps": 61262, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.0297347722946334e+20, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}