{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9587999085893375, "eval_steps": 500, "global_step": 60000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.3703370141983031, "epoch": 0.0016323332571577813, "grad_norm": 37.0, "learning_rate": 9.595300261096606e-08, "loss": 0.5452, "mean_token_accuracy": 0.8990191352367402, "num_tokens": 7314577.0, "step": 50 }, { "entropy": 1.3720017457008362, "epoch": 0.0032646665143155626, "grad_norm": 41.5, "learning_rate": 1.9386422976501306e-07, "loss": 0.5496, "mean_token_accuracy": 0.8973769438266754, "num_tokens": 14588101.0, "step": 100 }, { "entropy": 1.3811770510673522, "epoch": 0.004896999771473344, "grad_norm": 30.25, "learning_rate": 2.9177545691906004e-07, "loss": 0.496, "mean_token_accuracy": 0.9052803874015808, "num_tokens": 22193383.0, "step": 150 }, { "entropy": 1.3709602856636047, "epoch": 0.006529333028631125, "grad_norm": 10.0, "learning_rate": 3.896866840731071e-07, "loss": 0.412, "mean_token_accuracy": 0.9263910567760467, "num_tokens": 29160848.0, "step": 200 }, { "entropy": 1.3785104417800904, "epoch": 0.008161666285788906, "grad_norm": 7.40625, "learning_rate": 4.875979112271541e-07, "loss": 0.3539, "mean_token_accuracy": 0.9421093094348908, "num_tokens": 35829209.0, "step": 250 }, { "entropy": 1.4016437172889709, "epoch": 0.009793999542946689, "grad_norm": 8.1875, "learning_rate": 5.855091383812011e-07, "loss": 0.3663, "mean_token_accuracy": 0.9401827538013459, "num_tokens": 42980520.0, "step": 300 }, { "entropy": 1.4059402322769166, "epoch": 0.01142633280010447, "grad_norm": 4.84375, "learning_rate": 6.83420365535248e-07, "loss": 0.3226, "mean_token_accuracy": 0.9456453359127045, "num_tokens": 49839839.0, "step": 350 }, { "entropy": 1.4296678924560546, "epoch": 0.01305866605726225, "grad_norm": 3.796875, "learning_rate": 7.813315926892951e-07, "loss": 0.3287, "mean_token_accuracy": 0.9431667315959931, "num_tokens": 57114933.0, "step": 400 }, { "entropy": 1.4395745277404786, "epoch": 0.014690999314420031, "grad_norm": 3.390625, "learning_rate": 8.79242819843342e-07, "loss": 0.3109, "mean_token_accuracy": 0.9427984356880188, "num_tokens": 64178392.0, "step": 450 }, { "entropy": 1.4591435599327087, "epoch": 0.016323332571577812, "grad_norm": 3.890625, "learning_rate": 9.77154046997389e-07, "loss": 0.3093, "mean_token_accuracy": 0.9412086343765259, "num_tokens": 71359680.0, "step": 500 }, { "epoch": 0.016323332571577812, "eval_entropy": 1.4762075742085774, "eval_loss": 0.324710875749588, "eval_mean_token_accuracy": 0.9405147298177083, "eval_num_tokens": 71359680.0, "eval_runtime": 743.1027, "eval_samples_per_second": 12.994, "eval_steps_per_second": 0.102, "step": 500 }, { "entropy": 1.5086383247375488, "epoch": 0.017955665828735593, "grad_norm": 2.890625, "learning_rate": 1.075065274151436e-06, "loss": 0.2984, "mean_token_accuracy": 0.9443651103973388, "num_tokens": 78414153.0, "step": 550 }, { "entropy": 1.5364352035522462, "epoch": 0.019587999085893378, "grad_norm": 5.96875, "learning_rate": 1.172976501305483e-06, "loss": 0.2817, "mean_token_accuracy": 0.9466875243186951, "num_tokens": 85753500.0, "step": 600 }, { "entropy": 1.5489245629310608, "epoch": 0.02122033234305116, "grad_norm": 1.9453125, "learning_rate": 1.27088772845953e-06, "loss": 0.2808, "mean_token_accuracy": 0.9457735085487365, "num_tokens": 92871116.0, "step": 650 }, { "entropy": 1.5694326043128968, "epoch": 0.02285266560020894, "grad_norm": 2.03125, "learning_rate": 1.368798955613577e-06, "loss": 0.2981, "mean_token_accuracy": 0.9427003371715545, "num_tokens": 100022246.0, "step": 700 }, { "entropy": 1.5554496097564696, "epoch": 0.02448499885736672, "grad_norm": 2.71875, "learning_rate": 1.466710182767624e-06, "loss": 0.2684, "mean_token_accuracy": 0.9482632505893708, "num_tokens": 107214871.0, "step": 750 }, { "entropy": 1.5609500217437744, "epoch": 0.0261173321145245, "grad_norm": 4.25, "learning_rate": 1.5646214099216712e-06, "loss": 0.269, "mean_token_accuracy": 0.9476964402198792, "num_tokens": 114188698.0, "step": 800 }, { "entropy": 1.5485923647880555, "epoch": 0.027749665371682282, "grad_norm": 10.9375, "learning_rate": 1.662532637075718e-06, "loss": 0.2584, "mean_token_accuracy": 0.9505482971668243, "num_tokens": 121264967.0, "step": 850 }, { "entropy": 1.546497621536255, "epoch": 0.029381998628840063, "grad_norm": 1.8046875, "learning_rate": 1.760443864229765e-06, "loss": 0.2473, "mean_token_accuracy": 0.9520090806484223, "num_tokens": 127984569.0, "step": 900 }, { "entropy": 1.5344351577758788, "epoch": 0.031014331885997844, "grad_norm": 2.359375, "learning_rate": 1.8583550913838121e-06, "loss": 0.2728, "mean_token_accuracy": 0.9471143686771393, "num_tokens": 135471574.0, "step": 950 }, { "entropy": 1.5414926147460937, "epoch": 0.032646665143155625, "grad_norm": 1.765625, "learning_rate": 1.956266318537859e-06, "loss": 0.2573, "mean_token_accuracy": 0.95045654296875, "num_tokens": 142637684.0, "step": 1000 }, { "epoch": 0.032646665143155625, "eval_entropy": 1.544297873179118, "eval_loss": 0.2730070948600769, "eval_mean_token_accuracy": 0.9475241343180338, "eval_num_tokens": 142637684.0, "eval_runtime": 746.6617, "eval_samples_per_second": 12.932, "eval_steps_per_second": 0.102, "step": 1000 }, { "entropy": 1.5506397199630737, "epoch": 0.034278998400313405, "grad_norm": 1.9140625, "learning_rate": 2.054177545691906e-06, "loss": 0.2524, "mean_token_accuracy": 0.9507447695732116, "num_tokens": 149507260.0, "step": 1050 }, { "entropy": 1.544835422039032, "epoch": 0.035911331657471186, "grad_norm": 1.703125, "learning_rate": 2.152088772845953e-06, "loss": 0.2477, "mean_token_accuracy": 0.9511877942085266, "num_tokens": 156809907.0, "step": 1100 }, { "entropy": 1.545062973499298, "epoch": 0.03754366491462897, "grad_norm": 1.7890625, "learning_rate": 2.25e-06, "loss": 0.2381, "mean_token_accuracy": 0.9530003690719604, "num_tokens": 163878508.0, "step": 1150 }, { "entropy": 1.5258307456970215, "epoch": 0.039175998171786755, "grad_norm": 1.4609375, "learning_rate": 2.347911227154047e-06, "loss": 0.2225, "mean_token_accuracy": 0.9549448072910309, "num_tokens": 170927568.0, "step": 1200 }, { "entropy": 1.5170037484169006, "epoch": 0.040808331428944536, "grad_norm": 1.375, "learning_rate": 2.445822454308094e-06, "loss": 0.2238, "mean_token_accuracy": 0.9546501576900482, "num_tokens": 178389889.0, "step": 1250 }, { "entropy": 1.5287164187431335, "epoch": 0.04244066468610232, "grad_norm": 1.921875, "learning_rate": 2.543733681462141e-06, "loss": 0.2471, "mean_token_accuracy": 0.9508370912075043, "num_tokens": 186069673.0, "step": 1300 }, { "entropy": 1.5002574920654297, "epoch": 0.0440729979432601, "grad_norm": 2.046875, "learning_rate": 2.641644908616188e-06, "loss": 0.2332, "mean_token_accuracy": 0.953369448184967, "num_tokens": 193396348.0, "step": 1350 }, { "entropy": 1.5036597728729248, "epoch": 0.04570533120041788, "grad_norm": 1.5078125, "learning_rate": 2.739556135770235e-06, "loss": 0.2245, "mean_token_accuracy": 0.9541668140888214, "num_tokens": 200593430.0, "step": 1400 }, { "entropy": 1.4917612624168397, "epoch": 0.04733766445757566, "grad_norm": 2.265625, "learning_rate": 2.837467362924282e-06, "loss": 0.2193, "mean_token_accuracy": 0.9556196844577789, "num_tokens": 207639383.0, "step": 1450 }, { "entropy": 1.4869768619537354, "epoch": 0.04896999771473344, "grad_norm": 2.46875, "learning_rate": 2.935378590078329e-06, "loss": 0.2195, "mean_token_accuracy": 0.9560010933876038, "num_tokens": 214438210.0, "step": 1500 }, { "epoch": 0.04896999771473344, "eval_entropy": 1.4914683151245116, "eval_loss": 0.24329085648059845, "eval_mean_token_accuracy": 0.9516400265693664, "eval_num_tokens": 214438210.0, "eval_runtime": 743.0423, "eval_samples_per_second": 12.995, "eval_steps_per_second": 0.102, "step": 1500 }, { "entropy": 1.478922975063324, "epoch": 0.05060233097189122, "grad_norm": 1.5390625, "learning_rate": 3.033289817232376e-06, "loss": 0.2176, "mean_token_accuracy": 0.9555383801460267, "num_tokens": 221777722.0, "step": 1550 }, { "entropy": 1.4821615958213805, "epoch": 0.052234664229049, "grad_norm": 1.6328125, "learning_rate": 3.131201044386423e-06, "loss": 0.2317, "mean_token_accuracy": 0.9527480769157409, "num_tokens": 229386810.0, "step": 1600 }, { "entropy": 1.4608716344833375, "epoch": 0.05386699748620678, "grad_norm": 2.078125, "learning_rate": 3.22911227154047e-06, "loss": 0.2195, "mean_token_accuracy": 0.9549958789348603, "num_tokens": 237068902.0, "step": 1650 }, { "entropy": 1.4769957304000854, "epoch": 0.055499330743364564, "grad_norm": 1.765625, "learning_rate": 3.327023498694517e-06, "loss": 0.2305, "mean_token_accuracy": 0.9542369735240936, "num_tokens": 243773147.0, "step": 1700 }, { "entropy": 1.4610289931297302, "epoch": 0.057131664000522345, "grad_norm": 1.796875, "learning_rate": 3.424934725848564e-06, "loss": 0.2108, "mean_token_accuracy": 0.9570872175693512, "num_tokens": 250400651.0, "step": 1750 }, { "entropy": 1.4512082767486572, "epoch": 0.058763997257680126, "grad_norm": 1.6953125, "learning_rate": 3.522845953002611e-06, "loss": 0.2065, "mean_token_accuracy": 0.9566819512844086, "num_tokens": 257634063.0, "step": 1800 }, { "entropy": 1.4433103609085083, "epoch": 0.060396330514837906, "grad_norm": 1.515625, "learning_rate": 3.6207571801566577e-06, "loss": 0.2022, "mean_token_accuracy": 0.9584151470661163, "num_tokens": 264346086.0, "step": 1850 }, { "entropy": 1.441446192264557, "epoch": 0.06202866377199569, "grad_norm": 1.7578125, "learning_rate": 3.7186684073107047e-06, "loss": 0.2172, "mean_token_accuracy": 0.9561198997497559, "num_tokens": 271468769.0, "step": 1900 }, { "entropy": 1.4351094341278077, "epoch": 0.06366099702915347, "grad_norm": 1.484375, "learning_rate": 3.816579634464752e-06, "loss": 0.2056, "mean_token_accuracy": 0.9582554578781128, "num_tokens": 278724065.0, "step": 1950 }, { "entropy": 1.4327974796295166, "epoch": 0.06529333028631125, "grad_norm": 1.4453125, "learning_rate": 3.914490861618799e-06, "loss": 0.2062, "mean_token_accuracy": 0.956677463054657, "num_tokens": 285941177.0, "step": 2000 }, { "epoch": 0.06529333028631125, "eval_entropy": 1.4438136545817057, "eval_loss": 0.2242203801870346, "eval_mean_token_accuracy": 0.9541537570953369, "eval_num_tokens": 285941177.0, "eval_runtime": 744.3872, "eval_samples_per_second": 12.972, "eval_steps_per_second": 0.102, "step": 2000 }, { "entropy": 1.444144995212555, "epoch": 0.06692566354346903, "grad_norm": 2.4375, "learning_rate": 4.012402088772846e-06, "loss": 0.1935, "mean_token_accuracy": 0.9598511147499085, "num_tokens": 292840917.0, "step": 2050 }, { "entropy": 1.4242860412597655, "epoch": 0.06855799680062681, "grad_norm": 2.0625, "learning_rate": 4.1103133159268925e-06, "loss": 0.1947, "mean_token_accuracy": 0.9596167039871216, "num_tokens": 300067261.0, "step": 2100 }, { "entropy": 1.4138830995559692, "epoch": 0.07019033005778459, "grad_norm": 1.7578125, "learning_rate": 4.2082245430809395e-06, "loss": 0.1999, "mean_token_accuracy": 0.9582617676258087, "num_tokens": 307304735.0, "step": 2150 }, { "entropy": 1.4222793292999267, "epoch": 0.07182266331494237, "grad_norm": 1.5703125, "learning_rate": 4.3061357702349865e-06, "loss": 0.1896, "mean_token_accuracy": 0.9599699079990387, "num_tokens": 314065537.0, "step": 2200 }, { "entropy": 1.4278799700737, "epoch": 0.07345499657210015, "grad_norm": 1.4453125, "learning_rate": 4.4040469973890336e-06, "loss": 0.1845, "mean_token_accuracy": 0.960889185667038, "num_tokens": 320938314.0, "step": 2250 }, { "entropy": 1.40878901720047, "epoch": 0.07508732982925793, "grad_norm": 2.03125, "learning_rate": 4.501958224543081e-06, "loss": 0.1917, "mean_token_accuracy": 0.9593756330013276, "num_tokens": 327763547.0, "step": 2300 }, { "entropy": 1.4114052319526673, "epoch": 0.07671966308641573, "grad_norm": 2.359375, "learning_rate": 4.599869451697128e-06, "loss": 0.1938, "mean_token_accuracy": 0.9592675876617431, "num_tokens": 335032376.0, "step": 2350 }, { "entropy": 1.4144614338874817, "epoch": 0.07835199634357351, "grad_norm": 1.4765625, "learning_rate": 4.697780678851175e-06, "loss": 0.1962, "mean_token_accuracy": 0.958512544631958, "num_tokens": 342636323.0, "step": 2400 }, { "entropy": 1.4119537329673768, "epoch": 0.07998432960073129, "grad_norm": 2.1875, "learning_rate": 4.795691906005222e-06, "loss": 0.1882, "mean_token_accuracy": 0.9601361775398254, "num_tokens": 349613670.0, "step": 2450 }, { "entropy": 1.4156457328796386, "epoch": 0.08161666285788907, "grad_norm": 1.953125, "learning_rate": 4.893603133159269e-06, "loss": 0.19, "mean_token_accuracy": 0.9593257677555084, "num_tokens": 356562537.0, "step": 2500 }, { "epoch": 0.08161666285788907, "eval_entropy": 1.407395658493042, "eval_loss": 0.20515179634094238, "eval_mean_token_accuracy": 0.9569398101170857, "eval_num_tokens": 356562537.0, "eval_runtime": 748.9688, "eval_samples_per_second": 12.892, "eval_steps_per_second": 0.101, "step": 2500 }, { "entropy": 1.4004506325721742, "epoch": 0.08324899611504685, "grad_norm": 1.7578125, "learning_rate": 4.991514360313316e-06, "loss": 0.1797, "mean_token_accuracy": 0.9611039471626281, "num_tokens": 363822144.0, "step": 2550 }, { "entropy": 1.4148499584197998, "epoch": 0.08488132937220463, "grad_norm": 1.8984375, "learning_rate": 5.089425587467363e-06, "loss": 0.1868, "mean_token_accuracy": 0.9603874254226684, "num_tokens": 370830813.0, "step": 2600 }, { "entropy": 1.4092473483085632, "epoch": 0.08651366262936241, "grad_norm": 1.1640625, "learning_rate": 5.18733681462141e-06, "loss": 0.1729, "mean_token_accuracy": 0.9630460107326507, "num_tokens": 378019973.0, "step": 2650 }, { "entropy": 1.424285671710968, "epoch": 0.0881459958865202, "grad_norm": 2.4375, "learning_rate": 5.285248041775457e-06, "loss": 0.1829, "mean_token_accuracy": 0.9613824605941772, "num_tokens": 384991554.0, "step": 2700 }, { "entropy": 1.410207018852234, "epoch": 0.08977832914367798, "grad_norm": 1.96875, "learning_rate": 5.383159268929505e-06, "loss": 0.1632, "mean_token_accuracy": 0.9650751757621765, "num_tokens": 392065700.0, "step": 2750 }, { "entropy": 1.4243709874153136, "epoch": 0.09141066240083576, "grad_norm": 1.515625, "learning_rate": 5.481070496083552e-06, "loss": 0.1728, "mean_token_accuracy": 0.9626587212085724, "num_tokens": 399270239.0, "step": 2800 }, { "entropy": 1.413210895061493, "epoch": 0.09304299565799354, "grad_norm": 1.8203125, "learning_rate": 5.578981723237598e-06, "loss": 0.1791, "mean_token_accuracy": 0.9606540656089783, "num_tokens": 406647892.0, "step": 2850 }, { "entropy": 1.4289642930030824, "epoch": 0.09467532891515132, "grad_norm": 2.0, "learning_rate": 5.676892950391645e-06, "loss": 0.1853, "mean_token_accuracy": 0.9598627412319183, "num_tokens": 414138654.0, "step": 2900 }, { "entropy": 1.435717191696167, "epoch": 0.0963076621723091, "grad_norm": 1.078125, "learning_rate": 5.774804177545692e-06, "loss": 0.1768, "mean_token_accuracy": 0.9624214172363281, "num_tokens": 421217231.0, "step": 2950 }, { "entropy": 1.428927412033081, "epoch": 0.09793999542946688, "grad_norm": 1.7734375, "learning_rate": 5.872715404699739e-06, "loss": 0.1829, "mean_token_accuracy": 0.9605919003486634, "num_tokens": 428485940.0, "step": 3000 }, { "epoch": 0.09793999542946688, "eval_entropy": 1.4376725546518963, "eval_loss": 0.19405308365821838, "eval_mean_token_accuracy": 0.958539453347524, "eval_num_tokens": 428485940.0, "eval_runtime": 754.0644, "eval_samples_per_second": 12.805, "eval_steps_per_second": 0.101, "step": 3000 }, { "entropy": 1.438420045375824, "epoch": 0.09957232868662466, "grad_norm": 1.6484375, "learning_rate": 5.970626631853786e-06, "loss": 0.175, "mean_token_accuracy": 0.9624645209312439, "num_tokens": 435253940.0, "step": 3050 }, { "entropy": 1.43366064786911, "epoch": 0.10120466194378244, "grad_norm": 1.484375, "learning_rate": 5.9999946455996105e-06, "loss": 0.1691, "mean_token_accuracy": 0.9642149293422699, "num_tokens": 441868038.0, "step": 3100 }, { "entropy": 1.4457367968559265, "epoch": 0.10283699520094022, "grad_norm": 1.3046875, "learning_rate": 5.999968420011062e-06, "loss": 0.1845, "mean_token_accuracy": 0.9603316211700439, "num_tokens": 449366573.0, "step": 3150 }, { "entropy": 1.4273823165893555, "epoch": 0.104469328458098, "grad_norm": 1.6015625, "learning_rate": 5.999920339963868e-06, "loss": 0.1727, "mean_token_accuracy": 0.9620600152015686, "num_tokens": 456397259.0, "step": 3200 }, { "entropy": 1.4280832529067993, "epoch": 0.10610166171525579, "grad_norm": 1.4609375, "learning_rate": 5.999850405808289e-06, "loss": 0.1706, "mean_token_accuracy": 0.9628064668178559, "num_tokens": 463197302.0, "step": 3250 }, { "entropy": 1.4216149377822875, "epoch": 0.10773399497241357, "grad_norm": 1.734375, "learning_rate": 5.999758618053787e-06, "loss": 0.1685, "mean_token_accuracy": 0.9630639374256134, "num_tokens": 469919649.0, "step": 3300 }, { "entropy": 1.4262100625038148, "epoch": 0.10936632822957135, "grad_norm": 1.21875, "learning_rate": 5.999644977369027e-06, "loss": 0.1735, "mean_token_accuracy": 0.9620367133617401, "num_tokens": 476875581.0, "step": 3350 }, { "entropy": 1.4126947259902953, "epoch": 0.11099866148672913, "grad_norm": 1.5625, "learning_rate": 5.9995094845818684e-06, "loss": 0.1697, "mean_token_accuracy": 0.9630028975009918, "num_tokens": 483984060.0, "step": 3400 }, { "entropy": 1.436975917816162, "epoch": 0.11263099474388691, "grad_norm": 1.703125, "learning_rate": 5.999352140679363e-06, "loss": 0.1789, "mean_token_accuracy": 0.9613595926761627, "num_tokens": 491263590.0, "step": 3450 }, { "entropy": 1.4022981858253478, "epoch": 0.11426332800104469, "grad_norm": 1.0, "learning_rate": 5.999172946807744e-06, "loss": 0.1677, "mean_token_accuracy": 0.9631788098812103, "num_tokens": 498685855.0, "step": 3500 }, { "epoch": 0.11426332800104469, "eval_entropy": 1.4040173705418904, "eval_loss": 0.18694917857646942, "eval_mean_token_accuracy": 0.9596376585960388, "eval_num_tokens": 498685855.0, "eval_runtime": 746.2297, "eval_samples_per_second": 12.94, "eval_steps_per_second": 0.102, "step": 3500 }, { "entropy": 1.4095824003219604, "epoch": 0.11589566125820247, "grad_norm": 3.796875, "learning_rate": 5.998971904272421e-06, "loss": 0.174, "mean_token_accuracy": 0.9623571968078614, "num_tokens": 506263057.0, "step": 3550 }, { "entropy": 1.406613359451294, "epoch": 0.11752799451536025, "grad_norm": 1.375, "learning_rate": 5.998749014537968e-06, "loss": 0.1674, "mean_token_accuracy": 0.9625373089313507, "num_tokens": 513004369.0, "step": 3600 }, { "entropy": 1.405109441280365, "epoch": 0.11916032777251803, "grad_norm": 1.4140625, "learning_rate": 5.998504279228114e-06, "loss": 0.1655, "mean_token_accuracy": 0.9644717895984649, "num_tokens": 519807945.0, "step": 3650 }, { "entropy": 1.4100734496116638, "epoch": 0.12079266102967581, "grad_norm": 1.4375, "learning_rate": 5.99823770012573e-06, "loss": 0.1678, "mean_token_accuracy": 0.9630844235420227, "num_tokens": 526976842.0, "step": 3700 }, { "entropy": 1.407007110118866, "epoch": 0.1224249942868336, "grad_norm": 1.25, "learning_rate": 5.997949279172815e-06, "loss": 0.1655, "mean_token_accuracy": 0.9636348211765289, "num_tokens": 533987687.0, "step": 3750 }, { "entropy": 1.4068945956230163, "epoch": 0.12405732754399137, "grad_norm": 1.3125, "learning_rate": 5.9976390184704885e-06, "loss": 0.1701, "mean_token_accuracy": 0.962993438243866, "num_tokens": 540885515.0, "step": 3800 }, { "entropy": 1.4040190553665162, "epoch": 0.12568966080114916, "grad_norm": 1.671875, "learning_rate": 5.997306920278967e-06, "loss": 0.1736, "mean_token_accuracy": 0.9614431858062744, "num_tokens": 548248278.0, "step": 3850 }, { "entropy": 1.3793179154396058, "epoch": 0.12732199405830694, "grad_norm": 1.5, "learning_rate": 5.99695298701755e-06, "loss": 0.1484, "mean_token_accuracy": 0.9667005109786987, "num_tokens": 555190537.0, "step": 3900 }, { "entropy": 1.3872941756248474, "epoch": 0.12895432731546472, "grad_norm": 1.515625, "learning_rate": 5.996577221264605e-06, "loss": 0.1648, "mean_token_accuracy": 0.9634542167186737, "num_tokens": 562395377.0, "step": 3950 }, { "entropy": 1.3957655119895935, "epoch": 0.1305866605726225, "grad_norm": 1.6875, "learning_rate": 5.9961796257575485e-06, "loss": 0.1684, "mean_token_accuracy": 0.9631939339637756, "num_tokens": 569190032.0, "step": 4000 }, { "epoch": 0.1305866605726225, "eval_entropy": 1.4019947210947672, "eval_loss": 0.18201017379760742, "eval_mean_token_accuracy": 0.960600491364797, "eval_num_tokens": 569190032.0, "eval_runtime": 751.8458, "eval_samples_per_second": 12.843, "eval_steps_per_second": 0.101, "step": 4000 }, { "entropy": 1.3909747576713563, "epoch": 0.13221899382978028, "grad_norm": 6.75, "learning_rate": 5.99576020339282e-06, "loss": 0.1614, "mean_token_accuracy": 0.9640302300453186, "num_tokens": 576222815.0, "step": 4050 }, { "entropy": 1.3952715015411377, "epoch": 0.13385132708693806, "grad_norm": 1.8515625, "learning_rate": 5.995318957225869e-06, "loss": 0.1572, "mean_token_accuracy": 0.9658111941814422, "num_tokens": 582993007.0, "step": 4100 }, { "entropy": 1.4083420133590698, "epoch": 0.13548366034409584, "grad_norm": 1.7265625, "learning_rate": 5.994855890471128e-06, "loss": 0.1634, "mean_token_accuracy": 0.964879697561264, "num_tokens": 589829579.0, "step": 4150 }, { "entropy": 1.4027349591255187, "epoch": 0.13711599360125362, "grad_norm": 1.65625, "learning_rate": 5.9943710065019905e-06, "loss": 0.1586, "mean_token_accuracy": 0.9652115881443024, "num_tokens": 596795182.0, "step": 4200 }, { "entropy": 1.4036769008636474, "epoch": 0.1387483268584114, "grad_norm": 1.5625, "learning_rate": 5.993864308850785e-06, "loss": 0.1644, "mean_token_accuracy": 0.9636000382900238, "num_tokens": 603861008.0, "step": 4250 }, { "entropy": 1.4230398559570312, "epoch": 0.14038066011556918, "grad_norm": 1.7578125, "learning_rate": 5.9933358012087526e-06, "loss": 0.1651, "mean_token_accuracy": 0.9630668413639069, "num_tokens": 611150442.0, "step": 4300 }, { "entropy": 1.4209274005889894, "epoch": 0.14201299337272696, "grad_norm": 1.296875, "learning_rate": 5.992785487426016e-06, "loss": 0.1613, "mean_token_accuracy": 0.9643425738811493, "num_tokens": 617582752.0, "step": 4350 }, { "entropy": 1.409316577911377, "epoch": 0.14364532662988475, "grad_norm": 1.09375, "learning_rate": 5.992213371511554e-06, "loss": 0.1606, "mean_token_accuracy": 0.9648306250572205, "num_tokens": 624322193.0, "step": 4400 }, { "entropy": 1.406817865371704, "epoch": 0.14527765988704253, "grad_norm": 1.765625, "learning_rate": 5.991619457633171e-06, "loss": 0.1659, "mean_token_accuracy": 0.9639068651199341, "num_tokens": 631733777.0, "step": 4450 }, { "entropy": 1.4001825642585755, "epoch": 0.1469099931442003, "grad_norm": 1.4140625, "learning_rate": 5.991003750117468e-06, "loss": 0.1601, "mean_token_accuracy": 0.9647636806964874, "num_tokens": 639594509.0, "step": 4500 }, { "epoch": 0.1469099931442003, "eval_entropy": 1.4037580092748005, "eval_loss": 0.17858904600143433, "eval_mean_token_accuracy": 0.961095765431722, "eval_num_tokens": 639594509.0, "eval_runtime": 747.1098, "eval_samples_per_second": 12.924, "eval_steps_per_second": 0.102, "step": 4500 }, { "entropy": 1.3959466004371643, "epoch": 0.1485423264013581, "grad_norm": 1.515625, "learning_rate": 5.990366253449812e-06, "loss": 0.1605, "mean_token_accuracy": 0.9646508944034576, "num_tokens": 647057874.0, "step": 4550 }, { "entropy": 1.3969790387153624, "epoch": 0.15017465965851587, "grad_norm": 1.015625, "learning_rate": 5.989706972274299e-06, "loss": 0.1617, "mean_token_accuracy": 0.9644351935386658, "num_tokens": 654287916.0, "step": 4600 }, { "entropy": 1.3977243065834046, "epoch": 0.15180699291567368, "grad_norm": 1.8203125, "learning_rate": 5.989025911393723e-06, "loss": 0.1733, "mean_token_accuracy": 0.9629066979885101, "num_tokens": 661344302.0, "step": 4650 }, { "entropy": 1.4031280422210692, "epoch": 0.15343932617283146, "grad_norm": 1.78125, "learning_rate": 5.988323075769544e-06, "loss": 0.1478, "mean_token_accuracy": 0.967905158996582, "num_tokens": 667929168.0, "step": 4700 }, { "entropy": 1.4076430988311768, "epoch": 0.15507165942998924, "grad_norm": 0.9375, "learning_rate": 5.987598470521845e-06, "loss": 0.1585, "mean_token_accuracy": 0.965575454235077, "num_tokens": 675162945.0, "step": 4750 }, { "entropy": 1.4047666358947755, "epoch": 0.15670399268714702, "grad_norm": 1.1171875, "learning_rate": 5.986852100929301e-06, "loss": 0.1633, "mean_token_accuracy": 0.9638245010375976, "num_tokens": 682171614.0, "step": 4800 }, { "entropy": 1.4109639859199523, "epoch": 0.1583363259443048, "grad_norm": 1.5859375, "learning_rate": 5.986083972429135e-06, "loss": 0.1641, "mean_token_accuracy": 0.964219799041748, "num_tokens": 689397569.0, "step": 4850 }, { "entropy": 1.4028720164299011, "epoch": 0.15996865920146258, "grad_norm": 1.4375, "learning_rate": 5.985294090617086e-06, "loss": 0.1608, "mean_token_accuracy": 0.9645184981822967, "num_tokens": 696209610.0, "step": 4900 }, { "entropy": 1.3975288462638855, "epoch": 0.16160099245862036, "grad_norm": 1.578125, "learning_rate": 5.98448246124736e-06, "loss": 0.1551, "mean_token_accuracy": 0.9647789108753204, "num_tokens": 703107550.0, "step": 4950 }, { "entropy": 1.3928989100456237, "epoch": 0.16323332571577814, "grad_norm": 1.6015625, "learning_rate": 5.983649090232592e-06, "loss": 0.1592, "mean_token_accuracy": 0.9648973512649536, "num_tokens": 710156450.0, "step": 5000 }, { "epoch": 0.16323332571577814, "eval_entropy": 1.4044678370157877, "eval_loss": 0.1755974143743515, "eval_mean_token_accuracy": 0.961607707341512, "eval_num_tokens": 710156450.0, "eval_runtime": 749.1287, "eval_samples_per_second": 12.89, "eval_steps_per_second": 0.101, "step": 5000 }, { "entropy": 1.4239799737930299, "epoch": 0.16486565897293592, "grad_norm": 1.234375, "learning_rate": 5.982793983643805e-06, "loss": 0.1637, "mean_token_accuracy": 0.9640812170505524, "num_tokens": 717234054.0, "step": 5050 }, { "entropy": 1.4224350619316102, "epoch": 0.1664979922300937, "grad_norm": 2.109375, "learning_rate": 5.98191714771036e-06, "loss": 0.1624, "mean_token_accuracy": 0.9640639245510101, "num_tokens": 724297107.0, "step": 5100 }, { "entropy": 1.4389384937286378, "epoch": 0.1681303254872515, "grad_norm": 1.1953125, "learning_rate": 5.981018588819916e-06, "loss": 0.1681, "mean_token_accuracy": 0.9633942902088165, "num_tokens": 731287758.0, "step": 5150 }, { "entropy": 1.427138111591339, "epoch": 0.16976265874440927, "grad_norm": 1.6015625, "learning_rate": 5.980098313518383e-06, "loss": 0.1669, "mean_token_accuracy": 0.9635147547721863, "num_tokens": 738353484.0, "step": 5200 }, { "entropy": 1.43255943775177, "epoch": 0.17139499200156705, "grad_norm": 1.6015625, "learning_rate": 5.97915632850987e-06, "loss": 0.1602, "mean_token_accuracy": 0.9645324110984802, "num_tokens": 745152314.0, "step": 5250 }, { "entropy": 1.4146737170219421, "epoch": 0.17302732525872483, "grad_norm": 1.375, "learning_rate": 5.97819264065664e-06, "loss": 0.1566, "mean_token_accuracy": 0.9652618932723999, "num_tokens": 752080428.0, "step": 5300 }, { "entropy": 1.41352454662323, "epoch": 0.1746596585158826, "grad_norm": 1.3515625, "learning_rate": 5.977207256979058e-06, "loss": 0.1472, "mean_token_accuracy": 0.9666665005683899, "num_tokens": 758654299.0, "step": 5350 }, { "entropy": 1.4163859128952025, "epoch": 0.1762919917730404, "grad_norm": 1.2578125, "learning_rate": 5.976200184655544e-06, "loss": 0.1646, "mean_token_accuracy": 0.9645817792415619, "num_tokens": 765762864.0, "step": 5400 }, { "entropy": 1.4168593525886535, "epoch": 0.17792432503019817, "grad_norm": 1.625, "learning_rate": 5.9751714310225135e-06, "loss": 0.1558, "mean_token_accuracy": 0.9653662145137787, "num_tokens": 772523965.0, "step": 5450 }, { "entropy": 1.4106249260902404, "epoch": 0.17955665828735595, "grad_norm": 1.46875, "learning_rate": 5.974121003574331e-06, "loss": 0.1605, "mean_token_accuracy": 0.9646015942096711, "num_tokens": 779873569.0, "step": 5500 }, { "epoch": 0.17955665828735595, "eval_entropy": 1.4159186140696207, "eval_loss": 0.17334794998168945, "eval_mean_token_accuracy": 0.961771670182546, "eval_num_tokens": 779873569.0, "eval_runtime": 752.9376, "eval_samples_per_second": 12.824, "eval_steps_per_second": 0.101, "step": 5500 }, { "entropy": 1.4060875940322877, "epoch": 0.18118899154451373, "grad_norm": 1.8828125, "learning_rate": 5.973048909963251e-06, "loss": 0.166, "mean_token_accuracy": 0.9633875727653504, "num_tokens": 787666740.0, "step": 5550 }, { "entropy": 1.4012414264678954, "epoch": 0.18282132480167151, "grad_norm": 1.515625, "learning_rate": 5.971955157999365e-06, "loss": 0.1542, "mean_token_accuracy": 0.9654585599899292, "num_tokens": 794513657.0, "step": 5600 }, { "entropy": 1.3988840198516845, "epoch": 0.1844536580588293, "grad_norm": 1.6953125, "learning_rate": 5.970839755650541e-06, "loss": 0.1595, "mean_token_accuracy": 0.9642168319225312, "num_tokens": 801328321.0, "step": 5650 }, { "entropy": 1.4012188339233398, "epoch": 0.18608599131598708, "grad_norm": 1.109375, "learning_rate": 5.969702711042371e-06, "loss": 0.167, "mean_token_accuracy": 0.9627443432807923, "num_tokens": 808895137.0, "step": 5700 }, { "entropy": 1.4030301642417908, "epoch": 0.18771832457314486, "grad_norm": 1.40625, "learning_rate": 5.968544032458105e-06, "loss": 0.1518, "mean_token_accuracy": 0.9666016948223114, "num_tokens": 815564055.0, "step": 5750 }, { "entropy": 1.3949448704719543, "epoch": 0.18935065783030264, "grad_norm": 1.25, "learning_rate": 5.967363728338598e-06, "loss": 0.1542, "mean_token_accuracy": 0.9651316654682159, "num_tokens": 822772629.0, "step": 5800 }, { "entropy": 1.4053015112876892, "epoch": 0.19098299108746042, "grad_norm": 1.8125, "learning_rate": 5.966161807282244e-06, "loss": 0.1475, "mean_token_accuracy": 0.9664002013206482, "num_tokens": 829474695.0, "step": 5850 }, { "entropy": 1.3961756944656372, "epoch": 0.1926153243446182, "grad_norm": 1.2734375, "learning_rate": 5.96493827804491e-06, "loss": 0.1636, "mean_token_accuracy": 0.9642155694961548, "num_tokens": 836807189.0, "step": 5900 }, { "entropy": 1.39143967628479, "epoch": 0.19424765760177598, "grad_norm": 1.671875, "learning_rate": 5.963693149539883e-06, "loss": 0.1592, "mean_token_accuracy": 0.9642041945457458, "num_tokens": 843989373.0, "step": 5950 }, { "entropy": 1.395959141254425, "epoch": 0.19587999085893376, "grad_norm": 1.1484375, "learning_rate": 5.962426430837792e-06, "loss": 0.1613, "mean_token_accuracy": 0.964136803150177, "num_tokens": 851299727.0, "step": 6000 }, { "epoch": 0.19587999085893376, "eval_entropy": 1.4013945213953654, "eval_loss": 0.17208388447761536, "eval_mean_token_accuracy": 0.9621260579427083, "eval_num_tokens": 851299727.0, "eval_runtime": 753.3633, "eval_samples_per_second": 12.817, "eval_steps_per_second": 0.101, "step": 6000 }, { "entropy": 1.4012908124923706, "epoch": 0.19751232411609154, "grad_norm": 0.0113525390625, "learning_rate": 5.961138131166554e-06, "loss": 0.1554, "mean_token_accuracy": 0.9652227807044983, "num_tokens": 858064092.0, "step": 6050 }, { "entropy": 1.388651340007782, "epoch": 0.19914465737324932, "grad_norm": 1.9375, "learning_rate": 5.959828259911295e-06, "loss": 0.1569, "mean_token_accuracy": 0.9651960909366608, "num_tokens": 865252118.0, "step": 6100 }, { "entropy": 1.4042778515815735, "epoch": 0.2007769906304071, "grad_norm": 1.375, "learning_rate": 5.958496826614294e-06, "loss": 0.1661, "mean_token_accuracy": 0.9626063787937165, "num_tokens": 872468561.0, "step": 6150 }, { "entropy": 1.401300666332245, "epoch": 0.20240932388756488, "grad_norm": 1.8515625, "learning_rate": 5.957143840974904e-06, "loss": 0.149, "mean_token_accuracy": 0.9666438174247741, "num_tokens": 879011998.0, "step": 6200 }, { "entropy": 1.4077980709075928, "epoch": 0.20404165714472267, "grad_norm": 1.6328125, "learning_rate": 5.955769312849484e-06, "loss": 0.1605, "mean_token_accuracy": 0.9650888216495513, "num_tokens": 886346540.0, "step": 6250 }, { "entropy": 1.394054229259491, "epoch": 0.20567399040188045, "grad_norm": 2.140625, "learning_rate": 5.954373252251329e-06, "loss": 0.1537, "mean_token_accuracy": 0.9657756268978119, "num_tokens": 893393322.0, "step": 6300 }, { "entropy": 1.3978914856910705, "epoch": 0.20730632365903823, "grad_norm": 1.5, "learning_rate": 5.952955669350596e-06, "loss": 0.1515, "mean_token_accuracy": 0.9658310306072235, "num_tokens": 899920970.0, "step": 6350 }, { "entropy": 1.4074292516708373, "epoch": 0.208938656916196, "grad_norm": 1.875, "learning_rate": 5.95151657447423e-06, "loss": 0.1568, "mean_token_accuracy": 0.9654946303367615, "num_tokens": 906895264.0, "step": 6400 }, { "entropy": 1.4000064754486083, "epoch": 0.2105709901733538, "grad_norm": 1.390625, "learning_rate": 5.950055978105885e-06, "loss": 0.1495, "mean_token_accuracy": 0.9668013238906861, "num_tokens": 913671071.0, "step": 6450 }, { "entropy": 1.4307255530357361, "epoch": 0.21220332343051157, "grad_norm": 1.015625, "learning_rate": 5.948573890885859e-06, "loss": 0.1663, "mean_token_accuracy": 0.9624395740032196, "num_tokens": 921099610.0, "step": 6500 }, { "epoch": 0.21220332343051157, "eval_entropy": 1.4208130852381389, "eval_loss": 0.16986840963363647, "eval_mean_token_accuracy": 0.962581082979838, "eval_num_tokens": 921099610.0, "eval_runtime": 746.9082, "eval_samples_per_second": 12.928, "eval_steps_per_second": 0.102, "step": 6500 }, { "entropy": 1.409722430706024, "epoch": 0.21383565668766935, "grad_norm": 1.2109375, "learning_rate": 5.947070323610999e-06, "loss": 0.1449, "mean_token_accuracy": 0.9680650508403779, "num_tokens": 928163378.0, "step": 6550 }, { "entropy": 1.4334656882286072, "epoch": 0.21546798994482713, "grad_norm": 1.3515625, "learning_rate": 5.945545287234639e-06, "loss": 0.1563, "mean_token_accuracy": 0.964540822505951, "num_tokens": 934586868.0, "step": 6600 }, { "entropy": 1.4435390138626099, "epoch": 0.2171003232019849, "grad_norm": 1.4140625, "learning_rate": 5.943998792866509e-06, "loss": 0.15, "mean_token_accuracy": 0.9669582867622375, "num_tokens": 941380079.0, "step": 6650 }, { "entropy": 1.4167057871818542, "epoch": 0.2187326564591427, "grad_norm": 1.7578125, "learning_rate": 5.942430851772662e-06, "loss": 0.1627, "mean_token_accuracy": 0.9633757710456848, "num_tokens": 949348630.0, "step": 6700 }, { "entropy": 1.4309338593482972, "epoch": 0.22036498971630047, "grad_norm": 0.89453125, "learning_rate": 5.9408414753753836e-06, "loss": 0.1546, "mean_token_accuracy": 0.9655015981197357, "num_tokens": 956312502.0, "step": 6750 }, { "entropy": 1.4343366432189941, "epoch": 0.22199732297345826, "grad_norm": 1.5625, "learning_rate": 5.939230675253119e-06, "loss": 0.1489, "mean_token_accuracy": 0.96628955245018, "num_tokens": 963500996.0, "step": 6800 }, { "entropy": 1.4271987533569337, "epoch": 0.22362965623061604, "grad_norm": 1.6796875, "learning_rate": 5.9375984631403785e-06, "loss": 0.1616, "mean_token_accuracy": 0.9645246016979218, "num_tokens": 970915430.0, "step": 6850 }, { "entropy": 1.4355636262893676, "epoch": 0.22526198948777382, "grad_norm": 1.8984375, "learning_rate": 5.935944850927657e-06, "loss": 0.1533, "mean_token_accuracy": 0.9660027372837067, "num_tokens": 978036240.0, "step": 6900 }, { "entropy": 1.425994610786438, "epoch": 0.2268943227449316, "grad_norm": 1.203125, "learning_rate": 5.934269850661349e-06, "loss": 0.152, "mean_token_accuracy": 0.9652906250953674, "num_tokens": 985047502.0, "step": 6950 }, { "entropy": 1.4223777842521668, "epoch": 0.22852665600208938, "grad_norm": 1.171875, "learning_rate": 5.932573474543658e-06, "loss": 0.156, "mean_token_accuracy": 0.9658907651901245, "num_tokens": 992373619.0, "step": 7000 }, { "epoch": 0.22852665600208938, "eval_entropy": 1.4304504505793254, "eval_loss": 0.16900277137756348, "eval_mean_token_accuracy": 0.9627823217709859, "eval_num_tokens": 992373619.0, "eval_runtime": 749.871, "eval_samples_per_second": 12.877, "eval_steps_per_second": 0.101, "step": 7000 }, { "entropy": 1.4272488355636597, "epoch": 0.23015898925924716, "grad_norm": 1.9765625, "learning_rate": 5.930855734932506e-06, "loss": 0.1454, "mean_token_accuracy": 0.9678001618385315, "num_tokens": 999289508.0, "step": 7050 }, { "entropy": 1.4248918747901917, "epoch": 0.23179132251640494, "grad_norm": 1.5625, "learning_rate": 5.92911664434145e-06, "loss": 0.1558, "mean_token_accuracy": 0.9652420032024384, "num_tokens": 1006528812.0, "step": 7100 }, { "entropy": 1.4263224506378174, "epoch": 0.23342365577356272, "grad_norm": 1.3359375, "learning_rate": 5.927356215439584e-06, "loss": 0.1494, "mean_token_accuracy": 0.9668344402313233, "num_tokens": 1013541923.0, "step": 7150 }, { "entropy": 1.4219363307952881, "epoch": 0.2350559890307205, "grad_norm": 1.640625, "learning_rate": 5.92557446105145e-06, "loss": 0.1486, "mean_token_accuracy": 0.9674091839790344, "num_tokens": 1020511787.0, "step": 7200 }, { "entropy": 1.4293452215194702, "epoch": 0.23668832228787828, "grad_norm": 1.2578125, "learning_rate": 5.923771394156943e-06, "loss": 0.158, "mean_token_accuracy": 0.9649367642402649, "num_tokens": 1027894747.0, "step": 7250 }, { "entropy": 1.4306922936439515, "epoch": 0.23832065554503606, "grad_norm": 1.3515625, "learning_rate": 5.921947027891219e-06, "loss": 0.1528, "mean_token_accuracy": 0.9656593954563141, "num_tokens": 1035110900.0, "step": 7300 }, { "entropy": 1.4252450680732727, "epoch": 0.23995298880219385, "grad_norm": 0.9921875, "learning_rate": 5.9201013755445955e-06, "loss": 0.1535, "mean_token_accuracy": 0.966197533607483, "num_tokens": 1042230443.0, "step": 7350 }, { "entropy": 1.4307914185523987, "epoch": 0.24158532205935163, "grad_norm": 1.21875, "learning_rate": 5.91823445056246e-06, "loss": 0.147, "mean_token_accuracy": 0.9667865073680878, "num_tokens": 1049389935.0, "step": 7400 }, { "entropy": 1.4161819124221802, "epoch": 0.2432176553165094, "grad_norm": 1.703125, "learning_rate": 5.916346266545167e-06, "loss": 0.1468, "mean_token_accuracy": 0.96632697224617, "num_tokens": 1056551888.0, "step": 7450 }, { "entropy": 1.4254303669929504, "epoch": 0.2448499885736672, "grad_norm": 1.4921875, "learning_rate": 5.914436837247941e-06, "loss": 0.1525, "mean_token_accuracy": 0.965957795381546, "num_tokens": 1063197615.0, "step": 7500 }, { "epoch": 0.2448499885736672, "eval_entropy": 1.4276245164871215, "eval_loss": 0.16787172853946686, "eval_mean_token_accuracy": 0.9633070985476176, "eval_num_tokens": 1063197615.0, "eval_runtime": 751.5202, "eval_samples_per_second": 12.849, "eval_steps_per_second": 0.101, "step": 7500 }, { "entropy": 1.4108345437049865, "epoch": 0.24648232183082497, "grad_norm": 1.3828125, "learning_rate": 5.912506176580776e-06, "loss": 0.147, "mean_token_accuracy": 0.9670144832134246, "num_tokens": 1069874223.0, "step": 7550 }, { "entropy": 1.4212487936019897, "epoch": 0.24811465508798275, "grad_norm": 1.828125, "learning_rate": 5.910554298608335e-06, "loss": 0.1509, "mean_token_accuracy": 0.96580601811409, "num_tokens": 1076764997.0, "step": 7600 }, { "entropy": 1.4187248206138612, "epoch": 0.24974698834514053, "grad_norm": 1.8515625, "learning_rate": 5.908581217549845e-06, "loss": 0.1528, "mean_token_accuracy": 0.9664638650417328, "num_tokens": 1083894428.0, "step": 7650 }, { "entropy": 1.400410017967224, "epoch": 0.2513793216022983, "grad_norm": 0.94140625, "learning_rate": 5.906586947778998e-06, "loss": 0.1448, "mean_token_accuracy": 0.9671175360679627, "num_tokens": 1090645918.0, "step": 7700 }, { "entropy": 1.4053305006027221, "epoch": 0.2530116548594561, "grad_norm": 1.21875, "learning_rate": 5.9045715038238436e-06, "loss": 0.1509, "mean_token_accuracy": 0.9659091722965241, "num_tokens": 1097654372.0, "step": 7750 }, { "entropy": 1.3977803254127503, "epoch": 0.2546439881166139, "grad_norm": 1.25, "learning_rate": 5.902534900366681e-06, "loss": 0.1547, "mean_token_accuracy": 0.9661977970600129, "num_tokens": 1104996723.0, "step": 7800 }, { "entropy": 1.3928361082077025, "epoch": 0.25627632137377165, "grad_norm": 1.78125, "learning_rate": 5.900477152243954e-06, "loss": 0.1467, "mean_token_accuracy": 0.9668631637096405, "num_tokens": 1111808272.0, "step": 7850 }, { "entropy": 1.3904444289207458, "epoch": 0.25790865463092943, "grad_norm": 1.484375, "learning_rate": 5.8983982744461446e-06, "loss": 0.1523, "mean_token_accuracy": 0.9658901369571686, "num_tokens": 1118777348.0, "step": 7900 }, { "entropy": 1.4107188177108765, "epoch": 0.2595409878880872, "grad_norm": 1.25, "learning_rate": 5.896298282117662e-06, "loss": 0.1508, "mean_token_accuracy": 0.9659655904769897, "num_tokens": 1125395200.0, "step": 7950 }, { "entropy": 1.409360373020172, "epoch": 0.261173321145245, "grad_norm": 1.21875, "learning_rate": 5.894177190556733e-06, "loss": 0.1523, "mean_token_accuracy": 0.9658745980262756, "num_tokens": 1132194770.0, "step": 8000 }, { "epoch": 0.261173321145245, "eval_entropy": 1.3953218412399293, "eval_loss": 0.16749244928359985, "eval_mean_token_accuracy": 0.9630522100130717, "eval_num_tokens": 1132194770.0, "eval_runtime": 749.5556, "eval_samples_per_second": 12.882, "eval_steps_per_second": 0.101, "step": 8000 }, { "entropy": 1.395931794643402, "epoch": 0.2628056544024028, "grad_norm": 2.015625, "learning_rate": 5.892035015215289e-06, "loss": 0.1475, "mean_token_accuracy": 0.967307710647583, "num_tokens": 1139223324.0, "step": 8050 }, { "entropy": 1.4096789264678955, "epoch": 0.26443798765956056, "grad_norm": 0.99609375, "learning_rate": 5.889871771698854e-06, "loss": 0.1512, "mean_token_accuracy": 0.9665188312530517, "num_tokens": 1146048693.0, "step": 8100 }, { "entropy": 1.3959095120429992, "epoch": 0.26607032091671834, "grad_norm": 0.98828125, "learning_rate": 5.887687475766435e-06, "loss": 0.1517, "mean_token_accuracy": 0.9670138394832611, "num_tokens": 1153416156.0, "step": 8150 }, { "entropy": 1.384766845703125, "epoch": 0.2677026541738761, "grad_norm": 1.484375, "learning_rate": 5.8854821433303995e-06, "loss": 0.1478, "mean_token_accuracy": 0.9662396657466888, "num_tokens": 1160327310.0, "step": 8200 }, { "entropy": 1.3720842933654784, "epoch": 0.2693349874310339, "grad_norm": 1.5546875, "learning_rate": 5.883255790456365e-06, "loss": 0.1369, "mean_token_accuracy": 0.9690599977970124, "num_tokens": 1166793829.0, "step": 8250 }, { "entropy": 1.3875324892997742, "epoch": 0.2709673206881917, "grad_norm": 1.0859375, "learning_rate": 5.881008433363083e-06, "loss": 0.1484, "mean_token_accuracy": 0.9658425927162171, "num_tokens": 1173770645.0, "step": 8300 }, { "entropy": 1.382853055000305, "epoch": 0.27259965394534946, "grad_norm": 1.6484375, "learning_rate": 5.878740088422315e-06, "loss": 0.1633, "mean_token_accuracy": 0.9631330251693726, "num_tokens": 1181126599.0, "step": 8350 }, { "entropy": 1.3965485191345215, "epoch": 0.27423198720250724, "grad_norm": 1.0390625, "learning_rate": 5.87645077215872e-06, "loss": 0.1498, "mean_token_accuracy": 0.9665160596370697, "num_tokens": 1188172115.0, "step": 8400 }, { "entropy": 1.3885521602630615, "epoch": 0.275864320459665, "grad_norm": 1.1953125, "learning_rate": 5.874140501249728e-06, "loss": 0.1468, "mean_token_accuracy": 0.9669651210308075, "num_tokens": 1195102960.0, "step": 8450 }, { "entropy": 1.3970652842521667, "epoch": 0.2774966537168228, "grad_norm": 2.328125, "learning_rate": 5.8718092925254235e-06, "loss": 0.1469, "mean_token_accuracy": 0.9666703069210052, "num_tokens": 1201982519.0, "step": 8500 }, { "epoch": 0.2774966537168228, "eval_entropy": 1.390671566327413, "eval_loss": 0.1665239781141281, "eval_mean_token_accuracy": 0.9631963141759237, "eval_num_tokens": 1201982519.0, "eval_runtime": 751.0843, "eval_samples_per_second": 12.856, "eval_steps_per_second": 0.101, "step": 8500 }, { "entropy": 1.3743389058113098, "epoch": 0.2791289869739806, "grad_norm": 1.46875, "learning_rate": 5.86945716296842e-06, "loss": 0.1413, "mean_token_accuracy": 0.9681530177593232, "num_tokens": 1208731712.0, "step": 8550 }, { "entropy": 1.3886315321922302, "epoch": 0.28076132023113837, "grad_norm": 1.28125, "learning_rate": 5.867084129713738e-06, "loss": 0.1553, "mean_token_accuracy": 0.9659830582141876, "num_tokens": 1215816513.0, "step": 8600 }, { "entropy": 1.3884997010231017, "epoch": 0.28239365348829615, "grad_norm": 1.3515625, "learning_rate": 5.864690210048677e-06, "loss": 0.1499, "mean_token_accuracy": 0.9667926502227783, "num_tokens": 1222796740.0, "step": 8650 }, { "entropy": 1.3810113263130188, "epoch": 0.28402598674545393, "grad_norm": 1.921875, "learning_rate": 5.862275421412695e-06, "loss": 0.1428, "mean_token_accuracy": 0.968780642747879, "num_tokens": 1229478573.0, "step": 8700 }, { "entropy": 1.3753751254081725, "epoch": 0.2856583200026117, "grad_norm": 1.6953125, "learning_rate": 5.859839781397276e-06, "loss": 0.1552, "mean_token_accuracy": 0.9648597013950347, "num_tokens": 1236888916.0, "step": 8750 }, { "entropy": 1.3797388172149658, "epoch": 0.2872906532597695, "grad_norm": 1.03125, "learning_rate": 5.857383307745805e-06, "loss": 0.1555, "mean_token_accuracy": 0.9654771292209625, "num_tokens": 1243945893.0, "step": 8800 }, { "entropy": 1.37408056974411, "epoch": 0.28892298651692727, "grad_norm": 1.8828125, "learning_rate": 5.854906018353436e-06, "loss": 0.1531, "mean_token_accuracy": 0.9655951869487762, "num_tokens": 1250815278.0, "step": 8850 }, { "entropy": 1.3714010500907898, "epoch": 0.29055531977408505, "grad_norm": 1.0859375, "learning_rate": 5.852407931266967e-06, "loss": 0.1416, "mean_token_accuracy": 0.967999415397644, "num_tokens": 1257589618.0, "step": 8900 }, { "entropy": 1.370381121635437, "epoch": 0.29218765303124283, "grad_norm": 1.1953125, "learning_rate": 5.849889064684703e-06, "loss": 0.156, "mean_token_accuracy": 0.965356330871582, "num_tokens": 1264949457.0, "step": 8950 }, { "entropy": 1.3676560163497924, "epoch": 0.2938199862884006, "grad_norm": 1.5546875, "learning_rate": 5.847349436956325e-06, "loss": 0.1609, "mean_token_accuracy": 0.9641006827354431, "num_tokens": 1272234102.0, "step": 9000 }, { "epoch": 0.2938199862884006, "eval_entropy": 1.38709463596344, "eval_loss": 0.1650434136390686, "eval_mean_token_accuracy": 0.9634115918477376, "eval_num_tokens": 1272234102.0, "eval_runtime": 751.7405, "eval_samples_per_second": 12.845, "eval_steps_per_second": 0.101, "step": 9000 }, { "entropy": 1.3809342765808106, "epoch": 0.2954523195455584, "grad_norm": 2.296875, "learning_rate": 5.844789066582758e-06, "loss": 0.1432, "mean_token_accuracy": 0.9673550212383271, "num_tokens": 1279098038.0, "step": 9050 }, { "entropy": 1.40355872631073, "epoch": 0.2970846528027162, "grad_norm": 1.4921875, "learning_rate": 5.842207972216034e-06, "loss": 0.1521, "mean_token_accuracy": 0.9661613535881043, "num_tokens": 1286173347.0, "step": 9100 }, { "entropy": 1.3974176049232483, "epoch": 0.29871698605987396, "grad_norm": 1.3125, "learning_rate": 5.839606172659159e-06, "loss": 0.1521, "mean_token_accuracy": 0.9656483995914459, "num_tokens": 1293330750.0, "step": 9150 }, { "entropy": 1.3879291534423828, "epoch": 0.30034931931703174, "grad_norm": 1.3671875, "learning_rate": 5.8369836868659706e-06, "loss": 0.1553, "mean_token_accuracy": 0.9647518181800843, "num_tokens": 1300327068.0, "step": 9200 }, { "entropy": 1.3752172994613647, "epoch": 0.3019816525741896, "grad_norm": 1.125, "learning_rate": 5.8343405339410085e-06, "loss": 0.1383, "mean_token_accuracy": 0.9691135132312775, "num_tokens": 1307021605.0, "step": 9250 }, { "entropy": 1.3903837966918946, "epoch": 0.30361398583134735, "grad_norm": 1.4609375, "learning_rate": 5.831676733139364e-06, "loss": 0.1458, "mean_token_accuracy": 0.9677986741065979, "num_tokens": 1314238674.0, "step": 9300 }, { "entropy": 1.3943523359298706, "epoch": 0.30524631908850514, "grad_norm": 1.0546875, "learning_rate": 5.828992303866552e-06, "loss": 0.158, "mean_token_accuracy": 0.9646199941635132, "num_tokens": 1321715116.0, "step": 9350 }, { "entropy": 1.4028909087181092, "epoch": 0.3068786523456629, "grad_norm": 1.515625, "learning_rate": 5.82628726567836e-06, "loss": 0.1615, "mean_token_accuracy": 0.9638377511501313, "num_tokens": 1328922385.0, "step": 9400 }, { "entropy": 1.405231008529663, "epoch": 0.3085109856028207, "grad_norm": 1.703125, "learning_rate": 5.823561638280711e-06, "loss": 0.1621, "mean_token_accuracy": 0.9635949361324311, "num_tokens": 1336385571.0, "step": 9450 }, { "entropy": 1.370991678237915, "epoch": 0.3101433188599785, "grad_norm": 1.6875, "learning_rate": 5.82081544152952e-06, "loss": 0.1515, "mean_token_accuracy": 0.9661216616630555, "num_tokens": 1343638497.0, "step": 9500 }, { "epoch": 0.3101433188599785, "eval_entropy": 1.3891894817352295, "eval_loss": 0.16506299376487732, "eval_mean_token_accuracy": 0.9635340309143067, "eval_num_tokens": 1343638497.0, "eval_runtime": 749.2359, "eval_samples_per_second": 12.888, "eval_steps_per_second": 0.101, "step": 9500 }, { "entropy": 1.3911400294303895, "epoch": 0.31177565211713626, "grad_norm": 0.8984375, "learning_rate": 5.818048695430541e-06, "loss": 0.1496, "mean_token_accuracy": 0.9663948690891266, "num_tokens": 1350638403.0, "step": 9550 }, { "entropy": 1.3863462066650392, "epoch": 0.31340798537429404, "grad_norm": 1.8515625, "learning_rate": 5.815261420139235e-06, "loss": 0.1495, "mean_token_accuracy": 0.9667435109615325, "num_tokens": 1357942983.0, "step": 9600 }, { "entropy": 1.3836952257156372, "epoch": 0.3150403186314518, "grad_norm": 1.4375, "learning_rate": 5.812453635960613e-06, "loss": 0.136, "mean_token_accuracy": 0.9696350061893463, "num_tokens": 1364441123.0, "step": 9650 }, { "entropy": 1.386801562309265, "epoch": 0.3166726518886096, "grad_norm": 1.3828125, "learning_rate": 5.809625363349091e-06, "loss": 0.1537, "mean_token_accuracy": 0.9660963475704193, "num_tokens": 1371638128.0, "step": 9700 }, { "entropy": 1.4033276891708375, "epoch": 0.3183049851457674, "grad_norm": 1.6015625, "learning_rate": 5.806776622908341e-06, "loss": 0.1489, "mean_token_accuracy": 0.9672618007659912, "num_tokens": 1378797795.0, "step": 9750 }, { "entropy": 1.3945609354972839, "epoch": 0.31993731840292516, "grad_norm": 1.7109375, "learning_rate": 5.8039074353911425e-06, "loss": 0.1476, "mean_token_accuracy": 0.9665350615978241, "num_tokens": 1385958442.0, "step": 9800 }, { "entropy": 1.3897303318977356, "epoch": 0.32156965166008294, "grad_norm": 1.609375, "learning_rate": 5.801017821699229e-06, "loss": 0.1492, "mean_token_accuracy": 0.9658246648311615, "num_tokens": 1392915332.0, "step": 9850 }, { "entropy": 1.397285952568054, "epoch": 0.3232019849172407, "grad_norm": 1.4296875, "learning_rate": 5.798107802883135e-06, "loss": 0.1538, "mean_token_accuracy": 0.9644203245639801, "num_tokens": 1399970378.0, "step": 9900 }, { "entropy": 1.396610188484192, "epoch": 0.3248343181743985, "grad_norm": 1.15625, "learning_rate": 5.795177400142047e-06, "loss": 0.1399, "mean_token_accuracy": 0.9683949732780457, "num_tokens": 1406942412.0, "step": 9950 }, { "entropy": 1.3993594455718994, "epoch": 0.3264666514315563, "grad_norm": 1.2109375, "learning_rate": 5.792226634823645e-06, "loss": 0.166, "mean_token_accuracy": 0.9635122084617614, "num_tokens": 1414672963.0, "step": 10000 }, { "epoch": 0.3264666514315563, "eval_entropy": 1.3897196292877196, "eval_loss": 0.16665887832641602, "eval_mean_token_accuracy": 0.9631382012367249, "eval_num_tokens": 1414672963.0, "eval_runtime": 743.4472, "eval_samples_per_second": 12.988, "eval_steps_per_second": 0.102, "step": 10000 }, { "entropy": 1.3787381386756896, "epoch": 0.32809898468871407, "grad_norm": 1.65625, "learning_rate": 5.789255528423945e-06, "loss": 0.1449, "mean_token_accuracy": 0.9675530314445495, "num_tokens": 1422037159.0, "step": 10050 }, { "entropy": 1.3835061955451966, "epoch": 0.32973131794587185, "grad_norm": 1.421875, "learning_rate": 5.7862641025871535e-06, "loss": 0.1493, "mean_token_accuracy": 0.966714415550232, "num_tokens": 1428834412.0, "step": 10100 }, { "entropy": 1.3994423723220826, "epoch": 0.33136365120302963, "grad_norm": 1.453125, "learning_rate": 5.783252379105494e-06, "loss": 0.1478, "mean_token_accuracy": 0.9666897785663605, "num_tokens": 1435825992.0, "step": 10150 }, { "entropy": 1.4079461932182311, "epoch": 0.3329959844601874, "grad_norm": 1.03125, "learning_rate": 5.780220379919062e-06, "loss": 0.1597, "mean_token_accuracy": 0.9649293422698975, "num_tokens": 1443057429.0, "step": 10200 }, { "entropy": 1.4087351322174073, "epoch": 0.3346283177173452, "grad_norm": 1.1875, "learning_rate": 5.777168127115654e-06, "loss": 0.1495, "mean_token_accuracy": 0.9670829331874847, "num_tokens": 1450143525.0, "step": 10250 }, { "entropy": 1.4217532348632813, "epoch": 0.336260650974503, "grad_norm": 1.1796875, "learning_rate": 5.774095642930618e-06, "loss": 0.1538, "mean_token_accuracy": 0.9653639376163483, "num_tokens": 1456853199.0, "step": 10300 }, { "entropy": 1.4182784628868104, "epoch": 0.33789298423166075, "grad_norm": 1.328125, "learning_rate": 5.771002949746681e-06, "loss": 0.1592, "mean_token_accuracy": 0.9639875698089599, "num_tokens": 1464260063.0, "step": 10350 }, { "entropy": 1.3954361629486085, "epoch": 0.33952531748881853, "grad_norm": 1.0234375, "learning_rate": 5.76789007009379e-06, "loss": 0.1444, "mean_token_accuracy": 0.9671654045581818, "num_tokens": 1471363389.0, "step": 10400 }, { "entropy": 1.3968884110450746, "epoch": 0.3411576507459763, "grad_norm": 2.15625, "learning_rate": 5.7647570266489535e-06, "loss": 0.1325, "mean_token_accuracy": 0.9698529148101807, "num_tokens": 1478273376.0, "step": 10450 }, { "entropy": 1.397156729698181, "epoch": 0.3427899840031341, "grad_norm": 1.0625, "learning_rate": 5.7616038422360674e-06, "loss": 0.1458, "mean_token_accuracy": 0.9674423885345459, "num_tokens": 1485177982.0, "step": 10500 }, { "epoch": 0.3427899840031341, "eval_entropy": 1.4138343874613444, "eval_loss": 0.1639399528503418, "eval_mean_token_accuracy": 0.9635584902763367, "eval_num_tokens": 1485177982.0, "eval_runtime": 748.9668, "eval_samples_per_second": 12.892, "eval_steps_per_second": 0.101, "step": 10500 }, { "entropy": 1.4251120805740356, "epoch": 0.3444223172602919, "grad_norm": 1.703125, "learning_rate": 5.758430539825751e-06, "loss": 0.1423, "mean_token_accuracy": 0.968468290567398, "num_tokens": 1491740592.0, "step": 10550 }, { "entropy": 1.4330598759651183, "epoch": 0.34605465051744966, "grad_norm": 1.234375, "learning_rate": 5.755237142535185e-06, "loss": 0.1584, "mean_token_accuracy": 0.9640131962299346, "num_tokens": 1499230338.0, "step": 10600 }, { "entropy": 1.4190063452720643, "epoch": 0.34768698377460744, "grad_norm": 1.3671875, "learning_rate": 5.752023673627936e-06, "loss": 0.1549, "mean_token_accuracy": 0.9651542448997498, "num_tokens": 1506807165.0, "step": 10650 }, { "entropy": 1.398562343120575, "epoch": 0.3493193170317652, "grad_norm": 2.671875, "learning_rate": 5.748790156513793e-06, "loss": 0.1429, "mean_token_accuracy": 0.9676708686351776, "num_tokens": 1513566074.0, "step": 10700 }, { "entropy": 1.4046012330055238, "epoch": 0.350951650288923, "grad_norm": 1.8203125, "learning_rate": 5.74553661474859e-06, "loss": 0.1475, "mean_token_accuracy": 0.9663785743713379, "num_tokens": 1520732578.0, "step": 10750 }, { "entropy": 1.4011975502967835, "epoch": 0.3525839835460808, "grad_norm": 1.7109375, "learning_rate": 5.742263072034044e-06, "loss": 0.133, "mean_token_accuracy": 0.9697633123397827, "num_tokens": 1527341695.0, "step": 10800 }, { "entropy": 1.4020631575584412, "epoch": 0.35421631680323856, "grad_norm": 1.375, "learning_rate": 5.738969552217573e-06, "loss": 0.1529, "mean_token_accuracy": 0.9653546392917634, "num_tokens": 1534441768.0, "step": 10850 }, { "entropy": 1.4063316154479981, "epoch": 0.35584865006039634, "grad_norm": 1.1875, "learning_rate": 5.735656079292128e-06, "loss": 0.1541, "mean_token_accuracy": 0.9655212807655335, "num_tokens": 1542032259.0, "step": 10900 }, { "entropy": 1.3955928492546081, "epoch": 0.3574809833175541, "grad_norm": 1.296875, "learning_rate": 5.732322677396013e-06, "loss": 0.1379, "mean_token_accuracy": 0.9682280993461609, "num_tokens": 1549145530.0, "step": 10950 }, { "entropy": 1.417706184387207, "epoch": 0.3591133165747119, "grad_norm": 1.5078125, "learning_rate": 5.728969370812716e-06, "loss": 0.1502, "mean_token_accuracy": 0.9673383378982544, "num_tokens": 1556005225.0, "step": 11000 }, { "epoch": 0.3591133165747119, "eval_entropy": 1.4156667073567708, "eval_loss": 0.1644313782453537, "eval_mean_token_accuracy": 0.9634674708048503, "eval_num_tokens": 1556005225.0, "eval_runtime": 749.0504, "eval_samples_per_second": 12.891, "eval_steps_per_second": 0.101, "step": 11000 }, { "entropy": 1.4165368866920471, "epoch": 0.3607456498318697, "grad_norm": 3.625, "learning_rate": 5.725596183970729e-06, "loss": 0.1491, "mean_token_accuracy": 0.9657526600360871, "num_tokens": 1563254781.0, "step": 11050 }, { "entropy": 1.4074536633491517, "epoch": 0.36237798308902747, "grad_norm": 1.5625, "learning_rate": 5.722203141443365e-06, "loss": 0.1452, "mean_token_accuracy": 0.9683119165897369, "num_tokens": 1569838865.0, "step": 11100 }, { "entropy": 1.4159915471076965, "epoch": 0.36401031634618525, "grad_norm": 1.265625, "learning_rate": 5.718790267948591e-06, "loss": 0.1505, "mean_token_accuracy": 0.9666063642501831, "num_tokens": 1576948931.0, "step": 11150 }, { "entropy": 1.4229880380630493, "epoch": 0.36564264960334303, "grad_norm": 1.7109375, "learning_rate": 5.715357588348832e-06, "loss": 0.1522, "mean_token_accuracy": 0.9668408262729645, "num_tokens": 1584164705.0, "step": 11200 }, { "entropy": 1.396066279411316, "epoch": 0.3672749828605008, "grad_norm": 2.09375, "learning_rate": 5.711905127650807e-06, "loss": 0.1373, "mean_token_accuracy": 0.9693786513805389, "num_tokens": 1591058327.0, "step": 11250 }, { "entropy": 1.4097445344924926, "epoch": 0.3689073161176586, "grad_norm": 1.9609375, "learning_rate": 5.7084329110053294e-06, "loss": 0.1486, "mean_token_accuracy": 0.9671066462993622, "num_tokens": 1598091112.0, "step": 11300 }, { "entropy": 1.4326444959640503, "epoch": 0.37053964937481637, "grad_norm": 1.1328125, "learning_rate": 5.70494096370714e-06, "loss": 0.1495, "mean_token_accuracy": 0.9672650814056396, "num_tokens": 1605344616.0, "step": 11350 }, { "entropy": 1.4539379978179932, "epoch": 0.37217198263197415, "grad_norm": 1.953125, "learning_rate": 5.701429311194713e-06, "loss": 0.1593, "mean_token_accuracy": 0.9647262859344482, "num_tokens": 1612587700.0, "step": 11400 }, { "entropy": 1.4599957299232482, "epoch": 0.37380431588913193, "grad_norm": 0.76953125, "learning_rate": 5.6978979790500695e-06, "loss": 0.1428, "mean_token_accuracy": 0.96798752784729, "num_tokens": 1619869251.0, "step": 11450 }, { "entropy": 1.456819851398468, "epoch": 0.3754366491462897, "grad_norm": 1.5703125, "learning_rate": 5.694346992998601e-06, "loss": 0.157, "mean_token_accuracy": 0.9647147190570832, "num_tokens": 1627103999.0, "step": 11500 }, { "epoch": 0.3754366491462897, "eval_entropy": 1.4559897645314535, "eval_loss": 0.16348470747470856, "eval_mean_token_accuracy": 0.9634300549825032, "eval_num_tokens": 1627103999.0, "eval_runtime": 750.9869, "eval_samples_per_second": 12.858, "eval_steps_per_second": 0.101, "step": 11500 }, { "entropy": 1.445942795276642, "epoch": 0.3770689824034475, "grad_norm": 1.5625, "learning_rate": 5.690776378908871e-06, "loss": 0.1559, "mean_token_accuracy": 0.9648426783084869, "num_tokens": 1634135214.0, "step": 11550 }, { "entropy": 1.4337683749198913, "epoch": 0.3787013156606053, "grad_norm": 1.71875, "learning_rate": 5.687186162792432e-06, "loss": 0.1392, "mean_token_accuracy": 0.9684048485755921, "num_tokens": 1641123881.0, "step": 11600 }, { "entropy": 1.4271648550033569, "epoch": 0.38033364891776306, "grad_norm": 1.53125, "learning_rate": 5.683576370803637e-06, "loss": 0.1442, "mean_token_accuracy": 0.9671970081329345, "num_tokens": 1648087593.0, "step": 11650 }, { "entropy": 1.4187105298042297, "epoch": 0.38196598217492084, "grad_norm": 1.2578125, "learning_rate": 5.679947029239446e-06, "loss": 0.1558, "mean_token_accuracy": 0.9652733910083771, "num_tokens": 1655675590.0, "step": 11700 }, { "entropy": 1.4136518168449401, "epoch": 0.3835983154320786, "grad_norm": 1.2421875, "learning_rate": 5.676298164539235e-06, "loss": 0.1344, "mean_token_accuracy": 0.9695696973800659, "num_tokens": 1662994220.0, "step": 11750 }, { "entropy": 1.4156992936134338, "epoch": 0.3852306486892364, "grad_norm": 2.015625, "learning_rate": 5.672629803284603e-06, "loss": 0.1445, "mean_token_accuracy": 0.967056336402893, "num_tokens": 1670282239.0, "step": 11800 }, { "entropy": 1.381675865650177, "epoch": 0.3868629819463942, "grad_norm": 1.3046875, "learning_rate": 5.668941972199185e-06, "loss": 0.1318, "mean_token_accuracy": 0.9707480573654175, "num_tokens": 1677092297.0, "step": 11850 }, { "entropy": 1.413828341960907, "epoch": 0.38849531520355196, "grad_norm": 1.546875, "learning_rate": 5.665234698148447e-06, "loss": 0.1398, "mean_token_accuracy": 0.9692868089675903, "num_tokens": 1683596527.0, "step": 11900 }, { "entropy": 1.4255395197868348, "epoch": 0.39012764846070974, "grad_norm": 1.25, "learning_rate": 5.661508008139494e-06, "loss": 0.1428, "mean_token_accuracy": 0.9684118723869324, "num_tokens": 1690356557.0, "step": 11950 }, { "entropy": 1.421637599468231, "epoch": 0.3917599817178675, "grad_norm": 1.40625, "learning_rate": 5.657761929320876e-06, "loss": 0.1358, "mean_token_accuracy": 0.9686246883869171, "num_tokens": 1696780737.0, "step": 12000 }, { "epoch": 0.3917599817178675, "eval_entropy": 1.4270846033096314, "eval_loss": 0.16205987334251404, "eval_mean_token_accuracy": 0.9638035273551941, "eval_num_tokens": 1696780737.0, "eval_runtime": 744.4564, "eval_samples_per_second": 12.971, "eval_steps_per_second": 0.102, "step": 12000 }, { "entropy": 1.4344228434562682, "epoch": 0.3933923149750253, "grad_norm": 1.46875, "learning_rate": 5.65399648898239e-06, "loss": 0.1479, "mean_token_accuracy": 0.9664253509044647, "num_tokens": 1703868095.0, "step": 12050 }, { "entropy": 1.40860848903656, "epoch": 0.3950246482321831, "grad_norm": 1.5703125, "learning_rate": 5.650211714554876e-06, "loss": 0.1593, "mean_token_accuracy": 0.9646110832691193, "num_tokens": 1711471948.0, "step": 12100 }, { "entropy": 1.420129976272583, "epoch": 0.39665698148934087, "grad_norm": 1.6484375, "learning_rate": 5.6464076336100246e-06, "loss": 0.1455, "mean_token_accuracy": 0.9673341345787049, "num_tokens": 1718673490.0, "step": 12150 }, { "entropy": 1.4015104007720947, "epoch": 0.39828931474649865, "grad_norm": 1.390625, "learning_rate": 5.642584273860171e-06, "loss": 0.1518, "mean_token_accuracy": 0.966002242565155, "num_tokens": 1726275054.0, "step": 12200 }, { "entropy": 1.4261247539520263, "epoch": 0.3999216480036564, "grad_norm": 1.3828125, "learning_rate": 5.6387416631580936e-06, "loss": 0.1417, "mean_token_accuracy": 0.9675724005699158, "num_tokens": 1733190804.0, "step": 12250 }, { "entropy": 1.4047169375419617, "epoch": 0.4015539812608142, "grad_norm": 1.6796875, "learning_rate": 5.634879829496813e-06, "loss": 0.149, "mean_token_accuracy": 0.9663934874534607, "num_tokens": 1740409762.0, "step": 12300 }, { "entropy": 1.4087544870376587, "epoch": 0.403186314517972, "grad_norm": 1.296875, "learning_rate": 5.630998801009386e-06, "loss": 0.1422, "mean_token_accuracy": 0.9677630662918091, "num_tokens": 1747425003.0, "step": 12350 }, { "entropy": 1.4205935049057006, "epoch": 0.40481864777512977, "grad_norm": 1.609375, "learning_rate": 5.627098605968702e-06, "loss": 0.1518, "mean_token_accuracy": 0.9655173766613007, "num_tokens": 1754787565.0, "step": 12400 }, { "entropy": 1.387464382648468, "epoch": 0.40645098103228755, "grad_norm": 1.375, "learning_rate": 5.62317927278728e-06, "loss": 0.1372, "mean_token_accuracy": 0.968756947517395, "num_tokens": 1761849063.0, "step": 12450 }, { "entropy": 1.3994211435317994, "epoch": 0.40808331428944533, "grad_norm": 2.0625, "learning_rate": 5.619240830017051e-06, "loss": 0.144, "mean_token_accuracy": 0.9678842532634735, "num_tokens": 1768859328.0, "step": 12500 }, { "epoch": 0.40808331428944533, "eval_entropy": 1.405422296524048, "eval_loss": 0.16121897101402283, "eval_mean_token_accuracy": 0.9639042560259501, "eval_num_tokens": 1768859328.0, "eval_runtime": 749.481, "eval_samples_per_second": 12.884, "eval_steps_per_second": 0.101, "step": 12500 }, { "entropy": 1.419663178920746, "epoch": 0.4097156475466031, "grad_norm": 1.953125, "learning_rate": 5.615283306349166e-06, "loss": 0.1409, "mean_token_accuracy": 0.9682516789436341, "num_tokens": 1775694980.0, "step": 12550 }, { "entropy": 1.4085765600204467, "epoch": 0.4113479808037609, "grad_norm": 1.578125, "learning_rate": 5.611306730613772e-06, "loss": 0.1326, "mean_token_accuracy": 0.9704544687271118, "num_tokens": 1782348927.0, "step": 12600 }, { "entropy": 1.4163624548912048, "epoch": 0.4129803140609187, "grad_norm": 2.0, "learning_rate": 5.607311131779812e-06, "loss": 0.1501, "mean_token_accuracy": 0.9661952567100525, "num_tokens": 1789610698.0, "step": 12650 }, { "entropy": 1.4280620789527894, "epoch": 0.41461264731807645, "grad_norm": 1.890625, "learning_rate": 5.603296538954808e-06, "loss": 0.143, "mean_token_accuracy": 0.9683597815036774, "num_tokens": 1796316393.0, "step": 12700 }, { "entropy": 1.425929193496704, "epoch": 0.41624498057523424, "grad_norm": 2.296875, "learning_rate": 5.599262981384652e-06, "loss": 0.1477, "mean_token_accuracy": 0.9663317048549652, "num_tokens": 1803325180.0, "step": 12750 }, { "entropy": 1.4301575493812562, "epoch": 0.417877313832392, "grad_norm": 1.9765625, "learning_rate": 5.595210488453392e-06, "loss": 0.1557, "mean_token_accuracy": 0.9648576879501343, "num_tokens": 1810717661.0, "step": 12800 }, { "entropy": 1.405632803440094, "epoch": 0.4195096470895498, "grad_norm": 1.7109375, "learning_rate": 5.591139089683021e-06, "loss": 0.1397, "mean_token_accuracy": 0.9687881779670715, "num_tokens": 1817611883.0, "step": 12850 }, { "entropy": 1.3980163097381593, "epoch": 0.4211419803467076, "grad_norm": 2.078125, "learning_rate": 5.587048814733253e-06, "loss": 0.1319, "mean_token_accuracy": 0.9697498416900635, "num_tokens": 1824343282.0, "step": 12900 }, { "entropy": 1.3963736867904664, "epoch": 0.42277431360386536, "grad_norm": 1.859375, "learning_rate": 5.582939693401319e-06, "loss": 0.1378, "mean_token_accuracy": 0.9693096280097961, "num_tokens": 1831509722.0, "step": 12950 }, { "entropy": 1.4174233818054198, "epoch": 0.42440664686102314, "grad_norm": 1.5078125, "learning_rate": 5.57881175562174e-06, "loss": 0.1565, "mean_token_accuracy": 0.9654737007617951, "num_tokens": 1838857002.0, "step": 13000 }, { "epoch": 0.42440664686102314, "eval_entropy": 1.4053240156173705, "eval_loss": 0.1606142520904541, "eval_mean_token_accuracy": 0.9640888079007467, "eval_num_tokens": 1838857002.0, "eval_runtime": 748.4765, "eval_samples_per_second": 12.901, "eval_steps_per_second": 0.102, "step": 13000 }, { "entropy": 1.4016055393218994, "epoch": 0.4260389801181809, "grad_norm": 2.09375, "learning_rate": 5.574665031466116e-06, "loss": 0.1338, "mean_token_accuracy": 0.9696434116363526, "num_tokens": 1845457705.0, "step": 13050 }, { "entropy": 1.4049336218833923, "epoch": 0.4276713133753387, "grad_norm": 1.5625, "learning_rate": 5.570499551142902e-06, "loss": 0.1466, "mean_token_accuracy": 0.9665655505657196, "num_tokens": 1852644618.0, "step": 13100 }, { "entropy": 1.4158594751358031, "epoch": 0.4293036466324965, "grad_norm": 1.546875, "learning_rate": 5.566315344997188e-06, "loss": 0.1405, "mean_token_accuracy": 0.9682467067241669, "num_tokens": 1859532727.0, "step": 13150 }, { "entropy": 1.406804118156433, "epoch": 0.43093597988965426, "grad_norm": 1.796875, "learning_rate": 5.562112443510483e-06, "loss": 0.1347, "mean_token_accuracy": 0.969345440864563, "num_tokens": 1866433523.0, "step": 13200 }, { "entropy": 1.414643156528473, "epoch": 0.43256831314681204, "grad_norm": 1.375, "learning_rate": 5.557890877300489e-06, "loss": 0.1455, "mean_token_accuracy": 0.9673810720443725, "num_tokens": 1873742139.0, "step": 13250 }, { "entropy": 1.4243081569671632, "epoch": 0.4342006464039698, "grad_norm": 1.1328125, "learning_rate": 5.553650677120876e-06, "loss": 0.14, "mean_token_accuracy": 0.9683500599861145, "num_tokens": 1881001857.0, "step": 13300 }, { "entropy": 1.4164328074455261, "epoch": 0.4358329796611276, "grad_norm": 2.3125, "learning_rate": 5.549391873861064e-06, "loss": 0.1485, "mean_token_accuracy": 0.9669756269454957, "num_tokens": 1888442521.0, "step": 13350 }, { "entropy": 1.4102159285545348, "epoch": 0.4374653129182854, "grad_norm": 1.953125, "learning_rate": 5.545114498545991e-06, "loss": 0.139, "mean_token_accuracy": 0.9687536859512329, "num_tokens": 1895774805.0, "step": 13400 }, { "entropy": 1.4052273893356324, "epoch": 0.43909764617544317, "grad_norm": 1.046875, "learning_rate": 5.540818582335894e-06, "loss": 0.1442, "mean_token_accuracy": 0.9679582285881042, "num_tokens": 1902866432.0, "step": 13450 }, { "entropy": 1.4166702628135681, "epoch": 0.44072997943260095, "grad_norm": 1.2890625, "learning_rate": 5.536504156526077e-06, "loss": 0.1481, "mean_token_accuracy": 0.9664357197284699, "num_tokens": 1910343947.0, "step": 13500 }, { "epoch": 0.44072997943260095, "eval_entropy": 1.4095580498377482, "eval_loss": 0.1602867692708969, "eval_mean_token_accuracy": 0.9641507911682129, "eval_num_tokens": 1910343947.0, "eval_runtime": 748.7079, "eval_samples_per_second": 12.897, "eval_steps_per_second": 0.102, "step": 13500 }, { "entropy": 1.4171498656272887, "epoch": 0.44236231268975873, "grad_norm": 1.9375, "learning_rate": 5.5321712525466815e-06, "loss": 0.1444, "mean_token_accuracy": 0.967768360376358, "num_tokens": 1917115548.0, "step": 13550 }, { "entropy": 1.4215400004386902, "epoch": 0.4439946459469165, "grad_norm": 1.21875, "learning_rate": 5.5278199019624665e-06, "loss": 0.1558, "mean_token_accuracy": 0.965044105052948, "num_tokens": 1924701669.0, "step": 13600 }, { "entropy": 1.4144377851486205, "epoch": 0.4456269792040743, "grad_norm": 1.390625, "learning_rate": 5.523450136472569e-06, "loss": 0.1476, "mean_token_accuracy": 0.9668766951560974, "num_tokens": 1932051913.0, "step": 13650 }, { "entropy": 1.4168913602828979, "epoch": 0.4472593124612321, "grad_norm": 1.703125, "learning_rate": 5.519061987910276e-06, "loss": 0.1344, "mean_token_accuracy": 0.9691276931762696, "num_tokens": 1938674407.0, "step": 13700 }, { "entropy": 1.4047640895843505, "epoch": 0.44889164571838985, "grad_norm": 1.109375, "learning_rate": 5.514655488242795e-06, "loss": 0.1503, "mean_token_accuracy": 0.9665549874305726, "num_tokens": 1945577505.0, "step": 13750 }, { "entropy": 1.4135752677917481, "epoch": 0.45052397897554763, "grad_norm": 2.125, "learning_rate": 5.510230669571018e-06, "loss": 0.1492, "mean_token_accuracy": 0.9664253723621369, "num_tokens": 1952659833.0, "step": 13800 }, { "entropy": 1.3945325517654419, "epoch": 0.4521563122327054, "grad_norm": 2.109375, "learning_rate": 5.505787564129291e-06, "loss": 0.1376, "mean_token_accuracy": 0.968870245218277, "num_tokens": 1959360672.0, "step": 13850 }, { "entropy": 1.4091899871826172, "epoch": 0.4537886454898632, "grad_norm": 1.7109375, "learning_rate": 5.5013262042851764e-06, "loss": 0.1352, "mean_token_accuracy": 0.9697363257408143, "num_tokens": 1965923476.0, "step": 13900 }, { "entropy": 1.409596972465515, "epoch": 0.455420978747021, "grad_norm": 1.3359375, "learning_rate": 5.4968466225392165e-06, "loss": 0.147, "mean_token_accuracy": 0.966924901008606, "num_tokens": 1972776862.0, "step": 13950 }, { "entropy": 1.4293616366386415, "epoch": 0.45705331200417876, "grad_norm": 1.296875, "learning_rate": 5.4923488515247e-06, "loss": 0.1417, "mean_token_accuracy": 0.9687818443775177, "num_tokens": 1979187620.0, "step": 14000 }, { "epoch": 0.45705331200417876, "eval_entropy": 1.4118124723434449, "eval_loss": 0.16126905381679535, "eval_mean_token_accuracy": 0.9638542596499126, "eval_num_tokens": 1979187620.0, "eval_runtime": 745.6037, "eval_samples_per_second": 12.951, "eval_steps_per_second": 0.102, "step": 14000 }, { "entropy": 1.415801317691803, "epoch": 0.45868564526133654, "grad_norm": 1.25, "learning_rate": 5.487832924007422e-06, "loss": 0.1483, "mean_token_accuracy": 0.9666338682174682, "num_tokens": 1986603476.0, "step": 14050 }, { "entropy": 1.4051340079307557, "epoch": 0.4603179785184943, "grad_norm": 1.078125, "learning_rate": 5.4832988728854465e-06, "loss": 0.1557, "mean_token_accuracy": 0.964282066822052, "num_tokens": 1994204498.0, "step": 14100 }, { "entropy": 1.415133674144745, "epoch": 0.4619503117756521, "grad_norm": 1.4140625, "learning_rate": 5.478746731188865e-06, "loss": 0.1397, "mean_token_accuracy": 0.9680150365829467, "num_tokens": 2000941510.0, "step": 14150 }, { "entropy": 1.4023677587509156, "epoch": 0.4635826450328099, "grad_norm": 1.8671875, "learning_rate": 5.474176532079557e-06, "loss": 0.139, "mean_token_accuracy": 0.9682874810695649, "num_tokens": 2007806068.0, "step": 14200 }, { "entropy": 1.4310323596000671, "epoch": 0.46521497828996766, "grad_norm": 1.390625, "learning_rate": 5.46958830885095e-06, "loss": 0.1513, "mean_token_accuracy": 0.9669715762138367, "num_tokens": 2014807742.0, "step": 14250 }, { "entropy": 1.4305420732498169, "epoch": 0.46684731154712544, "grad_norm": 1.09375, "learning_rate": 5.464982094927772e-06, "loss": 0.1486, "mean_token_accuracy": 0.966749495267868, "num_tokens": 2021683076.0, "step": 14300 }, { "entropy": 1.4090232229232789, "epoch": 0.4684796448042832, "grad_norm": 1.578125, "learning_rate": 5.460357923865814e-06, "loss": 0.1379, "mean_token_accuracy": 0.9685410165786743, "num_tokens": 2028415183.0, "step": 14350 }, { "entropy": 1.4059256029129028, "epoch": 0.470111978061441, "grad_norm": 1.2265625, "learning_rate": 5.4557158293516845e-06, "loss": 0.1368, "mean_token_accuracy": 0.9691716039180756, "num_tokens": 2035435433.0, "step": 14400 }, { "entropy": 1.415133411884308, "epoch": 0.4717443113185988, "grad_norm": 1.2265625, "learning_rate": 5.451055845202559e-06, "loss": 0.1558, "mean_token_accuracy": 0.9652414166927338, "num_tokens": 2042639097.0, "step": 14450 }, { "entropy": 1.4125458192825318, "epoch": 0.47337664457575657, "grad_norm": 1.5546875, "learning_rate": 5.446378005365937e-06, "loss": 0.1465, "mean_token_accuracy": 0.9667794346809387, "num_tokens": 2049839883.0, "step": 14500 }, { "epoch": 0.47337664457575657, "eval_entropy": 1.4099786535898844, "eval_loss": 0.16086123883724213, "eval_mean_token_accuracy": 0.9639045866330465, "eval_num_tokens": 2049839883.0, "eval_runtime": 751.2639, "eval_samples_per_second": 12.853, "eval_steps_per_second": 0.101, "step": 14500 }, { "entropy": 1.4155534076690675, "epoch": 0.47500897783291435, "grad_norm": 0.97265625, "learning_rate": 5.441682343919398e-06, "loss": 0.1515, "mean_token_accuracy": 0.9656119549274444, "num_tokens": 2057493706.0, "step": 14550 }, { "entropy": 1.4232526874542237, "epoch": 0.47664131109007213, "grad_norm": 1.265625, "learning_rate": 5.436968895070349e-06, "loss": 0.1463, "mean_token_accuracy": 0.9669792699813843, "num_tokens": 2064599903.0, "step": 14600 }, { "entropy": 1.426460826396942, "epoch": 0.4782736443472299, "grad_norm": 1.34375, "learning_rate": 5.432237693155773e-06, "loss": 0.1551, "mean_token_accuracy": 0.9656824862957001, "num_tokens": 2072278266.0, "step": 14650 }, { "entropy": 1.447964243888855, "epoch": 0.4799059776043877, "grad_norm": 1.6328125, "learning_rate": 5.427488772641989e-06, "loss": 0.1505, "mean_token_accuracy": 0.9661068272590637, "num_tokens": 2079638913.0, "step": 14700 }, { "entropy": 1.443111469745636, "epoch": 0.48153831086154547, "grad_norm": 2.015625, "learning_rate": 5.422722168124386e-06, "loss": 0.1452, "mean_token_accuracy": 0.9680136227607727, "num_tokens": 2086742924.0, "step": 14750 }, { "entropy": 1.436655399799347, "epoch": 0.48317064411870325, "grad_norm": 1.515625, "learning_rate": 5.417937914327187e-06, "loss": 0.1466, "mean_token_accuracy": 0.9671316587924957, "num_tokens": 2094123207.0, "step": 14800 }, { "entropy": 1.4242712426185609, "epoch": 0.48480297737586103, "grad_norm": 1.7265625, "learning_rate": 5.413136046103181e-06, "loss": 0.1471, "mean_token_accuracy": 0.9668898284435272, "num_tokens": 2101338532.0, "step": 14850 }, { "entropy": 1.4219823241233827, "epoch": 0.4864353106330188, "grad_norm": 1.7734375, "learning_rate": 5.408316598433483e-06, "loss": 0.1387, "mean_token_accuracy": 0.9691325139999389, "num_tokens": 2107835134.0, "step": 14900 }, { "entropy": 1.4221902179718018, "epoch": 0.4880676438901766, "grad_norm": 1.6015625, "learning_rate": 5.403479606427267e-06, "loss": 0.1344, "mean_token_accuracy": 0.9696690511703491, "num_tokens": 2114891465.0, "step": 14950 }, { "entropy": 1.401020920276642, "epoch": 0.4896999771473344, "grad_norm": 1.2578125, "learning_rate": 5.398625105321518e-06, "loss": 0.143, "mean_token_accuracy": 0.9679032492637635, "num_tokens": 2121999447.0, "step": 15000 }, { "epoch": 0.4896999771473344, "eval_entropy": 1.409274689356486, "eval_loss": 0.1606949120759964, "eval_mean_token_accuracy": 0.963873782157898, "eval_num_tokens": 2121999447.0, "eval_runtime": 749.6338, "eval_samples_per_second": 12.881, "eval_steps_per_second": 0.101, "step": 15000 }, { "entropy": 1.4075271391868591, "epoch": 0.49133231040449216, "grad_norm": 1.953125, "learning_rate": 5.393753130480773e-06, "loss": 0.1422, "mean_token_accuracy": 0.967915655374527, "num_tokens": 2129364587.0, "step": 15050 }, { "entropy": 1.39142733335495, "epoch": 0.49296464366164994, "grad_norm": 1.5, "learning_rate": 5.388863717396865e-06, "loss": 0.1378, "mean_token_accuracy": 0.9694935536384582, "num_tokens": 2135978375.0, "step": 15100 }, { "entropy": 1.4059196853637694, "epoch": 0.4945969769188077, "grad_norm": 2.109375, "learning_rate": 5.383956901688659e-06, "loss": 0.1495, "mean_token_accuracy": 0.9660707736015319, "num_tokens": 2143467430.0, "step": 15150 }, { "entropy": 1.3810991740226746, "epoch": 0.4962293101759655, "grad_norm": 0.59375, "learning_rate": 5.3790327191017976e-06, "loss": 0.1421, "mean_token_accuracy": 0.9680761241912842, "num_tokens": 2150524540.0, "step": 15200 }, { "entropy": 1.3994912171363831, "epoch": 0.4978616434331233, "grad_norm": 1.3046875, "learning_rate": 5.374091205508442e-06, "loss": 0.1508, "mean_token_accuracy": 0.9663672626018525, "num_tokens": 2157785212.0, "step": 15250 }, { "entropy": 1.405203297138214, "epoch": 0.49949397669028106, "grad_norm": 1.1875, "learning_rate": 5.369132396907005e-06, "loss": 0.1478, "mean_token_accuracy": 0.9665706479549407, "num_tokens": 2164998488.0, "step": 15300 }, { "entropy": 1.4063081932067871, "epoch": 0.5011263099474389, "grad_norm": 1.953125, "learning_rate": 5.364156329421892e-06, "loss": 0.1285, "mean_token_accuracy": 0.9716930568218232, "num_tokens": 2171612673.0, "step": 15350 }, { "entropy": 1.4068853855133057, "epoch": 0.5027586432045966, "grad_norm": 1.34375, "learning_rate": 5.359163039303241e-06, "loss": 0.1465, "mean_token_accuracy": 0.9674324905872345, "num_tokens": 2179025674.0, "step": 15400 }, { "entropy": 1.4135111689567565, "epoch": 0.5043909764617545, "grad_norm": 1.9453125, "learning_rate": 5.354152562926649e-06, "loss": 0.1471, "mean_token_accuracy": 0.9673193073272706, "num_tokens": 2186052673.0, "step": 15450 }, { "entropy": 1.4252868342399596, "epoch": 0.5060233097189122, "grad_norm": 1.71875, "learning_rate": 5.349124936792918e-06, "loss": 0.1441, "mean_token_accuracy": 0.9673807907104492, "num_tokens": 2193326163.0, "step": 15500 }, { "epoch": 0.5060233097189122, "eval_entropy": 1.409689162572225, "eval_loss": 0.16070087254047394, "eval_mean_token_accuracy": 0.9643241794904073, "eval_num_tokens": 2193326163.0, "eval_runtime": 746.9801, "eval_samples_per_second": 12.927, "eval_steps_per_second": 0.102, "step": 15500 }, { "entropy": 1.4205117511749268, "epoch": 0.50765564297607, "grad_norm": 1.203125, "learning_rate": 5.344080197527782e-06, "loss": 0.1492, "mean_token_accuracy": 0.9669366598129272, "num_tokens": 2200727296.0, "step": 15550 }, { "entropy": 1.418469593524933, "epoch": 0.5092879762332277, "grad_norm": 1.3046875, "learning_rate": 5.339018381881644e-06, "loss": 0.1485, "mean_token_accuracy": 0.9666687226295472, "num_tokens": 2207977018.0, "step": 15600 }, { "entropy": 1.4072162437438964, "epoch": 0.5109203094903856, "grad_norm": 1.5078125, "learning_rate": 5.333939526729307e-06, "loss": 0.1393, "mean_token_accuracy": 0.968210985660553, "num_tokens": 2214966932.0, "step": 15650 }, { "entropy": 1.4076303386688231, "epoch": 0.5125526427475433, "grad_norm": 1.8984375, "learning_rate": 5.3288436690697e-06, "loss": 0.1411, "mean_token_accuracy": 0.9681597316265106, "num_tokens": 2222114305.0, "step": 15700 }, { "entropy": 1.4115266394615174, "epoch": 0.5141849760047011, "grad_norm": 1.453125, "learning_rate": 5.323730846025621e-06, "loss": 0.1436, "mean_token_accuracy": 0.9676663541793823, "num_tokens": 2229368804.0, "step": 15750 }, { "entropy": 1.3984081506729127, "epoch": 0.5158173092618589, "grad_norm": 1.40625, "learning_rate": 5.3186010948434535e-06, "loss": 0.1277, "mean_token_accuracy": 0.9710724341869355, "num_tokens": 2235855134.0, "step": 15800 }, { "entropy": 1.4158777141571044, "epoch": 0.5174496425190167, "grad_norm": 1.5546875, "learning_rate": 5.313454452892903e-06, "loss": 0.1412, "mean_token_accuracy": 0.9679831528663635, "num_tokens": 2242658034.0, "step": 15850 }, { "entropy": 1.4000438117980958, "epoch": 0.5190819757761744, "grad_norm": 1.6875, "learning_rate": 5.3082909576667206e-06, "loss": 0.1432, "mean_token_accuracy": 0.9681457090377807, "num_tokens": 2249786162.0, "step": 15900 }, { "entropy": 1.4153186726570128, "epoch": 0.5207143090333323, "grad_norm": 1.09375, "learning_rate": 5.303110646780435e-06, "loss": 0.1311, "mean_token_accuracy": 0.9702495551109314, "num_tokens": 2256415467.0, "step": 15950 }, { "entropy": 1.4182570719718932, "epoch": 0.52234664229049, "grad_norm": 1.4609375, "learning_rate": 5.297913557972074e-06, "loss": 0.1444, "mean_token_accuracy": 0.9671289598941804, "num_tokens": 2263118935.0, "step": 16000 }, { "epoch": 0.52234664229049, "eval_entropy": 1.4267909447352092, "eval_loss": 0.16035768389701843, "eval_mean_token_accuracy": 0.9642941602071127, "eval_num_tokens": 2263118935.0, "eval_runtime": 746.9288, "eval_samples_per_second": 12.928, "eval_steps_per_second": 0.102, "step": 16000 }, { "entropy": 1.4244341516494752, "epoch": 0.5239789755476478, "grad_norm": 1.4375, "learning_rate": 5.292699729101888e-06, "loss": 0.1341, "mean_token_accuracy": 0.9685980105400085, "num_tokens": 2270004148.0, "step": 16050 }, { "entropy": 1.433496663570404, "epoch": 0.5256113088048056, "grad_norm": 1.1875, "learning_rate": 5.2874691981520814e-06, "loss": 0.1489, "mean_token_accuracy": 0.9664645993709564, "num_tokens": 2277001614.0, "step": 16100 }, { "entropy": 1.4589428210258484, "epoch": 0.5272436420619634, "grad_norm": 1.390625, "learning_rate": 5.282222003226528e-06, "loss": 0.1494, "mean_token_accuracy": 0.9667097711563111, "num_tokens": 2283969486.0, "step": 16150 }, { "entropy": 1.4311462545394897, "epoch": 0.5288759753191211, "grad_norm": 1.1796875, "learning_rate": 5.276958182550499e-06, "loss": 0.1498, "mean_token_accuracy": 0.966657601594925, "num_tokens": 2291187492.0, "step": 16200 }, { "entropy": 1.4505498099327088, "epoch": 0.530508308576279, "grad_norm": 1.640625, "learning_rate": 5.271677774470383e-06, "loss": 0.1432, "mean_token_accuracy": 0.9675734841823578, "num_tokens": 2298489089.0, "step": 16250 }, { "entropy": 1.4610102701187133, "epoch": 0.5321406418334367, "grad_norm": 1.53125, "learning_rate": 5.2663808174534035e-06, "loss": 0.1528, "mean_token_accuracy": 0.9655195689201355, "num_tokens": 2306159999.0, "step": 16300 }, { "entropy": 1.4418502616882325, "epoch": 0.5337729750905945, "grad_norm": 1.578125, "learning_rate": 5.261067350087342e-06, "loss": 0.1448, "mean_token_accuracy": 0.9679492330551147, "num_tokens": 2313017319.0, "step": 16350 }, { "entropy": 1.4529797434806824, "epoch": 0.5354053083477522, "grad_norm": 2.0, "learning_rate": 5.255737411080258e-06, "loss": 0.1421, "mean_token_accuracy": 0.9680870652198792, "num_tokens": 2319830047.0, "step": 16400 }, { "entropy": 1.4405932116508484, "epoch": 0.5370376416049101, "grad_norm": 1.6328125, "learning_rate": 5.250391039260203e-06, "loss": 0.1341, "mean_token_accuracy": 0.9692844843864441, "num_tokens": 2326652556.0, "step": 16450 }, { "entropy": 1.4391892647743225, "epoch": 0.5386699748620678, "grad_norm": 1.203125, "learning_rate": 5.245028273574943e-06, "loss": 0.1455, "mean_token_accuracy": 0.9678148710727692, "num_tokens": 2333819381.0, "step": 16500 }, { "epoch": 0.5386699748620678, "eval_entropy": 1.4535431814193727, "eval_loss": 0.15979354083538055, "eval_mean_token_accuracy": 0.9644315036137899, "eval_num_tokens": 2333819381.0, "eval_runtime": 744.9612, "eval_samples_per_second": 12.962, "eval_steps_per_second": 0.102, "step": 16500 }, { "entropy": 1.4589093685150147, "epoch": 0.5403023081192256, "grad_norm": 1.546875, "learning_rate": 5.239649153091669e-06, "loss": 0.1366, "mean_token_accuracy": 0.9689966702461242, "num_tokens": 2340621485.0, "step": 16550 }, { "entropy": 1.4447486186027527, "epoch": 0.5419346413763834, "grad_norm": 2.34375, "learning_rate": 5.234253716996714e-06, "loss": 0.1407, "mean_token_accuracy": 0.9680201160907745, "num_tokens": 2347557447.0, "step": 16600 }, { "entropy": 1.4582811617851257, "epoch": 0.5435669746335412, "grad_norm": 1.2734375, "learning_rate": 5.228842004595271e-06, "loss": 0.1416, "mean_token_accuracy": 0.9680859756469726, "num_tokens": 2354654075.0, "step": 16650 }, { "entropy": 1.4596582174301147, "epoch": 0.5451993078906989, "grad_norm": 1.78125, "learning_rate": 5.223414055311104e-06, "loss": 0.1456, "mean_token_accuracy": 0.9669416832923889, "num_tokens": 2361479872.0, "step": 16700 }, { "entropy": 1.455612142086029, "epoch": 0.5468316411478568, "grad_norm": 1.9296875, "learning_rate": 5.217969908686259e-06, "loss": 0.1494, "mean_token_accuracy": 0.9662820196151733, "num_tokens": 2368784116.0, "step": 16750 }, { "entropy": 1.437941801548004, "epoch": 0.5484639744050145, "grad_norm": 1.390625, "learning_rate": 5.2125096043807805e-06, "loss": 0.1351, "mean_token_accuracy": 0.969438636302948, "num_tokens": 2375699232.0, "step": 16800 }, { "entropy": 1.4474022889137268, "epoch": 0.5500963076621723, "grad_norm": 1.5390625, "learning_rate": 5.2070331821724175e-06, "loss": 0.1496, "mean_token_accuracy": 0.9666418421268463, "num_tokens": 2382571188.0, "step": 16850 }, { "entropy": 1.443734383583069, "epoch": 0.55172864091933, "grad_norm": 1.359375, "learning_rate": 5.201540681956339e-06, "loss": 0.1417, "mean_token_accuracy": 0.9685157811641694, "num_tokens": 2389499925.0, "step": 16900 }, { "entropy": 1.4289966011047364, "epoch": 0.5533609741764879, "grad_norm": 1.5078125, "learning_rate": 5.196032143744837e-06, "loss": 0.1502, "mean_token_accuracy": 0.9662255728244782, "num_tokens": 2396898569.0, "step": 16950 }, { "entropy": 1.422741391658783, "epoch": 0.5549933074336456, "grad_norm": 2.109375, "learning_rate": 5.190507607667043e-06, "loss": 0.1362, "mean_token_accuracy": 0.9694574820995331, "num_tokens": 2403493744.0, "step": 17000 }, { "epoch": 0.5549933074336456, "eval_entropy": 1.410568381945292, "eval_loss": 0.1593320667743683, "eval_mean_token_accuracy": 0.9647167531649271, "eval_num_tokens": 2403493744.0, "eval_runtime": 749.6267, "eval_samples_per_second": 12.881, "eval_steps_per_second": 0.101, "step": 17000 }, { "entropy": 1.4179514050483704, "epoch": 0.5566256406908034, "grad_norm": 1.5078125, "learning_rate": 5.184967113968628e-06, "loss": 0.1437, "mean_token_accuracy": 0.9673176133632659, "num_tokens": 2410291888.0, "step": 17050 }, { "entropy": 1.420968849658966, "epoch": 0.5582579739479612, "grad_norm": 1.875, "learning_rate": 5.179410703011514e-06, "loss": 0.1416, "mean_token_accuracy": 0.9681920135021209, "num_tokens": 2417511664.0, "step": 17100 }, { "entropy": 1.4380729961395264, "epoch": 0.559890307205119, "grad_norm": 1.984375, "learning_rate": 5.173838415273578e-06, "loss": 0.1381, "mean_token_accuracy": 0.9696194708347321, "num_tokens": 2424287780.0, "step": 17150 }, { "entropy": 1.418275089263916, "epoch": 0.5615226404622767, "grad_norm": 1.796875, "learning_rate": 5.168250291348358e-06, "loss": 0.1307, "mean_token_accuracy": 0.9706324160099029, "num_tokens": 2431053313.0, "step": 17200 }, { "entropy": 1.4160829043388368, "epoch": 0.5631549737194346, "grad_norm": 1.1328125, "learning_rate": 5.162646371944757e-06, "loss": 0.139, "mean_token_accuracy": 0.9683985018730163, "num_tokens": 2438106375.0, "step": 17250 }, { "entropy": 1.400640585422516, "epoch": 0.5647873069765923, "grad_norm": 1.7421875, "learning_rate": 5.157026697886745e-06, "loss": 0.1367, "mean_token_accuracy": 0.9683994352817535, "num_tokens": 2445056771.0, "step": 17300 }, { "entropy": 1.4130174493789673, "epoch": 0.5664196402337501, "grad_norm": 1.3125, "learning_rate": 5.151391310113067e-06, "loss": 0.1459, "mean_token_accuracy": 0.9672868931293488, "num_tokens": 2452756923.0, "step": 17350 }, { "entropy": 1.4374343585968017, "epoch": 0.5680519734909079, "grad_norm": 1.7890625, "learning_rate": 5.145740249676937e-06, "loss": 0.1405, "mean_token_accuracy": 0.9680230569839477, "num_tokens": 2459581470.0, "step": 17400 }, { "entropy": 1.4422858119010926, "epoch": 0.5696843067480657, "grad_norm": 1.6640625, "learning_rate": 5.140073557745743e-06, "loss": 0.1365, "mean_token_accuracy": 0.9697402846813202, "num_tokens": 2466422672.0, "step": 17450 }, { "entropy": 1.430338339805603, "epoch": 0.5713166400052234, "grad_norm": 1.1875, "learning_rate": 5.134391275600748e-06, "loss": 0.1361, "mean_token_accuracy": 0.9687436437606811, "num_tokens": 2473543628.0, "step": 17500 }, { "epoch": 0.5713166400052234, "eval_entropy": 1.4239939387639364, "eval_loss": 0.15963123738765717, "eval_mean_token_accuracy": 0.9646872194608053, "eval_num_tokens": 2473543628.0, "eval_runtime": 746.9116, "eval_samples_per_second": 12.928, "eval_steps_per_second": 0.102, "step": 17500 }, { "entropy": 1.41823988199234, "epoch": 0.5729489732623813, "grad_norm": 2.1875, "learning_rate": 5.12869344463679e-06, "loss": 0.1349, "mean_token_accuracy": 0.969758038520813, "num_tokens": 2480227038.0, "step": 17550 }, { "entropy": 1.423335657119751, "epoch": 0.574581306519539, "grad_norm": 1.3125, "learning_rate": 5.122980106361973e-06, "loss": 0.1427, "mean_token_accuracy": 0.9678147983551025, "num_tokens": 2487475967.0, "step": 17600 }, { "entropy": 1.4069186902046205, "epoch": 0.5762136397766968, "grad_norm": 1.171875, "learning_rate": 5.117251302397376e-06, "loss": 0.138, "mean_token_accuracy": 0.9696054446697235, "num_tokens": 2494547025.0, "step": 17650 }, { "entropy": 1.4386369252204896, "epoch": 0.5778459730338545, "grad_norm": 1.90625, "learning_rate": 5.111507074476741e-06, "loss": 0.1677, "mean_token_accuracy": 0.9627214801311493, "num_tokens": 2502024527.0, "step": 17700 }, { "entropy": 1.4271745777130127, "epoch": 0.5794783062910124, "grad_norm": 2.078125, "learning_rate": 5.105747464446171e-06, "loss": 0.1462, "mean_token_accuracy": 0.9673130023479461, "num_tokens": 2509136097.0, "step": 17750 }, { "entropy": 1.4218875932693482, "epoch": 0.5811106395481701, "grad_norm": 2.078125, "learning_rate": 5.099972514263828e-06, "loss": 0.1425, "mean_token_accuracy": 0.9676541352272033, "num_tokens": 2516010870.0, "step": 17800 }, { "entropy": 1.4141360521316528, "epoch": 0.5827429728053279, "grad_norm": 1.171875, "learning_rate": 5.094182265999625e-06, "loss": 0.144, "mean_token_accuracy": 0.9672036898136139, "num_tokens": 2523293871.0, "step": 17850 }, { "entropy": 1.4077139925956725, "epoch": 0.5843753060624857, "grad_norm": 1.375, "learning_rate": 5.0883767618349205e-06, "loss": 0.1419, "mean_token_accuracy": 0.9684661495685577, "num_tokens": 2530147747.0, "step": 17900 }, { "entropy": 1.407555968761444, "epoch": 0.5860076393196435, "grad_norm": 1.5078125, "learning_rate": 5.082556044062209e-06, "loss": 0.1277, "mean_token_accuracy": 0.9704045712947845, "num_tokens": 2536714168.0, "step": 17950 }, { "entropy": 1.4110585117340089, "epoch": 0.5876399725768012, "grad_norm": 1.3125, "learning_rate": 5.0767201550848155e-06, "loss": 0.1402, "mean_token_accuracy": 0.9680717265605927, "num_tokens": 2543453566.0, "step": 18000 }, { "epoch": 0.5876399725768012, "eval_entropy": 1.3970834159851073, "eval_loss": 0.15939612686634064, "eval_mean_token_accuracy": 0.9646043960253398, "eval_num_tokens": 2543453566.0, "eval_runtime": 745.3104, "eval_samples_per_second": 12.956, "eval_steps_per_second": 0.102, "step": 18000 }, { "entropy": 1.405302128791809, "epoch": 0.5892723058339591, "grad_norm": 1.8203125, "learning_rate": 5.070869137416586e-06, "loss": 0.1392, "mean_token_accuracy": 0.9686900818347931, "num_tokens": 2550316340.0, "step": 18050 }, { "entropy": 1.409914915561676, "epoch": 0.5909046390911168, "grad_norm": 1.0703125, "learning_rate": 5.065003033681577e-06, "loss": 0.1464, "mean_token_accuracy": 0.9667869770526886, "num_tokens": 2557419151.0, "step": 18100 }, { "entropy": 1.4183743119239807, "epoch": 0.5925369723482746, "grad_norm": 1.7890625, "learning_rate": 5.059121886613746e-06, "loss": 0.1463, "mean_token_accuracy": 0.9668680250644683, "num_tokens": 2564861132.0, "step": 18150 }, { "entropy": 1.4198586773872375, "epoch": 0.5941693056054324, "grad_norm": 1.515625, "learning_rate": 5.053225739056638e-06, "loss": 0.1395, "mean_token_accuracy": 0.9689326965808869, "num_tokens": 2571587295.0, "step": 18200 }, { "entropy": 1.4410333514213562, "epoch": 0.5958016388625902, "grad_norm": 2.09375, "learning_rate": 5.047314633963077e-06, "loss": 0.142, "mean_token_accuracy": 0.9676812827587128, "num_tokens": 2578976320.0, "step": 18250 }, { "entropy": 1.4181599235534668, "epoch": 0.5974339721197479, "grad_norm": 1.46875, "learning_rate": 5.04138861439485e-06, "loss": 0.1388, "mean_token_accuracy": 0.968032683134079, "num_tokens": 2585791943.0, "step": 18300 }, { "entropy": 1.4222650122642517, "epoch": 0.5990663053769058, "grad_norm": 2.0, "learning_rate": 5.0354477235223945e-06, "loss": 0.1479, "mean_token_accuracy": 0.9662945425510406, "num_tokens": 2592969827.0, "step": 18350 }, { "entropy": 1.4107950401306153, "epoch": 0.6006986386340635, "grad_norm": 1.6171875, "learning_rate": 5.029492004624484e-06, "loss": 0.1495, "mean_token_accuracy": 0.9669906544685364, "num_tokens": 2600453000.0, "step": 18400 }, { "entropy": 1.4090588116645812, "epoch": 0.6023309718912213, "grad_norm": 2.578125, "learning_rate": 5.023521501087913e-06, "loss": 0.142, "mean_token_accuracy": 0.9677951109409332, "num_tokens": 2607819604.0, "step": 18450 }, { "entropy": 1.4200125312805176, "epoch": 0.6039633051483791, "grad_norm": 1.6875, "learning_rate": 5.017536256407179e-06, "loss": 0.1471, "mean_token_accuracy": 0.9666409981250763, "num_tokens": 2615067666.0, "step": 18500 }, { "epoch": 0.6039633051483791, "eval_entropy": 1.4210956970850626, "eval_loss": 0.15948741137981415, "eval_mean_token_accuracy": 0.9648204270998637, "eval_num_tokens": 2615067666.0, "eval_runtime": 748.5938, "eval_samples_per_second": 12.899, "eval_steps_per_second": 0.102, "step": 18500 }, { "entropy": 1.4314108753204347, "epoch": 0.6055956384055369, "grad_norm": 1.421875, "learning_rate": 5.011536314184171e-06, "loss": 0.1353, "mean_token_accuracy": 0.9691914403438568, "num_tokens": 2621774246.0, "step": 18550 }, { "entropy": 1.438691704273224, "epoch": 0.6072279716626947, "grad_norm": 1.53125, "learning_rate": 5.0055217181278435e-06, "loss": 0.152, "mean_token_accuracy": 0.9660729610919953, "num_tokens": 2628968865.0, "step": 18600 }, { "entropy": 1.4327943706512452, "epoch": 0.6088603049198524, "grad_norm": 0.9453125, "learning_rate": 4.999492512053904e-06, "loss": 0.1429, "mean_token_accuracy": 0.9679563343524933, "num_tokens": 2636298361.0, "step": 18650 }, { "entropy": 1.419541118144989, "epoch": 0.6104926381770103, "grad_norm": 1.6953125, "learning_rate": 4.993448739884496e-06, "loss": 0.1365, "mean_token_accuracy": 0.9690783095359802, "num_tokens": 2643149114.0, "step": 18700 }, { "entropy": 1.433673801422119, "epoch": 0.612124971434168, "grad_norm": 1.3203125, "learning_rate": 4.98739044564787e-06, "loss": 0.1387, "mean_token_accuracy": 0.9684871184825897, "num_tokens": 2649909312.0, "step": 18750 }, { "entropy": 1.41679913520813, "epoch": 0.6137573046913258, "grad_norm": 1.859375, "learning_rate": 4.9813176734780714e-06, "loss": 0.1351, "mean_token_accuracy": 0.9691712772846222, "num_tokens": 2656772858.0, "step": 18800 }, { "entropy": 1.4235585713386536, "epoch": 0.6153896379484836, "grad_norm": 1.3125, "learning_rate": 4.975230467614616e-06, "loss": 0.1413, "mean_token_accuracy": 0.9680274319648743, "num_tokens": 2664404838.0, "step": 18850 }, { "entropy": 1.452963993549347, "epoch": 0.6170219712056414, "grad_norm": 1.4765625, "learning_rate": 4.969128872402166e-06, "loss": 0.1479, "mean_token_accuracy": 0.9667314755916595, "num_tokens": 2671712943.0, "step": 18900 }, { "entropy": 1.4436225032806396, "epoch": 0.6186543044627991, "grad_norm": 1.578125, "learning_rate": 4.96301293229021e-06, "loss": 0.1499, "mean_token_accuracy": 0.9664954626560212, "num_tokens": 2678603623.0, "step": 18950 }, { "entropy": 1.4565304446220397, "epoch": 0.620286637719957, "grad_norm": 2.0625, "learning_rate": 4.9568826918327375e-06, "loss": 0.1481, "mean_token_accuracy": 0.9671436643600464, "num_tokens": 2685887315.0, "step": 19000 }, { "epoch": 0.620286637719957, "eval_entropy": 1.4394246594111124, "eval_loss": 0.15968339145183563, "eval_mean_token_accuracy": 0.9645526838302613, "eval_num_tokens": 2685887315.0, "eval_runtime": 743.8859, "eval_samples_per_second": 12.98, "eval_steps_per_second": 0.102, "step": 19000 }, { "entropy": 1.4509250116348267, "epoch": 0.6219189709771147, "grad_norm": 1.5078125, "learning_rate": 4.950738195687914e-06, "loss": 0.1478, "mean_token_accuracy": 0.9675949096679688, "num_tokens": 2692939859.0, "step": 19050 }, { "entropy": 1.4424961733818054, "epoch": 0.6235513042342725, "grad_norm": 1.5859375, "learning_rate": 4.944579488617754e-06, "loss": 0.1342, "mean_token_accuracy": 0.9699901640415192, "num_tokens": 2699651700.0, "step": 19100 }, { "entropy": 1.43824138879776, "epoch": 0.6251836374914302, "grad_norm": 1.6953125, "learning_rate": 4.938406615487804e-06, "loss": 0.1498, "mean_token_accuracy": 0.9661287260055542, "num_tokens": 2707480098.0, "step": 19150 }, { "entropy": 1.4361098647117614, "epoch": 0.6268159707485881, "grad_norm": 1.671875, "learning_rate": 4.9322196212668e-06, "loss": 0.1563, "mean_token_accuracy": 0.9649386501312256, "num_tokens": 2715153146.0, "step": 19200 }, { "entropy": 1.4300830745697022, "epoch": 0.6284483040057458, "grad_norm": 1.15625, "learning_rate": 4.9260185510263546e-06, "loss": 0.1363, "mean_token_accuracy": 0.9690042114257813, "num_tokens": 2722322826.0, "step": 19250 }, { "entropy": 1.423244218826294, "epoch": 0.6300806372629036, "grad_norm": 2.140625, "learning_rate": 4.919803449940621e-06, "loss": 0.1253, "mean_token_accuracy": 0.9720934724807739, "num_tokens": 2728961876.0, "step": 19300 }, { "entropy": 1.4357132482528687, "epoch": 0.6317129705200614, "grad_norm": 1.2265625, "learning_rate": 4.913574363285965e-06, "loss": 0.1451, "mean_token_accuracy": 0.967208684682846, "num_tokens": 2735922821.0, "step": 19350 }, { "entropy": 1.4567182993888854, "epoch": 0.6333453037772192, "grad_norm": 1.1953125, "learning_rate": 4.907331336440637e-06, "loss": 0.1533, "mean_token_accuracy": 0.9658544027805328, "num_tokens": 2743184778.0, "step": 19400 }, { "entropy": 1.4356631135940552, "epoch": 0.6349776370343769, "grad_norm": 1.15625, "learning_rate": 4.9010744148844414e-06, "loss": 0.1392, "mean_token_accuracy": 0.9687076592445374, "num_tokens": 2750183248.0, "step": 19450 }, { "entropy": 1.4388001561164856, "epoch": 0.6366099702915348, "grad_norm": 1.4296875, "learning_rate": 4.8948036441984e-06, "loss": 0.1408, "mean_token_accuracy": 0.9690019035339356, "num_tokens": 2757183194.0, "step": 19500 }, { "epoch": 0.6366099702915348, "eval_entropy": 1.4425183471043905, "eval_loss": 0.15926624834537506, "eval_mean_token_accuracy": 0.9642885835965475, "eval_num_tokens": 2757183194.0, "eval_runtime": 747.7266, "eval_samples_per_second": 12.914, "eval_steps_per_second": 0.102, "step": 19500 }, { "entropy": 1.4483740854263305, "epoch": 0.6382423035486925, "grad_norm": 1.59375, "learning_rate": 4.888519070064427e-06, "loss": 0.1424, "mean_token_accuracy": 0.9682291400432587, "num_tokens": 2764192867.0, "step": 19550 }, { "entropy": 1.4348323988914489, "epoch": 0.6398746368058503, "grad_norm": 1.9375, "learning_rate": 4.882220738264994e-06, "loss": 0.1485, "mean_token_accuracy": 0.9667525351047516, "num_tokens": 2771378704.0, "step": 19600 }, { "entropy": 1.431165406703949, "epoch": 0.641506970063008, "grad_norm": 2.03125, "learning_rate": 4.875908694682793e-06, "loss": 0.1387, "mean_token_accuracy": 0.9686442255973816, "num_tokens": 2778817459.0, "step": 19650 }, { "entropy": 1.4262248253822327, "epoch": 0.6431393033201659, "grad_norm": 1.9453125, "learning_rate": 4.869582985300409e-06, "loss": 0.1443, "mean_token_accuracy": 0.9666674077510834, "num_tokens": 2785809690.0, "step": 19700 }, { "entropy": 1.4198060846328735, "epoch": 0.6447716365773236, "grad_norm": 1.34375, "learning_rate": 4.8632436561999754e-06, "loss": 0.141, "mean_token_accuracy": 0.9685878479480743, "num_tokens": 2792972553.0, "step": 19750 }, { "entropy": 1.42301029920578, "epoch": 0.6464039698344815, "grad_norm": 1.234375, "learning_rate": 4.85689075356285e-06, "loss": 0.1323, "mean_token_accuracy": 0.9700925529003144, "num_tokens": 2799755524.0, "step": 19800 }, { "entropy": 1.407839720249176, "epoch": 0.6480363030916392, "grad_norm": 1.2578125, "learning_rate": 4.850524323669266e-06, "loss": 0.144, "mean_token_accuracy": 0.9683354413509369, "num_tokens": 2806934638.0, "step": 19850 }, { "entropy": 1.4161179232597352, "epoch": 0.649668636348797, "grad_norm": 1.5546875, "learning_rate": 4.844144412898006e-06, "loss": 0.1468, "mean_token_accuracy": 0.9669712007045745, "num_tokens": 2814052906.0, "step": 19900 }, { "entropy": 1.4124717712402344, "epoch": 0.6513009696059547, "grad_norm": 1.5390625, "learning_rate": 4.83775106772606e-06, "loss": 0.1368, "mean_token_accuracy": 0.9689941215515137, "num_tokens": 2820334438.0, "step": 19950 }, { "entropy": 1.4184478282928468, "epoch": 0.6529333028631126, "grad_norm": 2.0625, "learning_rate": 4.8313443347282805e-06, "loss": 0.1377, "mean_token_accuracy": 0.9692059123516082, "num_tokens": 2826961099.0, "step": 20000 }, { "epoch": 0.6529333028631126, "eval_entropy": 1.4259551413853964, "eval_loss": 0.15875579416751862, "eval_mean_token_accuracy": 0.9646251447995504, "eval_num_tokens": 2826961099.0, "eval_runtime": 744.3635, "eval_samples_per_second": 12.972, "eval_steps_per_second": 0.102, "step": 20000 }, { "entropy": 1.4160712957382202, "epoch": 0.6545656361202703, "grad_norm": 1.484375, "learning_rate": 4.824924260577056e-06, "loss": 0.1377, "mean_token_accuracy": 0.9691171681880951, "num_tokens": 2834147273.0, "step": 20050 }, { "entropy": 1.4152118015289306, "epoch": 0.6561979693774281, "grad_norm": 1.109375, "learning_rate": 4.818490892041959e-06, "loss": 0.1347, "mean_token_accuracy": 0.9702432310581207, "num_tokens": 2840408479.0, "step": 20100 }, { "entropy": 1.4018067622184753, "epoch": 0.6578303026345859, "grad_norm": 1.1953125, "learning_rate": 4.81204427598941e-06, "loss": 0.1227, "mean_token_accuracy": 0.9716095530986786, "num_tokens": 2847100048.0, "step": 20150 }, { "entropy": 1.4273248219490051, "epoch": 0.6594626358917437, "grad_norm": 1.28125, "learning_rate": 4.805584459382342e-06, "loss": 0.144, "mean_token_accuracy": 0.9675836896896363, "num_tokens": 2853932358.0, "step": 20200 }, { "entropy": 1.431253423690796, "epoch": 0.6610949691489014, "grad_norm": 1.8671875, "learning_rate": 4.799111489279844e-06, "loss": 0.1403, "mean_token_accuracy": 0.9687580478191375, "num_tokens": 2860669435.0, "step": 20250 }, { "entropy": 1.4074326467514038, "epoch": 0.6627273024060593, "grad_norm": 1.375, "learning_rate": 4.792625412836835e-06, "loss": 0.136, "mean_token_accuracy": 0.9695438253879547, "num_tokens": 2867505726.0, "step": 20300 }, { "entropy": 1.4165804195404053, "epoch": 0.664359635663217, "grad_norm": 1.296875, "learning_rate": 4.786126277303707e-06, "loss": 0.1459, "mean_token_accuracy": 0.9678076064586639, "num_tokens": 2874388589.0, "step": 20350 }, { "entropy": 1.405945236682892, "epoch": 0.6659919689203748, "grad_norm": 1.7109375, "learning_rate": 4.779614130025989e-06, "loss": 0.1434, "mean_token_accuracy": 0.9678366994857788, "num_tokens": 2881624440.0, "step": 20400 }, { "entropy": 1.4089183855056762, "epoch": 0.6676243021775325, "grad_norm": 1.0546875, "learning_rate": 4.7730890184439984e-06, "loss": 0.1402, "mean_token_accuracy": 0.9682434296607971, "num_tokens": 2888818613.0, "step": 20450 }, { "entropy": 1.4186338901519775, "epoch": 0.6692566354346904, "grad_norm": 2.375, "learning_rate": 4.766550990092494e-06, "loss": 0.1501, "mean_token_accuracy": 0.9658879733085632, "num_tokens": 2895674916.0, "step": 20500 }, { "epoch": 0.6692566354346904, "eval_entropy": 1.4077574555079142, "eval_loss": 0.15949369966983795, "eval_mean_token_accuracy": 0.9645699612299601, "eval_num_tokens": 2895674916.0, "eval_runtime": 751.7816, "eval_samples_per_second": 12.844, "eval_steps_per_second": 0.101, "step": 20500 }, { "entropy": 1.4104346489906312, "epoch": 0.6708889686918481, "grad_norm": 2.1875, "learning_rate": 4.760000092600337e-06, "loss": 0.1379, "mean_token_accuracy": 0.969134624004364, "num_tokens": 2902349294.0, "step": 20550 }, { "entropy": 1.4055168724060059, "epoch": 0.672521301949006, "grad_norm": 1.8359375, "learning_rate": 4.7534363736901334e-06, "loss": 0.1305, "mean_token_accuracy": 0.9709035861492157, "num_tokens": 2909212953.0, "step": 20600 }, { "entropy": 1.4179642391204834, "epoch": 0.6741536352061637, "grad_norm": 1.3515625, "learning_rate": 4.746859881177895e-06, "loss": 0.1462, "mean_token_accuracy": 0.9668932211399078, "num_tokens": 2916418343.0, "step": 20650 }, { "entropy": 1.3955682134628296, "epoch": 0.6757859684633215, "grad_norm": 2.03125, "learning_rate": 4.7402706629726884e-06, "loss": 0.1335, "mean_token_accuracy": 0.9696724009513855, "num_tokens": 2923371196.0, "step": 20700 }, { "entropy": 1.4021626067161561, "epoch": 0.6774183017204792, "grad_norm": 1.6875, "learning_rate": 4.733668767076282e-06, "loss": 0.1525, "mean_token_accuracy": 0.9660227704048157, "num_tokens": 2930542072.0, "step": 20750 }, { "entropy": 1.3913014960289, "epoch": 0.6790506349776371, "grad_norm": 1.453125, "learning_rate": 4.727054241582805e-06, "loss": 0.1393, "mean_token_accuracy": 0.968789451122284, "num_tokens": 2937522853.0, "step": 20800 }, { "entropy": 1.3855742335319519, "epoch": 0.6806829682347948, "grad_norm": 1.90625, "learning_rate": 4.720427134678388e-06, "loss": 0.1295, "mean_token_accuracy": 0.9704008960723877, "num_tokens": 2944112485.0, "step": 20850 }, { "entropy": 1.3942367219924927, "epoch": 0.6823153014919526, "grad_norm": 1.3828125, "learning_rate": 4.713787494640818e-06, "loss": 0.1414, "mean_token_accuracy": 0.9685467886924743, "num_tokens": 2951083570.0, "step": 20900 }, { "entropy": 1.3912032508850098, "epoch": 0.6839476347491104, "grad_norm": 1.4609375, "learning_rate": 4.707135369839182e-06, "loss": 0.1443, "mean_token_accuracy": 0.9680955350399018, "num_tokens": 2958496819.0, "step": 20950 }, { "entropy": 1.3916519379615784, "epoch": 0.6855799680062682, "grad_norm": 2.15625, "learning_rate": 4.70047080873352e-06, "loss": 0.1419, "mean_token_accuracy": 0.9678064894676208, "num_tokens": 2965544584.0, "step": 21000 }, { "epoch": 0.6855799680062682, "eval_entropy": 1.3711633268992107, "eval_loss": 0.15859009325504303, "eval_mean_token_accuracy": 0.9645708012580871, "eval_num_tokens": 2965544584.0, "eval_runtime": 749.4291, "eval_samples_per_second": 12.884, "eval_steps_per_second": 0.101, "step": 21000 }, { "entropy": 1.3885661768913269, "epoch": 0.6872123012634259, "grad_norm": 1.546875, "learning_rate": 4.693793859874469e-06, "loss": 0.1389, "mean_token_accuracy": 0.968884084224701, "num_tokens": 2972445388.0, "step": 21050 }, { "entropy": 1.3977628707885743, "epoch": 0.6888446345205838, "grad_norm": 1.1796875, "learning_rate": 4.687104571902907e-06, "loss": 0.153, "mean_token_accuracy": 0.9655460596084595, "num_tokens": 2979970737.0, "step": 21100 }, { "entropy": 1.3903096175193788, "epoch": 0.6904769677777415, "grad_norm": 2.21875, "learning_rate": 4.680402993549603e-06, "loss": 0.1434, "mean_token_accuracy": 0.9677474045753479, "num_tokens": 2986677415.0, "step": 21150 }, { "entropy": 1.404201271533966, "epoch": 0.6921093010348993, "grad_norm": 1.7421875, "learning_rate": 4.673689173634861e-06, "loss": 0.1581, "mean_token_accuracy": 0.9651612138748169, "num_tokens": 2994298485.0, "step": 21200 }, { "entropy": 1.3858560705184937, "epoch": 0.693741634292057, "grad_norm": 1.828125, "learning_rate": 4.666963161068162e-06, "loss": 0.1459, "mean_token_accuracy": 0.967083740234375, "num_tokens": 3001141837.0, "step": 21250 }, { "entropy": 1.3804667377471924, "epoch": 0.6953739675492149, "grad_norm": 1.6796875, "learning_rate": 4.660225004847808e-06, "loss": 0.1464, "mean_token_accuracy": 0.9659059000015259, "num_tokens": 3008640955.0, "step": 21300 }, { "entropy": 1.3600660967826843, "epoch": 0.6970063008063726, "grad_norm": 1.6953125, "learning_rate": 4.65347475406057e-06, "loss": 0.1309, "mean_token_accuracy": 0.9700274157524109, "num_tokens": 3015031696.0, "step": 21350 }, { "entropy": 1.363333306312561, "epoch": 0.6986386340635304, "grad_norm": 1.4453125, "learning_rate": 4.646712457881323e-06, "loss": 0.1455, "mean_token_accuracy": 0.9672285616397858, "num_tokens": 3022491104.0, "step": 21400 }, { "entropy": 1.3792692565917968, "epoch": 0.7002709673206882, "grad_norm": 2.234375, "learning_rate": 4.639938165572694e-06, "loss": 0.1365, "mean_token_accuracy": 0.9694170689582825, "num_tokens": 3029347619.0, "step": 21450 }, { "entropy": 1.3687973999977112, "epoch": 0.701903300577846, "grad_norm": 1.171875, "learning_rate": 4.6331519264847e-06, "loss": 0.1425, "mean_token_accuracy": 0.9671109592914582, "num_tokens": 3036384532.0, "step": 21500 }, { "epoch": 0.701903300577846, "eval_entropy": 1.377278790473938, "eval_loss": 0.15810321271419525, "eval_mean_token_accuracy": 0.9647063970565796, "eval_num_tokens": 3036384532.0, "eval_runtime": 747.7284, "eval_samples_per_second": 12.914, "eval_steps_per_second": 0.102, "step": 21500 }, { "entropy": 1.3783918046951293, "epoch": 0.7035356338350037, "grad_norm": 1.625, "learning_rate": 4.626353790054387e-06, "loss": 0.1487, "mean_token_accuracy": 0.967153193950653, "num_tokens": 3043409455.0, "step": 21550 }, { "entropy": 1.3809619188308715, "epoch": 0.7051679670921616, "grad_norm": 1.078125, "learning_rate": 4.619543805805475e-06, "loss": 0.145, "mean_token_accuracy": 0.9672031688690186, "num_tokens": 3050598353.0, "step": 21600 }, { "entropy": 1.3623022150993347, "epoch": 0.7068003003493193, "grad_norm": 1.5, "learning_rate": 4.612722023347991e-06, "loss": 0.1383, "mean_token_accuracy": 0.9682861661911011, "num_tokens": 3057976676.0, "step": 21650 }, { "entropy": 1.361666476726532, "epoch": 0.7084326336064771, "grad_norm": 1.4921875, "learning_rate": 4.6058884923779135e-06, "loss": 0.143, "mean_token_accuracy": 0.9676965260505677, "num_tokens": 3065507507.0, "step": 21700 }, { "entropy": 1.3833092284202575, "epoch": 0.7100649668636348, "grad_norm": 1.84375, "learning_rate": 4.599043262676806e-06, "loss": 0.1443, "mean_token_accuracy": 0.9680466854572296, "num_tokens": 3072550373.0, "step": 21750 }, { "entropy": 1.386215124130249, "epoch": 0.7116973001207927, "grad_norm": 2.015625, "learning_rate": 4.592186384111457e-06, "loss": 0.1457, "mean_token_accuracy": 0.9672110736370086, "num_tokens": 3079372014.0, "step": 21800 }, { "entropy": 1.3695436239242553, "epoch": 0.7133296333779504, "grad_norm": 1.6640625, "learning_rate": 4.585317906633516e-06, "loss": 0.1291, "mean_token_accuracy": 0.9703351008892059, "num_tokens": 3086062214.0, "step": 21850 }, { "entropy": 1.3959812355041503, "epoch": 0.7149619666351082, "grad_norm": 1.8046875, "learning_rate": 4.578437880279126e-06, "loss": 0.1269, "mean_token_accuracy": 0.9709628915786743, "num_tokens": 3092573548.0, "step": 21900 }, { "entropy": 1.3880902981758119, "epoch": 0.716594299892266, "grad_norm": 1.984375, "learning_rate": 4.571546355168567e-06, "loss": 0.1414, "mean_token_accuracy": 0.9681686234474182, "num_tokens": 3099550809.0, "step": 21950 }, { "entropy": 1.40864750623703, "epoch": 0.7182266331494238, "grad_norm": 1.1953125, "learning_rate": 4.564643381505886e-06, "loss": 0.1468, "mean_token_accuracy": 0.9665729129314422, "num_tokens": 3106864257.0, "step": 22000 }, { "epoch": 0.7182266331494238, "eval_entropy": 1.3993131558100382, "eval_loss": 0.15842117369174957, "eval_mean_token_accuracy": 0.9647385891278585, "eval_num_tokens": 3106864257.0, "eval_runtime": 751.7197, "eval_samples_per_second": 12.845, "eval_steps_per_second": 0.101, "step": 22000 }, { "entropy": 1.3926187753677368, "epoch": 0.7198589664065815, "grad_norm": 1.5234375, "learning_rate": 4.557729009578527e-06, "loss": 0.14, "mean_token_accuracy": 0.9683624911308288, "num_tokens": 3113827777.0, "step": 22050 }, { "entropy": 1.3936706256866456, "epoch": 0.7214912996637394, "grad_norm": 1.828125, "learning_rate": 4.550803289756973e-06, "loss": 0.1333, "mean_token_accuracy": 0.9700661396980286, "num_tokens": 3120906575.0, "step": 22100 }, { "entropy": 1.3945861506462096, "epoch": 0.7231236329208971, "grad_norm": 1.1953125, "learning_rate": 4.543866272494375e-06, "loss": 0.1535, "mean_token_accuracy": 0.9649194324016571, "num_tokens": 3128776941.0, "step": 22150 }, { "entropy": 1.4023722219467163, "epoch": 0.7247559661780549, "grad_norm": 1.125, "learning_rate": 4.536918008326183e-06, "loss": 0.137, "mean_token_accuracy": 0.9688715541362762, "num_tokens": 3135678296.0, "step": 22200 }, { "entropy": 1.4066932153701783, "epoch": 0.7263882994352127, "grad_norm": 1.6015625, "learning_rate": 4.529958547869781e-06, "loss": 0.1373, "mean_token_accuracy": 0.968953766822815, "num_tokens": 3142368713.0, "step": 22250 }, { "entropy": 1.4128772020339966, "epoch": 0.7280206326923705, "grad_norm": 2.125, "learning_rate": 4.5229879418241155e-06, "loss": 0.1431, "mean_token_accuracy": 0.9682337057590484, "num_tokens": 3149212589.0, "step": 22300 }, { "entropy": 1.4088830590248107, "epoch": 0.7296529659495282, "grad_norm": 1.9609375, "learning_rate": 4.516006240969329e-06, "loss": 0.1481, "mean_token_accuracy": 0.9669530403614044, "num_tokens": 3156794748.0, "step": 22350 }, { "entropy": 1.41916588306427, "epoch": 0.7312852992066861, "grad_norm": 1.1484375, "learning_rate": 4.509013496166387e-06, "loss": 0.1429, "mean_token_accuracy": 0.9679375386238098, "num_tokens": 3164068091.0, "step": 22400 }, { "entropy": 1.408693754673004, "epoch": 0.7329176324638438, "grad_norm": 1.078125, "learning_rate": 4.5020097583567104e-06, "loss": 0.1324, "mean_token_accuracy": 0.9701212620735169, "num_tokens": 3170772040.0, "step": 22450 }, { "entropy": 1.4141771841049193, "epoch": 0.7345499657210016, "grad_norm": 1.484375, "learning_rate": 4.4949950785618025e-06, "loss": 0.1394, "mean_token_accuracy": 0.9690138208866119, "num_tokens": 3177787601.0, "step": 22500 }, { "epoch": 0.7345499657210016, "eval_entropy": 1.405626532236735, "eval_loss": 0.1584155559539795, "eval_mean_token_accuracy": 0.9649723815917969, "eval_num_tokens": 3177787601.0, "eval_runtime": 749.0195, "eval_samples_per_second": 12.892, "eval_steps_per_second": 0.101, "step": 22500 }, { "entropy": 1.409652578830719, "epoch": 0.7361822989781593, "grad_norm": 1.515625, "learning_rate": 4.4879695078828765e-06, "loss": 0.1447, "mean_token_accuracy": 0.9667559254169464, "num_tokens": 3184597089.0, "step": 22550 }, { "entropy": 1.3969360828399657, "epoch": 0.7378146322353172, "grad_norm": 1.8203125, "learning_rate": 4.480933097500489e-06, "loss": 0.1261, "mean_token_accuracy": 0.9719417309761047, "num_tokens": 3191019437.0, "step": 22600 }, { "entropy": 1.4107865738868712, "epoch": 0.7394469654924749, "grad_norm": 2.28125, "learning_rate": 4.473885898674155e-06, "loss": 0.1444, "mean_token_accuracy": 0.9674191176891327, "num_tokens": 3198108407.0, "step": 22650 }, { "entropy": 1.4059437608718872, "epoch": 0.7410792987496327, "grad_norm": 1.71875, "learning_rate": 4.4668279627419904e-06, "loss": 0.1408, "mean_token_accuracy": 0.9680229306221009, "num_tokens": 3205399672.0, "step": 22700 }, { "entropy": 1.3878255271911621, "epoch": 0.7427116320067905, "grad_norm": 1.3671875, "learning_rate": 4.459759341120323e-06, "loss": 0.1355, "mean_token_accuracy": 0.9695122539997101, "num_tokens": 3212086664.0, "step": 22750 }, { "entropy": 1.3905278396606446, "epoch": 0.7443439652639483, "grad_norm": 1.2265625, "learning_rate": 4.452680085303331e-06, "loss": 0.1335, "mean_token_accuracy": 0.9699708425998688, "num_tokens": 3219061680.0, "step": 22800 }, { "entropy": 1.392570481300354, "epoch": 0.745976298521106, "grad_norm": 2.0, "learning_rate": 4.445590246862656e-06, "loss": 0.1348, "mean_token_accuracy": 0.9696343839168549, "num_tokens": 3225449142.0, "step": 22850 }, { "entropy": 1.3988615465164185, "epoch": 0.7476086317782639, "grad_norm": 1.4296875, "learning_rate": 4.438489877447037e-06, "loss": 0.153, "mean_token_accuracy": 0.9651832461357117, "num_tokens": 3232829699.0, "step": 22900 }, { "entropy": 1.3806819915771484, "epoch": 0.7492409650354216, "grad_norm": 1.5546875, "learning_rate": 4.431379028781927e-06, "loss": 0.1391, "mean_token_accuracy": 0.9683671975135804, "num_tokens": 3239549563.0, "step": 22950 }, { "entropy": 1.3997270607948302, "epoch": 0.7508732982925794, "grad_norm": 1.5703125, "learning_rate": 4.424257752669122e-06, "loss": 0.1504, "mean_token_accuracy": 0.9661344397068024, "num_tokens": 3246842815.0, "step": 23000 }, { "epoch": 0.7508732982925794, "eval_entropy": 1.3884711440404256, "eval_loss": 0.1580066680908203, "eval_mean_token_accuracy": 0.9648262977600097, "eval_num_tokens": 3246842815.0, "eval_runtime": 751.2443, "eval_samples_per_second": 12.853, "eval_steps_per_second": 0.101, "step": 23000 }, { "entropy": 1.386196925640106, "epoch": 0.7525056315497372, "grad_norm": 1.7578125, "learning_rate": 4.417126100986378e-06, "loss": 0.1349, "mean_token_accuracy": 0.9699360942840576, "num_tokens": 3253700706.0, "step": 23050 }, { "entropy": 1.3982844924926758, "epoch": 0.754137964806895, "grad_norm": 2.484375, "learning_rate": 4.409984125687039e-06, "loss": 0.1469, "mean_token_accuracy": 0.9673852014541626, "num_tokens": 3260955234.0, "step": 23100 }, { "entropy": 1.4037910151481627, "epoch": 0.7557702980640527, "grad_norm": 1.5, "learning_rate": 4.402831878799652e-06, "loss": 0.1442, "mean_token_accuracy": 0.9678021275997162, "num_tokens": 3267932211.0, "step": 23150 }, { "entropy": 1.4121773409843446, "epoch": 0.7574026313212106, "grad_norm": 1.609375, "learning_rate": 4.395669412427596e-06, "loss": 0.1335, "mean_token_accuracy": 0.9696631526947022, "num_tokens": 3274869428.0, "step": 23200 }, { "entropy": 1.4119772076606751, "epoch": 0.7590349645783683, "grad_norm": 1.78125, "learning_rate": 4.388496778748694e-06, "loss": 0.1349, "mean_token_accuracy": 0.969488970041275, "num_tokens": 3281666389.0, "step": 23250 }, { "entropy": 1.3851684546470642, "epoch": 0.7606672978355261, "grad_norm": 1.078125, "learning_rate": 4.381314030014837e-06, "loss": 0.1419, "mean_token_accuracy": 0.96839430809021, "num_tokens": 3289075580.0, "step": 23300 }, { "entropy": 1.407478768825531, "epoch": 0.7622996310926838, "grad_norm": 2.046875, "learning_rate": 4.374121218551606e-06, "loss": 0.158, "mean_token_accuracy": 0.9645157742500305, "num_tokens": 3296401618.0, "step": 23350 }, { "entropy": 1.4000224781036377, "epoch": 0.7639319643498417, "grad_norm": 1.25, "learning_rate": 4.366918396757886e-06, "loss": 0.139, "mean_token_accuracy": 0.9687736296653747, "num_tokens": 3303532934.0, "step": 23400 }, { "entropy": 1.4132404017448426, "epoch": 0.7655642976069994, "grad_norm": 1.90625, "learning_rate": 4.359705617105485e-06, "loss": 0.1464, "mean_token_accuracy": 0.9665714311599731, "num_tokens": 3310772502.0, "step": 23450 }, { "entropy": 1.409326949119568, "epoch": 0.7671966308641572, "grad_norm": 1.0078125, "learning_rate": 4.352482932138756e-06, "loss": 0.1475, "mean_token_accuracy": 0.9669658172130585, "num_tokens": 3318251468.0, "step": 23500 }, { "epoch": 0.7671966308641572, "eval_entropy": 1.3898699220021566, "eval_loss": 0.15739725530147552, "eval_mean_token_accuracy": 0.9649832367897033, "eval_num_tokens": 3318251468.0, "eval_runtime": 743.2592, "eval_samples_per_second": 12.991, "eval_steps_per_second": 0.102, "step": 23500 }, { "entropy": 1.3853822755813598, "epoch": 0.768828964121315, "grad_norm": 1.4375, "learning_rate": 4.345250394474207e-06, "loss": 0.1417, "mean_token_accuracy": 0.968293867111206, "num_tokens": 3325415082.0, "step": 23550 }, { "entropy": 1.3828247284889221, "epoch": 0.7704612973784728, "grad_norm": 1.3203125, "learning_rate": 4.338008056800126e-06, "loss": 0.1408, "mean_token_accuracy": 0.9685234224796295, "num_tokens": 3332121947.0, "step": 23600 }, { "entropy": 1.4080828213691712, "epoch": 0.7720936306356305, "grad_norm": 1.546875, "learning_rate": 4.330755971876192e-06, "loss": 0.1464, "mean_token_accuracy": 0.9673770892620087, "num_tokens": 3339236872.0, "step": 23650 }, { "entropy": 1.399913146495819, "epoch": 0.7737259638927884, "grad_norm": 1.90625, "learning_rate": 4.3234941925330915e-06, "loss": 0.144, "mean_token_accuracy": 0.9670092570781708, "num_tokens": 3346331383.0, "step": 23700 }, { "entropy": 1.391896095275879, "epoch": 0.7753582971499461, "grad_norm": 1.3125, "learning_rate": 4.316222771672132e-06, "loss": 0.1356, "mean_token_accuracy": 0.9692227625846863, "num_tokens": 3353234181.0, "step": 23750 }, { "entropy": 1.4092794895172118, "epoch": 0.7769906304071039, "grad_norm": 1.53125, "learning_rate": 4.3089417622648605e-06, "loss": 0.1451, "mean_token_accuracy": 0.9670332086086273, "num_tokens": 3360224816.0, "step": 23800 }, { "entropy": 1.4031621408462525, "epoch": 0.7786229636642616, "grad_norm": 1.0859375, "learning_rate": 4.301651217352674e-06, "loss": 0.1412, "mean_token_accuracy": 0.9681469559669494, "num_tokens": 3367296325.0, "step": 23850 }, { "entropy": 1.4203896260261535, "epoch": 0.7802552969214195, "grad_norm": 1.484375, "learning_rate": 4.294351190046439e-06, "loss": 0.1433, "mean_token_accuracy": 0.9682426953315735, "num_tokens": 3373747388.0, "step": 23900 }, { "entropy": 1.412433443069458, "epoch": 0.7818876301785772, "grad_norm": 1.7421875, "learning_rate": 4.2870417335260925e-06, "loss": 0.1459, "mean_token_accuracy": 0.9671615362167358, "num_tokens": 3380836126.0, "step": 23950 }, { "entropy": 1.4091150188446044, "epoch": 0.783519963435735, "grad_norm": 1.1953125, "learning_rate": 4.2797229010402695e-06, "loss": 0.1459, "mean_token_accuracy": 0.9665989732742309, "num_tokens": 3388112439.0, "step": 24000 }, { "epoch": 0.783519963435735, "eval_entropy": 1.3812917121251425, "eval_loss": 0.15769919753074646, "eval_mean_token_accuracy": 0.964633092880249, "eval_num_tokens": 3388112439.0, "eval_runtime": 743.2853, "eval_samples_per_second": 12.991, "eval_steps_per_second": 0.102, "step": 24000 }, { "entropy": 1.3875314927101134, "epoch": 0.7851522966928928, "grad_norm": 1.515625, "learning_rate": 4.272394745905904e-06, "loss": 0.1394, "mean_token_accuracy": 0.9680357229709625, "num_tokens": 3395153920.0, "step": 24050 }, { "entropy": 1.3935593914985658, "epoch": 0.7867846299500506, "grad_norm": 1.609375, "learning_rate": 4.265057321507848e-06, "loss": 0.1223, "mean_token_accuracy": 0.9719667887687683, "num_tokens": 3401608036.0, "step": 24100 }, { "entropy": 1.3922600531578064, "epoch": 0.7884169632072083, "grad_norm": 1.140625, "learning_rate": 4.257710681298474e-06, "loss": 0.1431, "mean_token_accuracy": 0.9678590965270996, "num_tokens": 3409048396.0, "step": 24150 }, { "entropy": 1.404571294784546, "epoch": 0.7900492964643662, "grad_norm": 1.484375, "learning_rate": 4.250354878797295e-06, "loss": 0.1339, "mean_token_accuracy": 0.969120637178421, "num_tokens": 3415718906.0, "step": 24200 }, { "entropy": 1.4066750693321228, "epoch": 0.7916816297215239, "grad_norm": 1.6171875, "learning_rate": 4.242989967590568e-06, "loss": 0.1471, "mean_token_accuracy": 0.9678963148593902, "num_tokens": 3422879629.0, "step": 24250 }, { "entropy": 1.4055565428733825, "epoch": 0.7933139629786817, "grad_norm": 1.7734375, "learning_rate": 4.235616001330909e-06, "loss": 0.1386, "mean_token_accuracy": 0.9684436011314392, "num_tokens": 3430035861.0, "step": 24300 }, { "entropy": 1.3904715991020202, "epoch": 0.7949462962358395, "grad_norm": 1.6640625, "learning_rate": 4.228233033736894e-06, "loss": 0.1375, "mean_token_accuracy": 0.9683421933650971, "num_tokens": 3437370072.0, "step": 24350 }, { "entropy": 1.4026382374763489, "epoch": 0.7965786294929973, "grad_norm": 1.28125, "learning_rate": 4.22084111859268e-06, "loss": 0.1447, "mean_token_accuracy": 0.967241278886795, "num_tokens": 3444922290.0, "step": 24400 }, { "entropy": 1.4011790704727174, "epoch": 0.798210962750155, "grad_norm": 1.6484375, "learning_rate": 4.213440309747597e-06, "loss": 0.1355, "mean_token_accuracy": 0.969396059513092, "num_tokens": 3451653589.0, "step": 24450 }, { "entropy": 1.410584397315979, "epoch": 0.7998432960073129, "grad_norm": 1.3046875, "learning_rate": 4.206030661115772e-06, "loss": 0.1447, "mean_token_accuracy": 0.9678661072254181, "num_tokens": 3459121557.0, "step": 24500 }, { "epoch": 0.7998432960073129, "eval_entropy": 1.3954780069986978, "eval_loss": 0.15847522020339966, "eval_mean_token_accuracy": 0.9648264590899149, "eval_num_tokens": 3459121557.0, "eval_runtime": 748.4306, "eval_samples_per_second": 12.902, "eval_steps_per_second": 0.102, "step": 24500 }, { "entropy": 1.4040296864509583, "epoch": 0.8014756292644706, "grad_norm": 1.1875, "learning_rate": 4.198612226675727e-06, "loss": 0.1397, "mean_token_accuracy": 0.9695000052452087, "num_tokens": 3466079227.0, "step": 24550 }, { "entropy": 1.4071758961677552, "epoch": 0.8031079625216284, "grad_norm": 1.296875, "learning_rate": 4.191185060469987e-06, "loss": 0.145, "mean_token_accuracy": 0.9681662321090698, "num_tokens": 3472782801.0, "step": 24600 }, { "entropy": 1.4046114492416382, "epoch": 0.8047402957787863, "grad_norm": 1.0859375, "learning_rate": 4.183749216604685e-06, "loss": 0.137, "mean_token_accuracy": 0.9698592948913575, "num_tokens": 3479515350.0, "step": 24650 }, { "entropy": 1.4098283767700195, "epoch": 0.806372629035944, "grad_norm": 1.875, "learning_rate": 4.1763047492491746e-06, "loss": 0.1386, "mean_token_accuracy": 0.9687701988220215, "num_tokens": 3486524754.0, "step": 24700 }, { "entropy": 1.39289204120636, "epoch": 0.8080049622931018, "grad_norm": 1.375, "learning_rate": 4.1688517126356256e-06, "loss": 0.1331, "mean_token_accuracy": 0.9696247518062592, "num_tokens": 3493517440.0, "step": 24750 }, { "entropy": 1.3983655071258545, "epoch": 0.8096372955502595, "grad_norm": 1.25, "learning_rate": 4.161390161058637e-06, "loss": 0.1455, "mean_token_accuracy": 0.9675554573535919, "num_tokens": 3500746527.0, "step": 24800 }, { "entropy": 1.4149045872688293, "epoch": 0.8112696288074174, "grad_norm": 0.490234375, "learning_rate": 4.153920148874839e-06, "loss": 0.1459, "mean_token_accuracy": 0.9668138778209686, "num_tokens": 3507701444.0, "step": 24850 }, { "entropy": 1.4177529954910277, "epoch": 0.8129019620645751, "grad_norm": 1.7109375, "learning_rate": 4.146441730502496e-06, "loss": 0.152, "mean_token_accuracy": 0.9660572922229766, "num_tokens": 3514490276.0, "step": 24900 }, { "entropy": 1.4248465538024901, "epoch": 0.8145342953217329, "grad_norm": 1.59375, "learning_rate": 4.1389549604211064e-06, "loss": 0.1481, "mean_token_accuracy": 0.966586571931839, "num_tokens": 3521708251.0, "step": 24950 }, { "entropy": 1.4028679990768433, "epoch": 0.8161666285788907, "grad_norm": 2.171875, "learning_rate": 4.131459893171016e-06, "loss": 0.1293, "mean_token_accuracy": 0.9706797707080841, "num_tokens": 3528390818.0, "step": 25000 }, { "epoch": 0.8161666285788907, "eval_entropy": 1.4112112029393513, "eval_loss": 0.15821218490600586, "eval_mean_token_accuracy": 0.9647614455223084, "eval_num_tokens": 3528390818.0, "eval_runtime": 747.1945, "eval_samples_per_second": 12.923, "eval_steps_per_second": 0.102, "step": 25000 }, { "entropy": 1.4167933750152588, "epoch": 0.8177989618360485, "grad_norm": 1.765625, "learning_rate": 4.1239565833530115e-06, "loss": 0.1441, "mean_token_accuracy": 0.9679395818710327, "num_tokens": 3535142016.0, "step": 25050 }, { "entropy": 1.4142408227920533, "epoch": 0.8194312950932062, "grad_norm": 1.4140625, "learning_rate": 4.116445085627926e-06, "loss": 0.1437, "mean_token_accuracy": 0.9675932359695435, "num_tokens": 3541811208.0, "step": 25100 }, { "entropy": 1.4017269968986512, "epoch": 0.8210636283503641, "grad_norm": 1.078125, "learning_rate": 4.108925454716242e-06, "loss": 0.1388, "mean_token_accuracy": 0.9687268888950348, "num_tokens": 3549027064.0, "step": 25150 }, { "entropy": 1.4196626782417296, "epoch": 0.8226959616075218, "grad_norm": 2.0, "learning_rate": 4.101397745397689e-06, "loss": 0.1374, "mean_token_accuracy": 0.9688492357730866, "num_tokens": 3556231008.0, "step": 25200 }, { "entropy": 1.4409023642539978, "epoch": 0.8243282948646796, "grad_norm": 1.7578125, "learning_rate": 4.093862012510847e-06, "loss": 0.1528, "mean_token_accuracy": 0.9662479484081268, "num_tokens": 3563607911.0, "step": 25250 }, { "entropy": 1.3985059690475463, "epoch": 0.8259606281218373, "grad_norm": 1.1015625, "learning_rate": 4.086318310952752e-06, "loss": 0.1363, "mean_token_accuracy": 0.969468570947647, "num_tokens": 3570544476.0, "step": 25300 }, { "entropy": 1.395876476764679, "epoch": 0.8275929613789952, "grad_norm": 2.046875, "learning_rate": 4.078766695678484e-06, "loss": 0.141, "mean_token_accuracy": 0.9681139755249023, "num_tokens": 3577514806.0, "step": 25350 }, { "entropy": 1.3914256238937377, "epoch": 0.8292252946361529, "grad_norm": 1.3671875, "learning_rate": 4.071207221700778e-06, "loss": 0.1417, "mean_token_accuracy": 0.9679916310310364, "num_tokens": 3584710985.0, "step": 25400 }, { "entropy": 1.3804515194892883, "epoch": 0.8308576278933107, "grad_norm": 1.6875, "learning_rate": 4.063639944089617e-06, "loss": 0.1369, "mean_token_accuracy": 0.9694907116889954, "num_tokens": 3591780461.0, "step": 25450 }, { "entropy": 1.4110288119316101, "epoch": 0.8324899611504685, "grad_norm": 2.03125, "learning_rate": 4.0560649179718345e-06, "loss": 0.1454, "mean_token_accuracy": 0.9680348300933838, "num_tokens": 3598701756.0, "step": 25500 }, { "epoch": 0.8324899611504685, "eval_entropy": 1.4156104850769042, "eval_loss": 0.1576152741909027, "eval_mean_token_accuracy": 0.9650608507792154, "eval_num_tokens": 3598701756.0, "eval_runtime": 749.1988, "eval_samples_per_second": 12.888, "eval_steps_per_second": 0.101, "step": 25500 }, { "entropy": 1.40669499874115, "epoch": 0.8341222944076263, "grad_norm": 1.3359375, "learning_rate": 4.048482198530708e-06, "loss": 0.1321, "mean_token_accuracy": 0.9696508872509003, "num_tokens": 3605779866.0, "step": 25550 }, { "entropy": 1.40333979845047, "epoch": 0.835754627664784, "grad_norm": 1.203125, "learning_rate": 4.040891841005562e-06, "loss": 0.1327, "mean_token_accuracy": 0.9699014961719513, "num_tokens": 3612587584.0, "step": 25600 }, { "entropy": 1.412612452507019, "epoch": 0.8373869609219419, "grad_norm": 1.2578125, "learning_rate": 4.033293900691364e-06, "loss": 0.1471, "mean_token_accuracy": 0.9674671077728272, "num_tokens": 3619996003.0, "step": 25650 }, { "entropy": 1.411652238368988, "epoch": 0.8390192941790996, "grad_norm": 1.7578125, "learning_rate": 4.02568843293832e-06, "loss": 0.1484, "mean_token_accuracy": 0.9667781054973602, "num_tokens": 3627359567.0, "step": 25700 }, { "entropy": 1.3998070549964905, "epoch": 0.8406516274362574, "grad_norm": 1.234375, "learning_rate": 4.0180754931514745e-06, "loss": 0.1456, "mean_token_accuracy": 0.9677145159244538, "num_tokens": 3634637886.0, "step": 25750 }, { "entropy": 1.3955191278457642, "epoch": 0.8422839606934152, "grad_norm": 1.7109375, "learning_rate": 4.010455136790304e-06, "loss": 0.1469, "mean_token_accuracy": 0.9672901368141175, "num_tokens": 3642177370.0, "step": 25800 }, { "entropy": 1.3989661598205567, "epoch": 0.843916293950573, "grad_norm": 1.2734375, "learning_rate": 4.0028274193683124e-06, "loss": 0.1401, "mean_token_accuracy": 0.9687706243991852, "num_tokens": 3649058563.0, "step": 25850 }, { "entropy": 1.4021561121940613, "epoch": 0.8455486272077307, "grad_norm": 1.9609375, "learning_rate": 3.995192396452631e-06, "loss": 0.1411, "mean_token_accuracy": 0.9682626259326935, "num_tokens": 3655854303.0, "step": 25900 }, { "entropy": 1.382779130935669, "epoch": 0.8471809604648886, "grad_norm": 2.25, "learning_rate": 3.987550123663608e-06, "loss": 0.1462, "mean_token_accuracy": 0.9676120269298554, "num_tokens": 3662624685.0, "step": 25950 }, { "entropy": 1.3848181200027465, "epoch": 0.8488132937220463, "grad_norm": 2.25, "learning_rate": 3.97990065667441e-06, "loss": 0.14, "mean_token_accuracy": 0.9676021826267243, "num_tokens": 3669341852.0, "step": 26000 }, { "epoch": 0.8488132937220463, "eval_entropy": 1.3808231941858928, "eval_loss": 0.15642417967319489, "eval_mean_token_accuracy": 0.9652521824836731, "eval_num_tokens": 3669341852.0, "eval_runtime": 753.4968, "eval_samples_per_second": 12.815, "eval_steps_per_second": 0.101, "step": 26000 }, { "entropy": 1.388938684463501, "epoch": 0.8504456269792041, "grad_norm": 1.8984375, "learning_rate": 3.972244051210608e-06, "loss": 0.1499, "mean_token_accuracy": 0.9663873422145843, "num_tokens": 3676639962.0, "step": 26050 }, { "entropy": 1.3947418189048768, "epoch": 0.8520779602363618, "grad_norm": 1.2421875, "learning_rate": 3.964580363049779e-06, "loss": 0.1446, "mean_token_accuracy": 0.9676153147220612, "num_tokens": 3683543595.0, "step": 26100 }, { "entropy": 1.3950307440757752, "epoch": 0.8537102934935197, "grad_norm": 1.2109375, "learning_rate": 3.956909648021096e-06, "loss": 0.136, "mean_token_accuracy": 0.9690661346912384, "num_tokens": 3690286971.0, "step": 26150 }, { "entropy": 1.402916703224182, "epoch": 0.8553426267506774, "grad_norm": 2.09375, "learning_rate": 3.949231962004922e-06, "loss": 0.1541, "mean_token_accuracy": 0.9650176286697387, "num_tokens": 3697865663.0, "step": 26200 }, { "entropy": 1.408561556339264, "epoch": 0.8569749600078352, "grad_norm": 2.265625, "learning_rate": 3.941547360932404e-06, "loss": 0.1375, "mean_token_accuracy": 0.9691826546192169, "num_tokens": 3704856688.0, "step": 26250 }, { "entropy": 1.3971546220779418, "epoch": 0.858607293264993, "grad_norm": 1.6171875, "learning_rate": 3.933855900785063e-06, "loss": 0.1349, "mean_token_accuracy": 0.9694166851043701, "num_tokens": 3711350984.0, "step": 26300 }, { "entropy": 1.403090295791626, "epoch": 0.8602396265221508, "grad_norm": 1.34375, "learning_rate": 3.926157637594387e-06, "loss": 0.1427, "mean_token_accuracy": 0.9675269103050232, "num_tokens": 3718476089.0, "step": 26350 }, { "entropy": 1.3978805589675902, "epoch": 0.8618719597793085, "grad_norm": 1.890625, "learning_rate": 3.918452627441425e-06, "loss": 0.1342, "mean_token_accuracy": 0.9688924837112427, "num_tokens": 3725594290.0, "step": 26400 }, { "entropy": 1.4037820672988892, "epoch": 0.8635042930364664, "grad_norm": 2.109375, "learning_rate": 3.910740926456376e-06, "loss": 0.1356, "mean_token_accuracy": 0.9694356083869934, "num_tokens": 3731938265.0, "step": 26450 }, { "entropy": 1.4001559686660767, "epoch": 0.8651366262936241, "grad_norm": 1.8515625, "learning_rate": 3.903022590818183e-06, "loss": 0.1364, "mean_token_accuracy": 0.9691901934146882, "num_tokens": 3738816334.0, "step": 26500 }, { "epoch": 0.8651366262936241, "eval_entropy": 1.3956389427185059, "eval_loss": 0.1563226878643036, "eval_mean_token_accuracy": 0.965215961933136, "eval_num_tokens": 3738816334.0, "eval_runtime": 749.9014, "eval_samples_per_second": 12.876, "eval_steps_per_second": 0.101, "step": 26500 }, { "entropy": 1.3984951615333556, "epoch": 0.8667689595507819, "grad_norm": 1.4765625, "learning_rate": 3.895297676754119e-06, "loss": 0.1413, "mean_token_accuracy": 0.9679640185832977, "num_tokens": 3745955192.0, "step": 26550 }, { "entropy": 1.3817971444129944, "epoch": 0.8684012928079397, "grad_norm": 1.390625, "learning_rate": 3.887566240539381e-06, "loss": 0.1445, "mean_token_accuracy": 0.9670935535430908, "num_tokens": 3753010501.0, "step": 26600 }, { "entropy": 1.3948706936836244, "epoch": 0.8700336260650975, "grad_norm": 1.5234375, "learning_rate": 3.87982833849668e-06, "loss": 0.1325, "mean_token_accuracy": 0.9705722856521607, "num_tokens": 3759798351.0, "step": 26650 }, { "entropy": 1.393795645236969, "epoch": 0.8716659593222552, "grad_norm": 1.0, "learning_rate": 3.87208402699583e-06, "loss": 0.1317, "mean_token_accuracy": 0.9698431146144867, "num_tokens": 3766781169.0, "step": 26700 }, { "entropy": 1.3873535466194153, "epoch": 0.873298292579413, "grad_norm": 1.9921875, "learning_rate": 3.864333362453337e-06, "loss": 0.1387, "mean_token_accuracy": 0.9688050973415375, "num_tokens": 3773957478.0, "step": 26750 }, { "entropy": 1.3917481398582459, "epoch": 0.8749306258365708, "grad_norm": 1.21875, "learning_rate": 3.856576401331988e-06, "loss": 0.1394, "mean_token_accuracy": 0.9683778524398804, "num_tokens": 3780762633.0, "step": 26800 }, { "entropy": 1.3908830904960632, "epoch": 0.8765629590937286, "grad_norm": 1.7265625, "learning_rate": 3.848813200140437e-06, "loss": 0.1547, "mean_token_accuracy": 0.9653457498550415, "num_tokens": 3788278563.0, "step": 26850 }, { "entropy": 1.4087229990959167, "epoch": 0.8781952923508863, "grad_norm": 1.90625, "learning_rate": 3.841043815432803e-06, "loss": 0.1401, "mean_token_accuracy": 0.9690191769599914, "num_tokens": 3794646188.0, "step": 26900 }, { "entropy": 1.3997572946548462, "epoch": 0.8798276256080442, "grad_norm": 1.359375, "learning_rate": 3.833268303808244e-06, "loss": 0.1366, "mean_token_accuracy": 0.9689948236942292, "num_tokens": 3801360008.0, "step": 26950 }, { "entropy": 1.3976869773864746, "epoch": 0.8814599588652019, "grad_norm": 1.3984375, "learning_rate": 3.8254867219105575e-06, "loss": 0.1307, "mean_token_accuracy": 0.970016497373581, "num_tokens": 3808022349.0, "step": 27000 }, { "epoch": 0.8814599588652019, "eval_entropy": 1.3932032998402912, "eval_loss": 0.15805409848690033, "eval_mean_token_accuracy": 0.9647916714350383, "eval_num_tokens": 3808022349.0, "eval_runtime": 750.2779, "eval_samples_per_second": 12.87, "eval_steps_per_second": 0.101, "step": 27000 }, { "entropy": 1.406122510433197, "epoch": 0.8830922921223597, "grad_norm": 1.625, "learning_rate": 3.8176991264277604e-06, "loss": 0.1555, "mean_token_accuracy": 0.9653628063201904, "num_tokens": 3814977879.0, "step": 27050 }, { "entropy": 1.3947002291679382, "epoch": 0.8847246253795175, "grad_norm": 1.1328125, "learning_rate": 3.809905574091676e-06, "loss": 0.1439, "mean_token_accuracy": 0.9674938654899597, "num_tokens": 3822114331.0, "step": 27100 }, { "entropy": 1.3995588779449464, "epoch": 0.8863569586366753, "grad_norm": 1.2265625, "learning_rate": 3.802106121677525e-06, "loss": 0.1389, "mean_token_accuracy": 0.9679549074172974, "num_tokens": 3829036413.0, "step": 27150 }, { "entropy": 1.397332239151001, "epoch": 0.887989291893833, "grad_norm": 1.8046875, "learning_rate": 3.7943008260035106e-06, "loss": 0.1394, "mean_token_accuracy": 0.9689336049556733, "num_tokens": 3835541715.0, "step": 27200 }, { "entropy": 1.398532338142395, "epoch": 0.8896216251509909, "grad_norm": 1.015625, "learning_rate": 3.7864897439304e-06, "loss": 0.1328, "mean_token_accuracy": 0.970375450849533, "num_tokens": 3842178219.0, "step": 27250 }, { "entropy": 1.3905003619194032, "epoch": 0.8912539584081486, "grad_norm": 1.0234375, "learning_rate": 3.778672932361116e-06, "loss": 0.138, "mean_token_accuracy": 0.9678446400165558, "num_tokens": 3849481284.0, "step": 27300 }, { "entropy": 1.4009161067008973, "epoch": 0.8928862916653064, "grad_norm": 1.7734375, "learning_rate": 3.7708504482403198e-06, "loss": 0.1369, "mean_token_accuracy": 0.9693617796897889, "num_tokens": 3855937939.0, "step": 27350 }, { "entropy": 1.3991939783096314, "epoch": 0.8945186249224641, "grad_norm": 1.2109375, "learning_rate": 3.7630223485539955e-06, "loss": 0.1391, "mean_token_accuracy": 0.9692135906219482, "num_tokens": 3863063108.0, "step": 27400 }, { "entropy": 1.4059844970703126, "epoch": 0.896150958179622, "grad_norm": 1.3046875, "learning_rate": 3.755188690329039e-06, "loss": 0.1387, "mean_token_accuracy": 0.9687328970432282, "num_tokens": 3870522891.0, "step": 27450 }, { "entropy": 1.4190341973304748, "epoch": 0.8977832914367797, "grad_norm": 1.7734375, "learning_rate": 3.747349530632837e-06, "loss": 0.137, "mean_token_accuracy": 0.9695413172245025, "num_tokens": 3877480147.0, "step": 27500 }, { "epoch": 0.8977832914367797, "eval_entropy": 1.407424882253011, "eval_loss": 0.1556614637374878, "eval_mean_token_accuracy": 0.9652939391136169, "eval_num_tokens": 3877480147.0, "eval_runtime": 746.039, "eval_samples_per_second": 12.943, "eval_steps_per_second": 0.102, "step": 27500 }, { "entropy": 1.4175709581375122, "epoch": 0.8994156246939375, "grad_norm": 1.265625, "learning_rate": 3.7395049265728537e-06, "loss": 0.1574, "mean_token_accuracy": 0.964717469215393, "num_tokens": 3884856009.0, "step": 27550 }, { "entropy": 1.4064431715011596, "epoch": 0.9010479579510953, "grad_norm": 1.3828125, "learning_rate": 3.7316549352962154e-06, "loss": 0.139, "mean_token_accuracy": 0.9685567510128021, "num_tokens": 3891927392.0, "step": 27600 }, { "entropy": 1.4132932043075561, "epoch": 0.9026802912082531, "grad_norm": 1.1328125, "learning_rate": 3.7237996139892955e-06, "loss": 0.1441, "mean_token_accuracy": 0.9683604872226715, "num_tokens": 3899174699.0, "step": 27650 }, { "entropy": 1.3990925669670105, "epoch": 0.9043126244654108, "grad_norm": 1.0703125, "learning_rate": 3.7159390198772933e-06, "loss": 0.1453, "mean_token_accuracy": 0.9674279451370239, "num_tokens": 3906648455.0, "step": 27700 }, { "entropy": 1.3980679297447205, "epoch": 0.9059449577225687, "grad_norm": 0.9453125, "learning_rate": 3.7080732102238214e-06, "loss": 0.13, "mean_token_accuracy": 0.9701244246959686, "num_tokens": 3913624677.0, "step": 27750 }, { "entropy": 1.4017365527153016, "epoch": 0.9075772909797264, "grad_norm": 1.6171875, "learning_rate": 3.700202242330488e-06, "loss": 0.1335, "mean_token_accuracy": 0.9692477977275848, "num_tokens": 3920103211.0, "step": 27800 }, { "entropy": 1.3995830225944519, "epoch": 0.9092096242368842, "grad_norm": 1.5078125, "learning_rate": 3.6923261735364753e-06, "loss": 0.1343, "mean_token_accuracy": 0.9691221857070923, "num_tokens": 3927208526.0, "step": 27850 }, { "entropy": 1.3959479594230653, "epoch": 0.910841957494042, "grad_norm": 1.2265625, "learning_rate": 3.6844450612181293e-06, "loss": 0.1442, "mean_token_accuracy": 0.967134929895401, "num_tokens": 3934542379.0, "step": 27900 }, { "entropy": 1.391806445121765, "epoch": 0.9124742907511998, "grad_norm": 1.203125, "learning_rate": 3.6765589627885352e-06, "loss": 0.1313, "mean_token_accuracy": 0.9697517728805543, "num_tokens": 3941546757.0, "step": 27950 }, { "entropy": 1.3797322702407837, "epoch": 0.9141066240083575, "grad_norm": 1.6953125, "learning_rate": 3.6686679356971017e-06, "loss": 0.1352, "mean_token_accuracy": 0.9690727829933167, "num_tokens": 3948336251.0, "step": 28000 }, { "epoch": 0.9141066240083575, "eval_entropy": 1.3829597409566243, "eval_loss": 0.15241551399230957, "eval_mean_token_accuracy": 0.9657313092549642, "eval_num_tokens": 3948336251.0, "eval_runtime": 750.4693, "eval_samples_per_second": 12.867, "eval_steps_per_second": 0.101, "step": 28000 }, { "entropy": 1.3851616716384887, "epoch": 0.9157389572655154, "grad_norm": 1.1015625, "learning_rate": 3.660772037429141e-06, "loss": 0.1329, "mean_token_accuracy": 0.9694375658035278, "num_tokens": 3955743379.0, "step": 28050 }, { "entropy": 1.3846897101402282, "epoch": 0.9173712905226731, "grad_norm": 1.5390625, "learning_rate": 3.652871325505453e-06, "loss": 0.1396, "mean_token_accuracy": 0.9685408413410187, "num_tokens": 3962634196.0, "step": 28100 }, { "entropy": 1.3823327445983886, "epoch": 0.9190036237798309, "grad_norm": 1.484375, "learning_rate": 3.6449658574819062e-06, "loss": 0.1342, "mean_token_accuracy": 0.9699604260921478, "num_tokens": 3969575759.0, "step": 28150 }, { "entropy": 1.3885199642181396, "epoch": 0.9206359570369886, "grad_norm": 1.21875, "learning_rate": 3.637055690949012e-06, "loss": 0.1365, "mean_token_accuracy": 0.9687310314178467, "num_tokens": 3976309312.0, "step": 28200 }, { "entropy": 1.3812410712242127, "epoch": 0.9222682902941465, "grad_norm": 1.7734375, "learning_rate": 3.629140883531515e-06, "loss": 0.1322, "mean_token_accuracy": 0.9697493410110474, "num_tokens": 3983034311.0, "step": 28250 }, { "entropy": 1.3719140005111694, "epoch": 0.9239006235513042, "grad_norm": 2.671875, "learning_rate": 3.6212214928879643e-06, "loss": 0.1311, "mean_token_accuracy": 0.9697664487361908, "num_tokens": 3990038956.0, "step": 28300 }, { "entropy": 1.3883194899559022, "epoch": 0.925532956808462, "grad_norm": 1.734375, "learning_rate": 3.6132975767103e-06, "loss": 0.127, "mean_token_accuracy": 0.9704679012298584, "num_tokens": 3997170550.0, "step": 28350 }, { "entropy": 1.3769879937171936, "epoch": 0.9271652900656198, "grad_norm": 1.515625, "learning_rate": 3.6053691927234304e-06, "loss": 0.1318, "mean_token_accuracy": 0.9701116299629211, "num_tokens": 4003683105.0, "step": 28400 }, { "entropy": 1.36924959897995, "epoch": 0.9287976233227776, "grad_norm": 2.046875, "learning_rate": 3.5974363986848077e-06, "loss": 0.13, "mean_token_accuracy": 0.9698223459720612, "num_tokens": 4010540640.0, "step": 28450 }, { "entropy": 1.3812626338005065, "epoch": 0.9304299565799353, "grad_norm": 1.65625, "learning_rate": 3.5894992523840146e-06, "loss": 0.1395, "mean_token_accuracy": 0.9686692810058594, "num_tokens": 4017705171.0, "step": 28500 }, { "epoch": 0.9304299565799353, "eval_entropy": 1.3810707855224609, "eval_loss": 0.15061478316783905, "eval_mean_token_accuracy": 0.9661375037829081, "eval_num_tokens": 4017705171.0, "eval_runtime": 742.3916, "eval_samples_per_second": 13.007, "eval_steps_per_second": 0.102, "step": 28500 }, { "entropy": 1.3842783665657044, "epoch": 0.9320622898370932, "grad_norm": 1.8671875, "learning_rate": 3.581557811642338e-06, "loss": 0.1411, "mean_token_accuracy": 0.9674799299240112, "num_tokens": 4025097590.0, "step": 28550 }, { "entropy": 1.374339952468872, "epoch": 0.9336946230942509, "grad_norm": 1.9453125, "learning_rate": 3.57361213431235e-06, "loss": 0.1421, "mean_token_accuracy": 0.96745934009552, "num_tokens": 4032859225.0, "step": 28600 }, { "entropy": 1.3793137764930725, "epoch": 0.9353269563514087, "grad_norm": 1.78125, "learning_rate": 3.565662278277484e-06, "loss": 0.1371, "mean_token_accuracy": 0.9678510630130768, "num_tokens": 4040008030.0, "step": 28650 }, { "entropy": 1.3788374090194702, "epoch": 0.9369592896085664, "grad_norm": 1.3515625, "learning_rate": 3.5577083014516183e-06, "loss": 0.1261, "mean_token_accuracy": 0.9712175786495209, "num_tokens": 4046560521.0, "step": 28700 }, { "entropy": 1.3812962436676026, "epoch": 0.9385916228657243, "grad_norm": 1.4765625, "learning_rate": 3.549750261778648e-06, "loss": 0.1378, "mean_token_accuracy": 0.9689911651611328, "num_tokens": 4053568855.0, "step": 28750 }, { "entropy": 1.384900779724121, "epoch": 0.940223956122882, "grad_norm": 3.3125, "learning_rate": 3.5417882172320663e-06, "loss": 0.1418, "mean_token_accuracy": 0.9679834198951721, "num_tokens": 4060892652.0, "step": 28800 }, { "entropy": 1.3779156827926635, "epoch": 0.9418562893800398, "grad_norm": 1.3515625, "learning_rate": 3.5338222258145408e-06, "loss": 0.1246, "mean_token_accuracy": 0.9717523455619812, "num_tokens": 4067536985.0, "step": 28850 }, { "entropy": 1.3658223152160645, "epoch": 0.9434886226371976, "grad_norm": 2.40625, "learning_rate": 3.525852345557493e-06, "loss": 0.1344, "mean_token_accuracy": 0.9694867217540741, "num_tokens": 4075107141.0, "step": 28900 }, { "entropy": 1.3954302740097047, "epoch": 0.9451209558943554, "grad_norm": 1.640625, "learning_rate": 3.5178786345206746e-06, "loss": 0.1341, "mean_token_accuracy": 0.9694812285900116, "num_tokens": 4082057215.0, "step": 28950 }, { "entropy": 1.3778202867507934, "epoch": 0.9467532891515131, "grad_norm": 1.984375, "learning_rate": 3.509901150791742e-06, "loss": 0.1394, "mean_token_accuracy": 0.9685069918632507, "num_tokens": 4089240268.0, "step": 29000 }, { "epoch": 0.9467532891515131, "eval_entropy": 1.3786554765701293, "eval_loss": 0.14917373657226562, "eval_mean_token_accuracy": 0.9661656268437704, "eval_num_tokens": 4089240268.0, "eval_runtime": 744.7112, "eval_samples_per_second": 12.966, "eval_steps_per_second": 0.102, "step": 29000 }, { "entropy": 1.3767950320243836, "epoch": 0.948385622408671, "grad_norm": 1.9609375, "learning_rate": 3.5019199524858355e-06, "loss": 0.1245, "mean_token_accuracy": 0.9718893337249755, "num_tokens": 4096140756.0, "step": 29050 }, { "entropy": 1.375689399242401, "epoch": 0.9500179556658287, "grad_norm": 1.2890625, "learning_rate": 3.493935097745158e-06, "loss": 0.1242, "mean_token_accuracy": 0.9714548885822296, "num_tokens": 4102843838.0, "step": 29100 }, { "entropy": 1.386333782672882, "epoch": 0.9516502889229865, "grad_norm": 1.25, "learning_rate": 3.4859466447385477e-06, "loss": 0.1364, "mean_token_accuracy": 0.9688875234127045, "num_tokens": 4109572295.0, "step": 29150 }, { "entropy": 1.3935841035842895, "epoch": 0.9532826221801443, "grad_norm": 1.40625, "learning_rate": 3.477954651661055e-06, "loss": 0.1389, "mean_token_accuracy": 0.9683682763576508, "num_tokens": 4116355893.0, "step": 29200 }, { "entropy": 1.3900840377807617, "epoch": 0.9549149554373021, "grad_norm": 1.609375, "learning_rate": 3.4699591767335203e-06, "loss": 0.1461, "mean_token_accuracy": 0.9672015142440796, "num_tokens": 4123980186.0, "step": 29250 }, { "entropy": 1.3854497838020325, "epoch": 0.9565472886944598, "grad_norm": 1.84375, "learning_rate": 3.4619602782021497e-06, "loss": 0.127, "mean_token_accuracy": 0.971345556974411, "num_tokens": 4130458823.0, "step": 29300 }, { "entropy": 1.3904473185539246, "epoch": 0.9581796219516177, "grad_norm": 1.671875, "learning_rate": 3.4539580143380884e-06, "loss": 0.1345, "mean_token_accuracy": 0.9702473485469818, "num_tokens": 4137313489.0, "step": 29350 }, { "entropy": 1.393708050251007, "epoch": 0.9598119552087754, "grad_norm": 1.5625, "learning_rate": 3.4459524434369967e-06, "loss": 0.1393, "mean_token_accuracy": 0.9680514478683472, "num_tokens": 4144869499.0, "step": 29400 }, { "entropy": 1.392387228012085, "epoch": 0.9614442884659332, "grad_norm": 1.7265625, "learning_rate": 3.437943623818631e-06, "loss": 0.1244, "mean_token_accuracy": 0.9713895416259766, "num_tokens": 4151722923.0, "step": 29450 }, { "entropy": 1.3936436820030211, "epoch": 0.9630766217230909, "grad_norm": 1.296875, "learning_rate": 3.4299316138264096e-06, "loss": 0.1435, "mean_token_accuracy": 0.9673401594161988, "num_tokens": 4159401264.0, "step": 29500 }, { "epoch": 0.9630766217230909, "eval_entropy": 1.377136646906535, "eval_loss": 0.1480059176683426, "eval_mean_token_accuracy": 0.9664394434293111, "eval_num_tokens": 4159401264.0, "eval_runtime": 746.3254, "eval_samples_per_second": 12.938, "eval_steps_per_second": 0.102, "step": 29500 }, { "entropy": 1.3720522713661194, "epoch": 0.9647089549802488, "grad_norm": 0.00274658203125, "learning_rate": 3.4219164718269925e-06, "loss": 0.1237, "mean_token_accuracy": 0.9710344398021697, "num_tokens": 4166633451.0, "step": 29550 }, { "entropy": 1.3591685533523559, "epoch": 0.9663412882374065, "grad_norm": 1.703125, "learning_rate": 3.41389825620986e-06, "loss": 0.1271, "mean_token_accuracy": 0.9711633479595184, "num_tokens": 4173854933.0, "step": 29600 }, { "entropy": 1.3691086530685426, "epoch": 0.9679736214945643, "grad_norm": 1.09375, "learning_rate": 3.405877025386879e-06, "loss": 0.1329, "mean_token_accuracy": 0.969690408706665, "num_tokens": 4180650471.0, "step": 29650 }, { "entropy": 1.3538868117332459, "epoch": 0.9696059547517221, "grad_norm": 1.484375, "learning_rate": 3.397852837791885e-06, "loss": 0.1193, "mean_token_accuracy": 0.9719677448272706, "num_tokens": 4187582242.0, "step": 29700 }, { "entropy": 1.3645092558860779, "epoch": 0.9712382880088799, "grad_norm": 1.8828125, "learning_rate": 3.389825751880252e-06, "loss": 0.1333, "mean_token_accuracy": 0.9694900810718536, "num_tokens": 4194210247.0, "step": 29750 }, { "entropy": 1.3627040433883666, "epoch": 0.9728706212660376, "grad_norm": 1.734375, "learning_rate": 3.381795826128467e-06, "loss": 0.1332, "mean_token_accuracy": 0.9694800686836242, "num_tokens": 4201506868.0, "step": 29800 }, { "entropy": 1.3740008974075317, "epoch": 0.9745029545231955, "grad_norm": 1.7734375, "learning_rate": 3.373763119033706e-06, "loss": 0.132, "mean_token_accuracy": 0.969896445274353, "num_tokens": 4208636691.0, "step": 29850 }, { "entropy": 1.3614243865013123, "epoch": 0.9761352877803532, "grad_norm": 1.1484375, "learning_rate": 3.365727689113406e-06, "loss": 0.1333, "mean_token_accuracy": 0.9692292737960816, "num_tokens": 4215943871.0, "step": 29900 }, { "entropy": 1.3712631511688231, "epoch": 0.977767621037511, "grad_norm": 2.671875, "learning_rate": 3.3576895949048423e-06, "loss": 0.1314, "mean_token_accuracy": 0.970370488166809, "num_tokens": 4222890357.0, "step": 29950 }, { "entropy": 1.3694654393196106, "epoch": 0.9793999542946688, "grad_norm": 1.5859375, "learning_rate": 3.3496488949646945e-06, "loss": 0.132, "mean_token_accuracy": 0.9699479579925537, "num_tokens": 4229911312.0, "step": 30000 }, { "epoch": 0.9793999542946688, "eval_entropy": 1.3661458206176758, "eval_loss": 0.14751291275024414, "eval_mean_token_accuracy": 0.9666498748461405, "eval_num_tokens": 4229911312.0, "eval_runtime": 747.9793, "eval_samples_per_second": 12.909, "eval_steps_per_second": 0.102, "step": 30000 }, { "entropy": 1.3588277745246886, "epoch": 0.9810322875518266, "grad_norm": 1.5703125, "learning_rate": 3.34160564786863e-06, "loss": 0.1369, "mean_token_accuracy": 0.968973708152771, "num_tokens": 4237375692.0, "step": 30050 }, { "entropy": 1.3632358622550964, "epoch": 0.9826646208089843, "grad_norm": 1.46875, "learning_rate": 3.3335599122108676e-06, "loss": 0.1282, "mean_token_accuracy": 0.9707362723350524, "num_tokens": 4244434501.0, "step": 30100 }, { "entropy": 1.3589469051361085, "epoch": 0.9842969540661421, "grad_norm": 1.0859375, "learning_rate": 3.3255117466037573e-06, "loss": 0.133, "mean_token_accuracy": 0.9700418126583099, "num_tokens": 4251615278.0, "step": 30150 }, { "entropy": 1.3744734477996827, "epoch": 0.9859292873232999, "grad_norm": 1.53125, "learning_rate": 3.3174612096773496e-06, "loss": 0.1455, "mean_token_accuracy": 0.9672206926345825, "num_tokens": 4259009585.0, "step": 30200 }, { "entropy": 1.3715810680389404, "epoch": 0.9875616205804577, "grad_norm": 1.5, "learning_rate": 3.3094083600789717e-06, "loss": 0.1328, "mean_token_accuracy": 0.969623521566391, "num_tokens": 4266185168.0, "step": 30250 }, { "entropy": 1.3768983268737793, "epoch": 0.9891939538376154, "grad_norm": 1.1953125, "learning_rate": 3.3013532564727965e-06, "loss": 0.1399, "mean_token_accuracy": 0.9680708968639373, "num_tokens": 4273427348.0, "step": 30300 }, { "entropy": 1.3672343015670776, "epoch": 0.9908262870947733, "grad_norm": 1.2265625, "learning_rate": 3.293295957539418e-06, "loss": 0.1295, "mean_token_accuracy": 0.9705338907241822, "num_tokens": 4280054609.0, "step": 30350 }, { "entropy": 1.3644690942764282, "epoch": 0.992458620351931, "grad_norm": 1.7421875, "learning_rate": 3.2852365219754234e-06, "loss": 0.1298, "mean_token_accuracy": 0.9702000212669373, "num_tokens": 4287354435.0, "step": 30400 }, { "entropy": 1.3673407602310181, "epoch": 0.9940909536090888, "grad_norm": 2.046875, "learning_rate": 3.2771750084929644e-06, "loss": 0.1388, "mean_token_accuracy": 0.9686336624622345, "num_tokens": 4294494938.0, "step": 30450 }, { "entropy": 1.3702644801139832, "epoch": 0.9957232868662466, "grad_norm": 1.046875, "learning_rate": 3.26911147581933e-06, "loss": 0.1348, "mean_token_accuracy": 0.9689600837230682, "num_tokens": 4302306234.0, "step": 30500 }, { "epoch": 0.9957232868662466, "eval_entropy": 1.3691168228785198, "eval_loss": 0.14723782241344452, "eval_mean_token_accuracy": 0.9666363048553467, "eval_num_tokens": 4302306234.0, "eval_runtime": 744.3054, "eval_samples_per_second": 12.973, "eval_steps_per_second": 0.102, "step": 30500 }, { "entropy": 1.3547360873222352, "epoch": 0.9973556201234044, "grad_norm": 2.03125, "learning_rate": 3.2610459826965177e-06, "loss": 0.1191, "mean_token_accuracy": 0.9728741991519928, "num_tokens": 4309003875.0, "step": 30550 }, { "entropy": 1.3677322697639465, "epoch": 0.9989879533805621, "grad_norm": 1.1640625, "learning_rate": 3.2529785878808105e-06, "loss": 0.1382, "mean_token_accuracy": 0.9686129570007325, "num_tokens": 4316663353.0, "step": 30600 }, { "entropy": 1.366257793903351, "epoch": 1.0006202866377198, "grad_norm": 1.078125, "learning_rate": 3.244909350142341e-06, "loss": 0.1326, "mean_token_accuracy": 0.9687590861320495, "num_tokens": 4324200010.0, "step": 30650 }, { "entropy": 1.3630602145195008, "epoch": 1.0022526198948778, "grad_norm": 1.546875, "learning_rate": 3.2368383282646688e-06, "loss": 0.1329, "mean_token_accuracy": 0.9694711458683014, "num_tokens": 4331478206.0, "step": 30700 }, { "entropy": 1.366960186958313, "epoch": 1.0038849531520355, "grad_norm": 1.5390625, "learning_rate": 3.2287655810443514e-06, "loss": 0.1382, "mean_token_accuracy": 0.9686524891853332, "num_tokens": 4338814315.0, "step": 30750 }, { "entropy": 1.3614837670326232, "epoch": 1.0055172864091932, "grad_norm": 1.234375, "learning_rate": 3.220691167290514e-06, "loss": 0.1387, "mean_token_accuracy": 0.968714509010315, "num_tokens": 4346125127.0, "step": 30800 }, { "entropy": 1.3573535728454589, "epoch": 1.0071496196663512, "grad_norm": 1.5390625, "learning_rate": 3.2126151458244233e-06, "loss": 0.1204, "mean_token_accuracy": 0.9723792004585267, "num_tokens": 4352939968.0, "step": 30850 }, { "entropy": 1.3520698595046996, "epoch": 1.008781952923509, "grad_norm": 1.875, "learning_rate": 3.2045375754790577e-06, "loss": 0.1231, "mean_token_accuracy": 0.9714046669006348, "num_tokens": 4359619212.0, "step": 30900 }, { "entropy": 1.3647415375709533, "epoch": 1.0104142861806666, "grad_norm": 1.578125, "learning_rate": 3.196458515098679e-06, "loss": 0.1372, "mean_token_accuracy": 0.9691076791286468, "num_tokens": 4366790518.0, "step": 30950 }, { "entropy": 1.3572459483146668, "epoch": 1.0120466194378244, "grad_norm": 1.5859375, "learning_rate": 3.1883780235384036e-06, "loss": 0.13, "mean_token_accuracy": 0.9705017244815827, "num_tokens": 4373881040.0, "step": 31000 }, { "epoch": 1.0120466194378244, "eval_entropy": 1.3610133997599283, "eval_loss": 0.1467311531305313, "eval_mean_token_accuracy": 0.9667138489087422, "eval_num_tokens": 4373881040.0, "eval_runtime": 749.5562, "eval_samples_per_second": 12.882, "eval_steps_per_second": 0.101, "step": 31000 }, { "entropy": 1.3617827844619752, "epoch": 1.0136789526949823, "grad_norm": 2.078125, "learning_rate": 3.180296159663773e-06, "loss": 0.1256, "mean_token_accuracy": 0.9706788539886475, "num_tokens": 4380849198.0, "step": 31050 }, { "entropy": 1.35469162940979, "epoch": 1.01531128595214, "grad_norm": 1.140625, "learning_rate": 3.1722129823503283e-06, "loss": 0.1265, "mean_token_accuracy": 0.9702699911594391, "num_tokens": 4388053942.0, "step": 31100 }, { "entropy": 1.3688899064064026, "epoch": 1.0169436192092978, "grad_norm": 1.890625, "learning_rate": 3.1641285504831776e-06, "loss": 0.1379, "mean_token_accuracy": 0.9682463228702545, "num_tokens": 4395442873.0, "step": 31150 }, { "entropy": 1.3647472047805786, "epoch": 1.0185759524664555, "grad_norm": 1.4453125, "learning_rate": 3.156042922956568e-06, "loss": 0.1285, "mean_token_accuracy": 0.9707048869132996, "num_tokens": 4402556775.0, "step": 31200 }, { "entropy": 1.3655140924453735, "epoch": 1.0202082857236134, "grad_norm": 1.6015625, "learning_rate": 3.1479561586734553e-06, "loss": 0.1273, "mean_token_accuracy": 0.9710086095333099, "num_tokens": 4409440789.0, "step": 31250 }, { "entropy": 1.378363606929779, "epoch": 1.0218406189807712, "grad_norm": 1.8984375, "learning_rate": 3.139868316545081e-06, "loss": 0.1413, "mean_token_accuracy": 0.9676133573055268, "num_tokens": 4416910867.0, "step": 31300 }, { "entropy": 1.377373902797699, "epoch": 1.023472952237929, "grad_norm": 1.1171875, "learning_rate": 3.131779455490534e-06, "loss": 0.1353, "mean_token_accuracy": 0.9695135807991028, "num_tokens": 4424153945.0, "step": 31350 }, { "entropy": 1.3685814261436462, "epoch": 1.0251052854950866, "grad_norm": 1.5234375, "learning_rate": 3.1236896344363276e-06, "loss": 0.1261, "mean_token_accuracy": 0.9713682627677918, "num_tokens": 4431041238.0, "step": 31400 }, { "entropy": 1.3680060362815858, "epoch": 1.0267376187522446, "grad_norm": 1.3984375, "learning_rate": 3.1155989123159693e-06, "loss": 0.1308, "mean_token_accuracy": 0.9699837076663971, "num_tokens": 4438354536.0, "step": 31450 }, { "entropy": 1.3511241865158081, "epoch": 1.0283699520094023, "grad_norm": 1.4375, "learning_rate": 3.1075073480695303e-06, "loss": 0.12, "mean_token_accuracy": 0.9721428179740905, "num_tokens": 4445073229.0, "step": 31500 }, { "epoch": 1.0283699520094023, "eval_entropy": 1.3678371334075927, "eval_loss": 0.14646433293819427, "eval_mean_token_accuracy": 0.9666456254323323, "eval_num_tokens": 4445073229.0, "eval_runtime": 748.9165, "eval_samples_per_second": 12.893, "eval_steps_per_second": 0.101, "step": 31500 }, { "entropy": 1.3680097246170044, "epoch": 1.03000228526656, "grad_norm": 2.1875, "learning_rate": 3.099415000643216e-06, "loss": 0.125, "mean_token_accuracy": 0.9707134962081909, "num_tokens": 4452143066.0, "step": 31550 }, { "entropy": 1.3657612824440002, "epoch": 1.0316346185237177, "grad_norm": 1.0703125, "learning_rate": 3.0913219289889375e-06, "loss": 0.1399, "mean_token_accuracy": 0.9681135547161103, "num_tokens": 4459565328.0, "step": 31600 }, { "entropy": 1.3669775104522706, "epoch": 1.0332669517808757, "grad_norm": 1.1953125, "learning_rate": 3.083228192063883e-06, "loss": 0.1296, "mean_token_accuracy": 0.9709093308448792, "num_tokens": 4466669853.0, "step": 31650 }, { "entropy": 1.373291413784027, "epoch": 1.0348992850380334, "grad_norm": 1.6171875, "learning_rate": 3.0751338488300846e-06, "loss": 0.1319, "mean_token_accuracy": 0.969772047996521, "num_tokens": 4473341935.0, "step": 31700 }, { "entropy": 1.3762017822265624, "epoch": 1.0365316182951911, "grad_norm": 1.859375, "learning_rate": 3.0670389582539956e-06, "loss": 0.138, "mean_token_accuracy": 0.968094003200531, "num_tokens": 4480827308.0, "step": 31750 }, { "entropy": 1.3670384407043457, "epoch": 1.0381639515523489, "grad_norm": 1.4375, "learning_rate": 3.0589435793060506e-06, "loss": 0.1257, "mean_token_accuracy": 0.9709041547775269, "num_tokens": 4487979209.0, "step": 31800 }, { "entropy": 1.3619010615348817, "epoch": 1.0397962848095068, "grad_norm": 1.9140625, "learning_rate": 3.050847770960248e-06, "loss": 0.1182, "mean_token_accuracy": 0.9726350855827331, "num_tokens": 4494810042.0, "step": 31850 }, { "entropy": 1.3843055248260498, "epoch": 1.0414286180666645, "grad_norm": 1.546875, "learning_rate": 3.0427515921937097e-06, "loss": 0.1398, "mean_token_accuracy": 0.9685020220279693, "num_tokens": 4502522702.0, "step": 31900 }, { "entropy": 1.3764786529541015, "epoch": 1.0430609513238223, "grad_norm": 1.1875, "learning_rate": 3.034655101986258e-06, "loss": 0.1399, "mean_token_accuracy": 0.9681815671920776, "num_tokens": 4509891561.0, "step": 31950 }, { "entropy": 1.3768756079673767, "epoch": 1.04469328458098, "grad_norm": 1.1328125, "learning_rate": 3.026558359319985e-06, "loss": 0.1378, "mean_token_accuracy": 0.9689823544025421, "num_tokens": 4517228622.0, "step": 32000 }, { "epoch": 1.04469328458098, "eval_entropy": 1.3686085001627604, "eval_loss": 0.14597955346107483, "eval_mean_token_accuracy": 0.9667494138081868, "eval_num_tokens": 4517228622.0, "eval_runtime": 756.2048, "eval_samples_per_second": 12.769, "eval_steps_per_second": 0.101, "step": 32000 }, { "entropy": 1.3622459721565248, "epoch": 1.046325617838138, "grad_norm": 1.28125, "learning_rate": 3.01846142317882e-06, "loss": 0.1246, "mean_token_accuracy": 0.971124712228775, "num_tokens": 4524548751.0, "step": 32050 }, { "entropy": 1.360032732486725, "epoch": 1.0479579510952957, "grad_norm": 1.9296875, "learning_rate": 3.0103643525481026e-06, "loss": 0.1299, "mean_token_accuracy": 0.970292786359787, "num_tokens": 4531455869.0, "step": 32100 }, { "entropy": 1.3518124055862426, "epoch": 1.0495902843524534, "grad_norm": 3.109375, "learning_rate": 3.0022672064141524e-06, "loss": 0.1269, "mean_token_accuracy": 0.971394385099411, "num_tokens": 4538509160.0, "step": 32150 }, { "entropy": 1.3580170464515686, "epoch": 1.051222617609611, "grad_norm": 1.7734375, "learning_rate": 2.9941700437638386e-06, "loss": 0.1289, "mean_token_accuracy": 0.9704779148101806, "num_tokens": 4545863027.0, "step": 32200 }, { "entropy": 1.3661806869506836, "epoch": 1.052854950866769, "grad_norm": 1.3359375, "learning_rate": 2.986072923584151e-06, "loss": 0.1374, "mean_token_accuracy": 0.9684454727172852, "num_tokens": 4553360974.0, "step": 32250 }, { "entropy": 1.3564281272888183, "epoch": 1.0544872841239268, "grad_norm": 2.15625, "learning_rate": 2.9779759048617704e-06, "loss": 0.1416, "mean_token_accuracy": 0.9682377851009369, "num_tokens": 4560701368.0, "step": 32300 }, { "entropy": 1.3551061296463012, "epoch": 1.0561196173810845, "grad_norm": 2.484375, "learning_rate": 2.9698790465826377e-06, "loss": 0.1241, "mean_token_accuracy": 0.9714620614051819, "num_tokens": 4567111828.0, "step": 32350 }, { "entropy": 1.3527950978279113, "epoch": 1.0577519506382422, "grad_norm": 1.546875, "learning_rate": 2.961782407731525e-06, "loss": 0.1337, "mean_token_accuracy": 0.9692088150978089, "num_tokens": 4574301428.0, "step": 32400 }, { "entropy": 1.3572787022590638, "epoch": 1.0593842838954002, "grad_norm": 1.5625, "learning_rate": 2.953686047291606e-06, "loss": 0.1286, "mean_token_accuracy": 0.9706799817085267, "num_tokens": 4581456456.0, "step": 32450 }, { "entropy": 1.3538258695602416, "epoch": 1.061016617152558, "grad_norm": 1.9765625, "learning_rate": 2.945590024244026e-06, "loss": 0.1297, "mean_token_accuracy": 0.9701016509532928, "num_tokens": 4588137631.0, "step": 32500 }, { "epoch": 1.061016617152558, "eval_entropy": 1.3580620272954305, "eval_loss": 0.14589445292949677, "eval_mean_token_accuracy": 0.9668857765197754, "eval_num_tokens": 4588137631.0, "eval_runtime": 748.9222, "eval_samples_per_second": 12.893, "eval_steps_per_second": 0.101, "step": 32500 }, { "entropy": 1.361629192829132, "epoch": 1.0626489504097156, "grad_norm": 2.0625, "learning_rate": 2.9374943975674745e-06, "loss": 0.138, "mean_token_accuracy": 0.9691119182109833, "num_tokens": 4595619130.0, "step": 32550 }, { "entropy": 1.3559014773368836, "epoch": 1.0642812836668734, "grad_norm": 1.171875, "learning_rate": 2.92939922623775e-06, "loss": 0.1239, "mean_token_accuracy": 0.9716914188861847, "num_tokens": 4602722754.0, "step": 32600 }, { "entropy": 1.3606642532348632, "epoch": 1.0659136169240313, "grad_norm": 1.4140625, "learning_rate": 2.921304569227337e-06, "loss": 0.1308, "mean_token_accuracy": 0.9696120321750641, "num_tokens": 4609681202.0, "step": 32650 }, { "entropy": 1.3540519714355468, "epoch": 1.067545950181189, "grad_norm": 1.265625, "learning_rate": 2.913210485504971e-06, "loss": 0.1191, "mean_token_accuracy": 0.972172474861145, "num_tokens": 4616745205.0, "step": 32700 }, { "entropy": 1.3715915560722352, "epoch": 1.0691782834383468, "grad_norm": 0.37109375, "learning_rate": 2.9051170340352125e-06, "loss": 0.1348, "mean_token_accuracy": 0.9693544006347656, "num_tokens": 4624203423.0, "step": 32750 }, { "entropy": 1.3620137906074523, "epoch": 1.0708106166955045, "grad_norm": 0.2353515625, "learning_rate": 2.8970242737780152e-06, "loss": 0.1319, "mean_token_accuracy": 0.9693595457077027, "num_tokens": 4631098385.0, "step": 32800 }, { "entropy": 1.3573533582687378, "epoch": 1.0724429499526624, "grad_norm": 1.5859375, "learning_rate": 2.8889322636882975e-06, "loss": 0.1149, "mean_token_accuracy": 0.9735664069652558, "num_tokens": 4637689978.0, "step": 32850 }, { "entropy": 1.3642809319496154, "epoch": 1.0740752832098202, "grad_norm": 1.8671875, "learning_rate": 2.8808410627155142e-06, "loss": 0.1288, "mean_token_accuracy": 0.9694396567344665, "num_tokens": 4644751687.0, "step": 32900 }, { "entropy": 1.34871666431427, "epoch": 1.0757076164669779, "grad_norm": 1.234375, "learning_rate": 2.8727507298032246e-06, "loss": 0.1298, "mean_token_accuracy": 0.9703471696376801, "num_tokens": 4651717972.0, "step": 32950 }, { "entropy": 1.3684996843338013, "epoch": 1.0773399497241356, "grad_norm": 1.390625, "learning_rate": 2.864661323888664e-06, "loss": 0.1341, "mean_token_accuracy": 0.9694813418388367, "num_tokens": 4659247374.0, "step": 33000 }, { "epoch": 1.0773399497241356, "eval_entropy": 1.3517811473210652, "eval_loss": 0.14623871445655823, "eval_mean_token_accuracy": 0.9667705456415813, "eval_num_tokens": 4659247374.0, "eval_runtime": 756.2099, "eval_samples_per_second": 12.769, "eval_steps_per_second": 0.101, "step": 33000 }, { "entropy": 1.3631637930870055, "epoch": 1.0789722829812936, "grad_norm": 1.40625, "learning_rate": 2.8565729039023154e-06, "loss": 0.134, "mean_token_accuracy": 0.9690206825733185, "num_tokens": 4666487708.0, "step": 33050 }, { "entropy": 1.3550728225708009, "epoch": 1.0806046162384513, "grad_norm": 1.96875, "learning_rate": 2.8484855287674787e-06, "loss": 0.139, "mean_token_accuracy": 0.9683072865009308, "num_tokens": 4673679571.0, "step": 33100 }, { "entropy": 1.3468902921676635, "epoch": 1.082236949495609, "grad_norm": 0.95703125, "learning_rate": 2.8403992573998416e-06, "loss": 0.1287, "mean_token_accuracy": 0.9701869285106659, "num_tokens": 4680648568.0, "step": 33150 }, { "entropy": 1.3574704766273498, "epoch": 1.0838692827527667, "grad_norm": 2.0, "learning_rate": 2.8323141487070544e-06, "loss": 0.1252, "mean_token_accuracy": 0.9709026992321015, "num_tokens": 4687624851.0, "step": 33200 }, { "entropy": 1.3406636595726014, "epoch": 1.0855016160099247, "grad_norm": 1.8359375, "learning_rate": 2.824230261588294e-06, "loss": 0.123, "mean_token_accuracy": 0.9712547302246094, "num_tokens": 4694450430.0, "step": 33250 }, { "entropy": 1.3555320692062378, "epoch": 1.0871339492670824, "grad_norm": 1.890625, "learning_rate": 2.816147654933839e-06, "loss": 0.137, "mean_token_accuracy": 0.9693058180809021, "num_tokens": 4701897095.0, "step": 33300 }, { "entropy": 1.3499479746818543, "epoch": 1.0887662825242401, "grad_norm": 1.0703125, "learning_rate": 2.8080663876246394e-06, "loss": 0.1231, "mean_token_accuracy": 0.9722630488872528, "num_tokens": 4708881122.0, "step": 33350 }, { "entropy": 1.3455316138267517, "epoch": 1.0903986157813979, "grad_norm": 1.8515625, "learning_rate": 2.79998651853189e-06, "loss": 0.1199, "mean_token_accuracy": 0.9716216671466827, "num_tokens": 4716074845.0, "step": 33400 }, { "entropy": 1.359624376296997, "epoch": 1.0920309490385558, "grad_norm": 2.109375, "learning_rate": 2.7919081065165985e-06, "loss": 0.1257, "mean_token_accuracy": 0.9711248898506164, "num_tokens": 4723316565.0, "step": 33450 }, { "entropy": 1.3494191646575928, "epoch": 1.0936632822957135, "grad_norm": 1.53125, "learning_rate": 2.7838312104291584e-06, "loss": 0.1323, "mean_token_accuracy": 0.9694008147716522, "num_tokens": 4730650888.0, "step": 33500 }, { "epoch": 1.0936632822957135, "eval_entropy": 1.353581156730652, "eval_loss": 0.14585214853286743, "eval_mean_token_accuracy": 0.9667929395039876, "eval_num_tokens": 4730650888.0, "eval_runtime": 754.4564, "eval_samples_per_second": 12.799, "eval_steps_per_second": 0.101, "step": 33500 }, { "entropy": 1.3704795885086059, "epoch": 1.0952956155528712, "grad_norm": 1.796875, "learning_rate": 2.775755889108919e-06, "loss": 0.134, "mean_token_accuracy": 0.9687422275543213, "num_tokens": 4737974169.0, "step": 33550 }, { "entropy": 1.3608631157875062, "epoch": 1.096927948810029, "grad_norm": 2.140625, "learning_rate": 2.7676822013837588e-06, "loss": 0.1299, "mean_token_accuracy": 0.9702245342731476, "num_tokens": 4745212589.0, "step": 33600 }, { "entropy": 1.3445592832565307, "epoch": 1.098560282067187, "grad_norm": 1.8984375, "learning_rate": 2.7596102060696543e-06, "loss": 0.123, "mean_token_accuracy": 0.9716692876815796, "num_tokens": 4752104456.0, "step": 33650 }, { "entropy": 1.3646861577033997, "epoch": 1.1001926153243446, "grad_norm": 2.078125, "learning_rate": 2.7515399619702545e-06, "loss": 0.1286, "mean_token_accuracy": 0.9707652199268341, "num_tokens": 4759026284.0, "step": 33700 }, { "entropy": 1.3495210075378419, "epoch": 1.1018249485815024, "grad_norm": 1.6015625, "learning_rate": 2.7434715278764494e-06, "loss": 0.1298, "mean_token_accuracy": 0.9711257350444794, "num_tokens": 4765987412.0, "step": 33750 }, { "entropy": 1.3825481986999513, "epoch": 1.10345728183866, "grad_norm": 1.03125, "learning_rate": 2.735404962565945e-06, "loss": 0.1417, "mean_token_accuracy": 0.9679384648799896, "num_tokens": 4773475530.0, "step": 33800 }, { "entropy": 1.3517045164108277, "epoch": 1.105089615095818, "grad_norm": 2.515625, "learning_rate": 2.7273403248028325e-06, "loss": 0.1183, "mean_token_accuracy": 0.9723455941677094, "num_tokens": 4780068847.0, "step": 33850 }, { "entropy": 1.3678963851928712, "epoch": 1.1067219483529758, "grad_norm": 1.3125, "learning_rate": 2.7192776733371608e-06, "loss": 0.1354, "mean_token_accuracy": 0.9694834208488464, "num_tokens": 4786993711.0, "step": 33900 }, { "entropy": 1.3571431303024293, "epoch": 1.1083542816101335, "grad_norm": 2.15625, "learning_rate": 2.711217066904509e-06, "loss": 0.1212, "mean_token_accuracy": 0.9715266978740692, "num_tokens": 4793644363.0, "step": 33950 }, { "entropy": 1.3623171138763428, "epoch": 1.1099866148672912, "grad_norm": 1.59375, "learning_rate": 2.7031585642255596e-06, "loss": 0.1279, "mean_token_accuracy": 0.970818110704422, "num_tokens": 4800772089.0, "step": 34000 }, { "epoch": 1.1099866148672912, "eval_entropy": 1.3600939814249675, "eval_loss": 0.14549146592617035, "eval_mean_token_accuracy": 0.9668245681126912, "eval_num_tokens": 4800772089.0, "eval_runtime": 753.1399, "eval_samples_per_second": 12.821, "eval_steps_per_second": 0.101, "step": 34000 }, { "entropy": 1.3657611656188964, "epoch": 1.1116189481244492, "grad_norm": 1.375, "learning_rate": 2.695102224005667e-06, "loss": 0.1312, "mean_token_accuracy": 0.9701169800758361, "num_tokens": 4808026786.0, "step": 34050 }, { "entropy": 1.3706095337867736, "epoch": 1.113251281381607, "grad_norm": 0.002044677734375, "learning_rate": 2.687048104934434e-06, "loss": 0.1344, "mean_token_accuracy": 0.9694833195209503, "num_tokens": 4815351616.0, "step": 34100 }, { "entropy": 1.3567158889770508, "epoch": 1.1148836146387646, "grad_norm": 1.2578125, "learning_rate": 2.6789962656852835e-06, "loss": 0.1273, "mean_token_accuracy": 0.9710471928119659, "num_tokens": 4822489624.0, "step": 34150 }, { "entropy": 1.3655909848213197, "epoch": 1.1165159478959223, "grad_norm": 0.3984375, "learning_rate": 2.6709467649150276e-06, "loss": 0.138, "mean_token_accuracy": 0.9684849452972412, "num_tokens": 4830165631.0, "step": 34200 }, { "entropy": 1.3546900820732117, "epoch": 1.1181482811530803, "grad_norm": 1.796875, "learning_rate": 2.662899661263445e-06, "loss": 0.1259, "mean_token_accuracy": 0.9711956691741943, "num_tokens": 4836745329.0, "step": 34250 }, { "entropy": 1.361421148777008, "epoch": 1.119780614410238, "grad_norm": 2.46875, "learning_rate": 2.654855013352849e-06, "loss": 0.1297, "mean_token_accuracy": 0.9703176605701447, "num_tokens": 4843917385.0, "step": 34300 }, { "entropy": 1.3671817374229431, "epoch": 1.1214129476673957, "grad_norm": 2.34375, "learning_rate": 2.646812879787668e-06, "loss": 0.1262, "mean_token_accuracy": 0.9710332584381104, "num_tokens": 4850936707.0, "step": 34350 }, { "entropy": 1.368682358264923, "epoch": 1.1230452809245535, "grad_norm": 1.3046875, "learning_rate": 2.6387733191540083e-06, "loss": 0.1271, "mean_token_accuracy": 0.9702812135219574, "num_tokens": 4857774583.0, "step": 34400 }, { "entropy": 1.357949526309967, "epoch": 1.1246776141817114, "grad_norm": 1.03125, "learning_rate": 2.6307363900192354e-06, "loss": 0.1344, "mean_token_accuracy": 0.9693097794055938, "num_tokens": 4864886795.0, "step": 34450 }, { "entropy": 1.367612702846527, "epoch": 1.1263099474388691, "grad_norm": 2.21875, "learning_rate": 2.6227021509315442e-06, "loss": 0.1312, "mean_token_accuracy": 0.9697531294822693, "num_tokens": 4872140576.0, "step": 34500 }, { "epoch": 1.1263099474388691, "eval_entropy": 1.3573125632603964, "eval_loss": 0.14544960856437683, "eval_mean_token_accuracy": 0.9669329651196797, "eval_num_tokens": 4872140576.0, "eval_runtime": 753.8369, "eval_samples_per_second": 12.809, "eval_steps_per_second": 0.101, "step": 34500 }, { "entropy": 1.3483564281463623, "epoch": 1.1279422806960269, "grad_norm": 1.5, "learning_rate": 2.614670660419533e-06, "loss": 0.1174, "mean_token_accuracy": 0.9726065421104431, "num_tokens": 4879225657.0, "step": 34550 }, { "entropy": 1.3606638669967652, "epoch": 1.1295746139531846, "grad_norm": 2.40625, "learning_rate": 2.606641976991775e-06, "loss": 0.1254, "mean_token_accuracy": 0.9706631207466125, "num_tokens": 4886242099.0, "step": 34600 }, { "entropy": 1.3547830367088318, "epoch": 1.1312069472103425, "grad_norm": 1.1953125, "learning_rate": 2.5986161591363984e-06, "loss": 0.1294, "mean_token_accuracy": 0.9702156925201416, "num_tokens": 4892983817.0, "step": 34650 }, { "entropy": 1.3496627926826477, "epoch": 1.1328392804675003, "grad_norm": 2.046875, "learning_rate": 2.590593265320652e-06, "loss": 0.1236, "mean_token_accuracy": 0.9711934244632721, "num_tokens": 4900048536.0, "step": 34700 }, { "entropy": 1.3515707707405091, "epoch": 1.134471613724658, "grad_norm": 1.9921875, "learning_rate": 2.582573353990486e-06, "loss": 0.1279, "mean_token_accuracy": 0.9710315072536468, "num_tokens": 4906893370.0, "step": 34750 }, { "entropy": 1.3535165977478028, "epoch": 1.1361039469818157, "grad_norm": 1.890625, "learning_rate": 2.5745564835701206e-06, "loss": 0.1204, "mean_token_accuracy": 0.9720440351963043, "num_tokens": 4913536928.0, "step": 34800 }, { "entropy": 1.3644448471069337, "epoch": 1.1377362802389737, "grad_norm": 1.8046875, "learning_rate": 2.5665427124616256e-06, "loss": 0.1317, "mean_token_accuracy": 0.970216943025589, "num_tokens": 4920499397.0, "step": 34850 }, { "entropy": 1.3579574704170227, "epoch": 1.1393686134961314, "grad_norm": 2.265625, "learning_rate": 2.5585320990444923e-06, "loss": 0.1277, "mean_token_accuracy": 0.970561819076538, "num_tokens": 4928000813.0, "step": 34900 }, { "entropy": 1.3763416075706483, "epoch": 1.1410009467532891, "grad_norm": 1.3046875, "learning_rate": 2.550524701675208e-06, "loss": 0.1359, "mean_token_accuracy": 0.9684149813652039, "num_tokens": 4935032419.0, "step": 34950 }, { "entropy": 1.3512770438194275, "epoch": 1.1426332800104468, "grad_norm": 1.2890625, "learning_rate": 2.542520578686831e-06, "loss": 0.1211, "mean_token_accuracy": 0.971741670370102, "num_tokens": 4941410711.0, "step": 35000 }, { "epoch": 1.1426332800104468, "eval_entropy": 1.3627102088928222, "eval_loss": 0.14536549150943756, "eval_mean_token_accuracy": 0.9668985676765441, "eval_num_tokens": 4941410711.0, "eval_runtime": 752.233, "eval_samples_per_second": 12.836, "eval_steps_per_second": 0.101, "step": 35000 }, { "entropy": 1.3602009153366088, "epoch": 1.1442656132676048, "grad_norm": 1.1796875, "learning_rate": 2.5345197883885677e-06, "loss": 0.1279, "mean_token_accuracy": 0.9709439516067505, "num_tokens": 4948340334.0, "step": 35050 }, { "entropy": 1.3565789103507995, "epoch": 1.1458979465247625, "grad_norm": 1.3515625, "learning_rate": 2.526522389065345e-06, "loss": 0.1356, "mean_token_accuracy": 0.9685303854942322, "num_tokens": 4956282307.0, "step": 35100 }, { "entropy": 1.3583333039283751, "epoch": 1.1475302797819202, "grad_norm": 1.9296875, "learning_rate": 2.518528438977387e-06, "loss": 0.1214, "mean_token_accuracy": 0.9718796277046203, "num_tokens": 4963542094.0, "step": 35150 }, { "entropy": 1.35888774394989, "epoch": 1.149162613039078, "grad_norm": 1.84375, "learning_rate": 2.51053799635979e-06, "loss": 0.1341, "mean_token_accuracy": 0.9693203794956208, "num_tokens": 4970941813.0, "step": 35200 }, { "entropy": 1.3589514350891114, "epoch": 1.150794946296236, "grad_norm": 1.328125, "learning_rate": 2.5025511194221e-06, "loss": 0.1371, "mean_token_accuracy": 0.9692372989654541, "num_tokens": 4978422565.0, "step": 35250 }, { "entropy": 1.3571251654624938, "epoch": 1.1524272795533936, "grad_norm": 1.7734375, "learning_rate": 2.494567866347887e-06, "loss": 0.1301, "mean_token_accuracy": 0.9706821513175964, "num_tokens": 4985066771.0, "step": 35300 }, { "entropy": 1.3489887595176697, "epoch": 1.1540596128105514, "grad_norm": 1.328125, "learning_rate": 2.4865882952943194e-06, "loss": 0.1179, "mean_token_accuracy": 0.9729493832588196, "num_tokens": 4991808794.0, "step": 35350 }, { "entropy": 1.3580448365211486, "epoch": 1.155691946067709, "grad_norm": 1.5625, "learning_rate": 2.478612464391746e-06, "loss": 0.1267, "mean_token_accuracy": 0.9707785761356353, "num_tokens": 4998905781.0, "step": 35400 }, { "entropy": 1.35804701089859, "epoch": 1.157324279324867, "grad_norm": 1.8203125, "learning_rate": 2.470640431743268e-06, "loss": 0.1381, "mean_token_accuracy": 0.9687949836254119, "num_tokens": 5006205777.0, "step": 35450 }, { "entropy": 1.361290261745453, "epoch": 1.1589566125820248, "grad_norm": 1.4375, "learning_rate": 2.4626722554243144e-06, "loss": 0.1272, "mean_token_accuracy": 0.9700055694580079, "num_tokens": 5013371710.0, "step": 35500 }, { "epoch": 1.1589566125820248, "eval_entropy": 1.359473959604899, "eval_loss": 0.14513231813907623, "eval_mean_token_accuracy": 0.9669572798411051, "eval_num_tokens": 5013371710.0, "eval_runtime": 750.7651, "eval_samples_per_second": 12.862, "eval_steps_per_second": 0.101, "step": 35500 }, { "entropy": 1.3512133407592772, "epoch": 1.1605889458391825, "grad_norm": 2.234375, "learning_rate": 2.454707993482224e-06, "loss": 0.1272, "mean_token_accuracy": 0.9703232657909393, "num_tokens": 5020188118.0, "step": 35550 }, { "entropy": 1.3578423738479615, "epoch": 1.1622212790963404, "grad_norm": 1.5, "learning_rate": 2.446747703935818e-06, "loss": 0.1222, "mean_token_accuracy": 0.9719510304927826, "num_tokens": 5026929276.0, "step": 35600 }, { "entropy": 1.3523945426940918, "epoch": 1.1638536123534982, "grad_norm": 1.3125, "learning_rate": 2.4387914447749802e-06, "loss": 0.131, "mean_token_accuracy": 0.9701538634300232, "num_tokens": 5034103966.0, "step": 35650 }, { "entropy": 1.3755294966697693, "epoch": 1.1654859456106559, "grad_norm": 1.359375, "learning_rate": 2.4308392739602323e-06, "loss": 0.138, "mean_token_accuracy": 0.968312075138092, "num_tokens": 5041450508.0, "step": 35700 }, { "entropy": 1.3669376826286317, "epoch": 1.1671182788678136, "grad_norm": 1.2109375, "learning_rate": 2.4228912494223137e-06, "loss": 0.1333, "mean_token_accuracy": 0.9692367768287659, "num_tokens": 5048332405.0, "step": 35750 }, { "entropy": 1.371087245941162, "epoch": 1.1687506121249713, "grad_norm": 2.375, "learning_rate": 2.414947429061759e-06, "loss": 0.1322, "mean_token_accuracy": 0.9700936663150788, "num_tokens": 5055257003.0, "step": 35800 }, { "entropy": 1.374721155166626, "epoch": 1.1703829453821293, "grad_norm": 1.828125, "learning_rate": 2.4070078707484743e-06, "loss": 0.1387, "mean_token_accuracy": 0.9685248970985413, "num_tokens": 5062113906.0, "step": 35850 }, { "entropy": 1.373306679725647, "epoch": 1.172015278639287, "grad_norm": 1.34375, "learning_rate": 2.399072632321319e-06, "loss": 0.1278, "mean_token_accuracy": 0.9704372000694275, "num_tokens": 5069153056.0, "step": 35900 }, { "entropy": 1.362773072719574, "epoch": 1.1736476118964447, "grad_norm": 3.0, "learning_rate": 2.3911417715876806e-06, "loss": 0.1175, "mean_token_accuracy": 0.9727724301815033, "num_tokens": 5075547365.0, "step": 35950 }, { "entropy": 1.361773042678833, "epoch": 1.1752799451536027, "grad_norm": 2.265625, "learning_rate": 2.383215346323058e-06, "loss": 0.1318, "mean_token_accuracy": 0.9693261981010437, "num_tokens": 5082553584.0, "step": 36000 }, { "epoch": 1.1752799451536027, "eval_entropy": 1.3677510404586792, "eval_loss": 0.1451091319322586, "eval_mean_token_accuracy": 0.9669363768895467, "eval_num_tokens": 5082553584.0, "eval_runtime": 752.6099, "eval_samples_per_second": 12.83, "eval_steps_per_second": 0.101, "step": 36000 }, { "entropy": 1.367734661102295, "epoch": 1.1769122784107604, "grad_norm": 2.21875, "learning_rate": 2.3752934142706355e-06, "loss": 0.1292, "mean_token_accuracy": 0.970736186504364, "num_tokens": 5089460622.0, "step": 36050 }, { "entropy": 1.3702288055419922, "epoch": 1.1785446116679181, "grad_norm": 1.6796875, "learning_rate": 2.3673760331408664e-06, "loss": 0.1245, "mean_token_accuracy": 0.9706618010997772, "num_tokens": 5096477290.0, "step": 36100 }, { "entropy": 1.3666929292678833, "epoch": 1.1801769449250759, "grad_norm": 2.109375, "learning_rate": 2.3594632606110514e-06, "loss": 0.1348, "mean_token_accuracy": 0.9692357456684113, "num_tokens": 5103888303.0, "step": 36150 }, { "entropy": 1.3703182339668274, "epoch": 1.1818092781822336, "grad_norm": 1.484375, "learning_rate": 2.351555154324916e-06, "loss": 0.1352, "mean_token_accuracy": 0.9687790739536285, "num_tokens": 5111598482.0, "step": 36200 }, { "entropy": 1.3647811126708984, "epoch": 1.1834416114393915, "grad_norm": 2.46875, "learning_rate": 2.3436517718921944e-06, "loss": 0.123, "mean_token_accuracy": 0.9715970456600189, "num_tokens": 5118205523.0, "step": 36250 }, { "entropy": 1.3724281644821168, "epoch": 1.1850739446965493, "grad_norm": 2.09375, "learning_rate": 2.3357531708882084e-06, "loss": 0.1351, "mean_token_accuracy": 0.9688549792766571, "num_tokens": 5125390728.0, "step": 36300 }, { "entropy": 1.3688122749328613, "epoch": 1.186706277953707, "grad_norm": 1.3125, "learning_rate": 2.3278594088534453e-06, "loss": 0.1248, "mean_token_accuracy": 0.9709884691238403, "num_tokens": 5132430170.0, "step": 36350 }, { "entropy": 1.371947205066681, "epoch": 1.188338611210865, "grad_norm": 1.4140625, "learning_rate": 2.319970543293144e-06, "loss": 0.1299, "mean_token_accuracy": 0.9705728948116302, "num_tokens": 5139505776.0, "step": 36400 }, { "entropy": 1.3706382060050963, "epoch": 1.1899709444680227, "grad_norm": 1.34375, "learning_rate": 2.3120866316768705e-06, "loss": 0.1288, "mean_token_accuracy": 0.9700265216827393, "num_tokens": 5146615876.0, "step": 36450 }, { "entropy": 1.3712089610099794, "epoch": 1.1916032777251804, "grad_norm": 1.484375, "learning_rate": 2.3042077314381025e-06, "loss": 0.1227, "mean_token_accuracy": 0.9712138116359711, "num_tokens": 5153410420.0, "step": 36500 }, { "epoch": 1.1916032777251804, "eval_entropy": 1.369815084139506, "eval_loss": 0.14498043060302734, "eval_mean_token_accuracy": 0.9671436421076457, "eval_num_tokens": 5153410420.0, "eval_runtime": 750.4787, "eval_samples_per_second": 12.866, "eval_steps_per_second": 0.101, "step": 36500 }, { "entropy": 1.3633040881156921, "epoch": 1.193235610982338, "grad_norm": 1.34375, "learning_rate": 2.2963338999738103e-06, "loss": 0.1321, "mean_token_accuracy": 0.9702033531665802, "num_tokens": 5160782174.0, "step": 36550 }, { "entropy": 1.3697287273406982, "epoch": 1.1948679442394958, "grad_norm": 1.625, "learning_rate": 2.288465194644041e-06, "loss": 0.1346, "mean_token_accuracy": 0.9692202270030975, "num_tokens": 5168040473.0, "step": 36600 }, { "entropy": 1.3875136041641236, "epoch": 1.1965002774966538, "grad_norm": 2.34375, "learning_rate": 2.2806016727714953e-06, "loss": 0.1355, "mean_token_accuracy": 0.9688812565803527, "num_tokens": 5175213322.0, "step": 36650 }, { "entropy": 1.3730654883384705, "epoch": 1.1981326107538115, "grad_norm": 1.078125, "learning_rate": 2.272743391641114e-06, "loss": 0.123, "mean_token_accuracy": 0.9711851370334625, "num_tokens": 5182005797.0, "step": 36700 }, { "entropy": 1.3739676403999328, "epoch": 1.1997649440109692, "grad_norm": 1.7734375, "learning_rate": 2.2648904084996593e-06, "loss": 0.1329, "mean_token_accuracy": 0.9690020906925202, "num_tokens": 5188915363.0, "step": 36750 }, { "entropy": 1.3796244549751282, "epoch": 1.2013972772681272, "grad_norm": 1.75, "learning_rate": 2.2570427805553e-06, "loss": 0.135, "mean_token_accuracy": 0.969414986371994, "num_tokens": 5196503416.0, "step": 36800 }, { "entropy": 1.3724525594711303, "epoch": 1.203029610525285, "grad_norm": 1.078125, "learning_rate": 2.24920056497719e-06, "loss": 0.1277, "mean_token_accuracy": 0.9707681620121003, "num_tokens": 5203249529.0, "step": 36850 }, { "entropy": 1.3727708411216737, "epoch": 1.2046619437824426, "grad_norm": 1.9296875, "learning_rate": 2.2413638188950564e-06, "loss": 0.1301, "mean_token_accuracy": 0.9704027915000916, "num_tokens": 5210234077.0, "step": 36900 }, { "entropy": 1.3519327425956726, "epoch": 1.2062942770396003, "grad_norm": 1.4140625, "learning_rate": 2.2335325993987815e-06, "loss": 0.1196, "mean_token_accuracy": 0.9722448754310608, "num_tokens": 5216853099.0, "step": 36950 }, { "entropy": 1.365940923690796, "epoch": 1.207926610296758, "grad_norm": 1.6328125, "learning_rate": 2.2257069635379863e-06, "loss": 0.1328, "mean_token_accuracy": 0.9699783003330231, "num_tokens": 5223781418.0, "step": 37000 }, { "epoch": 1.207926610296758, "eval_entropy": 1.360196549097697, "eval_loss": 0.1449788212776184, "eval_mean_token_accuracy": 0.9670122480392456, "eval_num_tokens": 5223781418.0, "eval_runtime": 751.1493, "eval_samples_per_second": 12.855, "eval_steps_per_second": 0.101, "step": 37000 }, { "entropy": 1.360548312664032, "epoch": 1.209558943553916, "grad_norm": 1.7109375, "learning_rate": 2.2178869683216164e-06, "loss": 0.1262, "mean_token_accuracy": 0.9713475477695465, "num_tokens": 5230659746.0, "step": 37050 }, { "entropy": 1.357577109336853, "epoch": 1.2111912768110737, "grad_norm": 1.265625, "learning_rate": 2.2100726707175246e-06, "loss": 0.1313, "mean_token_accuracy": 0.969699913263321, "num_tokens": 5237707649.0, "step": 37100 }, { "entropy": 1.3651654267311095, "epoch": 1.2128236100682315, "grad_norm": 1.3359375, "learning_rate": 2.202264127652059e-06, "loss": 0.1295, "mean_token_accuracy": 0.9710112833976745, "num_tokens": 5244771475.0, "step": 37150 }, { "entropy": 1.3489612317085267, "epoch": 1.2144559433253894, "grad_norm": 2.140625, "learning_rate": 2.1944613960096456e-06, "loss": 0.1213, "mean_token_accuracy": 0.9717336785793305, "num_tokens": 5251743181.0, "step": 37200 }, { "entropy": 1.355490939617157, "epoch": 1.2160882765825471, "grad_norm": 1.6953125, "learning_rate": 2.1866645326323743e-06, "loss": 0.1271, "mean_token_accuracy": 0.9707163834571838, "num_tokens": 5258283419.0, "step": 37250 }, { "entropy": 1.3655565786361694, "epoch": 1.2177206098397049, "grad_norm": 1.1875, "learning_rate": 2.1788735943195865e-06, "loss": 0.1367, "mean_token_accuracy": 0.9688389587402344, "num_tokens": 5265823807.0, "step": 37300 }, { "entropy": 1.349189965724945, "epoch": 1.2193529430968626, "grad_norm": 1.3515625, "learning_rate": 2.171088637827458e-06, "loss": 0.1318, "mean_token_accuracy": 0.9697683715820312, "num_tokens": 5273206881.0, "step": 37350 }, { "entropy": 1.3544545078277588, "epoch": 1.2209852763540203, "grad_norm": 1.03125, "learning_rate": 2.16330971986859e-06, "loss": 0.1216, "mean_token_accuracy": 0.9722874271869659, "num_tokens": 5280217004.0, "step": 37400 }, { "entropy": 1.3607108736038207, "epoch": 1.2226176096111783, "grad_norm": 1.15625, "learning_rate": 2.1555368971115926e-06, "loss": 0.1353, "mean_token_accuracy": 0.9696134865283966, "num_tokens": 5287498948.0, "step": 37450 }, { "entropy": 1.3576147866249084, "epoch": 1.224249942868336, "grad_norm": 1.71875, "learning_rate": 2.147770226180672e-06, "loss": 0.1276, "mean_token_accuracy": 0.9706488907337188, "num_tokens": 5294869687.0, "step": 37500 }, { "epoch": 1.224249942868336, "eval_entropy": 1.3614781061808268, "eval_loss": 0.14473138749599457, "eval_mean_token_accuracy": 0.9672889757156372, "eval_num_tokens": 5294869687.0, "eval_runtime": 751.3614, "eval_samples_per_second": 12.851, "eval_steps_per_second": 0.101, "step": 37500 }, { "entropy": 1.376727077960968, "epoch": 1.2258822761254937, "grad_norm": 1.265625, "learning_rate": 2.1400097636552217e-06, "loss": 0.1308, "mean_token_accuracy": 0.9701304376125336, "num_tokens": 5301938154.0, "step": 37550 }, { "entropy": 1.3636725115776063, "epoch": 1.2275146093826517, "grad_norm": 1.5703125, "learning_rate": 2.1322555660694053e-06, "loss": 0.1301, "mean_token_accuracy": 0.9700271189212799, "num_tokens": 5308968407.0, "step": 37600 }, { "entropy": 1.3606876850128173, "epoch": 1.2291469426398094, "grad_norm": 1.21875, "learning_rate": 2.124507689911747e-06, "loss": 0.1232, "mean_token_accuracy": 0.9719198334217072, "num_tokens": 5316027114.0, "step": 37650 }, { "entropy": 1.3608359265327454, "epoch": 1.2307792758969671, "grad_norm": 1.2421875, "learning_rate": 2.1167661916247203e-06, "loss": 0.1352, "mean_token_accuracy": 0.9692904555797577, "num_tokens": 5323480081.0, "step": 37700 }, { "entropy": 1.3596609687805177, "epoch": 1.2324116091541248, "grad_norm": 1.453125, "learning_rate": 2.109031127604339e-06, "loss": 0.1171, "mean_token_accuracy": 0.9726130926609039, "num_tokens": 5330170561.0, "step": 37750 }, { "entropy": 1.36385005235672, "epoch": 1.2340439424112826, "grad_norm": 1.1796875, "learning_rate": 2.10130255419974e-06, "loss": 0.1271, "mean_token_accuracy": 0.9708395230770112, "num_tokens": 5337270272.0, "step": 37800 }, { "entropy": 1.3592445206642152, "epoch": 1.2356762756684405, "grad_norm": 1.75, "learning_rate": 2.0935805277127794e-06, "loss": 0.1307, "mean_token_accuracy": 0.9704990041255951, "num_tokens": 5344402468.0, "step": 37850 }, { "entropy": 1.369277856349945, "epoch": 1.2373086089255982, "grad_norm": 1.4609375, "learning_rate": 2.0858651043976183e-06, "loss": 0.1373, "mean_token_accuracy": 0.9686707425117492, "num_tokens": 5351952870.0, "step": 37900 }, { "entropy": 1.3526391363143921, "epoch": 1.238940942182756, "grad_norm": 2.03125, "learning_rate": 2.0781563404603153e-06, "loss": 0.1266, "mean_token_accuracy": 0.9708797204494476, "num_tokens": 5359011853.0, "step": 37950 }, { "entropy": 1.358784899711609, "epoch": 1.240573275439914, "grad_norm": 1.109375, "learning_rate": 2.0704542920584153e-06, "loss": 0.1342, "mean_token_accuracy": 0.9693182837963105, "num_tokens": 5366436177.0, "step": 38000 }, { "epoch": 1.240573275439914, "eval_entropy": 1.3544676637649535, "eval_loss": 0.14443761110305786, "eval_mean_token_accuracy": 0.9673179856936137, "eval_num_tokens": 5366436177.0, "eval_runtime": 752.6655, "eval_samples_per_second": 12.829, "eval_steps_per_second": 0.101, "step": 38000 }, { "entropy": 1.344338595867157, "epoch": 1.2422056086970716, "grad_norm": 1.8203125, "learning_rate": 2.0627590153005426e-06, "loss": 0.1146, "mean_token_accuracy": 0.9736293482780457, "num_tokens": 5373144912.0, "step": 38050 }, { "entropy": 1.3542405033111573, "epoch": 1.2438379419542294, "grad_norm": 2.109375, "learning_rate": 2.0550705662459896e-06, "loss": 0.1276, "mean_token_accuracy": 0.9702631950378418, "num_tokens": 5380308640.0, "step": 38100 }, { "entropy": 1.3496335220336915, "epoch": 1.245470275211387, "grad_norm": 2.015625, "learning_rate": 2.047389000904309e-06, "loss": 0.1233, "mean_token_accuracy": 0.9710812640190124, "num_tokens": 5386799272.0, "step": 38150 }, { "entropy": 1.3436848521232605, "epoch": 1.2471026084685448, "grad_norm": 1.078125, "learning_rate": 2.0397143752349084e-06, "loss": 0.1313, "mean_token_accuracy": 0.9700379192829132, "num_tokens": 5393957806.0, "step": 38200 }, { "entropy": 1.3535588788986206, "epoch": 1.2487349417257028, "grad_norm": 2.1875, "learning_rate": 2.032046745146638e-06, "loss": 0.1256, "mean_token_accuracy": 0.9713840389251709, "num_tokens": 5400892138.0, "step": 38250 }, { "entropy": 1.35460533618927, "epoch": 1.2503672749828605, "grad_norm": 2.046875, "learning_rate": 2.0243861664973897e-06, "loss": 0.1271, "mean_token_accuracy": 0.9702360582351685, "num_tokens": 5407837431.0, "step": 38300 }, { "entropy": 1.350141739845276, "epoch": 1.2519996082400182, "grad_norm": 1.8828125, "learning_rate": 2.016732695093681e-06, "loss": 0.1195, "mean_token_accuracy": 0.9717020273208619, "num_tokens": 5414369851.0, "step": 38350 }, { "entropy": 1.3572440361976623, "epoch": 1.2536319414971762, "grad_norm": 1.8828125, "learning_rate": 2.009086386690259e-06, "loss": 0.1321, "mean_token_accuracy": 0.9700393569469452, "num_tokens": 5421779114.0, "step": 38400 }, { "entropy": 1.3585280227661132, "epoch": 1.2552642747543339, "grad_norm": 0.002349853515625, "learning_rate": 2.001447296989687e-06, "loss": 0.1236, "mean_token_accuracy": 0.9713066399097443, "num_tokens": 5428733100.0, "step": 38450 }, { "entropy": 1.3632621765136719, "epoch": 1.2568966080114916, "grad_norm": 1.7734375, "learning_rate": 1.993815481641939e-06, "loss": 0.1282, "mean_token_accuracy": 0.9696168947219849, "num_tokens": 5435549385.0, "step": 38500 }, { "epoch": 1.2568966080114916, "eval_entropy": 1.3491881497701008, "eval_loss": 0.14436551928520203, "eval_mean_token_accuracy": 0.9672579844792684, "eval_num_tokens": 5435549385.0, "eval_runtime": 753.3503, "eval_samples_per_second": 12.817, "eval_steps_per_second": 0.101, "step": 38500 }, { "entropy": 1.3552922701835632, "epoch": 1.2585289412686493, "grad_norm": 1.4375, "learning_rate": 1.9861909962440006e-06, "loss": 0.1325, "mean_token_accuracy": 0.9696126735210419, "num_tokens": 5442577839.0, "step": 38550 }, { "entropy": 1.3554336166381835, "epoch": 1.260161274525807, "grad_norm": 1.875, "learning_rate": 1.978573896339455e-06, "loss": 0.1203, "mean_token_accuracy": 0.9720760262012482, "num_tokens": 5449083615.0, "step": 38600 }, { "entropy": 1.3641015887260437, "epoch": 1.261793607782965, "grad_norm": 2.046875, "learning_rate": 1.9709642374180845e-06, "loss": 0.1304, "mean_token_accuracy": 0.9704764342308044, "num_tokens": 5456277556.0, "step": 38650 }, { "entropy": 1.3642727065086364, "epoch": 1.2634259410401227, "grad_norm": 1.984375, "learning_rate": 1.9633620749154656e-06, "loss": 0.129, "mean_token_accuracy": 0.9703558135032654, "num_tokens": 5462943399.0, "step": 38700 }, { "entropy": 1.3643340587615966, "epoch": 1.2650582742972805, "grad_norm": 1.984375, "learning_rate": 1.9557674642125618e-06, "loss": 0.1276, "mean_token_accuracy": 0.9705350911617279, "num_tokens": 5470083333.0, "step": 38750 }, { "entropy": 1.35529381275177, "epoch": 1.2666906075544384, "grad_norm": 1.328125, "learning_rate": 1.9481804606353256e-06, "loss": 0.1207, "mean_token_accuracy": 0.9720316576957703, "num_tokens": 5477264167.0, "step": 38800 }, { "entropy": 1.347254078388214, "epoch": 1.2683229408115961, "grad_norm": 1.2265625, "learning_rate": 1.9406011194542896e-06, "loss": 0.1256, "mean_token_accuracy": 0.9704374611377716, "num_tokens": 5484197390.0, "step": 38850 }, { "entropy": 1.3472388553619385, "epoch": 1.2699552740687539, "grad_norm": 1.78125, "learning_rate": 1.933029495884169e-06, "loss": 0.1166, "mean_token_accuracy": 0.9729468870162964, "num_tokens": 5490515143.0, "step": 38900 }, { "entropy": 1.3586375880241395, "epoch": 1.2715876073259116, "grad_norm": 2.609375, "learning_rate": 1.925465645083455e-06, "loss": 0.1342, "mean_token_accuracy": 0.9688045310974122, "num_tokens": 5497798023.0, "step": 38950 }, { "entropy": 1.3617107224464418, "epoch": 1.2732199405830693, "grad_norm": 1.859375, "learning_rate": 1.9179096221540163e-06, "loss": 0.1337, "mean_token_accuracy": 0.969189600944519, "num_tokens": 5504996504.0, "step": 39000 }, { "epoch": 1.2732199405830693, "eval_entropy": 1.3547975174585978, "eval_loss": 0.14426733553409576, "eval_mean_token_accuracy": 0.9672306942939758, "eval_num_tokens": 5504996504.0, "eval_runtime": 747.051, "eval_samples_per_second": 12.925, "eval_steps_per_second": 0.102, "step": 39000 }, { "entropy": 1.3557351016998291, "epoch": 1.2748522738402273, "grad_norm": 1.3828125, "learning_rate": 1.910361482140696e-06, "loss": 0.1248, "mean_token_accuracy": 0.9716817474365235, "num_tokens": 5512011287.0, "step": 39050 }, { "entropy": 1.353394594192505, "epoch": 1.276484607097385, "grad_norm": 1.3203125, "learning_rate": 1.90282128003091e-06, "loss": 0.12, "mean_token_accuracy": 0.9717885673046112, "num_tokens": 5518924203.0, "step": 39100 }, { "entropy": 1.3534292221069335, "epoch": 1.2781169403545427, "grad_norm": 1.234375, "learning_rate": 1.895289070754249e-06, "loss": 0.131, "mean_token_accuracy": 0.969032096862793, "num_tokens": 5526036302.0, "step": 39150 }, { "entropy": 1.3387615489959717, "epoch": 1.2797492736117007, "grad_norm": 2.421875, "learning_rate": 1.887764909182076e-06, "loss": 0.1203, "mean_token_accuracy": 0.9723163688182831, "num_tokens": 5532560928.0, "step": 39200 }, { "entropy": 1.3516933727264404, "epoch": 1.2813816068688584, "grad_norm": 1.78125, "learning_rate": 1.8802488501271259e-06, "loss": 0.1325, "mean_token_accuracy": 0.9697006630897522, "num_tokens": 5539749961.0, "step": 39250 }, { "entropy": 1.358582181930542, "epoch": 1.283013940126016, "grad_norm": 1.2421875, "learning_rate": 1.8727409483431112e-06, "loss": 0.1276, "mean_token_accuracy": 0.9706891429424286, "num_tokens": 5546991771.0, "step": 39300 }, { "entropy": 1.3414494490623474, "epoch": 1.2846462733831738, "grad_norm": 1.40625, "learning_rate": 1.8652412585243158e-06, "loss": 0.1205, "mean_token_accuracy": 0.9722245049476623, "num_tokens": 5553461638.0, "step": 39350 }, { "entropy": 1.3569713354110717, "epoch": 1.2862786066403316, "grad_norm": 2.0625, "learning_rate": 1.8577498353052025e-06, "loss": 0.1257, "mean_token_accuracy": 0.9703078508377075, "num_tokens": 5560423404.0, "step": 39400 }, { "entropy": 1.3759264373779296, "epoch": 1.2879109398974895, "grad_norm": 1.5078125, "learning_rate": 1.850266733260012e-06, "loss": 0.1489, "mean_token_accuracy": 0.9663536393642426, "num_tokens": 5568028795.0, "step": 39450 }, { "entropy": 1.3500820732116698, "epoch": 1.2895432731546472, "grad_norm": 1.390625, "learning_rate": 1.8427920069023658e-06, "loss": 0.1198, "mean_token_accuracy": 0.9720592284202576, "num_tokens": 5574696411.0, "step": 39500 }, { "epoch": 1.2895432731546472, "eval_entropy": 1.3591709502538045, "eval_loss": 0.14416085183620453, "eval_mean_token_accuracy": 0.9672661876678467, "eval_num_tokens": 5574696411.0, "eval_runtime": 754.3873, "eval_samples_per_second": 12.8, "eval_steps_per_second": 0.101, "step": 39500 }, { "entropy": 1.363064079284668, "epoch": 1.291175606411805, "grad_norm": 1.25, "learning_rate": 1.8353257106848703e-06, "loss": 0.1242, "mean_token_accuracy": 0.9711923873424531, "num_tokens": 5581679975.0, "step": 39550 }, { "entropy": 1.356317241191864, "epoch": 1.292807939668963, "grad_norm": 1.859375, "learning_rate": 1.8278678989987178e-06, "loss": 0.1335, "mean_token_accuracy": 0.9687803375720978, "num_tokens": 5589014993.0, "step": 39600 }, { "entropy": 1.3506816673278808, "epoch": 1.2944402729261206, "grad_norm": 1.3046875, "learning_rate": 1.8204186261732938e-06, "loss": 0.1313, "mean_token_accuracy": 0.9694370079040527, "num_tokens": 5595958511.0, "step": 39650 }, { "entropy": 1.3653843665122987, "epoch": 1.2960726061832784, "grad_norm": 1.78125, "learning_rate": 1.8129779464757774e-06, "loss": 0.1327, "mean_token_accuracy": 0.9690759527683258, "num_tokens": 5603031399.0, "step": 39700 }, { "entropy": 1.3516422724723816, "epoch": 1.297704939440436, "grad_norm": 2.484375, "learning_rate": 1.8055459141107477e-06, "loss": 0.1184, "mean_token_accuracy": 0.9726696240901948, "num_tokens": 5610046160.0, "step": 39750 }, { "entropy": 1.376489179134369, "epoch": 1.2993372726975938, "grad_norm": 1.484375, "learning_rate": 1.7981225832197894e-06, "loss": 0.1399, "mean_token_accuracy": 0.9676700031757355, "num_tokens": 5617399853.0, "step": 39800 }, { "entropy": 1.345590648651123, "epoch": 1.3009696059547518, "grad_norm": 1.46875, "learning_rate": 1.7907080078810983e-06, "loss": 0.1242, "mean_token_accuracy": 0.9713241112232208, "num_tokens": 5624175713.0, "step": 39850 }, { "entropy": 1.3524721622467042, "epoch": 1.3026019392119095, "grad_norm": 1.421875, "learning_rate": 1.7833022421090858e-06, "loss": 0.1226, "mean_token_accuracy": 0.9718897187709808, "num_tokens": 5631014007.0, "step": 39900 }, { "entropy": 1.3629893851280213, "epoch": 1.3042342724690672, "grad_norm": 1.09375, "learning_rate": 1.7759053398539873e-06, "loss": 0.1312, "mean_token_accuracy": 0.970218130350113, "num_tokens": 5638370611.0, "step": 39950 }, { "entropy": 1.3576352548599244, "epoch": 1.3058666057262251, "grad_norm": 1.2890625, "learning_rate": 1.7685173550014671e-06, "loss": 0.134, "mean_token_accuracy": 0.9691074180603028, "num_tokens": 5645778490.0, "step": 40000 }, { "epoch": 1.3058666057262251, "eval_entropy": 1.355252981185913, "eval_loss": 0.14380384981632233, "eval_mean_token_accuracy": 0.9673471585909525, "eval_num_tokens": 5645778490.0, "eval_runtime": 745.8824, "eval_samples_per_second": 12.946, "eval_steps_per_second": 0.102, "step": 40000 }, { "entropy": 1.3658280086517334, "epoch": 1.3074989389833829, "grad_norm": 2.109375, "learning_rate": 1.7611383413722303e-06, "loss": 0.1408, "mean_token_accuracy": 0.9673460054397583, "num_tokens": 5653261891.0, "step": 40050 }, { "entropy": 1.3569263100624085, "epoch": 1.3091312722405406, "grad_norm": 1.3984375, "learning_rate": 1.7537683527216242e-06, "loss": 0.1303, "mean_token_accuracy": 0.9697596049308777, "num_tokens": 5660316880.0, "step": 40100 }, { "entropy": 1.3518936610221863, "epoch": 1.3107636054976983, "grad_norm": 1.484375, "learning_rate": 1.7464074427392512e-06, "loss": 0.1291, "mean_token_accuracy": 0.9700168156623841, "num_tokens": 5667680932.0, "step": 40150 }, { "entropy": 1.3594465851783752, "epoch": 1.3123959387548563, "grad_norm": 1.8984375, "learning_rate": 1.7390556650485782e-06, "loss": 0.13, "mean_token_accuracy": 0.9701092898845672, "num_tokens": 5674932027.0, "step": 40200 }, { "entropy": 1.3511109519004822, "epoch": 1.314028272012014, "grad_norm": 1.2890625, "learning_rate": 1.7317130732065411e-06, "loss": 0.1236, "mean_token_accuracy": 0.9719385504722595, "num_tokens": 5681669046.0, "step": 40250 }, { "entropy": 1.3469841027259826, "epoch": 1.3156606052691717, "grad_norm": 1.1015625, "learning_rate": 1.7243797207031596e-06, "loss": 0.1204, "mean_token_accuracy": 0.9718147456645966, "num_tokens": 5688464363.0, "step": 40300 }, { "entropy": 1.3597651433944702, "epoch": 1.3172929385263297, "grad_norm": 1.65625, "learning_rate": 1.7170556609611477e-06, "loss": 0.1321, "mean_token_accuracy": 0.9700502359867096, "num_tokens": 5695659707.0, "step": 40350 }, { "entropy": 1.359071056842804, "epoch": 1.3189252717834874, "grad_norm": 2.03125, "learning_rate": 1.709740947335518e-06, "loss": 0.1232, "mean_token_accuracy": 0.9715420746803284, "num_tokens": 5702611675.0, "step": 40400 }, { "entropy": 1.3486275053024293, "epoch": 1.3205576050406451, "grad_norm": 0.9609375, "learning_rate": 1.7024356331132025e-06, "loss": 0.1197, "mean_token_accuracy": 0.9719861805438995, "num_tokens": 5709648249.0, "step": 40450 }, { "entropy": 1.366611111164093, "epoch": 1.3221899382978028, "grad_norm": 1.5859375, "learning_rate": 1.695139771512655e-06, "loss": 0.1363, "mean_token_accuracy": 0.96794402718544, "num_tokens": 5716864381.0, "step": 40500 }, { "epoch": 1.3221899382978028, "eval_entropy": 1.3547792641321819, "eval_loss": 0.14348579943180084, "eval_mean_token_accuracy": 0.9674063356717427, "eval_num_tokens": 5716864381.0, "eval_runtime": 751.715, "eval_samples_per_second": 12.845, "eval_steps_per_second": 0.101, "step": 40500 }, { "entropy": 1.3567886662483215, "epoch": 1.3238222715549606, "grad_norm": 1.59375, "learning_rate": 1.687853415683473e-06, "loss": 0.118, "mean_token_accuracy": 0.9730365478992462, "num_tokens": 5723690263.0, "step": 40550 }, { "entropy": 1.3625458979606628, "epoch": 1.3254546048121185, "grad_norm": 1.140625, "learning_rate": 1.6805766187059998e-06, "loss": 0.1359, "mean_token_accuracy": 0.9688236701488495, "num_tokens": 5731246956.0, "step": 40600 }, { "entropy": 1.3589171147346497, "epoch": 1.3270869380692762, "grad_norm": 1.4296875, "learning_rate": 1.6733094335909486e-06, "loss": 0.1366, "mean_token_accuracy": 0.968874671459198, "num_tokens": 5738659112.0, "step": 40650 }, { "entropy": 1.3442611718177795, "epoch": 1.328719271326434, "grad_norm": 1.25, "learning_rate": 1.666051913279007e-06, "loss": 0.1259, "mean_token_accuracy": 0.9707275819778443, "num_tokens": 5745995037.0, "step": 40700 }, { "entropy": 1.3413966584205628, "epoch": 1.330351604583592, "grad_norm": 1.6171875, "learning_rate": 1.658804110640458e-06, "loss": 0.1199, "mean_token_accuracy": 0.972169394493103, "num_tokens": 5752781216.0, "step": 40750 }, { "entropy": 1.3544580388069152, "epoch": 1.3319839378407496, "grad_norm": 1.421875, "learning_rate": 1.6515660784747933e-06, "loss": 0.1287, "mean_token_accuracy": 0.9699071037769318, "num_tokens": 5759681433.0, "step": 40800 }, { "entropy": 1.3634159827232362, "epoch": 1.3336162710979074, "grad_norm": 1.2421875, "learning_rate": 1.6443378695103233e-06, "loss": 0.1422, "mean_token_accuracy": 0.9680401980876923, "num_tokens": 5767127976.0, "step": 40850 }, { "entropy": 1.3604375696182252, "epoch": 1.335248604355065, "grad_norm": 1.40625, "learning_rate": 1.6371195364038034e-06, "loss": 0.1281, "mean_token_accuracy": 0.9703556621074676, "num_tokens": 5773885662.0, "step": 40900 }, { "entropy": 1.367766616344452, "epoch": 1.3368809376122228, "grad_norm": 1.3203125, "learning_rate": 1.6299111317400382e-06, "loss": 0.1311, "mean_token_accuracy": 0.9696219575405121, "num_tokens": 5780884429.0, "step": 40950 }, { "entropy": 1.3579600000381469, "epoch": 1.3385132708693808, "grad_norm": 1.5078125, "learning_rate": 1.6227127080315103e-06, "loss": 0.1271, "mean_token_accuracy": 0.9701367461681366, "num_tokens": 5787949031.0, "step": 41000 }, { "epoch": 1.3385132708693808, "eval_entropy": 1.3570391607284547, "eval_loss": 0.14324620366096497, "eval_mean_token_accuracy": 0.9674420754114786, "eval_num_tokens": 5787949031.0, "eval_runtime": 754.7744, "eval_samples_per_second": 12.793, "eval_steps_per_second": 0.101, "step": 41000 }, { "entropy": 1.3567052793502807, "epoch": 1.3401456041265385, "grad_norm": 1.3671875, "learning_rate": 1.6155243177179873e-06, "loss": 0.125, "mean_token_accuracy": 0.9710537779331208, "num_tokens": 5795334113.0, "step": 41050 }, { "entropy": 1.3398216152191162, "epoch": 1.3417779373836962, "grad_norm": 0.91796875, "learning_rate": 1.6083460131661477e-06, "loss": 0.1133, "mean_token_accuracy": 0.9733657622337342, "num_tokens": 5802143914.0, "step": 41100 }, { "entropy": 1.3610263323783875, "epoch": 1.3434102706408542, "grad_norm": 1.265625, "learning_rate": 1.6011778466691951e-06, "loss": 0.1231, "mean_token_accuracy": 0.9718475365638732, "num_tokens": 5808928463.0, "step": 41150 }, { "entropy": 1.3601333618164062, "epoch": 1.345042603898012, "grad_norm": 1.234375, "learning_rate": 1.5940198704464793e-06, "loss": 0.1302, "mean_token_accuracy": 0.969777444601059, "num_tokens": 5816047556.0, "step": 41200 }, { "entropy": 1.36517019033432, "epoch": 1.3466749371551696, "grad_norm": 2.265625, "learning_rate": 1.5868721366431148e-06, "loss": 0.1311, "mean_token_accuracy": 0.9697459650039673, "num_tokens": 5823417847.0, "step": 41250 }, { "entropy": 1.3562990355491638, "epoch": 1.3483072704123273, "grad_norm": 1.8125, "learning_rate": 1.5797346973295984e-06, "loss": 0.1294, "mean_token_accuracy": 0.9702081382274628, "num_tokens": 5830494397.0, "step": 41300 }, { "entropy": 1.3457439398765565, "epoch": 1.349939603669485, "grad_norm": 1.40625, "learning_rate": 1.5726076045014376e-06, "loss": 0.1206, "mean_token_accuracy": 0.9718204891681671, "num_tokens": 5837675412.0, "step": 41350 }, { "entropy": 1.3478499841690064, "epoch": 1.351571936926643, "grad_norm": 1.1484375, "learning_rate": 1.565490910078761e-06, "loss": 0.1224, "mean_token_accuracy": 0.971153804063797, "num_tokens": 5844543024.0, "step": 41400 }, { "entropy": 1.3584490442276, "epoch": 1.3532042701838007, "grad_norm": 1.5703125, "learning_rate": 1.5583846659059525e-06, "loss": 0.12, "mean_token_accuracy": 0.9715865242481232, "num_tokens": 5851449870.0, "step": 41450 }, { "entropy": 1.3518057036399842, "epoch": 1.3548366034409585, "grad_norm": 1.265625, "learning_rate": 1.5512889237512604e-06, "loss": 0.13, "mean_token_accuracy": 0.9702707767486572, "num_tokens": 5858314021.0, "step": 41500 }, { "epoch": 1.3548366034409585, "eval_entropy": 1.3525403213500977, "eval_loss": 0.14299461245536804, "eval_mean_token_accuracy": 0.9675085457166036, "eval_num_tokens": 5858314021.0, "eval_runtime": 751.2489, "eval_samples_per_second": 12.853, "eval_steps_per_second": 0.101, "step": 41500 }, { "entropy": 1.3594915199279785, "epoch": 1.3564689366981164, "grad_norm": 1.8671875, "learning_rate": 1.5442037353064314e-06, "loss": 0.1252, "mean_token_accuracy": 0.97132617354393, "num_tokens": 5865594325.0, "step": 41550 }, { "entropy": 1.352486503124237, "epoch": 1.3581012699552741, "grad_norm": 1.3125, "learning_rate": 1.537129152186329e-06, "loss": 0.1254, "mean_token_accuracy": 0.9710965490341187, "num_tokens": 5872751335.0, "step": 41600 }, { "entropy": 1.3547274160385132, "epoch": 1.3597336032124319, "grad_norm": 1.6015625, "learning_rate": 1.530065225928555e-06, "loss": 0.13, "mean_token_accuracy": 0.9703376948833465, "num_tokens": 5880128383.0, "step": 41650 }, { "entropy": 1.3553478455543517, "epoch": 1.3613659364695896, "grad_norm": 1.8828125, "learning_rate": 1.5230120079930814e-06, "loss": 0.1216, "mean_token_accuracy": 0.9711664152145386, "num_tokens": 5887029878.0, "step": 41700 }, { "entropy": 1.3534876823425293, "epoch": 1.3629982697267473, "grad_norm": 2.6875, "learning_rate": 1.515969549761867e-06, "loss": 0.1274, "mean_token_accuracy": 0.9711965453624726, "num_tokens": 5893747154.0, "step": 41750 }, { "entropy": 1.360564501285553, "epoch": 1.3646306029839053, "grad_norm": 1.4140625, "learning_rate": 1.5089379025384912e-06, "loss": 0.1351, "mean_token_accuracy": 0.969272004365921, "num_tokens": 5901094204.0, "step": 41800 }, { "entropy": 1.342657322883606, "epoch": 1.366262936241063, "grad_norm": 0.020263671875, "learning_rate": 1.501917117547772e-06, "loss": 0.1229, "mean_token_accuracy": 0.9716498827934266, "num_tokens": 5908182746.0, "step": 41850 }, { "entropy": 1.3452470707893371, "epoch": 1.3678952694982207, "grad_norm": 1.203125, "learning_rate": 1.4949072459354022e-06, "loss": 0.1212, "mean_token_accuracy": 0.972344673871994, "num_tokens": 5915128546.0, "step": 41900 }, { "entropy": 1.3409530735015869, "epoch": 1.3695276027553787, "grad_norm": 1.4765625, "learning_rate": 1.4879083387675666e-06, "loss": 0.1229, "mean_token_accuracy": 0.9715113770961762, "num_tokens": 5922010723.0, "step": 41950 }, { "entropy": 1.3670923542976379, "epoch": 1.3711599360125364, "grad_norm": 2.484375, "learning_rate": 1.4809204470305788e-06, "loss": 0.1351, "mean_token_accuracy": 0.9693290328979492, "num_tokens": 5929411699.0, "step": 42000 }, { "epoch": 1.3711599360125364, "eval_entropy": 1.352181215286255, "eval_loss": 0.14293250441551208, "eval_mean_token_accuracy": 0.9675895547866822, "eval_num_tokens": 5929411699.0, "eval_runtime": 753.3298, "eval_samples_per_second": 12.818, "eval_steps_per_second": 0.101, "step": 42000 }, { "entropy": 1.3608778929710388, "epoch": 1.372792269269694, "grad_norm": 1.1328125, "learning_rate": 1.4739436216305063e-06, "loss": 0.1335, "mean_token_accuracy": 0.9693701839447022, "num_tokens": 5936747601.0, "step": 42050 }, { "entropy": 1.361914451122284, "epoch": 1.3744246025268518, "grad_norm": 1.734375, "learning_rate": 1.4669779133927956e-06, "loss": 0.1255, "mean_token_accuracy": 0.9715308749675751, "num_tokens": 5943992563.0, "step": 42100 }, { "entropy": 1.3531924724578857, "epoch": 1.3760569357840096, "grad_norm": 1.34375, "learning_rate": 1.460023373061911e-06, "loss": 0.1291, "mean_token_accuracy": 0.9694987845420837, "num_tokens": 5951045516.0, "step": 42150 }, { "entropy": 1.3581135630607606, "epoch": 1.3776892690411675, "grad_norm": 0.005950927734375, "learning_rate": 1.4530800513009545e-06, "loss": 0.1265, "mean_token_accuracy": 0.9705501091480255, "num_tokens": 5958106338.0, "step": 42200 }, { "entropy": 1.361065561771393, "epoch": 1.3793216022983252, "grad_norm": 2.1875, "learning_rate": 1.4461479986913075e-06, "loss": 0.1232, "mean_token_accuracy": 0.9708436739444732, "num_tokens": 5964787874.0, "step": 42250 }, { "entropy": 1.3638162088394166, "epoch": 1.380953935555483, "grad_norm": 2.03125, "learning_rate": 1.43922726573225e-06, "loss": 0.1297, "mean_token_accuracy": 0.9698211419582367, "num_tokens": 5972312102.0, "step": 42300 }, { "entropy": 1.3620294046401977, "epoch": 1.382586268812641, "grad_norm": 1.328125, "learning_rate": 1.4323179028406086e-06, "loss": 0.126, "mean_token_accuracy": 0.9703358936309815, "num_tokens": 5979432191.0, "step": 42350 }, { "entropy": 1.3516957235336304, "epoch": 1.3842186020697986, "grad_norm": 1.65625, "learning_rate": 1.4254199603503709e-06, "loss": 0.1185, "mean_token_accuracy": 0.9726535677909851, "num_tokens": 5986862863.0, "step": 42400 }, { "entropy": 1.3603333234786987, "epoch": 1.3858509353269564, "grad_norm": 2.390625, "learning_rate": 1.4185334885123332e-06, "loss": 0.1225, "mean_token_accuracy": 0.9719485092163086, "num_tokens": 5993969922.0, "step": 42450 }, { "entropy": 1.3303070521354676, "epoch": 1.387483268584114, "grad_norm": 1.3515625, "learning_rate": 1.4116585374937304e-06, "loss": 0.1134, "mean_token_accuracy": 0.9734328532218933, "num_tokens": 6000796242.0, "step": 42500 }, { "epoch": 1.387483268584114, "eval_entropy": 1.3550557088851929, "eval_loss": 0.14276918768882751, "eval_mean_token_accuracy": 0.9676290774345397, "eval_num_tokens": 6000796242.0, "eval_runtime": 751.2401, "eval_samples_per_second": 12.853, "eval_steps_per_second": 0.101, "step": 42500 }, { "entropy": 1.3588794898986816, "epoch": 1.3891156018412718, "grad_norm": 1.1796875, "learning_rate": 1.4047951573778641e-06, "loss": 0.1206, "mean_token_accuracy": 0.9720923590660095, "num_tokens": 6007327659.0, "step": 42550 }, { "entropy": 1.3503399062156678, "epoch": 1.3907479350984298, "grad_norm": 1.484375, "learning_rate": 1.3979433981637493e-06, "loss": 0.1193, "mean_token_accuracy": 0.9727861452102661, "num_tokens": 6014078918.0, "step": 42600 }, { "entropy": 1.3634498286247254, "epoch": 1.3923802683555875, "grad_norm": 1.1171875, "learning_rate": 1.3911033097657374e-06, "loss": 0.1204, "mean_token_accuracy": 0.9721335184574127, "num_tokens": 6020712824.0, "step": 42650 }, { "entropy": 1.3603689241409302, "epoch": 1.3940126016127452, "grad_norm": 1.796875, "learning_rate": 1.3842749420131663e-06, "loss": 0.1297, "mean_token_accuracy": 0.969935257434845, "num_tokens": 6027950128.0, "step": 42700 }, { "entropy": 1.3577491450309753, "epoch": 1.3956449348699032, "grad_norm": 1.265625, "learning_rate": 1.3774583446499835e-06, "loss": 0.1383, "mean_token_accuracy": 0.9680879211425781, "num_tokens": 6035576325.0, "step": 42750 }, { "entropy": 1.349846076965332, "epoch": 1.3972772681270609, "grad_norm": 1.484375, "learning_rate": 1.3706535673343945e-06, "loss": 0.1289, "mean_token_accuracy": 0.9708491718769073, "num_tokens": 6042939083.0, "step": 42800 }, { "entropy": 1.3469208598136901, "epoch": 1.3989096013842186, "grad_norm": 1.828125, "learning_rate": 1.3638606596384973e-06, "loss": 0.1186, "mean_token_accuracy": 0.9719974470138549, "num_tokens": 6050053050.0, "step": 42850 }, { "entropy": 1.3655208253860474, "epoch": 1.4005419346413763, "grad_norm": 1.8359375, "learning_rate": 1.3570796710479174e-06, "loss": 0.13, "mean_token_accuracy": 0.9699460983276367, "num_tokens": 6057064095.0, "step": 42900 }, { "entropy": 1.351545627117157, "epoch": 1.402174267898534, "grad_norm": 2.28125, "learning_rate": 1.3503106509614553e-06, "loss": 0.1235, "mean_token_accuracy": 0.9710482954978943, "num_tokens": 6064091867.0, "step": 42950 }, { "entropy": 1.3588644528388978, "epoch": 1.403806601155692, "grad_norm": 1.8671875, "learning_rate": 1.3435536486907172e-06, "loss": 0.1234, "mean_token_accuracy": 0.9709674298763276, "num_tokens": 6071438514.0, "step": 43000 }, { "epoch": 1.403806601155692, "eval_entropy": 1.3524145444234212, "eval_loss": 0.14265932142734528, "eval_mean_token_accuracy": 0.9675256490707398, "eval_num_tokens": 6071438514.0, "eval_runtime": 753.0235, "eval_samples_per_second": 12.823, "eval_steps_per_second": 0.101, "step": 43000 }, { "entropy": 1.3578597354888915, "epoch": 1.4054389344128497, "grad_norm": 0.01470947265625, "learning_rate": 1.3368087134597663e-06, "loss": 0.1238, "mean_token_accuracy": 0.9716431427001954, "num_tokens": 6078596404.0, "step": 43050 }, { "entropy": 1.3505292820930481, "epoch": 1.4070712676700075, "grad_norm": 1.3671875, "learning_rate": 1.3300758944047536e-06, "loss": 0.1321, "mean_token_accuracy": 0.9695228207111358, "num_tokens": 6086011992.0, "step": 43100 }, { "entropy": 1.3455975699424743, "epoch": 1.4087036009271654, "grad_norm": 1.6953125, "learning_rate": 1.3233552405735694e-06, "loss": 0.1156, "mean_token_accuracy": 0.9727073276042938, "num_tokens": 6092681553.0, "step": 43150 }, { "entropy": 1.3595034289360046, "epoch": 1.4103359341843231, "grad_norm": 1.75, "learning_rate": 1.3166468009254766e-06, "loss": 0.1196, "mean_token_accuracy": 0.9722693479061126, "num_tokens": 6099527516.0, "step": 43200 }, { "entropy": 1.3427305126190185, "epoch": 1.4119682674414809, "grad_norm": 2.359375, "learning_rate": 1.309950624330764e-06, "loss": 0.1194, "mean_token_accuracy": 0.9728332602977753, "num_tokens": 6106350241.0, "step": 43250 }, { "entropy": 1.3501028728485107, "epoch": 1.4136006006986386, "grad_norm": 1.40625, "learning_rate": 1.3032667595703842e-06, "loss": 0.1259, "mean_token_accuracy": 0.9705328547954559, "num_tokens": 6113268163.0, "step": 43300 }, { "entropy": 1.3498585319519043, "epoch": 1.4152329339557963, "grad_norm": 1.328125, "learning_rate": 1.2965952553355958e-06, "loss": 0.1277, "mean_token_accuracy": 0.9707833099365234, "num_tokens": 6120281186.0, "step": 43350 }, { "entropy": 1.35530499458313, "epoch": 1.4168652672129542, "grad_norm": 2.265625, "learning_rate": 1.2899361602276175e-06, "loss": 0.1265, "mean_token_accuracy": 0.9698385155200958, "num_tokens": 6127237591.0, "step": 43400 }, { "entropy": 1.3571084594726563, "epoch": 1.418497600470112, "grad_norm": 1.390625, "learning_rate": 1.2832895227572622e-06, "loss": 0.1271, "mean_token_accuracy": 0.9701455044746399, "num_tokens": 6134551254.0, "step": 43450 }, { "entropy": 1.3430089569091797, "epoch": 1.4201299337272697, "grad_norm": 1.9453125, "learning_rate": 1.2766553913445993e-06, "loss": 0.1201, "mean_token_accuracy": 0.9717100954055786, "num_tokens": 6141418599.0, "step": 43500 }, { "epoch": 1.4201299337272697, "eval_entropy": 1.3463400856653849, "eval_loss": 0.1425597369670868, "eval_mean_token_accuracy": 0.9676672736803691, "eval_num_tokens": 6141418599.0, "eval_runtime": 753.679, "eval_samples_per_second": 12.812, "eval_steps_per_second": 0.101, "step": 43500 }, { "entropy": 1.3354617381095886, "epoch": 1.4217622669844276, "grad_norm": 1.6015625, "learning_rate": 1.2700338143185843e-06, "loss": 0.1162, "mean_token_accuracy": 0.972969708442688, "num_tokens": 6148322455.0, "step": 43550 }, { "entropy": 1.3403245830535888, "epoch": 1.4233946002415854, "grad_norm": 1.6328125, "learning_rate": 1.2634248399167203e-06, "loss": 0.1193, "mean_token_accuracy": 0.9722915697097778, "num_tokens": 6155533008.0, "step": 43600 }, { "entropy": 1.353358724117279, "epoch": 1.425026933498743, "grad_norm": 1.1953125, "learning_rate": 1.2568285162846987e-06, "loss": 0.1328, "mean_token_accuracy": 0.9696524286270142, "num_tokens": 6162774102.0, "step": 43650 }, { "entropy": 1.3451939988136292, "epoch": 1.4266592667559008, "grad_norm": 1.1875, "learning_rate": 1.2502448914760533e-06, "loss": 0.1138, "mean_token_accuracy": 0.9730928063392639, "num_tokens": 6169457314.0, "step": 43700 }, { "entropy": 1.3512369799613952, "epoch": 1.4282916000130585, "grad_norm": 2.09375, "learning_rate": 1.2436740134518094e-06, "loss": 0.1283, "mean_token_accuracy": 0.9699138104915619, "num_tokens": 6176504604.0, "step": 43750 }, { "entropy": 1.3418658518791198, "epoch": 1.4299239332702165, "grad_norm": 1.609375, "learning_rate": 1.2371159300801284e-06, "loss": 0.1169, "mean_token_accuracy": 0.9729372990131379, "num_tokens": 6183277441.0, "step": 43800 }, { "entropy": 1.3383120560646058, "epoch": 1.4315562665273742, "grad_norm": 1.4296875, "learning_rate": 1.2305706891359698e-06, "loss": 0.117, "mean_token_accuracy": 0.9732610857486725, "num_tokens": 6190472536.0, "step": 43850 }, { "entropy": 1.3513377356529235, "epoch": 1.433188599784532, "grad_norm": 1.328125, "learning_rate": 1.2240383383007325e-06, "loss": 0.135, "mean_token_accuracy": 0.9690117561817169, "num_tokens": 6197990663.0, "step": 43900 }, { "entropy": 1.3517193269729615, "epoch": 1.43482093304169, "grad_norm": 1.59375, "learning_rate": 1.2175189251619168e-06, "loss": 0.1177, "mean_token_accuracy": 0.9728011786937714, "num_tokens": 6204993365.0, "step": 43950 }, { "entropy": 1.3436356329917907, "epoch": 1.4364532662988476, "grad_norm": 1.03125, "learning_rate": 1.2110124972127686e-06, "loss": 0.1262, "mean_token_accuracy": 0.9713046276569366, "num_tokens": 6212573620.0, "step": 44000 }, { "epoch": 1.4364532662988476, "eval_entropy": 1.3463455152511596, "eval_loss": 0.14246320724487305, "eval_mean_token_accuracy": 0.9677392840385437, "eval_num_tokens": 6212573620.0, "eval_runtime": 754.1482, "eval_samples_per_second": 12.804, "eval_steps_per_second": 0.101, "step": 44000 }, { "entropy": 1.3434760403633117, "epoch": 1.4380855995560053, "grad_norm": 2.40625, "learning_rate": 1.2045191018519415e-06, "loss": 0.1169, "mean_token_accuracy": 0.9728634548187256, "num_tokens": 6219647458.0, "step": 44050 }, { "entropy": 1.3590314435958861, "epoch": 1.439717932813163, "grad_norm": 1.0625, "learning_rate": 1.1980387863831478e-06, "loss": 0.126, "mean_token_accuracy": 0.9703594601154327, "num_tokens": 6226844151.0, "step": 44100 }, { "entropy": 1.3532491779327394, "epoch": 1.4413502660703208, "grad_norm": 1.4375, "learning_rate": 1.1915715980148117e-06, "loss": 0.1309, "mean_token_accuracy": 0.9699800384044647, "num_tokens": 6234263907.0, "step": 44150 }, { "entropy": 1.349997682571411, "epoch": 1.4429825993274787, "grad_norm": 1.7109375, "learning_rate": 1.1851175838597306e-06, "loss": 0.121, "mean_token_accuracy": 0.9720881938934326, "num_tokens": 6241048484.0, "step": 44200 }, { "entropy": 1.354639277458191, "epoch": 1.4446149325846365, "grad_norm": 1.6484375, "learning_rate": 1.1786767909347268e-06, "loss": 0.1375, "mean_token_accuracy": 0.9680390894412995, "num_tokens": 6248857975.0, "step": 44250 }, { "entropy": 1.3580044388771058, "epoch": 1.4462472658417942, "grad_norm": 2.15625, "learning_rate": 1.1722492661603098e-06, "loss": 0.126, "mean_token_accuracy": 0.9711121428012848, "num_tokens": 6255829230.0, "step": 44300 }, { "entropy": 1.3534194731712341, "epoch": 1.4478795990989521, "grad_norm": 1.9296875, "learning_rate": 1.165835056360329e-06, "loss": 0.121, "mean_token_accuracy": 0.9716777575016021, "num_tokens": 6262589815.0, "step": 44350 }, { "entropy": 1.3448237705230712, "epoch": 1.4495119323561099, "grad_norm": 1.625, "learning_rate": 1.1594342082616386e-06, "loss": 0.1269, "mean_token_accuracy": 0.9700544607639313, "num_tokens": 6269556274.0, "step": 44400 }, { "entropy": 1.3512716341018676, "epoch": 1.4511442656132676, "grad_norm": 1.671875, "learning_rate": 1.1530467684937514e-06, "loss": 0.1226, "mean_token_accuracy": 0.9714391076564789, "num_tokens": 6276345824.0, "step": 44450 }, { "entropy": 1.3386855101585389, "epoch": 1.4527765988704253, "grad_norm": 2.5, "learning_rate": 1.146672783588504e-06, "loss": 0.1132, "mean_token_accuracy": 0.973590886592865, "num_tokens": 6282741770.0, "step": 44500 }, { "epoch": 1.4527765988704253, "eval_entropy": 1.3487922525405884, "eval_loss": 0.14237141609191895, "eval_mean_token_accuracy": 0.967642084757487, "eval_num_tokens": 6282741770.0, "eval_runtime": 750.7822, "eval_samples_per_second": 12.861, "eval_steps_per_second": 0.101, "step": 44500 }, { "entropy": 1.3471359300613404, "epoch": 1.454408932127583, "grad_norm": 0.82421875, "learning_rate": 1.1403122999797162e-06, "loss": 0.122, "mean_token_accuracy": 0.9717613708972931, "num_tokens": 6289684604.0, "step": 44550 }, { "entropy": 1.357615110874176, "epoch": 1.456041265384741, "grad_norm": 2.140625, "learning_rate": 1.133965364002848e-06, "loss": 0.1318, "mean_token_accuracy": 0.9699806833267212, "num_tokens": 6296718052.0, "step": 44600 }, { "entropy": 1.3517170500755311, "epoch": 1.4576735986418987, "grad_norm": 0.004852294921875, "learning_rate": 1.1276320218946737e-06, "loss": 0.1167, "mean_token_accuracy": 0.9736652266979218, "num_tokens": 6303023793.0, "step": 44650 }, { "entropy": 1.3588165473937988, "epoch": 1.4593059318990564, "grad_norm": 2.375, "learning_rate": 1.1213123197929296e-06, "loss": 0.1289, "mean_token_accuracy": 0.9699364423751831, "num_tokens": 6310502615.0, "step": 44700 }, { "entropy": 1.3603587317466737, "epoch": 1.4609382651562144, "grad_norm": 1.4375, "learning_rate": 1.1150063037359927e-06, "loss": 0.131, "mean_token_accuracy": 0.9688486230373382, "num_tokens": 6317956478.0, "step": 44750 }, { "entropy": 1.3491565418243407, "epoch": 1.4625705984133721, "grad_norm": 2.171875, "learning_rate": 1.108714019662533e-06, "loss": 0.1178, "mean_token_accuracy": 0.9722418069839478, "num_tokens": 6324906866.0, "step": 44800 }, { "entropy": 1.347842710018158, "epoch": 1.4642029316705298, "grad_norm": 2.0, "learning_rate": 1.1024355134111894e-06, "loss": 0.1182, "mean_token_accuracy": 0.9722170174121857, "num_tokens": 6331464077.0, "step": 44850 }, { "entropy": 1.3519015336036682, "epoch": 1.4658352649276876, "grad_norm": 1.296875, "learning_rate": 1.096170830720226e-06, "loss": 0.1297, "mean_token_accuracy": 0.9708062732219696, "num_tokens": 6338494070.0, "step": 44900 }, { "entropy": 1.3482877349853515, "epoch": 1.4674675981848453, "grad_norm": 1.9140625, "learning_rate": 1.0899200172272073e-06, "loss": 0.1274, "mean_token_accuracy": 0.9707257854938507, "num_tokens": 6345499457.0, "step": 44950 }, { "entropy": 1.3303207564353943, "epoch": 1.4690999314420032, "grad_norm": 1.1875, "learning_rate": 1.0836831184686621e-06, "loss": 0.1154, "mean_token_accuracy": 0.972674525976181, "num_tokens": 6352217211.0, "step": 45000 }, { "epoch": 1.4690999314420032, "eval_entropy": 1.3471796067555746, "eval_loss": 0.14247018098831177, "eval_mean_token_accuracy": 0.9676760347684225, "eval_num_tokens": 6352217211.0, "eval_runtime": 746.9306, "eval_samples_per_second": 12.928, "eval_steps_per_second": 0.102, "step": 45000 }, { "entropy": 1.3490448307991028, "epoch": 1.470732264699161, "grad_norm": 1.8125, "learning_rate": 1.0774601798797487e-06, "loss": 0.1202, "mean_token_accuracy": 0.971461181640625, "num_tokens": 6359326989.0, "step": 45050 }, { "entropy": 1.3362992668151856, "epoch": 1.4723645979563187, "grad_norm": 1.4453125, "learning_rate": 1.071251246793931e-06, "loss": 0.1305, "mean_token_accuracy": 0.9699250388145447, "num_tokens": 6366511833.0, "step": 45100 }, { "entropy": 1.3491197919845581, "epoch": 1.4739969312134766, "grad_norm": 2.046875, "learning_rate": 1.0650563644426402e-06, "loss": 0.1287, "mean_token_accuracy": 0.9701677405834198, "num_tokens": 6373790440.0, "step": 45150 }, { "entropy": 1.3424671697616577, "epoch": 1.4756292644706344, "grad_norm": 1.9375, "learning_rate": 1.0588755779549534e-06, "loss": 0.1305, "mean_token_accuracy": 0.9701132154464722, "num_tokens": 6381354563.0, "step": 45200 }, { "entropy": 1.3442249703407287, "epoch": 1.477261597727792, "grad_norm": 1.40625, "learning_rate": 1.0527089323572568e-06, "loss": 0.1235, "mean_token_accuracy": 0.9716306221485138, "num_tokens": 6388718219.0, "step": 45250 }, { "entropy": 1.3491050267219544, "epoch": 1.4788939309849498, "grad_norm": 1.8515625, "learning_rate": 1.0465564725729245e-06, "loss": 0.1337, "mean_token_accuracy": 0.9686619007587433, "num_tokens": 6396204282.0, "step": 45300 }, { "entropy": 1.358343975543976, "epoch": 1.4805262642421075, "grad_norm": 1.328125, "learning_rate": 1.040418243421989e-06, "loss": 0.1324, "mean_token_accuracy": 0.9696023035049438, "num_tokens": 6403488343.0, "step": 45350 }, { "entropy": 1.3462382221221925, "epoch": 1.4821585974992655, "grad_norm": 1.2109375, "learning_rate": 1.0342942896208105e-06, "loss": 0.1263, "mean_token_accuracy": 0.9713137638568878, "num_tokens": 6410641119.0, "step": 45400 }, { "entropy": 1.3580231857299805, "epoch": 1.4837909307564232, "grad_norm": 1.96875, "learning_rate": 1.028184655781759e-06, "loss": 0.1288, "mean_token_accuracy": 0.9706091582775116, "num_tokens": 6418060885.0, "step": 45450 }, { "entropy": 1.3437149500846863, "epoch": 1.485423264013581, "grad_norm": 1.5703125, "learning_rate": 1.0220893864128809e-06, "loss": 0.1204, "mean_token_accuracy": 0.9711262369155884, "num_tokens": 6424982813.0, "step": 45500 }, { "epoch": 1.485423264013581, "eval_entropy": 1.3490506156285604, "eval_loss": 0.14239265024662018, "eval_mean_token_accuracy": 0.9676458183924357, "eval_num_tokens": 6424982813.0, "eval_runtime": 753.7774, "eval_samples_per_second": 12.81, "eval_steps_per_second": 0.101, "step": 45500 }, { "entropy": 1.3549335837364196, "epoch": 1.4870555972707389, "grad_norm": 2.078125, "learning_rate": 1.0160085259175834e-06, "loss": 0.12, "mean_token_accuracy": 0.9724775660037994, "num_tokens": 6431726519.0, "step": 45550 }, { "entropy": 1.3539844870567321, "epoch": 1.4886879305278966, "grad_norm": 1.5703125, "learning_rate": 1.0099421185943016e-06, "loss": 0.1171, "mean_token_accuracy": 0.9727089703083038, "num_tokens": 6439000590.0, "step": 45600 }, { "entropy": 1.3505275821685792, "epoch": 1.4903202637850543, "grad_norm": 1.546875, "learning_rate": 1.0038902086361862e-06, "loss": 0.1234, "mean_token_accuracy": 0.9712493371963501, "num_tokens": 6446110346.0, "step": 45650 }, { "entropy": 1.3557030415534974, "epoch": 1.491952597042212, "grad_norm": 1.546875, "learning_rate": 9.97852840130771e-07, "loss": 0.1243, "mean_token_accuracy": 0.9712071406841278, "num_tokens": 6453417508.0, "step": 45700 }, { "entropy": 1.3410662007331848, "epoch": 1.4935849302993698, "grad_norm": 1.3671875, "learning_rate": 9.918300570596596e-07, "loss": 0.1175, "mean_token_accuracy": 0.9729030966758728, "num_tokens": 6460130049.0, "step": 45750 }, { "entropy": 1.3425062656402589, "epoch": 1.4952172635565277, "grad_norm": 1.6171875, "learning_rate": 9.858219032982019e-07, "loss": 0.1225, "mean_token_accuracy": 0.9715787386894226, "num_tokens": 6467242766.0, "step": 45800 }, { "entropy": 1.3404623532295228, "epoch": 1.4968495968136855, "grad_norm": 0.59375, "learning_rate": 9.798284226151751e-07, "loss": 0.1299, "mean_token_accuracy": 0.9696118628978729, "num_tokens": 6474698613.0, "step": 45850 }, { "entropy": 1.3416164851188659, "epoch": 1.4984819300708432, "grad_norm": 1.21875, "learning_rate": 9.738496586724644e-07, "loss": 0.1247, "mean_token_accuracy": 0.9708961296081543, "num_tokens": 6481733346.0, "step": 45900 }, { "entropy": 1.3436065912246704, "epoch": 1.5001142633280011, "grad_norm": 1.8203125, "learning_rate": 9.678856550247433e-07, "loss": 0.1197, "mean_token_accuracy": 0.9722868132591248, "num_tokens": 6488600216.0, "step": 45950 }, { "entropy": 1.3471774291992187, "epoch": 1.5017465965851589, "grad_norm": 2.28125, "learning_rate": 9.619364551191615e-07, "loss": 0.1234, "mean_token_accuracy": 0.9715406239032746, "num_tokens": 6495842641.0, "step": 46000 }, { "epoch": 1.5017465965851589, "eval_entropy": 1.3393350235621135, "eval_loss": 0.14240698516368866, "eval_mean_token_accuracy": 0.967587119738261, "eval_num_tokens": 6495842641.0, "eval_runtime": 753.9166, "eval_samples_per_second": 12.808, "eval_steps_per_second": 0.101, "step": 46000 }, { "entropy": 1.3315513157844543, "epoch": 1.5033789298423166, "grad_norm": 1.96875, "learning_rate": 9.560021022950201e-07, "loss": 0.1152, "mean_token_accuracy": 0.9735651075839996, "num_tokens": 6502730531.0, "step": 46050 }, { "entropy": 1.3440596199035644, "epoch": 1.5050112630994743, "grad_norm": 1.6484375, "learning_rate": 9.500826397834667e-07, "loss": 0.1363, "mean_token_accuracy": 0.9686136949062347, "num_tokens": 6510207647.0, "step": 46100 }, { "entropy": 1.3446049523353576, "epoch": 1.506643596356632, "grad_norm": 1.2109375, "learning_rate": 9.44178110707169e-07, "loss": 0.1165, "mean_token_accuracy": 0.972632863521576, "num_tokens": 6517212379.0, "step": 46150 }, { "entropy": 1.3464979553222656, "epoch": 1.50827592961379, "grad_norm": 1.125, "learning_rate": 9.382885580800094e-07, "loss": 0.1341, "mean_token_accuracy": 0.969519715309143, "num_tokens": 6524843034.0, "step": 46200 }, { "entropy": 1.337069320678711, "epoch": 1.5099082628709477, "grad_norm": 1.3046875, "learning_rate": 9.324140248067691e-07, "loss": 0.123, "mean_token_accuracy": 0.9713637149333954, "num_tokens": 6531941894.0, "step": 46250 }, { "entropy": 1.3362834978103637, "epoch": 1.5115405961281057, "grad_norm": 1.59375, "learning_rate": 9.265545536828111e-07, "loss": 0.1154, "mean_token_accuracy": 0.9725685477256775, "num_tokens": 6538791279.0, "step": 46300 }, { "entropy": 1.338006112575531, "epoch": 1.5131729293852634, "grad_norm": 1.5, "learning_rate": 9.207101873937768e-07, "loss": 0.1259, "mean_token_accuracy": 0.971262993812561, "num_tokens": 6545902334.0, "step": 46350 }, { "entropy": 1.3359116435050964, "epoch": 1.514805262642421, "grad_norm": 3.1875, "learning_rate": 9.14880968515266e-07, "loss": 0.1162, "mean_token_accuracy": 0.9728143513202667, "num_tokens": 6553190103.0, "step": 46400 }, { "entropy": 1.3362672972679137, "epoch": 1.5164375958995788, "grad_norm": 1.7734375, "learning_rate": 9.090669395125351e-07, "loss": 0.1155, "mean_token_accuracy": 0.9727682447433472, "num_tokens": 6559814465.0, "step": 46450 }, { "entropy": 1.3332003378868102, "epoch": 1.5180699291567366, "grad_norm": 1.546875, "learning_rate": 9.032681427401806e-07, "loss": 0.1094, "mean_token_accuracy": 0.9738853967189789, "num_tokens": 6566382565.0, "step": 46500 }, { "epoch": 1.5180699291567366, "eval_entropy": 1.3380318832397462, "eval_loss": 0.14237698912620544, "eval_mean_token_accuracy": 0.9675415523846944, "eval_num_tokens": 6566382565.0, "eval_runtime": 751.4412, "eval_samples_per_second": 12.85, "eval_steps_per_second": 0.101, "step": 46500 }, { "entropy": 1.3400082111358642, "epoch": 1.5197022624138943, "grad_norm": 2.140625, "learning_rate": 8.974846204418361e-07, "loss": 0.1245, "mean_token_accuracy": 0.9713966703414917, "num_tokens": 6573501735.0, "step": 46550 }, { "entropy": 1.3266588830947876, "epoch": 1.5213345956710522, "grad_norm": 1.2109375, "learning_rate": 8.917164147498621e-07, "loss": 0.1156, "mean_token_accuracy": 0.9723888230323792, "num_tokens": 6580331916.0, "step": 46600 }, { "entropy": 1.340979859828949, "epoch": 1.52296692892821, "grad_norm": 1.921875, "learning_rate": 8.859635676850372e-07, "loss": 0.1174, "mean_token_accuracy": 0.9720415270328522, "num_tokens": 6586882066.0, "step": 46650 }, { "entropy": 1.338642556667328, "epoch": 1.524599262185368, "grad_norm": 2.265625, "learning_rate": 8.802261211562563e-07, "loss": 0.1206, "mean_token_accuracy": 0.9713714873790741, "num_tokens": 6593693750.0, "step": 46700 }, { "entropy": 1.3478483366966247, "epoch": 1.5262315954425256, "grad_norm": 0.55078125, "learning_rate": 8.745041169602207e-07, "loss": 0.1278, "mean_token_accuracy": 0.9710940301418305, "num_tokens": 6601030060.0, "step": 46750 }, { "entropy": 1.3367830848693847, "epoch": 1.5278639286996833, "grad_norm": 2.28125, "learning_rate": 8.687975967811393e-07, "loss": 0.1235, "mean_token_accuracy": 0.9711613404750824, "num_tokens": 6607766556.0, "step": 46800 }, { "entropy": 1.3393477821350097, "epoch": 1.529496261956841, "grad_norm": 1.15625, "learning_rate": 8.631066021904173e-07, "loss": 0.1281, "mean_token_accuracy": 0.9700063776969909, "num_tokens": 6615264797.0, "step": 46850 }, { "entropy": 1.3377933168411256, "epoch": 1.5311285952139988, "grad_norm": 1.171875, "learning_rate": 8.574311746463602e-07, "loss": 0.1219, "mean_token_accuracy": 0.9712197721004486, "num_tokens": 6622625782.0, "step": 46900 }, { "entropy": 1.3495184230804442, "epoch": 1.5327609284711565, "grad_norm": 1.6640625, "learning_rate": 8.517713554938698e-07, "loss": 0.1291, "mean_token_accuracy": 0.9702865195274353, "num_tokens": 6629917201.0, "step": 46950 }, { "entropy": 1.337637755870819, "epoch": 1.5343932617283145, "grad_norm": 1.2265625, "learning_rate": 8.461271859641413e-07, "loss": 0.124, "mean_token_accuracy": 0.9718515348434448, "num_tokens": 6636921749.0, "step": 47000 }, { "epoch": 1.5343932617283145, "eval_entropy": 1.3388334194819131, "eval_loss": 0.14232522249221802, "eval_mean_token_accuracy": 0.9674970960617065, "eval_num_tokens": 6636921749.0, "eval_runtime": 752.3993, "eval_samples_per_second": 12.834, "eval_steps_per_second": 0.101, "step": 47000 }, { "entropy": 1.3416547775268555, "epoch": 1.5360255949854722, "grad_norm": 1.90625, "learning_rate": 8.404987071743628e-07, "loss": 0.1152, "mean_token_accuracy": 0.9724602663516998, "num_tokens": 6643500789.0, "step": 47050 }, { "entropy": 1.3312445497512817, "epoch": 1.5376579282426301, "grad_norm": 1.765625, "learning_rate": 8.348859601274191e-07, "loss": 0.1141, "mean_token_accuracy": 0.9736961400508881, "num_tokens": 6650321316.0, "step": 47100 }, { "entropy": 1.3447378778457642, "epoch": 1.5392902614997879, "grad_norm": 1.265625, "learning_rate": 8.292889857115906e-07, "loss": 0.1251, "mean_token_accuracy": 0.9713517308235169, "num_tokens": 6657596511.0, "step": 47150 }, { "entropy": 1.3332865810394288, "epoch": 1.5409225947569456, "grad_norm": 1.28125, "learning_rate": 8.237078247002536e-07, "loss": 0.1144, "mean_token_accuracy": 0.9731272792816162, "num_tokens": 6664455846.0, "step": 47200 }, { "entropy": 1.3360036635398864, "epoch": 1.5425549280141033, "grad_norm": 1.578125, "learning_rate": 8.181425177515887e-07, "loss": 0.1181, "mean_token_accuracy": 0.9728715085983276, "num_tokens": 6671444402.0, "step": 47250 }, { "entropy": 1.3451998877525329, "epoch": 1.544187261271261, "grad_norm": 2.203125, "learning_rate": 8.125931054082775e-07, "loss": 0.1189, "mean_token_accuracy": 0.9724408376216889, "num_tokens": 6678449907.0, "step": 47300 }, { "entropy": 1.346355800628662, "epoch": 1.5458195945284188, "grad_norm": 1.2109375, "learning_rate": 8.070596280972152e-07, "loss": 0.1311, "mean_token_accuracy": 0.9690172004699708, "num_tokens": 6685797698.0, "step": 47350 }, { "entropy": 1.341564166545868, "epoch": 1.5474519277855767, "grad_norm": 1.890625, "learning_rate": 8.01542126129208e-07, "loss": 0.1231, "mean_token_accuracy": 0.9712344872951507, "num_tokens": 6692582052.0, "step": 47400 }, { "entropy": 1.3289701747894287, "epoch": 1.5490842610427344, "grad_norm": 2.0625, "learning_rate": 7.960406396986855e-07, "loss": 0.1137, "mean_token_accuracy": 0.9735037076473236, "num_tokens": 6699347585.0, "step": 47450 }, { "entropy": 1.3448949909210206, "epoch": 1.5507165942998924, "grad_norm": 1.875, "learning_rate": 7.905552088834074e-07, "loss": 0.1267, "mean_token_accuracy": 0.9710251951217651, "num_tokens": 6706378386.0, "step": 47500 }, { "epoch": 1.5507165942998924, "eval_entropy": 1.338226900100708, "eval_loss": 0.14231520891189575, "eval_mean_token_accuracy": 0.9675397229194641, "eval_num_tokens": 6706378386.0, "eval_runtime": 753.0219, "eval_samples_per_second": 12.823, "eval_steps_per_second": 0.101, "step": 47500 }, { "entropy": 1.340337586402893, "epoch": 1.5523489275570501, "grad_norm": 1.4375, "learning_rate": 7.850858736441654e-07, "loss": 0.1177, "mean_token_accuracy": 0.9723483467102051, "num_tokens": 6713336578.0, "step": 47550 }, { "entropy": 1.3498352313041686, "epoch": 1.5539812608142078, "grad_norm": 1.7734375, "learning_rate": 7.796326738245014e-07, "loss": 0.1213, "mean_token_accuracy": 0.9714156925678253, "num_tokens": 6720699596.0, "step": 47600 }, { "entropy": 1.3324111270904542, "epoch": 1.5556135940713656, "grad_norm": 1.8046875, "learning_rate": 7.741956491504081e-07, "loss": 0.1174, "mean_token_accuracy": 0.9724169254302979, "num_tokens": 6727286916.0, "step": 47650 }, { "entropy": 1.3425724387168885, "epoch": 1.5572459273285233, "grad_norm": 1.5703125, "learning_rate": 7.687748392300481e-07, "loss": 0.1211, "mean_token_accuracy": 0.9714575302600861, "num_tokens": 6734228986.0, "step": 47700 }, { "entropy": 1.3485777735710145, "epoch": 1.558878260585681, "grad_norm": 1.734375, "learning_rate": 7.633702835534574e-07, "loss": 0.1245, "mean_token_accuracy": 0.9718271470069886, "num_tokens": 6741417194.0, "step": 47750 }, { "entropy": 1.3329823040962219, "epoch": 1.560510593842839, "grad_norm": 1.453125, "learning_rate": 7.579820214922639e-07, "loss": 0.1068, "mean_token_accuracy": 0.9753419077396392, "num_tokens": 6747898229.0, "step": 47800 }, { "entropy": 1.3319637727737428, "epoch": 1.5621429270999967, "grad_norm": 1.203125, "learning_rate": 7.526100922993989e-07, "loss": 0.1122, "mean_token_accuracy": 0.9736012244224548, "num_tokens": 6754984506.0, "step": 47850 }, { "entropy": 1.3472395992279054, "epoch": 1.5637752603571546, "grad_norm": 1.71875, "learning_rate": 7.472545351088072e-07, "loss": 0.1171, "mean_token_accuracy": 0.9723641383647919, "num_tokens": 6761747061.0, "step": 47900 }, { "entropy": 1.322529821395874, "epoch": 1.5654075936143124, "grad_norm": 1.703125, "learning_rate": 7.419153889351687e-07, "loss": 0.1112, "mean_token_accuracy": 0.9733606302738189, "num_tokens": 6768792123.0, "step": 47950 }, { "entropy": 1.338774642944336, "epoch": 1.56703992687147, "grad_norm": 1.203125, "learning_rate": 7.365926926736079e-07, "loss": 0.1298, "mean_token_accuracy": 0.969396116733551, "num_tokens": 6776660559.0, "step": 48000 }, { "epoch": 1.56703992687147, "eval_entropy": 1.3360849984486898, "eval_loss": 0.14228671789169312, "eval_mean_token_accuracy": 0.9675799965858459, "eval_num_tokens": 6776660559.0, "eval_runtime": 744.7808, "eval_samples_per_second": 12.965, "eval_steps_per_second": 0.102, "step": 48000 }, { "entropy": 1.3329459977149964, "epoch": 1.5686722601286278, "grad_norm": 1.921875, "learning_rate": 7.312864850994151e-07, "loss": 0.1107, "mean_token_accuracy": 0.9741031527519226, "num_tokens": 6783181818.0, "step": 48050 }, { "entropy": 1.3349730682373047, "epoch": 1.5703045933857855, "grad_norm": 1.421875, "learning_rate": 7.259968048677626e-07, "loss": 0.1136, "mean_token_accuracy": 0.9728020560741425, "num_tokens": 6790226377.0, "step": 48100 }, { "entropy": 1.3363687252998353, "epoch": 1.5719369266429433, "grad_norm": 1.1484375, "learning_rate": 7.207236905134222e-07, "loss": 0.1159, "mean_token_accuracy": 0.9732009255886078, "num_tokens": 6797331847.0, "step": 48150 }, { "entropy": 1.3224567222595214, "epoch": 1.5735692599001012, "grad_norm": 2.375, "learning_rate": 7.154671804504838e-07, "loss": 0.1187, "mean_token_accuracy": 0.9726608419418334, "num_tokens": 6804197080.0, "step": 48200 }, { "entropy": 1.3282025313377381, "epoch": 1.575201593157259, "grad_norm": 1.4140625, "learning_rate": 7.102273129720785e-07, "loss": 0.1171, "mean_token_accuracy": 0.9726303327083587, "num_tokens": 6811554275.0, "step": 48250 }, { "entropy": 1.3339614725112916, "epoch": 1.5768339264144169, "grad_norm": 1.4765625, "learning_rate": 7.050041262500963e-07, "loss": 0.1211, "mean_token_accuracy": 0.9716296088695526, "num_tokens": 6818575585.0, "step": 48300 }, { "entropy": 1.3479553842544556, "epoch": 1.5784662596715746, "grad_norm": 1.25, "learning_rate": 6.99797658334911e-07, "loss": 0.1297, "mean_token_accuracy": 0.9702737581729889, "num_tokens": 6825774443.0, "step": 48350 }, { "entropy": 1.3463473081588746, "epoch": 1.5800985929287323, "grad_norm": 1.4296875, "learning_rate": 6.946079471551018e-07, "loss": 0.1289, "mean_token_accuracy": 0.9703529167175293, "num_tokens": 6833071654.0, "step": 48400 }, { "entropy": 1.3393766927719115, "epoch": 1.58173092618589, "grad_norm": 2.0, "learning_rate": 6.894350305171747e-07, "loss": 0.1196, "mean_token_accuracy": 0.9719527661800385, "num_tokens": 6840009616.0, "step": 48450 }, { "entropy": 1.333970193862915, "epoch": 1.5833632594430478, "grad_norm": 1.859375, "learning_rate": 6.842789461052923e-07, "loss": 0.1157, "mean_token_accuracy": 0.9726070737838746, "num_tokens": 6847179809.0, "step": 48500 }, { "epoch": 1.5833632594430478, "eval_entropy": 1.3332714064915976, "eval_loss": 0.14229924976825714, "eval_mean_token_accuracy": 0.9675857615470886, "eval_num_tokens": 6847179809.0, "eval_runtime": 747.985, "eval_samples_per_second": 12.909, "eval_steps_per_second": 0.102, "step": 48500 }, { "entropy": 1.3204735660552978, "epoch": 1.5849955927002055, "grad_norm": 2.34375, "learning_rate": 6.791397314809928e-07, "loss": 0.107, "mean_token_accuracy": 0.9743911874294281, "num_tokens": 6853704094.0, "step": 48550 }, { "entropy": 1.3412930655479431, "epoch": 1.5866279259573635, "grad_norm": 2.0, "learning_rate": 6.740174240829229e-07, "loss": 0.1119, "mean_token_accuracy": 0.973388170003891, "num_tokens": 6860416510.0, "step": 48600 }, { "entropy": 1.3327165865898132, "epoch": 1.5882602592145212, "grad_norm": 1.7421875, "learning_rate": 6.689120612265592e-07, "loss": 0.1166, "mean_token_accuracy": 0.9722766876220703, "num_tokens": 6867304006.0, "step": 48650 }, { "entropy": 1.3274698781967162, "epoch": 1.5898925924716791, "grad_norm": 1.3984375, "learning_rate": 6.638236801039406e-07, "loss": 0.1179, "mean_token_accuracy": 0.9724640393257141, "num_tokens": 6874107973.0, "step": 48700 }, { "entropy": 1.3326347541809083, "epoch": 1.5915249257288369, "grad_norm": 1.671875, "learning_rate": 6.587523177833969e-07, "loss": 0.119, "mean_token_accuracy": 0.9718975865840912, "num_tokens": 6881463304.0, "step": 48750 }, { "entropy": 1.3314825320243835, "epoch": 1.5931572589859946, "grad_norm": 1.3828125, "learning_rate": 6.536980112092748e-07, "loss": 0.1296, "mean_token_accuracy": 0.970567889213562, "num_tokens": 6888936324.0, "step": 48800 }, { "entropy": 1.335792977809906, "epoch": 1.5947895922431523, "grad_norm": 1.3828125, "learning_rate": 6.486607972016746e-07, "loss": 0.1091, "mean_token_accuracy": 0.9739424622058869, "num_tokens": 6895517659.0, "step": 48850 }, { "entropy": 1.3427368450164794, "epoch": 1.59642192550031, "grad_norm": 1.765625, "learning_rate": 6.436407124561761e-07, "loss": 0.12, "mean_token_accuracy": 0.9717347931861877, "num_tokens": 6902937758.0, "step": 48900 }, { "entropy": 1.3364178919792176, "epoch": 1.5980542587574678, "grad_norm": 1.5, "learning_rate": 6.386377935435774e-07, "loss": 0.1177, "mean_token_accuracy": 0.9717580342292785, "num_tokens": 6909650064.0, "step": 48950 }, { "entropy": 1.3345691514015199, "epoch": 1.5996865920146257, "grad_norm": 1.4296875, "learning_rate": 6.336520769096215e-07, "loss": 0.1242, "mean_token_accuracy": 0.9705976390838623, "num_tokens": 6916954844.0, "step": 49000 }, { "epoch": 1.5996865920146257, "eval_entropy": 1.3313024059931438, "eval_loss": 0.14232422411441803, "eval_mean_token_accuracy": 0.9676321744918823, "eval_num_tokens": 6916954844.0, "eval_runtime": 750.8283, "eval_samples_per_second": 12.86, "eval_steps_per_second": 0.101, "step": 49000 }, { "entropy": 1.3268843412399292, "epoch": 1.6013189252717834, "grad_norm": 1.9921875, "learning_rate": 6.286835988747385e-07, "loss": 0.12, "mean_token_accuracy": 0.9719444465637207, "num_tokens": 6924452826.0, "step": 49050 }, { "entropy": 1.3368324255943298, "epoch": 1.6029512585289414, "grad_norm": 1.9140625, "learning_rate": 6.237323956337755e-07, "loss": 0.1192, "mean_token_accuracy": 0.9717724549770356, "num_tokens": 6931690729.0, "step": 49100 }, { "entropy": 1.3325298118591309, "epoch": 1.604583591786099, "grad_norm": 1.4375, "learning_rate": 6.18798503255733e-07, "loss": 0.1192, "mean_token_accuracy": 0.9713486981391907, "num_tokens": 6938882010.0, "step": 49150 }, { "entropy": 1.339358766078949, "epoch": 1.6062159250432568, "grad_norm": 1.2578125, "learning_rate": 6.138819576835056e-07, "loss": 0.1148, "mean_token_accuracy": 0.9726817321777343, "num_tokens": 6945842260.0, "step": 49200 }, { "entropy": 1.343044672012329, "epoch": 1.6078482583004146, "grad_norm": 1.375, "learning_rate": 6.089827947336176e-07, "loss": 0.1264, "mean_token_accuracy": 0.9707775366306305, "num_tokens": 6953111267.0, "step": 49250 }, { "entropy": 1.3318463802337646, "epoch": 1.6094805915575723, "grad_norm": 1.078125, "learning_rate": 6.041010500959636e-07, "loss": 0.1129, "mean_token_accuracy": 0.9740157461166382, "num_tokens": 6960306994.0, "step": 49300 }, { "entropy": 1.329837028980255, "epoch": 1.6111129248147302, "grad_norm": 1.265625, "learning_rate": 5.992367593335453e-07, "loss": 0.1108, "mean_token_accuracy": 0.9734898245334626, "num_tokens": 6966883891.0, "step": 49350 }, { "entropy": 1.3326479887962341, "epoch": 1.612745258071888, "grad_norm": 1.4140625, "learning_rate": 5.943899578822175e-07, "loss": 0.1136, "mean_token_accuracy": 0.9735347747802734, "num_tokens": 6973945682.0, "step": 49400 }, { "entropy": 1.326375277042389, "epoch": 1.614377591329046, "grad_norm": 1.046875, "learning_rate": 5.895606810504245e-07, "loss": 0.1126, "mean_token_accuracy": 0.9733131611347199, "num_tokens": 6980960627.0, "step": 49450 }, { "entropy": 1.3266846776008605, "epoch": 1.6160099245862036, "grad_norm": 1.625, "learning_rate": 5.847489640189483e-07, "loss": 0.1138, "mean_token_accuracy": 0.9726812386512756, "num_tokens": 6987961577.0, "step": 49500 }, { "epoch": 1.6160099245862036, "eval_entropy": 1.327593413988749, "eval_loss": 0.1423512101173401, "eval_mean_token_accuracy": 0.9675911315282186, "eval_num_tokens": 6987961577.0, "eval_runtime": 752.8917, "eval_samples_per_second": 12.825, "eval_steps_per_second": 0.101, "step": 49500 }, { "entropy": 1.3377934908866882, "epoch": 1.6176422578433614, "grad_norm": 1.21875, "learning_rate": 5.799548418406465e-07, "loss": 0.1259, "mean_token_accuracy": 0.9708180844783783, "num_tokens": 6995561320.0, "step": 49550 }, { "entropy": 1.329156894683838, "epoch": 1.619274591100519, "grad_norm": 0.10498046875, "learning_rate": 5.751783494402026e-07, "loss": 0.1231, "mean_token_accuracy": 0.9713264811038971, "num_tokens": 7002493076.0, "step": 49600 }, { "entropy": 1.3364929604530333, "epoch": 1.6209069243576768, "grad_norm": 1.9609375, "learning_rate": 5.704195216138692e-07, "loss": 0.1268, "mean_token_accuracy": 0.9701859080791473, "num_tokens": 7010022324.0, "step": 49650 }, { "entropy": 1.3322447371482848, "epoch": 1.6225392576148345, "grad_norm": 0.0128173828125, "learning_rate": 5.656783930292111e-07, "loss": 0.1099, "mean_token_accuracy": 0.9747293889522552, "num_tokens": 7016790835.0, "step": 49700 }, { "entropy": 1.3278017139434815, "epoch": 1.6241715908719925, "grad_norm": 1.296875, "learning_rate": 5.609549982248599e-07, "loss": 0.1191, "mean_token_accuracy": 0.9724988090991974, "num_tokens": 7023902551.0, "step": 49750 }, { "entropy": 1.3292198777198792, "epoch": 1.6258039241291502, "grad_norm": 1.2734375, "learning_rate": 5.562493716102552e-07, "loss": 0.1221, "mean_token_accuracy": 0.9709643149375915, "num_tokens": 7031377627.0, "step": 49800 }, { "entropy": 1.324635624885559, "epoch": 1.6274362573863081, "grad_norm": 1.515625, "learning_rate": 5.515615474653998e-07, "loss": 0.124, "mean_token_accuracy": 0.9710781908035279, "num_tokens": 7039286978.0, "step": 49850 }, { "entropy": 1.3296051907539368, "epoch": 1.6290685906434659, "grad_norm": 1.2734375, "learning_rate": 5.46891559940605e-07, "loss": 0.108, "mean_token_accuracy": 0.9743620455265045, "num_tokens": 7046170706.0, "step": 49900 }, { "entropy": 1.3330603170394897, "epoch": 1.6307009239006236, "grad_norm": 1.3828125, "learning_rate": 5.422394430562457e-07, "loss": 0.1062, "mean_token_accuracy": 0.9752356350421906, "num_tokens": 7052934305.0, "step": 49950 }, { "entropy": 1.3311968517303467, "epoch": 1.6323332571577813, "grad_norm": 1.1796875, "learning_rate": 5.376052307025119e-07, "loss": 0.1239, "mean_token_accuracy": 0.9706245791912079, "num_tokens": 7060043084.0, "step": 50000 }, { "epoch": 1.6323332571577813, "eval_entropy": 1.3246519072850544, "eval_loss": 0.14238578081130981, "eval_mean_token_accuracy": 0.9675867708524069, "eval_num_tokens": 7060043084.0, "eval_runtime": 752.4961, "eval_samples_per_second": 12.832, "eval_steps_per_second": 0.101, "step": 50000 }, { "entropy": 1.3305332589149474, "epoch": 1.633965590414939, "grad_norm": 1.96875, "learning_rate": 5.329889566391578e-07, "loss": 0.1144, "mean_token_accuracy": 0.9729759168624877, "num_tokens": 7066947331.0, "step": 50050 }, { "entropy": 1.3186654925346375, "epoch": 1.6355979236720968, "grad_norm": 2.109375, "learning_rate": 5.283906544952627e-07, "loss": 0.1132, "mean_token_accuracy": 0.9731574881076813, "num_tokens": 7074008041.0, "step": 50100 }, { "entropy": 1.318843502998352, "epoch": 1.6372302569292547, "grad_norm": 1.5703125, "learning_rate": 5.238103577689788e-07, "loss": 0.1161, "mean_token_accuracy": 0.9729295611381531, "num_tokens": 7081071001.0, "step": 50150 }, { "entropy": 1.3307886505126953, "epoch": 1.6388625901864124, "grad_norm": 1.671875, "learning_rate": 5.192480998272943e-07, "loss": 0.1142, "mean_token_accuracy": 0.972575945854187, "num_tokens": 7088102191.0, "step": 50200 }, { "entropy": 1.335254201889038, "epoch": 1.6404949234435704, "grad_norm": 1.0390625, "learning_rate": 5.147039139057831e-07, "loss": 0.1271, "mean_token_accuracy": 0.9703405356407165, "num_tokens": 7095646828.0, "step": 50250 }, { "entropy": 1.3202842998504638, "epoch": 1.6421272567007281, "grad_norm": 2.125, "learning_rate": 5.101778331083691e-07, "loss": 0.1085, "mean_token_accuracy": 0.9740463018417358, "num_tokens": 7102740051.0, "step": 50300 }, { "entropy": 1.3307878541946412, "epoch": 1.6437595899578858, "grad_norm": 2.0625, "learning_rate": 5.05669890407081e-07, "loss": 0.1114, "mean_token_accuracy": 0.9739047718048096, "num_tokens": 7109640526.0, "step": 50350 }, { "entropy": 1.3166346144676209, "epoch": 1.6453919232150436, "grad_norm": 0.08984375, "learning_rate": 5.011801186418147e-07, "loss": 0.1082, "mean_token_accuracy": 0.974234766960144, "num_tokens": 7116612724.0, "step": 50400 }, { "entropy": 1.324273819923401, "epoch": 1.6470242564722013, "grad_norm": 1.75, "learning_rate": 4.967085505200896e-07, "loss": 0.1145, "mean_token_accuracy": 0.9735607969760894, "num_tokens": 7123679576.0, "step": 50450 }, { "entropy": 1.3159007930755615, "epoch": 1.648656589729359, "grad_norm": 2.140625, "learning_rate": 4.922552186168168e-07, "loss": 0.1175, "mean_token_accuracy": 0.9724935472011567, "num_tokens": 7130980530.0, "step": 50500 }, { "epoch": 1.648656589729359, "eval_entropy": 1.324186561902364, "eval_loss": 0.14239400625228882, "eval_mean_token_accuracy": 0.9676336812973022, "eval_num_tokens": 7130980530.0, "eval_runtime": 748.1515, "eval_samples_per_second": 12.906, "eval_steps_per_second": 0.102, "step": 50500 }, { "entropy": 1.320852587223053, "epoch": 1.650288922986517, "grad_norm": 1.2421875, "learning_rate": 4.878201553740573e-07, "loss": 0.1148, "mean_token_accuracy": 0.9720003747940064, "num_tokens": 7137814687.0, "step": 50550 }, { "entropy": 1.3245435237884522, "epoch": 1.6519212562436747, "grad_norm": 2.34375, "learning_rate": 4.834033931007857e-07, "loss": 0.1079, "mean_token_accuracy": 0.9746451807022095, "num_tokens": 7144119513.0, "step": 50600 }, { "entropy": 1.317584047317505, "epoch": 1.6535535895008326, "grad_norm": 1.5390625, "learning_rate": 4.790049639726581e-07, "loss": 0.1097, "mean_token_accuracy": 0.9739763534069061, "num_tokens": 7150741274.0, "step": 50650 }, { "entropy": 1.3234117150306701, "epoch": 1.6551859227579904, "grad_norm": 0.9921875, "learning_rate": 4.746249000317725e-07, "loss": 0.1069, "mean_token_accuracy": 0.9751713788509369, "num_tokens": 7157786331.0, "step": 50700 }, { "entropy": 1.3205085873603821, "epoch": 1.656818256015148, "grad_norm": 1.7578125, "learning_rate": 4.702632331864422e-07, "loss": 0.1046, "mean_token_accuracy": 0.9752000343799591, "num_tokens": 7164501001.0, "step": 50750 }, { "entropy": 1.3175451397895812, "epoch": 1.6584505892723058, "grad_norm": 1.6171875, "learning_rate": 4.6591999521095563e-07, "loss": 0.0972, "mean_token_accuracy": 0.9763833940029144, "num_tokens": 7170583305.0, "step": 50800 }, { "entropy": 1.326640043258667, "epoch": 1.6600829225294635, "grad_norm": 1.40625, "learning_rate": 4.6159521774535153e-07, "loss": 0.1176, "mean_token_accuracy": 0.9723085272312164, "num_tokens": 7177641823.0, "step": 50850 }, { "entropy": 1.3212744474411011, "epoch": 1.6617152557866213, "grad_norm": 1.578125, "learning_rate": 4.572889322951863e-07, "loss": 0.1152, "mean_token_accuracy": 0.972984424829483, "num_tokens": 7184801732.0, "step": 50900 }, { "entropy": 1.317273302078247, "epoch": 1.6633475890437792, "grad_norm": 2.78125, "learning_rate": 4.530011702313006e-07, "loss": 0.1081, "mean_token_accuracy": 0.9745338428020477, "num_tokens": 7191485350.0, "step": 50950 }, { "entropy": 1.3148117685317993, "epoch": 1.664979922300937, "grad_norm": 1.4375, "learning_rate": 4.487319627895976e-07, "loss": 0.1132, "mean_token_accuracy": 0.9734457182884216, "num_tokens": 7198363082.0, "step": 51000 }, { "epoch": 1.664979922300937, "eval_entropy": 1.3203301127751668, "eval_loss": 0.14241376519203186, "eval_mean_token_accuracy": 0.9676187674204508, "eval_num_tokens": 7198363082.0, "eval_runtime": 750.908, "eval_samples_per_second": 12.859, "eval_steps_per_second": 0.101, "step": 51000 }, { "entropy": 1.3135675048828126, "epoch": 1.666612255558095, "grad_norm": 0.0026092529296875, "learning_rate": 4.4448134107080895e-07, "loss": 0.1087, "mean_token_accuracy": 0.9744446206092835, "num_tokens": 7205394224.0, "step": 51050 }, { "entropy": 1.330612359046936, "epoch": 1.6682445888152526, "grad_norm": 2.40625, "learning_rate": 4.4024933604027495e-07, "loss": 0.118, "mean_token_accuracy": 0.9718951296806335, "num_tokens": 7212621966.0, "step": 51100 }, { "entropy": 1.3243393778800965, "epoch": 1.6698769220724103, "grad_norm": 2.34375, "learning_rate": 4.360359785277107e-07, "loss": 0.1136, "mean_token_accuracy": 0.9730891573429108, "num_tokens": 7219256110.0, "step": 51150 }, { "entropy": 1.321114592552185, "epoch": 1.671509255329568, "grad_norm": 1.6796875, "learning_rate": 4.3184129922699e-07, "loss": 0.1132, "mean_token_accuracy": 0.9732060301303863, "num_tokens": 7226320848.0, "step": 51200 }, { "entropy": 1.3259812951087953, "epoch": 1.6731415885867258, "grad_norm": 1.8984375, "learning_rate": 4.276653286959168e-07, "loss": 0.1046, "mean_token_accuracy": 0.9755737960338593, "num_tokens": 7233157988.0, "step": 51250 }, { "entropy": 1.3230745482444763, "epoch": 1.6747739218438835, "grad_norm": 2.09375, "learning_rate": 4.2350809735600106e-07, "loss": 0.112, "mean_token_accuracy": 0.9729019176959991, "num_tokens": 7240476512.0, "step": 51300 }, { "entropy": 1.3131797289848328, "epoch": 1.6764062551010415, "grad_norm": 1.171875, "learning_rate": 4.1936963549224396e-07, "loss": 0.1124, "mean_token_accuracy": 0.9734921300411224, "num_tokens": 7247352561.0, "step": 51350 }, { "entropy": 1.306911015510559, "epoch": 1.6780385883581992, "grad_norm": 1.25, "learning_rate": 4.1524997325290903e-07, "loss": 0.1107, "mean_token_accuracy": 0.9738042771816253, "num_tokens": 7254496106.0, "step": 51400 }, { "entropy": 1.3271653127670289, "epoch": 1.6796709216153571, "grad_norm": 0.00311279296875, "learning_rate": 4.1114914064930875e-07, "loss": 0.1095, "mean_token_accuracy": 0.973537621498108, "num_tokens": 7261315934.0, "step": 51450 }, { "entropy": 1.3219536185264587, "epoch": 1.6813032548725149, "grad_norm": 2.015625, "learning_rate": 4.0706716755558326e-07, "loss": 0.1049, "mean_token_accuracy": 0.9747460389137268, "num_tokens": 7268210775.0, "step": 51500 }, { "epoch": 1.6813032548725149, "eval_entropy": 1.3198106654485067, "eval_loss": 0.14244017004966736, "eval_mean_token_accuracy": 0.9676569310824076, "eval_num_tokens": 7268210775.0, "eval_runtime": 753.6731, "eval_samples_per_second": 12.812, "eval_steps_per_second": 0.101, "step": 51500 }, { "entropy": 1.3197919082641603, "epoch": 1.6829355881296726, "grad_norm": 1.890625, "learning_rate": 4.0300408370848365e-07, "loss": 0.1043, "mean_token_accuracy": 0.9753884637355804, "num_tokens": 7274845487.0, "step": 51550 }, { "entropy": 1.3303123378753663, "epoch": 1.6845679213868303, "grad_norm": 1.3359375, "learning_rate": 3.9895991870715264e-07, "loss": 0.1181, "mean_token_accuracy": 0.9722198081016541, "num_tokens": 7282313836.0, "step": 51600 }, { "entropy": 1.3250636410713197, "epoch": 1.686200254643988, "grad_norm": 1.3203125, "learning_rate": 3.9493470201291404e-07, "loss": 0.109, "mean_token_accuracy": 0.9739108157157897, "num_tokens": 7289309163.0, "step": 51650 }, { "entropy": 1.3373154950141908, "epoch": 1.6878325879011458, "grad_norm": 1.171875, "learning_rate": 3.909284629490526e-07, "loss": 0.1186, "mean_token_accuracy": 0.9716354882717133, "num_tokens": 7296551436.0, "step": 51700 }, { "entropy": 1.3240708827972412, "epoch": 1.6894649211583037, "grad_norm": 1.1640625, "learning_rate": 3.8694123070060473e-07, "loss": 0.1103, "mean_token_accuracy": 0.9735696887969971, "num_tokens": 7303811515.0, "step": 51750 }, { "entropy": 1.3273040246963501, "epoch": 1.6910972544154614, "grad_norm": 1.9140625, "learning_rate": 3.8297303431414455e-07, "loss": 0.1176, "mean_token_accuracy": 0.9723230707645416, "num_tokens": 7310596062.0, "step": 51800 }, { "entropy": 1.3321073579788207, "epoch": 1.6927295876726194, "grad_norm": 1.59375, "learning_rate": 3.7902390269756883e-07, "loss": 0.1132, "mean_token_accuracy": 0.9733636856079102, "num_tokens": 7318010738.0, "step": 51850 }, { "entropy": 1.3346837186813354, "epoch": 1.694361920929777, "grad_norm": 1.4296875, "learning_rate": 3.75093864619894e-07, "loss": 0.118, "mean_token_accuracy": 0.9715818917751312, "num_tokens": 7325178822.0, "step": 51900 }, { "entropy": 1.327819790840149, "epoch": 1.6959942541869348, "grad_norm": 1.4609375, "learning_rate": 3.7118294871103764e-07, "loss": 0.1129, "mean_token_accuracy": 0.9730360591411591, "num_tokens": 7332489545.0, "step": 51950 }, { "entropy": 1.3169277691841126, "epoch": 1.6976265874440926, "grad_norm": 1.828125, "learning_rate": 3.672911834616175e-07, "loss": 0.1027, "mean_token_accuracy": 0.9758607912063598, "num_tokens": 7339208562.0, "step": 52000 }, { "epoch": 1.6976265874440926, "eval_entropy": 1.3195398267110188, "eval_loss": 0.1424235850572586, "eval_mean_token_accuracy": 0.9676573673884074, "eval_num_tokens": 7339208562.0, "eval_runtime": 751.2637, "eval_samples_per_second": 12.853, "eval_steps_per_second": 0.101, "step": 52000 }, { "entropy": 1.3143286561965943, "epoch": 1.6992589207012503, "grad_norm": 1.6484375, "learning_rate": 3.6341859722273907e-07, "loss": 0.1075, "mean_token_accuracy": 0.9739359652996064, "num_tokens": 7346374510.0, "step": 52050 }, { "entropy": 1.323881621360779, "epoch": 1.700891253958408, "grad_norm": 1.4375, "learning_rate": 3.5956521820579126e-07, "loss": 0.0998, "mean_token_accuracy": 0.9760903561115265, "num_tokens": 7353032386.0, "step": 52100 }, { "entropy": 1.3202632975578308, "epoch": 1.702523587215566, "grad_norm": 1.640625, "learning_rate": 3.5573107448224085e-07, "loss": 0.118, "mean_token_accuracy": 0.9715847325325012, "num_tokens": 7360252648.0, "step": 52150 }, { "entropy": 1.330101602077484, "epoch": 1.7041559204727237, "grad_norm": 0.96484375, "learning_rate": 3.519161939834264e-07, "loss": 0.1136, "mean_token_accuracy": 0.9737802124023438, "num_tokens": 7367323821.0, "step": 52200 }, { "entropy": 1.3269778847694398, "epoch": 1.7057882537298816, "grad_norm": 0.78515625, "learning_rate": 3.4812060450035723e-07, "loss": 0.1091, "mean_token_accuracy": 0.9739231073856354, "num_tokens": 7374681816.0, "step": 52250 }, { "entropy": 1.3149812078475953, "epoch": 1.7074205869870394, "grad_norm": 1.3359375, "learning_rate": 3.44344333683508e-07, "loss": 0.1049, "mean_token_accuracy": 0.9752970814704895, "num_tokens": 7381861290.0, "step": 52300 }, { "entropy": 1.31980233669281, "epoch": 1.709052920244197, "grad_norm": 2.078125, "learning_rate": 3.4058740904262077e-07, "loss": 0.1137, "mean_token_accuracy": 0.9733495855331421, "num_tokens": 7389307478.0, "step": 52350 }, { "entropy": 1.3273230743408204, "epoch": 1.7106852535013548, "grad_norm": 1.8203125, "learning_rate": 3.3684985794650025e-07, "loss": 0.1119, "mean_token_accuracy": 0.9732711517810821, "num_tokens": 7396166622.0, "step": 52400 }, { "entropy": 1.3064412307739257, "epoch": 1.7123175867585125, "grad_norm": 1.6875, "learning_rate": 3.3313170762281964e-07, "loss": 0.1004, "mean_token_accuracy": 0.9755911242961883, "num_tokens": 7403084742.0, "step": 52450 }, { "entropy": 1.3238463592529297, "epoch": 1.7139499200156703, "grad_norm": 2.671875, "learning_rate": 3.294329851579181e-07, "loss": 0.101, "mean_token_accuracy": 0.9754888868331909, "num_tokens": 7409894593.0, "step": 52500 }, { "epoch": 1.7139499200156703, "eval_entropy": 1.3191882546742757, "eval_loss": 0.14246180653572083, "eval_mean_token_accuracy": 0.9676294851303101, "eval_num_tokens": 7409894593.0, "eval_runtime": 749.671, "eval_samples_per_second": 12.88, "eval_steps_per_second": 0.101, "step": 52500 }, { "entropy": 1.3101821112632752, "epoch": 1.7155822532728282, "grad_norm": 1.1796875, "learning_rate": 3.25753717496604e-07, "loss": 0.0956, "mean_token_accuracy": 0.9765710198879242, "num_tokens": 7416484075.0, "step": 52550 }, { "entropy": 1.3191359090805053, "epoch": 1.717214586529986, "grad_norm": 1.359375, "learning_rate": 3.220939314419614e-07, "loss": 0.106, "mean_token_accuracy": 0.9754664206504822, "num_tokens": 7423584170.0, "step": 52600 }, { "entropy": 1.3270062279701234, "epoch": 1.7188469197871439, "grad_norm": 1.359375, "learning_rate": 3.1845365365515136e-07, "loss": 0.1058, "mean_token_accuracy": 0.9742505669593811, "num_tokens": 7430705466.0, "step": 52650 }, { "entropy": 1.308781328201294, "epoch": 1.7204792530443016, "grad_norm": 1.515625, "learning_rate": 3.14832910655221e-07, "loss": 0.101, "mean_token_accuracy": 0.9755427074432373, "num_tokens": 7437413037.0, "step": 52700 }, { "entropy": 1.3295643472671508, "epoch": 1.7221115863014593, "grad_norm": 1.3203125, "learning_rate": 3.1123172881890593e-07, "loss": 0.1149, "mean_token_accuracy": 0.9725350320339203, "num_tokens": 7445324755.0, "step": 52750 }, { "entropy": 1.306700224876404, "epoch": 1.723743919558617, "grad_norm": 1.734375, "learning_rate": 3.076501343804432e-07, "loss": 0.1028, "mean_token_accuracy": 0.9747404766082763, "num_tokens": 7452440110.0, "step": 52800 }, { "entropy": 1.320225818157196, "epoch": 1.7253762528157748, "grad_norm": 2.015625, "learning_rate": 3.0408815343137576e-07, "loss": 0.1097, "mean_token_accuracy": 0.9736387383937836, "num_tokens": 7459466767.0, "step": 52850 }, { "entropy": 1.319351954460144, "epoch": 1.7270085860729325, "grad_norm": 2.25, "learning_rate": 3.005458119203661e-07, "loss": 0.0996, "mean_token_accuracy": 0.9759821879863739, "num_tokens": 7466190850.0, "step": 52900 }, { "entropy": 1.3243736958503722, "epoch": 1.7286409193300905, "grad_norm": 1.0625, "learning_rate": 2.970231356530037e-07, "loss": 0.1178, "mean_token_accuracy": 0.9724557065963745, "num_tokens": 7473713575.0, "step": 52950 }, { "entropy": 1.3095801281929016, "epoch": 1.7302732525872482, "grad_norm": 2.3125, "learning_rate": 2.935201502916196e-07, "loss": 0.104, "mean_token_accuracy": 0.975416682958603, "num_tokens": 7480846261.0, "step": 53000 }, { "epoch": 1.7302732525872482, "eval_entropy": 1.3154603624343872, "eval_loss": 0.1425192952156067, "eval_mean_token_accuracy": 0.9676680334409078, "eval_num_tokens": 7480846261.0, "eval_runtime": 749.6932, "eval_samples_per_second": 12.88, "eval_steps_per_second": 0.101, "step": 53000 }, { "entropy": 1.327847077846527, "epoch": 1.7319055858444061, "grad_norm": 1.890625, "learning_rate": 2.9003688135509996e-07, "loss": 0.1027, "mean_token_accuracy": 0.9751829147338867, "num_tokens": 7487902389.0, "step": 53050 }, { "entropy": 1.3144252371788026, "epoch": 1.7335379191015639, "grad_norm": 1.4140625, "learning_rate": 2.86573354218696e-07, "loss": 0.0943, "mean_token_accuracy": 0.9771701264381408, "num_tokens": 7494511451.0, "step": 53100 }, { "entropy": 1.317789807319641, "epoch": 1.7351702523587216, "grad_norm": 2.09375, "learning_rate": 2.8312959411384496e-07, "loss": 0.1057, "mean_token_accuracy": 0.9748926043510437, "num_tokens": 7501653002.0, "step": 53150 }, { "entropy": 1.3131379342079164, "epoch": 1.7368025856158793, "grad_norm": 1.5234375, "learning_rate": 2.7970562612798003e-07, "loss": 0.0964, "mean_token_accuracy": 0.9761436748504638, "num_tokens": 7508205530.0, "step": 53200 }, { "entropy": 1.3117465686798095, "epoch": 1.738434918873037, "grad_norm": 1.203125, "learning_rate": 2.7630147520435454e-07, "loss": 0.0973, "mean_token_accuracy": 0.9763562536239624, "num_tokens": 7514970855.0, "step": 53250 }, { "entropy": 1.3275910449028014, "epoch": 1.7400672521301948, "grad_norm": 1.34375, "learning_rate": 2.729171661418536e-07, "loss": 0.1058, "mean_token_accuracy": 0.9749761927127838, "num_tokens": 7522035646.0, "step": 53300 }, { "entropy": 1.3121574544906616, "epoch": 1.7416995853873527, "grad_norm": 1.109375, "learning_rate": 2.695527235948176e-07, "loss": 0.1045, "mean_token_accuracy": 0.9752842879295349, "num_tokens": 7529319921.0, "step": 53350 }, { "entropy": 1.3087862515449524, "epoch": 1.7433319186445104, "grad_norm": 1.21875, "learning_rate": 2.662081720728621e-07, "loss": 0.0898, "mean_token_accuracy": 0.9782419979572297, "num_tokens": 7535926578.0, "step": 53400 }, { "entropy": 1.3212140440940856, "epoch": 1.7449642519016684, "grad_norm": 1.4453125, "learning_rate": 2.6288353594069716e-07, "loss": 0.0961, "mean_token_accuracy": 0.9764723992347717, "num_tokens": 7542728374.0, "step": 53450 }, { "entropy": 1.3193033647537231, "epoch": 1.746596585158826, "grad_norm": 1.265625, "learning_rate": 2.595788394179528e-07, "loss": 0.1046, "mean_token_accuracy": 0.9745003497600555, "num_tokens": 7549497199.0, "step": 53500 }, { "epoch": 1.746596585158826, "eval_entropy": 1.316735652287801, "eval_loss": 0.14255692064762115, "eval_mean_token_accuracy": 0.9676073582967123, "eval_num_tokens": 7549497199.0, "eval_runtime": 749.254, "eval_samples_per_second": 12.887, "eval_steps_per_second": 0.101, "step": 53500 }, { "entropy": 1.3119576716423034, "epoch": 1.7482289184159838, "grad_norm": 1.3046875, "learning_rate": 2.562941065789989e-07, "loss": 0.1075, "mean_token_accuracy": 0.9740070915222168, "num_tokens": 7556760300.0, "step": 53550 }, { "entropy": 1.323666477203369, "epoch": 1.7498612516731415, "grad_norm": 1.40625, "learning_rate": 2.530293613527752e-07, "loss": 0.1049, "mean_token_accuracy": 0.9741265332698822, "num_tokens": 7563873501.0, "step": 53600 }, { "entropy": 1.313588421344757, "epoch": 1.7514935849302993, "grad_norm": 2.484375, "learning_rate": 2.497846275226101e-07, "loss": 0.0999, "mean_token_accuracy": 0.976412239074707, "num_tokens": 7570815590.0, "step": 53650 }, { "entropy": 1.310178370475769, "epoch": 1.753125918187457, "grad_norm": 1.15625, "learning_rate": 2.4655992872605383e-07, "loss": 0.0919, "mean_token_accuracy": 0.9780379617214203, "num_tokens": 7577519323.0, "step": 53700 }, { "entropy": 1.3194669818878173, "epoch": 1.754758251444615, "grad_norm": 1.6328125, "learning_rate": 2.43355288454702e-07, "loss": 0.1098, "mean_token_accuracy": 0.973793808221817, "num_tokens": 7584868782.0, "step": 53750 }, { "entropy": 1.3247113370895385, "epoch": 1.7563905847017727, "grad_norm": 1.2265625, "learning_rate": 2.401707300540279e-07, "loss": 0.0981, "mean_token_accuracy": 0.9766256093978882, "num_tokens": 7591651694.0, "step": 53800 }, { "entropy": 1.320894329547882, "epoch": 1.7580229179589306, "grad_norm": 1.8359375, "learning_rate": 2.3700627672320707e-07, "loss": 0.0977, "mean_token_accuracy": 0.9763943207263946, "num_tokens": 7598567378.0, "step": 53850 }, { "entropy": 1.3126529788970946, "epoch": 1.7596552512160883, "grad_norm": 1.375, "learning_rate": 2.338619515149546e-07, "loss": 0.0914, "mean_token_accuracy": 0.9772222137451172, "num_tokens": 7605566065.0, "step": 53900 }, { "entropy": 1.3226768159866333, "epoch": 1.761287584473246, "grad_norm": 1.609375, "learning_rate": 2.307377773353535e-07, "loss": 0.1152, "mean_token_accuracy": 0.9727129638195038, "num_tokens": 7613562049.0, "step": 53950 }, { "entropy": 1.3229237818717956, "epoch": 1.7629199177304038, "grad_norm": 1.3203125, "learning_rate": 2.2763377694368827e-07, "loss": 0.1008, "mean_token_accuracy": 0.975539722442627, "num_tokens": 7620524140.0, "step": 54000 }, { "epoch": 1.7629199177304038, "eval_entropy": 1.3142534939448038, "eval_loss": 0.14254117012023926, "eval_mean_token_accuracy": 0.9676929664611816, "eval_num_tokens": 7620524140.0, "eval_runtime": 752.4204, "eval_samples_per_second": 12.833, "eval_steps_per_second": 0.101, "step": 54000 }, { "entropy": 1.3087509632110597, "epoch": 1.7645522509875615, "grad_norm": 1.171875, "learning_rate": 2.2454997295227985e-07, "loss": 0.101, "mean_token_accuracy": 0.9754433751106262, "num_tokens": 7627617937.0, "step": 54050 }, { "entropy": 1.3296643924713134, "epoch": 1.7661845842447192, "grad_norm": 2.046875, "learning_rate": 2.2148638782631969e-07, "loss": 0.1022, "mean_token_accuracy": 0.9750396251678467, "num_tokens": 7634624283.0, "step": 54100 }, { "entropy": 1.3130810499191283, "epoch": 1.7678169175018772, "grad_norm": 1.2734375, "learning_rate": 2.1844304388370862e-07, "loss": 0.0997, "mean_token_accuracy": 0.9761820614337922, "num_tokens": 7642188094.0, "step": 54150 }, { "entropy": 1.3113509464263915, "epoch": 1.769449250759035, "grad_norm": 1.1796875, "learning_rate": 2.154199632948901e-07, "loss": 0.0925, "mean_token_accuracy": 0.9778699886798858, "num_tokens": 7648970433.0, "step": 54200 }, { "entropy": 1.321754252910614, "epoch": 1.7710815840161929, "grad_norm": 1.09375, "learning_rate": 2.124171680826934e-07, "loss": 0.1044, "mean_token_accuracy": 0.974656708240509, "num_tokens": 7656027769.0, "step": 54250 }, { "entropy": 1.3192690873146058, "epoch": 1.7727139172733506, "grad_norm": 1.578125, "learning_rate": 2.094346801221706e-07, "loss": 0.0962, "mean_token_accuracy": 0.9762521004676818, "num_tokens": 7662871105.0, "step": 54300 }, { "entropy": 1.3231295156478882, "epoch": 1.7743462505305083, "grad_norm": 1.40625, "learning_rate": 2.0647252114043548e-07, "loss": 0.1029, "mean_token_accuracy": 0.975200617313385, "num_tokens": 7670265436.0, "step": 54350 }, { "entropy": 1.3153988409042359, "epoch": 1.775978583787666, "grad_norm": 1.21875, "learning_rate": 2.0353071271651024e-07, "loss": 0.0908, "mean_token_accuracy": 0.9769929325580597, "num_tokens": 7677021371.0, "step": 54400 }, { "entropy": 1.3150772476196289, "epoch": 1.7776109170448238, "grad_norm": 1.046875, "learning_rate": 2.006092762811631e-07, "loss": 0.0999, "mean_token_accuracy": 0.9757753646373749, "num_tokens": 7684035466.0, "step": 54450 }, { "entropy": 1.3111911249160766, "epoch": 1.7792432503019815, "grad_norm": 1.65625, "learning_rate": 1.9770823311675622e-07, "loss": 0.0956, "mean_token_accuracy": 0.9768190658092499, "num_tokens": 7690811924.0, "step": 54500 }, { "epoch": 1.7792432503019815, "eval_entropy": 1.3134044408798218, "eval_loss": 0.14260686933994293, "eval_mean_token_accuracy": 0.9676902786890665, "eval_num_tokens": 7690811924.0, "eval_runtime": 751.7328, "eval_samples_per_second": 12.845, "eval_steps_per_second": 0.101, "step": 54500 }, { "entropy": 1.3163812851905823, "epoch": 1.7808755835591394, "grad_norm": 1.2578125, "learning_rate": 1.948276043570867e-07, "loss": 0.0971, "mean_token_accuracy": 0.9768479645252228, "num_tokens": 7697685594.0, "step": 54550 }, { "entropy": 1.3316512179374695, "epoch": 1.7825079168162972, "grad_norm": 1.1484375, "learning_rate": 1.9196741098723714e-07, "loss": 0.1016, "mean_token_accuracy": 0.9754971957206726, "num_tokens": 7704672835.0, "step": 54600 }, { "entropy": 1.3212800335884094, "epoch": 1.7841402500734551, "grad_norm": 1.296875, "learning_rate": 1.8912767384341967e-07, "loss": 0.0971, "mean_token_accuracy": 0.9769463586807251, "num_tokens": 7712059362.0, "step": 54650 }, { "entropy": 1.3257331156730652, "epoch": 1.7857725833306128, "grad_norm": 1.4140625, "learning_rate": 1.863084136128239e-07, "loss": 0.0931, "mean_token_accuracy": 0.9771769893169403, "num_tokens": 7719025676.0, "step": 54700 }, { "entropy": 1.3107067704200746, "epoch": 1.7874049165877706, "grad_norm": 1.375, "learning_rate": 1.8350965083346883e-07, "loss": 0.0811, "mean_token_accuracy": 0.9800481641292572, "num_tokens": 7725468150.0, "step": 54750 }, { "entropy": 1.3124605083465577, "epoch": 1.7890372498449283, "grad_norm": 2.0, "learning_rate": 1.807314058940498e-07, "loss": 0.0905, "mean_token_accuracy": 0.9778806746006012, "num_tokens": 7732699148.0, "step": 54800 }, { "entropy": 1.3118835282325745, "epoch": 1.790669583102086, "grad_norm": 1.28125, "learning_rate": 1.7797369903379447e-07, "loss": 0.0885, "mean_token_accuracy": 0.9785989081859588, "num_tokens": 7739580787.0, "step": 54850 }, { "entropy": 1.3218691396713256, "epoch": 1.7923019163592437, "grad_norm": 1.328125, "learning_rate": 1.7523655034230913e-07, "loss": 0.1019, "mean_token_accuracy": 0.9758154857158661, "num_tokens": 7746871274.0, "step": 54900 }, { "entropy": 1.3150616145133973, "epoch": 1.7939342496164017, "grad_norm": 1.2265625, "learning_rate": 1.7251997975944023e-07, "loss": 0.0885, "mean_token_accuracy": 0.9779963111877441, "num_tokens": 7754114830.0, "step": 54950 }, { "entropy": 1.3193181252479553, "epoch": 1.7955665828735594, "grad_norm": 1.6796875, "learning_rate": 1.698240070751208e-07, "loss": 0.0954, "mean_token_accuracy": 0.9768703639507293, "num_tokens": 7761534891.0, "step": 55000 }, { "epoch": 1.7955665828735594, "eval_entropy": 1.3136341873804729, "eval_loss": 0.14263209700584412, "eval_mean_token_accuracy": 0.9676380705833435, "eval_num_tokens": 7761534891.0, "eval_runtime": 751.8782, "eval_samples_per_second": 12.843, "eval_steps_per_second": 0.101, "step": 55000 }, { "entropy": 1.3227564001083374, "epoch": 1.7971989161307174, "grad_norm": 1.359375, "learning_rate": 1.6714865192923357e-07, "loss": 0.0931, "mean_token_accuracy": 0.9776843535900116, "num_tokens": 7768739013.0, "step": 55050 }, { "entropy": 1.3159151887893676, "epoch": 1.798831249387875, "grad_norm": 1.75, "learning_rate": 1.644939338114617e-07, "loss": 0.0891, "mean_token_accuracy": 0.9785628998279572, "num_tokens": 7775714510.0, "step": 55100 }, { "entropy": 1.3172850131988525, "epoch": 1.8004635826450328, "grad_norm": 1.5, "learning_rate": 1.618598720611517e-07, "loss": 0.0929, "mean_token_accuracy": 0.9780389821529388, "num_tokens": 7782939417.0, "step": 55150 }, { "entropy": 1.3124612760543823, "epoch": 1.8020959159021905, "grad_norm": 1.453125, "learning_rate": 1.5924648586717106e-07, "loss": 0.0903, "mean_token_accuracy": 0.9784395337104798, "num_tokens": 7790076964.0, "step": 55200 }, { "entropy": 1.3138836860656737, "epoch": 1.8037282491593483, "grad_norm": 1.4140625, "learning_rate": 1.566537942677657e-07, "loss": 0.0906, "mean_token_accuracy": 0.9779064965248108, "num_tokens": 7796643475.0, "step": 55250 }, { "entropy": 1.3136559057235717, "epoch": 1.805360582416506, "grad_norm": 0.00165557861328125, "learning_rate": 1.5408181615042594e-07, "loss": 0.0843, "mean_token_accuracy": 0.9791945159435272, "num_tokens": 7803352885.0, "step": 55300 }, { "entropy": 1.3052284288406373, "epoch": 1.806992915673664, "grad_norm": 1.265625, "learning_rate": 1.5153057025174432e-07, "loss": 0.09, "mean_token_accuracy": 0.9779746580123901, "num_tokens": 7810594549.0, "step": 55350 }, { "entropy": 1.3102794551849366, "epoch": 1.8086252489308219, "grad_norm": 1.2265625, "learning_rate": 1.4900007515728365e-07, "loss": 0.0844, "mean_token_accuracy": 0.9790527272224426, "num_tokens": 7817507513.0, "step": 55400 }, { "entropy": 1.324124116897583, "epoch": 1.8102575821879796, "grad_norm": 1.1953125, "learning_rate": 1.4649034930143722e-07, "loss": 0.094, "mean_token_accuracy": 0.9774300479888915, "num_tokens": 7824764989.0, "step": 55450 }, { "entropy": 1.3147008061408996, "epoch": 1.8118899154451373, "grad_norm": 1.9765625, "learning_rate": 1.440014109672978e-07, "loss": 0.0914, "mean_token_accuracy": 0.9776774108409881, "num_tokens": 7831265405.0, "step": 55500 }, { "epoch": 1.8118899154451373, "eval_entropy": 1.3136727301279705, "eval_loss": 0.1426331102848053, "eval_mean_token_accuracy": 0.9676420497894287, "eval_num_tokens": 7831265405.0, "eval_runtime": 749.6902, "eval_samples_per_second": 12.88, "eval_steps_per_second": 0.101, "step": 55500 }, { "entropy": 1.3262977242469787, "epoch": 1.813522248702295, "grad_norm": 1.6875, "learning_rate": 1.415332782865235e-07, "loss": 0.0996, "mean_token_accuracy": 0.976102020740509, "num_tokens": 7838532553.0, "step": 55550 }, { "entropy": 1.314659128189087, "epoch": 1.8151545819594528, "grad_norm": 0.00151824951171875, "learning_rate": 1.3908596923920348e-07, "loss": 0.0876, "mean_token_accuracy": 0.9786652231216431, "num_tokens": 7845416264.0, "step": 55600 }, { "entropy": 1.2995626902580262, "epoch": 1.8167869152166105, "grad_norm": 1.203125, "learning_rate": 1.3665950165373177e-07, "loss": 0.078, "mean_token_accuracy": 0.9805066454410553, "num_tokens": 7852143397.0, "step": 55650 }, { "entropy": 1.3230929231643678, "epoch": 1.8184192484737685, "grad_norm": 1.21875, "learning_rate": 1.3425389320667126e-07, "loss": 0.0948, "mean_token_accuracy": 0.9774747908115387, "num_tokens": 7859183732.0, "step": 55700 }, { "entropy": 1.3118236589431762, "epoch": 1.8200515817309262, "grad_norm": 1.078125, "learning_rate": 1.3186916142263138e-07, "loss": 0.0852, "mean_token_accuracy": 0.9793214762210846, "num_tokens": 7865800782.0, "step": 55750 }, { "entropy": 1.3139012956619263, "epoch": 1.8216839149880841, "grad_norm": 0.921875, "learning_rate": 1.295053236741346e-07, "loss": 0.0858, "mean_token_accuracy": 0.9790966403484345, "num_tokens": 7873054112.0, "step": 55800 }, { "entropy": 1.3215832591056824, "epoch": 1.8233162482452419, "grad_norm": 1.5625, "learning_rate": 1.2716239718149404e-07, "loss": 0.0915, "mean_token_accuracy": 0.9782492446899415, "num_tokens": 7880572440.0, "step": 55850 }, { "entropy": 1.3250614070892335, "epoch": 1.8249485815023996, "grad_norm": 1.421875, "learning_rate": 1.248403990126864e-07, "loss": 0.0836, "mean_token_accuracy": 0.9802043890953064, "num_tokens": 7887372576.0, "step": 55900 }, { "entropy": 1.3125244760513306, "epoch": 1.8265809147595573, "grad_norm": 1.6953125, "learning_rate": 1.2253934608322704e-07, "loss": 0.0877, "mean_token_accuracy": 0.9786297881603241, "num_tokens": 7894499299.0, "step": 55950 }, { "entropy": 1.310175290107727, "epoch": 1.828213248016715, "grad_norm": 1.2578125, "learning_rate": 1.2025925515604797e-07, "loss": 0.0863, "mean_token_accuracy": 0.9788778626918793, "num_tokens": 7901474987.0, "step": 56000 }, { "epoch": 1.828213248016715, "eval_entropy": 1.3131318473815918, "eval_loss": 0.14263643324375153, "eval_mean_token_accuracy": 0.9676548767089844, "eval_num_tokens": 7901474987.0, "eval_runtime": 752.1988, "eval_samples_per_second": 12.837, "eval_steps_per_second": 0.101, "step": 56000 }, { "entropy": 1.307514934539795, "epoch": 1.8298455812738728, "grad_norm": 1.0546875, "learning_rate": 1.1800014284137439e-07, "loss": 0.0798, "mean_token_accuracy": 0.9802592170238494, "num_tokens": 7908470639.0, "step": 56050 }, { "entropy": 1.3173511362075805, "epoch": 1.8314779145310307, "grad_norm": 0.287109375, "learning_rate": 1.157620255966061e-07, "loss": 0.0857, "mean_token_accuracy": 0.9793569481372834, "num_tokens": 7915474847.0, "step": 56100 }, { "entropy": 1.326115915775299, "epoch": 1.8331102477881884, "grad_norm": 1.40625, "learning_rate": 1.1354491972619418e-07, "loss": 0.0882, "mean_token_accuracy": 0.9790770506858826, "num_tokens": 7922729639.0, "step": 56150 }, { "entropy": 1.3074156522750855, "epoch": 1.8347425810453464, "grad_norm": 1.125, "learning_rate": 1.1134884138152556e-07, "loss": 0.0747, "mean_token_accuracy": 0.9815619790554047, "num_tokens": 7929688118.0, "step": 56200 }, { "entropy": 1.3235062193870544, "epoch": 1.836374914302504, "grad_norm": 1.2578125, "learning_rate": 1.0917380656080234e-07, "loss": 0.083, "mean_token_accuracy": 0.9802336692810059, "num_tokens": 7936632242.0, "step": 56250 }, { "entropy": 1.320460605621338, "epoch": 1.8380072475596618, "grad_norm": 1.453125, "learning_rate": 1.0701983110892821e-07, "loss": 0.0871, "mean_token_accuracy": 0.9788316774368286, "num_tokens": 7944029771.0, "step": 56300 }, { "entropy": 1.3198864316940309, "epoch": 1.8396395808168196, "grad_norm": 1.671875, "learning_rate": 1.0488693071738998e-07, "loss": 0.0914, "mean_token_accuracy": 0.9782080149650574, "num_tokens": 7951324936.0, "step": 56350 }, { "entropy": 1.3099779963493348, "epoch": 1.8412719140739773, "grad_norm": 1.5625, "learning_rate": 1.0277512092414621e-07, "loss": 0.0808, "mean_token_accuracy": 0.9805028080940247, "num_tokens": 7958613468.0, "step": 56400 }, { "entropy": 1.3207287693023682, "epoch": 1.842904247331135, "grad_norm": 2.03125, "learning_rate": 1.0068441711351239e-07, "loss": 0.0848, "mean_token_accuracy": 0.9797763514518738, "num_tokens": 7965649951.0, "step": 56450 }, { "entropy": 1.3234837770462036, "epoch": 1.844536580588293, "grad_norm": 1.7265625, "learning_rate": 9.861483451604803e-08, "loss": 0.0848, "mean_token_accuracy": 0.9798565399646759, "num_tokens": 7972824474.0, "step": 56500 }, { "epoch": 1.844536580588293, "eval_entropy": 1.3131659841537475, "eval_loss": 0.142597496509552, "eval_mean_token_accuracy": 0.9676955684026083, "eval_num_tokens": 7972824474.0, "eval_runtime": 754.1624, "eval_samples_per_second": 12.804, "eval_steps_per_second": 0.101, "step": 56500 }, { "entropy": 1.3255647730827331, "epoch": 1.8461689138454507, "grad_norm": 1.4609375, "learning_rate": 9.656638820844832e-08, "loss": 0.0797, "mean_token_accuracy": 0.9803315377235413, "num_tokens": 7979650165.0, "step": 56550 }, { "entropy": 1.3177052664756774, "epoch": 1.8478012471026086, "grad_norm": 1.203125, "learning_rate": 9.453909311343168e-08, "loss": 0.082, "mean_token_accuracy": 0.9800474560260772, "num_tokens": 7986279045.0, "step": 56600 }, { "entropy": 1.3207802820205687, "epoch": 1.8494335803597663, "grad_norm": 1.4765625, "learning_rate": 9.253296399963306e-08, "loss": 0.0844, "mean_token_accuracy": 0.9793656885623931, "num_tokens": 7993389141.0, "step": 56650 }, { "entropy": 1.313080358505249, "epoch": 1.851065913616924, "grad_norm": 1.015625, "learning_rate": 9.054801548149383e-08, "loss": 0.0835, "mean_token_accuracy": 0.9803290486335754, "num_tokens": 8000349967.0, "step": 56700 }, { "entropy": 1.316315357685089, "epoch": 1.8526982468740818, "grad_norm": 1.8125, "learning_rate": 8.85842620191587e-08, "loss": 0.0823, "mean_token_accuracy": 0.9800036442279816, "num_tokens": 8007394269.0, "step": 56750 }, { "entropy": 1.323743233680725, "epoch": 1.8543305801312395, "grad_norm": 1.5390625, "learning_rate": 8.664171791836828e-08, "loss": 0.0783, "mean_token_accuracy": 0.9807388544082641, "num_tokens": 8014348208.0, "step": 56800 }, { "entropy": 1.3166941332817077, "epoch": 1.8559629133883973, "grad_norm": 1.9296875, "learning_rate": 8.472039733035375e-08, "loss": 0.0887, "mean_token_accuracy": 0.9791383862495422, "num_tokens": 8021975439.0, "step": 56850 }, { "entropy": 1.3115915489196777, "epoch": 1.8575952466455552, "grad_norm": 1.265625, "learning_rate": 8.282031425173697e-08, "loss": 0.0742, "mean_token_accuracy": 0.9814891684055328, "num_tokens": 8028607289.0, "step": 56900 }, { "entropy": 1.3096832203865052, "epoch": 1.859227579902713, "grad_norm": 1.609375, "learning_rate": 8.094148252442557e-08, "loss": 0.0806, "mean_token_accuracy": 0.9808643221855163, "num_tokens": 8035350719.0, "step": 56950 }, { "entropy": 1.312803740501404, "epoch": 1.8608599131598709, "grad_norm": 0.953125, "learning_rate": 7.908391583551399e-08, "loss": 0.08, "mean_token_accuracy": 0.9804242384433747, "num_tokens": 8042661144.0, "step": 57000 }, { "epoch": 1.8608599131598709, "eval_entropy": 1.3134743928909303, "eval_loss": 0.14260423183441162, "eval_mean_token_accuracy": 0.9677057147026062, "eval_num_tokens": 8042661144.0, "eval_runtime": 752.8525, "eval_samples_per_second": 12.826, "eval_steps_per_second": 0.101, "step": 57000 }, { "entropy": 1.3151041221618653, "epoch": 1.8624922464170286, "grad_norm": 1.6875, "learning_rate": 7.724762771718264e-08, "loss": 0.0703, "mean_token_accuracy": 0.9828312218189239, "num_tokens": 8049061325.0, "step": 57050 }, { "entropy": 1.3201626539230347, "epoch": 1.8641245796741863, "grad_norm": 1.3203125, "learning_rate": 7.543263154660018e-08, "loss": 0.0794, "mean_token_accuracy": 0.9805750000476837, "num_tokens": 8055861953.0, "step": 57100 }, { "entropy": 1.3100008058547974, "epoch": 1.865756912931344, "grad_norm": 1.1328125, "learning_rate": 7.363894054582543e-08, "loss": 0.0796, "mean_token_accuracy": 0.9810529005527496, "num_tokens": 8062705627.0, "step": 57150 }, { "entropy": 1.3101088619232177, "epoch": 1.8673892461885018, "grad_norm": 1.3671875, "learning_rate": 7.186656778171064e-08, "loss": 0.0848, "mean_token_accuracy": 0.9795117557048798, "num_tokens": 8069712418.0, "step": 57200 }, { "entropy": 1.3087351298332215, "epoch": 1.8690215794456595, "grad_norm": 1.3828125, "learning_rate": 7.011552616580763e-08, "loss": 0.0784, "mean_token_accuracy": 0.9811785018444061, "num_tokens": 8076520985.0, "step": 57250 }, { "entropy": 1.314774408340454, "epoch": 1.8706539127028174, "grad_norm": 1.265625, "learning_rate": 6.838582845427322e-08, "loss": 0.0829, "mean_token_accuracy": 0.9801759088039398, "num_tokens": 8083425224.0, "step": 57300 }, { "entropy": 1.3010089206695556, "epoch": 1.8722862459599752, "grad_norm": 1.140625, "learning_rate": 6.667748724777589e-08, "loss": 0.0837, "mean_token_accuracy": 0.9797486662864685, "num_tokens": 8090860003.0, "step": 57350 }, { "entropy": 1.3112692332267761, "epoch": 1.8739185792171331, "grad_norm": 1.890625, "learning_rate": 6.499051499140363e-08, "loss": 0.0872, "mean_token_accuracy": 0.9786810195446014, "num_tokens": 8097627069.0, "step": 57400 }, { "entropy": 1.3242556214332581, "epoch": 1.8755509124742908, "grad_norm": 1.234375, "learning_rate": 6.332492397457457e-08, "loss": 0.1031, "mean_token_accuracy": 0.9754553985595703, "num_tokens": 8104939412.0, "step": 57450 }, { "entropy": 1.3048901081085205, "epoch": 1.8771832457314486, "grad_norm": 1.1875, "learning_rate": 6.168072633094578e-08, "loss": 0.0942, "mean_token_accuracy": 0.9768707299232483, "num_tokens": 8111717916.0, "step": 57500 }, { "epoch": 1.8771832457314486, "eval_entropy": 1.3135956350962321, "eval_loss": 0.14263209700584412, "eval_mean_token_accuracy": 0.9676501870155334, "eval_num_tokens": 8111717916.0, "eval_runtime": 751.2808, "eval_samples_per_second": 12.853, "eval_steps_per_second": 0.101, "step": 57500 }, { "entropy": 1.323448977470398, "epoch": 1.8788155789886063, "grad_norm": 1.2421875, "learning_rate": 6.00579340383277e-08, "loss": 0.1047, "mean_token_accuracy": 0.9757449758052826, "num_tokens": 8118451505.0, "step": 57550 }, { "entropy": 1.3023139286041259, "epoch": 1.880447912245764, "grad_norm": 1.046875, "learning_rate": 5.845655891859247e-08, "loss": 0.097, "mean_token_accuracy": 0.9766959726810456, "num_tokens": 8125132870.0, "step": 57600 }, { "entropy": 1.3242397212982178, "epoch": 1.8820802455029217, "grad_norm": 2.328125, "learning_rate": 5.68766126375927e-08, "loss": 0.1246, "mean_token_accuracy": 0.9708089303970336, "num_tokens": 8131883790.0, "step": 57650 }, { "entropy": 1.3141312503814697, "epoch": 1.8837125787600797, "grad_norm": 1.53125, "learning_rate": 5.5318106705072535e-08, "loss": 0.1395, "mean_token_accuracy": 0.9681181204319, "num_tokens": 8138880080.0, "step": 57700 }, { "entropy": 1.3103543734550476, "epoch": 1.8853449120172374, "grad_norm": 1.6953125, "learning_rate": 5.378105247458609e-08, "loss": 0.1249, "mean_token_accuracy": 0.9709859442710876, "num_tokens": 8145787855.0, "step": 57750 }, { "entropy": 1.3131978607177734, "epoch": 1.8869772452743954, "grad_norm": 1.953125, "learning_rate": 5.226546114341413e-08, "loss": 0.1242, "mean_token_accuracy": 0.971501134634018, "num_tokens": 8152787087.0, "step": 57800 }, { "entropy": 1.3190133213996886, "epoch": 1.888609578531553, "grad_norm": 1.640625, "learning_rate": 5.077134375248183e-08, "loss": 0.1295, "mean_token_accuracy": 0.9703145730495453, "num_tokens": 8159391177.0, "step": 57850 }, { "entropy": 1.306349711418152, "epoch": 1.8902419117887108, "grad_norm": 1.2109375, "learning_rate": 4.9298711186279824e-08, "loss": 0.1169, "mean_token_accuracy": 0.9731926989555358, "num_tokens": 8166052317.0, "step": 57900 }, { "entropy": 1.312576003074646, "epoch": 1.8918742450458685, "grad_norm": 1.796875, "learning_rate": 4.784757417278296e-08, "loss": 0.124, "mean_token_accuracy": 0.9711933457851409, "num_tokens": 8173047293.0, "step": 57950 }, { "entropy": 1.316941294670105, "epoch": 1.8935065783030263, "grad_norm": 1.4375, "learning_rate": 4.641794328337434e-08, "loss": 0.1289, "mean_token_accuracy": 0.9705963468551636, "num_tokens": 8180073089.0, "step": 58000 }, { "epoch": 1.8935065783030263, "eval_entropy": 1.3130744123458862, "eval_loss": 0.142622709274292, "eval_mean_token_accuracy": 0.9676458064715068, "eval_num_tokens": 8180073089.0, "eval_runtime": 754.6584, "eval_samples_per_second": 12.795, "eval_steps_per_second": 0.101, "step": 58000 }, { "entropy": 1.3161793375015258, "epoch": 1.895138911560184, "grad_norm": 1.8515625, "learning_rate": 4.5009828932766395e-08, "loss": 0.1247, "mean_token_accuracy": 0.9718296456336976, "num_tokens": 8187129264.0, "step": 58050 }, { "entropy": 1.3136934351921081, "epoch": 1.896771244817342, "grad_norm": 1.53125, "learning_rate": 4.362324137892626e-08, "loss": 0.1221, "mean_token_accuracy": 0.971994297504425, "num_tokens": 8194219348.0, "step": 58100 }, { "entropy": 1.3273291397094726, "epoch": 1.8984035780744997, "grad_norm": 2.234375, "learning_rate": 4.225819072300019e-08, "loss": 0.1321, "mean_token_accuracy": 0.9701541066169739, "num_tokens": 8201245704.0, "step": 58150 }, { "entropy": 1.3125488138198853, "epoch": 1.9000359113316576, "grad_norm": 1.3984375, "learning_rate": 4.091468690924061e-08, "loss": 0.1336, "mean_token_accuracy": 0.9690061569213867, "num_tokens": 8208779023.0, "step": 58200 }, { "entropy": 1.3206189322471618, "epoch": 1.9016682445888153, "grad_norm": 2.171875, "learning_rate": 3.9592739724933494e-08, "loss": 0.1326, "mean_token_accuracy": 0.9700911176204682, "num_tokens": 8215688508.0, "step": 58250 }, { "entropy": 1.3118561697006226, "epoch": 1.903300577845973, "grad_norm": 2.328125, "learning_rate": 3.8292358800326774e-08, "loss": 0.1341, "mean_token_accuracy": 0.9699221074581146, "num_tokens": 8222957220.0, "step": 58300 }, { "entropy": 1.3096631002426147, "epoch": 1.9049329111031308, "grad_norm": 1.328125, "learning_rate": 3.70135536085604e-08, "loss": 0.1289, "mean_token_accuracy": 0.970008145570755, "num_tokens": 8230419197.0, "step": 58350 }, { "entropy": 1.3086332321166991, "epoch": 1.9065652443602885, "grad_norm": 1.9453125, "learning_rate": 3.57563334655977e-08, "loss": 0.1196, "mean_token_accuracy": 0.9730131483078003, "num_tokens": 8237250702.0, "step": 58400 }, { "entropy": 1.31609708070755, "epoch": 1.9081975776174462, "grad_norm": 1.5546875, "learning_rate": 3.4520707530157125e-08, "loss": 0.1276, "mean_token_accuracy": 0.9696814298629761, "num_tokens": 8244072641.0, "step": 58450 }, { "entropy": 1.3143381929397584, "epoch": 1.9098299108746042, "grad_norm": 1.875, "learning_rate": 3.330668480364496e-08, "loss": 0.1288, "mean_token_accuracy": 0.9699802708625793, "num_tokens": 8251417973.0, "step": 58500 }, { "epoch": 1.9098299108746042, "eval_entropy": 1.313671735127767, "eval_loss": 0.14263801276683807, "eval_mean_token_accuracy": 0.9676080171267192, "eval_num_tokens": 8251417973.0, "eval_runtime": 754.9973, "eval_samples_per_second": 12.789, "eval_steps_per_second": 0.101, "step": 58500 }, { "entropy": 1.3198811602592468, "epoch": 1.911462244131762, "grad_norm": 1.140625, "learning_rate": 3.2114274130091383e-08, "loss": 0.124, "mean_token_accuracy": 0.9716233015060425, "num_tokens": 8258494800.0, "step": 58550 }, { "entropy": 1.3157025051116944, "epoch": 1.9130945773889199, "grad_norm": 1.4765625, "learning_rate": 3.0943484196083836e-08, "loss": 0.1216, "mean_token_accuracy": 0.9713974285125733, "num_tokens": 8265439804.0, "step": 58600 }, { "entropy": 1.313891351222992, "epoch": 1.9147269106460776, "grad_norm": 2.171875, "learning_rate": 2.979432353070577e-08, "loss": 0.1213, "mean_token_accuracy": 0.9722525453567505, "num_tokens": 8272196336.0, "step": 58650 }, { "entropy": 1.3181121969223022, "epoch": 1.9163592439032353, "grad_norm": 1.5625, "learning_rate": 2.8666800505473655e-08, "loss": 0.1292, "mean_token_accuracy": 0.9704732525348664, "num_tokens": 8279415276.0, "step": 58700 }, { "entropy": 1.312175862789154, "epoch": 1.917991577160393, "grad_norm": 2.140625, "learning_rate": 2.75609233342754e-08, "loss": 0.124, "mean_token_accuracy": 0.9711616253852844, "num_tokens": 8286266573.0, "step": 58750 }, { "entropy": 1.3171151232719422, "epoch": 1.9196239104175508, "grad_norm": 2.40625, "learning_rate": 2.6476700073311376e-08, "loss": 0.1278, "mean_token_accuracy": 0.9706109237670898, "num_tokens": 8293368650.0, "step": 58800 }, { "entropy": 1.3135269594192505, "epoch": 1.9212562436747085, "grad_norm": 1.46875, "learning_rate": 2.5414138621035477e-08, "loss": 0.1273, "mean_token_accuracy": 0.9706173431873322, "num_tokens": 8300206080.0, "step": 58850 }, { "entropy": 1.3087952733039856, "epoch": 1.9228885769318664, "grad_norm": 1.609375, "learning_rate": 2.437324671809782e-08, "loss": 0.1256, "mean_token_accuracy": 0.9707586574554443, "num_tokens": 8307284608.0, "step": 58900 }, { "entropy": 1.310805425643921, "epoch": 1.9245209101890242, "grad_norm": 1.5, "learning_rate": 2.3354031947288136e-08, "loss": 0.1192, "mean_token_accuracy": 0.9720681381225585, "num_tokens": 8314051370.0, "step": 58950 }, { "entropy": 1.315222783088684, "epoch": 1.926153243446182, "grad_norm": 1.921875, "learning_rate": 2.2356501733479806e-08, "loss": 0.1181, "mean_token_accuracy": 0.9725264024734497, "num_tokens": 8320964631.0, "step": 59000 }, { "epoch": 1.926153243446182, "eval_entropy": 1.313348093032837, "eval_loss": 0.14262661337852478, "eval_mean_token_accuracy": 0.9675622606277465, "eval_num_tokens": 8320964631.0, "eval_runtime": 750.7844, "eval_samples_per_second": 12.861, "eval_steps_per_second": 0.101, "step": 59000 }, { "entropy": 1.3111545324325562, "epoch": 1.9277855767033398, "grad_norm": 1.984375, "learning_rate": 2.1380663343577246e-08, "loss": 0.1199, "mean_token_accuracy": 0.9722956836223602, "num_tokens": 8327540250.0, "step": 59050 }, { "entropy": 1.3120374631881715, "epoch": 1.9294179099604976, "grad_norm": 1.390625, "learning_rate": 2.04265238864616e-08, "loss": 0.1307, "mean_token_accuracy": 0.9701423704624176, "num_tokens": 8334721588.0, "step": 59100 }, { "entropy": 1.3150414967536925, "epoch": 1.9310502432176553, "grad_norm": 1.2421875, "learning_rate": 1.949409031294014e-08, "loss": 0.1274, "mean_token_accuracy": 0.9704896664619446, "num_tokens": 8341662054.0, "step": 59150 }, { "entropy": 1.3131565976142883, "epoch": 1.932682576474813, "grad_norm": 2.265625, "learning_rate": 1.8583369415694608e-08, "loss": 0.1277, "mean_token_accuracy": 0.9700733041763305, "num_tokens": 8349277940.0, "step": 59200 }, { "entropy": 1.3129038047790527, "epoch": 1.9343149097319707, "grad_norm": 2.0625, "learning_rate": 1.769436782923195e-08, "loss": 0.137, "mean_token_accuracy": 0.9688089847564697, "num_tokens": 8356789493.0, "step": 59250 }, { "entropy": 1.3096733212471008, "epoch": 1.9359472429891287, "grad_norm": 2.578125, "learning_rate": 1.6827092029836678e-08, "loss": 0.1235, "mean_token_accuracy": 0.9711651515960693, "num_tokens": 8363715543.0, "step": 59300 }, { "entropy": 1.3143090605735779, "epoch": 1.9375795762462864, "grad_norm": 2.1875, "learning_rate": 1.59815483355229e-08, "loss": 0.1225, "mean_token_accuracy": 0.9719608640670776, "num_tokens": 8370411365.0, "step": 59350 }, { "entropy": 1.3088870167732238, "epoch": 1.9392119095034444, "grad_norm": 2.453125, "learning_rate": 1.5157742905989037e-08, "loss": 0.1246, "mean_token_accuracy": 0.9717032659053803, "num_tokens": 8377216822.0, "step": 59400 }, { "entropy": 1.318067877292633, "epoch": 1.940844242760602, "grad_norm": 0.578125, "learning_rate": 1.4355681742571847e-08, "loss": 0.1247, "mean_token_accuracy": 0.9717764687538147, "num_tokens": 8384607784.0, "step": 59450 }, { "entropy": 1.3116914916038513, "epoch": 1.9424765760177598, "grad_norm": 2.28125, "learning_rate": 1.357537068820347e-08, "loss": 0.1276, "mean_token_accuracy": 0.970702486038208, "num_tokens": 8391872892.0, "step": 59500 }, { "epoch": 1.9424765760177598, "eval_entropy": 1.313501017888387, "eval_loss": 0.1426248997449875, "eval_mean_token_accuracy": 0.9675657065709432, "eval_num_tokens": 8391872892.0, "eval_runtime": 746.0898, "eval_samples_per_second": 12.942, "eval_steps_per_second": 0.102, "step": 59500 }, { "entropy": 1.310592589378357, "epoch": 1.9441089092749175, "grad_norm": 1.6953125, "learning_rate": 1.2816815427369455e-08, "loss": 0.1318, "mean_token_accuracy": 0.969595000743866, "num_tokens": 8399268788.0, "step": 59550 }, { "entropy": 1.3104493141174316, "epoch": 1.9457412425320753, "grad_norm": 1.6640625, "learning_rate": 1.208002148606613e-08, "loss": 0.114, "mean_token_accuracy": 0.9738408529758453, "num_tokens": 8405775548.0, "step": 59600 }, { "entropy": 1.3134746599197387, "epoch": 1.947373575789233, "grad_norm": 1.2734375, "learning_rate": 1.1364994231760295e-08, "loss": 0.129, "mean_token_accuracy": 0.9705848026275635, "num_tokens": 8413019964.0, "step": 59650 }, { "entropy": 1.3184413361549376, "epoch": 1.949005909046391, "grad_norm": 1.84375, "learning_rate": 1.0671738873351932e-08, "loss": 0.1169, "mean_token_accuracy": 0.9727689456939698, "num_tokens": 8420004432.0, "step": 59700 }, { "entropy": 1.317722589969635, "epoch": 1.9506382423035487, "grad_norm": 1.390625, "learning_rate": 1.0000260461134225e-08, "loss": 0.1233, "mean_token_accuracy": 0.9717906093597413, "num_tokens": 8426576444.0, "step": 59750 }, { "entropy": 1.315042221546173, "epoch": 1.9522705755607066, "grad_norm": 2.625, "learning_rate": 9.35056388675759e-09, "loss": 0.1258, "mean_token_accuracy": 0.9710904705524445, "num_tokens": 8433326272.0, "step": 59800 }, { "entropy": 1.3289856863021852, "epoch": 1.9539029088178643, "grad_norm": 1.921875, "learning_rate": 8.722653883194375e-09, "loss": 0.1389, "mean_token_accuracy": 0.9682004976272583, "num_tokens": 8440652691.0, "step": 59850 }, { "entropy": 1.3113100409507752, "epoch": 1.955535242075022, "grad_norm": 1.9921875, "learning_rate": 8.116535024703554e-09, "loss": 0.1263, "mean_token_accuracy": 0.9709288036823273, "num_tokens": 8447487171.0, "step": 59900 }, { "entropy": 1.3235667037963867, "epoch": 1.9571675753321798, "grad_norm": 2.109375, "learning_rate": 7.53221172679841e-09, "loss": 0.1217, "mean_token_accuracy": 0.972466766834259, "num_tokens": 8454238666.0, "step": 59950 }, { "entropy": 1.3157719588279724, "epoch": 1.9587999085893375, "grad_norm": 1.65625, "learning_rate": 6.969688246213246e-09, "loss": 0.1267, "mean_token_accuracy": 0.971341325044632, "num_tokens": 8461254971.0, "step": 60000 }, { "epoch": 1.9587999085893375, "eval_entropy": 1.3126351674397787, "eval_loss": 0.14264898002147675, "eval_mean_token_accuracy": 0.9676442178090413, "eval_num_tokens": 8461254971.0, "eval_runtime": 749.6809, "eval_samples_per_second": 12.88, "eval_steps_per_second": 0.101, "step": 60000 } ], "logging_steps": 50, "max_steps": 61262, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.0297347722946334e+20, "train_batch_size": 8, "trial_name": null, "trial_params": null }