diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,12775 @@ +{ + "best_global_step": 6200, + "best_metric": 0.3402261435985565, + "best_model_checkpoint": "./sft_model/checkpoint-6200", + "epoch": 0.9897828863346104, + "eval_steps": 200, + "global_step": 6200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 0.2782653570175171, + "epoch": 0.0007982120051085569, + "grad_norm": 11.474634170532227, + "learning_rate": 4.999999962269939e-06, + "loss": 1.5842, + "mean_token_accuracy": 0.7387546181678772, + "num_tokens": 40960.0, + "step": 5 + }, + { + "entropy": 0.6856095910072326, + "epoch": 0.0015964240102171138, + "grad_norm": 17.486251831054688, + "learning_rate": 4.99999980899157e-06, + "loss": 1.2661, + "mean_token_accuracy": 0.7310832023620606, + "num_tokens": 81920.0, + "step": 10 + }, + { + "entropy": 0.5054598093032837, + "epoch": 0.0023946360153256703, + "grad_norm": 4.929881572723389, + "learning_rate": 4.999999537806773e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7575001239776611, + "num_tokens": 122880.0, + "step": 15 + }, + { + "entropy": 0.5710949540138245, + "epoch": 0.0031928480204342275, + "grad_norm": 2.9681556224823, + "learning_rate": 4.999999148715565e-06, + "loss": 0.7784, + "mean_token_accuracy": 0.7802138924598694, + "num_tokens": 163840.0, + "step": 20 + }, + { + "entropy": 0.6400085330009461, + "epoch": 0.003991060025542784, + "grad_norm": 2.278716564178467, + "learning_rate": 4.999998641717971e-06, + "loss": 0.6463, + "mean_token_accuracy": 0.8072834253311157, + "num_tokens": 204800.0, + "step": 25 + }, + { + "entropy": 0.5254894733428955, + "epoch": 0.004789272030651341, + "grad_norm": 2.6655399799346924, + "learning_rate": 4.999998016814023e-06, + "loss": 0.631, + "mean_token_accuracy": 0.8123680472373962, + "num_tokens": 245760.0, + "step": 30 + }, + { + "entropy": 0.6046146094799042, + "epoch": 0.005587484035759898, + "grad_norm": 2.30499005317688, + "learning_rate": 4.99999727400376e-06, + "loss": 0.6116, + "mean_token_accuracy": 0.8119417071342468, + "num_tokens": 286720.0, + "step": 35 + }, + { + "entropy": 0.532450532913208, + "epoch": 0.006385696040868455, + "grad_norm": 1.6848894357681274, + "learning_rate": 4.999996413287229e-06, + "loss": 0.5343, + "mean_token_accuracy": 0.834265160560608, + "num_tokens": 327680.0, + "step": 40 + }, + { + "entropy": 0.5985349774360657, + "epoch": 0.007183908045977011, + "grad_norm": 2.2001168727874756, + "learning_rate": 4.999995434664483e-06, + "loss": 0.605, + "mean_token_accuracy": 0.8122081875801086, + "num_tokens": 368640.0, + "step": 45 + }, + { + "entropy": 0.5565362870693207, + "epoch": 0.007982120051085569, + "grad_norm": 1.8532217741012573, + "learning_rate": 4.9999943381355846e-06, + "loss": 0.5568, + "mean_token_accuracy": 0.8251007676124573, + "num_tokens": 409600.0, + "step": 50 + }, + { + "entropy": 0.5010187983512878, + "epoch": 0.008780332056194126, + "grad_norm": 1.9065581560134888, + "learning_rate": 4.999993123700602e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8482044339179993, + "num_tokens": 450560.0, + "step": 55 + }, + { + "entropy": 0.5530545592308045, + "epoch": 0.009578544061302681, + "grad_norm": 1.9626890420913696, + "learning_rate": 4.999991791359612e-06, + "loss": 0.5356, + "mean_token_accuracy": 0.8315532445907593, + "num_tokens": 491520.0, + "step": 60 + }, + { + "entropy": 0.5973767757415771, + "epoch": 0.010376756066411238, + "grad_norm": 1.920264482498169, + "learning_rate": 4.999990341112699e-06, + "loss": 0.5913, + "mean_token_accuracy": 0.815071988105774, + "num_tokens": 532480.0, + "step": 65 + }, + { + "entropy": 0.5213993549346924, + "epoch": 0.011174968071519796, + "grad_norm": 1.8560436964035034, + "learning_rate": 4.999988772959954e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.8370494961738586, + "num_tokens": 573440.0, + "step": 70 + }, + { + "entropy": 0.47170872092247007, + "epoch": 0.011973180076628353, + "grad_norm": 1.7740520238876343, + "learning_rate": 4.999987086901475e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8490700483322143, + "num_tokens": 614400.0, + "step": 75 + }, + { + "entropy": 0.5295802831649781, + "epoch": 0.01277139208173691, + "grad_norm": 1.9278310537338257, + "learning_rate": 4.999985282937368e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8388131141662598, + "num_tokens": 655360.0, + "step": 80 + }, + { + "entropy": 0.5061579287052155, + "epoch": 0.013569604086845466, + "grad_norm": 1.9197410345077515, + "learning_rate": 4.999983361067747e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8418465852737427, + "num_tokens": 696320.0, + "step": 85 + }, + { + "entropy": 0.5168275713920594, + "epoch": 0.014367816091954023, + "grad_norm": 4.458969593048096, + "learning_rate": 4.999981321292733e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.8336622238159179, + "num_tokens": 737280.0, + "step": 90 + }, + { + "entropy": 0.5189633309841156, + "epoch": 0.01516602809706258, + "grad_norm": 1.9136766195297241, + "learning_rate": 4.9999791636124526e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.8349774599075317, + "num_tokens": 778240.0, + "step": 95 + }, + { + "entropy": 0.5578200101852417, + "epoch": 0.015964240102171137, + "grad_norm": 1.945987582206726, + "learning_rate": 4.999976888027044e-06, + "loss": 0.5269, + "mean_token_accuracy": 0.8311297535896301, + "num_tokens": 819200.0, + "step": 100 + }, + { + "entropy": 0.5309346675872803, + "epoch": 0.016762452107279693, + "grad_norm": 1.6795809268951416, + "learning_rate": 4.999974494536648e-06, + "loss": 0.5112, + "mean_token_accuracy": 0.8364368081092834, + "num_tokens": 860160.0, + "step": 105 + }, + { + "entropy": 0.4735103607177734, + "epoch": 0.01756066411238825, + "grad_norm": 2.085732936859131, + "learning_rate": 4.9999719831414165e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.848661732673645, + "num_tokens": 901120.0, + "step": 110 + }, + { + "entropy": 0.46962087154388427, + "epoch": 0.018358876117496807, + "grad_norm": 1.8151161670684814, + "learning_rate": 4.999969353841507e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8622438430786132, + "num_tokens": 942080.0, + "step": 115 + }, + { + "entropy": 0.5223342657089234, + "epoch": 0.019157088122605363, + "grad_norm": 1.746271014213562, + "learning_rate": 4.9999666066370854e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.8385911345481872, + "num_tokens": 983040.0, + "step": 120 + }, + { + "entropy": 0.5088116765022278, + "epoch": 0.01995530012771392, + "grad_norm": 1.7642600536346436, + "learning_rate": 4.999963741528323e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.836833918094635, + "num_tokens": 1024000.0, + "step": 125 + }, + { + "entropy": 0.5108103513717651, + "epoch": 0.020753512132822477, + "grad_norm": 1.894156813621521, + "learning_rate": 4.999960758515402e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8450562000274658, + "num_tokens": 1064960.0, + "step": 130 + }, + { + "entropy": 0.525665408372879, + "epoch": 0.021551724137931036, + "grad_norm": 1.8148276805877686, + "learning_rate": 4.999957657598509e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8344801664352417, + "num_tokens": 1105920.0, + "step": 135 + }, + { + "entropy": 0.46289663314819335, + "epoch": 0.02234993614303959, + "grad_norm": 1.8011553287506104, + "learning_rate": 4.9999544387778385e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8528064727783203, + "num_tokens": 1146880.0, + "step": 140 + }, + { + "entropy": 0.47348416447639463, + "epoch": 0.023148148148148147, + "grad_norm": 1.655940055847168, + "learning_rate": 4.999951102053593e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8520541548728943, + "num_tokens": 1187840.0, + "step": 145 + }, + { + "entropy": 0.4965839982032776, + "epoch": 0.023946360153256706, + "grad_norm": 1.81589674949646, + "learning_rate": 4.999947647425983e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8466312050819397, + "num_tokens": 1228800.0, + "step": 150 + }, + { + "entropy": 0.4940677106380463, + "epoch": 0.02474457215836526, + "grad_norm": 1.7606313228607178, + "learning_rate": 4.999944074895225e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.847114372253418, + "num_tokens": 1269760.0, + "step": 155 + }, + { + "entropy": 0.5418534994125366, + "epoch": 0.02554278416347382, + "grad_norm": 1.75408935546875, + "learning_rate": 4.999940384461543e-06, + "loss": 0.5244, + "mean_token_accuracy": 0.8292249202728271, + "num_tokens": 1310720.0, + "step": 160 + }, + { + "entropy": 0.4658185839653015, + "epoch": 0.026340996168582376, + "grad_norm": 1.655227541923523, + "learning_rate": 4.999936576125173e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8518807768821717, + "num_tokens": 1351680.0, + "step": 165 + }, + { + "entropy": 0.576075655221939, + "epoch": 0.02713920817369093, + "grad_norm": 2.072255849838257, + "learning_rate": 4.999932649886349e-06, + "loss": 0.554, + "mean_token_accuracy": 0.8192096590995789, + "num_tokens": 1392640.0, + "step": 170 + }, + { + "entropy": 0.4817675709724426, + "epoch": 0.02793742017879949, + "grad_norm": 2.084299087524414, + "learning_rate": 4.999928605745321e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8512140274047851, + "num_tokens": 1433600.0, + "step": 175 + }, + { + "entropy": 0.45817756056785586, + "epoch": 0.028735632183908046, + "grad_norm": 1.553299903869629, + "learning_rate": 4.999924443702344e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8554870367050171, + "num_tokens": 1474560.0, + "step": 180 + }, + { + "entropy": 0.5050954639911651, + "epoch": 0.029533844189016605, + "grad_norm": 1.8562591075897217, + "learning_rate": 4.9999201637576775e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8387478709220886, + "num_tokens": 1515520.0, + "step": 185 + }, + { + "entropy": 0.4602130055427551, + "epoch": 0.03033205619412516, + "grad_norm": 1.8530664443969727, + "learning_rate": 4.999915765911592e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8530979156494141, + "num_tokens": 1556480.0, + "step": 190 + }, + { + "entropy": 0.47528478503227234, + "epoch": 0.031130268199233715, + "grad_norm": 1.7237930297851562, + "learning_rate": 4.9999112501643635e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8522473692893981, + "num_tokens": 1597440.0, + "step": 195 + }, + { + "entropy": 0.48855978846549986, + "epoch": 0.031928480204342274, + "grad_norm": 1.7260383367538452, + "learning_rate": 4.9999066165162755e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8458796501159668, + "num_tokens": 1638400.0, + "step": 200 + }, + { + "epoch": 0.031928480204342274, + "eval_entropy": 0.47576266503334047, + "eval_loss": 0.45461931824684143, + "eval_mean_token_accuracy": 0.8487958540916443, + "eval_num_tokens": 1638400.0, + "eval_runtime": 69.1687, + "eval_samples_per_second": 14.457, + "eval_steps_per_second": 1.807, + "step": 200 + }, + { + "entropy": 0.515596067905426, + "epoch": 0.03272669220945083, + "grad_norm": 2.013831377029419, + "learning_rate": 4.999901864967621e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.8359745025634766, + "num_tokens": 1679360.0, + "step": 205 + }, + { + "entropy": 0.48475693464279174, + "epoch": 0.033524904214559385, + "grad_norm": 1.9115880727767944, + "learning_rate": 4.999896995518698e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8532404661178589, + "num_tokens": 1720320.0, + "step": 210 + }, + { + "entropy": 0.4396271646022797, + "epoch": 0.034323116219667944, + "grad_norm": 1.712744116783142, + "learning_rate": 4.999892008169811e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8664462685585022, + "num_tokens": 1761280.0, + "step": 215 + }, + { + "entropy": 0.5019235789775849, + "epoch": 0.0351213282247765, + "grad_norm": 1.7797718048095703, + "learning_rate": 4.9998869029212766e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8428232192993164, + "num_tokens": 1802240.0, + "step": 220 + }, + { + "entropy": 0.45359727144241335, + "epoch": 0.035919540229885055, + "grad_norm": 1.6519943475723267, + "learning_rate": 4.999881679773414e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8586545705795288, + "num_tokens": 1843200.0, + "step": 225 + }, + { + "entropy": 0.45751402378082273, + "epoch": 0.036717752234993614, + "grad_norm": 1.8015882968902588, + "learning_rate": 4.999876338726552e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8589802742004394, + "num_tokens": 1884160.0, + "step": 230 + }, + { + "entropy": 0.447160130739212, + "epoch": 0.03751596424010217, + "grad_norm": 1.7113579511642456, + "learning_rate": 4.999870879781027e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8592816829681397, + "num_tokens": 1925120.0, + "step": 235 + }, + { + "entropy": 0.4892503976821899, + "epoch": 0.038314176245210725, + "grad_norm": 2.0123822689056396, + "learning_rate": 4.999865302937182e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8455776095390319, + "num_tokens": 1966080.0, + "step": 240 + }, + { + "entropy": 0.4409046471118927, + "epoch": 0.039112388250319284, + "grad_norm": 1.7804988622665405, + "learning_rate": 4.999859608195366e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8591686248779297, + "num_tokens": 2007040.0, + "step": 245 + }, + { + "entropy": 0.4628835916519165, + "epoch": 0.03991060025542784, + "grad_norm": 1.730065941810608, + "learning_rate": 4.999853795555939e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8521682500839234, + "num_tokens": 2048000.0, + "step": 250 + }, + { + "entropy": 0.40462678074836733, + "epoch": 0.040708812260536395, + "grad_norm": 1.8947077989578247, + "learning_rate": 4.999847865019267e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8727653503417969, + "num_tokens": 2088960.0, + "step": 255 + }, + { + "entropy": 0.4546548843383789, + "epoch": 0.041507024265644954, + "grad_norm": 1.7862297296524048, + "learning_rate": 4.999841816585722e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8530672907829284, + "num_tokens": 2129490.0, + "step": 260 + }, + { + "entropy": 0.45019919872283937, + "epoch": 0.04230523627075351, + "grad_norm": 1.7621231079101562, + "learning_rate": 4.999835650255683e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8593084692955018, + "num_tokens": 2170450.0, + "step": 265 + }, + { + "entropy": 0.5008151054382324, + "epoch": 0.04310344827586207, + "grad_norm": 1.917815923690796, + "learning_rate": 4.99982936602954e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8447394847869873, + "num_tokens": 2211180.0, + "step": 270 + }, + { + "entropy": 0.4359456837177277, + "epoch": 0.043901660280970624, + "grad_norm": 1.7914767265319824, + "learning_rate": 4.999822963907688e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8615276694297791, + "num_tokens": 2252140.0, + "step": 275 + }, + { + "entropy": 0.4787565290927887, + "epoch": 0.04469987228607918, + "grad_norm": 2.0970892906188965, + "learning_rate": 4.999816443890528e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8491812825202942, + "num_tokens": 2293100.0, + "step": 280 + }, + { + "entropy": 0.45131545066833495, + "epoch": 0.04549808429118774, + "grad_norm": 1.5809624195098877, + "learning_rate": 4.99980980597847e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8572892904281616, + "num_tokens": 2334060.0, + "step": 285 + }, + { + "entropy": 0.4229037821292877, + "epoch": 0.046296296296296294, + "grad_norm": 1.4807138442993164, + "learning_rate": 4.999803050171935e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8665313005447388, + "num_tokens": 2375020.0, + "step": 290 + }, + { + "entropy": 0.4446040511131287, + "epoch": 0.04709450830140485, + "grad_norm": 1.7885520458221436, + "learning_rate": 4.999796176471343e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8560836672782898, + "num_tokens": 2415980.0, + "step": 295 + }, + { + "entropy": 0.45139994025230407, + "epoch": 0.04789272030651341, + "grad_norm": 1.6485087871551514, + "learning_rate": 4.999789184877129e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8609436988830567, + "num_tokens": 2456940.0, + "step": 300 + }, + { + "entropy": 0.4461036443710327, + "epoch": 0.048690932311621964, + "grad_norm": 1.5833100080490112, + "learning_rate": 4.999782075389732e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8589345335960388, + "num_tokens": 2497900.0, + "step": 305 + }, + { + "entropy": 0.39054363369941714, + "epoch": 0.04948914431673052, + "grad_norm": 1.6196880340576172, + "learning_rate": 4.9997748480096e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.873320484161377, + "num_tokens": 2538860.0, + "step": 310 + }, + { + "entropy": 0.49510942697525023, + "epoch": 0.05028735632183908, + "grad_norm": 1.8079206943511963, + "learning_rate": 4.9997675027371855e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8449161767959594, + "num_tokens": 2579820.0, + "step": 315 + }, + { + "entropy": 0.49750194549560545, + "epoch": 0.05108556832694764, + "grad_norm": 1.7544002532958984, + "learning_rate": 4.999760039572952e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8446771740913391, + "num_tokens": 2620780.0, + "step": 320 + }, + { + "entropy": 0.4802740037441254, + "epoch": 0.05188378033205619, + "grad_norm": 1.807607889175415, + "learning_rate": 4.999752458517367e-06, + "loss": 0.445, + "mean_token_accuracy": 0.852498996257782, + "num_tokens": 2661740.0, + "step": 325 + }, + { + "entropy": 0.45189476013183594, + "epoch": 0.05268199233716475, + "grad_norm": 1.7295165061950684, + "learning_rate": 4.99974475957091e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8570088505744934, + "num_tokens": 2702700.0, + "step": 330 + }, + { + "entropy": 0.4113829493522644, + "epoch": 0.05348020434227331, + "grad_norm": 3.3795394897460938, + "learning_rate": 4.9997369427340635e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8682220339775085, + "num_tokens": 2743660.0, + "step": 335 + }, + { + "entropy": 0.46611291766166685, + "epoch": 0.05427841634738186, + "grad_norm": 1.7831217050552368, + "learning_rate": 4.999729008007319e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8525894761085511, + "num_tokens": 2784620.0, + "step": 340 + }, + { + "entropy": 0.43543928265571596, + "epoch": 0.05507662835249042, + "grad_norm": 1.7297419309616089, + "learning_rate": 4.9997209553911755e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8630126476287842, + "num_tokens": 2825580.0, + "step": 345 + }, + { + "entropy": 0.46474372744560244, + "epoch": 0.05587484035759898, + "grad_norm": 1.8551281690597534, + "learning_rate": 4.99971278488614e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.849402928352356, + "num_tokens": 2866540.0, + "step": 350 + }, + { + "entropy": 0.4573587000370026, + "epoch": 0.05667305236270753, + "grad_norm": 1.977996587753296, + "learning_rate": 4.999704496492726e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8525743603706359, + "num_tokens": 2907500.0, + "step": 355 + }, + { + "entropy": 0.4407190024852753, + "epoch": 0.05747126436781609, + "grad_norm": 1.7498115301132202, + "learning_rate": 4.999696090211454e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8584057688713074, + "num_tokens": 2948460.0, + "step": 360 + }, + { + "entropy": 0.43572052717208865, + "epoch": 0.05826947637292465, + "grad_norm": 1.7404518127441406, + "learning_rate": 4.999687566042853e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8642658710479736, + "num_tokens": 2989420.0, + "step": 365 + }, + { + "entropy": 0.43591675758361814, + "epoch": 0.05906768837803321, + "grad_norm": 1.9127355813980103, + "learning_rate": 4.999678923987459e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8618336081504822, + "num_tokens": 3030380.0, + "step": 370 + }, + { + "entropy": 0.48763387799263, + "epoch": 0.05986590038314176, + "grad_norm": 1.9815560579299927, + "learning_rate": 4.999670164045816e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8467477440834046, + "num_tokens": 3071340.0, + "step": 375 + }, + { + "entropy": 0.49417877197265625, + "epoch": 0.06066411238825032, + "grad_norm": 1.7125637531280518, + "learning_rate": 4.9996612862184745e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8464985489845276, + "num_tokens": 3112300.0, + "step": 380 + }, + { + "entropy": 0.4436693489551544, + "epoch": 0.06146232439335888, + "grad_norm": 1.6935256719589233, + "learning_rate": 4.999652290505993e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8586057186126709, + "num_tokens": 3153260.0, + "step": 385 + }, + { + "entropy": 0.3957001864910126, + "epoch": 0.06226053639846743, + "grad_norm": 1.5968080759048462, + "learning_rate": 4.999643176908937e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8730635643005371, + "num_tokens": 3194220.0, + "step": 390 + }, + { + "entropy": 0.4333923041820526, + "epoch": 0.06305874840357599, + "grad_norm": 1.6355855464935303, + "learning_rate": 4.999633945427879e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8614879369735717, + "num_tokens": 3235180.0, + "step": 395 + }, + { + "entropy": 0.4490614771842957, + "epoch": 0.06385696040868455, + "grad_norm": 1.7251191139221191, + "learning_rate": 4.999624596063401e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8589560508728027, + "num_tokens": 3276140.0, + "step": 400 + }, + { + "epoch": 0.06385696040868455, + "eval_entropy": 0.456509140253067, + "eval_loss": 0.42409759759902954, + "eval_mean_token_accuracy": 0.8566836671829223, + "eval_num_tokens": 3276140.0, + "eval_runtime": 69.3986, + "eval_samples_per_second": 14.41, + "eval_steps_per_second": 1.801, + "step": 400 + }, + { + "entropy": 0.4575593590736389, + "epoch": 0.06465517241379311, + "grad_norm": 1.9416770935058594, + "learning_rate": 4.9996151288160885e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.852801513671875, + "num_tokens": 3317100.0, + "step": 405 + }, + { + "entropy": 0.4455989181995392, + "epoch": 0.06545338441890167, + "grad_norm": 1.639709711074829, + "learning_rate": 4.9996055436865395e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8566305279731751, + "num_tokens": 3358060.0, + "step": 410 + }, + { + "entropy": 0.436484295129776, + "epoch": 0.06625159642401021, + "grad_norm": 1.7730813026428223, + "learning_rate": 4.999595840675355e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8643952012062073, + "num_tokens": 3399020.0, + "step": 415 + }, + { + "entropy": 0.4210900843143463, + "epoch": 0.06704980842911877, + "grad_norm": 1.8874180316925049, + "learning_rate": 4.999586019783145e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8668930411338807, + "num_tokens": 3439980.0, + "step": 420 + }, + { + "entropy": 0.4613493621349335, + "epoch": 0.06784802043422733, + "grad_norm": 1.9141099452972412, + "learning_rate": 4.999576081010529e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8520909667015075, + "num_tokens": 3480940.0, + "step": 425 + }, + { + "entropy": 0.4226214110851288, + "epoch": 0.06864623243933589, + "grad_norm": 1.8183785676956177, + "learning_rate": 4.99956602435813e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8657994747161866, + "num_tokens": 3521900.0, + "step": 430 + }, + { + "entropy": 0.3875891506671906, + "epoch": 0.06944444444444445, + "grad_norm": 1.5938745737075806, + "learning_rate": 4.999555849826582e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8773833990097046, + "num_tokens": 3562860.0, + "step": 435 + }, + { + "entropy": 0.4186811327934265, + "epoch": 0.070242656449553, + "grad_norm": 1.7199757099151611, + "learning_rate": 4.999545557416523e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8652471303939819, + "num_tokens": 3603820.0, + "step": 440 + }, + { + "entropy": 0.43029149770736697, + "epoch": 0.07104086845466155, + "grad_norm": 1.851920247077942, + "learning_rate": 4.9995351471286015e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8676915645599366, + "num_tokens": 3644780.0, + "step": 445 + }, + { + "entropy": 0.45375868678092957, + "epoch": 0.07183908045977011, + "grad_norm": 1.766336441040039, + "learning_rate": 4.9995246189634715e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8571653246879578, + "num_tokens": 3685740.0, + "step": 450 + }, + { + "entropy": 0.4406971275806427, + "epoch": 0.07263729246487867, + "grad_norm": 1.7922120094299316, + "learning_rate": 4.999513972921796e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8598119735717773, + "num_tokens": 3726700.0, + "step": 455 + }, + { + "entropy": 0.3757154107093811, + "epoch": 0.07343550446998723, + "grad_norm": 1.6567347049713135, + "learning_rate": 4.999503209004244e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8820300817489624, + "num_tokens": 3767660.0, + "step": 460 + }, + { + "entropy": 0.4569710433483124, + "epoch": 0.07423371647509579, + "grad_norm": 1.678511142730713, + "learning_rate": 4.9994923272114905e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8535709857940674, + "num_tokens": 3808620.0, + "step": 465 + }, + { + "entropy": 0.4190040946006775, + "epoch": 0.07503192848020435, + "grad_norm": 1.7449942827224731, + "learning_rate": 4.999481327544224e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8726458072662353, + "num_tokens": 3849580.0, + "step": 470 + }, + { + "entropy": 0.4649412989616394, + "epoch": 0.0758301404853129, + "grad_norm": 1.9117767810821533, + "learning_rate": 4.999470210003132e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8512534976005555, + "num_tokens": 3890540.0, + "step": 475 + }, + { + "entropy": 0.4217049300670624, + "epoch": 0.07662835249042145, + "grad_norm": 1.6323946714401245, + "learning_rate": 4.9994589745889155e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8682831287384033, + "num_tokens": 3931500.0, + "step": 480 + }, + { + "entropy": 0.4169506013393402, + "epoch": 0.07742656449553001, + "grad_norm": 1.7119159698486328, + "learning_rate": 4.9994476213022804e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8702028751373291, + "num_tokens": 3972460.0, + "step": 485 + }, + { + "entropy": 0.4700320720672607, + "epoch": 0.07822477650063857, + "grad_norm": 1.8065766096115112, + "learning_rate": 4.999436150143941e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8486562728881836, + "num_tokens": 4013420.0, + "step": 490 + }, + { + "entropy": 0.5004218697547913, + "epoch": 0.07902298850574713, + "grad_norm": 1.7902945280075073, + "learning_rate": 4.99942456111462e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8416624784469604, + "num_tokens": 4054380.0, + "step": 495 + }, + { + "entropy": 0.38799951076507566, + "epoch": 0.07982120051085569, + "grad_norm": 1.724098563194275, + "learning_rate": 4.999412854215044e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8762063622474671, + "num_tokens": 4095340.0, + "step": 500 + }, + { + "entropy": 0.42835206985473634, + "epoch": 0.08061941251596424, + "grad_norm": 1.63278067111969, + "learning_rate": 4.99940102944595e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8626661181449891, + "num_tokens": 4136300.0, + "step": 505 + }, + { + "entropy": 0.4672918081283569, + "epoch": 0.08141762452107279, + "grad_norm": 1.9030835628509521, + "learning_rate": 4.999389086808082e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8479044079780579, + "num_tokens": 4177260.0, + "step": 510 + }, + { + "entropy": 0.43039371371269225, + "epoch": 0.08221583652618135, + "grad_norm": 1.7228889465332031, + "learning_rate": 4.99937702630219e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8647573947906494, + "num_tokens": 4218220.0, + "step": 515 + }, + { + "entropy": 0.44171770811080935, + "epoch": 0.08301404853128991, + "grad_norm": 1.7838937044143677, + "learning_rate": 4.999364847929033e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8621063232421875, + "num_tokens": 4259180.0, + "step": 520 + }, + { + "entropy": 0.44105868935585024, + "epoch": 0.08381226053639847, + "grad_norm": 1.8959293365478516, + "learning_rate": 4.9993525516893775e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8541361808776855, + "num_tokens": 4300140.0, + "step": 525 + }, + { + "entropy": 0.46709821820259095, + "epoch": 0.08461047254150703, + "grad_norm": 1.7269469499588013, + "learning_rate": 4.999340137583995e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8574911475181579, + "num_tokens": 4341100.0, + "step": 530 + }, + { + "entropy": 0.5288523077964783, + "epoch": 0.08540868454661558, + "grad_norm": 1.8445000648498535, + "learning_rate": 4.999327605613668e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.839403486251831, + "num_tokens": 4382060.0, + "step": 535 + }, + { + "entropy": 0.42816867828369143, + "epoch": 0.08620689655172414, + "grad_norm": 1.7641640901565552, + "learning_rate": 4.999314955779183e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8624008417129516, + "num_tokens": 4423020.0, + "step": 540 + }, + { + "entropy": 0.40726423263549805, + "epoch": 0.08700510855683269, + "grad_norm": 1.865195870399475, + "learning_rate": 4.999302188081338e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.868608021736145, + "num_tokens": 4463980.0, + "step": 545 + }, + { + "entropy": 0.4436700284481049, + "epoch": 0.08780332056194125, + "grad_norm": 1.7468849420547485, + "learning_rate": 4.999289302520932e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8610162615776062, + "num_tokens": 4504795.0, + "step": 550 + }, + { + "entropy": 0.4452336847782135, + "epoch": 0.0886015325670498, + "grad_norm": 1.5321035385131836, + "learning_rate": 4.999276299098779e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8643596410751343, + "num_tokens": 4545755.0, + "step": 555 + }, + { + "entropy": 0.3910303771495819, + "epoch": 0.08939974457215837, + "grad_norm": 1.6199473142623901, + "learning_rate": 4.999263177815693e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.874867069721222, + "num_tokens": 4586715.0, + "step": 560 + }, + { + "entropy": 0.44607588052749636, + "epoch": 0.09019795657726692, + "grad_norm": 1.7175859212875366, + "learning_rate": 4.999249938672503e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8542021036148071, + "num_tokens": 4627675.0, + "step": 565 + }, + { + "entropy": 0.4354964554309845, + "epoch": 0.09099616858237548, + "grad_norm": 2.050355911254883, + "learning_rate": 4.9992365816700394e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8629257440567016, + "num_tokens": 4668635.0, + "step": 570 + }, + { + "entropy": 0.48360870480537416, + "epoch": 0.09179438058748404, + "grad_norm": 1.6667331457138062, + "learning_rate": 4.999223106809142e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8496447801589966, + "num_tokens": 4709595.0, + "step": 575 + }, + { + "entropy": 0.46697754263877866, + "epoch": 0.09259259259259259, + "grad_norm": 1.8308717012405396, + "learning_rate": 4.999209514090658e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8522969365119935, + "num_tokens": 4750555.0, + "step": 580 + }, + { + "entropy": 0.431224399805069, + "epoch": 0.09339080459770115, + "grad_norm": 1.7282050848007202, + "learning_rate": 4.999195803515444e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8619638442993164, + "num_tokens": 4791515.0, + "step": 585 + }, + { + "entropy": 0.48489547371864317, + "epoch": 0.0941890166028097, + "grad_norm": 1.8322498798370361, + "learning_rate": 4.99918197508436e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8451943159103393, + "num_tokens": 4832475.0, + "step": 590 + }, + { + "entropy": 0.42147684693336485, + "epoch": 0.09498722860791826, + "grad_norm": 1.637582540512085, + "learning_rate": 4.999168028798277e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8680244445800781, + "num_tokens": 4873435.0, + "step": 595 + }, + { + "entropy": 0.4362496554851532, + "epoch": 0.09578544061302682, + "grad_norm": 1.850331425666809, + "learning_rate": 4.999153964658071e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8630183935165405, + "num_tokens": 4914395.0, + "step": 600 + }, + { + "epoch": 0.09578544061302682, + "eval_entropy": 0.44316129422187805, + "eval_loss": 0.4095243811607361, + "eval_mean_token_accuracy": 0.8603043389320374, + "eval_num_tokens": 4914395.0, + "eval_runtime": 69.1898, + "eval_samples_per_second": 14.453, + "eval_steps_per_second": 1.807, + "step": 600 + }, + { + "entropy": 0.44732285141944883, + "epoch": 0.09658365261813538, + "grad_norm": 1.751281499862671, + "learning_rate": 4.999139782664627e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8580738425254821, + "num_tokens": 4955355.0, + "step": 605 + }, + { + "entropy": 0.44980551600456237, + "epoch": 0.09738186462324393, + "grad_norm": 1.8443349599838257, + "learning_rate": 4.999125482818837e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8509649872779846, + "num_tokens": 4996315.0, + "step": 610 + }, + { + "entropy": 0.4144567608833313, + "epoch": 0.09818007662835249, + "grad_norm": 1.7080366611480713, + "learning_rate": 4.9991110651216e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8717723250389099, + "num_tokens": 5037275.0, + "step": 615 + }, + { + "entropy": 0.38187943696975707, + "epoch": 0.09897828863346105, + "grad_norm": 1.6727166175842285, + "learning_rate": 4.999096529573822e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8789412021636963, + "num_tokens": 5078235.0, + "step": 620 + }, + { + "entropy": 0.41196991205215455, + "epoch": 0.0997765006385696, + "grad_norm": 1.7526803016662598, + "learning_rate": 4.999081876176418e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8686864018440247, + "num_tokens": 5119195.0, + "step": 625 + }, + { + "entropy": 0.4536400198936462, + "epoch": 0.10057471264367816, + "grad_norm": 1.753389596939087, + "learning_rate": 4.999067104930308e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8584004759788513, + "num_tokens": 5160105.0, + "step": 630 + }, + { + "entropy": 0.42510269284248353, + "epoch": 0.10137292464878672, + "grad_norm": 1.7358981370925903, + "learning_rate": 4.999052215836423e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8650004863739014, + "num_tokens": 5201065.0, + "step": 635 + }, + { + "entropy": 0.4355582058429718, + "epoch": 0.10217113665389528, + "grad_norm": 1.642331600189209, + "learning_rate": 4.999037208895699e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8562347054481506, + "num_tokens": 5242025.0, + "step": 640 + }, + { + "entropy": 0.44882028698921206, + "epoch": 0.10296934865900383, + "grad_norm": 6.373381614685059, + "learning_rate": 4.9990220841090775e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8576697468757629, + "num_tokens": 5282985.0, + "step": 645 + }, + { + "entropy": 0.408430814743042, + "epoch": 0.10376756066411238, + "grad_norm": 1.6307820081710815, + "learning_rate": 4.999006841477512e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8739194750785828, + "num_tokens": 5323945.0, + "step": 650 + }, + { + "entropy": 0.40124152302742006, + "epoch": 0.10456577266922094, + "grad_norm": 1.6492515802383423, + "learning_rate": 4.998991481001959e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8745116472244263, + "num_tokens": 5364905.0, + "step": 655 + }, + { + "entropy": 0.44308696389198304, + "epoch": 0.1053639846743295, + "grad_norm": 1.9647578001022339, + "learning_rate": 4.998976002683385e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8612207174301147, + "num_tokens": 5405865.0, + "step": 660 + }, + { + "entropy": 0.4585953652858734, + "epoch": 0.10616219667943806, + "grad_norm": 1.82151460647583, + "learning_rate": 4.9989604065227655e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8487472653388977, + "num_tokens": 5446825.0, + "step": 665 + }, + { + "entropy": 0.40014599561691283, + "epoch": 0.10696040868454662, + "grad_norm": 1.619546890258789, + "learning_rate": 4.998944692521078e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8710795283317566, + "num_tokens": 5487785.0, + "step": 670 + }, + { + "entropy": 0.439222115278244, + "epoch": 0.10775862068965517, + "grad_norm": 1.5537372827529907, + "learning_rate": 4.9989288606793126e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8664328694343567, + "num_tokens": 5528745.0, + "step": 675 + }, + { + "entropy": 0.3984904527664185, + "epoch": 0.10855683269476372, + "grad_norm": 1.7261492013931274, + "learning_rate": 4.998912910998465e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.875708258152008, + "num_tokens": 5569705.0, + "step": 680 + }, + { + "entropy": 0.40191400051116943, + "epoch": 0.10935504469987228, + "grad_norm": 1.675265908241272, + "learning_rate": 4.998896843479537e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8739235281944275, + "num_tokens": 5610665.0, + "step": 685 + }, + { + "entropy": 0.3728193402290344, + "epoch": 0.11015325670498084, + "grad_norm": 1.590753436088562, + "learning_rate": 4.9988806581235385e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8822915911674499, + "num_tokens": 5651625.0, + "step": 690 + }, + { + "entropy": 0.42136245369911196, + "epoch": 0.1109514687100894, + "grad_norm": 1.5533643960952759, + "learning_rate": 4.9988643549314895e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8653243780136108, + "num_tokens": 5692585.0, + "step": 695 + }, + { + "entropy": 0.43504267930984497, + "epoch": 0.11174968071519796, + "grad_norm": 1.715844750404358, + "learning_rate": 4.998847933904414e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8631648063659668, + "num_tokens": 5733545.0, + "step": 700 + }, + { + "entropy": 0.4285335838794708, + "epoch": 0.11254789272030652, + "grad_norm": 1.579676866531372, + "learning_rate": 4.998831395043344e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8621499538421631, + "num_tokens": 5774505.0, + "step": 705 + }, + { + "entropy": 0.42913134694099425, + "epoch": 0.11334610472541506, + "grad_norm": 1.932811975479126, + "learning_rate": 4.998814738349322e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8622578978538513, + "num_tokens": 5815465.0, + "step": 710 + }, + { + "entropy": 0.44279545545578003, + "epoch": 0.11414431673052362, + "grad_norm": 1.821500539779663, + "learning_rate": 4.99879796382339e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8612294793128967, + "num_tokens": 5856425.0, + "step": 715 + }, + { + "entropy": 0.423015832901001, + "epoch": 0.11494252873563218, + "grad_norm": 1.6205596923828125, + "learning_rate": 4.998781071466609e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8664390563964843, + "num_tokens": 5897385.0, + "step": 720 + }, + { + "entropy": 0.43310112357139585, + "epoch": 0.11574074074074074, + "grad_norm": 1.5314807891845703, + "learning_rate": 4.9987640612800395e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8635281443595886, + "num_tokens": 5938345.0, + "step": 725 + }, + { + "entropy": 0.45245509743690493, + "epoch": 0.1165389527458493, + "grad_norm": 1.9062870740890503, + "learning_rate": 4.998746933264749e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8594639301300049, + "num_tokens": 5979305.0, + "step": 730 + }, + { + "entropy": 0.4959665656089783, + "epoch": 0.11733716475095786, + "grad_norm": 1.8808233737945557, + "learning_rate": 4.998729687421816e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8400404214859009, + "num_tokens": 6020265.0, + "step": 735 + }, + { + "entropy": 0.4221862554550171, + "epoch": 0.11813537675606642, + "grad_norm": 1.8408641815185547, + "learning_rate": 4.998712323752325e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8666460990905762, + "num_tokens": 6061225.0, + "step": 740 + }, + { + "entropy": 0.43175222873687746, + "epoch": 0.11893358876117496, + "grad_norm": 1.737856388092041, + "learning_rate": 4.998694842257367e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.865670645236969, + "num_tokens": 6102185.0, + "step": 745 + }, + { + "entropy": 0.43961183428764344, + "epoch": 0.11973180076628352, + "grad_norm": 1.624342679977417, + "learning_rate": 4.998677242938043e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.859315299987793, + "num_tokens": 6143145.0, + "step": 750 + }, + { + "entropy": 0.45612156987190244, + "epoch": 0.12053001277139208, + "grad_norm": 1.8945883512496948, + "learning_rate": 4.998659525795459e-06, + "loss": 0.422, + "mean_token_accuracy": 0.854542326927185, + "num_tokens": 6184105.0, + "step": 755 + }, + { + "entropy": 0.41530930399894717, + "epoch": 0.12132822477650064, + "grad_norm": 1.8583099842071533, + "learning_rate": 4.998641690830728e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8694210529327393, + "num_tokens": 6225065.0, + "step": 760 + }, + { + "entropy": 0.39272274971008303, + "epoch": 0.1221264367816092, + "grad_norm": 1.621265172958374, + "learning_rate": 4.9986237380449734e-06, + "loss": 0.347, + "mean_token_accuracy": 0.880846381187439, + "num_tokens": 6266025.0, + "step": 765 + }, + { + "entropy": 0.4349696278572083, + "epoch": 0.12292464878671776, + "grad_norm": 1.6760231256484985, + "learning_rate": 4.998605667439322e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8591925144195557, + "num_tokens": 6306985.0, + "step": 770 + }, + { + "entropy": 0.3744224488735199, + "epoch": 0.1237228607918263, + "grad_norm": 1.481189250946045, + "learning_rate": 4.998587479014912e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8797758460044861, + "num_tokens": 6347945.0, + "step": 775 + }, + { + "entropy": 0.39019187688827517, + "epoch": 0.12452107279693486, + "grad_norm": 1.720441460609436, + "learning_rate": 4.998569172772886e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8817312359809876, + "num_tokens": 6388905.0, + "step": 780 + }, + { + "entropy": 0.4243058919906616, + "epoch": 0.12531928480204343, + "grad_norm": 1.5835771560668945, + "learning_rate": 4.9985507487143964e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8680808663368225, + "num_tokens": 6429865.0, + "step": 785 + }, + { + "entropy": 0.43315653800964354, + "epoch": 0.12611749680715198, + "grad_norm": 1.8805222511291504, + "learning_rate": 4.9985322068406e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8571700334548951, + "num_tokens": 6470825.0, + "step": 790 + }, + { + "entropy": 0.4055586576461792, + "epoch": 0.12691570881226052, + "grad_norm": 1.5204501152038574, + "learning_rate": 4.998513547152665e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8707727670669556, + "num_tokens": 6511785.0, + "step": 795 + }, + { + "entropy": 0.4430310487747192, + "epoch": 0.1277139208173691, + "grad_norm": 1.6038336753845215, + "learning_rate": 4.998494769651762e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8609652161598206, + "num_tokens": 6552745.0, + "step": 800 + }, + { + "epoch": 0.1277139208173691, + "eval_entropy": 0.4325378756523132, + "eval_loss": 0.4014604687690735, + "eval_mean_token_accuracy": 0.8624647760391235, + "eval_num_tokens": 6552745.0, + "eval_runtime": 69.2056, + "eval_samples_per_second": 14.45, + "eval_steps_per_second": 1.806, + "step": 800 + }, + { + "entropy": 0.4043457269668579, + "epoch": 0.12851213282247764, + "grad_norm": 1.5992178916931152, + "learning_rate": 4.998475874339074e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8718292593955994, + "num_tokens": 6593705.0, + "step": 805 + }, + { + "entropy": 0.42595568895339964, + "epoch": 0.12931034482758622, + "grad_norm": 1.7485393285751343, + "learning_rate": 4.998456861215789e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8653177976608276, + "num_tokens": 6634665.0, + "step": 810 + }, + { + "entropy": 0.4524208724498749, + "epoch": 0.13010855683269476, + "grad_norm": 1.942795753479004, + "learning_rate": 4.998437730283102e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8551061153411865, + "num_tokens": 6675625.0, + "step": 815 + }, + { + "entropy": 0.4193409144878387, + "epoch": 0.13090676883780333, + "grad_norm": 1.827155590057373, + "learning_rate": 4.998418481542215e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8670408844947814, + "num_tokens": 6716585.0, + "step": 820 + }, + { + "entropy": 0.44782190322875975, + "epoch": 0.13170498084291188, + "grad_norm": 1.7226262092590332, + "learning_rate": 4.998399114994341e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8589698910713196, + "num_tokens": 6757545.0, + "step": 825 + }, + { + "entropy": 0.4577144384384155, + "epoch": 0.13250319284802042, + "grad_norm": 1.8020867109298706, + "learning_rate": 4.998379630640696e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8565351963043213, + "num_tokens": 6798505.0, + "step": 830 + }, + { + "entropy": 0.38689329028129577, + "epoch": 0.133301404853129, + "grad_norm": 1.7878338098526, + "learning_rate": 4.998360028482505e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.878675889968872, + "num_tokens": 6839465.0, + "step": 835 + }, + { + "entropy": 0.4039497375488281, + "epoch": 0.13409961685823754, + "grad_norm": 1.794539213180542, + "learning_rate": 4.998340308521002e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8708958387374878, + "num_tokens": 6880425.0, + "step": 840 + }, + { + "entropy": 0.40037272572517396, + "epoch": 0.13489782886334611, + "grad_norm": 1.7989890575408936, + "learning_rate": 4.998320470757426e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8725654721260071, + "num_tokens": 6921385.0, + "step": 845 + }, + { + "entropy": 0.4616117298603058, + "epoch": 0.13569604086845466, + "grad_norm": 1.8218706846237183, + "learning_rate": 4.998300515193026e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8513168811798095, + "num_tokens": 6962345.0, + "step": 850 + }, + { + "entropy": 0.440923935174942, + "epoch": 0.13649425287356323, + "grad_norm": 1.712561845779419, + "learning_rate": 4.998280441829054e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8618036508560181, + "num_tokens": 7003305.0, + "step": 855 + }, + { + "entropy": 0.4474347770214081, + "epoch": 0.13729246487867178, + "grad_norm": 1.7691107988357544, + "learning_rate": 4.998260250666775e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8586588621139526, + "num_tokens": 7044265.0, + "step": 860 + }, + { + "entropy": 0.39236196875572205, + "epoch": 0.13809067688378032, + "grad_norm": 1.5966753959655762, + "learning_rate": 4.998239941707457e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.875457501411438, + "num_tokens": 7085225.0, + "step": 865 + }, + { + "entropy": 0.41359818577766416, + "epoch": 0.1388888888888889, + "grad_norm": 1.8600451946258545, + "learning_rate": 4.998219514952378e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.866973626613617, + "num_tokens": 7126185.0, + "step": 870 + }, + { + "entropy": 0.42218191623687745, + "epoch": 0.13968710089399744, + "grad_norm": 1.6097564697265625, + "learning_rate": 4.998198970402822e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.866248095035553, + "num_tokens": 7167145.0, + "step": 875 + }, + { + "entropy": 0.41699051260948183, + "epoch": 0.140485312899106, + "grad_norm": 1.737248182296753, + "learning_rate": 4.998178308060082e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8678730607032776, + "num_tokens": 7208105.0, + "step": 880 + }, + { + "entropy": 0.4190727710723877, + "epoch": 0.14128352490421456, + "grad_norm": 1.5152932405471802, + "learning_rate": 4.998157527925456e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8653331279754639, + "num_tokens": 7249065.0, + "step": 885 + }, + { + "entropy": 0.4779325366020203, + "epoch": 0.1420817369093231, + "grad_norm": 1.8298388719558716, + "learning_rate": 4.998136630000251e-06, + "loss": 0.437, + "mean_token_accuracy": 0.84938884973526, + "num_tokens": 7290025.0, + "step": 890 + }, + { + "entropy": 0.4784541606903076, + "epoch": 0.14287994891443168, + "grad_norm": 1.6877150535583496, + "learning_rate": 4.998115614285782e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8492711663246155, + "num_tokens": 7330985.0, + "step": 895 + }, + { + "entropy": 0.44440594911575315, + "epoch": 0.14367816091954022, + "grad_norm": 1.829339623451233, + "learning_rate": 4.99809448078337e-06, + "loss": 0.411, + "mean_token_accuracy": 0.859615421295166, + "num_tokens": 7371945.0, + "step": 900 + }, + { + "entropy": 0.41583908200263975, + "epoch": 0.1444763729246488, + "grad_norm": 1.5231865644454956, + "learning_rate": 4.9980732294943435e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8711596846580505, + "num_tokens": 7412905.0, + "step": 905 + }, + { + "entropy": 0.45024530291557313, + "epoch": 0.14527458492975734, + "grad_norm": 1.843286156654358, + "learning_rate": 4.998051860420039e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8572163820266724, + "num_tokens": 7453865.0, + "step": 910 + }, + { + "entropy": 0.39993720054626464, + "epoch": 0.1460727969348659, + "grad_norm": 1.4822101593017578, + "learning_rate": 4.998030373561801e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8721619606018066, + "num_tokens": 7494825.0, + "step": 915 + }, + { + "entropy": 0.4568325638771057, + "epoch": 0.14687100893997446, + "grad_norm": 1.7281900644302368, + "learning_rate": 4.99800876892098e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8540958523750305, + "num_tokens": 7535785.0, + "step": 920 + }, + { + "entropy": 0.3957813024520874, + "epoch": 0.147669220945083, + "grad_norm": 1.746334195137024, + "learning_rate": 4.997987046498934e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8738395810127259, + "num_tokens": 7576745.0, + "step": 925 + }, + { + "entropy": 0.4682184398174286, + "epoch": 0.14846743295019157, + "grad_norm": 1.788724660873413, + "learning_rate": 4.99796520629703e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.848229706287384, + "num_tokens": 7617705.0, + "step": 930 + }, + { + "entropy": 0.43413129448890686, + "epoch": 0.14926564495530012, + "grad_norm": 1.674914002418518, + "learning_rate": 4.9979432483166415e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8650355458259582, + "num_tokens": 7658665.0, + "step": 935 + }, + { + "entropy": 0.40451282262802124, + "epoch": 0.1500638569604087, + "grad_norm": 1.8498766422271729, + "learning_rate": 4.997921172559149e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8703109145164489, + "num_tokens": 7699625.0, + "step": 940 + }, + { + "entropy": 0.395086282491684, + "epoch": 0.15086206896551724, + "grad_norm": 1.7072008848190308, + "learning_rate": 4.99789897902594e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8722005605697631, + "num_tokens": 7740585.0, + "step": 945 + }, + { + "entropy": 0.4430574059486389, + "epoch": 0.1516602809706258, + "grad_norm": 1.6196160316467285, + "learning_rate": 4.997876667718411e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8589556455612183, + "num_tokens": 7781545.0, + "step": 950 + }, + { + "entropy": 0.38962970972061156, + "epoch": 0.15245849297573436, + "grad_norm": 1.6539169549942017, + "learning_rate": 4.997854238637964e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8773809552192688, + "num_tokens": 7822505.0, + "step": 955 + }, + { + "entropy": 0.42701526880264284, + "epoch": 0.1532567049808429, + "grad_norm": 1.7930479049682617, + "learning_rate": 4.9978316917860115e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8697462320327759, + "num_tokens": 7863465.0, + "step": 960 + }, + { + "entropy": 0.46107348799705505, + "epoch": 0.15405491698595147, + "grad_norm": 1.8451169729232788, + "learning_rate": 4.997809027163969e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8550493597984314, + "num_tokens": 7904425.0, + "step": 965 + }, + { + "entropy": 0.4252850949764252, + "epoch": 0.15485312899106002, + "grad_norm": 1.5299930572509766, + "learning_rate": 4.997786244773263e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8656162261962891, + "num_tokens": 7945385.0, + "step": 970 + }, + { + "entropy": 0.40233043432235716, + "epoch": 0.1556513409961686, + "grad_norm": 1.7764883041381836, + "learning_rate": 4.997763344615325e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8747176647186279, + "num_tokens": 7986345.0, + "step": 975 + }, + { + "entropy": 0.41937166452407837, + "epoch": 0.15644955300127714, + "grad_norm": 1.5040113925933838, + "learning_rate": 4.997740326691597e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8664495825767518, + "num_tokens": 8027305.0, + "step": 980 + }, + { + "entropy": 0.4262326657772064, + "epoch": 0.1572477650063857, + "grad_norm": 1.381565809249878, + "learning_rate": 4.997717191003525e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8680181741714478, + "num_tokens": 8068265.0, + "step": 985 + }, + { + "entropy": 0.4555032432079315, + "epoch": 0.15804597701149425, + "grad_norm": 1.8633453845977783, + "learning_rate": 4.997693937552564e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8548953294754028, + "num_tokens": 8109225.0, + "step": 990 + }, + { + "entropy": 0.4013682246208191, + "epoch": 0.1588441890166028, + "grad_norm": 1.7503520250320435, + "learning_rate": 4.997670566340176e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8736639976501465, + "num_tokens": 8150185.0, + "step": 995 + }, + { + "entropy": 0.402683812379837, + "epoch": 0.15964240102171137, + "grad_norm": 1.6846781969070435, + "learning_rate": 4.997647077367831e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8719624161720276, + "num_tokens": 8191145.0, + "step": 1000 + }, + { + "epoch": 0.15964240102171137, + "eval_entropy": 0.42480656147003176, + "eval_loss": 0.3921413719654083, + "eval_mean_token_accuracy": 0.8651772165298461, + "eval_num_tokens": 8191145.0, + "eval_runtime": 69.347, + "eval_samples_per_second": 14.42, + "eval_steps_per_second": 1.803, + "step": 1000 + }, + { + "entropy": 0.36306692361831666, + "epoch": 0.16044061302681992, + "grad_norm": 1.4345459938049316, + "learning_rate": 4.997623470637007e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8871942043304444, + "num_tokens": 8232105.0, + "step": 1005 + }, + { + "entropy": 0.41327076554298403, + "epoch": 0.1612388250319285, + "grad_norm": 1.675029993057251, + "learning_rate": 4.997599746149186e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8690639019012452, + "num_tokens": 8273065.0, + "step": 1010 + }, + { + "entropy": 0.4097064435482025, + "epoch": 0.16203703703703703, + "grad_norm": 1.8893979787826538, + "learning_rate": 4.997575903905863e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8692134499549866, + "num_tokens": 8314025.0, + "step": 1015 + }, + { + "entropy": 0.3804477572441101, + "epoch": 0.16283524904214558, + "grad_norm": 1.4815566539764404, + "learning_rate": 4.997551943908536e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8803953766822815, + "num_tokens": 8354985.0, + "step": 1020 + }, + { + "entropy": 0.40303739309310915, + "epoch": 0.16363346104725415, + "grad_norm": 1.9521483182907104, + "learning_rate": 4.99752786615871e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8721148848533631, + "num_tokens": 8395945.0, + "step": 1025 + }, + { + "entropy": 0.4187192976474762, + "epoch": 0.1644316730523627, + "grad_norm": 1.6583740711212158, + "learning_rate": 4.9975036706579015e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8685552954673768, + "num_tokens": 8436905.0, + "step": 1030 + }, + { + "entropy": 0.4502474308013916, + "epoch": 0.16522988505747127, + "grad_norm": 1.5872939825057983, + "learning_rate": 4.997479357407631e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8583998322486878, + "num_tokens": 8477865.0, + "step": 1035 + }, + { + "entropy": 0.4257081806659698, + "epoch": 0.16602809706257982, + "grad_norm": 1.6918632984161377, + "learning_rate": 4.997454926409427e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8680700540542603, + "num_tokens": 8518825.0, + "step": 1040 + }, + { + "entropy": 0.42606639862060547, + "epoch": 0.1668263090676884, + "grad_norm": 1.7691048383712769, + "learning_rate": 4.997430377664826e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8670408368110657, + "num_tokens": 8559785.0, + "step": 1045 + }, + { + "entropy": 0.4012975811958313, + "epoch": 0.16762452107279693, + "grad_norm": 1.622340202331543, + "learning_rate": 4.997405711175373e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8724196314811706, + "num_tokens": 8600745.0, + "step": 1050 + }, + { + "entropy": 0.3835790574550629, + "epoch": 0.16842273307790548, + "grad_norm": 1.6021138429641724, + "learning_rate": 4.9973809269426175e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8784301280975342, + "num_tokens": 8641705.0, + "step": 1055 + }, + { + "entropy": 0.41893797516822817, + "epoch": 0.16922094508301405, + "grad_norm": 1.6803170442581177, + "learning_rate": 4.997356024968118e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8650649070739747, + "num_tokens": 8682665.0, + "step": 1060 + }, + { + "entropy": 0.4112949728965759, + "epoch": 0.1700191570881226, + "grad_norm": 1.7723191976547241, + "learning_rate": 4.997331005253442e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8707603693008423, + "num_tokens": 8723625.0, + "step": 1065 + }, + { + "entropy": 0.429644775390625, + "epoch": 0.17081736909323117, + "grad_norm": 1.5139392614364624, + "learning_rate": 4.9973058678001605e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8641191124916077, + "num_tokens": 8764585.0, + "step": 1070 + }, + { + "entropy": 0.4199754595756531, + "epoch": 0.17161558109833971, + "grad_norm": 1.7999293804168701, + "learning_rate": 4.997280612609857e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8690139651298523, + "num_tokens": 8805545.0, + "step": 1075 + }, + { + "entropy": 0.4308688461780548, + "epoch": 0.1724137931034483, + "grad_norm": 1.652504563331604, + "learning_rate": 4.9972552396841175e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8629522681236267, + "num_tokens": 8846505.0, + "step": 1080 + }, + { + "entropy": 0.4425890207290649, + "epoch": 0.17321200510855683, + "grad_norm": 1.6451890468597412, + "learning_rate": 4.997229749024538e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8623061180114746, + "num_tokens": 8887465.0, + "step": 1085 + }, + { + "entropy": 0.4118497908115387, + "epoch": 0.17401021711366538, + "grad_norm": 1.6473734378814697, + "learning_rate": 4.997204140632722e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8697147250175477, + "num_tokens": 8928425.0, + "step": 1090 + }, + { + "entropy": 0.4137786865234375, + "epoch": 0.17480842911877395, + "grad_norm": 1.679807186126709, + "learning_rate": 4.99717841451028e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8703425407409668, + "num_tokens": 8969385.0, + "step": 1095 + }, + { + "entropy": 0.3895216822624207, + "epoch": 0.1756066411238825, + "grad_norm": 1.7490582466125488, + "learning_rate": 4.997152570658829e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8749435305595398, + "num_tokens": 9010345.0, + "step": 1100 + }, + { + "entropy": 0.4318494439125061, + "epoch": 0.17640485312899107, + "grad_norm": 1.7533351182937622, + "learning_rate": 4.997126609079993e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8652554154396057, + "num_tokens": 9051305.0, + "step": 1105 + }, + { + "entropy": 0.374624902009964, + "epoch": 0.1772030651340996, + "grad_norm": 1.5839003324508667, + "learning_rate": 4.9971005297754075e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8784180879592896, + "num_tokens": 9092265.0, + "step": 1110 + }, + { + "entropy": 0.4146465241909027, + "epoch": 0.17800127713920819, + "grad_norm": 1.6019837856292725, + "learning_rate": 4.99707433274671e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8676789045333863, + "num_tokens": 9133225.0, + "step": 1115 + }, + { + "entropy": 0.4506120502948761, + "epoch": 0.17879948914431673, + "grad_norm": 1.6022826433181763, + "learning_rate": 4.99704801799555e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8616150975227356, + "num_tokens": 9174185.0, + "step": 1120 + }, + { + "entropy": 0.4405996799468994, + "epoch": 0.17959770114942528, + "grad_norm": 1.7077938318252563, + "learning_rate": 4.99702158552358e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8605331301689148, + "num_tokens": 9215145.0, + "step": 1125 + }, + { + "entropy": 0.45600707530975343, + "epoch": 0.18039591315453385, + "grad_norm": 1.7418618202209473, + "learning_rate": 4.9969950353324635e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8526764512062073, + "num_tokens": 9256105.0, + "step": 1130 + }, + { + "entropy": 0.4088563621044159, + "epoch": 0.1811941251596424, + "grad_norm": 1.6304500102996826, + "learning_rate": 4.9969683674238704e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8687464475631714, + "num_tokens": 9297065.0, + "step": 1135 + }, + { + "entropy": 0.4286839008331299, + "epoch": 0.18199233716475097, + "grad_norm": 1.7950910329818726, + "learning_rate": 4.996941581799476e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8645257353782654, + "num_tokens": 9338025.0, + "step": 1140 + }, + { + "entropy": 0.4073693871498108, + "epoch": 0.1827905491698595, + "grad_norm": 1.8495526313781738, + "learning_rate": 4.996914678460966e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8706475257873535, + "num_tokens": 9378985.0, + "step": 1145 + }, + { + "entropy": 0.37133595943450926, + "epoch": 0.18358876117496808, + "grad_norm": 1.507561206817627, + "learning_rate": 4.996887657410032e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8849801659584046, + "num_tokens": 9419945.0, + "step": 1150 + }, + { + "entropy": 0.3901697754859924, + "epoch": 0.18438697318007663, + "grad_norm": 1.7057150602340698, + "learning_rate": 4.996860518648373e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8752909541130066, + "num_tokens": 9460905.0, + "step": 1155 + }, + { + "entropy": 0.37988726496696473, + "epoch": 0.18518518518518517, + "grad_norm": 1.510456919670105, + "learning_rate": 4.9968332621776956e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8772407650947571, + "num_tokens": 9501865.0, + "step": 1160 + }, + { + "entropy": 0.38587768077850343, + "epoch": 0.18598339719029375, + "grad_norm": 1.4991780519485474, + "learning_rate": 4.996805887999713e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8768713116645813, + "num_tokens": 9542825.0, + "step": 1165 + }, + { + "entropy": 0.38253093957901, + "epoch": 0.1867816091954023, + "grad_norm": 1.5953854322433472, + "learning_rate": 4.996778396116149e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8802226543426513, + "num_tokens": 9583785.0, + "step": 1170 + }, + { + "entropy": 0.4161043405532837, + "epoch": 0.18757982120051087, + "grad_norm": 1.6561322212219238, + "learning_rate": 4.99675078652873e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8681931853294372, + "num_tokens": 9624745.0, + "step": 1175 + }, + { + "entropy": 0.4351208686828613, + "epoch": 0.1883780332056194, + "grad_norm": 1.580498456954956, + "learning_rate": 4.996723059239193e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8593961119651794, + "num_tokens": 9665705.0, + "step": 1180 + }, + { + "entropy": 0.4033635318279266, + "epoch": 0.18917624521072796, + "grad_norm": 1.4505336284637451, + "learning_rate": 4.9966952142492815e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8754417181015015, + "num_tokens": 9706665.0, + "step": 1185 + }, + { + "entropy": 0.43068079352378846, + "epoch": 0.18997445721583653, + "grad_norm": 1.7219266891479492, + "learning_rate": 4.996667251560747e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8612993121147156, + "num_tokens": 9747625.0, + "step": 1190 + }, + { + "entropy": 0.36189231276512146, + "epoch": 0.19077266922094507, + "grad_norm": 1.5952274799346924, + "learning_rate": 4.9966391711753465e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8871069312095642, + "num_tokens": 9788585.0, + "step": 1195 + }, + { + "entropy": 0.4119158685207367, + "epoch": 0.19157088122605365, + "grad_norm": 1.6258676052093506, + "learning_rate": 4.996610973094848e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8701352715492249, + "num_tokens": 9829545.0, + "step": 1200 + }, + { + "epoch": 0.19157088122605365, + "eval_entropy": 0.41716468119621275, + "eval_loss": 0.3879784643650055, + "eval_mean_token_accuracy": 0.8664626970291137, + "eval_num_tokens": 9829545.0, + "eval_runtime": 69.3572, + "eval_samples_per_second": 14.418, + "eval_steps_per_second": 1.802, + "step": 1200 + }, + { + "entropy": 0.38318485021591187, + "epoch": 0.1923690932311622, + "grad_norm": 1.6998229026794434, + "learning_rate": 4.996582657321022e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8767566442489624, + "num_tokens": 9870505.0, + "step": 1205 + }, + { + "entropy": 0.3972532510757446, + "epoch": 0.19316730523627076, + "grad_norm": 1.586173176765442, + "learning_rate": 4.996554223855652e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8730546355247497, + "num_tokens": 9911465.0, + "step": 1210 + }, + { + "entropy": 0.4261841118335724, + "epoch": 0.1939655172413793, + "grad_norm": 2.009549140930176, + "learning_rate": 4.996525672700523e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8656880736351014, + "num_tokens": 9952425.0, + "step": 1215 + }, + { + "entropy": 0.39083985686302186, + "epoch": 0.19476372924648785, + "grad_norm": 1.5788795948028564, + "learning_rate": 4.9964970038574326e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.876624321937561, + "num_tokens": 9993385.0, + "step": 1220 + }, + { + "entropy": 0.41900729537010195, + "epoch": 0.19556194125159643, + "grad_norm": 1.7903603315353394, + "learning_rate": 4.996468217328183e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8655580163002015, + "num_tokens": 10034345.0, + "step": 1225 + }, + { + "entropy": 0.4208202064037323, + "epoch": 0.19636015325670497, + "grad_norm": 1.9505068063735962, + "learning_rate": 4.996439313114584e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8678246140480042, + "num_tokens": 10075305.0, + "step": 1230 + }, + { + "entropy": 0.3945443332195282, + "epoch": 0.19715836526181355, + "grad_norm": 3.14209246635437, + "learning_rate": 4.9964102912184535e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8775973558425904, + "num_tokens": 10116265.0, + "step": 1235 + }, + { + "entropy": 0.42588833570480344, + "epoch": 0.1979565772669221, + "grad_norm": 1.7747955322265625, + "learning_rate": 4.9963811516416165e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8639430999755859, + "num_tokens": 10157225.0, + "step": 1240 + }, + { + "entropy": 0.41568330526351926, + "epoch": 0.19875478927203066, + "grad_norm": 1.7321171760559082, + "learning_rate": 4.996351894385906e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.865398371219635, + "num_tokens": 10198185.0, + "step": 1245 + }, + { + "entropy": 0.3887935698032379, + "epoch": 0.1995530012771392, + "grad_norm": 1.696902871131897, + "learning_rate": 4.99632251945316e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8796878099441529, + "num_tokens": 10239145.0, + "step": 1250 + }, + { + "entropy": 0.45456741452217103, + "epoch": 0.20035121328224775, + "grad_norm": 1.7765012979507446, + "learning_rate": 4.996293026845228e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8554024338722229, + "num_tokens": 10280105.0, + "step": 1255 + }, + { + "entropy": 0.4271567165851593, + "epoch": 0.20114942528735633, + "grad_norm": 1.7857844829559326, + "learning_rate": 4.996263416563963e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8687859892845153, + "num_tokens": 10321065.0, + "step": 1260 + }, + { + "entropy": 0.41049537658691404, + "epoch": 0.20194763729246487, + "grad_norm": 1.8603590726852417, + "learning_rate": 4.996233688611227e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8668972373008728, + "num_tokens": 10362025.0, + "step": 1265 + }, + { + "entropy": 0.4028216958045959, + "epoch": 0.20274584929757344, + "grad_norm": 1.6554845571517944, + "learning_rate": 4.996203842988891e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8734982132911682, + "num_tokens": 10402985.0, + "step": 1270 + }, + { + "entropy": 0.3871129274368286, + "epoch": 0.203544061302682, + "grad_norm": 1.6808788776397705, + "learning_rate": 4.99617387969883e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8815069556236267, + "num_tokens": 10443945.0, + "step": 1275 + }, + { + "entropy": 0.40030001401901244, + "epoch": 0.20434227330779056, + "grad_norm": 1.705714225769043, + "learning_rate": 4.9961437987429285e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8696768760681153, + "num_tokens": 10484905.0, + "step": 1280 + }, + { + "entropy": 0.4262424409389496, + "epoch": 0.2051404853128991, + "grad_norm": 1.873246192932129, + "learning_rate": 4.996113600123079e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8671039581298828, + "num_tokens": 10525865.0, + "step": 1285 + }, + { + "entropy": 0.40372244119644163, + "epoch": 0.20593869731800765, + "grad_norm": 1.584495186805725, + "learning_rate": 4.996083283841179e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8726119875907898, + "num_tokens": 10566825.0, + "step": 1290 + }, + { + "entropy": 0.37636016607284545, + "epoch": 0.20673690932311622, + "grad_norm": 1.3686262369155884, + "learning_rate": 4.996052849899136e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8790806293487549, + "num_tokens": 10607785.0, + "step": 1295 + }, + { + "entropy": 0.39137051701545716, + "epoch": 0.20753512132822477, + "grad_norm": 1.361937403678894, + "learning_rate": 4.996022298298866e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8768706321716309, + "num_tokens": 10648745.0, + "step": 1300 + }, + { + "entropy": 0.4153124690055847, + "epoch": 0.20833333333333334, + "grad_norm": 1.7018482685089111, + "learning_rate": 4.995991629042286e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8716052293777465, + "num_tokens": 10689705.0, + "step": 1305 + }, + { + "entropy": 0.43957144021987915, + "epoch": 0.2091315453384419, + "grad_norm": 1.761306643486023, + "learning_rate": 4.995960842131326e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8598607301712036, + "num_tokens": 10730665.0, + "step": 1310 + }, + { + "entropy": 0.42865965962409974, + "epoch": 0.20992975734355046, + "grad_norm": 1.6357239484786987, + "learning_rate": 4.995929937567922e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.861563766002655, + "num_tokens": 10771625.0, + "step": 1315 + }, + { + "entropy": 0.38937110304832456, + "epoch": 0.210727969348659, + "grad_norm": 1.6105756759643555, + "learning_rate": 4.9958989153540186e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8771203875541687, + "num_tokens": 10812585.0, + "step": 1320 + }, + { + "entropy": 0.46120203137397764, + "epoch": 0.21152618135376755, + "grad_norm": 1.6245516538619995, + "learning_rate": 4.995867775491567e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8491848587989808, + "num_tokens": 10853545.0, + "step": 1325 + }, + { + "entropy": 0.3691314458847046, + "epoch": 0.21232439335887612, + "grad_norm": 1.4518979787826538, + "learning_rate": 4.995836517982522e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8822991251945496, + "num_tokens": 10894505.0, + "step": 1330 + }, + { + "entropy": 0.40480037331581115, + "epoch": 0.21312260536398467, + "grad_norm": 1.7446019649505615, + "learning_rate": 4.995805142828852e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8746320247650147, + "num_tokens": 10935465.0, + "step": 1335 + }, + { + "entropy": 0.4062842130661011, + "epoch": 0.21392081736909324, + "grad_norm": 1.6260181665420532, + "learning_rate": 4.99577365003253e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8707757234573364, + "num_tokens": 10976425.0, + "step": 1340 + }, + { + "entropy": 0.46745346784591674, + "epoch": 0.2147190293742018, + "grad_norm": 1.8439220190048218, + "learning_rate": 4.9957420395955345e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8548336029052734, + "num_tokens": 11017385.0, + "step": 1345 + }, + { + "entropy": 0.44144474864006045, + "epoch": 0.21551724137931033, + "grad_norm": 1.6311644315719604, + "learning_rate": 4.9957103115198556e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8604014754295349, + "num_tokens": 11058345.0, + "step": 1350 + }, + { + "entropy": 0.40855112075805666, + "epoch": 0.2163154533844189, + "grad_norm": 1.7732211351394653, + "learning_rate": 4.995678465807486e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8709009766578675, + "num_tokens": 11099305.0, + "step": 1355 + }, + { + "entropy": 0.397980147600174, + "epoch": 0.21711366538952745, + "grad_norm": 1.7110074758529663, + "learning_rate": 4.995646502460431e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8722142934799194, + "num_tokens": 11139855.0, + "step": 1360 + }, + { + "entropy": 0.41088297963142395, + "epoch": 0.21791187739463602, + "grad_norm": 1.6622663736343384, + "learning_rate": 4.995614421480699e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8695937275886536, + "num_tokens": 11180815.0, + "step": 1365 + }, + { + "entropy": 0.4248778760433197, + "epoch": 0.21871008939974457, + "grad_norm": 1.6985255479812622, + "learning_rate": 4.995582222870306e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8654048562049865, + "num_tokens": 11221775.0, + "step": 1370 + }, + { + "entropy": 0.39926210045814514, + "epoch": 0.21950830140485314, + "grad_norm": 1.6084798574447632, + "learning_rate": 4.9955499066312795e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8749465584754944, + "num_tokens": 11262735.0, + "step": 1375 + }, + { + "entropy": 0.40205529928207395, + "epoch": 0.22030651340996169, + "grad_norm": 1.5554734468460083, + "learning_rate": 4.995517472765651e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8706400275230408, + "num_tokens": 11303695.0, + "step": 1380 + }, + { + "entropy": 0.4029591023921967, + "epoch": 0.22110472541507023, + "grad_norm": 1.5115835666656494, + "learning_rate": 4.995484921275457e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8746947407722473, + "num_tokens": 11344655.0, + "step": 1385 + }, + { + "entropy": 0.39063696265220643, + "epoch": 0.2219029374201788, + "grad_norm": 1.6652828454971313, + "learning_rate": 4.995452252162749e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8736486792564392, + "num_tokens": 11385615.0, + "step": 1390 + }, + { + "entropy": 0.3742422580718994, + "epoch": 0.22270114942528735, + "grad_norm": 1.5964741706848145, + "learning_rate": 4.99541946542958e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.881929349899292, + "num_tokens": 11426575.0, + "step": 1395 + }, + { + "entropy": 0.4363535761833191, + "epoch": 0.22349936143039592, + "grad_norm": 1.8830647468566895, + "learning_rate": 4.9953865610780095e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8590501070022583, + "num_tokens": 11467535.0, + "step": 1400 + }, + { + "epoch": 0.22349936143039592, + "eval_entropy": 0.41811490869522094, + "eval_loss": 0.3820214867591858, + "eval_mean_token_accuracy": 0.8677202377319336, + "eval_num_tokens": 11467535.0, + "eval_runtime": 69.4634, + "eval_samples_per_second": 14.396, + "eval_steps_per_second": 1.8, + "step": 1400 + }, + { + "entropy": 0.4278635323047638, + "epoch": 0.22429757343550447, + "grad_norm": 1.6763865947723389, + "learning_rate": 4.995353539110108e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8624410390853882, + "num_tokens": 11508495.0, + "step": 1405 + }, + { + "entropy": 0.4178195416927338, + "epoch": 0.22509578544061304, + "grad_norm": 1.5268527269363403, + "learning_rate": 4.9953203995279525e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8675993204116821, + "num_tokens": 11549455.0, + "step": 1410 + }, + { + "entropy": 0.3592303812503815, + "epoch": 0.22589399744572158, + "grad_norm": 1.5988072156906128, + "learning_rate": 4.995287142333627e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8882538080215454, + "num_tokens": 11590415.0, + "step": 1415 + }, + { + "entropy": 0.39708020687103274, + "epoch": 0.22669220945083013, + "grad_norm": 1.5527164936065674, + "learning_rate": 4.995253767529222e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8767055392265319, + "num_tokens": 11631375.0, + "step": 1420 + }, + { + "entropy": 0.43148024678230285, + "epoch": 0.2274904214559387, + "grad_norm": 1.753352403640747, + "learning_rate": 4.995220275116836e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8635330319404602, + "num_tokens": 11672335.0, + "step": 1425 + }, + { + "entropy": 0.3746448040008545, + "epoch": 0.22828863346104725, + "grad_norm": 1.6090322732925415, + "learning_rate": 4.995186665098577e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8834616661071777, + "num_tokens": 11713295.0, + "step": 1430 + }, + { + "entropy": 0.41809865832328796, + "epoch": 0.22908684546615582, + "grad_norm": 1.6184989213943481, + "learning_rate": 4.9951529374765576e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8649685978889465, + "num_tokens": 11754255.0, + "step": 1435 + }, + { + "entropy": 0.41490686535835264, + "epoch": 0.22988505747126436, + "grad_norm": 1.7772951126098633, + "learning_rate": 4.995119092252898e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8693205952644348, + "num_tokens": 11795215.0, + "step": 1440 + }, + { + "entropy": 0.37119303941726683, + "epoch": 0.23068326947637294, + "grad_norm": 1.6093522310256958, + "learning_rate": 4.9950851294297265e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.884914219379425, + "num_tokens": 11836175.0, + "step": 1445 + }, + { + "entropy": 0.411442232131958, + "epoch": 0.23148148148148148, + "grad_norm": 1.5621263980865479, + "learning_rate": 4.99505104900918e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8707284569740296, + "num_tokens": 11877135.0, + "step": 1450 + }, + { + "entropy": 0.43793959021568296, + "epoch": 0.23227969348659003, + "grad_norm": 1.8224726915359497, + "learning_rate": 4.9950168509934e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8595661878585815, + "num_tokens": 11918095.0, + "step": 1455 + }, + { + "entropy": 0.4227015495300293, + "epoch": 0.2330779054916986, + "grad_norm": 1.6003056764602661, + "learning_rate": 4.994982535384538e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8620925664901733, + "num_tokens": 11959055.0, + "step": 1460 + }, + { + "entropy": 0.4124914050102234, + "epoch": 0.23387611749680715, + "grad_norm": 1.7063300609588623, + "learning_rate": 4.994948102184751e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8710904479026794, + "num_tokens": 12000015.0, + "step": 1465 + }, + { + "entropy": 0.40440365076065066, + "epoch": 0.23467432950191572, + "grad_norm": 1.4815607070922852, + "learning_rate": 4.994913551396206e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8743473768234253, + "num_tokens": 12040975.0, + "step": 1470 + }, + { + "entropy": 0.44553318023681643, + "epoch": 0.23547254150702426, + "grad_norm": 1.8663265705108643, + "learning_rate": 4.994878883021073e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8563253045082092, + "num_tokens": 12081935.0, + "step": 1475 + }, + { + "entropy": 0.4078981041908264, + "epoch": 0.23627075351213284, + "grad_norm": 1.9143680334091187, + "learning_rate": 4.994844097061536e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8678545594215393, + "num_tokens": 12122895.0, + "step": 1480 + }, + { + "entropy": 0.41621212363243104, + "epoch": 0.23706896551724138, + "grad_norm": 1.7652965784072876, + "learning_rate": 4.99480919351978e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8677355766296386, + "num_tokens": 12163855.0, + "step": 1485 + }, + { + "entropy": 0.411162918806076, + "epoch": 0.23786717752234993, + "grad_norm": 1.578628659248352, + "learning_rate": 4.994774172397998e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8704973578453064, + "num_tokens": 12204815.0, + "step": 1490 + }, + { + "entropy": 0.4328682243824005, + "epoch": 0.2386653895274585, + "grad_norm": 1.5921419858932495, + "learning_rate": 4.994739033698395e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.867378830909729, + "num_tokens": 12245775.0, + "step": 1495 + }, + { + "entropy": 0.4088943064212799, + "epoch": 0.23946360153256704, + "grad_norm": 1.5701849460601807, + "learning_rate": 4.994703777423181e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8706322550773621, + "num_tokens": 12286735.0, + "step": 1500 + }, + { + "entropy": 0.37183575630187987, + "epoch": 0.24026181353767562, + "grad_norm": 1.63997220993042, + "learning_rate": 4.994668403574571e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.880238938331604, + "num_tokens": 12327695.0, + "step": 1505 + }, + { + "entropy": 0.3934726595878601, + "epoch": 0.24106002554278416, + "grad_norm": 1.68795645236969, + "learning_rate": 4.9946329121547906e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8705344796180725, + "num_tokens": 12368655.0, + "step": 1510 + }, + { + "entropy": 0.4208143472671509, + "epoch": 0.2418582375478927, + "grad_norm": 1.404971957206726, + "learning_rate": 4.9945973031660715e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8673396229743957, + "num_tokens": 12409615.0, + "step": 1515 + }, + { + "entropy": 0.3697489082813263, + "epoch": 0.24265644955300128, + "grad_norm": 1.5042173862457275, + "learning_rate": 4.994561576610652e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8833913207054138, + "num_tokens": 12450575.0, + "step": 1520 + }, + { + "entropy": 0.3957326591014862, + "epoch": 0.24345466155810983, + "grad_norm": 1.3875174522399902, + "learning_rate": 4.99452573249078e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8749968886375428, + "num_tokens": 12491535.0, + "step": 1525 + }, + { + "entropy": 0.436593234539032, + "epoch": 0.2442528735632184, + "grad_norm": 1.6070129871368408, + "learning_rate": 4.994489770808709e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8614607095718384, + "num_tokens": 12532495.0, + "step": 1530 + }, + { + "entropy": 0.40942646861076354, + "epoch": 0.24505108556832694, + "grad_norm": 1.7601579427719116, + "learning_rate": 4.994453691566701e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8729120850563049, + "num_tokens": 12573455.0, + "step": 1535 + }, + { + "entropy": 0.4324551522731781, + "epoch": 0.24584929757343552, + "grad_norm": 1.7088350057601929, + "learning_rate": 4.994417494767024e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8620304942131043, + "num_tokens": 12614415.0, + "step": 1540 + }, + { + "entropy": 0.4621385753154755, + "epoch": 0.24664750957854406, + "grad_norm": 1.619863510131836, + "learning_rate": 4.9943811804119535e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8502998352050781, + "num_tokens": 12655375.0, + "step": 1545 + }, + { + "entropy": 0.38465359807014465, + "epoch": 0.2474457215836526, + "grad_norm": 1.7016675472259521, + "learning_rate": 4.9943447485037744e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8797366380691528, + "num_tokens": 12696335.0, + "step": 1550 + }, + { + "entropy": 0.41603352427482604, + "epoch": 0.24824393358876118, + "grad_norm": 1.6022270917892456, + "learning_rate": 4.994308199044777e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.866445803642273, + "num_tokens": 12737295.0, + "step": 1555 + }, + { + "entropy": 0.4290209412574768, + "epoch": 0.24904214559386972, + "grad_norm": 1.666760802268982, + "learning_rate": 4.99427153203726e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8672929883003235, + "num_tokens": 12778255.0, + "step": 1560 + }, + { + "entropy": 0.40751102566719055, + "epoch": 0.2498403575989783, + "grad_norm": 1.7844884395599365, + "learning_rate": 4.99423474748353e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8718531012535096, + "num_tokens": 12819215.0, + "step": 1565 + }, + { + "entropy": 0.41257060766220094, + "epoch": 0.25063856960408687, + "grad_norm": 1.6598808765411377, + "learning_rate": 4.994197845385897e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8714763522148132, + "num_tokens": 12860175.0, + "step": 1570 + }, + { + "entropy": 0.4657106280326843, + "epoch": 0.2514367816091954, + "grad_norm": 1.7844305038452148, + "learning_rate": 4.994160825746686e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8530724525451661, + "num_tokens": 12901135.0, + "step": 1575 + }, + { + "entropy": 0.42342046499252317, + "epoch": 0.25223499361430396, + "grad_norm": 1.8664227724075317, + "learning_rate": 4.994123688568222e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8675359487533569, + "num_tokens": 12942095.0, + "step": 1580 + }, + { + "entropy": 0.40034735202789307, + "epoch": 0.2530332056194125, + "grad_norm": 1.5805197954177856, + "learning_rate": 4.994086433852841e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8732864022254944, + "num_tokens": 12983055.0, + "step": 1585 + }, + { + "entropy": 0.44049054980278013, + "epoch": 0.25383141762452105, + "grad_norm": 1.6537421941757202, + "learning_rate": 4.994049061602883e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8610108494758606, + "num_tokens": 13024015.0, + "step": 1590 + }, + { + "entropy": 0.39807049036026, + "epoch": 0.25462962962962965, + "grad_norm": 1.674716591835022, + "learning_rate": 4.9940115718207035e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.875378680229187, + "num_tokens": 13064975.0, + "step": 1595 + }, + { + "entropy": 0.4111276030540466, + "epoch": 0.2554278416347382, + "grad_norm": 1.793379545211792, + "learning_rate": 4.993973964508657e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8707456111907959, + "num_tokens": 13105935.0, + "step": 1600 + }, + { + "epoch": 0.2554278416347382, + "eval_entropy": 0.4195484278202057, + "eval_loss": 0.37786614894866943, + "eval_mean_token_accuracy": 0.8689507064819336, + "eval_num_tokens": 13105935.0, + "eval_runtime": 69.3072, + "eval_samples_per_second": 14.429, + "eval_steps_per_second": 1.804, + "step": 1600 + }, + { + "entropy": 0.40499958395957947, + "epoch": 0.25622605363984674, + "grad_norm": 1.7946947813034058, + "learning_rate": 4.993936239669108e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8722654223442078, + "num_tokens": 13146895.0, + "step": 1605 + }, + { + "entropy": 0.43797704577445984, + "epoch": 0.2570242656449553, + "grad_norm": 1.5900914669036865, + "learning_rate": 4.993898397304429e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8610888838768005, + "num_tokens": 13187855.0, + "step": 1610 + }, + { + "entropy": 0.4190030813217163, + "epoch": 0.25782247765006383, + "grad_norm": 1.7633140087127686, + "learning_rate": 4.993860437417e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.863499665260315, + "num_tokens": 13228729.0, + "step": 1615 + }, + { + "entropy": 0.4361671805381775, + "epoch": 0.25862068965517243, + "grad_norm": 2.0313098430633545, + "learning_rate": 4.993822360009209e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8606663584709168, + "num_tokens": 13269689.0, + "step": 1620 + }, + { + "entropy": 0.43784735798835756, + "epoch": 0.259418901660281, + "grad_norm": 1.6960283517837524, + "learning_rate": 4.993784165083449e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.863409411907196, + "num_tokens": 13310649.0, + "step": 1625 + }, + { + "entropy": 0.3651863753795624, + "epoch": 0.2602171136653895, + "grad_norm": 1.549738883972168, + "learning_rate": 4.993745852642122e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8872558116912842, + "num_tokens": 13351609.0, + "step": 1630 + }, + { + "entropy": 0.43080597519874575, + "epoch": 0.26101532567049807, + "grad_norm": 1.7376998662948608, + "learning_rate": 4.9937074226876385e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8636496901512146, + "num_tokens": 13392569.0, + "step": 1635 + }, + { + "entropy": 0.36666577458381655, + "epoch": 0.26181353767560667, + "grad_norm": 1.6298332214355469, + "learning_rate": 4.993668875222413e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.884006929397583, + "num_tokens": 13433529.0, + "step": 1640 + }, + { + "entropy": 0.3666319906711578, + "epoch": 0.2626117496807152, + "grad_norm": 1.4979429244995117, + "learning_rate": 4.993630210248872e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.881742000579834, + "num_tokens": 13474489.0, + "step": 1645 + }, + { + "entropy": 0.3915225386619568, + "epoch": 0.26340996168582376, + "grad_norm": 1.614754319190979, + "learning_rate": 4.993591427769444e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8764796495437622, + "num_tokens": 13515449.0, + "step": 1650 + }, + { + "entropy": 0.41066797971725466, + "epoch": 0.2642081736909323, + "grad_norm": 1.5959410667419434, + "learning_rate": 4.99355252778657e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8697964787483216, + "num_tokens": 13556409.0, + "step": 1655 + }, + { + "entropy": 0.4193182408809662, + "epoch": 0.26500638569604085, + "grad_norm": 1.795488715171814, + "learning_rate": 4.9935135103026955e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8639777302742004, + "num_tokens": 13597369.0, + "step": 1660 + }, + { + "entropy": 0.4457306921482086, + "epoch": 0.26580459770114945, + "grad_norm": 1.7554657459259033, + "learning_rate": 4.993474375320274e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8610527634620666, + "num_tokens": 13638329.0, + "step": 1665 + }, + { + "entropy": 0.4340463042259216, + "epoch": 0.266602809706258, + "grad_norm": 1.7473564147949219, + "learning_rate": 4.993435122841766e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8637238264083862, + "num_tokens": 13679289.0, + "step": 1670 + }, + { + "entropy": 0.41148359775543214, + "epoch": 0.26740102171136654, + "grad_norm": 1.6237411499023438, + "learning_rate": 4.9933957528696404e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8702250599861145, + "num_tokens": 13720249.0, + "step": 1675 + }, + { + "entropy": 0.37870003581047057, + "epoch": 0.2681992337164751, + "grad_norm": 1.5291156768798828, + "learning_rate": 4.993356265406373e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.882771122455597, + "num_tokens": 13761209.0, + "step": 1680 + }, + { + "entropy": 0.4399440348148346, + "epoch": 0.26899744572158363, + "grad_norm": 1.4831913709640503, + "learning_rate": 4.993316660454447e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.860145092010498, + "num_tokens": 13802169.0, + "step": 1685 + }, + { + "entropy": 0.4290666043758392, + "epoch": 0.26979565772669223, + "grad_norm": 1.7227705717086792, + "learning_rate": 4.993276938016352e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8653980612754821, + "num_tokens": 13843129.0, + "step": 1690 + }, + { + "entropy": 0.4140145003795624, + "epoch": 0.2705938697318008, + "grad_norm": 1.556611180305481, + "learning_rate": 4.993237098094587e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8696804285049439, + "num_tokens": 13884089.0, + "step": 1695 + }, + { + "entropy": 0.4349702954292297, + "epoch": 0.2713920817369093, + "grad_norm": 1.7088748216629028, + "learning_rate": 4.993197140691657e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8617449760437011, + "num_tokens": 13925049.0, + "step": 1700 + }, + { + "entropy": 0.37474197149276733, + "epoch": 0.27219029374201786, + "grad_norm": 1.496077060699463, + "learning_rate": 4.993157065810074e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8825637340545655, + "num_tokens": 13966009.0, + "step": 1705 + }, + { + "entropy": 0.38251233100891113, + "epoch": 0.27298850574712646, + "grad_norm": 1.817301630973816, + "learning_rate": 4.993116873452358e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.875286865234375, + "num_tokens": 14006969.0, + "step": 1710 + }, + { + "entropy": 0.4126917839050293, + "epoch": 0.273786717752235, + "grad_norm": 1.7066811323165894, + "learning_rate": 4.993076563621037e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.869832706451416, + "num_tokens": 14047929.0, + "step": 1715 + }, + { + "entropy": 0.4246919810771942, + "epoch": 0.27458492975734355, + "grad_norm": 1.6397778987884521, + "learning_rate": 4.993036136318646e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.865372383594513, + "num_tokens": 14088889.0, + "step": 1720 + }, + { + "entropy": 0.4292417407035828, + "epoch": 0.2753831417624521, + "grad_norm": 1.785861611366272, + "learning_rate": 4.992995591547727e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8645003080368042, + "num_tokens": 14129849.0, + "step": 1725 + }, + { + "entropy": 0.42507190704345704, + "epoch": 0.27618135376756064, + "grad_norm": 1.6692979335784912, + "learning_rate": 4.99295492931083e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8670867443084717, + "num_tokens": 14170809.0, + "step": 1730 + }, + { + "entropy": 0.39853169322013854, + "epoch": 0.27697956577266925, + "grad_norm": 1.610140085220337, + "learning_rate": 4.992914149610511e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8755905747413635, + "num_tokens": 14211769.0, + "step": 1735 + }, + { + "entropy": 0.37006590366363523, + "epoch": 0.2777777777777778, + "grad_norm": 1.4653048515319824, + "learning_rate": 4.992873252449335e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8839511036872864, + "num_tokens": 14252729.0, + "step": 1740 + }, + { + "entropy": 0.40155852437019346, + "epoch": 0.27857598978288634, + "grad_norm": 1.59873366355896, + "learning_rate": 4.9928322378298736e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8729602098464966, + "num_tokens": 14293689.0, + "step": 1745 + }, + { + "entropy": 0.38730211853981017, + "epoch": 0.2793742017879949, + "grad_norm": 1.5279861688613892, + "learning_rate": 4.9927911057547065e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8754580855369568, + "num_tokens": 14334649.0, + "step": 1750 + }, + { + "entropy": 0.4186099946498871, + "epoch": 0.2801724137931034, + "grad_norm": 1.773861050605774, + "learning_rate": 4.992749856226419e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8691788077354431, + "num_tokens": 14375609.0, + "step": 1755 + }, + { + "entropy": 0.41721214056015016, + "epoch": 0.280970625798212, + "grad_norm": 1.6732553243637085, + "learning_rate": 4.992708489247606e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8656150698661804, + "num_tokens": 14416569.0, + "step": 1760 + }, + { + "entropy": 0.40146389603614807, + "epoch": 0.28176883780332057, + "grad_norm": 1.6273856163024902, + "learning_rate": 4.992667004820868e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8757187008857727, + "num_tokens": 14457529.0, + "step": 1765 + }, + { + "entropy": 0.3743233919143677, + "epoch": 0.2825670498084291, + "grad_norm": 1.336595058441162, + "learning_rate": 4.992625402948815e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8810940265655518, + "num_tokens": 14498489.0, + "step": 1770 + }, + { + "entropy": 0.4039636254310608, + "epoch": 0.28336526181353766, + "grad_norm": 1.54541015625, + "learning_rate": 4.992583683634061e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8718450546264649, + "num_tokens": 14539449.0, + "step": 1775 + }, + { + "entropy": 0.43368394374847413, + "epoch": 0.2841634738186462, + "grad_norm": 1.88645601272583, + "learning_rate": 4.992541846879232e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8583693742752075, + "num_tokens": 14580409.0, + "step": 1780 + }, + { + "entropy": 0.36426802277565, + "epoch": 0.2849616858237548, + "grad_norm": 1.3493199348449707, + "learning_rate": 4.992499892686957e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.886569344997406, + "num_tokens": 14621369.0, + "step": 1785 + }, + { + "entropy": 0.39198213815689087, + "epoch": 0.28575989782886335, + "grad_norm": 1.6369388103485107, + "learning_rate": 4.992457821059875e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8761023283004761, + "num_tokens": 14662329.0, + "step": 1790 + }, + { + "entropy": 0.38651488423347474, + "epoch": 0.2865581098339719, + "grad_norm": 1.5533897876739502, + "learning_rate": 4.992415632000631e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8807941317558289, + "num_tokens": 14703289.0, + "step": 1795 + }, + { + "entropy": 0.3910827159881592, + "epoch": 0.28735632183908044, + "grad_norm": 1.7036851644515991, + "learning_rate": 4.992373325511878e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8766248464584351, + "num_tokens": 14744249.0, + "step": 1800 + }, + { + "epoch": 0.28735632183908044, + "eval_entropy": 0.4121441757678986, + "eval_loss": 0.3748484253883362, + "eval_mean_token_accuracy": 0.8696171550750732, + "eval_num_tokens": 14744249.0, + "eval_runtime": 69.3324, + "eval_samples_per_second": 14.423, + "eval_steps_per_second": 1.803, + "step": 1800 + }, + { + "entropy": 0.4093415796756744, + "epoch": 0.28815453384418904, + "grad_norm": 1.698420524597168, + "learning_rate": 4.992330901596277e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8717585563659668, + "num_tokens": 14785209.0, + "step": 1805 + }, + { + "entropy": 0.3527294874191284, + "epoch": 0.2889527458492976, + "grad_norm": 1.5491911172866821, + "learning_rate": 4.9922883602564966e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8896806955337524, + "num_tokens": 14826169.0, + "step": 1810 + }, + { + "entropy": 0.42450478076934817, + "epoch": 0.28975095785440613, + "grad_norm": 1.58077073097229, + "learning_rate": 4.99224570149521e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8667027711868286, + "num_tokens": 14867129.0, + "step": 1815 + }, + { + "entropy": 0.43766342401504515, + "epoch": 0.2905491698595147, + "grad_norm": 1.547052025794983, + "learning_rate": 4.9922029253151e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8626722812652587, + "num_tokens": 14908089.0, + "step": 1820 + }, + { + "entropy": 0.4361523449420929, + "epoch": 0.2913473818646232, + "grad_norm": 1.8358014822006226, + "learning_rate": 4.992160031718859e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8596999645233154, + "num_tokens": 14949049.0, + "step": 1825 + }, + { + "entropy": 0.407147616147995, + "epoch": 0.2921455938697318, + "grad_norm": 1.7107361555099487, + "learning_rate": 4.99211702070918e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8710406422615051, + "num_tokens": 14990009.0, + "step": 1830 + }, + { + "entropy": 0.40477213859558103, + "epoch": 0.29294380587484037, + "grad_norm": 1.570025086402893, + "learning_rate": 4.992073892288772e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8652504324913025, + "num_tokens": 15030969.0, + "step": 1835 + }, + { + "entropy": 0.421870356798172, + "epoch": 0.2937420178799489, + "grad_norm": 1.607050895690918, + "learning_rate": 4.992030646460344e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8673770546913147, + "num_tokens": 15071929.0, + "step": 1840 + }, + { + "entropy": 0.41957166194915774, + "epoch": 0.29454022988505746, + "grad_norm": 1.6813313961029053, + "learning_rate": 4.991987283226617e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.866571569442749, + "num_tokens": 15112889.0, + "step": 1845 + }, + { + "entropy": 0.3803585350513458, + "epoch": 0.295338441890166, + "grad_norm": 1.6727192401885986, + "learning_rate": 4.9919438025903175e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8757133960723877, + "num_tokens": 15153849.0, + "step": 1850 + }, + { + "entropy": 0.41606191992759706, + "epoch": 0.2961366538952746, + "grad_norm": 1.6527150869369507, + "learning_rate": 4.99190020455418e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8708307147026062, + "num_tokens": 15194809.0, + "step": 1855 + }, + { + "entropy": 0.42957100868225095, + "epoch": 0.29693486590038315, + "grad_norm": 1.4922149181365967, + "learning_rate": 4.991856489120946e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8649673223495483, + "num_tokens": 15235769.0, + "step": 1860 + }, + { + "entropy": 0.4057033896446228, + "epoch": 0.2977330779054917, + "grad_norm": 1.660406231880188, + "learning_rate": 4.991812656293363e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8736201524734497, + "num_tokens": 15276729.0, + "step": 1865 + }, + { + "entropy": 0.40441020131111144, + "epoch": 0.29853128991060024, + "grad_norm": 1.577506184577942, + "learning_rate": 4.991768706074188e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8724161505699157, + "num_tokens": 15317689.0, + "step": 1870 + }, + { + "entropy": 0.373467230796814, + "epoch": 0.29932950191570884, + "grad_norm": 1.6950846910476685, + "learning_rate": 4.991724638466186e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8780014872550964, + "num_tokens": 15358649.0, + "step": 1875 + }, + { + "entropy": 0.3957930028438568, + "epoch": 0.3001277139208174, + "grad_norm": 1.5578737258911133, + "learning_rate": 4.991680453472129e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8749694466590882, + "num_tokens": 15399609.0, + "step": 1880 + }, + { + "entropy": 0.38373724818229676, + "epoch": 0.30092592592592593, + "grad_norm": 1.620660662651062, + "learning_rate": 4.991636151094792e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.878369402885437, + "num_tokens": 15440569.0, + "step": 1885 + }, + { + "entropy": 0.38547171354293824, + "epoch": 0.3017241379310345, + "grad_norm": 1.5540834665298462, + "learning_rate": 4.991591731336964e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8797260999679566, + "num_tokens": 15481529.0, + "step": 1890 + }, + { + "entropy": 0.39545257687568663, + "epoch": 0.302522349936143, + "grad_norm": 1.7817574739456177, + "learning_rate": 4.991547194201436e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8727735161781311, + "num_tokens": 15522489.0, + "step": 1895 + }, + { + "entropy": 0.41355872750282285, + "epoch": 0.3033205619412516, + "grad_norm": 1.624796748161316, + "learning_rate": 4.991502539691011e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8697443842887879, + "num_tokens": 15563449.0, + "step": 1900 + }, + { + "entropy": 0.4103052496910095, + "epoch": 0.30411877394636017, + "grad_norm": 1.7812132835388184, + "learning_rate": 4.991457767808494e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8709424614906311, + "num_tokens": 15604409.0, + "step": 1905 + }, + { + "entropy": 0.41177705526351926, + "epoch": 0.3049169859514687, + "grad_norm": 1.5392616987228394, + "learning_rate": 4.991412878556704e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8675289511680603, + "num_tokens": 15645369.0, + "step": 1910 + }, + { + "entropy": 0.4046268105506897, + "epoch": 0.30571519795657726, + "grad_norm": 1.865882396697998, + "learning_rate": 4.99136787193846e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8721927642822266, + "num_tokens": 15686329.0, + "step": 1915 + }, + { + "entropy": 0.46906509399414065, + "epoch": 0.3065134099616858, + "grad_norm": 1.639910340309143, + "learning_rate": 4.991322747956596e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8528730273246765, + "num_tokens": 15727289.0, + "step": 1920 + }, + { + "entropy": 0.38653295040130614, + "epoch": 0.3073116219667944, + "grad_norm": 1.516993761062622, + "learning_rate": 4.991277506613948e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8772754549980164, + "num_tokens": 15768249.0, + "step": 1925 + }, + { + "entropy": 0.42357757687568665, + "epoch": 0.30810983397190295, + "grad_norm": 1.6272945404052734, + "learning_rate": 4.991232147913359e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8665747880935669, + "num_tokens": 15809209.0, + "step": 1930 + }, + { + "entropy": 0.42249606251716615, + "epoch": 0.3089080459770115, + "grad_norm": 1.7107625007629395, + "learning_rate": 4.991186671857683e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.866193950176239, + "num_tokens": 15850169.0, + "step": 1935 + }, + { + "entropy": 0.40224887132644654, + "epoch": 0.30970625798212004, + "grad_norm": 1.562688946723938, + "learning_rate": 4.991141078449779e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8737279891967773, + "num_tokens": 15891129.0, + "step": 1940 + }, + { + "entropy": 0.3883086383342743, + "epoch": 0.3105044699872286, + "grad_norm": 1.5433399677276611, + "learning_rate": 4.9910953676925165e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8784520983695984, + "num_tokens": 15931668.0, + "step": 1945 + }, + { + "entropy": 0.41734763979911804, + "epoch": 0.3113026819923372, + "grad_norm": 1.7147794961929321, + "learning_rate": 4.991049539588768e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8696650862693787, + "num_tokens": 15972628.0, + "step": 1950 + }, + { + "entropy": 0.40874959230422975, + "epoch": 0.3121008939974457, + "grad_norm": 1.5410938262939453, + "learning_rate": 4.991003594141414e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8690267920494079, + "num_tokens": 16013588.0, + "step": 1955 + }, + { + "entropy": 0.42068531513214114, + "epoch": 0.3128991060025543, + "grad_norm": 1.5189158916473389, + "learning_rate": 4.990957531353346e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8713066935539245, + "num_tokens": 16054548.0, + "step": 1960 + }, + { + "entropy": 0.4159302175045013, + "epoch": 0.3136973180076628, + "grad_norm": 1.880847454071045, + "learning_rate": 4.99091135122746e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8719622373580933, + "num_tokens": 16095508.0, + "step": 1965 + }, + { + "entropy": 0.4221408247947693, + "epoch": 0.3144955300127714, + "grad_norm": 1.5728129148483276, + "learning_rate": 4.990865053766659e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8609007358551025, + "num_tokens": 16136468.0, + "step": 1970 + }, + { + "entropy": 0.438957542181015, + "epoch": 0.31529374201787996, + "grad_norm": 1.8245769739151, + "learning_rate": 4.9908186389738564e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8557961821556092, + "num_tokens": 16177428.0, + "step": 1975 + }, + { + "entropy": 0.441065239906311, + "epoch": 0.3160919540229885, + "grad_norm": 1.7864265441894531, + "learning_rate": 4.9907721068519686e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8611894130706788, + "num_tokens": 16218388.0, + "step": 1980 + }, + { + "entropy": 0.39668573141098024, + "epoch": 0.31689016602809705, + "grad_norm": 1.5603904724121094, + "learning_rate": 4.990725457403923e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8762275218963623, + "num_tokens": 16259348.0, + "step": 1985 + }, + { + "entropy": 0.39811150431632997, + "epoch": 0.3176883780332056, + "grad_norm": 1.4460902214050293, + "learning_rate": 4.990678690632652e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8742186784744262, + "num_tokens": 16300308.0, + "step": 1990 + }, + { + "entropy": 0.4311108887195587, + "epoch": 0.3184865900383142, + "grad_norm": 1.4956344366073608, + "learning_rate": 4.990631806541098e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8633324503898621, + "num_tokens": 16341268.0, + "step": 1995 + }, + { + "entropy": 0.4104701280593872, + "epoch": 0.31928480204342274, + "grad_norm": 1.7465938329696655, + "learning_rate": 4.990584805132208e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8737729787826538, + "num_tokens": 16382228.0, + "step": 2000 + }, + { + "epoch": 0.31928480204342274, + "eval_entropy": 0.4038234350681305, + "eval_loss": 0.3711216151714325, + "eval_mean_token_accuracy": 0.8712776474952698, + "eval_num_tokens": 16382228.0, + "eval_runtime": 69.2679, + "eval_samples_per_second": 14.437, + "eval_steps_per_second": 1.805, + "step": 2000 + }, + { + "entropy": 0.4014134407043457, + "epoch": 0.3200830140485313, + "grad_norm": 1.7096283435821533, + "learning_rate": 4.990537686408939e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8709941625595092, + "num_tokens": 16423188.0, + "step": 2005 + }, + { + "entropy": 0.3651737213134766, + "epoch": 0.32088122605363983, + "grad_norm": 1.3931198120117188, + "learning_rate": 4.990490450374251e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8843650698661805, + "num_tokens": 16464148.0, + "step": 2010 + }, + { + "entropy": 0.3889296054840088, + "epoch": 0.3216794380587484, + "grad_norm": 1.6823824644088745, + "learning_rate": 4.990443097031118e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8789047360420227, + "num_tokens": 16505108.0, + "step": 2015 + }, + { + "entropy": 0.37964903116226195, + "epoch": 0.322477650063857, + "grad_norm": 1.5910693407058716, + "learning_rate": 4.9903956263825155e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8798817157745361, + "num_tokens": 16546068.0, + "step": 2020 + }, + { + "entropy": 0.4190361201763153, + "epoch": 0.3232758620689655, + "grad_norm": 1.602673888206482, + "learning_rate": 4.99034803843143e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8659457564353943, + "num_tokens": 16587028.0, + "step": 2025 + }, + { + "entropy": 0.41818163394927976, + "epoch": 0.32407407407407407, + "grad_norm": 1.5723743438720703, + "learning_rate": 4.990300333180853e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8674409031867981, + "num_tokens": 16627988.0, + "step": 2030 + }, + { + "entropy": 0.4227443754673004, + "epoch": 0.3248722860791826, + "grad_norm": 1.5853668451309204, + "learning_rate": 4.990252510633785e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8644875526428223, + "num_tokens": 16668948.0, + "step": 2035 + }, + { + "entropy": 0.40144317150115966, + "epoch": 0.32567049808429116, + "grad_norm": 1.6941038370132446, + "learning_rate": 4.9902045707932315e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8758120179176331, + "num_tokens": 16709908.0, + "step": 2040 + }, + { + "entropy": 0.4214106798171997, + "epoch": 0.32646871008939976, + "grad_norm": 1.6031938791275024, + "learning_rate": 4.99015651366221e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8669756650924683, + "num_tokens": 16750868.0, + "step": 2045 + }, + { + "entropy": 0.36540584564208983, + "epoch": 0.3272669220945083, + "grad_norm": 1.5755289793014526, + "learning_rate": 4.99010833924374e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8872041344642639, + "num_tokens": 16791828.0, + "step": 2050 + }, + { + "entropy": 0.4037174999713898, + "epoch": 0.32806513409961685, + "grad_norm": 1.5162173509597778, + "learning_rate": 4.990060047540852e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8711183547973633, + "num_tokens": 16832788.0, + "step": 2055 + }, + { + "entropy": 0.39979116916656493, + "epoch": 0.3288633461047254, + "grad_norm": 1.638429880142212, + "learning_rate": 4.9900116385565825e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8761036038398743, + "num_tokens": 16873748.0, + "step": 2060 + }, + { + "entropy": 0.4227651596069336, + "epoch": 0.329661558109834, + "grad_norm": 1.749085545539856, + "learning_rate": 4.989963112293977e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8673272252082824, + "num_tokens": 16914708.0, + "step": 2065 + }, + { + "entropy": 0.37639381885528567, + "epoch": 0.33045977011494254, + "grad_norm": 1.413428544998169, + "learning_rate": 4.989914468756084e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8843110561370849, + "num_tokens": 16955668.0, + "step": 2070 + }, + { + "entropy": 0.38432916402816775, + "epoch": 0.3312579821200511, + "grad_norm": 1.6481380462646484, + "learning_rate": 4.989865707945965e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8767096996307373, + "num_tokens": 16996628.0, + "step": 2075 + }, + { + "entropy": 0.4439726769924164, + "epoch": 0.33205619412515963, + "grad_norm": 1.812071681022644, + "learning_rate": 4.989816829866686e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8621229887008667, + "num_tokens": 17037588.0, + "step": 2080 + }, + { + "entropy": 0.4180517435073853, + "epoch": 0.3328544061302682, + "grad_norm": 1.8122986555099487, + "learning_rate": 4.989767834521318e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8722500562667846, + "num_tokens": 17078548.0, + "step": 2085 + }, + { + "entropy": 0.37677949070930483, + "epoch": 0.3336526181353768, + "grad_norm": 1.6600393056869507, + "learning_rate": 4.989718721912946e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8783737897872925, + "num_tokens": 17119508.0, + "step": 2090 + }, + { + "entropy": 0.39887595772743223, + "epoch": 0.3344508301404853, + "grad_norm": 1.576180338859558, + "learning_rate": 4.989669492044655e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8705180883407593, + "num_tokens": 17160468.0, + "step": 2095 + }, + { + "entropy": 0.3965007960796356, + "epoch": 0.33524904214559387, + "grad_norm": 1.63950514793396, + "learning_rate": 4.989620144919543e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8723701715469361, + "num_tokens": 17201428.0, + "step": 2100 + }, + { + "entropy": 0.41184504628181456, + "epoch": 0.3360472541507024, + "grad_norm": 1.544554591178894, + "learning_rate": 4.989570680540712e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8735958099365234, + "num_tokens": 17242388.0, + "step": 2105 + }, + { + "entropy": 0.34826286435127257, + "epoch": 0.33684546615581096, + "grad_norm": 1.3255348205566406, + "learning_rate": 4.989521098911272e-06, + "loss": 0.3056, + "mean_token_accuracy": 0.8921523332595825, + "num_tokens": 17283348.0, + "step": 2110 + }, + { + "entropy": 0.39912793040275574, + "epoch": 0.33764367816091956, + "grad_norm": 1.566488265991211, + "learning_rate": 4.989471400034343e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8726530551910401, + "num_tokens": 17324308.0, + "step": 2115 + }, + { + "entropy": 0.3988477885723114, + "epoch": 0.3384418901660281, + "grad_norm": 1.8146026134490967, + "learning_rate": 4.989421583913047e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8763777494430542, + "num_tokens": 17365268.0, + "step": 2120 + }, + { + "entropy": 0.4144528448581696, + "epoch": 0.33924010217113665, + "grad_norm": 1.733222246170044, + "learning_rate": 4.989371650550519e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8681719303131104, + "num_tokens": 17406228.0, + "step": 2125 + }, + { + "entropy": 0.3982251286506653, + "epoch": 0.3400383141762452, + "grad_norm": 1.4397218227386475, + "learning_rate": 4.9893215999499e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8726215600967407, + "num_tokens": 17447188.0, + "step": 2130 + }, + { + "entropy": 0.3918505072593689, + "epoch": 0.3408365261813538, + "grad_norm": 1.5369162559509277, + "learning_rate": 4.9892714321143346e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8802524089813233, + "num_tokens": 17488148.0, + "step": 2135 + }, + { + "entropy": 0.4021181404590607, + "epoch": 0.34163473818646234, + "grad_norm": 1.5608594417572021, + "learning_rate": 4.9892211470469775e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8733767867088318, + "num_tokens": 17529108.0, + "step": 2140 + }, + { + "entropy": 0.3666627109050751, + "epoch": 0.3424329501915709, + "grad_norm": 1.418404459953308, + "learning_rate": 4.989170744750993e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8857243657112122, + "num_tokens": 17570068.0, + "step": 2145 + }, + { + "entropy": 0.3634820461273193, + "epoch": 0.34323116219667943, + "grad_norm": 1.5730950832366943, + "learning_rate": 4.9891202252295495e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8852996826171875, + "num_tokens": 17611028.0, + "step": 2150 + }, + { + "entropy": 0.40299826860427856, + "epoch": 0.344029374201788, + "grad_norm": 1.8429025411605835, + "learning_rate": 4.989069588485824e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8722189664840698, + "num_tokens": 17651988.0, + "step": 2155 + }, + { + "entropy": 0.39875529408454896, + "epoch": 0.3448275862068966, + "grad_norm": 1.7688708305358887, + "learning_rate": 4.989018834523001e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8719764232635498, + "num_tokens": 17692948.0, + "step": 2160 + }, + { + "entropy": 0.41312822699546814, + "epoch": 0.3456257982120051, + "grad_norm": 1.5525833368301392, + "learning_rate": 4.9889679633442706e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8743491053581238, + "num_tokens": 17733908.0, + "step": 2165 + }, + { + "entropy": 0.43781871199607847, + "epoch": 0.34642401021711366, + "grad_norm": 1.7160017490386963, + "learning_rate": 4.988916974952833e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8564818978309632, + "num_tokens": 17774868.0, + "step": 2170 + }, + { + "entropy": 0.38255208134651186, + "epoch": 0.3472222222222222, + "grad_norm": 1.643141508102417, + "learning_rate": 4.988865869351895e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8778419256210327, + "num_tokens": 17815828.0, + "step": 2175 + }, + { + "entropy": 0.42076377272605897, + "epoch": 0.34802043422733075, + "grad_norm": 1.8374202251434326, + "learning_rate": 4.988814646544669e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8678974390029908, + "num_tokens": 17856788.0, + "step": 2180 + }, + { + "entropy": 0.4139479219913483, + "epoch": 0.34881864623243936, + "grad_norm": 1.75631582736969, + "learning_rate": 4.988763306534376e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8708728432655335, + "num_tokens": 17897748.0, + "step": 2185 + }, + { + "entropy": 0.3780826270580292, + "epoch": 0.3496168582375479, + "grad_norm": 1.620390772819519, + "learning_rate": 4.988711849324247e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8815474390983582, + "num_tokens": 17938708.0, + "step": 2190 + }, + { + "entropy": 0.3495974004268646, + "epoch": 0.35041507024265645, + "grad_norm": 1.5734292268753052, + "learning_rate": 4.988660274917515e-06, + "loss": 0.3059, + "mean_token_accuracy": 0.8917647361755371, + "num_tokens": 17979668.0, + "step": 2195 + }, + { + "entropy": 0.3996447205543518, + "epoch": 0.351213282247765, + "grad_norm": 1.7574753761291504, + "learning_rate": 4.988608583317424e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8691877007484436, + "num_tokens": 18020628.0, + "step": 2200 + }, + { + "epoch": 0.351213282247765, + "eval_entropy": 0.40308074259758, + "eval_loss": 0.3682883381843567, + "eval_mean_token_accuracy": 0.8719504637718201, + "eval_num_tokens": 18020628.0, + "eval_runtime": 69.2047, + "eval_samples_per_second": 14.45, + "eval_steps_per_second": 1.806, + "step": 2200 + }, + { + "entropy": 0.4189161598682404, + "epoch": 0.35201149425287354, + "grad_norm": 1.8016271591186523, + "learning_rate": 4.988556774527226e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.864763867855072, + "num_tokens": 18061368.0, + "step": 2205 + }, + { + "entropy": 0.4112794458866119, + "epoch": 0.35280970625798214, + "grad_norm": 1.630026936531067, + "learning_rate": 4.988504848550175e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8749266624450683, + "num_tokens": 18102328.0, + "step": 2210 + }, + { + "entropy": 0.3925338864326477, + "epoch": 0.3536079182630907, + "grad_norm": 1.639182448387146, + "learning_rate": 4.988452805389541e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8734994173049927, + "num_tokens": 18143288.0, + "step": 2215 + }, + { + "entropy": 0.383824622631073, + "epoch": 0.3544061302681992, + "grad_norm": 1.7463170289993286, + "learning_rate": 4.9884006450485935e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8832499623298645, + "num_tokens": 18184248.0, + "step": 2220 + }, + { + "entropy": 0.40624321103096006, + "epoch": 0.35520434227330777, + "grad_norm": 1.669136881828308, + "learning_rate": 4.9883483675306144e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.872031056880951, + "num_tokens": 18225208.0, + "step": 2225 + }, + { + "entropy": 0.3683569490909576, + "epoch": 0.35600255427841637, + "grad_norm": 1.5674257278442383, + "learning_rate": 4.98829597283889e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8851659893989563, + "num_tokens": 18266168.0, + "step": 2230 + }, + { + "entropy": 0.39384169578552247, + "epoch": 0.3568007662835249, + "grad_norm": 1.5700942277908325, + "learning_rate": 4.988243460976715e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8744176506996155, + "num_tokens": 18307128.0, + "step": 2235 + }, + { + "entropy": 0.40435452461242677, + "epoch": 0.35759897828863346, + "grad_norm": 1.5019632577896118, + "learning_rate": 4.988190831947391e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8701990246772766, + "num_tokens": 18348088.0, + "step": 2240 + }, + { + "entropy": 0.39732733368873596, + "epoch": 0.358397190293742, + "grad_norm": 1.7081998586654663, + "learning_rate": 4.988138085754229e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8721163272857666, + "num_tokens": 18389048.0, + "step": 2245 + }, + { + "entropy": 0.4120040833950043, + "epoch": 0.35919540229885055, + "grad_norm": 1.8202104568481445, + "learning_rate": 4.988085222400546e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8671935796737671, + "num_tokens": 18430008.0, + "step": 2250 + }, + { + "entropy": 0.39716677069664, + "epoch": 0.35999361430395915, + "grad_norm": 1.6864914894104004, + "learning_rate": 4.988032241889665e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8778967261314392, + "num_tokens": 18470968.0, + "step": 2255 + }, + { + "entropy": 0.4007763683795929, + "epoch": 0.3607918263090677, + "grad_norm": 1.6153517961502075, + "learning_rate": 4.987979144224917e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8720175743103027, + "num_tokens": 18511928.0, + "step": 2260 + }, + { + "entropy": 0.38735673427581785, + "epoch": 0.36159003831417624, + "grad_norm": 1.4474482536315918, + "learning_rate": 4.9879259294096426e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8750486254692078, + "num_tokens": 18552888.0, + "step": 2265 + }, + { + "entropy": 0.4178376317024231, + "epoch": 0.3623882503192848, + "grad_norm": 1.721549391746521, + "learning_rate": 4.987872597447188e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8684224009513855, + "num_tokens": 18593848.0, + "step": 2270 + }, + { + "entropy": 0.39219988584518434, + "epoch": 0.36318646232439333, + "grad_norm": 1.5142766237258911, + "learning_rate": 4.987819148340906e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8798442125320435, + "num_tokens": 18634808.0, + "step": 2275 + }, + { + "entropy": 0.4054730415344238, + "epoch": 0.36398467432950193, + "grad_norm": 1.5823688507080078, + "learning_rate": 4.987765582094158e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8720792055130004, + "num_tokens": 18675768.0, + "step": 2280 + }, + { + "entropy": 0.46094911694526675, + "epoch": 0.3647828863346105, + "grad_norm": 1.7849924564361572, + "learning_rate": 4.987711898710312e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8514412760734558, + "num_tokens": 18716728.0, + "step": 2285 + }, + { + "entropy": 0.38086835741996766, + "epoch": 0.365581098339719, + "grad_norm": 1.7232701778411865, + "learning_rate": 4.987658098192745e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8816040277481079, + "num_tokens": 18757688.0, + "step": 2290 + }, + { + "entropy": 0.3888665437698364, + "epoch": 0.36637931034482757, + "grad_norm": 1.341325044631958, + "learning_rate": 4.987604180544839e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8755691409111023, + "num_tokens": 18798648.0, + "step": 2295 + }, + { + "entropy": 0.431248152256012, + "epoch": 0.36717752234993617, + "grad_norm": 1.6617993116378784, + "learning_rate": 4.987550145769986e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8633793830871582, + "num_tokens": 18839608.0, + "step": 2300 + }, + { + "entropy": 0.36826140284538267, + "epoch": 0.3679757343550447, + "grad_norm": 1.32558012008667, + "learning_rate": 4.987495993871582e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8857893109321594, + "num_tokens": 18880568.0, + "step": 2305 + }, + { + "entropy": 0.3880879878997803, + "epoch": 0.36877394636015326, + "grad_norm": 1.639264464378357, + "learning_rate": 4.9874417248530325e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8788592934608459, + "num_tokens": 18921528.0, + "step": 2310 + }, + { + "entropy": 0.3850403368473053, + "epoch": 0.3695721583652618, + "grad_norm": 1.4602205753326416, + "learning_rate": 4.9873873387177515e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8812838077545166, + "num_tokens": 18962146.0, + "step": 2315 + }, + { + "entropy": 0.39695783257484435, + "epoch": 0.37037037037037035, + "grad_norm": 1.3622066974639893, + "learning_rate": 4.987332835469158e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8771039724349976, + "num_tokens": 19003106.0, + "step": 2320 + }, + { + "entropy": 0.37232043147087096, + "epoch": 0.37116858237547895, + "grad_norm": 1.464799404144287, + "learning_rate": 4.98727821511068e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8875339508056641, + "num_tokens": 19044066.0, + "step": 2325 + }, + { + "entropy": 0.4204703152179718, + "epoch": 0.3719667943805875, + "grad_norm": 1.6495721340179443, + "learning_rate": 4.9872234776457515e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8661399126052857, + "num_tokens": 19085026.0, + "step": 2330 + }, + { + "entropy": 0.368586528301239, + "epoch": 0.37276500638569604, + "grad_norm": 1.5902228355407715, + "learning_rate": 4.987168623077815e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8872491240501403, + "num_tokens": 19125986.0, + "step": 2335 + }, + { + "entropy": 0.4211663603782654, + "epoch": 0.3735632183908046, + "grad_norm": 1.6928033828735352, + "learning_rate": 4.9871136514103194e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8665610313415527, + "num_tokens": 19166946.0, + "step": 2340 + }, + { + "entropy": 0.44752530455589296, + "epoch": 0.37436143039591313, + "grad_norm": 1.8591210842132568, + "learning_rate": 4.987058562646722e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8581960439682007, + "num_tokens": 19207906.0, + "step": 2345 + }, + { + "entropy": 0.3855793416500092, + "epoch": 0.37515964240102173, + "grad_norm": 1.5569732189178467, + "learning_rate": 4.987003356790487e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.877441143989563, + "num_tokens": 19248866.0, + "step": 2350 + }, + { + "entropy": 0.37906638383865354, + "epoch": 0.3759578544061303, + "grad_norm": 1.720017910003662, + "learning_rate": 4.986948033845086e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8832611203193664, + "num_tokens": 19289826.0, + "step": 2355 + }, + { + "entropy": 0.4253597676753998, + "epoch": 0.3767560664112388, + "grad_norm": 1.5680551528930664, + "learning_rate": 4.986892593813998e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8657567858695984, + "num_tokens": 19330786.0, + "step": 2360 + }, + { + "entropy": 0.4145656943321228, + "epoch": 0.37755427841634737, + "grad_norm": 1.678577184677124, + "learning_rate": 4.986837036700708e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8670534491539001, + "num_tokens": 19371746.0, + "step": 2365 + }, + { + "entropy": 0.42500597834587095, + "epoch": 0.3783524904214559, + "grad_norm": 1.7057173252105713, + "learning_rate": 4.986781362508711e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8676841259002686, + "num_tokens": 19412706.0, + "step": 2370 + }, + { + "entropy": 0.4140367269515991, + "epoch": 0.3791507024265645, + "grad_norm": 1.6917656660079956, + "learning_rate": 4.986725571241508e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8730501174926758, + "num_tokens": 19453666.0, + "step": 2375 + }, + { + "entropy": 0.4040639877319336, + "epoch": 0.37994891443167306, + "grad_norm": 1.5331501960754395, + "learning_rate": 4.986669662902607e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8733095169067383, + "num_tokens": 19494626.0, + "step": 2380 + }, + { + "entropy": 0.4209416925907135, + "epoch": 0.3807471264367816, + "grad_norm": 1.7044132947921753, + "learning_rate": 4.986613637495524e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8655770182609558, + "num_tokens": 19535586.0, + "step": 2385 + }, + { + "entropy": 0.4252281904220581, + "epoch": 0.38154533844189015, + "grad_norm": 1.5789233446121216, + "learning_rate": 4.986557495023781e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8630497813224792, + "num_tokens": 19576546.0, + "step": 2390 + }, + { + "entropy": 0.3741147518157959, + "epoch": 0.38234355044699875, + "grad_norm": 1.4773911237716675, + "learning_rate": 4.986501235490909e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.882919716835022, + "num_tokens": 19617506.0, + "step": 2395 + }, + { + "entropy": 0.4245776295661926, + "epoch": 0.3831417624521073, + "grad_norm": 1.5666937828063965, + "learning_rate": 4.986444858900447e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8653708696365356, + "num_tokens": 19658466.0, + "step": 2400 + }, + { + "epoch": 0.3831417624521073, + "eval_entropy": 0.40449087238311765, + "eval_loss": 0.3654989004135132, + "eval_mean_token_accuracy": 0.8728414220809937, + "eval_num_tokens": 19658466.0, + "eval_runtime": 69.2913, + "eval_samples_per_second": 14.432, + "eval_steps_per_second": 1.804, + "step": 2400 + }, + { + "entropy": 0.4110782384872437, + "epoch": 0.38393997445721584, + "grad_norm": 1.9226335287094116, + "learning_rate": 4.986388365255937e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8674784779548645, + "num_tokens": 19699426.0, + "step": 2405 + }, + { + "entropy": 0.40915406942367555, + "epoch": 0.3847381864623244, + "grad_norm": 1.647294044494629, + "learning_rate": 4.9863317545609355e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8719962477684021, + "num_tokens": 19740386.0, + "step": 2410 + }, + { + "entropy": 0.40981475114822385, + "epoch": 0.38553639846743293, + "grad_norm": 1.625821590423584, + "learning_rate": 4.986275026818999e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8711891293525695, + "num_tokens": 19781346.0, + "step": 2415 + }, + { + "entropy": 0.4291534602642059, + "epoch": 0.38633461047254153, + "grad_norm": 1.5893938541412354, + "learning_rate": 4.986218182033697e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8647595286369324, + "num_tokens": 19822306.0, + "step": 2420 + }, + { + "entropy": 0.4076783239841461, + "epoch": 0.3871328224776501, + "grad_norm": 1.6474665403366089, + "learning_rate": 4.986161220208604e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8684025168418884, + "num_tokens": 19863266.0, + "step": 2425 + }, + { + "entropy": 0.3912591695785522, + "epoch": 0.3879310344827586, + "grad_norm": 3.1487748622894287, + "learning_rate": 4.986104141347301e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8792547583580017, + "num_tokens": 19904226.0, + "step": 2430 + }, + { + "entropy": 0.4334920525550842, + "epoch": 0.38872924648786716, + "grad_norm": 1.6844035387039185, + "learning_rate": 4.9860469454533775e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.863440215587616, + "num_tokens": 19945186.0, + "step": 2435 + }, + { + "entropy": 0.3987038731575012, + "epoch": 0.3895274584929757, + "grad_norm": 1.6433604955673218, + "learning_rate": 4.98598963253043e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8737675786018372, + "num_tokens": 19986146.0, + "step": 2440 + }, + { + "entropy": 0.40528036952018737, + "epoch": 0.3903256704980843, + "grad_norm": 1.5972541570663452, + "learning_rate": 4.985932202582062e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.873537254333496, + "num_tokens": 20027106.0, + "step": 2445 + }, + { + "entropy": 0.44673908948898317, + "epoch": 0.39112388250319285, + "grad_norm": 1.6527669429779053, + "learning_rate": 4.985874655611887e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8543389558792114, + "num_tokens": 20068066.0, + "step": 2450 + }, + { + "entropy": 0.35718379616737367, + "epoch": 0.3919220945083014, + "grad_norm": 1.511210560798645, + "learning_rate": 4.985816991623521e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.890525484085083, + "num_tokens": 20109026.0, + "step": 2455 + }, + { + "entropy": 0.39349877238273623, + "epoch": 0.39272030651340994, + "grad_norm": 1.6557594537734985, + "learning_rate": 4.985759210620593e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8725938200950623, + "num_tokens": 20149986.0, + "step": 2460 + }, + { + "entropy": 0.39194449186325075, + "epoch": 0.39351851851851855, + "grad_norm": 1.434440016746521, + "learning_rate": 4.985701312606735e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8777651786804199, + "num_tokens": 20190946.0, + "step": 2465 + }, + { + "entropy": 0.38448486328125, + "epoch": 0.3943167305236271, + "grad_norm": 1.5193266868591309, + "learning_rate": 4.985643297585587e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8788821578025818, + "num_tokens": 20231906.0, + "step": 2470 + }, + { + "entropy": 0.37055438160896303, + "epoch": 0.39511494252873564, + "grad_norm": 1.668213129043579, + "learning_rate": 4.985585165560798e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8830554604530334, + "num_tokens": 20272866.0, + "step": 2475 + }, + { + "entropy": 0.3725506365299225, + "epoch": 0.3959131545338442, + "grad_norm": 1.5674769878387451, + "learning_rate": 4.985526916536024e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8806295394897461, + "num_tokens": 20313826.0, + "step": 2480 + }, + { + "entropy": 0.41256141662597656, + "epoch": 0.3967113665389527, + "grad_norm": 1.7599780559539795, + "learning_rate": 4.985468550514928e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8694740176200867, + "num_tokens": 20354786.0, + "step": 2485 + }, + { + "entropy": 0.4078804194927216, + "epoch": 0.3975095785440613, + "grad_norm": 1.5582984685897827, + "learning_rate": 4.985410067501178e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8715253114700318, + "num_tokens": 20395746.0, + "step": 2490 + }, + { + "entropy": 0.3821629822254181, + "epoch": 0.39830779054916987, + "grad_norm": 1.3672593832015991, + "learning_rate": 4.985351467498455e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8817133903503418, + "num_tokens": 20436706.0, + "step": 2495 + }, + { + "entropy": 0.36203756332397463, + "epoch": 0.3991060025542784, + "grad_norm": 1.6370218992233276, + "learning_rate": 4.985292750510442e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8887966752052308, + "num_tokens": 20477666.0, + "step": 2500 + }, + { + "entropy": 0.4348323345184326, + "epoch": 0.39990421455938696, + "grad_norm": 1.5410937070846558, + "learning_rate": 4.9852339165408305e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8647068858146667, + "num_tokens": 20518626.0, + "step": 2505 + }, + { + "entropy": 0.3871714770793915, + "epoch": 0.4007024265644955, + "grad_norm": 1.6580190658569336, + "learning_rate": 4.985174965593323e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8785181164741516, + "num_tokens": 20559586.0, + "step": 2510 + }, + { + "entropy": 0.4144402027130127, + "epoch": 0.4015006385696041, + "grad_norm": 1.7754415273666382, + "learning_rate": 4.985115897671624e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8701366901397705, + "num_tokens": 20600546.0, + "step": 2515 + }, + { + "entropy": 0.40664401054382326, + "epoch": 0.40229885057471265, + "grad_norm": 1.7231062650680542, + "learning_rate": 4.985056712779449e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8717346429824829, + "num_tokens": 20641506.0, + "step": 2520 + }, + { + "entropy": 0.39899550676345824, + "epoch": 0.4030970625798212, + "grad_norm": 1.7191085815429688, + "learning_rate": 4.984997410920519e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.873437762260437, + "num_tokens": 20682466.0, + "step": 2525 + }, + { + "entropy": 0.41406258940696716, + "epoch": 0.40389527458492974, + "grad_norm": 1.402048945426941, + "learning_rate": 4.984937992098563e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8696890354156495, + "num_tokens": 20723426.0, + "step": 2530 + }, + { + "entropy": 0.3955492854118347, + "epoch": 0.4046934865900383, + "grad_norm": 1.577100396156311, + "learning_rate": 4.984878456317319e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8827073097229003, + "num_tokens": 20764386.0, + "step": 2535 + }, + { + "entropy": 0.43836078643798826, + "epoch": 0.4054916985951469, + "grad_norm": 1.6845039129257202, + "learning_rate": 4.98481880358053e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8614120006561279, + "num_tokens": 20805346.0, + "step": 2540 + }, + { + "entropy": 0.39585062861442566, + "epoch": 0.40628991060025543, + "grad_norm": 1.5522538423538208, + "learning_rate": 4.984759033891947e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8703001976013184, + "num_tokens": 20846306.0, + "step": 2545 + }, + { + "entropy": 0.3866327404975891, + "epoch": 0.407088122605364, + "grad_norm": 1.6172707080841064, + "learning_rate": 4.984699147255328e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8748281240463257, + "num_tokens": 20887266.0, + "step": 2550 + }, + { + "entropy": 0.3750840961933136, + "epoch": 0.4078863346104725, + "grad_norm": 1.4508105516433716, + "learning_rate": 4.98463914367444e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8814164757728576, + "num_tokens": 20928226.0, + "step": 2555 + }, + { + "entropy": 0.41147985458374026, + "epoch": 0.4086845466155811, + "grad_norm": 1.4683401584625244, + "learning_rate": 4.984579023153055e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8700311422348023, + "num_tokens": 20969186.0, + "step": 2560 + }, + { + "entropy": 0.392798638343811, + "epoch": 0.40948275862068967, + "grad_norm": 1.5705111026763916, + "learning_rate": 4.984518785694955e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8787789344787598, + "num_tokens": 21009740.0, + "step": 2565 + }, + { + "entropy": 0.3867809295654297, + "epoch": 0.4102809706257982, + "grad_norm": 1.5291118621826172, + "learning_rate": 4.984458431303926e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8778688192367554, + "num_tokens": 21050700.0, + "step": 2570 + }, + { + "entropy": 0.40426079630851747, + "epoch": 0.41107918263090676, + "grad_norm": 1.7074692249298096, + "learning_rate": 4.984397959983767e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8738163113594055, + "num_tokens": 21091660.0, + "step": 2575 + }, + { + "entropy": 0.3680769085884094, + "epoch": 0.4118773946360153, + "grad_norm": 1.4109336137771606, + "learning_rate": 4.984337371738276e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8867873907089233, + "num_tokens": 21132620.0, + "step": 2580 + }, + { + "entropy": 0.4069203794002533, + "epoch": 0.4126756066411239, + "grad_norm": 1.6039010286331177, + "learning_rate": 4.984276666571265e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8743149757385253, + "num_tokens": 21173580.0, + "step": 2585 + }, + { + "entropy": 0.3620123088359833, + "epoch": 0.41347381864623245, + "grad_norm": 1.5748003721237183, + "learning_rate": 4.984215844486552e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8843584299087525, + "num_tokens": 21214540.0, + "step": 2590 + }, + { + "entropy": 0.3968845188617706, + "epoch": 0.414272030651341, + "grad_norm": 1.5188629627227783, + "learning_rate": 4.984154905487961e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8736822605133057, + "num_tokens": 21255500.0, + "step": 2595 + }, + { + "entropy": 0.4000989317893982, + "epoch": 0.41507024265644954, + "grad_norm": 1.6347826719284058, + "learning_rate": 4.984093849579325e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8751783609390259, + "num_tokens": 21296460.0, + "step": 2600 + }, + { + "epoch": 0.41507024265644954, + "eval_entropy": 0.4033074338436127, + "eval_loss": 0.3634474575519562, + "eval_mean_token_accuracy": 0.8728289442062378, + "eval_num_tokens": 21296460.0, + "eval_runtime": 69.1125, + "eval_samples_per_second": 14.469, + "eval_steps_per_second": 1.809, + "step": 2600 + }, + { + "entropy": 0.38013476729393003, + "epoch": 0.4158684546615581, + "grad_norm": 1.4854322671890259, + "learning_rate": 4.984032676764482e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8823701858520507, + "num_tokens": 21337420.0, + "step": 2605 + }, + { + "entropy": 0.4014547288417816, + "epoch": 0.4166666666666667, + "grad_norm": 1.5105518102645874, + "learning_rate": 4.983971387047279e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.875412392616272, + "num_tokens": 21378380.0, + "step": 2610 + }, + { + "entropy": 0.39206184148788453, + "epoch": 0.41746487867177523, + "grad_norm": 1.7710962295532227, + "learning_rate": 4.983909980431572e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8740858912467957, + "num_tokens": 21419340.0, + "step": 2615 + }, + { + "entropy": 0.43426434993743895, + "epoch": 0.4182630906768838, + "grad_norm": 1.6484614610671997, + "learning_rate": 4.9838484569212195e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8656102776527405, + "num_tokens": 21460300.0, + "step": 2620 + }, + { + "entropy": 0.40168466567993166, + "epoch": 0.4190613026819923, + "grad_norm": 1.6557681560516357, + "learning_rate": 4.983786816520092e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8744718313217164, + "num_tokens": 21501260.0, + "step": 2625 + }, + { + "entropy": 0.40869095325469973, + "epoch": 0.4198595146871009, + "grad_norm": 1.9138432741165161, + "learning_rate": 4.983725059232066e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8678873181343079, + "num_tokens": 21542220.0, + "step": 2630 + }, + { + "entropy": 0.3851506352424622, + "epoch": 0.42065772669220947, + "grad_norm": 1.5859148502349854, + "learning_rate": 4.983663185061024e-06, + "loss": 0.336, + "mean_token_accuracy": 0.882533586025238, + "num_tokens": 21583180.0, + "step": 2635 + }, + { + "entropy": 0.414807003736496, + "epoch": 0.421455938697318, + "grad_norm": 1.611284613609314, + "learning_rate": 4.983601194010857e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8661292314529419, + "num_tokens": 21624140.0, + "step": 2640 + }, + { + "entropy": 0.3500566601753235, + "epoch": 0.42225415070242656, + "grad_norm": 1.4524935483932495, + "learning_rate": 4.983539086085464e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8918946385383606, + "num_tokens": 21665100.0, + "step": 2645 + }, + { + "entropy": 0.4368880867958069, + "epoch": 0.4230523627075351, + "grad_norm": 1.770884394645691, + "learning_rate": 4.983476861288751e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8645419955253602, + "num_tokens": 21706060.0, + "step": 2650 + }, + { + "entropy": 0.40582605004310607, + "epoch": 0.4238505747126437, + "grad_norm": 1.5398057699203491, + "learning_rate": 4.983414519624629e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8746888637542725, + "num_tokens": 21747020.0, + "step": 2655 + }, + { + "entropy": 0.38067981600761414, + "epoch": 0.42464878671775225, + "grad_norm": 1.5007768869400024, + "learning_rate": 4.983352061097018e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8799324750900268, + "num_tokens": 21787980.0, + "step": 2660 + }, + { + "entropy": 0.40576404333114624, + "epoch": 0.4254469987228608, + "grad_norm": 1.7250927686691284, + "learning_rate": 4.9832894857098476e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8712058424949646, + "num_tokens": 21828940.0, + "step": 2665 + }, + { + "entropy": 0.38202984929084777, + "epoch": 0.42624521072796934, + "grad_norm": 1.3978954553604126, + "learning_rate": 4.983226793467053e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.876945185661316, + "num_tokens": 21869900.0, + "step": 2670 + }, + { + "entropy": 0.3940341711044312, + "epoch": 0.4270434227330779, + "grad_norm": 1.5598876476287842, + "learning_rate": 4.983163984372575e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8790436387062073, + "num_tokens": 21910860.0, + "step": 2675 + }, + { + "entropy": 0.40419002771377566, + "epoch": 0.4278416347381865, + "grad_norm": 1.6398168802261353, + "learning_rate": 4.983101058430364e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.870511507987976, + "num_tokens": 21951820.0, + "step": 2680 + }, + { + "entropy": 0.41107907295227053, + "epoch": 0.428639846743295, + "grad_norm": 1.6293483972549438, + "learning_rate": 4.983038015644376e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.870405089855194, + "num_tokens": 21992780.0, + "step": 2685 + }, + { + "entropy": 0.40680499076843263, + "epoch": 0.4294380587484036, + "grad_norm": 1.5343927145004272, + "learning_rate": 4.982974856018576e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8700067877769471, + "num_tokens": 22033740.0, + "step": 2690 + }, + { + "entropy": 0.38571652173995974, + "epoch": 0.4302362707535121, + "grad_norm": 1.4988517761230469, + "learning_rate": 4.982911579556937e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8792693257331848, + "num_tokens": 22074700.0, + "step": 2695 + }, + { + "entropy": 0.39872742891311647, + "epoch": 0.43103448275862066, + "grad_norm": 1.4958524703979492, + "learning_rate": 4.982848186263436e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.873319935798645, + "num_tokens": 22115660.0, + "step": 2700 + }, + { + "entropy": 0.41141684651374816, + "epoch": 0.43183269476372926, + "grad_norm": 1.6881943941116333, + "learning_rate": 4.98278467614206e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.870596992969513, + "num_tokens": 22156620.0, + "step": 2705 + }, + { + "entropy": 0.3768575727939606, + "epoch": 0.4326309067688378, + "grad_norm": 1.5993489027023315, + "learning_rate": 4.982721049196804e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8819820642471313, + "num_tokens": 22197580.0, + "step": 2710 + }, + { + "entropy": 0.3916157424449921, + "epoch": 0.43342911877394635, + "grad_norm": 1.469970703125, + "learning_rate": 4.982657305431668e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.875360107421875, + "num_tokens": 22238540.0, + "step": 2715 + }, + { + "entropy": 0.4240422070026398, + "epoch": 0.4342273307790549, + "grad_norm": 1.6995346546173096, + "learning_rate": 4.982593444850658e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8621967434883118, + "num_tokens": 22279500.0, + "step": 2720 + }, + { + "entropy": 0.36775652766227723, + "epoch": 0.4350255427841635, + "grad_norm": 1.548226237297058, + "learning_rate": 4.982529467457795e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8852847933769226, + "num_tokens": 22320460.0, + "step": 2725 + }, + { + "entropy": 0.3609649360179901, + "epoch": 0.43582375478927204, + "grad_norm": 1.4408965110778809, + "learning_rate": 4.982465373257098e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.891053307056427, + "num_tokens": 22361420.0, + "step": 2730 + }, + { + "entropy": 0.3790753722190857, + "epoch": 0.4366219667943806, + "grad_norm": 1.5519766807556152, + "learning_rate": 4.982401162252599e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8807764530181885, + "num_tokens": 22402380.0, + "step": 2735 + }, + { + "entropy": 0.43087227940559386, + "epoch": 0.43742017879948913, + "grad_norm": 1.7707847356796265, + "learning_rate": 4.982336834448336e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8637638330459595, + "num_tokens": 22443340.0, + "step": 2740 + }, + { + "entropy": 0.3714826762676239, + "epoch": 0.4382183908045977, + "grad_norm": 1.6115907430648804, + "learning_rate": 4.982272389848354e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.880763578414917, + "num_tokens": 22484300.0, + "step": 2745 + }, + { + "entropy": 0.37650308609008787, + "epoch": 0.4390166028097063, + "grad_norm": 1.711526870727539, + "learning_rate": 4.982207828456705e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8830351948738098, + "num_tokens": 22525260.0, + "step": 2750 + }, + { + "entropy": 0.4391146719455719, + "epoch": 0.4398148148148148, + "grad_norm": 1.8127883672714233, + "learning_rate": 4.982143150277448e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8630771994590759, + "num_tokens": 22566220.0, + "step": 2755 + }, + { + "entropy": 0.41789884567260743, + "epoch": 0.44061302681992337, + "grad_norm": 1.5921564102172852, + "learning_rate": 4.982078355314654e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8702207565307617, + "num_tokens": 22607180.0, + "step": 2760 + }, + { + "entropy": 0.38065919280052185, + "epoch": 0.4414112388250319, + "grad_norm": 1.8245171308517456, + "learning_rate": 4.982013443572392e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8805180788040161, + "num_tokens": 22648140.0, + "step": 2765 + }, + { + "entropy": 0.38244200944900514, + "epoch": 0.44220945083014046, + "grad_norm": 1.4093703031539917, + "learning_rate": 4.9819484150547485e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8840342402458191, + "num_tokens": 22689100.0, + "step": 2770 + }, + { + "entropy": 0.4221570134162903, + "epoch": 0.44300766283524906, + "grad_norm": 1.7415305376052856, + "learning_rate": 4.981883269765809e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8687703728675842, + "num_tokens": 22730060.0, + "step": 2775 + }, + { + "entropy": 0.3891486167907715, + "epoch": 0.4438058748403576, + "grad_norm": 1.4292845726013184, + "learning_rate": 4.981818007709674e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.879124915599823, + "num_tokens": 22771020.0, + "step": 2780 + }, + { + "entropy": 0.4010182499885559, + "epoch": 0.44460408684546615, + "grad_norm": 1.5577126741409302, + "learning_rate": 4.981752628890445e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.87374427318573, + "num_tokens": 22811980.0, + "step": 2785 + }, + { + "entropy": 0.35749190449714663, + "epoch": 0.4454022988505747, + "grad_norm": 1.5466744899749756, + "learning_rate": 4.981687133312233e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8853784084320069, + "num_tokens": 22852940.0, + "step": 2790 + }, + { + "entropy": 0.42231556177139284, + "epoch": 0.4462005108556833, + "grad_norm": 1.9182018041610718, + "learning_rate": 4.981621520979157e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8668047189712524, + "num_tokens": 22893900.0, + "step": 2795 + }, + { + "entropy": 0.37705173492431643, + "epoch": 0.44699872286079184, + "grad_norm": 1.5075479745864868, + "learning_rate": 4.9815557918953444e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8799205899238587, + "num_tokens": 22934860.0, + "step": 2800 + }, + { + "epoch": 0.44699872286079184, + "eval_entropy": 0.4021283230781555, + "eval_loss": 0.3617970049381256, + "eval_mean_token_accuracy": 0.8734477977752686, + "eval_num_tokens": 22934860.0, + "eval_runtime": 69.2427, + "eval_samples_per_second": 14.442, + "eval_steps_per_second": 1.805, + "step": 2800 + }, + { + "entropy": 0.4065449595451355, + "epoch": 0.4477969348659004, + "grad_norm": 1.647086262702942, + "learning_rate": 4.981489946064926e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.871274995803833, + "num_tokens": 22975820.0, + "step": 2805 + }, + { + "entropy": 0.3849412977695465, + "epoch": 0.44859514687100893, + "grad_norm": 1.7600377798080444, + "learning_rate": 4.9814239834920445e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.878504502773285, + "num_tokens": 23016780.0, + "step": 2810 + }, + { + "entropy": 0.40428113341331484, + "epoch": 0.4493933588761175, + "grad_norm": 1.734229564666748, + "learning_rate": 4.981357904180847e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8724266767501831, + "num_tokens": 23057740.0, + "step": 2815 + }, + { + "entropy": 0.38803368210792544, + "epoch": 0.4501915708812261, + "grad_norm": 1.6498732566833496, + "learning_rate": 4.981291708135488e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8783321976661682, + "num_tokens": 23098700.0, + "step": 2820 + }, + { + "entropy": 0.4112163186073303, + "epoch": 0.4509897828863346, + "grad_norm": 1.5178463459014893, + "learning_rate": 4.981225395360131e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8709302544593811, + "num_tokens": 23139660.0, + "step": 2825 + }, + { + "entropy": 0.4096134901046753, + "epoch": 0.45178799489144317, + "grad_norm": 1.6412782669067383, + "learning_rate": 4.9811589658589464e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8746277451515198, + "num_tokens": 23180620.0, + "step": 2830 + }, + { + "entropy": 0.3961434066295624, + "epoch": 0.4525862068965517, + "grad_norm": 1.5939996242523193, + "learning_rate": 4.981092419636111e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8744441866874695, + "num_tokens": 23221580.0, + "step": 2835 + }, + { + "entropy": 0.40303857922554015, + "epoch": 0.45338441890166026, + "grad_norm": 1.6157113313674927, + "learning_rate": 4.981025756695809e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8736197471618652, + "num_tokens": 23262540.0, + "step": 2840 + }, + { + "entropy": 0.38037262558937074, + "epoch": 0.45418263090676886, + "grad_norm": 1.565201997756958, + "learning_rate": 4.980958977042233e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8818344831466675, + "num_tokens": 23303500.0, + "step": 2845 + }, + { + "entropy": 0.39692175984382627, + "epoch": 0.4549808429118774, + "grad_norm": 1.5817967653274536, + "learning_rate": 4.980892080679582e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8763553977012635, + "num_tokens": 23344460.0, + "step": 2850 + }, + { + "entropy": 0.40519008636474607, + "epoch": 0.45577905491698595, + "grad_norm": 1.681086778640747, + "learning_rate": 4.980825067612063e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8710267305374145, + "num_tokens": 23385201.0, + "step": 2855 + }, + { + "entropy": 0.4324694097042084, + "epoch": 0.4565772669220945, + "grad_norm": 1.7982279062271118, + "learning_rate": 4.9807579378438905e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8650486469268799, + "num_tokens": 23426161.0, + "step": 2860 + }, + { + "entropy": 0.3910203635692596, + "epoch": 0.45737547892720304, + "grad_norm": 1.6430963277816772, + "learning_rate": 4.980690691379284e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8780064105987548, + "num_tokens": 23467121.0, + "step": 2865 + }, + { + "entropy": 0.40927610993385316, + "epoch": 0.45817369093231164, + "grad_norm": 1.5375142097473145, + "learning_rate": 4.980623328222475e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8677163481712341, + "num_tokens": 23508081.0, + "step": 2870 + }, + { + "entropy": 0.40993956923484803, + "epoch": 0.4589719029374202, + "grad_norm": 1.4786747694015503, + "learning_rate": 4.980555848377696e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8713637351989746, + "num_tokens": 23549041.0, + "step": 2875 + }, + { + "entropy": 0.3585766851902008, + "epoch": 0.45977011494252873, + "grad_norm": 1.4331398010253906, + "learning_rate": 4.9804882518491936e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8895103693008423, + "num_tokens": 23590001.0, + "step": 2880 + }, + { + "entropy": 0.42248719930648804, + "epoch": 0.4605683269476373, + "grad_norm": 1.7327814102172852, + "learning_rate": 4.980420538641217e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8655761480331421, + "num_tokens": 23630961.0, + "step": 2885 + }, + { + "entropy": 0.39616808891296384, + "epoch": 0.4613665389527459, + "grad_norm": 1.533501148223877, + "learning_rate": 4.980352708758025e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8772770881652832, + "num_tokens": 23671921.0, + "step": 2890 + }, + { + "entropy": 0.4002328336238861, + "epoch": 0.4621647509578544, + "grad_norm": 1.5775973796844482, + "learning_rate": 4.980284762203882e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.878447163105011, + "num_tokens": 23712881.0, + "step": 2895 + }, + { + "entropy": 0.38118772506713866, + "epoch": 0.46296296296296297, + "grad_norm": 1.5659089088439941, + "learning_rate": 4.98021669898306e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8805113434791565, + "num_tokens": 23753841.0, + "step": 2900 + }, + { + "entropy": 0.4109515011310577, + "epoch": 0.4637611749680715, + "grad_norm": 1.481997013092041, + "learning_rate": 4.980148519099842e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8691876173019409, + "num_tokens": 23794801.0, + "step": 2905 + }, + { + "entropy": 0.4154976367950439, + "epoch": 0.46455938697318006, + "grad_norm": 1.6384352445602417, + "learning_rate": 4.980080222558512e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8686699986457824, + "num_tokens": 23835761.0, + "step": 2910 + }, + { + "entropy": 0.44558030366897583, + "epoch": 0.46535759897828866, + "grad_norm": 1.8630101680755615, + "learning_rate": 4.9800118093633675e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8595394015312194, + "num_tokens": 23876721.0, + "step": 2915 + }, + { + "entropy": 0.46090860962867736, + "epoch": 0.4661558109833972, + "grad_norm": 1.6846959590911865, + "learning_rate": 4.979943279518709e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8549802303314209, + "num_tokens": 23917681.0, + "step": 2920 + }, + { + "entropy": 0.36205923557281494, + "epoch": 0.46695402298850575, + "grad_norm": 1.501598834991455, + "learning_rate": 4.979874633028846e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8894065141677856, + "num_tokens": 23958641.0, + "step": 2925 + }, + { + "entropy": 0.4432215213775635, + "epoch": 0.4677522349936143, + "grad_norm": 1.6874146461486816, + "learning_rate": 4.979805869898095e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8603995084762573, + "num_tokens": 23999601.0, + "step": 2930 + }, + { + "entropy": 0.39056233167648313, + "epoch": 0.46855044699872284, + "grad_norm": 1.5292900800704956, + "learning_rate": 4.9797369901307815e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8762812137603759, + "num_tokens": 24040561.0, + "step": 2935 + }, + { + "entropy": 0.37874605059623717, + "epoch": 0.46934865900383144, + "grad_norm": 1.7269057035446167, + "learning_rate": 4.979667993731235e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8776691555976868, + "num_tokens": 24081521.0, + "step": 2940 + }, + { + "entropy": 0.38322940468788147, + "epoch": 0.47014687100894, + "grad_norm": 1.7017403841018677, + "learning_rate": 4.979598880703796e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8816770792007447, + "num_tokens": 24122481.0, + "step": 2945 + }, + { + "entropy": 0.364909029006958, + "epoch": 0.4709450830140485, + "grad_norm": 1.543266773223877, + "learning_rate": 4.979529651052809e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.884519624710083, + "num_tokens": 24163441.0, + "step": 2950 + }, + { + "entropy": 0.34366172552108765, + "epoch": 0.47174329501915707, + "grad_norm": 1.5701253414154053, + "learning_rate": 4.979460304782628e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.8912683486938476, + "num_tokens": 24204401.0, + "step": 2955 + }, + { + "entropy": 0.4112046599388123, + "epoch": 0.4725415070242657, + "grad_norm": 1.6567833423614502, + "learning_rate": 4.979390841897615e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8730028867721558, + "num_tokens": 24245361.0, + "step": 2960 + }, + { + "entropy": 0.3741000950336456, + "epoch": 0.4733397190293742, + "grad_norm": 1.6795862913131714, + "learning_rate": 4.979321262402136e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8805624365806579, + "num_tokens": 24286321.0, + "step": 2965 + }, + { + "entropy": 0.3774628400802612, + "epoch": 0.47413793103448276, + "grad_norm": 1.640445590019226, + "learning_rate": 4.979251566300568e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8825162529945374, + "num_tokens": 24327281.0, + "step": 2970 + }, + { + "entropy": 0.3802816331386566, + "epoch": 0.4749361430395913, + "grad_norm": 1.6335704326629639, + "learning_rate": 4.979181753597292e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8781920194625854, + "num_tokens": 24368241.0, + "step": 2975 + }, + { + "entropy": 0.3621665060520172, + "epoch": 0.47573435504469985, + "grad_norm": 1.4962271451950073, + "learning_rate": 4.9791118242966996e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8808696627616882, + "num_tokens": 24409201.0, + "step": 2980 + }, + { + "entropy": 0.37013022899627684, + "epoch": 0.47653256704980845, + "grad_norm": 1.4742448329925537, + "learning_rate": 4.9790417784031875e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8842559218406677, + "num_tokens": 24450161.0, + "step": 2985 + }, + { + "entropy": 0.3834933042526245, + "epoch": 0.477330779054917, + "grad_norm": 1.6567814350128174, + "learning_rate": 4.978971615921161e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8838761329650879, + "num_tokens": 24491121.0, + "step": 2990 + }, + { + "entropy": 0.41710891723632815, + "epoch": 0.47812899106002554, + "grad_norm": 1.7117583751678467, + "learning_rate": 4.978901336855031e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8669528484344482, + "num_tokens": 24532081.0, + "step": 2995 + }, + { + "entropy": 0.37902289628982544, + "epoch": 0.4789272030651341, + "grad_norm": 1.5434080362319946, + "learning_rate": 4.978830941209218e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8823231101036072, + "num_tokens": 24573041.0, + "step": 3000 + }, + { + "epoch": 0.4789272030651341, + "eval_entropy": 0.4031079981327057, + "eval_loss": 0.36067894101142883, + "eval_mean_token_accuracy": 0.8737649130821228, + "eval_num_tokens": 24573041.0, + "eval_runtime": 69.1391, + "eval_samples_per_second": 14.464, + "eval_steps_per_second": 1.808, + "step": 3000 + }, + { + "entropy": 0.3913613796234131, + "epoch": 0.47972541507024263, + "grad_norm": 1.642884612083435, + "learning_rate": 4.978760428988149e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8776594400405884, + "num_tokens": 24614001.0, + "step": 3005 + }, + { + "entropy": 0.39155340790748594, + "epoch": 0.48052362707535123, + "grad_norm": 1.5776258707046509, + "learning_rate": 4.978689800196257e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8766720056533813, + "num_tokens": 24654961.0, + "step": 3010 + }, + { + "entropy": 0.36276376247406006, + "epoch": 0.4813218390804598, + "grad_norm": 1.6027809381484985, + "learning_rate": 4.978619054837984e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8871202230453491, + "num_tokens": 24695587.0, + "step": 3015 + }, + { + "entropy": 0.40595887303352357, + "epoch": 0.4821200510855683, + "grad_norm": 1.7121602296829224, + "learning_rate": 4.978548192917779e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8705784201622009, + "num_tokens": 24736547.0, + "step": 3020 + }, + { + "entropy": 0.39229520559310915, + "epoch": 0.48291826309067687, + "grad_norm": 1.6675854921340942, + "learning_rate": 4.9784772144400976e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8745187759399414, + "num_tokens": 24777507.0, + "step": 3025 + }, + { + "entropy": 0.44495280385017394, + "epoch": 0.4837164750957854, + "grad_norm": 1.521255612373352, + "learning_rate": 4.978406119409403e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8569184064865112, + "num_tokens": 24818467.0, + "step": 3030 + }, + { + "entropy": 0.41049495339393616, + "epoch": 0.484514687100894, + "grad_norm": 1.782544732093811, + "learning_rate": 4.978334907830164e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8683913707733154, + "num_tokens": 24859427.0, + "step": 3035 + }, + { + "entropy": 0.3862737715244293, + "epoch": 0.48531289910600256, + "grad_norm": 1.696000576019287, + "learning_rate": 4.978263579706862e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8780618906021118, + "num_tokens": 24900387.0, + "step": 3040 + }, + { + "entropy": 0.3594102025032043, + "epoch": 0.4861111111111111, + "grad_norm": 1.5718748569488525, + "learning_rate": 4.978192135043982e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.8889271020889282, + "num_tokens": 24941347.0, + "step": 3045 + }, + { + "entropy": 0.3866430759429932, + "epoch": 0.48690932311621965, + "grad_norm": 1.7633681297302246, + "learning_rate": 4.9781205738460155e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8774181127548217, + "num_tokens": 24982307.0, + "step": 3050 + }, + { + "entropy": 0.40436485409736633, + "epoch": 0.48770753512132825, + "grad_norm": 1.350527286529541, + "learning_rate": 4.978048896117462e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8719983696937561, + "num_tokens": 25023267.0, + "step": 3055 + }, + { + "entropy": 0.40219146609306333, + "epoch": 0.4885057471264368, + "grad_norm": 1.4634770154953003, + "learning_rate": 4.9779771018628296e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8723389029502868, + "num_tokens": 25064227.0, + "step": 3060 + }, + { + "entropy": 0.3927314758300781, + "epoch": 0.48930395913154534, + "grad_norm": 1.6988699436187744, + "learning_rate": 4.977905191086634e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8752070426940918, + "num_tokens": 25105187.0, + "step": 3065 + }, + { + "entropy": 0.37555994391441344, + "epoch": 0.4901021711366539, + "grad_norm": 1.4584192037582397, + "learning_rate": 4.977833163793395e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8834568738937378, + "num_tokens": 25146147.0, + "step": 3070 + }, + { + "entropy": 0.40433894395828246, + "epoch": 0.49090038314176243, + "grad_norm": 1.6292520761489868, + "learning_rate": 4.977761019987642e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8728869557380676, + "num_tokens": 25187107.0, + "step": 3075 + }, + { + "entropy": 0.38123475313186644, + "epoch": 0.49169859514687103, + "grad_norm": 1.3067562580108643, + "learning_rate": 4.977688759673916e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8789713621139527, + "num_tokens": 25228067.0, + "step": 3080 + }, + { + "entropy": 0.37763266563415526, + "epoch": 0.4924968071519796, + "grad_norm": 1.6079351902008057, + "learning_rate": 4.977616382856755e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8850642085075379, + "num_tokens": 25269027.0, + "step": 3085 + }, + { + "entropy": 0.3939137518405914, + "epoch": 0.4932950191570881, + "grad_norm": 1.75822913646698, + "learning_rate": 4.977543889540713e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8792263031005859, + "num_tokens": 25309987.0, + "step": 3090 + }, + { + "entropy": 0.36902579069137575, + "epoch": 0.49409323116219667, + "grad_norm": 1.7407400608062744, + "learning_rate": 4.977471279730349e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8828243494033814, + "num_tokens": 25350947.0, + "step": 3095 + }, + { + "entropy": 0.4192457675933838, + "epoch": 0.4948914431673052, + "grad_norm": 1.7239857912063599, + "learning_rate": 4.97739855343023e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8699460387229919, + "num_tokens": 25391907.0, + "step": 3100 + }, + { + "entropy": 0.377583783864975, + "epoch": 0.4956896551724138, + "grad_norm": 1.643020510673523, + "learning_rate": 4.977325710644927e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8839346528053283, + "num_tokens": 25432867.0, + "step": 3105 + }, + { + "entropy": 0.37346488833427427, + "epoch": 0.49648786717752236, + "grad_norm": 1.7278952598571777, + "learning_rate": 4.977252751379021e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8824220895767212, + "num_tokens": 25473805.0, + "step": 3110 + }, + { + "entropy": 0.3702260494232178, + "epoch": 0.4972860791826309, + "grad_norm": 1.620824933052063, + "learning_rate": 4.9771796756371005e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.881265652179718, + "num_tokens": 25514765.0, + "step": 3115 + }, + { + "entropy": 0.3484692215919495, + "epoch": 0.49808429118773945, + "grad_norm": 1.5499067306518555, + "learning_rate": 4.9771064834237605e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8905044913291931, + "num_tokens": 25555725.0, + "step": 3120 + }, + { + "entropy": 0.4137328088283539, + "epoch": 0.49888250319284805, + "grad_norm": 1.6705187559127808, + "learning_rate": 4.977033174743604e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8712103247642518, + "num_tokens": 25596685.0, + "step": 3125 + }, + { + "entropy": 0.4071082234382629, + "epoch": 0.4996807151979566, + "grad_norm": 1.6194504499435425, + "learning_rate": 4.9769597496012405e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8755745053291321, + "num_tokens": 25637645.0, + "step": 3130 + }, + { + "entropy": 0.41910150051116946, + "epoch": 0.5004789272030651, + "grad_norm": 1.5641275644302368, + "learning_rate": 4.976886208001287e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8706887483596801, + "num_tokens": 25678605.0, + "step": 3135 + }, + { + "entropy": 0.4073460757732391, + "epoch": 0.5012771392081737, + "grad_norm": 1.7469613552093506, + "learning_rate": 4.9768125499483695e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8685148596763611, + "num_tokens": 25719565.0, + "step": 3140 + }, + { + "entropy": 0.39724062085151673, + "epoch": 0.5020753512132823, + "grad_norm": 1.6528843641281128, + "learning_rate": 4.976738775447118e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8764813065528869, + "num_tokens": 25760525.0, + "step": 3145 + }, + { + "entropy": 0.36525858044624326, + "epoch": 0.5028735632183908, + "grad_norm": 1.4947924613952637, + "learning_rate": 4.976664884502172e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8846045970916748, + "num_tokens": 25801485.0, + "step": 3150 + }, + { + "entropy": 0.39498506784439086, + "epoch": 0.5036717752234994, + "grad_norm": 1.5894827842712402, + "learning_rate": 4.9765908771181795e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8751391053199769, + "num_tokens": 25842445.0, + "step": 3155 + }, + { + "entropy": 0.3828314483165741, + "epoch": 0.5044699872286079, + "grad_norm": 1.512500524520874, + "learning_rate": 4.976516753299793e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8800006866455078, + "num_tokens": 25883405.0, + "step": 3160 + }, + { + "entropy": 0.39771613478660583, + "epoch": 0.5052681992337165, + "grad_norm": 1.5987226963043213, + "learning_rate": 4.976442513051674e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8785083174705506, + "num_tokens": 25924365.0, + "step": 3165 + }, + { + "entropy": 0.44791218638420105, + "epoch": 0.506066411238825, + "grad_norm": 1.728903889656067, + "learning_rate": 4.97636815637849e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8605072498321533, + "num_tokens": 25965325.0, + "step": 3170 + }, + { + "entropy": 0.4219264924526215, + "epoch": 0.5068646232439336, + "grad_norm": 1.9006085395812988, + "learning_rate": 4.976293683284918e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8648006916046143, + "num_tokens": 26006285.0, + "step": 3175 + }, + { + "entropy": 0.3821633756160736, + "epoch": 0.5076628352490421, + "grad_norm": 1.7359493970870972, + "learning_rate": 4.976219093775642e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8779027462005615, + "num_tokens": 26047245.0, + "step": 3180 + }, + { + "entropy": 0.4139282822608948, + "epoch": 0.5084610472541508, + "grad_norm": 1.6639729738235474, + "learning_rate": 4.976144387855351e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8683338642120362, + "num_tokens": 26088205.0, + "step": 3185 + }, + { + "entropy": 0.3756455063819885, + "epoch": 0.5092592592592593, + "grad_norm": 1.600915551185608, + "learning_rate": 4.976069565528743e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8827805042266845, + "num_tokens": 26129165.0, + "step": 3190 + }, + { + "entropy": 0.4037862837314606, + "epoch": 0.5100574712643678, + "grad_norm": 1.614333987236023, + "learning_rate": 4.9759946268005224e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8748448014259338, + "num_tokens": 26170125.0, + "step": 3195 + }, + { + "entropy": 0.4018108367919922, + "epoch": 0.5108556832694764, + "grad_norm": 1.522694706916809, + "learning_rate": 4.975919571675403e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8725090503692627, + "num_tokens": 26211085.0, + "step": 3200 + }, + { + "epoch": 0.5108556832694764, + "eval_entropy": 0.3950038187503815, + "eval_loss": 0.35790205001831055, + "eval_mean_token_accuracy": 0.8749810900688172, + "eval_num_tokens": 26211085.0, + "eval_runtime": 69.1405, + "eval_samples_per_second": 14.463, + "eval_steps_per_second": 1.808, + "step": 3200 + }, + { + "entropy": 0.4304787814617157, + "epoch": 0.5116538952745849, + "grad_norm": 1.5640943050384521, + "learning_rate": 4.975844400158104e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8626617193222046, + "num_tokens": 26252045.0, + "step": 3205 + }, + { + "entropy": 0.38331787586212157, + "epoch": 0.5124521072796935, + "grad_norm": 1.618471384048462, + "learning_rate": 4.975769112253352e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8774407744407654, + "num_tokens": 26293005.0, + "step": 3210 + }, + { + "entropy": 0.3924212872982025, + "epoch": 0.513250319284802, + "grad_norm": 1.6554707288742065, + "learning_rate": 4.975693707965882e-06, + "loss": 0.35, + "mean_token_accuracy": 0.878236734867096, + "num_tokens": 26333965.0, + "step": 3215 + }, + { + "entropy": 0.3892656922340393, + "epoch": 0.5140485312899106, + "grad_norm": 1.5585850477218628, + "learning_rate": 4.975618187300435e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.87829430103302, + "num_tokens": 26374925.0, + "step": 3220 + }, + { + "entropy": 0.42128477096557615, + "epoch": 0.5148467432950191, + "grad_norm": 1.7279189825057983, + "learning_rate": 4.975542550261761e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8681336760520935, + "num_tokens": 26415885.0, + "step": 3225 + }, + { + "entropy": 0.37716048359870913, + "epoch": 0.5156449553001277, + "grad_norm": 1.4353517293930054, + "learning_rate": 4.975466796854615e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8789421796798706, + "num_tokens": 26456845.0, + "step": 3230 + }, + { + "entropy": 0.3674571573734283, + "epoch": 0.5164431673052363, + "grad_norm": 1.4046391248703003, + "learning_rate": 4.975390927083762e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8862491250038147, + "num_tokens": 26497805.0, + "step": 3235 + }, + { + "entropy": 0.41730799078941344, + "epoch": 0.5172413793103449, + "grad_norm": 1.554059624671936, + "learning_rate": 4.975314940953972e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8708946228027343, + "num_tokens": 26538765.0, + "step": 3240 + }, + { + "entropy": 0.363294380903244, + "epoch": 0.5180395913154534, + "grad_norm": 1.5008974075317383, + "learning_rate": 4.9752388384700235e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8853865742683411, + "num_tokens": 26579725.0, + "step": 3245 + }, + { + "entropy": 0.3818063735961914, + "epoch": 0.518837803320562, + "grad_norm": 1.5581785440444946, + "learning_rate": 4.975162619636702e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8779454112052918, + "num_tokens": 26620685.0, + "step": 3250 + }, + { + "entropy": 0.4436651051044464, + "epoch": 0.5196360153256705, + "grad_norm": 1.7735083103179932, + "learning_rate": 4.975086284458801e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.859112286567688, + "num_tokens": 26661645.0, + "step": 3255 + }, + { + "entropy": 0.4206997752189636, + "epoch": 0.520434227330779, + "grad_norm": 1.5185121297836304, + "learning_rate": 4.97500983294112e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.868882167339325, + "num_tokens": 26702605.0, + "step": 3260 + }, + { + "entropy": 0.36768691539764403, + "epoch": 0.5212324393358876, + "grad_norm": 1.4248138666152954, + "learning_rate": 4.974933265088468e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8884208679199219, + "num_tokens": 26743565.0, + "step": 3265 + }, + { + "entropy": 0.37526572942733766, + "epoch": 0.5220306513409961, + "grad_norm": 1.447968602180481, + "learning_rate": 4.974856580905656e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8792654156684876, + "num_tokens": 26784525.0, + "step": 3270 + }, + { + "entropy": 0.35951876640319824, + "epoch": 0.5228288633461047, + "grad_norm": 1.3812257051467896, + "learning_rate": 4.974779780397511e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.883990478515625, + "num_tokens": 26825485.0, + "step": 3275 + }, + { + "entropy": 0.3781895101070404, + "epoch": 0.5236270753512133, + "grad_norm": 1.5094199180603027, + "learning_rate": 4.974702863568859e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8790646076202393, + "num_tokens": 26866445.0, + "step": 3280 + }, + { + "entropy": 0.41062875390052794, + "epoch": 0.5244252873563219, + "grad_norm": 1.8724991083145142, + "learning_rate": 4.9746258304245385e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8707447409629822, + "num_tokens": 26907405.0, + "step": 3285 + }, + { + "entropy": 0.35251411199569704, + "epoch": 0.5252234993614304, + "grad_norm": 1.6266639232635498, + "learning_rate": 4.974548680969394e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8902901887893677, + "num_tokens": 26948365.0, + "step": 3290 + }, + { + "entropy": 0.39582002758979795, + "epoch": 0.526021711366539, + "grad_norm": 1.7405627965927124, + "learning_rate": 4.974471415208275e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8775237441062927, + "num_tokens": 26989325.0, + "step": 3295 + }, + { + "entropy": 0.4057934045791626, + "epoch": 0.5268199233716475, + "grad_norm": 1.7901203632354736, + "learning_rate": 4.974394033146042e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.870267128944397, + "num_tokens": 27030285.0, + "step": 3300 + }, + { + "entropy": 0.3816504716873169, + "epoch": 0.5276181353767561, + "grad_norm": 1.5628036260604858, + "learning_rate": 4.97431653478756e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8798413515090943, + "num_tokens": 27071245.0, + "step": 3305 + }, + { + "entropy": 0.3856058239936829, + "epoch": 0.5284163473818646, + "grad_norm": 1.5495929718017578, + "learning_rate": 4.9742389201377025e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8771246552467347, + "num_tokens": 27112205.0, + "step": 3310 + }, + { + "entropy": 0.4250369966030121, + "epoch": 0.5292145593869731, + "grad_norm": 1.7381726503372192, + "learning_rate": 4.974161189201351e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8662177205085755, + "num_tokens": 27153165.0, + "step": 3315 + }, + { + "entropy": 0.408034086227417, + "epoch": 0.5300127713920817, + "grad_norm": 1.7087209224700928, + "learning_rate": 4.974083341983393e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.872035276889801, + "num_tokens": 27194125.0, + "step": 3320 + }, + { + "entropy": 0.41455634832382204, + "epoch": 0.5308109833971902, + "grad_norm": 1.5264880657196045, + "learning_rate": 4.974005378488724e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8700628280639648, + "num_tokens": 27235085.0, + "step": 3325 + }, + { + "entropy": 0.3727712512016296, + "epoch": 0.5316091954022989, + "grad_norm": 1.5364115238189697, + "learning_rate": 4.973927298722247e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8815152525901795, + "num_tokens": 27276045.0, + "step": 3330 + }, + { + "entropy": 0.38355987668037417, + "epoch": 0.5324074074074074, + "grad_norm": 1.6234145164489746, + "learning_rate": 4.973849102688869e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.879750919342041, + "num_tokens": 27317005.0, + "step": 3335 + }, + { + "entropy": 0.3729994535446167, + "epoch": 0.533205619412516, + "grad_norm": 1.6391899585723877, + "learning_rate": 4.973770790393511e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8811629414558411, + "num_tokens": 27357965.0, + "step": 3340 + }, + { + "entropy": 0.42625300884246825, + "epoch": 0.5340038314176245, + "grad_norm": 1.614808440208435, + "learning_rate": 4.973692361841096e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8686782002449036, + "num_tokens": 27398925.0, + "step": 3345 + }, + { + "entropy": 0.4034900009632111, + "epoch": 0.5348020434227331, + "grad_norm": 1.578659176826477, + "learning_rate": 4.973613817036555e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8695542097091675, + "num_tokens": 27439885.0, + "step": 3350 + }, + { + "entropy": 0.344242250919342, + "epoch": 0.5356002554278416, + "grad_norm": 1.570183277130127, + "learning_rate": 4.973535155984829e-06, + "loss": 0.3006, + "mean_token_accuracy": 0.8920693874359131, + "num_tokens": 27480845.0, + "step": 3355 + }, + { + "entropy": 0.3859936952590942, + "epoch": 0.5363984674329502, + "grad_norm": 1.7761328220367432, + "learning_rate": 4.973456378690864e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8740076899528504, + "num_tokens": 27521805.0, + "step": 3360 + }, + { + "entropy": 0.4139319658279419, + "epoch": 0.5371966794380587, + "grad_norm": 1.5927379131317139, + "learning_rate": 4.973377485159612e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8684604287147522, + "num_tokens": 27562765.0, + "step": 3365 + }, + { + "entropy": 0.37344526648521426, + "epoch": 0.5379948914431673, + "grad_norm": 1.5802881717681885, + "learning_rate": 4.973298475396037e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.884388267993927, + "num_tokens": 27603725.0, + "step": 3370 + }, + { + "entropy": 0.4616575360298157, + "epoch": 0.5387931034482759, + "grad_norm": 1.5784549713134766, + "learning_rate": 4.973219349405104e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8549342632293702, + "num_tokens": 27644685.0, + "step": 3375 + }, + { + "entropy": 0.40048813819885254, + "epoch": 0.5395913154533845, + "grad_norm": 1.5036301612854004, + "learning_rate": 4.973140107191792e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8724414825439453, + "num_tokens": 27685645.0, + "step": 3380 + }, + { + "entropy": 0.38270660638809206, + "epoch": 0.540389527458493, + "grad_norm": 1.6607155799865723, + "learning_rate": 4.973060748761081e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8778811931610108, + "num_tokens": 27726605.0, + "step": 3385 + }, + { + "entropy": 0.35836416482925415, + "epoch": 0.5411877394636015, + "grad_norm": 1.6206783056259155, + "learning_rate": 4.972981274117965e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8881120204925537, + "num_tokens": 27767565.0, + "step": 3390 + }, + { + "entropy": 0.38160343170166017, + "epoch": 0.5419859514687101, + "grad_norm": 1.6077865362167358, + "learning_rate": 4.9729016832674385e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8760847568511962, + "num_tokens": 27808525.0, + "step": 3395 + }, + { + "entropy": 0.40834884643554686, + "epoch": 0.5427841634738186, + "grad_norm": 2.050952911376953, + "learning_rate": 4.972821976214507e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.87137770652771, + "num_tokens": 27849485.0, + "step": 3400 + }, + { + "epoch": 0.5427841634738186, + "eval_entropy": 0.40098182344436645, + "eval_loss": 0.35640591382980347, + "eval_mean_token_accuracy": 0.8753156752586365, + "eval_num_tokens": 27849485.0, + "eval_runtime": 69.2629, + "eval_samples_per_second": 14.438, + "eval_steps_per_second": 1.805, + "step": 3400 + }, + { + "entropy": 0.3933412194252014, + "epoch": 0.5435823754789272, + "grad_norm": 1.3825479745864868, + "learning_rate": 4.972742152964184e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8757568240165711, + "num_tokens": 27890445.0, + "step": 3405 + }, + { + "entropy": 0.3877404689788818, + "epoch": 0.5443805874840357, + "grad_norm": 1.7192943096160889, + "learning_rate": 4.9726622135214876e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8776248097419739, + "num_tokens": 27931405.0, + "step": 3410 + }, + { + "entropy": 0.4037211060523987, + "epoch": 0.5451787994891443, + "grad_norm": 1.6639846563339233, + "learning_rate": 4.9725821578914454e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8690607786178589, + "num_tokens": 27972365.0, + "step": 3415 + }, + { + "entropy": 0.39761547446250917, + "epoch": 0.5459770114942529, + "grad_norm": 1.7780673503875732, + "learning_rate": 4.972501986079093e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8701196074485779, + "num_tokens": 28013325.0, + "step": 3420 + }, + { + "entropy": 0.374272894859314, + "epoch": 0.5467752234993615, + "grad_norm": 1.5579581260681152, + "learning_rate": 4.972421698089469e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8811684727668763, + "num_tokens": 28054285.0, + "step": 3425 + }, + { + "entropy": 0.37306315898895265, + "epoch": 0.54757343550447, + "grad_norm": 1.443179965019226, + "learning_rate": 4.9723412939276235e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8828484654426575, + "num_tokens": 28095245.0, + "step": 3430 + }, + { + "entropy": 0.3839456558227539, + "epoch": 0.5483716475095786, + "grad_norm": 1.6152342557907104, + "learning_rate": 4.972260773598614e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8802782416343689, + "num_tokens": 28136205.0, + "step": 3435 + }, + { + "entropy": 0.34850752353668213, + "epoch": 0.5491698595146871, + "grad_norm": 1.3699902296066284, + "learning_rate": 4.972180137107502e-06, + "loss": 0.3023, + "mean_token_accuracy": 0.8939025521278381, + "num_tokens": 28177165.0, + "step": 3440 + }, + { + "entropy": 0.42212756276130675, + "epoch": 0.5499680715197957, + "grad_norm": 1.4698554277420044, + "learning_rate": 4.9720993844593575e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8676527976989746, + "num_tokens": 28218125.0, + "step": 3445 + }, + { + "entropy": 0.3826135993003845, + "epoch": 0.5507662835249042, + "grad_norm": 1.751415729522705, + "learning_rate": 4.972018515659261e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8748811244964599, + "num_tokens": 28259085.0, + "step": 3450 + }, + { + "entropy": 0.3984654664993286, + "epoch": 0.5515644955300127, + "grad_norm": 1.5529251098632812, + "learning_rate": 4.971937530712297e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8713862895965576, + "num_tokens": 28300045.0, + "step": 3455 + }, + { + "entropy": 0.43848037123680117, + "epoch": 0.5523627075351213, + "grad_norm": 1.7217085361480713, + "learning_rate": 4.971856429623557e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8617102026939392, + "num_tokens": 28341005.0, + "step": 3460 + }, + { + "entropy": 0.4245146453380585, + "epoch": 0.5531609195402298, + "grad_norm": 1.6098260879516602, + "learning_rate": 4.971775212398143e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8645444631576538, + "num_tokens": 28381965.0, + "step": 3465 + }, + { + "entropy": 0.3518525302410126, + "epoch": 0.5539591315453385, + "grad_norm": 1.5915374755859375, + "learning_rate": 4.971693879041159e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8894816517829895, + "num_tokens": 28422925.0, + "step": 3470 + }, + { + "entropy": 0.3965813875198364, + "epoch": 0.554757343550447, + "grad_norm": 1.6396276950836182, + "learning_rate": 4.971612429557722e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8739223599433898, + "num_tokens": 28463885.0, + "step": 3475 + }, + { + "entropy": 0.40137404203414917, + "epoch": 0.5555555555555556, + "grad_norm": 1.5679564476013184, + "learning_rate": 4.971530863952952e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8753439903259277, + "num_tokens": 28504845.0, + "step": 3480 + }, + { + "entropy": 0.37675178050994873, + "epoch": 0.5563537675606641, + "grad_norm": 1.4812265634536743, + "learning_rate": 4.971449182231981e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8853613138198853, + "num_tokens": 28545805.0, + "step": 3485 + }, + { + "entropy": 0.4472045123577118, + "epoch": 0.5571519795657727, + "grad_norm": 1.7498054504394531, + "learning_rate": 4.971367384399944e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8568356394767761, + "num_tokens": 28586765.0, + "step": 3490 + }, + { + "entropy": 0.3870035171508789, + "epoch": 0.5579501915708812, + "grad_norm": 1.7072486877441406, + "learning_rate": 4.971285470461984e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8799459099769592, + "num_tokens": 28627725.0, + "step": 3495 + }, + { + "entropy": 0.3909365952014923, + "epoch": 0.5587484035759898, + "grad_norm": 1.6284451484680176, + "learning_rate": 4.971203440423252e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8747354984283447, + "num_tokens": 28668685.0, + "step": 3500 + }, + { + "entropy": 0.37294856905937196, + "epoch": 0.5595466155810983, + "grad_norm": 1.5810928344726562, + "learning_rate": 4.971121294288907e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8826012730598449, + "num_tokens": 28709645.0, + "step": 3505 + }, + { + "entropy": 0.381207799911499, + "epoch": 0.5603448275862069, + "grad_norm": 1.5926982164382935, + "learning_rate": 4.971039032064114e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8776193737983704, + "num_tokens": 28750605.0, + "step": 3510 + }, + { + "entropy": 0.36891435384750365, + "epoch": 0.5611430395913155, + "grad_norm": 1.635933756828308, + "learning_rate": 4.970956653754047e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8841464638710022, + "num_tokens": 28791565.0, + "step": 3515 + }, + { + "entropy": 0.40799201130867, + "epoch": 0.561941251596424, + "grad_norm": 1.3482568264007568, + "learning_rate": 4.970874159363886e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8733816742897034, + "num_tokens": 28832525.0, + "step": 3520 + }, + { + "entropy": 0.3592789232730865, + "epoch": 0.5627394636015326, + "grad_norm": 1.4991364479064941, + "learning_rate": 4.970791548898818e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8870156288146973, + "num_tokens": 28873485.0, + "step": 3525 + }, + { + "entropy": 0.37787640690803526, + "epoch": 0.5635376756066411, + "grad_norm": 1.7729026079177856, + "learning_rate": 4.9707088223640375e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8828484416007996, + "num_tokens": 28914445.0, + "step": 3530 + }, + { + "entropy": 0.3990558385848999, + "epoch": 0.5643358876117497, + "grad_norm": 1.6233949661254883, + "learning_rate": 4.970625979764747e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.873016095161438, + "num_tokens": 28955405.0, + "step": 3535 + }, + { + "entropy": 0.391368693113327, + "epoch": 0.5651340996168582, + "grad_norm": 1.5034476518630981, + "learning_rate": 4.970543021106156e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8756026506423951, + "num_tokens": 28996365.0, + "step": 3540 + }, + { + "entropy": 0.38302173614501955, + "epoch": 0.5659323116219668, + "grad_norm": 1.5422619581222534, + "learning_rate": 4.970459946393482e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.879080057144165, + "num_tokens": 29037325.0, + "step": 3545 + }, + { + "entropy": 0.37263641953468324, + "epoch": 0.5667305236270753, + "grad_norm": 1.5199742317199707, + "learning_rate": 4.970376755631948e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.8836228370666503, + "num_tokens": 29078285.0, + "step": 3550 + }, + { + "entropy": 0.32374128699302673, + "epoch": 0.5675287356321839, + "grad_norm": 1.549601435661316, + "learning_rate": 4.970293448826786e-06, + "loss": 0.2848, + "mean_token_accuracy": 0.8980387449264526, + "num_tokens": 29119245.0, + "step": 3555 + }, + { + "entropy": 0.36209123730659487, + "epoch": 0.5683269476372924, + "grad_norm": 1.5335177183151245, + "learning_rate": 4.970210025983234e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8839428782463074, + "num_tokens": 29160205.0, + "step": 3560 + }, + { + "entropy": 0.41070149540901185, + "epoch": 0.5691251596424011, + "grad_norm": 1.475563645362854, + "learning_rate": 4.970126487106537e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8688075542449951, + "num_tokens": 29201165.0, + "step": 3565 + }, + { + "entropy": 0.37354364395141604, + "epoch": 0.5699233716475096, + "grad_norm": 1.6093169450759888, + "learning_rate": 4.970042832201951e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8846863627433776, + "num_tokens": 29242125.0, + "step": 3570 + }, + { + "entropy": 0.39829763770103455, + "epoch": 0.5707215836526182, + "grad_norm": 1.666699767112732, + "learning_rate": 4.969959061274734e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8715177774429321, + "num_tokens": 29283085.0, + "step": 3575 + }, + { + "entropy": 0.3605415463447571, + "epoch": 0.5715197956577267, + "grad_norm": 1.4592363834381104, + "learning_rate": 4.969875174330155e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8872054219245911, + "num_tokens": 29324045.0, + "step": 3580 + }, + { + "entropy": 0.38993356823921205, + "epoch": 0.5723180076628352, + "grad_norm": 1.775950312614441, + "learning_rate": 4.969791171373488e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8806413054466248, + "num_tokens": 29365005.0, + "step": 3585 + }, + { + "entropy": 0.3733646094799042, + "epoch": 0.5731162196679438, + "grad_norm": 1.5953619480133057, + "learning_rate": 4.969707052410016e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.884143841266632, + "num_tokens": 29405965.0, + "step": 3590 + }, + { + "entropy": 0.39912459850311277, + "epoch": 0.5739144316730523, + "grad_norm": 1.7145378589630127, + "learning_rate": 4.96962281744503e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8690350651741028, + "num_tokens": 29446925.0, + "step": 3595 + }, + { + "entropy": 0.367926698923111, + "epoch": 0.5747126436781609, + "grad_norm": 1.452913522720337, + "learning_rate": 4.969538466483826e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8845783591270446, + "num_tokens": 29487885.0, + "step": 3600 + }, + { + "epoch": 0.5747126436781609, + "eval_entropy": 0.39141595339775087, + "eval_loss": 0.35446324944496155, + "eval_mean_token_accuracy": 0.8758203558921814, + "eval_num_tokens": 29487885.0, + "eval_runtime": 69.2621, + "eval_samples_per_second": 14.438, + "eval_steps_per_second": 1.805, + "step": 3600 + }, + { + "entropy": 0.3865828216075897, + "epoch": 0.5755108556832694, + "grad_norm": 1.5825527906417847, + "learning_rate": 4.969453999531707e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.877928352355957, + "num_tokens": 29528845.0, + "step": 3605 + }, + { + "entropy": 0.35882670879364015, + "epoch": 0.5763090676883781, + "grad_norm": 1.3376578092575073, + "learning_rate": 4.969369416593987e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8848472118377686, + "num_tokens": 29569805.0, + "step": 3610 + }, + { + "entropy": 0.373634397983551, + "epoch": 0.5771072796934866, + "grad_norm": 1.5386207103729248, + "learning_rate": 4.969284717675983e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8834493279457092, + "num_tokens": 29610765.0, + "step": 3615 + }, + { + "entropy": 0.35085774064064024, + "epoch": 0.5779054916985952, + "grad_norm": 1.5583409070968628, + "learning_rate": 4.9691999027830215e-06, + "loss": 0.3109, + "mean_token_accuracy": 0.8899118542671204, + "num_tokens": 29651712.0, + "step": 3620 + }, + { + "entropy": 0.4144057631492615, + "epoch": 0.5787037037037037, + "grad_norm": 1.6685035228729248, + "learning_rate": 4.969114971920436e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8672501444816589, + "num_tokens": 29692672.0, + "step": 3625 + }, + { + "entropy": 0.39991188049316406, + "epoch": 0.5795019157088123, + "grad_norm": 1.4954372644424438, + "learning_rate": 4.969029925093568e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8725775599479675, + "num_tokens": 29733632.0, + "step": 3630 + }, + { + "entropy": 0.40609376430511473, + "epoch": 0.5803001277139208, + "grad_norm": 1.5095274448394775, + "learning_rate": 4.968944762307764e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.871772575378418, + "num_tokens": 29774592.0, + "step": 3635 + }, + { + "entropy": 0.3526857793331146, + "epoch": 0.5810983397190294, + "grad_norm": 1.5209416151046753, + "learning_rate": 4.968859483568382e-06, + "loss": 0.3077, + "mean_token_accuracy": 0.8915246844291687, + "num_tokens": 29815552.0, + "step": 3640 + }, + { + "entropy": 0.393011087179184, + "epoch": 0.5818965517241379, + "grad_norm": 1.6194604635238647, + "learning_rate": 4.96877408888078e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8710348963737488, + "num_tokens": 29856512.0, + "step": 3645 + }, + { + "entropy": 0.4141296446323395, + "epoch": 0.5826947637292464, + "grad_norm": 1.655099868774414, + "learning_rate": 4.968688578250333e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8657617568969727, + "num_tokens": 29897472.0, + "step": 3650 + }, + { + "entropy": 0.35265921950340273, + "epoch": 0.583492975734355, + "grad_norm": 1.6760668754577637, + "learning_rate": 4.968602951682415e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8901681065559387, + "num_tokens": 29938432.0, + "step": 3655 + }, + { + "entropy": 0.3939144194126129, + "epoch": 0.5842911877394636, + "grad_norm": 1.6867501735687256, + "learning_rate": 4.968517209182412e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8784684062004089, + "num_tokens": 29979392.0, + "step": 3660 + }, + { + "entropy": 0.4020268201828003, + "epoch": 0.5850893997445722, + "grad_norm": 1.7138423919677734, + "learning_rate": 4.968431350755716e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8738431334495544, + "num_tokens": 30020352.0, + "step": 3665 + }, + { + "entropy": 0.38859901428222654, + "epoch": 0.5858876117496807, + "grad_norm": 1.564834475517273, + "learning_rate": 4.968345376407724e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8791564702987671, + "num_tokens": 30061312.0, + "step": 3670 + }, + { + "entropy": 0.3994689702987671, + "epoch": 0.5866858237547893, + "grad_norm": 1.8667218685150146, + "learning_rate": 4.968259286143844e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8723411798477173, + "num_tokens": 30102272.0, + "step": 3675 + }, + { + "entropy": 0.40553962588310244, + "epoch": 0.5874840357598978, + "grad_norm": 1.5423719882965088, + "learning_rate": 4.96817307996949e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8713503837585449, + "num_tokens": 30143232.0, + "step": 3680 + }, + { + "entropy": 0.39541796445846555, + "epoch": 0.5882822477650064, + "grad_norm": 1.7450207471847534, + "learning_rate": 4.968086757890082e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8758617758750915, + "num_tokens": 30184192.0, + "step": 3685 + }, + { + "entropy": 0.4130732357501984, + "epoch": 0.5890804597701149, + "grad_norm": 1.8434921503067017, + "learning_rate": 4.968000319911049e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8648181915283203, + "num_tokens": 30225152.0, + "step": 3690 + }, + { + "entropy": 0.3264343738555908, + "epoch": 0.5898786717752235, + "grad_norm": 1.4113878011703491, + "learning_rate": 4.967913766037825e-06, + "loss": 0.2857, + "mean_token_accuracy": 0.9002414584159851, + "num_tokens": 30266112.0, + "step": 3695 + }, + { + "entropy": 0.3778840720653534, + "epoch": 0.590676883780332, + "grad_norm": 1.3939766883850098, + "learning_rate": 4.967827096275854e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8849563717842102, + "num_tokens": 30307072.0, + "step": 3700 + }, + { + "entropy": 0.3920683264732361, + "epoch": 0.5914750957854407, + "grad_norm": 1.552365779876709, + "learning_rate": 4.967740310630587e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8735494256019593, + "num_tokens": 30348032.0, + "step": 3705 + }, + { + "entropy": 0.3946221828460693, + "epoch": 0.5922733077905492, + "grad_norm": 1.7382375001907349, + "learning_rate": 4.96765340910748e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8761067390441895, + "num_tokens": 30388992.0, + "step": 3710 + }, + { + "entropy": 0.390900981426239, + "epoch": 0.5930715197956578, + "grad_norm": 1.4038467407226562, + "learning_rate": 4.9675663917119975e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8776496767997741, + "num_tokens": 30429952.0, + "step": 3715 + }, + { + "entropy": 0.4173464775085449, + "epoch": 0.5938697318007663, + "grad_norm": 1.4674110412597656, + "learning_rate": 4.967479258449612e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8679859876632691, + "num_tokens": 30470912.0, + "step": 3720 + }, + { + "entropy": 0.397634357213974, + "epoch": 0.5946679438058748, + "grad_norm": 1.523329734802246, + "learning_rate": 4.967392009325803e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8761757016181946, + "num_tokens": 30511872.0, + "step": 3725 + }, + { + "entropy": 0.3922095954418182, + "epoch": 0.5954661558109834, + "grad_norm": 1.778900384902954, + "learning_rate": 4.967304644346056e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8769558191299438, + "num_tokens": 30552832.0, + "step": 3730 + }, + { + "entropy": 0.39952392578125, + "epoch": 0.5962643678160919, + "grad_norm": 1.4663424491882324, + "learning_rate": 4.967217163515866e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8723309278488159, + "num_tokens": 30593792.0, + "step": 3735 + }, + { + "entropy": 0.4193924903869629, + "epoch": 0.5970625798212005, + "grad_norm": 1.5840364694595337, + "learning_rate": 4.9671295668407346e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8661716938018799, + "num_tokens": 30634752.0, + "step": 3740 + }, + { + "entropy": 0.3981863796710968, + "epoch": 0.597860791826309, + "grad_norm": 1.7133623361587524, + "learning_rate": 4.9670418543261674e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.869674825668335, + "num_tokens": 30675712.0, + "step": 3745 + }, + { + "entropy": 0.4032014787197113, + "epoch": 0.5986590038314177, + "grad_norm": 1.6495577096939087, + "learning_rate": 4.966954025977683e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8763142943382263, + "num_tokens": 30716672.0, + "step": 3750 + }, + { + "entropy": 0.39403932094573973, + "epoch": 0.5994572158365262, + "grad_norm": 1.4911088943481445, + "learning_rate": 4.966866081800803e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8772829055786133, + "num_tokens": 30757632.0, + "step": 3755 + }, + { + "entropy": 0.395513778924942, + "epoch": 0.6002554278416348, + "grad_norm": 1.5269211530685425, + "learning_rate": 4.966778021801058e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8761088252067566, + "num_tokens": 30798592.0, + "step": 3760 + }, + { + "entropy": 0.3795560717582703, + "epoch": 0.6010536398467433, + "grad_norm": 1.6205265522003174, + "learning_rate": 4.966689845983985e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8801708817481995, + "num_tokens": 30839552.0, + "step": 3765 + }, + { + "entropy": 0.3631418526172638, + "epoch": 0.6018518518518519, + "grad_norm": 1.3927897214889526, + "learning_rate": 4.966601554355129e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.884639322757721, + "num_tokens": 30880512.0, + "step": 3770 + }, + { + "entropy": 0.4569432020187378, + "epoch": 0.6026500638569604, + "grad_norm": 1.4657988548278809, + "learning_rate": 4.966513146920044e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8528200626373291, + "num_tokens": 30921472.0, + "step": 3775 + }, + { + "entropy": 0.3831614673137665, + "epoch": 0.603448275862069, + "grad_norm": 1.5960118770599365, + "learning_rate": 4.966424623684285e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8764102578163147, + "num_tokens": 30962432.0, + "step": 3780 + }, + { + "entropy": 0.393023943901062, + "epoch": 0.6042464878671775, + "grad_norm": 1.5771610736846924, + "learning_rate": 4.966335984653423e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8748709559440613, + "num_tokens": 31003392.0, + "step": 3785 + }, + { + "entropy": 0.3829107344150543, + "epoch": 0.605044699872286, + "grad_norm": 1.5862095355987549, + "learning_rate": 4.966247229833029e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8772776603698731, + "num_tokens": 31044352.0, + "step": 3790 + }, + { + "entropy": 0.40039422512054446, + "epoch": 0.6058429118773946, + "grad_norm": 1.9182721376419067, + "learning_rate": 4.9661583592286864e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8720365285873413, + "num_tokens": 31085312.0, + "step": 3795 + }, + { + "entropy": 0.39912710785865785, + "epoch": 0.6066411238825032, + "grad_norm": 1.7170418500900269, + "learning_rate": 4.966069372845982e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8749265432357788, + "num_tokens": 31126272.0, + "step": 3800 + }, + { + "epoch": 0.6066411238825032, + "eval_entropy": 0.3909790225028992, + "eval_loss": 0.35294920206069946, + "eval_mean_token_accuracy": 0.8761810812950134, + "eval_num_tokens": 31126272.0, + "eval_runtime": 69.2385, + "eval_samples_per_second": 14.443, + "eval_steps_per_second": 1.805, + "step": 3800 + }, + { + "entropy": 0.420094895362854, + "epoch": 0.6074393358876118, + "grad_norm": 1.5807826519012451, + "learning_rate": 4.9659802706905125e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8649389863014221, + "num_tokens": 31167232.0, + "step": 3805 + }, + { + "entropy": 0.37068371176719667, + "epoch": 0.6082375478927203, + "grad_norm": 1.5241514444351196, + "learning_rate": 4.965891052767881e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8824189662933349, + "num_tokens": 31208192.0, + "step": 3810 + }, + { + "entropy": 0.43431633710861206, + "epoch": 0.6090357598978289, + "grad_norm": 1.515228509902954, + "learning_rate": 4.965801719083697e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8619320034980774, + "num_tokens": 31249152.0, + "step": 3815 + }, + { + "entropy": 0.4021922886371613, + "epoch": 0.6098339719029374, + "grad_norm": 1.5357593297958374, + "learning_rate": 4.965712269643578e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.873773992061615, + "num_tokens": 31290112.0, + "step": 3820 + }, + { + "entropy": 0.34870944619178773, + "epoch": 0.610632183908046, + "grad_norm": 1.3976689577102661, + "learning_rate": 4.9656227044531505e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8884419322013855, + "num_tokens": 31331072.0, + "step": 3825 + }, + { + "entropy": 0.3720965325832367, + "epoch": 0.6114303959131545, + "grad_norm": 1.5985755920410156, + "learning_rate": 4.965533023518046e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8805681109428406, + "num_tokens": 31372032.0, + "step": 3830 + }, + { + "entropy": 0.35963661074638364, + "epoch": 0.6122286079182631, + "grad_norm": 1.347583532333374, + "learning_rate": 4.965443226843903e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8867804765701294, + "num_tokens": 31412992.0, + "step": 3835 + }, + { + "entropy": 0.38517152070999144, + "epoch": 0.6130268199233716, + "grad_norm": 1.4462475776672363, + "learning_rate": 4.965353314436368e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8791717529296875, + "num_tokens": 31453952.0, + "step": 3840 + }, + { + "entropy": 0.32794575691223143, + "epoch": 0.6138250319284803, + "grad_norm": 1.6038838624954224, + "learning_rate": 4.965263286301097e-06, + "loss": 0.2911, + "mean_token_accuracy": 0.8971529364585876, + "num_tokens": 31494912.0, + "step": 3845 + }, + { + "entropy": 0.38298609256744387, + "epoch": 0.6146232439335888, + "grad_norm": 1.4308631420135498, + "learning_rate": 4.96517314244375e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8804044246673584, + "num_tokens": 31535872.0, + "step": 3850 + }, + { + "entropy": 0.4029498934745789, + "epoch": 0.6154214559386973, + "grad_norm": 1.6430120468139648, + "learning_rate": 4.965082882869996e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8700898885726929, + "num_tokens": 31576832.0, + "step": 3855 + }, + { + "entropy": 0.3919563353061676, + "epoch": 0.6162196679438059, + "grad_norm": 1.7798478603363037, + "learning_rate": 4.96499250758551e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8769108176231384, + "num_tokens": 31617792.0, + "step": 3860 + }, + { + "entropy": 0.3522378861904144, + "epoch": 0.6170178799489144, + "grad_norm": 1.5173027515411377, + "learning_rate": 4.964902016595976e-06, + "loss": 0.3017, + "mean_token_accuracy": 0.8938339829444886, + "num_tokens": 31658752.0, + "step": 3865 + }, + { + "entropy": 0.36898383498191833, + "epoch": 0.617816091954023, + "grad_norm": 1.6988641023635864, + "learning_rate": 4.964811409907084e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8814649105072021, + "num_tokens": 31699712.0, + "step": 3870 + }, + { + "entropy": 0.40858992338180544, + "epoch": 0.6186143039591315, + "grad_norm": 1.7161165475845337, + "learning_rate": 4.9647206875245305e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8708085536956787, + "num_tokens": 31740672.0, + "step": 3875 + }, + { + "entropy": 0.409778368473053, + "epoch": 0.6194125159642401, + "grad_norm": 1.6546376943588257, + "learning_rate": 4.964629849454022e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.870085597038269, + "num_tokens": 31781632.0, + "step": 3880 + }, + { + "entropy": 0.4049523532390594, + "epoch": 0.6202107279693486, + "grad_norm": 1.3990082740783691, + "learning_rate": 4.964538895701272e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8743181347846984, + "num_tokens": 31822592.0, + "step": 3885 + }, + { + "entropy": 0.4319458603858948, + "epoch": 0.6210089399744572, + "grad_norm": 1.7687848806381226, + "learning_rate": 4.964447826271997e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8653620839118957, + "num_tokens": 31863552.0, + "step": 3890 + }, + { + "entropy": 0.41185548305511477, + "epoch": 0.6218071519795658, + "grad_norm": 1.4894161224365234, + "learning_rate": 4.964356641171925e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8705492496490479, + "num_tokens": 31904512.0, + "step": 3895 + }, + { + "entropy": 0.3748886942863464, + "epoch": 0.6226053639846744, + "grad_norm": 1.609660029411316, + "learning_rate": 4.964265340406789e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8815550446510315, + "num_tokens": 31945472.0, + "step": 3900 + }, + { + "entropy": 0.40155192017555236, + "epoch": 0.6234035759897829, + "grad_norm": 1.4394019842147827, + "learning_rate": 4.964173923982334e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8742201566696167, + "num_tokens": 31986432.0, + "step": 3905 + }, + { + "entropy": 0.4126347959041595, + "epoch": 0.6242017879948915, + "grad_norm": 1.5783936977386475, + "learning_rate": 4.964082391904305e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8702286601066589, + "num_tokens": 32027392.0, + "step": 3910 + }, + { + "entropy": 0.3948510766029358, + "epoch": 0.625, + "grad_norm": 1.6075055599212646, + "learning_rate": 4.963990744178458e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.874882161617279, + "num_tokens": 32068352.0, + "step": 3915 + }, + { + "entropy": 0.40940110087394715, + "epoch": 0.6257982120051085, + "grad_norm": 1.6805075407028198, + "learning_rate": 4.963898980810557e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.864269745349884, + "num_tokens": 32109312.0, + "step": 3920 + }, + { + "entropy": 0.3619593560695648, + "epoch": 0.6265964240102171, + "grad_norm": 1.6504465341567993, + "learning_rate": 4.963807101806373e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8852601170539856, + "num_tokens": 32150272.0, + "step": 3925 + }, + { + "entropy": 0.38621042370796205, + "epoch": 0.6273946360153256, + "grad_norm": 1.7133197784423828, + "learning_rate": 4.963715107171683e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.879948091506958, + "num_tokens": 32191232.0, + "step": 3930 + }, + { + "entropy": 0.38483706712722776, + "epoch": 0.6281928480204342, + "grad_norm": 1.431302785873413, + "learning_rate": 4.963622996912272e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8818876624107361, + "num_tokens": 32232192.0, + "step": 3935 + }, + { + "entropy": 0.37900099754333494, + "epoch": 0.6289910600255428, + "grad_norm": 1.4894486665725708, + "learning_rate": 4.963530771033931e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8806081175804138, + "num_tokens": 32273152.0, + "step": 3940 + }, + { + "entropy": 0.3738178789615631, + "epoch": 0.6297892720306514, + "grad_norm": 1.5760552883148193, + "learning_rate": 4.963438429542461e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.883822786808014, + "num_tokens": 32314112.0, + "step": 3945 + }, + { + "entropy": 0.3659499764442444, + "epoch": 0.6305874840357599, + "grad_norm": 2.1062514781951904, + "learning_rate": 4.96334597244367e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8866788744926453, + "num_tokens": 32355072.0, + "step": 3950 + }, + { + "entropy": 0.37738183736801145, + "epoch": 0.6313856960408685, + "grad_norm": 1.7195035219192505, + "learning_rate": 4.963253399743368e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8807390928268433, + "num_tokens": 32396032.0, + "step": 3955 + }, + { + "entropy": 0.3469261348247528, + "epoch": 0.632183908045977, + "grad_norm": 1.5165008306503296, + "learning_rate": 4.9631607114473804e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8905919075012207, + "num_tokens": 32436992.0, + "step": 3960 + }, + { + "entropy": 0.36719757318496704, + "epoch": 0.6329821200510856, + "grad_norm": 1.4283252954483032, + "learning_rate": 4.963067907561534e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.881699550151825, + "num_tokens": 32477952.0, + "step": 3965 + }, + { + "entropy": 0.3427632570266724, + "epoch": 0.6337803320561941, + "grad_norm": 1.526005506515503, + "learning_rate": 4.962974988091664e-06, + "loss": 0.3026, + "mean_token_accuracy": 0.8933268189430237, + "num_tokens": 32518912.0, + "step": 3970 + }, + { + "entropy": 0.36962045431137086, + "epoch": 0.6345785440613027, + "grad_norm": 1.4629669189453125, + "learning_rate": 4.962881953043614e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8862110733985901, + "num_tokens": 32559850.0, + "step": 3975 + }, + { + "entropy": 0.3889750599861145, + "epoch": 0.6353767560664112, + "grad_norm": 1.4501713514328003, + "learning_rate": 4.962788802423236e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8809821367263794, + "num_tokens": 32600746.0, + "step": 3980 + }, + { + "entropy": 0.375089293718338, + "epoch": 0.6361749680715197, + "grad_norm": 1.5705161094665527, + "learning_rate": 4.962695536236385e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.880041527748108, + "num_tokens": 32641706.0, + "step": 3985 + }, + { + "entropy": 0.31232019066810607, + "epoch": 0.6369731800766284, + "grad_norm": 1.4807438850402832, + "learning_rate": 4.962602154488927e-06, + "loss": 0.2771, + "mean_token_accuracy": 0.9007895469665528, + "num_tokens": 32682666.0, + "step": 3990 + }, + { + "entropy": 0.358897465467453, + "epoch": 0.6377713920817369, + "grad_norm": 1.5279514789581299, + "learning_rate": 4.962508657186734e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8849632740020752, + "num_tokens": 32723626.0, + "step": 3995 + }, + { + "entropy": 0.3594439685344696, + "epoch": 0.6385696040868455, + "grad_norm": 1.6460912227630615, + "learning_rate": 4.962415044335687e-06, + "loss": 0.3132, + "mean_token_accuracy": 0.8889756202697754, + "num_tokens": 32764586.0, + "step": 4000 + }, + { + "epoch": 0.6385696040868455, + "eval_entropy": 0.3917965919971466, + "eval_loss": 0.3518121838569641, + "eval_mean_token_accuracy": 0.8764794664382934, + "eval_num_tokens": 32764586.0, + "eval_runtime": 69.2274, + "eval_samples_per_second": 14.445, + "eval_steps_per_second": 1.806, + "step": 4000 + }, + { + "entropy": 0.38373176455497743, + "epoch": 0.639367816091954, + "grad_norm": 1.583543300628662, + "learning_rate": 4.96232131594167e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8794745564460754, + "num_tokens": 32805546.0, + "step": 4005 + }, + { + "entropy": 0.3702171742916107, + "epoch": 0.6401660280970626, + "grad_norm": 1.4130009412765503, + "learning_rate": 4.962227472010579e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8848128080368042, + "num_tokens": 32846506.0, + "step": 4010 + }, + { + "entropy": 0.4183795750141144, + "epoch": 0.6409642401021711, + "grad_norm": 1.8898104429244995, + "learning_rate": 4.962133512548314e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8680359721183777, + "num_tokens": 32887466.0, + "step": 4015 + }, + { + "entropy": 0.34782321453094484, + "epoch": 0.6417624521072797, + "grad_norm": 1.4358257055282593, + "learning_rate": 4.962039437560785e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8888446807861328, + "num_tokens": 32928426.0, + "step": 4020 + }, + { + "entropy": 0.38004317283630373, + "epoch": 0.6425606641123882, + "grad_norm": 1.6532775163650513, + "learning_rate": 4.961945247053906e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8797279477119446, + "num_tokens": 32969386.0, + "step": 4025 + }, + { + "entropy": 0.3863183081150055, + "epoch": 0.6433588761174968, + "grad_norm": 1.4063210487365723, + "learning_rate": 4.9618509410336015e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8798073172569275, + "num_tokens": 33010346.0, + "step": 4030 + }, + { + "entropy": 0.37526068091392517, + "epoch": 0.6441570881226054, + "grad_norm": 1.416603922843933, + "learning_rate": 4.961756519505801e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8806898117065429, + "num_tokens": 33051306.0, + "step": 4035 + }, + { + "entropy": 0.3895104885101318, + "epoch": 0.644955300127714, + "grad_norm": 1.463862419128418, + "learning_rate": 4.9616619824764414e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8775312304496765, + "num_tokens": 33092266.0, + "step": 4040 + }, + { + "entropy": 0.35795719623565675, + "epoch": 0.6457535121328225, + "grad_norm": 1.5325723886489868, + "learning_rate": 4.961567329951469e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8871722459793091, + "num_tokens": 33133226.0, + "step": 4045 + }, + { + "entropy": 0.3536506354808807, + "epoch": 0.646551724137931, + "grad_norm": 1.5098241567611694, + "learning_rate": 4.961472561936834e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.8913549780845642, + "num_tokens": 33174186.0, + "step": 4050 + }, + { + "entropy": 0.3890280067920685, + "epoch": 0.6473499361430396, + "grad_norm": 1.5493937730789185, + "learning_rate": 4.961377678438498e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8763817191123963, + "num_tokens": 33215146.0, + "step": 4055 + }, + { + "entropy": 0.36107209920883176, + "epoch": 0.6481481481481481, + "grad_norm": 1.5306475162506104, + "learning_rate": 4.961282679462427e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8859923839569092, + "num_tokens": 33256106.0, + "step": 4060 + }, + { + "entropy": 0.3484861731529236, + "epoch": 0.6489463601532567, + "grad_norm": 1.5191750526428223, + "learning_rate": 4.961187565014593e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.8896730542182922, + "num_tokens": 33297066.0, + "step": 4065 + }, + { + "entropy": 0.39441166520118714, + "epoch": 0.6497445721583652, + "grad_norm": 1.5315167903900146, + "learning_rate": 4.961092335100979e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.873190951347351, + "num_tokens": 33338026.0, + "step": 4070 + }, + { + "entropy": 0.35233972668647767, + "epoch": 0.6505427841634738, + "grad_norm": 1.319646954536438, + "learning_rate": 4.960996989727574e-06, + "loss": 0.3026, + "mean_token_accuracy": 0.8912659168243409, + "num_tokens": 33378986.0, + "step": 4075 + }, + { + "entropy": 0.4316498875617981, + "epoch": 0.6513409961685823, + "grad_norm": 1.7222440242767334, + "learning_rate": 4.960901528900371e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.866232967376709, + "num_tokens": 33419946.0, + "step": 4080 + }, + { + "entropy": 0.40433109998703004, + "epoch": 0.652139208173691, + "grad_norm": 1.6289095878601074, + "learning_rate": 4.960805952625374e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8714369535446167, + "num_tokens": 33460906.0, + "step": 4085 + }, + { + "entropy": 0.34908521771430967, + "epoch": 0.6529374201787995, + "grad_norm": 1.2474722862243652, + "learning_rate": 4.960710260908595e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.8889854311943054, + "num_tokens": 33501866.0, + "step": 4090 + }, + { + "entropy": 0.3515977382659912, + "epoch": 0.6537356321839081, + "grad_norm": 1.4092243909835815, + "learning_rate": 4.96061445375605e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8876409888267517, + "num_tokens": 33542826.0, + "step": 4095 + }, + { + "entropy": 0.39068559408187864, + "epoch": 0.6545338441890166, + "grad_norm": 1.7305725812911987, + "learning_rate": 4.960518531173763e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8775494694709778, + "num_tokens": 33583786.0, + "step": 4100 + }, + { + "entropy": 0.4475154936313629, + "epoch": 0.6553320561941252, + "grad_norm": 1.5801259279251099, + "learning_rate": 4.960422493167767e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8554537773132325, + "num_tokens": 33624746.0, + "step": 4105 + }, + { + "entropy": 0.3612799346446991, + "epoch": 0.6561302681992337, + "grad_norm": 1.3840538263320923, + "learning_rate": 4.9603263397441e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.885944414138794, + "num_tokens": 33665706.0, + "step": 4110 + }, + { + "entropy": 0.37169676423072817, + "epoch": 0.6569284802043422, + "grad_norm": 1.5911798477172852, + "learning_rate": 4.96023007090881e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8856913924217225, + "num_tokens": 33706666.0, + "step": 4115 + }, + { + "entropy": 0.3725708842277527, + "epoch": 0.6577266922094508, + "grad_norm": 1.5351768732070923, + "learning_rate": 4.96013368666795e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8846683621406555, + "num_tokens": 33747626.0, + "step": 4120 + }, + { + "entropy": 0.35295900106430056, + "epoch": 0.6585249042145593, + "grad_norm": 1.6065768003463745, + "learning_rate": 4.960037187027581e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.8901387929916382, + "num_tokens": 33788586.0, + "step": 4125 + }, + { + "entropy": 0.40923756957054136, + "epoch": 0.659323116219668, + "grad_norm": 1.6720138788223267, + "learning_rate": 4.959940571993771e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8710006952285767, + "num_tokens": 33829546.0, + "step": 4130 + }, + { + "entropy": 0.39302995800971985, + "epoch": 0.6601213282247765, + "grad_norm": 1.5660480260849, + "learning_rate": 4.959843841572596e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8789271354675293, + "num_tokens": 33870506.0, + "step": 4135 + }, + { + "entropy": 0.39711377024650574, + "epoch": 0.6609195402298851, + "grad_norm": 1.6311522722244263, + "learning_rate": 4.959746995770137e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8728868365287781, + "num_tokens": 33911466.0, + "step": 4140 + }, + { + "entropy": 0.38717801570892335, + "epoch": 0.6617177522349936, + "grad_norm": 1.5107231140136719, + "learning_rate": 4.959650034592487e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8788737893104553, + "num_tokens": 33952426.0, + "step": 4145 + }, + { + "entropy": 0.39666436314582826, + "epoch": 0.6625159642401022, + "grad_norm": 1.5432357788085938, + "learning_rate": 4.959552958045742e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8723564863204956, + "num_tokens": 33993386.0, + "step": 4150 + }, + { + "entropy": 0.3947413682937622, + "epoch": 0.6633141762452107, + "grad_norm": 1.6945823431015015, + "learning_rate": 4.959455766136005e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8749089002609253, + "num_tokens": 34034346.0, + "step": 4155 + }, + { + "entropy": 0.37710251808166506, + "epoch": 0.6641123882503193, + "grad_norm": 1.5467009544372559, + "learning_rate": 4.95935845886939e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.885505223274231, + "num_tokens": 34075306.0, + "step": 4160 + }, + { + "entropy": 0.3872054398059845, + "epoch": 0.6649106002554278, + "grad_norm": 1.6126255989074707, + "learning_rate": 4.959261036252014e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.880678677558899, + "num_tokens": 34116266.0, + "step": 4165 + }, + { + "entropy": 0.38808879256248474, + "epoch": 0.6657088122605364, + "grad_norm": 1.415348768234253, + "learning_rate": 4.959163498290004e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8778594136238098, + "num_tokens": 34157226.0, + "step": 4170 + }, + { + "entropy": 0.41391260027885435, + "epoch": 0.666507024265645, + "grad_norm": 1.713935375213623, + "learning_rate": 4.9590658449894944e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8717985391616822, + "num_tokens": 34198186.0, + "step": 4175 + }, + { + "entropy": 0.3717846214771271, + "epoch": 0.6673052362707536, + "grad_norm": 1.655297875404358, + "learning_rate": 4.958968076356625e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8843464732170105, + "num_tokens": 34239146.0, + "step": 4180 + }, + { + "entropy": 0.4275781691074371, + "epoch": 0.6681034482758621, + "grad_norm": 1.537449598312378, + "learning_rate": 4.958870192397544e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.86600261926651, + "num_tokens": 34280106.0, + "step": 4185 + }, + { + "entropy": 0.369396436214447, + "epoch": 0.6689016602809706, + "grad_norm": 1.7176984548568726, + "learning_rate": 4.958772193118408e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8829890727996826, + "num_tokens": 34321066.0, + "step": 4190 + }, + { + "entropy": 0.37659146189689635, + "epoch": 0.6696998722860792, + "grad_norm": 1.522311806678772, + "learning_rate": 4.958674078525378e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.88204345703125, + "num_tokens": 34362026.0, + "step": 4195 + }, + { + "entropy": 0.40196934938430784, + "epoch": 0.6704980842911877, + "grad_norm": 1.57537043094635, + "learning_rate": 4.958575848624624e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8716495633125305, + "num_tokens": 34402986.0, + "step": 4200 + }, + { + "epoch": 0.6704980842911877, + "eval_entropy": 0.39062203240394594, + "eval_loss": 0.34974205493927, + "eval_mean_token_accuracy": 0.877336347579956, + "eval_num_tokens": 34402986.0, + "eval_runtime": 69.2668, + "eval_samples_per_second": 14.437, + "eval_steps_per_second": 1.805, + "step": 4200 + }, + { + "entropy": 0.41640692949295044, + "epoch": 0.6712962962962963, + "grad_norm": 1.7407722473144531, + "learning_rate": 4.9584775034223224e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8696022033691406, + "num_tokens": 34443946.0, + "step": 4205 + }, + { + "entropy": 0.4036651015281677, + "epoch": 0.6720945083014048, + "grad_norm": 1.6700903177261353, + "learning_rate": 4.958379042924658e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8763973951339722, + "num_tokens": 34484906.0, + "step": 4210 + }, + { + "entropy": 0.4063118636608124, + "epoch": 0.6728927203065134, + "grad_norm": 1.66812002658844, + "learning_rate": 4.958280467137824e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8714771866798401, + "num_tokens": 34525866.0, + "step": 4215 + }, + { + "entropy": 0.3816158056259155, + "epoch": 0.6736909323116219, + "grad_norm": 1.613732099533081, + "learning_rate": 4.958181776068017e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8783071279525757, + "num_tokens": 34566826.0, + "step": 4220 + }, + { + "entropy": 0.4148737370967865, + "epoch": 0.6744891443167306, + "grad_norm": 1.5773872137069702, + "learning_rate": 4.958082969721444e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8704238414764405, + "num_tokens": 34607786.0, + "step": 4225 + }, + { + "entropy": 0.405037522315979, + "epoch": 0.6752873563218391, + "grad_norm": 1.681789755821228, + "learning_rate": 4.957984048104318e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8717342495918274, + "num_tokens": 34648746.0, + "step": 4230 + }, + { + "entropy": 0.39438945055007935, + "epoch": 0.6760855683269477, + "grad_norm": 1.8257066011428833, + "learning_rate": 4.957885011222859e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.879536759853363, + "num_tokens": 34689706.0, + "step": 4235 + }, + { + "entropy": 0.3599630892276764, + "epoch": 0.6768837803320562, + "grad_norm": 1.4572027921676636, + "learning_rate": 4.957785859083297e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8867843866348266, + "num_tokens": 34730666.0, + "step": 4240 + }, + { + "entropy": 0.37403408288955686, + "epoch": 0.6776819923371648, + "grad_norm": 1.614817500114441, + "learning_rate": 4.957686591691864e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8855644464492798, + "num_tokens": 34771626.0, + "step": 4245 + }, + { + "entropy": 0.41073269248008726, + "epoch": 0.6784802043422733, + "grad_norm": 1.6293586492538452, + "learning_rate": 4.957587209054804e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8730408787727356, + "num_tokens": 34812586.0, + "step": 4250 + }, + { + "entropy": 0.4003384351730347, + "epoch": 0.6792784163473818, + "grad_norm": 1.5987569093704224, + "learning_rate": 4.957487711178366e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8723024964332581, + "num_tokens": 34853546.0, + "step": 4255 + }, + { + "entropy": 0.39334952235221865, + "epoch": 0.6800766283524904, + "grad_norm": 1.5379856824874878, + "learning_rate": 4.957388098068808e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8757344603538513, + "num_tokens": 34894312.0, + "step": 4260 + }, + { + "entropy": 0.38406250476837156, + "epoch": 0.6808748403575989, + "grad_norm": 1.5372836589813232, + "learning_rate": 4.9572883697323926e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8771077871322632, + "num_tokens": 34935272.0, + "step": 4265 + }, + { + "entropy": 0.38156905174255373, + "epoch": 0.6816730523627076, + "grad_norm": 1.669527292251587, + "learning_rate": 4.957188526175391e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8789000630378723, + "num_tokens": 34976232.0, + "step": 4270 + }, + { + "entropy": 0.36010006070137024, + "epoch": 0.6824712643678161, + "grad_norm": 1.5515540838241577, + "learning_rate": 4.957088567404082e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8882894873619079, + "num_tokens": 35017192.0, + "step": 4275 + }, + { + "entropy": 0.3551903784275055, + "epoch": 0.6832694763729247, + "grad_norm": 1.6245423555374146, + "learning_rate": 4.956988493424753e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8872243881225585, + "num_tokens": 35058152.0, + "step": 4280 + }, + { + "entropy": 0.35940446257591246, + "epoch": 0.6840676883780332, + "grad_norm": 1.5272520780563354, + "learning_rate": 4.956888304243695e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8874533534049988, + "num_tokens": 35099112.0, + "step": 4285 + }, + { + "entropy": 0.3756145477294922, + "epoch": 0.6848659003831418, + "grad_norm": 1.5808496475219727, + "learning_rate": 4.9567879998672075e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8830928564071655, + "num_tokens": 35140072.0, + "step": 4290 + }, + { + "entropy": 0.37835639119148257, + "epoch": 0.6856641123882503, + "grad_norm": 1.5594112873077393, + "learning_rate": 4.9566875803016e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8826068878173828, + "num_tokens": 35181032.0, + "step": 4295 + }, + { + "entropy": 0.39985790848731995, + "epoch": 0.6864623243933589, + "grad_norm": 1.4914906024932861, + "learning_rate": 4.956587045553186e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8740491151809693, + "num_tokens": 35221992.0, + "step": 4300 + }, + { + "entropy": 0.4111567497253418, + "epoch": 0.6872605363984674, + "grad_norm": 1.6156212091445923, + "learning_rate": 4.956486395628289e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8734665513038635, + "num_tokens": 35262952.0, + "step": 4305 + }, + { + "entropy": 0.4254221498966217, + "epoch": 0.688058748403576, + "grad_norm": 1.5075019598007202, + "learning_rate": 4.956385630533236e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8709608435630798, + "num_tokens": 35303912.0, + "step": 4310 + }, + { + "entropy": 0.3952524304389954, + "epoch": 0.6888569604086845, + "grad_norm": 1.6715117692947388, + "learning_rate": 4.956284750274366e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8758527517318726, + "num_tokens": 35344872.0, + "step": 4315 + }, + { + "entropy": 0.3389691412448883, + "epoch": 0.6896551724137931, + "grad_norm": 1.405334711074829, + "learning_rate": 4.95618375485802e-06, + "loss": 0.3019, + "mean_token_accuracy": 0.8922526597976684, + "num_tokens": 35385832.0, + "step": 4320 + }, + { + "entropy": 0.3625007688999176, + "epoch": 0.6904533844189017, + "grad_norm": 1.6644231081008911, + "learning_rate": 4.956082644290551e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8843298196792603, + "num_tokens": 35426792.0, + "step": 4325 + }, + { + "entropy": 0.3890696942806244, + "epoch": 0.6912515964240102, + "grad_norm": 1.4922293424606323, + "learning_rate": 4.955981418578316e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8751874804496765, + "num_tokens": 35467752.0, + "step": 4330 + }, + { + "entropy": 0.3943796694278717, + "epoch": 0.6920498084291188, + "grad_norm": 1.3469480276107788, + "learning_rate": 4.955880077727681e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8783192157745361, + "num_tokens": 35508712.0, + "step": 4335 + }, + { + "entropy": 0.3815250813961029, + "epoch": 0.6928480204342273, + "grad_norm": 1.6829841136932373, + "learning_rate": 4.955778621745019e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.878368365764618, + "num_tokens": 35549672.0, + "step": 4340 + }, + { + "entropy": 0.3641778290271759, + "epoch": 0.6936462324393359, + "grad_norm": 1.3170735836029053, + "learning_rate": 4.955677050636709e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8862357020378113, + "num_tokens": 35590632.0, + "step": 4345 + }, + { + "entropy": 0.3830491900444031, + "epoch": 0.6944444444444444, + "grad_norm": 1.4673370122909546, + "learning_rate": 4.955575364409138e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8813378095626831, + "num_tokens": 35631592.0, + "step": 4350 + }, + { + "entropy": 0.3623965919017792, + "epoch": 0.695242656449553, + "grad_norm": 1.958211898803711, + "learning_rate": 4.955473563068702e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8858277082443238, + "num_tokens": 35672552.0, + "step": 4355 + }, + { + "entropy": 0.38546149134635926, + "epoch": 0.6960408684546615, + "grad_norm": 1.5618401765823364, + "learning_rate": 4.955371646621801e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8819298505783081, + "num_tokens": 35713512.0, + "step": 4360 + }, + { + "entropy": 0.37305532693862914, + "epoch": 0.6968390804597702, + "grad_norm": 1.4023091793060303, + "learning_rate": 4.955269615074843e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8842706322669983, + "num_tokens": 35754472.0, + "step": 4365 + }, + { + "entropy": 0.38944450616836546, + "epoch": 0.6976372924648787, + "grad_norm": 1.6398652791976929, + "learning_rate": 4.955167468434247e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8770965456962585, + "num_tokens": 35795432.0, + "step": 4370 + }, + { + "entropy": 0.41156930327415464, + "epoch": 0.6984355044699873, + "grad_norm": 1.5604636669158936, + "learning_rate": 4.955065206706435e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8737661957740783, + "num_tokens": 35836392.0, + "step": 4375 + }, + { + "entropy": 0.3764748632907867, + "epoch": 0.6992337164750958, + "grad_norm": 1.3062576055526733, + "learning_rate": 4.954962829897838e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8827279210090637, + "num_tokens": 35877352.0, + "step": 4380 + }, + { + "entropy": 0.3629325807094574, + "epoch": 0.7000319284802043, + "grad_norm": 1.4810214042663574, + "learning_rate": 4.954860338014892e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8866893887519837, + "num_tokens": 35918312.0, + "step": 4385 + }, + { + "entropy": 0.3667100667953491, + "epoch": 0.7008301404853129, + "grad_norm": 1.7354620695114136, + "learning_rate": 4.954757731064044e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8835196852684021, + "num_tokens": 35959272.0, + "step": 4390 + }, + { + "entropy": 0.3856470465660095, + "epoch": 0.7016283524904214, + "grad_norm": 1.5140351057052612, + "learning_rate": 4.954655009051745e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8756252408027649, + "num_tokens": 36000232.0, + "step": 4395 + }, + { + "entropy": 0.4114512622356415, + "epoch": 0.70242656449553, + "grad_norm": 1.543448805809021, + "learning_rate": 4.954552171984455e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.867775559425354, + "num_tokens": 36041192.0, + "step": 4400 + }, + { + "epoch": 0.70242656449553, + "eval_entropy": 0.39369331550598147, + "eval_loss": 0.34902307391166687, + "eval_mean_token_accuracy": 0.8771976776123047, + "eval_num_tokens": 36041192.0, + "eval_runtime": 69.12, + "eval_samples_per_second": 14.468, + "eval_steps_per_second": 1.808, + "step": 4400 + }, + { + "entropy": 0.39357762932777407, + "epoch": 0.7032247765006385, + "grad_norm": 1.480932354927063, + "learning_rate": 4.95444921986864e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8758068084716797, + "num_tokens": 36082152.0, + "step": 4405 + }, + { + "entropy": 0.36230148673057555, + "epoch": 0.7040229885057471, + "grad_norm": 1.4368892908096313, + "learning_rate": 4.9543461527107765e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8867041110992432, + "num_tokens": 36123112.0, + "step": 4410 + }, + { + "entropy": 0.4771185517311096, + "epoch": 0.7048212005108557, + "grad_norm": 1.6853086948394775, + "learning_rate": 4.954242970517343e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8508485913276672, + "num_tokens": 36164072.0, + "step": 4415 + }, + { + "entropy": 0.37011151313781737, + "epoch": 0.7056194125159643, + "grad_norm": 1.3787293434143066, + "learning_rate": 4.954139673294828e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8846035480499268, + "num_tokens": 36205032.0, + "step": 4420 + }, + { + "entropy": 0.38793652057647704, + "epoch": 0.7064176245210728, + "grad_norm": 1.5404571294784546, + "learning_rate": 4.95403626104973e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8788623690605164, + "num_tokens": 36245992.0, + "step": 4425 + }, + { + "entropy": 0.3592794477939606, + "epoch": 0.7072158365261814, + "grad_norm": 1.6141693592071533, + "learning_rate": 4.95393273378855e-06, + "loss": 0.316, + "mean_token_accuracy": 0.888990044593811, + "num_tokens": 36286952.0, + "step": 4430 + }, + { + "entropy": 0.36488319635391236, + "epoch": 0.7080140485312899, + "grad_norm": 1.6408461332321167, + "learning_rate": 4.953829091517797e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8837617397308349, + "num_tokens": 36327912.0, + "step": 4435 + }, + { + "entropy": 0.3899082064628601, + "epoch": 0.7088122605363985, + "grad_norm": 1.3915435075759888, + "learning_rate": 4.95372533424399e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8771253943443298, + "num_tokens": 36368872.0, + "step": 4440 + }, + { + "entropy": 0.3807647466659546, + "epoch": 0.709610472541507, + "grad_norm": 1.6155505180358887, + "learning_rate": 4.953621461973653e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8812530994415283, + "num_tokens": 36409832.0, + "step": 4445 + }, + { + "entropy": 0.41307615041732787, + "epoch": 0.7104086845466155, + "grad_norm": 1.70521080493927, + "learning_rate": 4.953517474713318e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8698232889175415, + "num_tokens": 36450792.0, + "step": 4450 + }, + { + "entropy": 0.3725646257400513, + "epoch": 0.7112068965517241, + "grad_norm": 1.4554929733276367, + "learning_rate": 4.9534133724695244e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8859057426452637, + "num_tokens": 36491752.0, + "step": 4455 + }, + { + "entropy": 0.37655861377716066, + "epoch": 0.7120051085568327, + "grad_norm": 1.519578456878662, + "learning_rate": 4.953309155248818e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.880521821975708, + "num_tokens": 36532712.0, + "step": 4460 + }, + { + "entropy": 0.3744112551212311, + "epoch": 0.7128033205619413, + "grad_norm": 1.5445661544799805, + "learning_rate": 4.953204823057752e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8822221159934998, + "num_tokens": 36573672.0, + "step": 4465 + }, + { + "entropy": 0.42416757345199585, + "epoch": 0.7136015325670498, + "grad_norm": 1.565629005432129, + "learning_rate": 4.953100375902889e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8664975762367249, + "num_tokens": 36614632.0, + "step": 4470 + }, + { + "entropy": 0.4182154297828674, + "epoch": 0.7143997445721584, + "grad_norm": 1.5452251434326172, + "learning_rate": 4.952995813790795e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8647769689559937, + "num_tokens": 36655592.0, + "step": 4475 + }, + { + "entropy": 0.3633765935897827, + "epoch": 0.7151979565772669, + "grad_norm": 1.2990319728851318, + "learning_rate": 4.9528911367280465e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8850704431533813, + "num_tokens": 36696552.0, + "step": 4480 + }, + { + "entropy": 0.384858101606369, + "epoch": 0.7159961685823755, + "grad_norm": 1.4177272319793701, + "learning_rate": 4.952786344721225e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8788316130638123, + "num_tokens": 36737512.0, + "step": 4485 + }, + { + "entropy": 0.4165548741817474, + "epoch": 0.716794380587484, + "grad_norm": 1.6013543605804443, + "learning_rate": 4.95268143777692e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8671042084693908, + "num_tokens": 36778472.0, + "step": 4490 + }, + { + "entropy": 0.34506208300590513, + "epoch": 0.7175925925925926, + "grad_norm": 1.5734069347381592, + "learning_rate": 4.95257641590173e-06, + "loss": 0.3009, + "mean_token_accuracy": 0.8941628456115722, + "num_tokens": 36819432.0, + "step": 4495 + }, + { + "entropy": 0.3837789297103882, + "epoch": 0.7183908045977011, + "grad_norm": 1.5752099752426147, + "learning_rate": 4.9524712791022565e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8817675113677979, + "num_tokens": 36860392.0, + "step": 4500 + }, + { + "entropy": 0.3931439518928528, + "epoch": 0.7191890166028098, + "grad_norm": 1.6178399324417114, + "learning_rate": 4.952366027385114e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8789139747619629, + "num_tokens": 36901352.0, + "step": 4505 + }, + { + "entropy": 0.39552693963050845, + "epoch": 0.7199872286079183, + "grad_norm": 1.5380029678344727, + "learning_rate": 4.952260660756919e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8769156217575074, + "num_tokens": 36942312.0, + "step": 4510 + }, + { + "entropy": 0.4134032607078552, + "epoch": 0.7207854406130269, + "grad_norm": 1.5812859535217285, + "learning_rate": 4.952155179224298e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8702471852302551, + "num_tokens": 36983272.0, + "step": 4515 + }, + { + "entropy": 0.388623321056366, + "epoch": 0.7215836526181354, + "grad_norm": 1.444392204284668, + "learning_rate": 4.952049582793884e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8805589318275452, + "num_tokens": 37024232.0, + "step": 4520 + }, + { + "entropy": 0.357010555267334, + "epoch": 0.7223818646232439, + "grad_norm": 1.4659650325775146, + "learning_rate": 4.951943871472317e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8911778092384338, + "num_tokens": 37065192.0, + "step": 4525 + }, + { + "entropy": 0.379272598028183, + "epoch": 0.7231800766283525, + "grad_norm": 1.5867300033569336, + "learning_rate": 4.951838045266244e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.8777178883552551, + "num_tokens": 37106152.0, + "step": 4530 + }, + { + "entropy": 0.37308881282806394, + "epoch": 0.723978288633461, + "grad_norm": 1.279022455215454, + "learning_rate": 4.951732104182321e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8849388480186462, + "num_tokens": 37147112.0, + "step": 4535 + }, + { + "entropy": 0.42798325419425964, + "epoch": 0.7247765006385696, + "grad_norm": 1.603519082069397, + "learning_rate": 4.95162604822721e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8669981360435486, + "num_tokens": 37188072.0, + "step": 4540 + }, + { + "entropy": 0.3764370262622833, + "epoch": 0.7255747126436781, + "grad_norm": 1.557897686958313, + "learning_rate": 4.951519877407579e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.88344247341156, + "num_tokens": 37229032.0, + "step": 4545 + }, + { + "entropy": 0.3716597259044647, + "epoch": 0.7263729246487867, + "grad_norm": 1.5863113403320312, + "learning_rate": 4.951413591730104e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8820095300674439, + "num_tokens": 37269992.0, + "step": 4550 + }, + { + "entropy": 0.41372754573822024, + "epoch": 0.7271711366538953, + "grad_norm": 1.6460305452346802, + "learning_rate": 4.95130719120147e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.871246612071991, + "num_tokens": 37310952.0, + "step": 4555 + }, + { + "entropy": 0.4125048518180847, + "epoch": 0.7279693486590039, + "grad_norm": 1.271487832069397, + "learning_rate": 4.951200675828368e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8682136774063111, + "num_tokens": 37351912.0, + "step": 4560 + }, + { + "entropy": 0.3915758430957794, + "epoch": 0.7287675606641124, + "grad_norm": 1.6986738443374634, + "learning_rate": 4.951094045617495e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8767178058624268, + "num_tokens": 37392872.0, + "step": 4565 + }, + { + "entropy": 0.3821511447429657, + "epoch": 0.729565772669221, + "grad_norm": 1.677868366241455, + "learning_rate": 4.950987300575557e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8818759441375732, + "num_tokens": 37433832.0, + "step": 4570 + }, + { + "entropy": 0.39576552510261537, + "epoch": 0.7303639846743295, + "grad_norm": 1.4657864570617676, + "learning_rate": 4.950880440709266e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8784390687942505, + "num_tokens": 37474792.0, + "step": 4575 + }, + { + "entropy": 0.34899569153785703, + "epoch": 0.731162196679438, + "grad_norm": 1.3611983060836792, + "learning_rate": 4.950773466025342e-06, + "loss": 0.308, + "mean_token_accuracy": 0.8884916543960572, + "num_tokens": 37515752.0, + "step": 4580 + }, + { + "entropy": 0.3783299565315247, + "epoch": 0.7319604086845466, + "grad_norm": 1.5644665956497192, + "learning_rate": 4.950666376530511e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.884823739528656, + "num_tokens": 37556712.0, + "step": 4585 + }, + { + "entropy": 0.3779717743396759, + "epoch": 0.7327586206896551, + "grad_norm": 1.4334276914596558, + "learning_rate": 4.950559172231508e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.878784453868866, + "num_tokens": 37597672.0, + "step": 4590 + }, + { + "entropy": 0.4076384246349335, + "epoch": 0.7335568326947637, + "grad_norm": 1.5926692485809326, + "learning_rate": 4.950451853135075e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8714548945426941, + "num_tokens": 37638632.0, + "step": 4595 + }, + { + "entropy": 0.4062245607376099, + "epoch": 0.7343550446998723, + "grad_norm": 1.5608155727386475, + "learning_rate": 4.95034441924796e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8716461777687072, + "num_tokens": 37679592.0, + "step": 4600 + }, + { + "epoch": 0.7343550446998723, + "eval_entropy": 0.3939617915153503, + "eval_loss": 0.34785565733909607, + "eval_mean_token_accuracy": 0.8777779283523559, + "eval_num_tokens": 37679592.0, + "eval_runtime": 69.3074, + "eval_samples_per_second": 14.428, + "eval_steps_per_second": 1.804, + "step": 4600 + }, + { + "entropy": 0.3811860024929047, + "epoch": 0.7351532567049809, + "grad_norm": 1.516083002090454, + "learning_rate": 4.950236870576917e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.882855236530304, + "num_tokens": 37720552.0, + "step": 4605 + }, + { + "entropy": 0.4058854401111603, + "epoch": 0.7359514687100894, + "grad_norm": 1.468159794807434, + "learning_rate": 4.9501292071287134e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8699978351593017, + "num_tokens": 37761512.0, + "step": 4610 + }, + { + "entropy": 0.35052935481071473, + "epoch": 0.736749680715198, + "grad_norm": 1.3547862768173218, + "learning_rate": 4.950021428910114e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8876534581184388, + "num_tokens": 37802472.0, + "step": 4615 + }, + { + "entropy": 0.39081133604049684, + "epoch": 0.7375478927203065, + "grad_norm": 1.598197102546692, + "learning_rate": 4.949913535927901e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8756549954414368, + "num_tokens": 37843432.0, + "step": 4620 + }, + { + "entropy": 0.3502376556396484, + "epoch": 0.7383461047254151, + "grad_norm": 1.2976590394973755, + "learning_rate": 4.949805528188857e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.8896740078926086, + "num_tokens": 37884392.0, + "step": 4625 + }, + { + "entropy": 0.3819392502307892, + "epoch": 0.7391443167305236, + "grad_norm": 1.6175434589385986, + "learning_rate": 4.949697405699774e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8793386459350586, + "num_tokens": 37925352.0, + "step": 4630 + }, + { + "entropy": 0.3609308242797852, + "epoch": 0.7399425287356322, + "grad_norm": 1.6114457845687866, + "learning_rate": 4.949589168467451e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8883061528205871, + "num_tokens": 37966312.0, + "step": 4635 + }, + { + "entropy": 0.40381277799606324, + "epoch": 0.7407407407407407, + "grad_norm": 1.5138126611709595, + "learning_rate": 4.949480816498694e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.875266969203949, + "num_tokens": 38007272.0, + "step": 4640 + }, + { + "entropy": 0.38398600816726686, + "epoch": 0.7415389527458492, + "grad_norm": 1.5208909511566162, + "learning_rate": 4.949372349800317e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8778509616851806, + "num_tokens": 38048232.0, + "step": 4645 + }, + { + "entropy": 0.3965888023376465, + "epoch": 0.7423371647509579, + "grad_norm": 1.5762202739715576, + "learning_rate": 4.949263768379141e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8777368307113648, + "num_tokens": 38089192.0, + "step": 4650 + }, + { + "entropy": 0.3549324214458466, + "epoch": 0.7431353767560664, + "grad_norm": 1.760347843170166, + "learning_rate": 4.949155072241994e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8862186551094056, + "num_tokens": 38130152.0, + "step": 4655 + }, + { + "entropy": 0.35094980597496034, + "epoch": 0.743933588761175, + "grad_norm": 1.3990799188613892, + "learning_rate": 4.949046261395711e-06, + "loss": 0.308, + "mean_token_accuracy": 0.8917679905891418, + "num_tokens": 38171112.0, + "step": 4660 + }, + { + "entropy": 0.38255144357681276, + "epoch": 0.7447318007662835, + "grad_norm": 1.5849868059158325, + "learning_rate": 4.948937335847135e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8809654116630554, + "num_tokens": 38212072.0, + "step": 4665 + }, + { + "entropy": 0.39698757529258727, + "epoch": 0.7455300127713921, + "grad_norm": 1.5714181661605835, + "learning_rate": 4.948828295603114e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8753175854682922, + "num_tokens": 38253032.0, + "step": 4670 + }, + { + "entropy": 0.4402578055858612, + "epoch": 0.7463282247765006, + "grad_norm": 1.7324473857879639, + "learning_rate": 4.948719140670506e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8609963536262513, + "num_tokens": 38293992.0, + "step": 4675 + }, + { + "entropy": 0.3861755609512329, + "epoch": 0.7471264367816092, + "grad_norm": 1.4474149942398071, + "learning_rate": 4.948609871056175e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8831609845161438, + "num_tokens": 38334952.0, + "step": 4680 + }, + { + "entropy": 0.39228718876838686, + "epoch": 0.7479246487867177, + "grad_norm": 1.720402717590332, + "learning_rate": 4.948500486766991e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.876913046836853, + "num_tokens": 38375577.0, + "step": 4685 + }, + { + "entropy": 0.3946023941040039, + "epoch": 0.7487228607918263, + "grad_norm": 1.6547355651855469, + "learning_rate": 4.948390987809836e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8741081357002258, + "num_tokens": 38416537.0, + "step": 4690 + }, + { + "entropy": 0.39651496410369874, + "epoch": 0.7495210727969349, + "grad_norm": 1.4734386205673218, + "learning_rate": 4.9482813741915905e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.872340977191925, + "num_tokens": 38457497.0, + "step": 4695 + }, + { + "entropy": 0.36745116114616394, + "epoch": 0.7503192848020435, + "grad_norm": 1.674511432647705, + "learning_rate": 4.948171645919152e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8847783923149108, + "num_tokens": 38498457.0, + "step": 4700 + }, + { + "entropy": 0.3797731637954712, + "epoch": 0.751117496807152, + "grad_norm": 1.4765043258666992, + "learning_rate": 4.948061802999418e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8845394730567933, + "num_tokens": 38539417.0, + "step": 4705 + }, + { + "entropy": 0.38238744139671327, + "epoch": 0.7519157088122606, + "grad_norm": 1.5898044109344482, + "learning_rate": 4.947951845439296e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8808130502700806, + "num_tokens": 38580377.0, + "step": 4710 + }, + { + "entropy": 0.430251544713974, + "epoch": 0.7527139208173691, + "grad_norm": 1.7207558155059814, + "learning_rate": 4.9478417732457015e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8684833407402038, + "num_tokens": 38621337.0, + "step": 4715 + }, + { + "entropy": 0.36144039034843445, + "epoch": 0.7535121328224776, + "grad_norm": 1.4919703006744385, + "learning_rate": 4.947731586425555e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8856022119522095, + "num_tokens": 38662297.0, + "step": 4720 + }, + { + "entropy": 0.33562816977500914, + "epoch": 0.7543103448275862, + "grad_norm": 1.5665706396102905, + "learning_rate": 4.9476212849857875e-06, + "loss": 0.295, + "mean_token_accuracy": 0.8960858225822449, + "num_tokens": 38703257.0, + "step": 4725 + }, + { + "entropy": 0.3331297695636749, + "epoch": 0.7551085568326947, + "grad_norm": 1.2354601621627808, + "learning_rate": 4.947510868933333e-06, + "loss": 0.2909, + "mean_token_accuracy": 0.8963336110115051, + "num_tokens": 38744217.0, + "step": 4730 + }, + { + "entropy": 0.3897834599018097, + "epoch": 0.7559067688378033, + "grad_norm": 1.7782258987426758, + "learning_rate": 4.947400338275135e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8771867871284484, + "num_tokens": 38785177.0, + "step": 4735 + }, + { + "entropy": 0.37567186951637266, + "epoch": 0.7567049808429118, + "grad_norm": 1.7346822023391724, + "learning_rate": 4.947289693018145e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8829320907592774, + "num_tokens": 38826137.0, + "step": 4740 + }, + { + "entropy": 0.3920513987541199, + "epoch": 0.7575031928480205, + "grad_norm": 1.7325843572616577, + "learning_rate": 4.9471789331693206e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8777777433395386, + "num_tokens": 38867097.0, + "step": 4745 + }, + { + "entropy": 0.36579321026802064, + "epoch": 0.758301404853129, + "grad_norm": 1.4960813522338867, + "learning_rate": 4.9470680587356265e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8874636650085449, + "num_tokens": 38908057.0, + "step": 4750 + }, + { + "entropy": 0.3968794882297516, + "epoch": 0.7590996168582376, + "grad_norm": 1.4070299863815308, + "learning_rate": 4.9469570697240355e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8750940442085267, + "num_tokens": 38949017.0, + "step": 4755 + }, + { + "entropy": 0.3771853268146515, + "epoch": 0.7598978288633461, + "grad_norm": 1.4297999143600464, + "learning_rate": 4.9468459661415255e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.881128466129303, + "num_tokens": 38989977.0, + "step": 4760 + }, + { + "entropy": 0.3986180305480957, + "epoch": 0.7606960408684547, + "grad_norm": 1.511413335800171, + "learning_rate": 4.9467347479950845e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8775883197784424, + "num_tokens": 39030937.0, + "step": 4765 + }, + { + "entropy": 0.3891195774078369, + "epoch": 0.7614942528735632, + "grad_norm": 1.6162978410720825, + "learning_rate": 4.9466234152917056e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8752928972244263, + "num_tokens": 39071897.0, + "step": 4770 + }, + { + "entropy": 0.3707997024059296, + "epoch": 0.7622924648786717, + "grad_norm": 1.4906481504440308, + "learning_rate": 4.94651196803839e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8805507779121399, + "num_tokens": 39112857.0, + "step": 4775 + }, + { + "entropy": 0.3961538434028625, + "epoch": 0.7630906768837803, + "grad_norm": 1.5734567642211914, + "learning_rate": 4.946400406242147e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8761051297187805, + "num_tokens": 39153817.0, + "step": 4780 + }, + { + "entropy": 0.3866142988204956, + "epoch": 0.7638888888888888, + "grad_norm": 1.5984771251678467, + "learning_rate": 4.946288729909989e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8792457580566406, + "num_tokens": 39194777.0, + "step": 4785 + }, + { + "entropy": 0.41122140884399416, + "epoch": 0.7646871008939975, + "grad_norm": 1.4738880395889282, + "learning_rate": 4.94617693904894e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8724663138389588, + "num_tokens": 39235737.0, + "step": 4790 + }, + { + "entropy": 0.37069701552391054, + "epoch": 0.765485312899106, + "grad_norm": 1.4634195566177368, + "learning_rate": 4.946065033666032e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8836962461471558, + "num_tokens": 39276697.0, + "step": 4795 + }, + { + "entropy": 0.4035445749759674, + "epoch": 0.7662835249042146, + "grad_norm": 1.467256784439087, + "learning_rate": 4.945953013768299e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8714822649955749, + "num_tokens": 39317657.0, + "step": 4800 + }, + { + "epoch": 0.7662835249042146, + "eval_entropy": 0.3911557722091675, + "eval_loss": 0.3467390835285187, + "eval_mean_token_accuracy": 0.8777855606079101, + "eval_num_tokens": 39317657.0, + "eval_runtime": 69.2296, + "eval_samples_per_second": 14.445, + "eval_steps_per_second": 1.806, + "step": 4800 + }, + { + "entropy": 0.35707170963287355, + "epoch": 0.7670817369093231, + "grad_norm": 1.3541243076324463, + "learning_rate": 4.9458408793627875e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.889268672466278, + "num_tokens": 39358617.0, + "step": 4805 + }, + { + "entropy": 0.37148754596710204, + "epoch": 0.7678799489144317, + "grad_norm": 1.415036678314209, + "learning_rate": 4.945728630456546e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8829362630844116, + "num_tokens": 39399577.0, + "step": 4810 + }, + { + "entropy": 0.3493272364139557, + "epoch": 0.7686781609195402, + "grad_norm": 1.5002624988555908, + "learning_rate": 4.945616267056636e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.8897889256477356, + "num_tokens": 39440537.0, + "step": 4815 + }, + { + "entropy": 0.37543088793754575, + "epoch": 0.7694763729246488, + "grad_norm": 1.3540980815887451, + "learning_rate": 4.945503789170123e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8817245364189148, + "num_tokens": 39481497.0, + "step": 4820 + }, + { + "entropy": 0.39512808322906495, + "epoch": 0.7702745849297573, + "grad_norm": 1.615538239479065, + "learning_rate": 4.945391196804078e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8730616807937622, + "num_tokens": 39522457.0, + "step": 4825 + }, + { + "entropy": 0.3639287889003754, + "epoch": 0.7710727969348659, + "grad_norm": 1.5440593957901, + "learning_rate": 4.945278489965583e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.887544846534729, + "num_tokens": 39563417.0, + "step": 4830 + }, + { + "entropy": 0.3264977991580963, + "epoch": 0.7718710089399745, + "grad_norm": 1.3868415355682373, + "learning_rate": 4.945165668661724e-06, + "loss": 0.2889, + "mean_token_accuracy": 0.8973047494888305, + "num_tokens": 39604377.0, + "step": 4835 + }, + { + "entropy": 0.3820899188518524, + "epoch": 0.7726692209450831, + "grad_norm": 1.5096489191055298, + "learning_rate": 4.945052732899597e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.882480239868164, + "num_tokens": 39645337.0, + "step": 4840 + }, + { + "entropy": 0.3705361783504486, + "epoch": 0.7734674329501916, + "grad_norm": 1.3227615356445312, + "learning_rate": 4.944939682686303e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.884414803981781, + "num_tokens": 39686297.0, + "step": 4845 + }, + { + "entropy": 0.3665523946285248, + "epoch": 0.7742656449553001, + "grad_norm": 1.456167221069336, + "learning_rate": 4.94482651802895e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8854122757911682, + "num_tokens": 39727257.0, + "step": 4850 + }, + { + "entropy": 0.38584417700767515, + "epoch": 0.7750638569604087, + "grad_norm": 1.4760644435882568, + "learning_rate": 4.944713238934658e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8807156682014465, + "num_tokens": 39768217.0, + "step": 4855 + }, + { + "entropy": 0.3925118625164032, + "epoch": 0.7758620689655172, + "grad_norm": 1.7033679485321045, + "learning_rate": 4.944599845410545e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8756734251976013, + "num_tokens": 39809177.0, + "step": 4860 + }, + { + "entropy": 0.3694494187831879, + "epoch": 0.7766602809706258, + "grad_norm": 2.6237292289733887, + "learning_rate": 4.944486337463745e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8807478785514832, + "num_tokens": 39850137.0, + "step": 4865 + }, + { + "entropy": 0.33902106881141664, + "epoch": 0.7774584929757343, + "grad_norm": 1.4346832036972046, + "learning_rate": 4.944372715101396e-06, + "loss": 0.295, + "mean_token_accuracy": 0.8959431409835815, + "num_tokens": 39891097.0, + "step": 4870 + }, + { + "entropy": 0.3760982811450958, + "epoch": 0.7782567049808429, + "grad_norm": 1.4758082628250122, + "learning_rate": 4.944258978330641e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8848476409912109, + "num_tokens": 39932057.0, + "step": 4875 + }, + { + "entropy": 0.35401169061660764, + "epoch": 0.7790549169859514, + "grad_norm": 1.478041172027588, + "learning_rate": 4.944145127158633e-06, + "loss": 0.3163, + "mean_token_accuracy": 0.8890954971313476, + "num_tokens": 39973017.0, + "step": 4880 + }, + { + "entropy": 0.37507704496383665, + "epoch": 0.7798531289910601, + "grad_norm": 1.4558285474777222, + "learning_rate": 4.944031161592532e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8805190443992614, + "num_tokens": 40013977.0, + "step": 4885 + }, + { + "entropy": 0.3771682620048523, + "epoch": 0.7806513409961686, + "grad_norm": 1.4338040351867676, + "learning_rate": 4.943917081639505e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8837599635124207, + "num_tokens": 40054937.0, + "step": 4890 + }, + { + "entropy": 0.34878968000411986, + "epoch": 0.7814495530012772, + "grad_norm": 1.6373964548110962, + "learning_rate": 4.943802887306723e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.890072476863861, + "num_tokens": 40095897.0, + "step": 4895 + }, + { + "entropy": 0.3822789669036865, + "epoch": 0.7822477650063857, + "grad_norm": 1.621001958847046, + "learning_rate": 4.943688578601369e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.881649649143219, + "num_tokens": 40136857.0, + "step": 4900 + }, + { + "entropy": 0.40567703247070314, + "epoch": 0.7830459770114943, + "grad_norm": 1.345913052558899, + "learning_rate": 4.943574155530631e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8728673458099365, + "num_tokens": 40177817.0, + "step": 4905 + }, + { + "entropy": 0.37211456298828127, + "epoch": 0.7838441890166028, + "grad_norm": 1.4501780271530151, + "learning_rate": 4.943459618101706e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8833699584007263, + "num_tokens": 40218777.0, + "step": 4910 + }, + { + "entropy": 0.36408587694168093, + "epoch": 0.7846424010217113, + "grad_norm": 1.6432631015777588, + "learning_rate": 4.9433449663217925e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8842684388160705, + "num_tokens": 40259737.0, + "step": 4915 + }, + { + "entropy": 0.3740302503108978, + "epoch": 0.7854406130268199, + "grad_norm": 1.705114722251892, + "learning_rate": 4.943230200198102e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.883201253414154, + "num_tokens": 40300697.0, + "step": 4920 + }, + { + "entropy": 0.37626251578330994, + "epoch": 0.7862388250319284, + "grad_norm": 1.6285901069641113, + "learning_rate": 4.943115319737854e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8847034335136413, + "num_tokens": 40341657.0, + "step": 4925 + }, + { + "entropy": 0.3706506729125977, + "epoch": 0.7870370370370371, + "grad_norm": 1.4745314121246338, + "learning_rate": 4.943000324948269e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.8840723276138306, + "num_tokens": 40382617.0, + "step": 4930 + }, + { + "entropy": 0.3797015368938446, + "epoch": 0.7878352490421456, + "grad_norm": 1.5776362419128418, + "learning_rate": 4.9428852158365805e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8814828038215637, + "num_tokens": 40423577.0, + "step": 4935 + }, + { + "entropy": 0.4147985756397247, + "epoch": 0.7886334610472542, + "grad_norm": 1.7615753412246704, + "learning_rate": 4.942769992410025e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8718114852905273, + "num_tokens": 40464537.0, + "step": 4940 + }, + { + "entropy": 0.3999458372592926, + "epoch": 0.7894316730523627, + "grad_norm": 1.6666876077651978, + "learning_rate": 4.9426546546758495e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8717912793159485, + "num_tokens": 40505497.0, + "step": 4945 + }, + { + "entropy": 0.38414047956466674, + "epoch": 0.7902298850574713, + "grad_norm": 1.5749131441116333, + "learning_rate": 4.942539202641306e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8756724238395691, + "num_tokens": 40546457.0, + "step": 4950 + }, + { + "entropy": 0.35958670973777773, + "epoch": 0.7910280970625798, + "grad_norm": 1.5309059619903564, + "learning_rate": 4.9424236363136555e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.886970865726471, + "num_tokens": 40587417.0, + "step": 4955 + }, + { + "entropy": 0.3650970160961151, + "epoch": 0.7918263090676884, + "grad_norm": 1.6460506916046143, + "learning_rate": 4.942307955700165e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8842700004577637, + "num_tokens": 40628377.0, + "step": 4960 + }, + { + "entropy": 0.4351780354976654, + "epoch": 0.7926245210727969, + "grad_norm": 1.7340452671051025, + "learning_rate": 4.942192160808108e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8620139837265015, + "num_tokens": 40669337.0, + "step": 4965 + }, + { + "entropy": 0.36986536979675294, + "epoch": 0.7934227330779055, + "grad_norm": 1.5631247758865356, + "learning_rate": 4.942076251644767e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8860733151435852, + "num_tokens": 40710297.0, + "step": 4970 + }, + { + "entropy": 0.39304872155189513, + "epoch": 0.794220945083014, + "grad_norm": 1.3219350576400757, + "learning_rate": 4.941960228217431e-06, + "loss": 0.348, + "mean_token_accuracy": 0.876360547542572, + "num_tokens": 40751257.0, + "step": 4975 + }, + { + "entropy": 0.37213549613952634, + "epoch": 0.7950191570881227, + "grad_norm": 1.434804081916809, + "learning_rate": 4.941844090533396e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8791846871376038, + "num_tokens": 40792217.0, + "step": 4980 + }, + { + "entropy": 0.37901111245155333, + "epoch": 0.7958173690932312, + "grad_norm": 1.5676732063293457, + "learning_rate": 4.941727838599964e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8803433775901794, + "num_tokens": 40833177.0, + "step": 4985 + }, + { + "entropy": 0.38844203352928164, + "epoch": 0.7966155810983397, + "grad_norm": 1.5531984567642212, + "learning_rate": 4.941611472424445e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8778400778770447, + "num_tokens": 40874137.0, + "step": 4990 + }, + { + "entropy": 0.4182100772857666, + "epoch": 0.7974137931034483, + "grad_norm": 1.4862117767333984, + "learning_rate": 4.941494992014158e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8644268989562989, + "num_tokens": 40915097.0, + "step": 4995 + }, + { + "entropy": 0.38949393630027773, + "epoch": 0.7982120051085568, + "grad_norm": 1.5576521158218384, + "learning_rate": 4.9413783973764275e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8797020792961121, + "num_tokens": 40956057.0, + "step": 5000 + }, + { + "epoch": 0.7982120051085568, + "eval_entropy": 0.39517786979675296, + "eval_loss": 0.34585344791412354, + "eval_mean_token_accuracy": 0.8783624324798583, + "eval_num_tokens": 40956057.0, + "eval_runtime": 69.2282, + "eval_samples_per_second": 14.445, + "eval_steps_per_second": 1.806, + "step": 5000 + }, + { + "entropy": 0.39468519687652587, + "epoch": 0.7990102171136654, + "grad_norm": 1.5206162929534912, + "learning_rate": 4.9412616885185844e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8757318615913391, + "num_tokens": 40997017.0, + "step": 5005 + }, + { + "entropy": 0.37782185077667235, + "epoch": 0.7998084291187739, + "grad_norm": 1.4788711071014404, + "learning_rate": 4.941144865447969e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8795312762260437, + "num_tokens": 41037977.0, + "step": 5010 + }, + { + "entropy": 0.3883132815361023, + "epoch": 0.8006066411238825, + "grad_norm": 1.6302030086517334, + "learning_rate": 4.941027928171927e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8747356057167053, + "num_tokens": 41078937.0, + "step": 5015 + }, + { + "entropy": 0.3945412039756775, + "epoch": 0.801404853128991, + "grad_norm": 1.3573029041290283, + "learning_rate": 4.94091087669781e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8777291893959045, + "num_tokens": 41119897.0, + "step": 5020 + }, + { + "entropy": 0.36984707713127135, + "epoch": 0.8022030651340997, + "grad_norm": 1.6052615642547607, + "learning_rate": 4.940793711032982e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8812803864479065, + "num_tokens": 41160857.0, + "step": 5025 + }, + { + "entropy": 0.37686986327171323, + "epoch": 0.8030012771392082, + "grad_norm": 1.5283271074295044, + "learning_rate": 4.940676431184808e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8817309975624085, + "num_tokens": 41201817.0, + "step": 5030 + }, + { + "entropy": 0.42779810428619386, + "epoch": 0.8037994891443168, + "grad_norm": 1.754578709602356, + "learning_rate": 4.9405590371606645e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8636540412902832, + "num_tokens": 41242777.0, + "step": 5035 + }, + { + "entropy": 0.3827055513858795, + "epoch": 0.8045977011494253, + "grad_norm": 1.5831722021102905, + "learning_rate": 4.940441528967933e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8784476161003113, + "num_tokens": 41283737.0, + "step": 5040 + }, + { + "entropy": 0.3860118448734283, + "epoch": 0.8053959131545338, + "grad_norm": 1.460633397102356, + "learning_rate": 4.940323906614003e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8752281665802002, + "num_tokens": 41324697.0, + "step": 5045 + }, + { + "entropy": 0.3734028100967407, + "epoch": 0.8061941251596424, + "grad_norm": 1.401058316230774, + "learning_rate": 4.940206170106272e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8852216362953186, + "num_tokens": 41365657.0, + "step": 5050 + }, + { + "entropy": 0.41934972405433657, + "epoch": 0.8069923371647509, + "grad_norm": 1.7835423946380615, + "learning_rate": 4.940088319452141e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8679613947868348, + "num_tokens": 41406617.0, + "step": 5055 + }, + { + "entropy": 0.39728267788887023, + "epoch": 0.8077905491698595, + "grad_norm": 1.2957251071929932, + "learning_rate": 4.939970354659024e-06, + "loss": 0.355, + "mean_token_accuracy": 0.874215042591095, + "num_tokens": 41447577.0, + "step": 5060 + }, + { + "entropy": 0.36827427744865415, + "epoch": 0.808588761174968, + "grad_norm": 1.5750200748443604, + "learning_rate": 4.939852275734336e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8860924124717713, + "num_tokens": 41488537.0, + "step": 5065 + }, + { + "entropy": 0.37820854783058167, + "epoch": 0.8093869731800766, + "grad_norm": 1.4481803178787231, + "learning_rate": 4.939734082685505e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8807801008224487, + "num_tokens": 41529497.0, + "step": 5070 + }, + { + "entropy": 0.38313864469528197, + "epoch": 0.8101851851851852, + "grad_norm": 1.6074419021606445, + "learning_rate": 4.939615775519962e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8779990315437317, + "num_tokens": 41570457.0, + "step": 5075 + }, + { + "entropy": 0.3798495590686798, + "epoch": 0.8109833971902938, + "grad_norm": 1.6640052795410156, + "learning_rate": 4.939497354245146e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8838315844535828, + "num_tokens": 41611417.0, + "step": 5080 + }, + { + "entropy": 0.4095848798751831, + "epoch": 0.8117816091954023, + "grad_norm": 1.4963005781173706, + "learning_rate": 4.939378818868506e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.875108253955841, + "num_tokens": 41652377.0, + "step": 5085 + }, + { + "entropy": 0.3914333820343018, + "epoch": 0.8125798212005109, + "grad_norm": 1.7386914491653442, + "learning_rate": 4.9392601693974915e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8770987033843994, + "num_tokens": 41693337.0, + "step": 5090 + }, + { + "entropy": 0.3680845439434052, + "epoch": 0.8133780332056194, + "grad_norm": 1.5366642475128174, + "learning_rate": 4.939141405839569e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8843695878982544, + "num_tokens": 41734297.0, + "step": 5095 + }, + { + "entropy": 0.3906676173210144, + "epoch": 0.814176245210728, + "grad_norm": 1.6059380769729614, + "learning_rate": 4.939022528202203e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8771692991256714, + "num_tokens": 41775257.0, + "step": 5100 + }, + { + "entropy": 0.340567022562027, + "epoch": 0.8149744572158365, + "grad_norm": 1.5068142414093018, + "learning_rate": 4.93890353649287e-06, + "loss": 0.2987, + "mean_token_accuracy": 0.8926120519638061, + "num_tokens": 41816217.0, + "step": 5105 + }, + { + "entropy": 0.38606288433074953, + "epoch": 0.815772669220945, + "grad_norm": 1.6737275123596191, + "learning_rate": 4.9387844307190536e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8784497022628784, + "num_tokens": 41857177.0, + "step": 5110 + }, + { + "entropy": 0.3974448382854462, + "epoch": 0.8165708812260536, + "grad_norm": 1.3572559356689453, + "learning_rate": 4.938665210888242e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.874246883392334, + "num_tokens": 41898137.0, + "step": 5115 + }, + { + "entropy": 0.3437396764755249, + "epoch": 0.8173690932311622, + "grad_norm": 1.3938586711883545, + "learning_rate": 4.938545877007933e-06, + "loss": 0.2994, + "mean_token_accuracy": 0.8925269246101379, + "num_tokens": 41939097.0, + "step": 5120 + }, + { + "entropy": 0.35920992493629456, + "epoch": 0.8181673052362708, + "grad_norm": 1.4645718336105347, + "learning_rate": 4.938426429085631e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8877154946327209, + "num_tokens": 41980057.0, + "step": 5125 + }, + { + "entropy": 0.4538248538970947, + "epoch": 0.8189655172413793, + "grad_norm": 1.5313849449157715, + "learning_rate": 4.938306867128847e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8557774543762207, + "num_tokens": 42021017.0, + "step": 5130 + }, + { + "entropy": 0.3992255091667175, + "epoch": 0.8197637292464879, + "grad_norm": 1.5350704193115234, + "learning_rate": 4.938187191145099e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8749038934707641, + "num_tokens": 42061977.0, + "step": 5135 + }, + { + "entropy": 0.37963120341300965, + "epoch": 0.8205619412515964, + "grad_norm": 1.4105563163757324, + "learning_rate": 4.938067401141912e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8834235668182373, + "num_tokens": 42102937.0, + "step": 5140 + }, + { + "entropy": 0.39865819215774534, + "epoch": 0.821360153256705, + "grad_norm": 1.4852190017700195, + "learning_rate": 4.937947497126821e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8768776297569275, + "num_tokens": 42143897.0, + "step": 5145 + }, + { + "entropy": 0.39312014579772947, + "epoch": 0.8221583652618135, + "grad_norm": 1.7987556457519531, + "learning_rate": 4.937827479107365e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8770885109901428, + "num_tokens": 42184857.0, + "step": 5150 + }, + { + "entropy": 0.4142571449279785, + "epoch": 0.8229565772669221, + "grad_norm": 1.6814552545547485, + "learning_rate": 4.93770734709109e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8683848857879639, + "num_tokens": 42225817.0, + "step": 5155 + }, + { + "entropy": 0.35767839550971986, + "epoch": 0.8237547892720306, + "grad_norm": 1.4671154022216797, + "learning_rate": 4.937587101085551e-06, + "loss": 0.3179, + "mean_token_accuracy": 0.8885256052017212, + "num_tokens": 42266777.0, + "step": 5160 + }, + { + "entropy": 0.37379557490348814, + "epoch": 0.8245530012771393, + "grad_norm": 1.4868617057800293, + "learning_rate": 4.93746674109831e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8791596293449402, + "num_tokens": 42307737.0, + "step": 5165 + }, + { + "entropy": 0.33919037580490113, + "epoch": 0.8253512132822478, + "grad_norm": 1.6345292329788208, + "learning_rate": 4.937346267136936e-06, + "loss": 0.3007, + "mean_token_accuracy": 0.8931122422218323, + "num_tokens": 42348697.0, + "step": 5170 + }, + { + "entropy": 0.3418751239776611, + "epoch": 0.8261494252873564, + "grad_norm": 1.6059125661849976, + "learning_rate": 4.937225679209003e-06, + "loss": 0.2951, + "mean_token_accuracy": 0.8965206980705261, + "num_tokens": 42389657.0, + "step": 5175 + }, + { + "entropy": 0.4018648386001587, + "epoch": 0.8269476372924649, + "grad_norm": 1.7123645544052124, + "learning_rate": 4.937104977322097e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8732232570648193, + "num_tokens": 42430617.0, + "step": 5180 + }, + { + "entropy": 0.38013545870780946, + "epoch": 0.8277458492975734, + "grad_norm": 1.3969297409057617, + "learning_rate": 4.936984161483805e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8795966863632202, + "num_tokens": 42471577.0, + "step": 5185 + }, + { + "entropy": 0.3990564405918121, + "epoch": 0.828544061302682, + "grad_norm": 1.7368186712265015, + "learning_rate": 4.9368632317017255e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8751740336418152, + "num_tokens": 42512537.0, + "step": 5190 + }, + { + "entropy": 0.40104894042015077, + "epoch": 0.8293422733077905, + "grad_norm": 1.5812550783157349, + "learning_rate": 4.936742187983464e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.87327082157135, + "num_tokens": 42553497.0, + "step": 5195 + }, + { + "entropy": 0.37812411189079287, + "epoch": 0.8301404853128991, + "grad_norm": 1.393334984779358, + "learning_rate": 4.936621030336631e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8836560845375061, + "num_tokens": 42594457.0, + "step": 5200 + }, + { + "epoch": 0.8301404853128991, + "eval_entropy": 0.3916443166732788, + "eval_loss": 0.34470275044441223, + "eval_mean_token_accuracy": 0.878385350227356, + "eval_num_tokens": 42594457.0, + "eval_runtime": 69.2115, + "eval_samples_per_second": 14.448, + "eval_steps_per_second": 1.806, + "step": 5200 + }, + { + "entropy": 0.37057461142539977, + "epoch": 0.8309386973180076, + "grad_norm": 1.6118886470794678, + "learning_rate": 4.9364997587688444e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8839616656303406, + "num_tokens": 42635417.0, + "step": 5205 + }, + { + "entropy": 0.37779011130332946, + "epoch": 0.8317369093231162, + "grad_norm": 1.5371465682983398, + "learning_rate": 4.936378373287733e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.881714928150177, + "num_tokens": 42676377.0, + "step": 5210 + }, + { + "entropy": 0.36580815315246584, + "epoch": 0.8325351213282248, + "grad_norm": 1.3122960329055786, + "learning_rate": 4.936256873900927e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8857713937759399, + "num_tokens": 42717337.0, + "step": 5215 + }, + { + "entropy": 0.3645298182964325, + "epoch": 0.8333333333333334, + "grad_norm": 1.441042184829712, + "learning_rate": 4.936135260616069e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8843689322471618, + "num_tokens": 42758297.0, + "step": 5220 + }, + { + "entropy": 0.39551939964294436, + "epoch": 0.8341315453384419, + "grad_norm": 1.3785526752471924, + "learning_rate": 4.936013533440804e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8716706156730651, + "num_tokens": 42799257.0, + "step": 5225 + }, + { + "entropy": 0.36737871170043945, + "epoch": 0.8349297573435505, + "grad_norm": 1.4625060558319092, + "learning_rate": 4.935891692382789e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8853367805480957, + "num_tokens": 42840217.0, + "step": 5230 + }, + { + "entropy": 0.35239975452423095, + "epoch": 0.835727969348659, + "grad_norm": 1.7056565284729004, + "learning_rate": 4.935769737449686e-06, + "loss": 0.307, + "mean_token_accuracy": 0.8921406984329223, + "num_tokens": 42881177.0, + "step": 5235 + }, + { + "entropy": 0.4111760914325714, + "epoch": 0.8365261813537676, + "grad_norm": 1.4598015546798706, + "learning_rate": 4.9356476686491605e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8678424596786499, + "num_tokens": 42922137.0, + "step": 5240 + }, + { + "entropy": 0.3474355399608612, + "epoch": 0.8373243933588761, + "grad_norm": 1.505201816558838, + "learning_rate": 4.935525485988892e-06, + "loss": 0.3024, + "mean_token_accuracy": 0.891577672958374, + "num_tokens": 42963097.0, + "step": 5245 + }, + { + "entropy": 0.41879857778549195, + "epoch": 0.8381226053639846, + "grad_norm": 1.8078423738479614, + "learning_rate": 4.935403189476563e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.869062352180481, + "num_tokens": 43004057.0, + "step": 5250 + }, + { + "entropy": 0.38555397391319274, + "epoch": 0.8389208173690932, + "grad_norm": 1.5440025329589844, + "learning_rate": 4.9352807791198635e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.878856647014618, + "num_tokens": 43045017.0, + "step": 5255 + }, + { + "entropy": 0.38125547766685486, + "epoch": 0.8397190293742018, + "grad_norm": 1.4443796873092651, + "learning_rate": 4.93515825492649e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8804764747619629, + "num_tokens": 43085977.0, + "step": 5260 + }, + { + "entropy": 0.3209161102771759, + "epoch": 0.8405172413793104, + "grad_norm": 1.6840972900390625, + "learning_rate": 4.935035616904149e-06, + "loss": 0.2834, + "mean_token_accuracy": 0.8992561101913452, + "num_tokens": 43126937.0, + "step": 5265 + }, + { + "entropy": 0.3630024969577789, + "epoch": 0.8413154533844189, + "grad_norm": 1.4851804971694946, + "learning_rate": 4.934912865060552e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8831697225570678, + "num_tokens": 43167897.0, + "step": 5270 + }, + { + "entropy": 0.3577308297157288, + "epoch": 0.8421136653895275, + "grad_norm": 1.7565642595291138, + "learning_rate": 4.934789999403418e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8880985140800476, + "num_tokens": 43208857.0, + "step": 5275 + }, + { + "entropy": 0.3566504120826721, + "epoch": 0.842911877394636, + "grad_norm": 1.513185739517212, + "learning_rate": 4.934667019940474e-06, + "loss": 0.3077, + "mean_token_accuracy": 0.8909970402717591, + "num_tokens": 43249817.0, + "step": 5280 + }, + { + "entropy": 0.41708238124847413, + "epoch": 0.8437100893997446, + "grad_norm": 1.550545573234558, + "learning_rate": 4.934543926679449e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8711596012115479, + "num_tokens": 43290777.0, + "step": 5285 + }, + { + "entropy": 0.3585968017578125, + "epoch": 0.8445083014048531, + "grad_norm": 1.507271409034729, + "learning_rate": 4.93442071962809e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.887067461013794, + "num_tokens": 43331737.0, + "step": 5290 + }, + { + "entropy": 0.3447089433670044, + "epoch": 0.8453065134099617, + "grad_norm": 1.5838521718978882, + "learning_rate": 4.934297398794141e-06, + "loss": 0.2981, + "mean_token_accuracy": 0.8938217759132385, + "num_tokens": 43372697.0, + "step": 5295 + }, + { + "entropy": 0.39036492705345155, + "epoch": 0.8461047254150702, + "grad_norm": 1.4771947860717773, + "learning_rate": 4.934173964185357e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8789735913276673, + "num_tokens": 43413657.0, + "step": 5300 + }, + { + "entropy": 0.35201868414878845, + "epoch": 0.8469029374201787, + "grad_norm": 1.5779476165771484, + "learning_rate": 4.934050415809502e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.8905610561370849, + "num_tokens": 43454617.0, + "step": 5305 + }, + { + "entropy": 0.4234282970428467, + "epoch": 0.8477011494252874, + "grad_norm": 1.6354950666427612, + "learning_rate": 4.933926753674342e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.868942129611969, + "num_tokens": 43495577.0, + "step": 5310 + }, + { + "entropy": 0.36352901458740233, + "epoch": 0.848499361430396, + "grad_norm": 1.4287358522415161, + "learning_rate": 4.933802977787655e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8870725750923156, + "num_tokens": 43536537.0, + "step": 5315 + }, + { + "entropy": 0.4029152154922485, + "epoch": 0.8492975734355045, + "grad_norm": 1.4122129678726196, + "learning_rate": 4.933679088157226e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8763118505477905, + "num_tokens": 43577497.0, + "step": 5320 + }, + { + "entropy": 0.3681135237216949, + "epoch": 0.850095785440613, + "grad_norm": 1.486761450767517, + "learning_rate": 4.933555084790842e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.886578106880188, + "num_tokens": 43618457.0, + "step": 5325 + }, + { + "entropy": 0.3565655589103699, + "epoch": 0.8508939974457216, + "grad_norm": 1.3662333488464355, + "learning_rate": 4.933430967696303e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8898347854614258, + "num_tokens": 43659417.0, + "step": 5330 + }, + { + "entropy": 0.3615683376789093, + "epoch": 0.8516922094508301, + "grad_norm": 1.4146095514297485, + "learning_rate": 4.933306736881415e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8881414651870727, + "num_tokens": 43700377.0, + "step": 5335 + }, + { + "entropy": 0.3876829206943512, + "epoch": 0.8524904214559387, + "grad_norm": 1.4862658977508545, + "learning_rate": 4.933182392353988e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8772210717201233, + "num_tokens": 43741337.0, + "step": 5340 + }, + { + "entropy": 0.36353917717933654, + "epoch": 0.8532886334610472, + "grad_norm": 1.651447057723999, + "learning_rate": 4.933057934121842e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8850520014762878, + "num_tokens": 43782297.0, + "step": 5345 + }, + { + "entropy": 0.37396968007087705, + "epoch": 0.8540868454661558, + "grad_norm": 1.345488429069519, + "learning_rate": 4.932933362192804e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8870288133621216, + "num_tokens": 43823257.0, + "step": 5350 + }, + { + "entropy": 0.3930931568145752, + "epoch": 0.8548850574712644, + "grad_norm": 1.3853410482406616, + "learning_rate": 4.932808676574704e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8762860059738159, + "num_tokens": 43864217.0, + "step": 5355 + }, + { + "entropy": 0.38437947630882263, + "epoch": 0.855683269476373, + "grad_norm": 1.5177319049835205, + "learning_rate": 4.932683877275388e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8778773427009583, + "num_tokens": 43905177.0, + "step": 5360 + }, + { + "entropy": 0.402529639005661, + "epoch": 0.8564814814814815, + "grad_norm": 1.8101162910461426, + "learning_rate": 4.932558964302701e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8761422276496887, + "num_tokens": 43946137.0, + "step": 5365 + }, + { + "entropy": 0.392457115650177, + "epoch": 0.85727969348659, + "grad_norm": 1.7096900939941406, + "learning_rate": 4.9324339376644975e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8748411536216736, + "num_tokens": 43987097.0, + "step": 5370 + }, + { + "entropy": 0.35902159214019774, + "epoch": 0.8580779054916986, + "grad_norm": 1.3886840343475342, + "learning_rate": 4.93230879736864e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.8903650403022766, + "num_tokens": 44028057.0, + "step": 5375 + }, + { + "entropy": 0.3581184148788452, + "epoch": 0.8588761174968071, + "grad_norm": 1.6585997343063354, + "learning_rate": 4.932183543422999e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8876467704772949, + "num_tokens": 44069017.0, + "step": 5380 + }, + { + "entropy": 0.3886809110641479, + "epoch": 0.8596743295019157, + "grad_norm": 1.4889298677444458, + "learning_rate": 4.93205817583545e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.884128725528717, + "num_tokens": 44109977.0, + "step": 5385 + }, + { + "entropy": 0.3871449947357178, + "epoch": 0.8604725415070242, + "grad_norm": 1.4008923768997192, + "learning_rate": 4.931932694613876e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8785978078842163, + "num_tokens": 44150937.0, + "step": 5390 + }, + { + "entropy": 0.3678101718425751, + "epoch": 0.8612707535121328, + "grad_norm": 1.5337679386138916, + "learning_rate": 4.931807099766168e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.885176682472229, + "num_tokens": 44191897.0, + "step": 5395 + }, + { + "entropy": 0.388889354467392, + "epoch": 0.8620689655172413, + "grad_norm": 2.2841343879699707, + "learning_rate": 4.9316813913002246e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8753127932548523, + "num_tokens": 44232857.0, + "step": 5400 + }, + { + "epoch": 0.8620689655172413, + "eval_entropy": 0.3888051829338074, + "eval_loss": 0.34354865550994873, + "eval_mean_token_accuracy": 0.8788944683074951, + "eval_num_tokens": 44232857.0, + "eval_runtime": 69.2106, + "eval_samples_per_second": 14.449, + "eval_steps_per_second": 1.806, + "step": 5400 + }, + { + "entropy": 0.35687355399131776, + "epoch": 0.86286717752235, + "grad_norm": 1.4147300720214844, + "learning_rate": 4.93155556922395e-06, + "loss": 0.3101, + "mean_token_accuracy": 0.88960120677948, + "num_tokens": 44273817.0, + "step": 5405 + }, + { + "entropy": 0.4274792790412903, + "epoch": 0.8636653895274585, + "grad_norm": 1.7667378187179565, + "learning_rate": 4.931429633545257e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8630432486534119, + "num_tokens": 44314777.0, + "step": 5410 + }, + { + "entropy": 0.37117738127708433, + "epoch": 0.8644636015325671, + "grad_norm": 1.7041641473770142, + "learning_rate": 4.9313035842720644e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8814984917640686, + "num_tokens": 44355737.0, + "step": 5415 + }, + { + "entropy": 0.35860843062400816, + "epoch": 0.8652618135376756, + "grad_norm": 1.5495671033859253, + "learning_rate": 4.931177421412298e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8854899287223816, + "num_tokens": 44396697.0, + "step": 5420 + }, + { + "entropy": 0.38937310576438905, + "epoch": 0.8660600255427842, + "grad_norm": 1.618882656097412, + "learning_rate": 4.931051144973892e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8780934572219848, + "num_tokens": 44437657.0, + "step": 5425 + }, + { + "entropy": 0.35249313712120056, + "epoch": 0.8668582375478927, + "grad_norm": 1.5120997428894043, + "learning_rate": 4.930924754964788e-06, + "loss": 0.3114, + "mean_token_accuracy": 0.8902511358261108, + "num_tokens": 44478617.0, + "step": 5430 + }, + { + "entropy": 0.408038991689682, + "epoch": 0.8676564495530013, + "grad_norm": 1.6065163612365723, + "learning_rate": 4.930798251392932e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8714466691017151, + "num_tokens": 44519577.0, + "step": 5435 + }, + { + "entropy": 0.42532867193222046, + "epoch": 0.8684546615581098, + "grad_norm": 1.576435923576355, + "learning_rate": 4.9306716342662795e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8677833437919616, + "num_tokens": 44560537.0, + "step": 5440 + }, + { + "entropy": 0.3784036099910736, + "epoch": 0.8692528735632183, + "grad_norm": 1.5071748495101929, + "learning_rate": 4.930544903592794e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8795307159423829, + "num_tokens": 44601497.0, + "step": 5445 + }, + { + "entropy": 0.37425318360328674, + "epoch": 0.870051085568327, + "grad_norm": 1.6781151294708252, + "learning_rate": 4.930418059380444e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8800019502639771, + "num_tokens": 44642457.0, + "step": 5450 + }, + { + "entropy": 0.34272149205207825, + "epoch": 0.8708492975734355, + "grad_norm": 1.5050232410430908, + "learning_rate": 4.930291101637205e-06, + "loss": 0.2971, + "mean_token_accuracy": 0.8940603494644165, + "num_tokens": 44683417.0, + "step": 5455 + }, + { + "entropy": 0.3740146219730377, + "epoch": 0.8716475095785441, + "grad_norm": 1.4340434074401855, + "learning_rate": 4.9301640303710606e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8837826609611511, + "num_tokens": 44724377.0, + "step": 5460 + }, + { + "entropy": 0.35212224125862124, + "epoch": 0.8724457215836526, + "grad_norm": 1.3832968473434448, + "learning_rate": 4.9300368455900024e-06, + "loss": 0.3087, + "mean_token_accuracy": 0.8901192903518677, + "num_tokens": 44765337.0, + "step": 5465 + }, + { + "entropy": 0.3691311001777649, + "epoch": 0.8732439335887612, + "grad_norm": 1.5648622512817383, + "learning_rate": 4.929909547302028e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8848111033439636, + "num_tokens": 44806297.0, + "step": 5470 + }, + { + "entropy": 0.3800596833229065, + "epoch": 0.8740421455938697, + "grad_norm": 1.5399657487869263, + "learning_rate": 4.929782135515143e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.882738995552063, + "num_tokens": 44847257.0, + "step": 5475 + }, + { + "entropy": 0.405544126033783, + "epoch": 0.8748403575989783, + "grad_norm": 1.5004374980926514, + "learning_rate": 4.929654610237359e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8739061474800109, + "num_tokens": 44888217.0, + "step": 5480 + }, + { + "entropy": 0.34500548243522644, + "epoch": 0.8756385696040868, + "grad_norm": 1.4212785959243774, + "learning_rate": 4.929526971476694e-06, + "loss": 0.3049, + "mean_token_accuracy": 0.8921091079711914, + "num_tokens": 44929177.0, + "step": 5485 + }, + { + "entropy": 0.3969034910202026, + "epoch": 0.8764367816091954, + "grad_norm": 1.6262997388839722, + "learning_rate": 4.929399219241175e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8802304983139038, + "num_tokens": 44970137.0, + "step": 5490 + }, + { + "entropy": 0.40439229011535643, + "epoch": 0.8772349936143039, + "grad_norm": 1.5303850173950195, + "learning_rate": 4.929271353538837e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8780505299568176, + "num_tokens": 45011097.0, + "step": 5495 + }, + { + "entropy": 0.368317711353302, + "epoch": 0.8780332056194126, + "grad_norm": 1.537245273590088, + "learning_rate": 4.92914337437772e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8815723538398743, + "num_tokens": 45052057.0, + "step": 5500 + }, + { + "entropy": 0.3903080582618713, + "epoch": 0.8788314176245211, + "grad_norm": 1.5359094142913818, + "learning_rate": 4.929015281765869e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8795314073562622, + "num_tokens": 45093017.0, + "step": 5505 + }, + { + "entropy": 0.37571442127227783, + "epoch": 0.8796296296296297, + "grad_norm": 1.6674305200576782, + "learning_rate": 4.928887075711343e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8841602087020874, + "num_tokens": 45133977.0, + "step": 5510 + }, + { + "entropy": 0.38340579271316527, + "epoch": 0.8804278416347382, + "grad_norm": 1.3645216226577759, + "learning_rate": 4.9287587562222005e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.879256546497345, + "num_tokens": 45174937.0, + "step": 5515 + }, + { + "entropy": 0.3849440336227417, + "epoch": 0.8812260536398467, + "grad_norm": 1.5876308679580688, + "learning_rate": 4.928630323306514e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8795434355735778, + "num_tokens": 45215897.0, + "step": 5520 + }, + { + "entropy": 0.379747611284256, + "epoch": 0.8820242656449553, + "grad_norm": 1.4216405153274536, + "learning_rate": 4.928501776972357e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8834027528762818, + "num_tokens": 45256857.0, + "step": 5525 + }, + { + "entropy": 0.37451350688934326, + "epoch": 0.8828224776500638, + "grad_norm": 1.4833821058273315, + "learning_rate": 4.9283731172278145e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8831079363822937, + "num_tokens": 45297817.0, + "step": 5530 + }, + { + "entropy": 0.40575386881828307, + "epoch": 0.8836206896551724, + "grad_norm": 1.6087018251419067, + "learning_rate": 4.928244344080977e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8704880237579345, + "num_tokens": 45338777.0, + "step": 5535 + }, + { + "entropy": 0.3431255519390106, + "epoch": 0.8844189016602809, + "grad_norm": 1.4994218349456787, + "learning_rate": 4.928115457539941e-06, + "loss": 0.298, + "mean_token_accuracy": 0.8945744037628174, + "num_tokens": 45379737.0, + "step": 5540 + }, + { + "entropy": 0.3710679769515991, + "epoch": 0.8852171136653896, + "grad_norm": 1.5049368143081665, + "learning_rate": 4.9279864576128135e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8831719040870667, + "num_tokens": 45420697.0, + "step": 5545 + }, + { + "entropy": 0.3995101869106293, + "epoch": 0.8860153256704981, + "grad_norm": 1.692454218864441, + "learning_rate": 4.927857344307704e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8794248223304748, + "num_tokens": 45461657.0, + "step": 5550 + }, + { + "entropy": 0.4114351987838745, + "epoch": 0.8868135376756067, + "grad_norm": 1.5567283630371094, + "learning_rate": 4.927728117632733e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.873246169090271, + "num_tokens": 45502617.0, + "step": 5555 + }, + { + "entropy": 0.40230279564857485, + "epoch": 0.8876117496807152, + "grad_norm": 1.5717520713806152, + "learning_rate": 4.927598777596027e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8731892108917236, + "num_tokens": 45543577.0, + "step": 5560 + }, + { + "entropy": 0.38769946098327634, + "epoch": 0.8884099616858238, + "grad_norm": 1.5756887197494507, + "learning_rate": 4.927469324205719e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8776480913162231, + "num_tokens": 45584537.0, + "step": 5565 + }, + { + "entropy": 0.3639233231544495, + "epoch": 0.8892081736909323, + "grad_norm": 1.5897159576416016, + "learning_rate": 4.927339757469949e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8838191628456116, + "num_tokens": 45625497.0, + "step": 5570 + }, + { + "entropy": 0.3755005121231079, + "epoch": 0.8900063856960408, + "grad_norm": 1.4302053451538086, + "learning_rate": 4.927210077396864e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.883170211315155, + "num_tokens": 45666457.0, + "step": 5575 + }, + { + "entropy": 0.36951016783714297, + "epoch": 0.8908045977011494, + "grad_norm": 1.571903109550476, + "learning_rate": 4.9270802839946195e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8858746647834778, + "num_tokens": 45707417.0, + "step": 5580 + }, + { + "entropy": 0.3636231780052185, + "epoch": 0.8916028097062579, + "grad_norm": 1.8068180084228516, + "learning_rate": 4.926950377271379e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8879209876060485, + "num_tokens": 45748377.0, + "step": 5585 + }, + { + "entropy": 0.3747856080532074, + "epoch": 0.8924010217113666, + "grad_norm": 1.4714301824569702, + "learning_rate": 4.926820357235309e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8826592564582825, + "num_tokens": 45789337.0, + "step": 5590 + }, + { + "entropy": 0.35584996342659, + "epoch": 0.8931992337164751, + "grad_norm": 1.4386074542999268, + "learning_rate": 4.926690223894587e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8903956413269043, + "num_tokens": 45830297.0, + "step": 5595 + }, + { + "entropy": 0.3578944027423859, + "epoch": 0.8939974457215837, + "grad_norm": 1.4989902973175049, + "learning_rate": 4.926559977257395e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.8872125625610352, + "num_tokens": 45871257.0, + "step": 5600 + }, + { + "epoch": 0.8939974457215837, + "eval_entropy": 0.3874563302993774, + "eval_loss": 0.3424855172634125, + "eval_mean_token_accuracy": 0.8790763306617737, + "eval_num_tokens": 45871257.0, + "eval_runtime": 69.221, + "eval_samples_per_second": 14.446, + "eval_steps_per_second": 1.806, + "step": 5600 + }, + { + "entropy": 0.4076481223106384, + "epoch": 0.8947956577266922, + "grad_norm": 1.4201501607894897, + "learning_rate": 4.9264296173319236e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8703855156898499, + "num_tokens": 45912217.0, + "step": 5605 + }, + { + "entropy": 0.384282773733139, + "epoch": 0.8955938697318008, + "grad_norm": 1.6850939989089966, + "learning_rate": 4.926299144126372e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8778869271278381, + "num_tokens": 45953177.0, + "step": 5610 + }, + { + "entropy": 0.4003660798072815, + "epoch": 0.8963920817369093, + "grad_norm": 1.6248348951339722, + "learning_rate": 4.926168557648943e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.875260329246521, + "num_tokens": 45994137.0, + "step": 5615 + }, + { + "entropy": 0.3878970563411713, + "epoch": 0.8971902937420179, + "grad_norm": 1.6481446027755737, + "learning_rate": 4.926037857907849e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8781666278839111, + "num_tokens": 46035097.0, + "step": 5620 + }, + { + "entropy": 0.37414196133613586, + "epoch": 0.8979885057471264, + "grad_norm": 1.3256474733352661, + "learning_rate": 4.92590704491131e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.884190309047699, + "num_tokens": 46076057.0, + "step": 5625 + }, + { + "entropy": 0.39382901787757874, + "epoch": 0.898786717752235, + "grad_norm": 1.5669851303100586, + "learning_rate": 4.925776118667549e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.875950026512146, + "num_tokens": 46117017.0, + "step": 5630 + }, + { + "entropy": 0.3600032448768616, + "epoch": 0.8995849297573435, + "grad_norm": 1.3882217407226562, + "learning_rate": 4.925645079184802e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8872683763504028, + "num_tokens": 46157977.0, + "step": 5635 + }, + { + "entropy": 0.3779283821582794, + "epoch": 0.9003831417624522, + "grad_norm": 1.4759502410888672, + "learning_rate": 4.925513926471307e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.885478389263153, + "num_tokens": 46198937.0, + "step": 5640 + }, + { + "entropy": 0.3804099500179291, + "epoch": 0.9011813537675607, + "grad_norm": 1.38573157787323, + "learning_rate": 4.9253826605353135e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8853496789932251, + "num_tokens": 46239897.0, + "step": 5645 + }, + { + "entropy": 0.3461835443973541, + "epoch": 0.9019795657726692, + "grad_norm": 1.3838683366775513, + "learning_rate": 4.925251281385074e-06, + "loss": 0.3013, + "mean_token_accuracy": 0.8930078864097595, + "num_tokens": 46280857.0, + "step": 5650 + }, + { + "entropy": 0.3484264135360718, + "epoch": 0.9027777777777778, + "grad_norm": 1.5145350694656372, + "learning_rate": 4.925119789028852e-06, + "loss": 0.3024, + "mean_token_accuracy": 0.8926341891288757, + "num_tokens": 46321817.0, + "step": 5655 + }, + { + "entropy": 0.37027230858802795, + "epoch": 0.9035759897828863, + "grad_norm": 1.5724906921386719, + "learning_rate": 4.924988183474915e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8843621373176574, + "num_tokens": 46362777.0, + "step": 5660 + }, + { + "entropy": 0.3780488550662994, + "epoch": 0.9043742017879949, + "grad_norm": 1.6730051040649414, + "learning_rate": 4.924856464731538e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8800442934036254, + "num_tokens": 46403737.0, + "step": 5665 + }, + { + "entropy": 0.3991661310195923, + "epoch": 0.9051724137931034, + "grad_norm": 1.5829823017120361, + "learning_rate": 4.924724632807005e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8738561511039734, + "num_tokens": 46444697.0, + "step": 5670 + }, + { + "entropy": 0.38700241446495054, + "epoch": 0.905970625798212, + "grad_norm": 1.5838910341262817, + "learning_rate": 4.924592687709606e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8770587086677551, + "num_tokens": 46485657.0, + "step": 5675 + }, + { + "entropy": 0.39548492431640625, + "epoch": 0.9067688378033205, + "grad_norm": 1.6779003143310547, + "learning_rate": 4.9244606294476385e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8760750770568848, + "num_tokens": 46526617.0, + "step": 5680 + }, + { + "entropy": 0.37442941069602964, + "epoch": 0.9075670498084292, + "grad_norm": 1.4389050006866455, + "learning_rate": 4.924328458029406e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8821128606796265, + "num_tokens": 46567577.0, + "step": 5685 + }, + { + "entropy": 0.3987495958805084, + "epoch": 0.9083652618135377, + "grad_norm": 1.6939518451690674, + "learning_rate": 4.924196173463219e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8778408288955688, + "num_tokens": 46608537.0, + "step": 5690 + }, + { + "entropy": 0.36648149490356446, + "epoch": 0.9091634738186463, + "grad_norm": 1.3086787462234497, + "learning_rate": 4.924063775757399e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8856496095657349, + "num_tokens": 46649497.0, + "step": 5695 + }, + { + "entropy": 0.3645837366580963, + "epoch": 0.9099616858237548, + "grad_norm": 1.6085964441299438, + "learning_rate": 4.9239312649202694e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8886914491653443, + "num_tokens": 46690457.0, + "step": 5700 + }, + { + "entropy": 0.40077937245368955, + "epoch": 0.9107598978288634, + "grad_norm": 1.790521502494812, + "learning_rate": 4.923798640960163e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.871863043308258, + "num_tokens": 46731417.0, + "step": 5705 + }, + { + "entropy": 0.3492118358612061, + "epoch": 0.9115581098339719, + "grad_norm": 1.5939642190933228, + "learning_rate": 4.92366590388542e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8899153947830201, + "num_tokens": 46772377.0, + "step": 5710 + }, + { + "entropy": 0.3977007269859314, + "epoch": 0.9123563218390804, + "grad_norm": 1.5747590065002441, + "learning_rate": 4.923533053704388e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8730810046195984, + "num_tokens": 46813337.0, + "step": 5715 + }, + { + "entropy": 0.35613282918930056, + "epoch": 0.913154533844189, + "grad_norm": 1.5095136165618896, + "learning_rate": 4.92340009042542e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.8895549297332763, + "num_tokens": 46854297.0, + "step": 5720 + }, + { + "entropy": 0.3774961352348328, + "epoch": 0.9139527458492975, + "grad_norm": 1.7125418186187744, + "learning_rate": 4.923267014056878e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8814448356628418, + "num_tokens": 46895257.0, + "step": 5725 + }, + { + "entropy": 0.44346152544021605, + "epoch": 0.9147509578544061, + "grad_norm": 1.4897512197494507, + "learning_rate": 4.92313382460713e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8611808538436889, + "num_tokens": 46936217.0, + "step": 5730 + }, + { + "entropy": 0.3868241310119629, + "epoch": 0.9155491698595147, + "grad_norm": 1.4248846769332886, + "learning_rate": 4.923000522084551e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8827209115028382, + "num_tokens": 46977177.0, + "step": 5735 + }, + { + "entropy": 0.38528451323509216, + "epoch": 0.9163473818646233, + "grad_norm": 1.7550557851791382, + "learning_rate": 4.922867106497524e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8812140226364136, + "num_tokens": 47018137.0, + "step": 5740 + }, + { + "entropy": 0.357247930765152, + "epoch": 0.9171455938697318, + "grad_norm": 1.3566182851791382, + "learning_rate": 4.922733577854438e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.8896707653999328, + "num_tokens": 47059097.0, + "step": 5745 + }, + { + "entropy": 0.344160258769989, + "epoch": 0.9179438058748404, + "grad_norm": 1.5768409967422485, + "learning_rate": 4.9225999361636915e-06, + "loss": 0.3008, + "mean_token_accuracy": 0.8920184731483459, + "num_tokens": 47100057.0, + "step": 5750 + }, + { + "entropy": 0.35273231863975524, + "epoch": 0.9187420178799489, + "grad_norm": 1.4575929641723633, + "learning_rate": 4.922466181433686e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8913999795913696, + "num_tokens": 47141017.0, + "step": 5755 + }, + { + "entropy": 0.3938547372817993, + "epoch": 0.9195402298850575, + "grad_norm": 1.5926954746246338, + "learning_rate": 4.922332313672834e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8751298427581787, + "num_tokens": 47181977.0, + "step": 5760 + }, + { + "entropy": 0.40632343888282774, + "epoch": 0.920338441890166, + "grad_norm": 1.456726312637329, + "learning_rate": 4.922198332889553e-06, + "loss": 0.345, + "mean_token_accuracy": 0.876978600025177, + "num_tokens": 47222937.0, + "step": 5765 + }, + { + "entropy": 0.3740855813026428, + "epoch": 0.9211366538952745, + "grad_norm": 1.4371447563171387, + "learning_rate": 4.922064239092269e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.885735023021698, + "num_tokens": 47263897.0, + "step": 5770 + }, + { + "entropy": 0.3803277492523193, + "epoch": 0.9219348659003831, + "grad_norm": 1.611755609512329, + "learning_rate": 4.9219300322894125e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8782493829727173, + "num_tokens": 47304857.0, + "step": 5775 + }, + { + "entropy": 0.34562104344367983, + "epoch": 0.9227330779054917, + "grad_norm": 1.4883754253387451, + "learning_rate": 4.921795712489425e-06, + "loss": 0.2992, + "mean_token_accuracy": 0.8944500207901, + "num_tokens": 47345817.0, + "step": 5780 + }, + { + "entropy": 0.3530261814594269, + "epoch": 0.9235312899106003, + "grad_norm": 1.4684337377548218, + "learning_rate": 4.921661279700751e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.8889976978302002, + "num_tokens": 47386777.0, + "step": 5785 + }, + { + "entropy": 0.368403947353363, + "epoch": 0.9243295019157088, + "grad_norm": 1.5497897863388062, + "learning_rate": 4.921526733931846e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8838266372680664, + "num_tokens": 47427737.0, + "step": 5790 + }, + { + "entropy": 0.3576655864715576, + "epoch": 0.9251277139208174, + "grad_norm": 1.334320306777954, + "learning_rate": 4.9213920751911696e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8881598234176635, + "num_tokens": 47468543.0, + "step": 5795 + }, + { + "entropy": 0.35151341557502747, + "epoch": 0.9259259259259259, + "grad_norm": 1.5386505126953125, + "learning_rate": 4.921257303487189e-06, + "loss": 0.3055, + "mean_token_accuracy": 0.8897369027137756, + "num_tokens": 47509503.0, + "step": 5800 + }, + { + "epoch": 0.9259259259259259, + "eval_entropy": 0.39153709745407106, + "eval_loss": 0.3416091501712799, + "eval_mean_token_accuracy": 0.87915345287323, + "eval_num_tokens": 47509503.0, + "eval_runtime": 69.2155, + "eval_samples_per_second": 14.448, + "eval_steps_per_second": 1.806, + "step": 5800 + }, + { + "entropy": 0.3985960602760315, + "epoch": 0.9267241379310345, + "grad_norm": 1.565441370010376, + "learning_rate": 4.9211224188283804e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8771282076835633, + "num_tokens": 47550463.0, + "step": 5805 + }, + { + "entropy": 0.3795722007751465, + "epoch": 0.927522349936143, + "grad_norm": 1.5687583684921265, + "learning_rate": 4.9209874212232245e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8816348910331726, + "num_tokens": 47591423.0, + "step": 5810 + }, + { + "entropy": 0.3920009911060333, + "epoch": 0.9283205619412516, + "grad_norm": 1.607857346534729, + "learning_rate": 4.920852310680212e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8741375207901001, + "num_tokens": 47632383.0, + "step": 5815 + }, + { + "entropy": 0.39364256858825686, + "epoch": 0.9291187739463601, + "grad_norm": 1.6685956716537476, + "learning_rate": 4.920717087207838e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8809046626091004, + "num_tokens": 47673343.0, + "step": 5820 + }, + { + "entropy": 0.3499061703681946, + "epoch": 0.9299169859514687, + "grad_norm": 1.3277673721313477, + "learning_rate": 4.920581750814606e-06, + "loss": 0.3022, + "mean_token_accuracy": 0.8917477488517761, + "num_tokens": 47714303.0, + "step": 5825 + }, + { + "entropy": 0.38631321787834166, + "epoch": 0.9307151979565773, + "grad_norm": 1.5384690761566162, + "learning_rate": 4.9204463015090275e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8766877293586731, + "num_tokens": 47755263.0, + "step": 5830 + }, + { + "entropy": 0.37582285404205323, + "epoch": 0.9315134099616859, + "grad_norm": 1.4166748523712158, + "learning_rate": 4.920310739299619e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8838165760040283, + "num_tokens": 47796223.0, + "step": 5835 + }, + { + "entropy": 0.3942254841327667, + "epoch": 0.9323116219667944, + "grad_norm": 1.6179897785186768, + "learning_rate": 4.920175064194904e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8769045948982239, + "num_tokens": 47837183.0, + "step": 5840 + }, + { + "entropy": 0.3526209771633148, + "epoch": 0.933109833971903, + "grad_norm": 1.6264334917068481, + "learning_rate": 4.920039276203416e-06, + "loss": 0.3055, + "mean_token_accuracy": 0.8918890833854676, + "num_tokens": 47878143.0, + "step": 5845 + }, + { + "entropy": 0.3778443932533264, + "epoch": 0.9339080459770115, + "grad_norm": 1.6795600652694702, + "learning_rate": 4.919903375333693e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8826277613639831, + "num_tokens": 47919103.0, + "step": 5850 + }, + { + "entropy": 0.3853157699108124, + "epoch": 0.93470625798212, + "grad_norm": 1.6166398525238037, + "learning_rate": 4.919767361594281e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8792406916618347, + "num_tokens": 47960063.0, + "step": 5855 + }, + { + "entropy": 0.4113163113594055, + "epoch": 0.9355044699872286, + "grad_norm": 1.6133720874786377, + "learning_rate": 4.919631234993734e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8711008191108703, + "num_tokens": 48001023.0, + "step": 5860 + }, + { + "entropy": 0.37159796357154845, + "epoch": 0.9363026819923371, + "grad_norm": 1.6776436567306519, + "learning_rate": 4.91949499554061e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8851324081420898, + "num_tokens": 48041983.0, + "step": 5865 + }, + { + "entropy": 0.3699815392494202, + "epoch": 0.9371008939974457, + "grad_norm": 1.6786829233169556, + "learning_rate": 4.919358643243478e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.8858237981796264, + "num_tokens": 48082943.0, + "step": 5870 + }, + { + "entropy": 0.3756616234779358, + "epoch": 0.9378991060025543, + "grad_norm": 1.6404286623001099, + "learning_rate": 4.919222178110911e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8846615314483642, + "num_tokens": 48123903.0, + "step": 5875 + }, + { + "entropy": 0.33975174427032473, + "epoch": 0.9386973180076629, + "grad_norm": 1.3744854927062988, + "learning_rate": 4.919085600151493e-06, + "loss": 0.2995, + "mean_token_accuracy": 0.8948165059089661, + "num_tokens": 48164863.0, + "step": 5880 + }, + { + "entropy": 0.38145941495895386, + "epoch": 0.9394955300127714, + "grad_norm": 1.4947644472122192, + "learning_rate": 4.91894890937381e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8807326078414917, + "num_tokens": 48205823.0, + "step": 5885 + }, + { + "entropy": 0.361408406496048, + "epoch": 0.94029374201788, + "grad_norm": 1.532977819442749, + "learning_rate": 4.918812105786457e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8895909547805786, + "num_tokens": 48246783.0, + "step": 5890 + }, + { + "entropy": 0.35269999504089355, + "epoch": 0.9410919540229885, + "grad_norm": 1.704317331314087, + "learning_rate": 4.918675189398039e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8849400162696839, + "num_tokens": 48287743.0, + "step": 5895 + }, + { + "entropy": 0.4164234399795532, + "epoch": 0.941890166028097, + "grad_norm": 1.43415105342865, + "learning_rate": 4.918538160217165e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8696430563926697, + "num_tokens": 48328703.0, + "step": 5900 + }, + { + "entropy": 0.40871328115463257, + "epoch": 0.9426883780332056, + "grad_norm": 1.314746379852295, + "learning_rate": 4.91840101825245e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8742193460464478, + "num_tokens": 48369663.0, + "step": 5905 + }, + { + "entropy": 0.37529211640357973, + "epoch": 0.9434865900383141, + "grad_norm": 1.604024052619934, + "learning_rate": 4.918263763512521e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8838613867759705, + "num_tokens": 48410623.0, + "step": 5910 + }, + { + "entropy": 0.3645563364028931, + "epoch": 0.9442848020434227, + "grad_norm": 1.5141392946243286, + "learning_rate": 4.918126396006006e-06, + "loss": 0.321, + "mean_token_accuracy": 0.885771381855011, + "num_tokens": 48451583.0, + "step": 5915 + }, + { + "entropy": 0.38649824261665344, + "epoch": 0.9450830140485313, + "grad_norm": 1.6629687547683716, + "learning_rate": 4.917988915741546e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8791480541229248, + "num_tokens": 48492543.0, + "step": 5920 + }, + { + "entropy": 0.34921335577964785, + "epoch": 0.9458812260536399, + "grad_norm": 1.4933561086654663, + "learning_rate": 4.9178513227277845e-06, + "loss": 0.303, + "mean_token_accuracy": 0.8922902464866638, + "num_tokens": 48533503.0, + "step": 5925 + }, + { + "entropy": 0.3478983163833618, + "epoch": 0.9466794380587484, + "grad_norm": 1.5103713274002075, + "learning_rate": 4.9177136169733745e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.890423309803009, + "num_tokens": 48574463.0, + "step": 5930 + }, + { + "entropy": 0.4012789189815521, + "epoch": 0.947477650063857, + "grad_norm": 1.7785980701446533, + "learning_rate": 4.917575798486975e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8741880536079407, + "num_tokens": 48615423.0, + "step": 5935 + }, + { + "entropy": 0.3763855755329132, + "epoch": 0.9482758620689655, + "grad_norm": 1.6077227592468262, + "learning_rate": 4.9174378672772525e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8826831579208374, + "num_tokens": 48656383.0, + "step": 5940 + }, + { + "entropy": 0.3697520852088928, + "epoch": 0.9490740740740741, + "grad_norm": 1.4580429792404175, + "learning_rate": 4.91729982335288e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8852989912033081, + "num_tokens": 48697343.0, + "step": 5945 + }, + { + "entropy": 0.37058043479919434, + "epoch": 0.9498722860791826, + "grad_norm": 1.444654107093811, + "learning_rate": 4.91716166672254e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8842874765396118, + "num_tokens": 48738303.0, + "step": 5950 + }, + { + "entropy": 0.34524454474449157, + "epoch": 0.9506704980842912, + "grad_norm": 1.448460578918457, + "learning_rate": 4.917023397394919e-06, + "loss": 0.2962, + "mean_token_accuracy": 0.8934790849685669, + "num_tokens": 48779263.0, + "step": 5955 + }, + { + "entropy": 0.37682254910469054, + "epoch": 0.9514687100893997, + "grad_norm": 1.5126111507415771, + "learning_rate": 4.916885015378712e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8782764315605164, + "num_tokens": 48820223.0, + "step": 5960 + }, + { + "entropy": 0.3758767068386078, + "epoch": 0.9522669220945083, + "grad_norm": 1.451427936553955, + "learning_rate": 4.9167465206826205e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8802876353263855, + "num_tokens": 48861183.0, + "step": 5965 + }, + { + "entropy": 0.3713228702545166, + "epoch": 0.9530651340996169, + "grad_norm": 1.577522873878479, + "learning_rate": 4.9166079133153545e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8837321996688843, + "num_tokens": 48902143.0, + "step": 5970 + }, + { + "entropy": 0.36461412310600283, + "epoch": 0.9538633461047255, + "grad_norm": 1.4041016101837158, + "learning_rate": 4.916469193285629e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8864709377288819, + "num_tokens": 48943103.0, + "step": 5975 + }, + { + "entropy": 0.3806270360946655, + "epoch": 0.954661558109834, + "grad_norm": 1.4749987125396729, + "learning_rate": 4.916330360602168e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8856003403663635, + "num_tokens": 48984063.0, + "step": 5980 + }, + { + "entropy": 0.3979497790336609, + "epoch": 0.9554597701149425, + "grad_norm": 1.6750006675720215, + "learning_rate": 4.916191415273702e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8770854830741882, + "num_tokens": 49025023.0, + "step": 5985 + }, + { + "entropy": 0.39571131467819215, + "epoch": 0.9562579821200511, + "grad_norm": 1.5523713827133179, + "learning_rate": 4.916052357308968e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8779788494110108, + "num_tokens": 49065983.0, + "step": 5990 + }, + { + "entropy": 0.3467148780822754, + "epoch": 0.9570561941251596, + "grad_norm": 1.393660306930542, + "learning_rate": 4.91591318671671e-06, + "loss": 0.2973, + "mean_token_accuracy": 0.8936139225959778, + "num_tokens": 49106943.0, + "step": 5995 + }, + { + "entropy": 0.3784030795097351, + "epoch": 0.9578544061302682, + "grad_norm": 1.6584866046905518, + "learning_rate": 4.91577390350568e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8783522844314575, + "num_tokens": 49147903.0, + "step": 6000 + }, + { + "epoch": 0.9578544061302682, + "eval_entropy": 0.38134945607185367, + "eval_loss": 0.34127527475357056, + "eval_mean_token_accuracy": 0.8793023900985718, + "eval_num_tokens": 49147903.0, + "eval_runtime": 69.2337, + "eval_samples_per_second": 14.444, + "eval_steps_per_second": 1.805, + "step": 6000 + }, + { + "entropy": 0.3653820753097534, + "epoch": 0.9586526181353767, + "grad_norm": 1.5888646841049194, + "learning_rate": 4.9156345076846355e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8857032895088196, + "num_tokens": 49188863.0, + "step": 6005 + }, + { + "entropy": 0.38212187886238097, + "epoch": 0.9594508301404853, + "grad_norm": 1.6170986890792847, + "learning_rate": 4.915494999262345e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.878742003440857, + "num_tokens": 49229823.0, + "step": 6010 + }, + { + "entropy": 0.41390907764434814, + "epoch": 0.9602490421455939, + "grad_norm": 1.6242729425430298, + "learning_rate": 4.9153553782475785e-06, + "loss": 0.372, + "mean_token_accuracy": 0.870467746257782, + "num_tokens": 49270783.0, + "step": 6015 + }, + { + "entropy": 0.41197664141654966, + "epoch": 0.9610472541507025, + "grad_norm": 1.510725498199463, + "learning_rate": 4.9152156446491165e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8672952890396118, + "num_tokens": 49311743.0, + "step": 6020 + }, + { + "entropy": 0.3467439472675323, + "epoch": 0.961845466155811, + "grad_norm": 1.3229261636734009, + "learning_rate": 4.9150757984757465e-06, + "loss": 0.301, + "mean_token_accuracy": 0.893600058555603, + "num_tokens": 49352703.0, + "step": 6025 + }, + { + "entropy": 0.41660587191581727, + "epoch": 0.9626436781609196, + "grad_norm": 1.7598029375076294, + "learning_rate": 4.9149358397362625e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8731543898582459, + "num_tokens": 49393663.0, + "step": 6030 + }, + { + "entropy": 0.34480465650558473, + "epoch": 0.9634418901660281, + "grad_norm": 1.4702306985855103, + "learning_rate": 4.914795768439465e-06, + "loss": 0.3007, + "mean_token_accuracy": 0.8925211548805236, + "num_tokens": 49434623.0, + "step": 6035 + }, + { + "entropy": 0.4240105390548706, + "epoch": 0.9642401021711366, + "grad_norm": 1.5750335454940796, + "learning_rate": 4.914655584594163e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8653579473495483, + "num_tokens": 49475583.0, + "step": 6040 + }, + { + "entropy": 0.3583949863910675, + "epoch": 0.9650383141762452, + "grad_norm": 1.5241199731826782, + "learning_rate": 4.91451528820917e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8832314372062683, + "num_tokens": 49516543.0, + "step": 6045 + }, + { + "entropy": 0.39159261584281924, + "epoch": 0.9658365261813537, + "grad_norm": 1.463606595993042, + "learning_rate": 4.91437487929331e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8794732451438904, + "num_tokens": 49557503.0, + "step": 6050 + }, + { + "entropy": 0.34033394455909727, + "epoch": 0.9666347381864623, + "grad_norm": 1.3643134832382202, + "learning_rate": 4.914234357855413e-06, + "loss": 0.2878, + "mean_token_accuracy": 0.8985254287719726, + "num_tokens": 49598463.0, + "step": 6055 + }, + { + "entropy": 0.38294048309326173, + "epoch": 0.9674329501915708, + "grad_norm": 1.5533177852630615, + "learning_rate": 4.914093723904313e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8760846972465515, + "num_tokens": 49639423.0, + "step": 6060 + }, + { + "entropy": 0.38406974673271177, + "epoch": 0.9682311621966795, + "grad_norm": 1.681125521659851, + "learning_rate": 4.913952977448856e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8754361867904663, + "num_tokens": 49680383.0, + "step": 6065 + }, + { + "entropy": 0.3886194467544556, + "epoch": 0.969029374201788, + "grad_norm": 1.4837889671325684, + "learning_rate": 4.9138121184978915e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8797842979431152, + "num_tokens": 49721343.0, + "step": 6070 + }, + { + "entropy": 0.3772536039352417, + "epoch": 0.9698275862068966, + "grad_norm": 1.5158101320266724, + "learning_rate": 4.913671147060276e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8815980911254883, + "num_tokens": 49762303.0, + "step": 6075 + }, + { + "entropy": 0.37093993425369265, + "epoch": 0.9706257982120051, + "grad_norm": 1.58279550075531, + "learning_rate": 4.9135300631448765e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8813050985336304, + "num_tokens": 49803263.0, + "step": 6080 + }, + { + "entropy": 0.38539561033248904, + "epoch": 0.9714240102171137, + "grad_norm": 1.4010246992111206, + "learning_rate": 4.913388866760565e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8799989581108093, + "num_tokens": 49844223.0, + "step": 6085 + }, + { + "entropy": 0.37510964274406433, + "epoch": 0.9722222222222222, + "grad_norm": 1.5841959714889526, + "learning_rate": 4.913247557916217e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8827460169792175, + "num_tokens": 49885183.0, + "step": 6090 + }, + { + "entropy": 0.33448782563209534, + "epoch": 0.9730204342273308, + "grad_norm": 1.6392887830734253, + "learning_rate": 4.9131061366207225e-06, + "loss": 0.2976, + "mean_token_accuracy": 0.8942730307579041, + "num_tokens": 49926143.0, + "step": 6095 + }, + { + "entropy": 0.3597862184047699, + "epoch": 0.9738186462324393, + "grad_norm": 1.2924385070800781, + "learning_rate": 4.912964602882973e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.8883277893066406, + "num_tokens": 49967103.0, + "step": 6100 + }, + { + "entropy": 0.36224318742752076, + "epoch": 0.9746168582375478, + "grad_norm": 1.5063951015472412, + "learning_rate": 4.9128229567118664e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8854564070701599, + "num_tokens": 50008063.0, + "step": 6105 + }, + { + "entropy": 0.3745680212974548, + "epoch": 0.9754150702426565, + "grad_norm": 1.2758285999298096, + "learning_rate": 4.912681198116314e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.88257395029068, + "num_tokens": 50049023.0, + "step": 6110 + }, + { + "entropy": 0.3942704081535339, + "epoch": 0.976213282247765, + "grad_norm": 1.7089884281158447, + "learning_rate": 4.912539327105228e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8745615720748902, + "num_tokens": 50089983.0, + "step": 6115 + }, + { + "entropy": 0.3974311351776123, + "epoch": 0.9770114942528736, + "grad_norm": 1.4831726551055908, + "learning_rate": 4.912397343687528e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8768528819084167, + "num_tokens": 50130943.0, + "step": 6120 + }, + { + "entropy": 0.40303121209144593, + "epoch": 0.9778097062579821, + "grad_norm": 1.7240445613861084, + "learning_rate": 4.912255247872145e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8710102081298828, + "num_tokens": 50171903.0, + "step": 6125 + }, + { + "entropy": 0.41418219804763795, + "epoch": 0.9786079182630907, + "grad_norm": 1.6801888942718506, + "learning_rate": 4.9121130396680146e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8712199687957763, + "num_tokens": 50212863.0, + "step": 6130 + }, + { + "entropy": 0.37163949608802793, + "epoch": 0.9794061302681992, + "grad_norm": 1.6908286809921265, + "learning_rate": 4.911970719084077e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8853805422782898, + "num_tokens": 50253823.0, + "step": 6135 + }, + { + "entropy": 0.4254068911075592, + "epoch": 0.9802043422733078, + "grad_norm": 1.6032263040542603, + "learning_rate": 4.911828286129284e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8682022571563721, + "num_tokens": 50294783.0, + "step": 6140 + }, + { + "entropy": 0.38155118823051454, + "epoch": 0.9810025542784163, + "grad_norm": 1.4514435529708862, + "learning_rate": 4.911685740812592e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8800020813941956, + "num_tokens": 50335743.0, + "step": 6145 + }, + { + "entropy": 0.38690811991691587, + "epoch": 0.9818007662835249, + "grad_norm": 1.5835672616958618, + "learning_rate": 4.911543083142963e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.882764458656311, + "num_tokens": 50376703.0, + "step": 6150 + }, + { + "entropy": 0.40217992663383484, + "epoch": 0.9825989782886334, + "grad_norm": 1.369538426399231, + "learning_rate": 4.91140031312937e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8751148462295533, + "num_tokens": 50417663.0, + "step": 6155 + }, + { + "entropy": 0.4109044909477234, + "epoch": 0.9833971902937421, + "grad_norm": 1.5834952592849731, + "learning_rate": 4.911257430780789e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8720200419425964, + "num_tokens": 50458623.0, + "step": 6160 + }, + { + "entropy": 0.3819911301136017, + "epoch": 0.9841954022988506, + "grad_norm": 1.4536290168762207, + "learning_rate": 4.911114436106207e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8796203851699829, + "num_tokens": 50499583.0, + "step": 6165 + }, + { + "entropy": 0.38514203429222105, + "epoch": 0.9849936143039592, + "grad_norm": 1.4666593074798584, + "learning_rate": 4.9109713291146134e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8806746363639831, + "num_tokens": 50540543.0, + "step": 6170 + }, + { + "entropy": 0.39998855590820315, + "epoch": 0.9857918263090677, + "grad_norm": 1.4971364736557007, + "learning_rate": 4.910828109815009e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8744049787521362, + "num_tokens": 50581503.0, + "step": 6175 + }, + { + "entropy": 0.3624740481376648, + "epoch": 0.9865900383141762, + "grad_norm": 1.5488523244857788, + "learning_rate": 4.9106847782164e-06, + "loss": 0.3136, + "mean_token_accuracy": 0.8884861707687378, + "num_tokens": 50622463.0, + "step": 6180 + }, + { + "entropy": 0.40431431531906126, + "epoch": 0.9873882503192848, + "grad_norm": 1.4721726179122925, + "learning_rate": 4.910541334327798e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8711833119392395, + "num_tokens": 50663423.0, + "step": 6185 + }, + { + "entropy": 0.3674971103668213, + "epoch": 0.9881864623243933, + "grad_norm": 1.5480612516403198, + "learning_rate": 4.910397778158226e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.882783031463623, + "num_tokens": 50704383.0, + "step": 6190 + }, + { + "entropy": 0.4075448215007782, + "epoch": 0.9889846743295019, + "grad_norm": 1.640990138053894, + "learning_rate": 4.91025410971671e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8722677826881409, + "num_tokens": 50745343.0, + "step": 6195 + }, + { + "entropy": 0.380616694688797, + "epoch": 0.9897828863346104, + "grad_norm": 1.3555705547332764, + "learning_rate": 4.910110329012282e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8813807725906372, + "num_tokens": 50786303.0, + "step": 6200 + }, + { + "epoch": 0.9897828863346104, + "eval_entropy": 0.39050144743919374, + "eval_loss": 0.3402261435985565, + "eval_mean_token_accuracy": 0.8799290285110474, + "eval_num_tokens": 50786303.0, + "eval_runtime": 69.2249, + "eval_samples_per_second": 14.446, + "eval_steps_per_second": 1.806, + "step": 6200 + } + ], + "logging_steps": 5, + "max_steps": 62640, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1071958288148756e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}