{ "best_global_step": 6200, "best_metric": 0.3402261435985565, "best_model_checkpoint": "./sft_model/checkpoint-6200", "epoch": 0.9897828863346104, "eval_steps": 200, "global_step": 6200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.2782653570175171, "epoch": 0.0007982120051085569, "grad_norm": 11.474634170532227, "learning_rate": 4.999999962269939e-06, "loss": 1.5842, "mean_token_accuracy": 0.7387546181678772, "num_tokens": 40960.0, "step": 5 }, { "entropy": 0.6856095910072326, "epoch": 0.0015964240102171138, "grad_norm": 17.486251831054688, "learning_rate": 4.99999980899157e-06, "loss": 1.2661, "mean_token_accuracy": 0.7310832023620606, "num_tokens": 81920.0, "step": 10 }, { "entropy": 0.5054598093032837, "epoch": 0.0023946360153256703, "grad_norm": 4.929881572723389, "learning_rate": 4.999999537806773e-06, "loss": 0.9656, "mean_token_accuracy": 0.7575001239776611, "num_tokens": 122880.0, "step": 15 }, { "entropy": 0.5710949540138245, "epoch": 0.0031928480204342275, "grad_norm": 2.9681556224823, "learning_rate": 4.999999148715565e-06, "loss": 0.7784, "mean_token_accuracy": 0.7802138924598694, "num_tokens": 163840.0, "step": 20 }, { "entropy": 0.6400085330009461, "epoch": 0.003991060025542784, "grad_norm": 2.278716564178467, "learning_rate": 4.999998641717971e-06, "loss": 0.6463, "mean_token_accuracy": 0.8072834253311157, "num_tokens": 204800.0, "step": 25 }, { "entropy": 0.5254894733428955, "epoch": 0.004789272030651341, "grad_norm": 2.6655399799346924, "learning_rate": 4.999998016814023e-06, "loss": 0.631, "mean_token_accuracy": 0.8123680472373962, "num_tokens": 245760.0, "step": 30 }, { "entropy": 0.6046146094799042, "epoch": 0.005587484035759898, "grad_norm": 2.30499005317688, "learning_rate": 4.99999727400376e-06, "loss": 0.6116, "mean_token_accuracy": 0.8119417071342468, "num_tokens": 286720.0, "step": 35 }, { "entropy": 0.532450532913208, "epoch": 0.006385696040868455, "grad_norm": 1.6848894357681274, "learning_rate": 4.999996413287229e-06, "loss": 0.5343, "mean_token_accuracy": 0.834265160560608, "num_tokens": 327680.0, "step": 40 }, { "entropy": 0.5985349774360657, "epoch": 0.007183908045977011, "grad_norm": 2.2001168727874756, "learning_rate": 4.999995434664483e-06, "loss": 0.605, "mean_token_accuracy": 0.8122081875801086, "num_tokens": 368640.0, "step": 45 }, { "entropy": 0.5565362870693207, "epoch": 0.007982120051085569, "grad_norm": 1.8532217741012573, "learning_rate": 4.9999943381355846e-06, "loss": 0.5568, "mean_token_accuracy": 0.8251007676124573, "num_tokens": 409600.0, "step": 50 }, { "entropy": 0.5010187983512878, "epoch": 0.008780332056194126, "grad_norm": 1.9065581560134888, "learning_rate": 4.999993123700602e-06, "loss": 0.4856, "mean_token_accuracy": 0.8482044339179993, "num_tokens": 450560.0, "step": 55 }, { "entropy": 0.5530545592308045, "epoch": 0.009578544061302681, "grad_norm": 1.9626890420913696, "learning_rate": 4.999991791359612e-06, "loss": 0.5356, "mean_token_accuracy": 0.8315532445907593, "num_tokens": 491520.0, "step": 60 }, { "entropy": 0.5973767757415771, "epoch": 0.010376756066411238, "grad_norm": 1.920264482498169, "learning_rate": 4.999990341112699e-06, "loss": 0.5913, "mean_token_accuracy": 0.815071988105774, "num_tokens": 532480.0, "step": 65 }, { "entropy": 0.5213993549346924, "epoch": 0.011174968071519796, "grad_norm": 1.8560436964035034, "learning_rate": 4.999988772959954e-06, "loss": 0.5046, "mean_token_accuracy": 0.8370494961738586, "num_tokens": 573440.0, "step": 70 }, { "entropy": 0.47170872092247007, "epoch": 0.011973180076628353, "grad_norm": 1.7740520238876343, "learning_rate": 4.999987086901475e-06, "loss": 0.4683, "mean_token_accuracy": 0.8490700483322143, "num_tokens": 614400.0, "step": 75 }, { "entropy": 0.5295802831649781, "epoch": 0.01277139208173691, "grad_norm": 1.9278310537338257, "learning_rate": 4.999985282937368e-06, "loss": 0.4998, "mean_token_accuracy": 0.8388131141662598, "num_tokens": 655360.0, "step": 80 }, { "entropy": 0.5061579287052155, "epoch": 0.013569604086845466, "grad_norm": 1.9197410345077515, "learning_rate": 4.999983361067747e-06, "loss": 0.4897, "mean_token_accuracy": 0.8418465852737427, "num_tokens": 696320.0, "step": 85 }, { "entropy": 0.5168275713920594, "epoch": 0.014367816091954023, "grad_norm": 4.458969593048096, "learning_rate": 4.999981321292733e-06, "loss": 0.5083, "mean_token_accuracy": 0.8336622238159179, "num_tokens": 737280.0, "step": 90 }, { "entropy": 0.5189633309841156, "epoch": 0.01516602809706258, "grad_norm": 1.9136766195297241, "learning_rate": 4.9999791636124526e-06, "loss": 0.5065, "mean_token_accuracy": 0.8349774599075317, "num_tokens": 778240.0, "step": 95 }, { "entropy": 0.5578200101852417, "epoch": 0.015964240102171137, "grad_norm": 1.945987582206726, "learning_rate": 4.999976888027044e-06, "loss": 0.5269, "mean_token_accuracy": 0.8311297535896301, "num_tokens": 819200.0, "step": 100 }, { "entropy": 0.5309346675872803, "epoch": 0.016762452107279693, "grad_norm": 1.6795809268951416, "learning_rate": 4.999974494536648e-06, "loss": 0.5112, "mean_token_accuracy": 0.8364368081092834, "num_tokens": 860160.0, "step": 105 }, { "entropy": 0.4735103607177734, "epoch": 0.01756066411238825, "grad_norm": 2.085732936859131, "learning_rate": 4.9999719831414165e-06, "loss": 0.4598, "mean_token_accuracy": 0.848661732673645, "num_tokens": 901120.0, "step": 110 }, { "entropy": 0.46962087154388427, "epoch": 0.018358876117496807, "grad_norm": 1.8151161670684814, "learning_rate": 4.999969353841507e-06, "loss": 0.4247, "mean_token_accuracy": 0.8622438430786132, "num_tokens": 942080.0, "step": 115 }, { "entropy": 0.5223342657089234, "epoch": 0.019157088122605363, "grad_norm": 1.746271014213562, "learning_rate": 4.9999666066370854e-06, "loss": 0.4971, "mean_token_accuracy": 0.8385911345481872, "num_tokens": 983040.0, "step": 120 }, { "entropy": 0.5088116765022278, "epoch": 0.01995530012771392, "grad_norm": 1.7642600536346436, "learning_rate": 4.999963741528323e-06, "loss": 0.4981, "mean_token_accuracy": 0.836833918094635, "num_tokens": 1024000.0, "step": 125 }, { "entropy": 0.5108103513717651, "epoch": 0.020753512132822477, "grad_norm": 1.894156813621521, "learning_rate": 4.999960758515402e-06, "loss": 0.4773, "mean_token_accuracy": 0.8450562000274658, "num_tokens": 1064960.0, "step": 130 }, { "entropy": 0.525665408372879, "epoch": 0.021551724137931036, "grad_norm": 1.8148276805877686, "learning_rate": 4.999957657598509e-06, "loss": 0.5051, "mean_token_accuracy": 0.8344801664352417, "num_tokens": 1105920.0, "step": 135 }, { "entropy": 0.46289663314819335, "epoch": 0.02234993614303959, "grad_norm": 1.8011553287506104, "learning_rate": 4.9999544387778385e-06, "loss": 0.4506, "mean_token_accuracy": 0.8528064727783203, "num_tokens": 1146880.0, "step": 140 }, { "entropy": 0.47348416447639463, "epoch": 0.023148148148148147, "grad_norm": 1.655940055847168, "learning_rate": 4.999951102053593e-06, "loss": 0.4476, "mean_token_accuracy": 0.8520541548728943, "num_tokens": 1187840.0, "step": 145 }, { "entropy": 0.4965839982032776, "epoch": 0.023946360153256706, "grad_norm": 1.81589674949646, "learning_rate": 4.999947647425983e-06, "loss": 0.4634, "mean_token_accuracy": 0.8466312050819397, "num_tokens": 1228800.0, "step": 150 }, { "entropy": 0.4940677106380463, "epoch": 0.02474457215836526, "grad_norm": 1.7606313228607178, "learning_rate": 4.999944074895225e-06, "loss": 0.4595, "mean_token_accuracy": 0.847114372253418, "num_tokens": 1269760.0, "step": 155 }, { "entropy": 0.5418534994125366, "epoch": 0.02554278416347382, "grad_norm": 1.75408935546875, "learning_rate": 4.999940384461543e-06, "loss": 0.5244, "mean_token_accuracy": 0.8292249202728271, "num_tokens": 1310720.0, "step": 160 }, { "entropy": 0.4658185839653015, "epoch": 0.026340996168582376, "grad_norm": 1.655227541923523, "learning_rate": 4.999936576125173e-06, "loss": 0.4481, "mean_token_accuracy": 0.8518807768821717, "num_tokens": 1351680.0, "step": 165 }, { "entropy": 0.576075655221939, "epoch": 0.02713920817369093, "grad_norm": 2.072255849838257, "learning_rate": 4.999932649886349e-06, "loss": 0.554, "mean_token_accuracy": 0.8192096590995789, "num_tokens": 1392640.0, "step": 170 }, { "entropy": 0.4817675709724426, "epoch": 0.02793742017879949, "grad_norm": 2.084299087524414, "learning_rate": 4.999928605745321e-06, "loss": 0.4496, "mean_token_accuracy": 0.8512140274047851, "num_tokens": 1433600.0, "step": 175 }, { "entropy": 0.45817756056785586, "epoch": 0.028735632183908046, "grad_norm": 1.553299903869629, "learning_rate": 4.999924443702344e-06, "loss": 0.4381, "mean_token_accuracy": 0.8554870367050171, "num_tokens": 1474560.0, "step": 180 }, { "entropy": 0.5050954639911651, "epoch": 0.029533844189016605, "grad_norm": 1.8562591075897217, "learning_rate": 4.9999201637576775e-06, "loss": 0.4793, "mean_token_accuracy": 0.8387478709220886, "num_tokens": 1515520.0, "step": 185 }, { "entropy": 0.4602130055427551, "epoch": 0.03033205619412516, "grad_norm": 1.8530664443969727, "learning_rate": 4.999915765911592e-06, "loss": 0.4406, "mean_token_accuracy": 0.8530979156494141, "num_tokens": 1556480.0, "step": 190 }, { "entropy": 0.47528478503227234, "epoch": 0.031130268199233715, "grad_norm": 1.7237930297851562, "learning_rate": 4.9999112501643635e-06, "loss": 0.4387, "mean_token_accuracy": 0.8522473692893981, "num_tokens": 1597440.0, "step": 195 }, { "entropy": 0.48855978846549986, "epoch": 0.031928480204342274, "grad_norm": 1.7260383367538452, "learning_rate": 4.9999066165162755e-06, "loss": 0.4654, "mean_token_accuracy": 0.8458796501159668, "num_tokens": 1638400.0, "step": 200 }, { "epoch": 0.031928480204342274, "eval_entropy": 0.47576266503334047, "eval_loss": 0.45461931824684143, "eval_mean_token_accuracy": 0.8487958540916443, "eval_num_tokens": 1638400.0, "eval_runtime": 69.1687, "eval_samples_per_second": 14.457, "eval_steps_per_second": 1.807, "step": 200 }, { "entropy": 0.515596067905426, "epoch": 0.03272669220945083, "grad_norm": 2.013831377029419, "learning_rate": 4.999901864967621e-06, "loss": 0.4978, "mean_token_accuracy": 0.8359745025634766, "num_tokens": 1679360.0, "step": 205 }, { "entropy": 0.48475693464279174, "epoch": 0.033524904214559385, "grad_norm": 1.9115880727767944, "learning_rate": 4.999896995518698e-06, "loss": 0.4394, "mean_token_accuracy": 0.8532404661178589, "num_tokens": 1720320.0, "step": 210 }, { "entropy": 0.4396271646022797, "epoch": 0.034323116219667944, "grad_norm": 1.712744116783142, "learning_rate": 4.999892008169811e-06, "loss": 0.4015, "mean_token_accuracy": 0.8664462685585022, "num_tokens": 1761280.0, "step": 215 }, { "entropy": 0.5019235789775849, "epoch": 0.0351213282247765, "grad_norm": 1.7797718048095703, "learning_rate": 4.9998869029212766e-06, "loss": 0.4814, "mean_token_accuracy": 0.8428232192993164, "num_tokens": 1802240.0, "step": 220 }, { "entropy": 0.45359727144241335, "epoch": 0.035919540229885055, "grad_norm": 1.6519943475723267, "learning_rate": 4.999881679773414e-06, "loss": 0.4211, "mean_token_accuracy": 0.8586545705795288, "num_tokens": 1843200.0, "step": 225 }, { "entropy": 0.45751402378082273, "epoch": 0.036717752234993614, "grad_norm": 1.8015882968902588, "learning_rate": 4.999876338726552e-06, "loss": 0.4233, "mean_token_accuracy": 0.8589802742004394, "num_tokens": 1884160.0, "step": 230 }, { "entropy": 0.447160130739212, "epoch": 0.03751596424010217, "grad_norm": 1.7113579511642456, "learning_rate": 4.999870879781027e-06, "loss": 0.4196, "mean_token_accuracy": 0.8592816829681397, "num_tokens": 1925120.0, "step": 235 }, { "entropy": 0.4892503976821899, "epoch": 0.038314176245210725, "grad_norm": 2.0123822689056396, "learning_rate": 4.999865302937182e-06, "loss": 0.469, "mean_token_accuracy": 0.8455776095390319, "num_tokens": 1966080.0, "step": 240 }, { "entropy": 0.4409046471118927, "epoch": 0.039112388250319284, "grad_norm": 1.7804988622665405, "learning_rate": 4.999859608195366e-06, "loss": 0.4153, "mean_token_accuracy": 0.8591686248779297, "num_tokens": 2007040.0, "step": 245 }, { "entropy": 0.4628835916519165, "epoch": 0.03991060025542784, "grad_norm": 1.730065941810608, "learning_rate": 4.999853795555939e-06, "loss": 0.4414, "mean_token_accuracy": 0.8521682500839234, "num_tokens": 2048000.0, "step": 250 }, { "entropy": 0.40462678074836733, "epoch": 0.040708812260536395, "grad_norm": 1.8947077989578247, "learning_rate": 4.999847865019267e-06, "loss": 0.3738, "mean_token_accuracy": 0.8727653503417969, "num_tokens": 2088960.0, "step": 255 }, { "entropy": 0.4546548843383789, "epoch": 0.041507024265644954, "grad_norm": 1.7862297296524048, "learning_rate": 4.999841816585722e-06, "loss": 0.442, "mean_token_accuracy": 0.8530672907829284, "num_tokens": 2129490.0, "step": 260 }, { "entropy": 0.45019919872283937, "epoch": 0.04230523627075351, "grad_norm": 1.7621231079101562, "learning_rate": 4.999835650255683e-06, "loss": 0.418, "mean_token_accuracy": 0.8593084692955018, "num_tokens": 2170450.0, "step": 265 }, { "entropy": 0.5008151054382324, "epoch": 0.04310344827586207, "grad_norm": 1.917815923690796, "learning_rate": 4.99982936602954e-06, "loss": 0.472, "mean_token_accuracy": 0.8447394847869873, "num_tokens": 2211180.0, "step": 270 }, { "entropy": 0.4359456837177277, "epoch": 0.043901660280970624, "grad_norm": 1.7914767265319824, "learning_rate": 4.999822963907688e-06, "loss": 0.4118, "mean_token_accuracy": 0.8615276694297791, "num_tokens": 2252140.0, "step": 275 }, { "entropy": 0.4787565290927887, "epoch": 0.04469987228607918, "grad_norm": 2.0970892906188965, "learning_rate": 4.999816443890528e-06, "loss": 0.452, "mean_token_accuracy": 0.8491812825202942, "num_tokens": 2293100.0, "step": 280 }, { "entropy": 0.45131545066833495, "epoch": 0.04549808429118774, "grad_norm": 1.5809624195098877, "learning_rate": 4.99980980597847e-06, "loss": 0.4239, "mean_token_accuracy": 0.8572892904281616, "num_tokens": 2334060.0, "step": 285 }, { "entropy": 0.4229037821292877, "epoch": 0.046296296296296294, "grad_norm": 1.4807138442993164, "learning_rate": 4.999803050171935e-06, "loss": 0.3986, "mean_token_accuracy": 0.8665313005447388, "num_tokens": 2375020.0, "step": 290 }, { "entropy": 0.4446040511131287, "epoch": 0.04709450830140485, "grad_norm": 1.7885520458221436, "learning_rate": 4.999796176471343e-06, "loss": 0.4291, "mean_token_accuracy": 0.8560836672782898, "num_tokens": 2415980.0, "step": 295 }, { "entropy": 0.45139994025230407, "epoch": 0.04789272030651341, "grad_norm": 1.6485087871551514, "learning_rate": 4.999789184877129e-06, "loss": 0.4159, "mean_token_accuracy": 0.8609436988830567, "num_tokens": 2456940.0, "step": 300 }, { "entropy": 0.4461036443710327, "epoch": 0.048690932311621964, "grad_norm": 1.5833100080490112, "learning_rate": 4.999782075389732e-06, "loss": 0.4172, "mean_token_accuracy": 0.8589345335960388, "num_tokens": 2497900.0, "step": 305 }, { "entropy": 0.39054363369941714, "epoch": 0.04948914431673052, "grad_norm": 1.6196880340576172, "learning_rate": 4.9997748480096e-06, "loss": 0.3677, "mean_token_accuracy": 0.873320484161377, "num_tokens": 2538860.0, "step": 310 }, { "entropy": 0.49510942697525023, "epoch": 0.05028735632183908, "grad_norm": 1.8079206943511963, "learning_rate": 4.9997675027371855e-06, "loss": 0.4708, "mean_token_accuracy": 0.8449161767959594, "num_tokens": 2579820.0, "step": 315 }, { "entropy": 0.49750194549560545, "epoch": 0.05108556832694764, "grad_norm": 1.7544002532958984, "learning_rate": 4.999760039572952e-06, "loss": 0.4609, "mean_token_accuracy": 0.8446771740913391, "num_tokens": 2620780.0, "step": 320 }, { "entropy": 0.4802740037441254, "epoch": 0.05188378033205619, "grad_norm": 1.807607889175415, "learning_rate": 4.999752458517367e-06, "loss": 0.445, "mean_token_accuracy": 0.852498996257782, "num_tokens": 2661740.0, "step": 325 }, { "entropy": 0.45189476013183594, "epoch": 0.05268199233716475, "grad_norm": 1.7295165061950684, "learning_rate": 4.99974475957091e-06, "loss": 0.4218, "mean_token_accuracy": 0.8570088505744934, "num_tokens": 2702700.0, "step": 330 }, { "entropy": 0.4113829493522644, "epoch": 0.05348020434227331, "grad_norm": 3.3795394897460938, "learning_rate": 4.9997369427340635e-06, "loss": 0.3874, "mean_token_accuracy": 0.8682220339775085, "num_tokens": 2743660.0, "step": 335 }, { "entropy": 0.46611291766166685, "epoch": 0.05427841634738186, "grad_norm": 1.7831217050552368, "learning_rate": 4.999729008007319e-06, "loss": 0.4438, "mean_token_accuracy": 0.8525894761085511, "num_tokens": 2784620.0, "step": 340 }, { "entropy": 0.43543928265571596, "epoch": 0.05507662835249042, "grad_norm": 1.7297419309616089, "learning_rate": 4.9997209553911755e-06, "loss": 0.4031, "mean_token_accuracy": 0.8630126476287842, "num_tokens": 2825580.0, "step": 345 }, { "entropy": 0.46474372744560244, "epoch": 0.05587484035759898, "grad_norm": 1.8551281690597534, "learning_rate": 4.99971278488614e-06, "loss": 0.4417, "mean_token_accuracy": 0.849402928352356, "num_tokens": 2866540.0, "step": 350 }, { "entropy": 0.4573587000370026, "epoch": 0.05667305236270753, "grad_norm": 1.977996587753296, "learning_rate": 4.999704496492726e-06, "loss": 0.4469, "mean_token_accuracy": 0.8525743603706359, "num_tokens": 2907500.0, "step": 355 }, { "entropy": 0.4407190024852753, "epoch": 0.05747126436781609, "grad_norm": 1.7498115301132202, "learning_rate": 4.999696090211454e-06, "loss": 0.4168, "mean_token_accuracy": 0.8584057688713074, "num_tokens": 2948460.0, "step": 360 }, { "entropy": 0.43572052717208865, "epoch": 0.05826947637292465, "grad_norm": 1.7404518127441406, "learning_rate": 4.999687566042853e-06, "loss": 0.4029, "mean_token_accuracy": 0.8642658710479736, "num_tokens": 2989420.0, "step": 365 }, { "entropy": 0.43591675758361814, "epoch": 0.05906768837803321, "grad_norm": 1.9127355813980103, "learning_rate": 4.999678923987459e-06, "loss": 0.4124, "mean_token_accuracy": 0.8618336081504822, "num_tokens": 3030380.0, "step": 370 }, { "entropy": 0.48763387799263, "epoch": 0.05986590038314176, "grad_norm": 1.9815560579299927, "learning_rate": 4.999670164045816e-06, "loss": 0.4629, "mean_token_accuracy": 0.8467477440834046, "num_tokens": 3071340.0, "step": 375 }, { "entropy": 0.49417877197265625, "epoch": 0.06066411238825032, "grad_norm": 1.7125637531280518, "learning_rate": 4.9996612862184745e-06, "loss": 0.4605, "mean_token_accuracy": 0.8464985489845276, "num_tokens": 3112300.0, "step": 380 }, { "entropy": 0.4436693489551544, "epoch": 0.06146232439335888, "grad_norm": 1.6935256719589233, "learning_rate": 4.999652290505993e-06, "loss": 0.415, "mean_token_accuracy": 0.8586057186126709, "num_tokens": 3153260.0, "step": 385 }, { "entropy": 0.3957001864910126, "epoch": 0.06226053639846743, "grad_norm": 1.5968080759048462, "learning_rate": 4.999643176908937e-06, "loss": 0.368, "mean_token_accuracy": 0.8730635643005371, "num_tokens": 3194220.0, "step": 390 }, { "entropy": 0.4333923041820526, "epoch": 0.06305874840357599, "grad_norm": 1.6355855464935303, "learning_rate": 4.999633945427879e-06, "loss": 0.4084, "mean_token_accuracy": 0.8614879369735717, "num_tokens": 3235180.0, "step": 395 }, { "entropy": 0.4490614771842957, "epoch": 0.06385696040868455, "grad_norm": 1.7251191139221191, "learning_rate": 4.999624596063401e-06, "loss": 0.4125, "mean_token_accuracy": 0.8589560508728027, "num_tokens": 3276140.0, "step": 400 }, { "epoch": 0.06385696040868455, "eval_entropy": 0.456509140253067, "eval_loss": 0.42409759759902954, "eval_mean_token_accuracy": 0.8566836671829223, "eval_num_tokens": 3276140.0, "eval_runtime": 69.3986, "eval_samples_per_second": 14.41, "eval_steps_per_second": 1.801, "step": 400 }, { "entropy": 0.4575593590736389, "epoch": 0.06465517241379311, "grad_norm": 1.9416770935058594, "learning_rate": 4.9996151288160885e-06, "loss": 0.4348, "mean_token_accuracy": 0.852801513671875, "num_tokens": 3317100.0, "step": 405 }, { "entropy": 0.4455989181995392, "epoch": 0.06545338441890167, "grad_norm": 1.639709711074829, "learning_rate": 4.9996055436865395e-06, "loss": 0.4279, "mean_token_accuracy": 0.8566305279731751, "num_tokens": 3358060.0, "step": 410 }, { "entropy": 0.436484295129776, "epoch": 0.06625159642401021, "grad_norm": 1.7730813026428223, "learning_rate": 4.999595840675355e-06, "loss": 0.4092, "mean_token_accuracy": 0.8643952012062073, "num_tokens": 3399020.0, "step": 415 }, { "entropy": 0.4210900843143463, "epoch": 0.06704980842911877, "grad_norm": 1.8874180316925049, "learning_rate": 4.999586019783145e-06, "loss": 0.3929, "mean_token_accuracy": 0.8668930411338807, "num_tokens": 3439980.0, "step": 420 }, { "entropy": 0.4613493621349335, "epoch": 0.06784802043422733, "grad_norm": 1.9141099452972412, "learning_rate": 4.999576081010529e-06, "loss": 0.4333, "mean_token_accuracy": 0.8520909667015075, "num_tokens": 3480940.0, "step": 425 }, { "entropy": 0.4226214110851288, "epoch": 0.06864623243933589, "grad_norm": 1.8183785676956177, "learning_rate": 4.99956602435813e-06, "loss": 0.397, "mean_token_accuracy": 0.8657994747161866, "num_tokens": 3521900.0, "step": 430 }, { "entropy": 0.3875891506671906, "epoch": 0.06944444444444445, "grad_norm": 1.5938745737075806, "learning_rate": 4.999555849826582e-06, "loss": 0.3564, "mean_token_accuracy": 0.8773833990097046, "num_tokens": 3562860.0, "step": 435 }, { "entropy": 0.4186811327934265, "epoch": 0.070242656449553, "grad_norm": 1.7199757099151611, "learning_rate": 4.999545557416523e-06, "loss": 0.3958, "mean_token_accuracy": 0.8652471303939819, "num_tokens": 3603820.0, "step": 440 }, { "entropy": 0.43029149770736697, "epoch": 0.07104086845466155, "grad_norm": 1.851920247077942, "learning_rate": 4.9995351471286015e-06, "loss": 0.3945, "mean_token_accuracy": 0.8676915645599366, "num_tokens": 3644780.0, "step": 445 }, { "entropy": 0.45375868678092957, "epoch": 0.07183908045977011, "grad_norm": 1.766336441040039, "learning_rate": 4.9995246189634715e-06, "loss": 0.4266, "mean_token_accuracy": 0.8571653246879578, "num_tokens": 3685740.0, "step": 450 }, { "entropy": 0.4406971275806427, "epoch": 0.07263729246487867, "grad_norm": 1.7922120094299316, "learning_rate": 4.999513972921796e-06, "loss": 0.4145, "mean_token_accuracy": 0.8598119735717773, "num_tokens": 3726700.0, "step": 455 }, { "entropy": 0.3757154107093811, "epoch": 0.07343550446998723, "grad_norm": 1.6567347049713135, "learning_rate": 4.999503209004244e-06, "loss": 0.3465, "mean_token_accuracy": 0.8820300817489624, "num_tokens": 3767660.0, "step": 460 }, { "entropy": 0.4569710433483124, "epoch": 0.07423371647509579, "grad_norm": 1.678511142730713, "learning_rate": 4.9994923272114905e-06, "loss": 0.4329, "mean_token_accuracy": 0.8535709857940674, "num_tokens": 3808620.0, "step": 465 }, { "entropy": 0.4190040946006775, "epoch": 0.07503192848020435, "grad_norm": 1.7449942827224731, "learning_rate": 4.999481327544224e-06, "loss": 0.3764, "mean_token_accuracy": 0.8726458072662353, "num_tokens": 3849580.0, "step": 470 }, { "entropy": 0.4649412989616394, "epoch": 0.0758301404853129, "grad_norm": 1.9117767810821533, "learning_rate": 4.999470210003132e-06, "loss": 0.4461, "mean_token_accuracy": 0.8512534976005555, "num_tokens": 3890540.0, "step": 475 }, { "entropy": 0.4217049300670624, "epoch": 0.07662835249042145, "grad_norm": 1.6323946714401245, "learning_rate": 4.9994589745889155e-06, "loss": 0.3923, "mean_token_accuracy": 0.8682831287384033, "num_tokens": 3931500.0, "step": 480 }, { "entropy": 0.4169506013393402, "epoch": 0.07742656449553001, "grad_norm": 1.7119159698486328, "learning_rate": 4.9994476213022804e-06, "loss": 0.3859, "mean_token_accuracy": 0.8702028751373291, "num_tokens": 3972460.0, "step": 485 }, { "entropy": 0.4700320720672607, "epoch": 0.07822477650063857, "grad_norm": 1.8065766096115112, "learning_rate": 4.999436150143941e-06, "loss": 0.4473, "mean_token_accuracy": 0.8486562728881836, "num_tokens": 4013420.0, "step": 490 }, { "entropy": 0.5004218697547913, "epoch": 0.07902298850574713, "grad_norm": 1.7902945280075073, "learning_rate": 4.99942456111462e-06, "loss": 0.4784, "mean_token_accuracy": 0.8416624784469604, "num_tokens": 4054380.0, "step": 495 }, { "entropy": 0.38799951076507566, "epoch": 0.07982120051085569, "grad_norm": 1.724098563194275, "learning_rate": 4.999412854215044e-06, "loss": 0.3535, "mean_token_accuracy": 0.8762063622474671, "num_tokens": 4095340.0, "step": 500 }, { "entropy": 0.42835206985473634, "epoch": 0.08061941251596424, "grad_norm": 1.63278067111969, "learning_rate": 4.99940102944595e-06, "loss": 0.4044, "mean_token_accuracy": 0.8626661181449891, "num_tokens": 4136300.0, "step": 505 }, { "entropy": 0.4672918081283569, "epoch": 0.08141762452107279, "grad_norm": 1.9030835628509521, "learning_rate": 4.999389086808082e-06, "loss": 0.447, "mean_token_accuracy": 0.8479044079780579, "num_tokens": 4177260.0, "step": 510 }, { "entropy": 0.43039371371269225, "epoch": 0.08221583652618135, "grad_norm": 1.7228889465332031, "learning_rate": 4.99937702630219e-06, "loss": 0.3903, "mean_token_accuracy": 0.8647573947906494, "num_tokens": 4218220.0, "step": 515 }, { "entropy": 0.44171770811080935, "epoch": 0.08301404853128991, "grad_norm": 1.7838937044143677, "learning_rate": 4.999364847929033e-06, "loss": 0.4037, "mean_token_accuracy": 0.8621063232421875, "num_tokens": 4259180.0, "step": 520 }, { "entropy": 0.44105868935585024, "epoch": 0.08381226053639847, "grad_norm": 1.8959293365478516, "learning_rate": 4.9993525516893775e-06, "loss": 0.427, "mean_token_accuracy": 0.8541361808776855, "num_tokens": 4300140.0, "step": 525 }, { "entropy": 0.46709821820259095, "epoch": 0.08461047254150703, "grad_norm": 1.7269469499588013, "learning_rate": 4.999340137583995e-06, "loss": 0.4238, "mean_token_accuracy": 0.8574911475181579, "num_tokens": 4341100.0, "step": 530 }, { "entropy": 0.5288523077964783, "epoch": 0.08540868454661558, "grad_norm": 1.8445000648498535, "learning_rate": 4.999327605613668e-06, "loss": 0.4833, "mean_token_accuracy": 0.839403486251831, "num_tokens": 4382060.0, "step": 535 }, { "entropy": 0.42816867828369143, "epoch": 0.08620689655172414, "grad_norm": 1.7641640901565552, "learning_rate": 4.999314955779183e-06, "loss": 0.3985, "mean_token_accuracy": 0.8624008417129516, "num_tokens": 4423020.0, "step": 540 }, { "entropy": 0.40726423263549805, "epoch": 0.08700510855683269, "grad_norm": 1.865195870399475, "learning_rate": 4.999302188081338e-06, "loss": 0.3883, "mean_token_accuracy": 0.868608021736145, "num_tokens": 4463980.0, "step": 545 }, { "entropy": 0.4436700284481049, "epoch": 0.08780332056194125, "grad_norm": 1.7468849420547485, "learning_rate": 4.999289302520932e-06, "loss": 0.4059, "mean_token_accuracy": 0.8610162615776062, "num_tokens": 4504795.0, "step": 550 }, { "entropy": 0.4452336847782135, "epoch": 0.0886015325670498, "grad_norm": 1.5321035385131836, "learning_rate": 4.999276299098779e-06, "loss": 0.4051, "mean_token_accuracy": 0.8643596410751343, "num_tokens": 4545755.0, "step": 555 }, { "entropy": 0.3910303771495819, "epoch": 0.08939974457215837, "grad_norm": 1.6199473142623901, "learning_rate": 4.999263177815693e-06, "loss": 0.3656, "mean_token_accuracy": 0.874867069721222, "num_tokens": 4586715.0, "step": 560 }, { "entropy": 0.44607588052749636, "epoch": 0.09019795657726692, "grad_norm": 1.7175859212875366, "learning_rate": 4.999249938672503e-06, "loss": 0.4234, "mean_token_accuracy": 0.8542021036148071, "num_tokens": 4627675.0, "step": 565 }, { "entropy": 0.4354964554309845, "epoch": 0.09099616858237548, "grad_norm": 2.050355911254883, "learning_rate": 4.9992365816700394e-06, "loss": 0.4081, "mean_token_accuracy": 0.8629257440567016, "num_tokens": 4668635.0, "step": 570 }, { "entropy": 0.48360870480537416, "epoch": 0.09179438058748404, "grad_norm": 1.6667331457138062, "learning_rate": 4.999223106809142e-06, "loss": 0.4452, "mean_token_accuracy": 0.8496447801589966, "num_tokens": 4709595.0, "step": 575 }, { "entropy": 0.46697754263877866, "epoch": 0.09259259259259259, "grad_norm": 1.8308717012405396, "learning_rate": 4.999209514090658e-06, "loss": 0.4385, "mean_token_accuracy": 0.8522969365119935, "num_tokens": 4750555.0, "step": 580 }, { "entropy": 0.431224399805069, "epoch": 0.09339080459770115, "grad_norm": 1.7282050848007202, "learning_rate": 4.999195803515444e-06, "loss": 0.4063, "mean_token_accuracy": 0.8619638442993164, "num_tokens": 4791515.0, "step": 585 }, { "entropy": 0.48489547371864317, "epoch": 0.0941890166028097, "grad_norm": 1.8322498798370361, "learning_rate": 4.99918197508436e-06, "loss": 0.461, "mean_token_accuracy": 0.8451943159103393, "num_tokens": 4832475.0, "step": 590 }, { "entropy": 0.42147684693336485, "epoch": 0.09498722860791826, "grad_norm": 1.637582540512085, "learning_rate": 4.999168028798277e-06, "loss": 0.3821, "mean_token_accuracy": 0.8680244445800781, "num_tokens": 4873435.0, "step": 595 }, { "entropy": 0.4362496554851532, "epoch": 0.09578544061302682, "grad_norm": 1.850331425666809, "learning_rate": 4.999153964658071e-06, "loss": 0.4072, "mean_token_accuracy": 0.8630183935165405, "num_tokens": 4914395.0, "step": 600 }, { "epoch": 0.09578544061302682, "eval_entropy": 0.44316129422187805, "eval_loss": 0.4095243811607361, "eval_mean_token_accuracy": 0.8603043389320374, "eval_num_tokens": 4914395.0, "eval_runtime": 69.1898, "eval_samples_per_second": 14.453, "eval_steps_per_second": 1.807, "step": 600 }, { "entropy": 0.44732285141944883, "epoch": 0.09658365261813538, "grad_norm": 1.751281499862671, "learning_rate": 4.999139782664627e-06, "loss": 0.4144, "mean_token_accuracy": 0.8580738425254821, "num_tokens": 4955355.0, "step": 605 }, { "entropy": 0.44980551600456237, "epoch": 0.09738186462324393, "grad_norm": 1.8443349599838257, "learning_rate": 4.999125482818837e-06, "loss": 0.4414, "mean_token_accuracy": 0.8509649872779846, "num_tokens": 4996315.0, "step": 610 }, { "entropy": 0.4144567608833313, "epoch": 0.09818007662835249, "grad_norm": 1.7080366611480713, "learning_rate": 4.9991110651216e-06, "loss": 0.3745, "mean_token_accuracy": 0.8717723250389099, "num_tokens": 5037275.0, "step": 615 }, { "entropy": 0.38187943696975707, "epoch": 0.09897828863346105, "grad_norm": 1.6727166175842285, "learning_rate": 4.999096529573822e-06, "loss": 0.3472, "mean_token_accuracy": 0.8789412021636963, "num_tokens": 5078235.0, "step": 620 }, { "entropy": 0.41196991205215455, "epoch": 0.0997765006385696, "grad_norm": 1.7526803016662598, "learning_rate": 4.999081876176418e-06, "loss": 0.3794, "mean_token_accuracy": 0.8686864018440247, "num_tokens": 5119195.0, "step": 625 }, { "entropy": 0.4536400198936462, "epoch": 0.10057471264367816, "grad_norm": 1.753389596939087, "learning_rate": 4.999067104930308e-06, "loss": 0.4134, "mean_token_accuracy": 0.8584004759788513, "num_tokens": 5160105.0, "step": 630 }, { "entropy": 0.42510269284248353, "epoch": 0.10137292464878672, "grad_norm": 1.7358981370925903, "learning_rate": 4.999052215836423e-06, "loss": 0.3947, "mean_token_accuracy": 0.8650004863739014, "num_tokens": 5201065.0, "step": 635 }, { "entropy": 0.4355582058429718, "epoch": 0.10217113665389528, "grad_norm": 1.642331600189209, "learning_rate": 4.999037208895699e-06, "loss": 0.4238, "mean_token_accuracy": 0.8562347054481506, "num_tokens": 5242025.0, "step": 640 }, { "entropy": 0.44882028698921206, "epoch": 0.10296934865900383, "grad_norm": 6.373381614685059, "learning_rate": 4.9990220841090775e-06, "loss": 0.4207, "mean_token_accuracy": 0.8576697468757629, "num_tokens": 5282985.0, "step": 645 }, { "entropy": 0.408430814743042, "epoch": 0.10376756066411238, "grad_norm": 1.6307820081710815, "learning_rate": 4.999006841477512e-06, "loss": 0.3734, "mean_token_accuracy": 0.8739194750785828, "num_tokens": 5323945.0, "step": 650 }, { "entropy": 0.40124152302742006, "epoch": 0.10456577266922094, "grad_norm": 1.6492515802383423, "learning_rate": 4.998991481001959e-06, "loss": 0.3628, "mean_token_accuracy": 0.8745116472244263, "num_tokens": 5364905.0, "step": 655 }, { "entropy": 0.44308696389198304, "epoch": 0.1053639846743295, "grad_norm": 1.9647578001022339, "learning_rate": 4.998976002683385e-06, "loss": 0.4069, "mean_token_accuracy": 0.8612207174301147, "num_tokens": 5405865.0, "step": 660 }, { "entropy": 0.4585953652858734, "epoch": 0.10616219667943806, "grad_norm": 1.82151460647583, "learning_rate": 4.9989604065227655e-06, "loss": 0.4445, "mean_token_accuracy": 0.8487472653388977, "num_tokens": 5446825.0, "step": 665 }, { "entropy": 0.40014599561691283, "epoch": 0.10696040868454662, "grad_norm": 1.619546890258789, "learning_rate": 4.998944692521078e-06, "loss": 0.3722, "mean_token_accuracy": 0.8710795283317566, "num_tokens": 5487785.0, "step": 670 }, { "entropy": 0.439222115278244, "epoch": 0.10775862068965517, "grad_norm": 1.5537372827529907, "learning_rate": 4.9989288606793126e-06, "loss": 0.395, "mean_token_accuracy": 0.8664328694343567, "num_tokens": 5528745.0, "step": 675 }, { "entropy": 0.3984904527664185, "epoch": 0.10855683269476372, "grad_norm": 1.7261492013931274, "learning_rate": 4.998912910998465e-06, "loss": 0.3585, "mean_token_accuracy": 0.875708258152008, "num_tokens": 5569705.0, "step": 680 }, { "entropy": 0.40191400051116943, "epoch": 0.10935504469987228, "grad_norm": 1.675265908241272, "learning_rate": 4.998896843479537e-06, "loss": 0.3725, "mean_token_accuracy": 0.8739235281944275, "num_tokens": 5610665.0, "step": 685 }, { "entropy": 0.3728193402290344, "epoch": 0.11015325670498084, "grad_norm": 1.590753436088562, "learning_rate": 4.9988806581235385e-06, "loss": 0.3467, "mean_token_accuracy": 0.8822915911674499, "num_tokens": 5651625.0, "step": 690 }, { "entropy": 0.42136245369911196, "epoch": 0.1109514687100894, "grad_norm": 1.5533643960952759, "learning_rate": 4.9988643549314895e-06, "loss": 0.3914, "mean_token_accuracy": 0.8653243780136108, "num_tokens": 5692585.0, "step": 695 }, { "entropy": 0.43504267930984497, "epoch": 0.11174968071519796, "grad_norm": 1.715844750404358, "learning_rate": 4.998847933904414e-06, "loss": 0.4022, "mean_token_accuracy": 0.8631648063659668, "num_tokens": 5733545.0, "step": 700 }, { "entropy": 0.4285335838794708, "epoch": 0.11254789272030652, "grad_norm": 1.579676866531372, "learning_rate": 4.998831395043344e-06, "loss": 0.4, "mean_token_accuracy": 0.8621499538421631, "num_tokens": 5774505.0, "step": 705 }, { "entropy": 0.42913134694099425, "epoch": 0.11334610472541506, "grad_norm": 1.932811975479126, "learning_rate": 4.998814738349322e-06, "loss": 0.4089, "mean_token_accuracy": 0.8622578978538513, "num_tokens": 5815465.0, "step": 710 }, { "entropy": 0.44279545545578003, "epoch": 0.11414431673052362, "grad_norm": 1.821500539779663, "learning_rate": 4.99879796382339e-06, "loss": 0.4123, "mean_token_accuracy": 0.8612294793128967, "num_tokens": 5856425.0, "step": 715 }, { "entropy": 0.423015832901001, "epoch": 0.11494252873563218, "grad_norm": 1.6205596923828125, "learning_rate": 4.998781071466609e-06, "loss": 0.3909, "mean_token_accuracy": 0.8664390563964843, "num_tokens": 5897385.0, "step": 720 }, { "entropy": 0.43310112357139585, "epoch": 0.11574074074074074, "grad_norm": 1.5314807891845703, "learning_rate": 4.9987640612800395e-06, "loss": 0.3979, "mean_token_accuracy": 0.8635281443595886, "num_tokens": 5938345.0, "step": 725 }, { "entropy": 0.45245509743690493, "epoch": 0.1165389527458493, "grad_norm": 1.9062870740890503, "learning_rate": 4.998746933264749e-06, "loss": 0.4138, "mean_token_accuracy": 0.8594639301300049, "num_tokens": 5979305.0, "step": 730 }, { "entropy": 0.4959665656089783, "epoch": 0.11733716475095786, "grad_norm": 1.8808233737945557, "learning_rate": 4.998729687421816e-06, "loss": 0.4772, "mean_token_accuracy": 0.8400404214859009, "num_tokens": 6020265.0, "step": 735 }, { "entropy": 0.4221862554550171, "epoch": 0.11813537675606642, "grad_norm": 1.8408641815185547, "learning_rate": 4.998712323752325e-06, "loss": 0.3848, "mean_token_accuracy": 0.8666460990905762, "num_tokens": 6061225.0, "step": 740 }, { "entropy": 0.43175222873687746, "epoch": 0.11893358876117496, "grad_norm": 1.737856388092041, "learning_rate": 4.998694842257367e-06, "loss": 0.3946, "mean_token_accuracy": 0.865670645236969, "num_tokens": 6102185.0, "step": 745 }, { "entropy": 0.43961183428764344, "epoch": 0.11973180076628352, "grad_norm": 1.624342679977417, "learning_rate": 4.998677242938043e-06, "loss": 0.4117, "mean_token_accuracy": 0.859315299987793, "num_tokens": 6143145.0, "step": 750 }, { "entropy": 0.45612156987190244, "epoch": 0.12053001277139208, "grad_norm": 1.8945883512496948, "learning_rate": 4.998659525795459e-06, "loss": 0.422, "mean_token_accuracy": 0.854542326927185, "num_tokens": 6184105.0, "step": 755 }, { "entropy": 0.41530930399894717, "epoch": 0.12132822477650064, "grad_norm": 1.8583099842071533, "learning_rate": 4.998641690830728e-06, "loss": 0.3815, "mean_token_accuracy": 0.8694210529327393, "num_tokens": 6225065.0, "step": 760 }, { "entropy": 0.39272274971008303, "epoch": 0.1221264367816092, "grad_norm": 1.621265172958374, "learning_rate": 4.9986237380449734e-06, "loss": 0.347, "mean_token_accuracy": 0.880846381187439, "num_tokens": 6266025.0, "step": 765 }, { "entropy": 0.4349696278572083, "epoch": 0.12292464878671776, "grad_norm": 1.6760231256484985, "learning_rate": 4.998605667439322e-06, "loss": 0.4078, "mean_token_accuracy": 0.8591925144195557, "num_tokens": 6306985.0, "step": 770 }, { "entropy": 0.3744224488735199, "epoch": 0.1237228607918263, "grad_norm": 1.481189250946045, "learning_rate": 4.998587479014912e-06, "loss": 0.3481, "mean_token_accuracy": 0.8797758460044861, "num_tokens": 6347945.0, "step": 775 }, { "entropy": 0.39019187688827517, "epoch": 0.12452107279693486, "grad_norm": 1.720441460609436, "learning_rate": 4.998569172772886e-06, "loss": 0.3486, "mean_token_accuracy": 0.8817312359809876, "num_tokens": 6388905.0, "step": 780 }, { "entropy": 0.4243058919906616, "epoch": 0.12531928480204343, "grad_norm": 1.5835771560668945, "learning_rate": 4.9985507487143964e-06, "loss": 0.3864, "mean_token_accuracy": 0.8680808663368225, "num_tokens": 6429865.0, "step": 785 }, { "entropy": 0.43315653800964354, "epoch": 0.12611749680715198, "grad_norm": 1.8805222511291504, "learning_rate": 4.9985322068406e-06, "loss": 0.411, "mean_token_accuracy": 0.8571700334548951, "num_tokens": 6470825.0, "step": 790 }, { "entropy": 0.4055586576461792, "epoch": 0.12691570881226052, "grad_norm": 1.5204501152038574, "learning_rate": 4.998513547152665e-06, "loss": 0.3726, "mean_token_accuracy": 0.8707727670669556, "num_tokens": 6511785.0, "step": 795 }, { "entropy": 0.4430310487747192, "epoch": 0.1277139208173691, "grad_norm": 1.6038336753845215, "learning_rate": 4.998494769651762e-06, "loss": 0.4114, "mean_token_accuracy": 0.8609652161598206, "num_tokens": 6552745.0, "step": 800 }, { "epoch": 0.1277139208173691, "eval_entropy": 0.4325378756523132, "eval_loss": 0.4014604687690735, "eval_mean_token_accuracy": 0.8624647760391235, "eval_num_tokens": 6552745.0, "eval_runtime": 69.2056, "eval_samples_per_second": 14.45, "eval_steps_per_second": 1.806, "step": 800 }, { "entropy": 0.4043457269668579, "epoch": 0.12851213282247764, "grad_norm": 1.5992178916931152, "learning_rate": 4.998475874339074e-06, "loss": 0.3732, "mean_token_accuracy": 0.8718292593955994, "num_tokens": 6593705.0, "step": 805 }, { "entropy": 0.42595568895339964, "epoch": 0.12931034482758622, "grad_norm": 1.7485393285751343, "learning_rate": 4.998456861215789e-06, "loss": 0.3905, "mean_token_accuracy": 0.8653177976608276, "num_tokens": 6634665.0, "step": 810 }, { "entropy": 0.4524208724498749, "epoch": 0.13010855683269476, "grad_norm": 1.942795753479004, "learning_rate": 4.998437730283102e-06, "loss": 0.4301, "mean_token_accuracy": 0.8551061153411865, "num_tokens": 6675625.0, "step": 815 }, { "entropy": 0.4193409144878387, "epoch": 0.13090676883780333, "grad_norm": 1.827155590057373, "learning_rate": 4.998418481542215e-06, "loss": 0.3841, "mean_token_accuracy": 0.8670408844947814, "num_tokens": 6716585.0, "step": 820 }, { "entropy": 0.44782190322875975, "epoch": 0.13170498084291188, "grad_norm": 1.7226262092590332, "learning_rate": 4.998399114994341e-06, "loss": 0.412, "mean_token_accuracy": 0.8589698910713196, "num_tokens": 6757545.0, "step": 825 }, { "entropy": 0.4577144384384155, "epoch": 0.13250319284802042, "grad_norm": 1.8020867109298706, "learning_rate": 4.998379630640696e-06, "loss": 0.4248, "mean_token_accuracy": 0.8565351963043213, "num_tokens": 6798505.0, "step": 830 }, { "entropy": 0.38689329028129577, "epoch": 0.133301404853129, "grad_norm": 1.7878338098526, "learning_rate": 4.998360028482505e-06, "loss": 0.3514, "mean_token_accuracy": 0.878675889968872, "num_tokens": 6839465.0, "step": 835 }, { "entropy": 0.4039497375488281, "epoch": 0.13409961685823754, "grad_norm": 1.794539213180542, "learning_rate": 4.998340308521002e-06, "loss": 0.3773, "mean_token_accuracy": 0.8708958387374878, "num_tokens": 6880425.0, "step": 840 }, { "entropy": 0.40037272572517396, "epoch": 0.13489782886334611, "grad_norm": 1.7989890575408936, "learning_rate": 4.998320470757426e-06, "loss": 0.3706, "mean_token_accuracy": 0.8725654721260071, "num_tokens": 6921385.0, "step": 845 }, { "entropy": 0.4616117298603058, "epoch": 0.13569604086845466, "grad_norm": 1.8218706846237183, "learning_rate": 4.998300515193026e-06, "loss": 0.4366, "mean_token_accuracy": 0.8513168811798095, "num_tokens": 6962345.0, "step": 850 }, { "entropy": 0.440923935174942, "epoch": 0.13649425287356323, "grad_norm": 1.712561845779419, "learning_rate": 4.998280441829054e-06, "loss": 0.4009, "mean_token_accuracy": 0.8618036508560181, "num_tokens": 7003305.0, "step": 855 }, { "entropy": 0.4474347770214081, "epoch": 0.13729246487867178, "grad_norm": 1.7691107988357544, "learning_rate": 4.998260250666775e-06, "loss": 0.4169, "mean_token_accuracy": 0.8586588621139526, "num_tokens": 7044265.0, "step": 860 }, { "entropy": 0.39236196875572205, "epoch": 0.13809067688378032, "grad_norm": 1.5966753959655762, "learning_rate": 4.998239941707457e-06, "loss": 0.3582, "mean_token_accuracy": 0.875457501411438, "num_tokens": 7085225.0, "step": 865 }, { "entropy": 0.41359818577766416, "epoch": 0.1388888888888889, "grad_norm": 1.8600451946258545, "learning_rate": 4.998219514952378e-06, "loss": 0.3847, "mean_token_accuracy": 0.866973626613617, "num_tokens": 7126185.0, "step": 870 }, { "entropy": 0.42218191623687745, "epoch": 0.13968710089399744, "grad_norm": 1.6097564697265625, "learning_rate": 4.998198970402822e-06, "loss": 0.3955, "mean_token_accuracy": 0.866248095035553, "num_tokens": 7167145.0, "step": 875 }, { "entropy": 0.41699051260948183, "epoch": 0.140485312899106, "grad_norm": 1.737248182296753, "learning_rate": 4.998178308060082e-06, "loss": 0.3835, "mean_token_accuracy": 0.8678730607032776, "num_tokens": 7208105.0, "step": 880 }, { "entropy": 0.4190727710723877, "epoch": 0.14128352490421456, "grad_norm": 1.5152932405471802, "learning_rate": 4.998157527925456e-06, "loss": 0.3922, "mean_token_accuracy": 0.8653331279754639, "num_tokens": 7249065.0, "step": 885 }, { "entropy": 0.4779325366020203, "epoch": 0.1420817369093231, "grad_norm": 1.8298388719558716, "learning_rate": 4.998136630000251e-06, "loss": 0.437, "mean_token_accuracy": 0.84938884973526, "num_tokens": 7290025.0, "step": 890 }, { "entropy": 0.4784541606903076, "epoch": 0.14287994891443168, "grad_norm": 1.6877150535583496, "learning_rate": 4.998115614285782e-06, "loss": 0.4536, "mean_token_accuracy": 0.8492711663246155, "num_tokens": 7330985.0, "step": 895 }, { "entropy": 0.44440594911575315, "epoch": 0.14367816091954022, "grad_norm": 1.829339623451233, "learning_rate": 4.99809448078337e-06, "loss": 0.411, "mean_token_accuracy": 0.859615421295166, "num_tokens": 7371945.0, "step": 900 }, { "entropy": 0.41583908200263975, "epoch": 0.1444763729246488, "grad_norm": 1.5231865644454956, "learning_rate": 4.9980732294943435e-06, "loss": 0.3768, "mean_token_accuracy": 0.8711596846580505, "num_tokens": 7412905.0, "step": 905 }, { "entropy": 0.45024530291557313, "epoch": 0.14527458492975734, "grad_norm": 1.843286156654358, "learning_rate": 4.998051860420039e-06, "loss": 0.4215, "mean_token_accuracy": 0.8572163820266724, "num_tokens": 7453865.0, "step": 910 }, { "entropy": 0.39993720054626464, "epoch": 0.1460727969348659, "grad_norm": 1.4822101593017578, "learning_rate": 4.998030373561801e-06, "loss": 0.3712, "mean_token_accuracy": 0.8721619606018066, "num_tokens": 7494825.0, "step": 915 }, { "entropy": 0.4568325638771057, "epoch": 0.14687100893997446, "grad_norm": 1.7281900644302368, "learning_rate": 4.99800876892098e-06, "loss": 0.4241, "mean_token_accuracy": 0.8540958523750305, "num_tokens": 7535785.0, "step": 920 }, { "entropy": 0.3957813024520874, "epoch": 0.147669220945083, "grad_norm": 1.746334195137024, "learning_rate": 4.997987046498934e-06, "loss": 0.3688, "mean_token_accuracy": 0.8738395810127259, "num_tokens": 7576745.0, "step": 925 }, { "entropy": 0.4682184398174286, "epoch": 0.14846743295019157, "grad_norm": 1.788724660873413, "learning_rate": 4.99796520629703e-06, "loss": 0.4454, "mean_token_accuracy": 0.848229706287384, "num_tokens": 7617705.0, "step": 930 }, { "entropy": 0.43413129448890686, "epoch": 0.14926564495530012, "grad_norm": 1.674914002418518, "learning_rate": 4.9979432483166415e-06, "loss": 0.3954, "mean_token_accuracy": 0.8650355458259582, "num_tokens": 7658665.0, "step": 935 }, { "entropy": 0.40451282262802124, "epoch": 0.1500638569604087, "grad_norm": 1.8498766422271729, "learning_rate": 4.997921172559149e-06, "loss": 0.3732, "mean_token_accuracy": 0.8703109145164489, "num_tokens": 7699625.0, "step": 940 }, { "entropy": 0.395086282491684, "epoch": 0.15086206896551724, "grad_norm": 1.7072008848190308, "learning_rate": 4.99789897902594e-06, "loss": 0.3653, "mean_token_accuracy": 0.8722005605697631, "num_tokens": 7740585.0, "step": 945 }, { "entropy": 0.4430574059486389, "epoch": 0.1516602809706258, "grad_norm": 1.6196160316467285, "learning_rate": 4.997876667718411e-06, "loss": 0.4107, "mean_token_accuracy": 0.8589556455612183, "num_tokens": 7781545.0, "step": 950 }, { "entropy": 0.38962970972061156, "epoch": 0.15245849297573436, "grad_norm": 1.6539169549942017, "learning_rate": 4.997854238637964e-06, "loss": 0.3567, "mean_token_accuracy": 0.8773809552192688, "num_tokens": 7822505.0, "step": 955 }, { "entropy": 0.42701526880264284, "epoch": 0.1532567049808429, "grad_norm": 1.7930479049682617, "learning_rate": 4.9978316917860115e-06, "loss": 0.3854, "mean_token_accuracy": 0.8697462320327759, "num_tokens": 7863465.0, "step": 960 }, { "entropy": 0.46107348799705505, "epoch": 0.15405491698595147, "grad_norm": 1.8451169729232788, "learning_rate": 4.997809027163969e-06, "loss": 0.4281, "mean_token_accuracy": 0.8550493597984314, "num_tokens": 7904425.0, "step": 965 }, { "entropy": 0.4252850949764252, "epoch": 0.15485312899106002, "grad_norm": 1.5299930572509766, "learning_rate": 4.997786244773263e-06, "loss": 0.3975, "mean_token_accuracy": 0.8656162261962891, "num_tokens": 7945385.0, "step": 970 }, { "entropy": 0.40233043432235716, "epoch": 0.1556513409961686, "grad_norm": 1.7764883041381836, "learning_rate": 4.997763344615325e-06, "loss": 0.3671, "mean_token_accuracy": 0.8747176647186279, "num_tokens": 7986345.0, "step": 975 }, { "entropy": 0.41937166452407837, "epoch": 0.15644955300127714, "grad_norm": 1.5040113925933838, "learning_rate": 4.997740326691597e-06, "loss": 0.3888, "mean_token_accuracy": 0.8664495825767518, "num_tokens": 8027305.0, "step": 980 }, { "entropy": 0.4262326657772064, "epoch": 0.1572477650063857, "grad_norm": 1.381565809249878, "learning_rate": 4.997717191003525e-06, "loss": 0.386, "mean_token_accuracy": 0.8680181741714478, "num_tokens": 8068265.0, "step": 985 }, { "entropy": 0.4555032432079315, "epoch": 0.15804597701149425, "grad_norm": 1.8633453845977783, "learning_rate": 4.997693937552564e-06, "loss": 0.419, "mean_token_accuracy": 0.8548953294754028, "num_tokens": 8109225.0, "step": 990 }, { "entropy": 0.4013682246208191, "epoch": 0.1588441890166028, "grad_norm": 1.7503520250320435, "learning_rate": 4.997670566340176e-06, "loss": 0.3699, "mean_token_accuracy": 0.8736639976501465, "num_tokens": 8150185.0, "step": 995 }, { "entropy": 0.402683812379837, "epoch": 0.15964240102171137, "grad_norm": 1.6846781969070435, "learning_rate": 4.997647077367831e-06, "loss": 0.367, "mean_token_accuracy": 0.8719624161720276, "num_tokens": 8191145.0, "step": 1000 }, { "epoch": 0.15964240102171137, "eval_entropy": 0.42480656147003176, "eval_loss": 0.3921413719654083, "eval_mean_token_accuracy": 0.8651772165298461, "eval_num_tokens": 8191145.0, "eval_runtime": 69.347, "eval_samples_per_second": 14.42, "eval_steps_per_second": 1.803, "step": 1000 }, { "entropy": 0.36306692361831666, "epoch": 0.16044061302681992, "grad_norm": 1.4345459938049316, "learning_rate": 4.997623470637007e-06, "loss": 0.32, "mean_token_accuracy": 0.8871942043304444, "num_tokens": 8232105.0, "step": 1005 }, { "entropy": 0.41327076554298403, "epoch": 0.1612388250319285, "grad_norm": 1.675029993057251, "learning_rate": 4.997599746149186e-06, "loss": 0.3878, "mean_token_accuracy": 0.8690639019012452, "num_tokens": 8273065.0, "step": 1010 }, { "entropy": 0.4097064435482025, "epoch": 0.16203703703703703, "grad_norm": 1.8893979787826538, "learning_rate": 4.997575903905863e-06, "loss": 0.3793, "mean_token_accuracy": 0.8692134499549866, "num_tokens": 8314025.0, "step": 1015 }, { "entropy": 0.3804477572441101, "epoch": 0.16283524904214558, "grad_norm": 1.4815566539764404, "learning_rate": 4.997551943908536e-06, "loss": 0.3442, "mean_token_accuracy": 0.8803953766822815, "num_tokens": 8354985.0, "step": 1020 }, { "entropy": 0.40303739309310915, "epoch": 0.16363346104725415, "grad_norm": 1.9521483182907104, "learning_rate": 4.99752786615871e-06, "loss": 0.3694, "mean_token_accuracy": 0.8721148848533631, "num_tokens": 8395945.0, "step": 1025 }, { "entropy": 0.4187192976474762, "epoch": 0.1644316730523627, "grad_norm": 1.6583740711212158, "learning_rate": 4.9975036706579015e-06, "loss": 0.3873, "mean_token_accuracy": 0.8685552954673768, "num_tokens": 8436905.0, "step": 1030 }, { "entropy": 0.4502474308013916, "epoch": 0.16522988505747127, "grad_norm": 1.5872939825057983, "learning_rate": 4.997479357407631e-06, "loss": 0.419, "mean_token_accuracy": 0.8583998322486878, "num_tokens": 8477865.0, "step": 1035 }, { "entropy": 0.4257081806659698, "epoch": 0.16602809706257982, "grad_norm": 1.6918632984161377, "learning_rate": 4.997454926409427e-06, "loss": 0.3814, "mean_token_accuracy": 0.8680700540542603, "num_tokens": 8518825.0, "step": 1040 }, { "entropy": 0.42606639862060547, "epoch": 0.1668263090676884, "grad_norm": 1.7691048383712769, "learning_rate": 4.997430377664826e-06, "loss": 0.3852, "mean_token_accuracy": 0.8670408368110657, "num_tokens": 8559785.0, "step": 1045 }, { "entropy": 0.4012975811958313, "epoch": 0.16762452107279693, "grad_norm": 1.622340202331543, "learning_rate": 4.997405711175373e-06, "loss": 0.3712, "mean_token_accuracy": 0.8724196314811706, "num_tokens": 8600745.0, "step": 1050 }, { "entropy": 0.3835790574550629, "epoch": 0.16842273307790548, "grad_norm": 1.6021138429641724, "learning_rate": 4.9973809269426175e-06, "loss": 0.3466, "mean_token_accuracy": 0.8784301280975342, "num_tokens": 8641705.0, "step": 1055 }, { "entropy": 0.41893797516822817, "epoch": 0.16922094508301405, "grad_norm": 1.6803170442581177, "learning_rate": 4.997356024968118e-06, "loss": 0.3848, "mean_token_accuracy": 0.8650649070739747, "num_tokens": 8682665.0, "step": 1060 }, { "entropy": 0.4112949728965759, "epoch": 0.1700191570881226, "grad_norm": 1.7723191976547241, "learning_rate": 4.997331005253442e-06, "loss": 0.3737, "mean_token_accuracy": 0.8707603693008423, "num_tokens": 8723625.0, "step": 1065 }, { "entropy": 0.429644775390625, "epoch": 0.17081736909323117, "grad_norm": 1.5139392614364624, "learning_rate": 4.9973058678001605e-06, "loss": 0.4006, "mean_token_accuracy": 0.8641191124916077, "num_tokens": 8764585.0, "step": 1070 }, { "entropy": 0.4199754595756531, "epoch": 0.17161558109833971, "grad_norm": 1.7999293804168701, "learning_rate": 4.997280612609857e-06, "loss": 0.3842, "mean_token_accuracy": 0.8690139651298523, "num_tokens": 8805545.0, "step": 1075 }, { "entropy": 0.4308688461780548, "epoch": 0.1724137931034483, "grad_norm": 1.652504563331604, "learning_rate": 4.9972552396841175e-06, "loss": 0.3967, "mean_token_accuracy": 0.8629522681236267, "num_tokens": 8846505.0, "step": 1080 }, { "entropy": 0.4425890207290649, "epoch": 0.17321200510855683, "grad_norm": 1.6451890468597412, "learning_rate": 4.997229749024538e-06, "loss": 0.4071, "mean_token_accuracy": 0.8623061180114746, "num_tokens": 8887465.0, "step": 1085 }, { "entropy": 0.4118497908115387, "epoch": 0.17401021711366538, "grad_norm": 1.6473734378814697, "learning_rate": 4.997204140632722e-06, "loss": 0.373, "mean_token_accuracy": 0.8697147250175477, "num_tokens": 8928425.0, "step": 1090 }, { "entropy": 0.4137786865234375, "epoch": 0.17480842911877395, "grad_norm": 1.679807186126709, "learning_rate": 4.99717841451028e-06, "loss": 0.3767, "mean_token_accuracy": 0.8703425407409668, "num_tokens": 8969385.0, "step": 1095 }, { "entropy": 0.3895216822624207, "epoch": 0.1756066411238825, "grad_norm": 1.7490582466125488, "learning_rate": 4.997152570658829e-06, "loss": 0.3602, "mean_token_accuracy": 0.8749435305595398, "num_tokens": 9010345.0, "step": 1100 }, { "entropy": 0.4318494439125061, "epoch": 0.17640485312899107, "grad_norm": 1.7533351182937622, "learning_rate": 4.997126609079993e-06, "loss": 0.3956, "mean_token_accuracy": 0.8652554154396057, "num_tokens": 9051305.0, "step": 1105 }, { "entropy": 0.374624902009964, "epoch": 0.1772030651340996, "grad_norm": 1.5839003324508667, "learning_rate": 4.9971005297754075e-06, "loss": 0.3516, "mean_token_accuracy": 0.8784180879592896, "num_tokens": 9092265.0, "step": 1110 }, { "entropy": 0.4146465241909027, "epoch": 0.17800127713920819, "grad_norm": 1.6019837856292725, "learning_rate": 4.99707433274671e-06, "loss": 0.3776, "mean_token_accuracy": 0.8676789045333863, "num_tokens": 9133225.0, "step": 1115 }, { "entropy": 0.4506120502948761, "epoch": 0.17879948914431673, "grad_norm": 1.6022826433181763, "learning_rate": 4.99704801799555e-06, "loss": 0.4064, "mean_token_accuracy": 0.8616150975227356, "num_tokens": 9174185.0, "step": 1120 }, { "entropy": 0.4405996799468994, "epoch": 0.17959770114942528, "grad_norm": 1.7077938318252563, "learning_rate": 4.99702158552358e-06, "loss": 0.4052, "mean_token_accuracy": 0.8605331301689148, "num_tokens": 9215145.0, "step": 1125 }, { "entropy": 0.45600707530975343, "epoch": 0.18039591315453385, "grad_norm": 1.7418618202209473, "learning_rate": 4.9969950353324635e-06, "loss": 0.4307, "mean_token_accuracy": 0.8526764512062073, "num_tokens": 9256105.0, "step": 1130 }, { "entropy": 0.4088563621044159, "epoch": 0.1811941251596424, "grad_norm": 1.6304500102996826, "learning_rate": 4.9969683674238704e-06, "loss": 0.3787, "mean_token_accuracy": 0.8687464475631714, "num_tokens": 9297065.0, "step": 1135 }, { "entropy": 0.4286839008331299, "epoch": 0.18199233716475097, "grad_norm": 1.7950910329818726, "learning_rate": 4.996941581799476e-06, "loss": 0.3956, "mean_token_accuracy": 0.8645257353782654, "num_tokens": 9338025.0, "step": 1140 }, { "entropy": 0.4073693871498108, "epoch": 0.1827905491698595, "grad_norm": 1.8495526313781738, "learning_rate": 4.996914678460966e-06, "loss": 0.3705, "mean_token_accuracy": 0.8706475257873535, "num_tokens": 9378985.0, "step": 1145 }, { "entropy": 0.37133595943450926, "epoch": 0.18358876117496808, "grad_norm": 1.507561206817627, "learning_rate": 4.996887657410032e-06, "loss": 0.3291, "mean_token_accuracy": 0.8849801659584046, "num_tokens": 9419945.0, "step": 1150 }, { "entropy": 0.3901697754859924, "epoch": 0.18438697318007663, "grad_norm": 1.7057150602340698, "learning_rate": 4.996860518648373e-06, "loss": 0.3581, "mean_token_accuracy": 0.8752909541130066, "num_tokens": 9460905.0, "step": 1155 }, { "entropy": 0.37988726496696473, "epoch": 0.18518518518518517, "grad_norm": 1.510456919670105, "learning_rate": 4.9968332621776956e-06, "loss": 0.3475, "mean_token_accuracy": 0.8772407650947571, "num_tokens": 9501865.0, "step": 1160 }, { "entropy": 0.38587768077850343, "epoch": 0.18598339719029375, "grad_norm": 1.4991780519485474, "learning_rate": 4.996805887999713e-06, "loss": 0.3532, "mean_token_accuracy": 0.8768713116645813, "num_tokens": 9542825.0, "step": 1165 }, { "entropy": 0.38253093957901, "epoch": 0.1867816091954023, "grad_norm": 1.5953854322433472, "learning_rate": 4.996778396116149e-06, "loss": 0.3411, "mean_token_accuracy": 0.8802226543426513, "num_tokens": 9583785.0, "step": 1170 }, { "entropy": 0.4161043405532837, "epoch": 0.18757982120051087, "grad_norm": 1.6561322212219238, "learning_rate": 4.99675078652873e-06, "loss": 0.3837, "mean_token_accuracy": 0.8681931853294372, "num_tokens": 9624745.0, "step": 1175 }, { "entropy": 0.4351208686828613, "epoch": 0.1883780332056194, "grad_norm": 1.580498456954956, "learning_rate": 4.996723059239193e-06, "loss": 0.4098, "mean_token_accuracy": 0.8593961119651794, "num_tokens": 9665705.0, "step": 1180 }, { "entropy": 0.4033635318279266, "epoch": 0.18917624521072796, "grad_norm": 1.4505336284637451, "learning_rate": 4.9966952142492815e-06, "loss": 0.36, "mean_token_accuracy": 0.8754417181015015, "num_tokens": 9706665.0, "step": 1185 }, { "entropy": 0.43068079352378846, "epoch": 0.18997445721583653, "grad_norm": 1.7219266891479492, "learning_rate": 4.996667251560747e-06, "loss": 0.4022, "mean_token_accuracy": 0.8612993121147156, "num_tokens": 9747625.0, "step": 1190 }, { "entropy": 0.36189231276512146, "epoch": 0.19077266922094507, "grad_norm": 1.5952274799346924, "learning_rate": 4.9966391711753465e-06, "loss": 0.3241, "mean_token_accuracy": 0.8871069312095642, "num_tokens": 9788585.0, "step": 1195 }, { "entropy": 0.4119158685207367, "epoch": 0.19157088122605365, "grad_norm": 1.6258676052093506, "learning_rate": 4.996610973094848e-06, "loss": 0.3785, "mean_token_accuracy": 0.8701352715492249, "num_tokens": 9829545.0, "step": 1200 }, { "epoch": 0.19157088122605365, "eval_entropy": 0.41716468119621275, "eval_loss": 0.3879784643650055, "eval_mean_token_accuracy": 0.8664626970291137, "eval_num_tokens": 9829545.0, "eval_runtime": 69.3572, "eval_samples_per_second": 14.418, "eval_steps_per_second": 1.802, "step": 1200 }, { "entropy": 0.38318485021591187, "epoch": 0.1923690932311622, "grad_norm": 1.6998229026794434, "learning_rate": 4.996582657321022e-06, "loss": 0.3561, "mean_token_accuracy": 0.8767566442489624, "num_tokens": 9870505.0, "step": 1205 }, { "entropy": 0.3972532510757446, "epoch": 0.19316730523627076, "grad_norm": 1.586173176765442, "learning_rate": 4.996554223855652e-06, "loss": 0.3695, "mean_token_accuracy": 0.8730546355247497, "num_tokens": 9911465.0, "step": 1210 }, { "entropy": 0.4261841118335724, "epoch": 0.1939655172413793, "grad_norm": 2.009549140930176, "learning_rate": 4.996525672700523e-06, "loss": 0.3865, "mean_token_accuracy": 0.8656880736351014, "num_tokens": 9952425.0, "step": 1215 }, { "entropy": 0.39083985686302186, "epoch": 0.19476372924648785, "grad_norm": 1.5788795948028564, "learning_rate": 4.9964970038574326e-06, "loss": 0.3553, "mean_token_accuracy": 0.876624321937561, "num_tokens": 9993385.0, "step": 1220 }, { "entropy": 0.41900729537010195, "epoch": 0.19556194125159643, "grad_norm": 1.7903603315353394, "learning_rate": 4.996468217328183e-06, "loss": 0.3903, "mean_token_accuracy": 0.8655580163002015, "num_tokens": 10034345.0, "step": 1225 }, { "entropy": 0.4208202064037323, "epoch": 0.19636015325670497, "grad_norm": 1.9505068063735962, "learning_rate": 4.996439313114584e-06, "loss": 0.3806, "mean_token_accuracy": 0.8678246140480042, "num_tokens": 10075305.0, "step": 1230 }, { "entropy": 0.3945443332195282, "epoch": 0.19715836526181355, "grad_norm": 3.14209246635437, "learning_rate": 4.9964102912184535e-06, "loss": 0.3555, "mean_token_accuracy": 0.8775973558425904, "num_tokens": 10116265.0, "step": 1235 }, { "entropy": 0.42588833570480344, "epoch": 0.1979565772669221, "grad_norm": 1.7747955322265625, "learning_rate": 4.9963811516416165e-06, "loss": 0.3968, "mean_token_accuracy": 0.8639430999755859, "num_tokens": 10157225.0, "step": 1240 }, { "entropy": 0.41568330526351926, "epoch": 0.19875478927203066, "grad_norm": 1.7321171760559082, "learning_rate": 4.996351894385906e-06, "loss": 0.3877, "mean_token_accuracy": 0.865398371219635, "num_tokens": 10198185.0, "step": 1245 }, { "entropy": 0.3887935698032379, "epoch": 0.1995530012771392, "grad_norm": 1.696902871131897, "learning_rate": 4.99632251945316e-06, "loss": 0.3475, "mean_token_accuracy": 0.8796878099441529, "num_tokens": 10239145.0, "step": 1250 }, { "entropy": 0.45456741452217103, "epoch": 0.20035121328224775, "grad_norm": 1.7765012979507446, "learning_rate": 4.996293026845228e-06, "loss": 0.4148, "mean_token_accuracy": 0.8554024338722229, "num_tokens": 10280105.0, "step": 1255 }, { "entropy": 0.4271567165851593, "epoch": 0.20114942528735633, "grad_norm": 1.7857844829559326, "learning_rate": 4.996263416563963e-06, "loss": 0.3864, "mean_token_accuracy": 0.8687859892845153, "num_tokens": 10321065.0, "step": 1260 }, { "entropy": 0.41049537658691404, "epoch": 0.20194763729246487, "grad_norm": 1.8603590726852417, "learning_rate": 4.996233688611227e-06, "loss": 0.3869, "mean_token_accuracy": 0.8668972373008728, "num_tokens": 10362025.0, "step": 1265 }, { "entropy": 0.4028216958045959, "epoch": 0.20274584929757344, "grad_norm": 1.6554845571517944, "learning_rate": 4.996203842988891e-06, "loss": 0.3633, "mean_token_accuracy": 0.8734982132911682, "num_tokens": 10402985.0, "step": 1270 }, { "entropy": 0.3871129274368286, "epoch": 0.203544061302682, "grad_norm": 1.6808788776397705, "learning_rate": 4.99617387969883e-06, "loss": 0.3413, "mean_token_accuracy": 0.8815069556236267, "num_tokens": 10443945.0, "step": 1275 }, { "entropy": 0.40030001401901244, "epoch": 0.20434227330779056, "grad_norm": 1.705714225769043, "learning_rate": 4.9961437987429285e-06, "loss": 0.3804, "mean_token_accuracy": 0.8696768760681153, "num_tokens": 10484905.0, "step": 1280 }, { "entropy": 0.4262424409389496, "epoch": 0.2051404853128991, "grad_norm": 1.873246192932129, "learning_rate": 4.996113600123079e-06, "loss": 0.3867, "mean_token_accuracy": 0.8671039581298828, "num_tokens": 10525865.0, "step": 1285 }, { "entropy": 0.40372244119644163, "epoch": 0.20593869731800765, "grad_norm": 1.584495186805725, "learning_rate": 4.996083283841179e-06, "loss": 0.3644, "mean_token_accuracy": 0.8726119875907898, "num_tokens": 10566825.0, "step": 1290 }, { "entropy": 0.37636016607284545, "epoch": 0.20673690932311622, "grad_norm": 1.3686262369155884, "learning_rate": 4.996052849899136e-06, "loss": 0.3449, "mean_token_accuracy": 0.8790806293487549, "num_tokens": 10607785.0, "step": 1295 }, { "entropy": 0.39137051701545716, "epoch": 0.20753512132822477, "grad_norm": 1.361937403678894, "learning_rate": 4.996022298298866e-06, "loss": 0.3574, "mean_token_accuracy": 0.8768706321716309, "num_tokens": 10648745.0, "step": 1300 }, { "entropy": 0.4153124690055847, "epoch": 0.20833333333333334, "grad_norm": 1.7018482685089111, "learning_rate": 4.995991629042286e-06, "loss": 0.3724, "mean_token_accuracy": 0.8716052293777465, "num_tokens": 10689705.0, "step": 1305 }, { "entropy": 0.43957144021987915, "epoch": 0.2091315453384419, "grad_norm": 1.761306643486023, "learning_rate": 4.995960842131326e-06, "loss": 0.4097, "mean_token_accuracy": 0.8598607301712036, "num_tokens": 10730665.0, "step": 1310 }, { "entropy": 0.42865965962409974, "epoch": 0.20992975734355046, "grad_norm": 1.6357239484786987, "learning_rate": 4.995929937567922e-06, "loss": 0.4011, "mean_token_accuracy": 0.861563766002655, "num_tokens": 10771625.0, "step": 1315 }, { "entropy": 0.38937110304832456, "epoch": 0.210727969348659, "grad_norm": 1.6105756759643555, "learning_rate": 4.9958989153540186e-06, "loss": 0.3492, "mean_token_accuracy": 0.8771203875541687, "num_tokens": 10812585.0, "step": 1320 }, { "entropy": 0.46120203137397764, "epoch": 0.21152618135376755, "grad_norm": 1.6245516538619995, "learning_rate": 4.995867775491567e-06, "loss": 0.4376, "mean_token_accuracy": 0.8491848587989808, "num_tokens": 10853545.0, "step": 1325 }, { "entropy": 0.3691314458847046, "epoch": 0.21232439335887612, "grad_norm": 1.4518979787826538, "learning_rate": 4.995836517982522e-06, "loss": 0.3341, "mean_token_accuracy": 0.8822991251945496, "num_tokens": 10894505.0, "step": 1330 }, { "entropy": 0.40480037331581115, "epoch": 0.21312260536398467, "grad_norm": 1.7446019649505615, "learning_rate": 4.995805142828852e-06, "loss": 0.3665, "mean_token_accuracy": 0.8746320247650147, "num_tokens": 10935465.0, "step": 1335 }, { "entropy": 0.4062842130661011, "epoch": 0.21392081736909324, "grad_norm": 1.6260181665420532, "learning_rate": 4.99577365003253e-06, "loss": 0.3728, "mean_token_accuracy": 0.8707757234573364, "num_tokens": 10976425.0, "step": 1340 }, { "entropy": 0.46745346784591674, "epoch": 0.2147190293742018, "grad_norm": 1.8439220190048218, "learning_rate": 4.9957420395955345e-06, "loss": 0.4262, "mean_token_accuracy": 0.8548336029052734, "num_tokens": 11017385.0, "step": 1345 }, { "entropy": 0.44144474864006045, "epoch": 0.21551724137931033, "grad_norm": 1.6311644315719604, "learning_rate": 4.9957103115198556e-06, "loss": 0.4032, "mean_token_accuracy": 0.8604014754295349, "num_tokens": 11058345.0, "step": 1350 }, { "entropy": 0.40855112075805666, "epoch": 0.2163154533844189, "grad_norm": 1.7732211351394653, "learning_rate": 4.995678465807486e-06, "loss": 0.3733, "mean_token_accuracy": 0.8709009766578675, "num_tokens": 11099305.0, "step": 1355 }, { "entropy": 0.397980147600174, "epoch": 0.21711366538952745, "grad_norm": 1.7110074758529663, "learning_rate": 4.995646502460431e-06, "loss": 0.368, "mean_token_accuracy": 0.8722142934799194, "num_tokens": 11139855.0, "step": 1360 }, { "entropy": 0.41088297963142395, "epoch": 0.21791187739463602, "grad_norm": 1.6622663736343384, "learning_rate": 4.995614421480699e-06, "loss": 0.3718, "mean_token_accuracy": 0.8695937275886536, "num_tokens": 11180815.0, "step": 1365 }, { "entropy": 0.4248778760433197, "epoch": 0.21871008939974457, "grad_norm": 1.6985255479812622, "learning_rate": 4.995582222870306e-06, "loss": 0.3895, "mean_token_accuracy": 0.8654048562049865, "num_tokens": 11221775.0, "step": 1370 }, { "entropy": 0.39926210045814514, "epoch": 0.21950830140485314, "grad_norm": 1.6084798574447632, "learning_rate": 4.9955499066312795e-06, "loss": 0.3625, "mean_token_accuracy": 0.8749465584754944, "num_tokens": 11262735.0, "step": 1375 }, { "entropy": 0.40205529928207395, "epoch": 0.22030651340996169, "grad_norm": 1.5554734468460083, "learning_rate": 4.995517472765651e-06, "loss": 0.3669, "mean_token_accuracy": 0.8706400275230408, "num_tokens": 11303695.0, "step": 1380 }, { "entropy": 0.4029591023921967, "epoch": 0.22110472541507023, "grad_norm": 1.5115835666656494, "learning_rate": 4.995484921275457e-06, "loss": 0.3656, "mean_token_accuracy": 0.8746947407722473, "num_tokens": 11344655.0, "step": 1385 }, { "entropy": 0.39063696265220643, "epoch": 0.2219029374201788, "grad_norm": 1.6652828454971313, "learning_rate": 4.995452252162749e-06, "loss": 0.3639, "mean_token_accuracy": 0.8736486792564392, "num_tokens": 11385615.0, "step": 1390 }, { "entropy": 0.3742422580718994, "epoch": 0.22270114942528735, "grad_norm": 1.5964741706848145, "learning_rate": 4.99541946542958e-06, "loss": 0.3402, "mean_token_accuracy": 0.881929349899292, "num_tokens": 11426575.0, "step": 1395 }, { "entropy": 0.4363535761833191, "epoch": 0.22349936143039592, "grad_norm": 1.8830647468566895, "learning_rate": 4.9953865610780095e-06, "loss": 0.4058, "mean_token_accuracy": 0.8590501070022583, "num_tokens": 11467535.0, "step": 1400 }, { "epoch": 0.22349936143039592, "eval_entropy": 0.41811490869522094, "eval_loss": 0.3820214867591858, "eval_mean_token_accuracy": 0.8677202377319336, "eval_num_tokens": 11467535.0, "eval_runtime": 69.4634, "eval_samples_per_second": 14.396, "eval_steps_per_second": 1.8, "step": 1400 }, { "entropy": 0.4278635323047638, "epoch": 0.22429757343550447, "grad_norm": 1.6763865947723389, "learning_rate": 4.995353539110108e-06, "loss": 0.4, "mean_token_accuracy": 0.8624410390853882, "num_tokens": 11508495.0, "step": 1405 }, { "entropy": 0.4178195416927338, "epoch": 0.22509578544061304, "grad_norm": 1.5268527269363403, "learning_rate": 4.9953203995279525e-06, "loss": 0.3828, "mean_token_accuracy": 0.8675993204116821, "num_tokens": 11549455.0, "step": 1410 }, { "entropy": 0.3592303812503815, "epoch": 0.22589399744572158, "grad_norm": 1.5988072156906128, "learning_rate": 4.995287142333627e-06, "loss": 0.3223, "mean_token_accuracy": 0.8882538080215454, "num_tokens": 11590415.0, "step": 1415 }, { "entropy": 0.39708020687103274, "epoch": 0.22669220945083013, "grad_norm": 1.5527164936065674, "learning_rate": 4.995253767529222e-06, "loss": 0.3575, "mean_token_accuracy": 0.8767055392265319, "num_tokens": 11631375.0, "step": 1420 }, { "entropy": 0.43148024678230285, "epoch": 0.2274904214559387, "grad_norm": 1.753352403640747, "learning_rate": 4.995220275116836e-06, "loss": 0.3921, "mean_token_accuracy": 0.8635330319404602, "num_tokens": 11672335.0, "step": 1425 }, { "entropy": 0.3746448040008545, "epoch": 0.22828863346104725, "grad_norm": 1.6090322732925415, "learning_rate": 4.995186665098577e-06, "loss": 0.3336, "mean_token_accuracy": 0.8834616661071777, "num_tokens": 11713295.0, "step": 1430 }, { "entropy": 0.41809865832328796, "epoch": 0.22908684546615582, "grad_norm": 1.6184989213943481, "learning_rate": 4.9951529374765576e-06, "loss": 0.3926, "mean_token_accuracy": 0.8649685978889465, "num_tokens": 11754255.0, "step": 1435 }, { "entropy": 0.41490686535835264, "epoch": 0.22988505747126436, "grad_norm": 1.7772951126098633, "learning_rate": 4.995119092252898e-06, "loss": 0.3777, "mean_token_accuracy": 0.8693205952644348, "num_tokens": 11795215.0, "step": 1440 }, { "entropy": 0.37119303941726683, "epoch": 0.23068326947637294, "grad_norm": 1.6093522310256958, "learning_rate": 4.9950851294297265e-06, "loss": 0.3302, "mean_token_accuracy": 0.884914219379425, "num_tokens": 11836175.0, "step": 1445 }, { "entropy": 0.411442232131958, "epoch": 0.23148148148148148, "grad_norm": 1.5621263980865479, "learning_rate": 4.99505104900918e-06, "loss": 0.3729, "mean_token_accuracy": 0.8707284569740296, "num_tokens": 11877135.0, "step": 1450 }, { "entropy": 0.43793959021568296, "epoch": 0.23227969348659003, "grad_norm": 1.8224726915359497, "learning_rate": 4.9950168509934e-06, "loss": 0.4039, "mean_token_accuracy": 0.8595661878585815, "num_tokens": 11918095.0, "step": 1455 }, { "entropy": 0.4227015495300293, "epoch": 0.2330779054916986, "grad_norm": 1.6003056764602661, "learning_rate": 4.994982535384538e-06, "loss": 0.4021, "mean_token_accuracy": 0.8620925664901733, "num_tokens": 11959055.0, "step": 1460 }, { "entropy": 0.4124914050102234, "epoch": 0.23387611749680715, "grad_norm": 1.7063300609588623, "learning_rate": 4.994948102184751e-06, "loss": 0.3747, "mean_token_accuracy": 0.8710904479026794, "num_tokens": 12000015.0, "step": 1465 }, { "entropy": 0.40440365076065066, "epoch": 0.23467432950191572, "grad_norm": 1.4815607070922852, "learning_rate": 4.994913551396206e-06, "loss": 0.3609, "mean_token_accuracy": 0.8743473768234253, "num_tokens": 12040975.0, "step": 1470 }, { "entropy": 0.44553318023681643, "epoch": 0.23547254150702426, "grad_norm": 1.8663265705108643, "learning_rate": 4.994878883021073e-06, "loss": 0.419, "mean_token_accuracy": 0.8563253045082092, "num_tokens": 12081935.0, "step": 1475 }, { "entropy": 0.4078981041908264, "epoch": 0.23627075351213284, "grad_norm": 1.9143680334091187, "learning_rate": 4.994844097061536e-06, "loss": 0.3845, "mean_token_accuracy": 0.8678545594215393, "num_tokens": 12122895.0, "step": 1480 }, { "entropy": 0.41621212363243104, "epoch": 0.23706896551724138, "grad_norm": 1.7652965784072876, "learning_rate": 4.99480919351978e-06, "loss": 0.3791, "mean_token_accuracy": 0.8677355766296386, "num_tokens": 12163855.0, "step": 1485 }, { "entropy": 0.411162918806076, "epoch": 0.23786717752234993, "grad_norm": 1.578628659248352, "learning_rate": 4.994774172397998e-06, "loss": 0.3675, "mean_token_accuracy": 0.8704973578453064, "num_tokens": 12204815.0, "step": 1490 }, { "entropy": 0.4328682243824005, "epoch": 0.2386653895274585, "grad_norm": 1.5921419858932495, "learning_rate": 4.994739033698395e-06, "loss": 0.3927, "mean_token_accuracy": 0.867378830909729, "num_tokens": 12245775.0, "step": 1495 }, { "entropy": 0.4088943064212799, "epoch": 0.23946360153256704, "grad_norm": 1.5701849460601807, "learning_rate": 4.994703777423181e-06, "loss": 0.3738, "mean_token_accuracy": 0.8706322550773621, "num_tokens": 12286735.0, "step": 1500 }, { "entropy": 0.37183575630187987, "epoch": 0.24026181353767562, "grad_norm": 1.63997220993042, "learning_rate": 4.994668403574571e-06, "loss": 0.3394, "mean_token_accuracy": 0.880238938331604, "num_tokens": 12327695.0, "step": 1505 }, { "entropy": 0.3934726595878601, "epoch": 0.24106002554278416, "grad_norm": 1.68795645236969, "learning_rate": 4.9946329121547906e-06, "loss": 0.3693, "mean_token_accuracy": 0.8705344796180725, "num_tokens": 12368655.0, "step": 1510 }, { "entropy": 0.4208143472671509, "epoch": 0.2418582375478927, "grad_norm": 1.404971957206726, "learning_rate": 4.9945973031660715e-06, "loss": 0.3811, "mean_token_accuracy": 0.8673396229743957, "num_tokens": 12409615.0, "step": 1515 }, { "entropy": 0.3697489082813263, "epoch": 0.24265644955300128, "grad_norm": 1.5042173862457275, "learning_rate": 4.994561576610652e-06, "loss": 0.3305, "mean_token_accuracy": 0.8833913207054138, "num_tokens": 12450575.0, "step": 1520 }, { "entropy": 0.3957326591014862, "epoch": 0.24345466155810983, "grad_norm": 1.3875174522399902, "learning_rate": 4.99452573249078e-06, "loss": 0.3641, "mean_token_accuracy": 0.8749968886375428, "num_tokens": 12491535.0, "step": 1525 }, { "entropy": 0.436593234539032, "epoch": 0.2442528735632184, "grad_norm": 1.6070129871368408, "learning_rate": 4.994489770808709e-06, "loss": 0.4044, "mean_token_accuracy": 0.8614607095718384, "num_tokens": 12532495.0, "step": 1530 }, { "entropy": 0.40942646861076354, "epoch": 0.24505108556832694, "grad_norm": 1.7601579427719116, "learning_rate": 4.994453691566701e-06, "loss": 0.364, "mean_token_accuracy": 0.8729120850563049, "num_tokens": 12573455.0, "step": 1535 }, { "entropy": 0.4324551522731781, "epoch": 0.24584929757343552, "grad_norm": 1.7088350057601929, "learning_rate": 4.994417494767024e-06, "loss": 0.3957, "mean_token_accuracy": 0.8620304942131043, "num_tokens": 12614415.0, "step": 1540 }, { "entropy": 0.4621385753154755, "epoch": 0.24664750957854406, "grad_norm": 1.619863510131836, "learning_rate": 4.9943811804119535e-06, "loss": 0.4336, "mean_token_accuracy": 0.8502998352050781, "num_tokens": 12655375.0, "step": 1545 }, { "entropy": 0.38465359807014465, "epoch": 0.2474457215836526, "grad_norm": 1.7016675472259521, "learning_rate": 4.9943447485037744e-06, "loss": 0.3461, "mean_token_accuracy": 0.8797366380691528, "num_tokens": 12696335.0, "step": 1550 }, { "entropy": 0.41603352427482604, "epoch": 0.24824393358876118, "grad_norm": 1.6022270917892456, "learning_rate": 4.994308199044777e-06, "loss": 0.3851, "mean_token_accuracy": 0.866445803642273, "num_tokens": 12737295.0, "step": 1555 }, { "entropy": 0.4290209412574768, "epoch": 0.24904214559386972, "grad_norm": 1.666760802268982, "learning_rate": 4.99427153203726e-06, "loss": 0.3842, "mean_token_accuracy": 0.8672929883003235, "num_tokens": 12778255.0, "step": 1560 }, { "entropy": 0.40751102566719055, "epoch": 0.2498403575989783, "grad_norm": 1.7844884395599365, "learning_rate": 4.99423474748353e-06, "loss": 0.3708, "mean_token_accuracy": 0.8718531012535096, "num_tokens": 12819215.0, "step": 1565 }, { "entropy": 0.41257060766220094, "epoch": 0.25063856960408687, "grad_norm": 1.6598808765411377, "learning_rate": 4.994197845385897e-06, "loss": 0.369, "mean_token_accuracy": 0.8714763522148132, "num_tokens": 12860175.0, "step": 1570 }, { "entropy": 0.4657106280326843, "epoch": 0.2514367816091954, "grad_norm": 1.7844305038452148, "learning_rate": 4.994160825746686e-06, "loss": 0.4289, "mean_token_accuracy": 0.8530724525451661, "num_tokens": 12901135.0, "step": 1575 }, { "entropy": 0.42342046499252317, "epoch": 0.25223499361430396, "grad_norm": 1.8664227724075317, "learning_rate": 4.994123688568222e-06, "loss": 0.3881, "mean_token_accuracy": 0.8675359487533569, "num_tokens": 12942095.0, "step": 1580 }, { "entropy": 0.40034735202789307, "epoch": 0.2530332056194125, "grad_norm": 1.5805197954177856, "learning_rate": 4.994086433852841e-06, "loss": 0.3658, "mean_token_accuracy": 0.8732864022254944, "num_tokens": 12983055.0, "step": 1585 }, { "entropy": 0.44049054980278013, "epoch": 0.25383141762452105, "grad_norm": 1.6537421941757202, "learning_rate": 4.994049061602883e-06, "loss": 0.4035, "mean_token_accuracy": 0.8610108494758606, "num_tokens": 13024015.0, "step": 1590 }, { "entropy": 0.39807049036026, "epoch": 0.25462962962962965, "grad_norm": 1.674716591835022, "learning_rate": 4.9940115718207035e-06, "loss": 0.3596, "mean_token_accuracy": 0.875378680229187, "num_tokens": 13064975.0, "step": 1595 }, { "entropy": 0.4111276030540466, "epoch": 0.2554278416347382, "grad_norm": 1.793379545211792, "learning_rate": 4.993973964508657e-06, "loss": 0.3709, "mean_token_accuracy": 0.8707456111907959, "num_tokens": 13105935.0, "step": 1600 }, { "epoch": 0.2554278416347382, "eval_entropy": 0.4195484278202057, "eval_loss": 0.37786614894866943, "eval_mean_token_accuracy": 0.8689507064819336, "eval_num_tokens": 13105935.0, "eval_runtime": 69.3072, "eval_samples_per_second": 14.429, "eval_steps_per_second": 1.804, "step": 1600 }, { "entropy": 0.40499958395957947, "epoch": 0.25622605363984674, "grad_norm": 1.7946947813034058, "learning_rate": 4.993936239669108e-06, "loss": 0.3646, "mean_token_accuracy": 0.8722654223442078, "num_tokens": 13146895.0, "step": 1605 }, { "entropy": 0.43797704577445984, "epoch": 0.2570242656449553, "grad_norm": 1.5900914669036865, "learning_rate": 4.993898397304429e-06, "loss": 0.4048, "mean_token_accuracy": 0.8610888838768005, "num_tokens": 13187855.0, "step": 1610 }, { "entropy": 0.4190030813217163, "epoch": 0.25782247765006383, "grad_norm": 1.7633140087127686, "learning_rate": 4.993860437417e-06, "loss": 0.3906, "mean_token_accuracy": 0.863499665260315, "num_tokens": 13228729.0, "step": 1615 }, { "entropy": 0.4361671805381775, "epoch": 0.25862068965517243, "grad_norm": 2.0313098430633545, "learning_rate": 4.993822360009209e-06, "loss": 0.4021, "mean_token_accuracy": 0.8606663584709168, "num_tokens": 13269689.0, "step": 1620 }, { "entropy": 0.43784735798835756, "epoch": 0.259418901660281, "grad_norm": 1.6960283517837524, "learning_rate": 4.993784165083449e-06, "loss": 0.3946, "mean_token_accuracy": 0.863409411907196, "num_tokens": 13310649.0, "step": 1625 }, { "entropy": 0.3651863753795624, "epoch": 0.2602171136653895, "grad_norm": 1.549738883972168, "learning_rate": 4.993745852642122e-06, "loss": 0.3228, "mean_token_accuracy": 0.8872558116912842, "num_tokens": 13351609.0, "step": 1630 }, { "entropy": 0.43080597519874575, "epoch": 0.26101532567049807, "grad_norm": 1.7376998662948608, "learning_rate": 4.9937074226876385e-06, "loss": 0.3925, "mean_token_accuracy": 0.8636496901512146, "num_tokens": 13392569.0, "step": 1635 }, { "entropy": 0.36666577458381655, "epoch": 0.26181353767560667, "grad_norm": 1.6298332214355469, "learning_rate": 4.993668875222413e-06, "loss": 0.3328, "mean_token_accuracy": 0.884006929397583, "num_tokens": 13433529.0, "step": 1640 }, { "entropy": 0.3666319906711578, "epoch": 0.2626117496807152, "grad_norm": 1.4979429244995117, "learning_rate": 4.993630210248872e-06, "loss": 0.3422, "mean_token_accuracy": 0.881742000579834, "num_tokens": 13474489.0, "step": 1645 }, { "entropy": 0.3915225386619568, "epoch": 0.26340996168582376, "grad_norm": 1.614754319190979, "learning_rate": 4.993591427769444e-06, "loss": 0.353, "mean_token_accuracy": 0.8764796495437622, "num_tokens": 13515449.0, "step": 1650 }, { "entropy": 0.41066797971725466, "epoch": 0.2642081736909323, "grad_norm": 1.5959410667419434, "learning_rate": 4.99355252778657e-06, "loss": 0.3716, "mean_token_accuracy": 0.8697964787483216, "num_tokens": 13556409.0, "step": 1655 }, { "entropy": 0.4193182408809662, "epoch": 0.26500638569604085, "grad_norm": 1.795488715171814, "learning_rate": 4.9935135103026955e-06, "loss": 0.3945, "mean_token_accuracy": 0.8639777302742004, "num_tokens": 13597369.0, "step": 1660 }, { "entropy": 0.4457306921482086, "epoch": 0.26580459770114945, "grad_norm": 1.7554657459259033, "learning_rate": 4.993474375320274e-06, "loss": 0.4033, "mean_token_accuracy": 0.8610527634620666, "num_tokens": 13638329.0, "step": 1665 }, { "entropy": 0.4340463042259216, "epoch": 0.266602809706258, "grad_norm": 1.7473564147949219, "learning_rate": 4.993435122841766e-06, "loss": 0.3918, "mean_token_accuracy": 0.8637238264083862, "num_tokens": 13679289.0, "step": 1670 }, { "entropy": 0.41148359775543214, "epoch": 0.26740102171136654, "grad_norm": 1.6237411499023438, "learning_rate": 4.9933957528696404e-06, "loss": 0.3802, "mean_token_accuracy": 0.8702250599861145, "num_tokens": 13720249.0, "step": 1675 }, { "entropy": 0.37870003581047057, "epoch": 0.2681992337164751, "grad_norm": 1.5291156768798828, "learning_rate": 4.993356265406373e-06, "loss": 0.3386, "mean_token_accuracy": 0.882771122455597, "num_tokens": 13761209.0, "step": 1680 }, { "entropy": 0.4399440348148346, "epoch": 0.26899744572158363, "grad_norm": 1.4831913709640503, "learning_rate": 4.993316660454447e-06, "loss": 0.4061, "mean_token_accuracy": 0.860145092010498, "num_tokens": 13802169.0, "step": 1685 }, { "entropy": 0.4290666043758392, "epoch": 0.26979565772669223, "grad_norm": 1.7227705717086792, "learning_rate": 4.993276938016352e-06, "loss": 0.3974, "mean_token_accuracy": 0.8653980612754821, "num_tokens": 13843129.0, "step": 1690 }, { "entropy": 0.4140145003795624, "epoch": 0.2705938697318008, "grad_norm": 1.556611180305481, "learning_rate": 4.993237098094587e-06, "loss": 0.3813, "mean_token_accuracy": 0.8696804285049439, "num_tokens": 13884089.0, "step": 1695 }, { "entropy": 0.4349702954292297, "epoch": 0.2713920817369093, "grad_norm": 1.7088748216629028, "learning_rate": 4.993197140691657e-06, "loss": 0.4044, "mean_token_accuracy": 0.8617449760437011, "num_tokens": 13925049.0, "step": 1700 }, { "entropy": 0.37474197149276733, "epoch": 0.27219029374201786, "grad_norm": 1.496077060699463, "learning_rate": 4.993157065810074e-06, "loss": 0.3379, "mean_token_accuracy": 0.8825637340545655, "num_tokens": 13966009.0, "step": 1705 }, { "entropy": 0.38251233100891113, "epoch": 0.27298850574712646, "grad_norm": 1.817301630973816, "learning_rate": 4.993116873452358e-06, "loss": 0.3542, "mean_token_accuracy": 0.875286865234375, "num_tokens": 14006969.0, "step": 1710 }, { "entropy": 0.4126917839050293, "epoch": 0.273786717752235, "grad_norm": 1.7066811323165894, "learning_rate": 4.993076563621037e-06, "loss": 0.3738, "mean_token_accuracy": 0.869832706451416, "num_tokens": 14047929.0, "step": 1715 }, { "entropy": 0.4246919810771942, "epoch": 0.27458492975734355, "grad_norm": 1.6397778987884521, "learning_rate": 4.993036136318646e-06, "loss": 0.3936, "mean_token_accuracy": 0.865372383594513, "num_tokens": 14088889.0, "step": 1720 }, { "entropy": 0.4292417407035828, "epoch": 0.2753831417624521, "grad_norm": 1.785861611366272, "learning_rate": 4.992995591547727e-06, "loss": 0.3868, "mean_token_accuracy": 0.8645003080368042, "num_tokens": 14129849.0, "step": 1725 }, { "entropy": 0.42507190704345704, "epoch": 0.27618135376756064, "grad_norm": 1.6692979335784912, "learning_rate": 4.99295492931083e-06, "loss": 0.385, "mean_token_accuracy": 0.8670867443084717, "num_tokens": 14170809.0, "step": 1730 }, { "entropy": 0.39853169322013854, "epoch": 0.27697956577266925, "grad_norm": 1.610140085220337, "learning_rate": 4.992914149610511e-06, "loss": 0.3624, "mean_token_accuracy": 0.8755905747413635, "num_tokens": 14211769.0, "step": 1735 }, { "entropy": 0.37006590366363523, "epoch": 0.2777777777777778, "grad_norm": 1.4653048515319824, "learning_rate": 4.992873252449335e-06, "loss": 0.3301, "mean_token_accuracy": 0.8839511036872864, "num_tokens": 14252729.0, "step": 1740 }, { "entropy": 0.40155852437019346, "epoch": 0.27857598978288634, "grad_norm": 1.59873366355896, "learning_rate": 4.9928322378298736e-06, "loss": 0.3706, "mean_token_accuracy": 0.8729602098464966, "num_tokens": 14293689.0, "step": 1745 }, { "entropy": 0.38730211853981017, "epoch": 0.2793742017879949, "grad_norm": 1.5279861688613892, "learning_rate": 4.9927911057547065e-06, "loss": 0.3568, "mean_token_accuracy": 0.8754580855369568, "num_tokens": 14334649.0, "step": 1750 }, { "entropy": 0.4186099946498871, "epoch": 0.2801724137931034, "grad_norm": 1.773861050605774, "learning_rate": 4.992749856226419e-06, "loss": 0.3774, "mean_token_accuracy": 0.8691788077354431, "num_tokens": 14375609.0, "step": 1755 }, { "entropy": 0.41721214056015016, "epoch": 0.280970625798212, "grad_norm": 1.6732553243637085, "learning_rate": 4.992708489247606e-06, "loss": 0.3869, "mean_token_accuracy": 0.8656150698661804, "num_tokens": 14416569.0, "step": 1760 }, { "entropy": 0.40146389603614807, "epoch": 0.28176883780332057, "grad_norm": 1.6273856163024902, "learning_rate": 4.992667004820868e-06, "loss": 0.3584, "mean_token_accuracy": 0.8757187008857727, "num_tokens": 14457529.0, "step": 1765 }, { "entropy": 0.3743233919143677, "epoch": 0.2825670498084291, "grad_norm": 1.336595058441162, "learning_rate": 4.992625402948815e-06, "loss": 0.3417, "mean_token_accuracy": 0.8810940265655518, "num_tokens": 14498489.0, "step": 1770 }, { "entropy": 0.4039636254310608, "epoch": 0.28336526181353766, "grad_norm": 1.54541015625, "learning_rate": 4.992583683634061e-06, "loss": 0.3668, "mean_token_accuracy": 0.8718450546264649, "num_tokens": 14539449.0, "step": 1775 }, { "entropy": 0.43368394374847413, "epoch": 0.2841634738186462, "grad_norm": 1.88645601272583, "learning_rate": 4.992541846879232e-06, "loss": 0.4118, "mean_token_accuracy": 0.8583693742752075, "num_tokens": 14580409.0, "step": 1780 }, { "entropy": 0.36426802277565, "epoch": 0.2849616858237548, "grad_norm": 1.3493199348449707, "learning_rate": 4.992499892686957e-06, "loss": 0.3245, "mean_token_accuracy": 0.886569344997406, "num_tokens": 14621369.0, "step": 1785 }, { "entropy": 0.39198213815689087, "epoch": 0.28575989782886335, "grad_norm": 1.6369388103485107, "learning_rate": 4.992457821059875e-06, "loss": 0.3582, "mean_token_accuracy": 0.8761023283004761, "num_tokens": 14662329.0, "step": 1790 }, { "entropy": 0.38651488423347474, "epoch": 0.2865581098339719, "grad_norm": 1.5533897876739502, "learning_rate": 4.992415632000631e-06, "loss": 0.3443, "mean_token_accuracy": 0.8807941317558289, "num_tokens": 14703289.0, "step": 1795 }, { "entropy": 0.3910827159881592, "epoch": 0.28735632183908044, "grad_norm": 1.7036851644515991, "learning_rate": 4.992373325511878e-06, "loss": 0.3522, "mean_token_accuracy": 0.8766248464584351, "num_tokens": 14744249.0, "step": 1800 }, { "epoch": 0.28735632183908044, "eval_entropy": 0.4121441757678986, "eval_loss": 0.3748484253883362, "eval_mean_token_accuracy": 0.8696171550750732, "eval_num_tokens": 14744249.0, "eval_runtime": 69.3324, "eval_samples_per_second": 14.423, "eval_steps_per_second": 1.803, "step": 1800 }, { "entropy": 0.4093415796756744, "epoch": 0.28815453384418904, "grad_norm": 1.698420524597168, "learning_rate": 4.992330901596277e-06, "loss": 0.3773, "mean_token_accuracy": 0.8717585563659668, "num_tokens": 14785209.0, "step": 1805 }, { "entropy": 0.3527294874191284, "epoch": 0.2889527458492976, "grad_norm": 1.5491911172866821, "learning_rate": 4.9922883602564966e-06, "loss": 0.314, "mean_token_accuracy": 0.8896806955337524, "num_tokens": 14826169.0, "step": 1810 }, { "entropy": 0.42450478076934817, "epoch": 0.28975095785440613, "grad_norm": 1.58077073097229, "learning_rate": 4.99224570149521e-06, "loss": 0.3859, "mean_token_accuracy": 0.8667027711868286, "num_tokens": 14867129.0, "step": 1815 }, { "entropy": 0.43766342401504515, "epoch": 0.2905491698595147, "grad_norm": 1.547052025794983, "learning_rate": 4.9922029253151e-06, "loss": 0.3978, "mean_token_accuracy": 0.8626722812652587, "num_tokens": 14908089.0, "step": 1820 }, { "entropy": 0.4361523449420929, "epoch": 0.2913473818646232, "grad_norm": 1.8358014822006226, "learning_rate": 4.992160031718859e-06, "loss": 0.4019, "mean_token_accuracy": 0.8596999645233154, "num_tokens": 14949049.0, "step": 1825 }, { "entropy": 0.407147616147995, "epoch": 0.2921455938697318, "grad_norm": 1.7107361555099487, "learning_rate": 4.99211702070918e-06, "loss": 0.37, "mean_token_accuracy": 0.8710406422615051, "num_tokens": 14990009.0, "step": 1830 }, { "entropy": 0.40477213859558103, "epoch": 0.29294380587484037, "grad_norm": 1.570025086402893, "learning_rate": 4.992073892288772e-06, "loss": 0.3801, "mean_token_accuracy": 0.8652504324913025, "num_tokens": 15030969.0, "step": 1835 }, { "entropy": 0.421870356798172, "epoch": 0.2937420178799489, "grad_norm": 1.607050895690918, "learning_rate": 4.992030646460344e-06, "loss": 0.3886, "mean_token_accuracy": 0.8673770546913147, "num_tokens": 15071929.0, "step": 1840 }, { "entropy": 0.41957166194915774, "epoch": 0.29454022988505746, "grad_norm": 1.6813313961029053, "learning_rate": 4.991987283226617e-06, "loss": 0.3845, "mean_token_accuracy": 0.866571569442749, "num_tokens": 15112889.0, "step": 1845 }, { "entropy": 0.3803585350513458, "epoch": 0.295338441890166, "grad_norm": 1.6727192401885986, "learning_rate": 4.9919438025903175e-06, "loss": 0.3505, "mean_token_accuracy": 0.8757133960723877, "num_tokens": 15153849.0, "step": 1850 }, { "entropy": 0.41606191992759706, "epoch": 0.2961366538952746, "grad_norm": 1.6527150869369507, "learning_rate": 4.99190020455418e-06, "loss": 0.3736, "mean_token_accuracy": 0.8708307147026062, "num_tokens": 15194809.0, "step": 1855 }, { "entropy": 0.42957100868225095, "epoch": 0.29693486590038315, "grad_norm": 1.4922149181365967, "learning_rate": 4.991856489120946e-06, "loss": 0.3931, "mean_token_accuracy": 0.8649673223495483, "num_tokens": 15235769.0, "step": 1860 }, { "entropy": 0.4057033896446228, "epoch": 0.2977330779054917, "grad_norm": 1.660406231880188, "learning_rate": 4.991812656293363e-06, "loss": 0.3598, "mean_token_accuracy": 0.8736201524734497, "num_tokens": 15276729.0, "step": 1865 }, { "entropy": 0.40441020131111144, "epoch": 0.29853128991060024, "grad_norm": 1.577506184577942, "learning_rate": 4.991768706074188e-06, "loss": 0.3663, "mean_token_accuracy": 0.8724161505699157, "num_tokens": 15317689.0, "step": 1870 }, { "entropy": 0.373467230796814, "epoch": 0.29932950191570884, "grad_norm": 1.6950846910476685, "learning_rate": 4.991724638466186e-06, "loss": 0.3496, "mean_token_accuracy": 0.8780014872550964, "num_tokens": 15358649.0, "step": 1875 }, { "entropy": 0.3957930028438568, "epoch": 0.3001277139208174, "grad_norm": 1.5578737258911133, "learning_rate": 4.991680453472129e-06, "loss": 0.36, "mean_token_accuracy": 0.8749694466590882, "num_tokens": 15399609.0, "step": 1880 }, { "entropy": 0.38373724818229676, "epoch": 0.30092592592592593, "grad_norm": 1.620660662651062, "learning_rate": 4.991636151094792e-06, "loss": 0.3444, "mean_token_accuracy": 0.878369402885437, "num_tokens": 15440569.0, "step": 1885 }, { "entropy": 0.38547171354293824, "epoch": 0.3017241379310345, "grad_norm": 1.5540834665298462, "learning_rate": 4.991591731336964e-06, "loss": 0.3431, "mean_token_accuracy": 0.8797260999679566, "num_tokens": 15481529.0, "step": 1890 }, { "entropy": 0.39545257687568663, "epoch": 0.302522349936143, "grad_norm": 1.7817574739456177, "learning_rate": 4.991547194201436e-06, "loss": 0.3628, "mean_token_accuracy": 0.8727735161781311, "num_tokens": 15522489.0, "step": 1895 }, { "entropy": 0.41355872750282285, "epoch": 0.3033205619412516, "grad_norm": 1.624796748161316, "learning_rate": 4.991502539691011e-06, "loss": 0.3742, "mean_token_accuracy": 0.8697443842887879, "num_tokens": 15563449.0, "step": 1900 }, { "entropy": 0.4103052496910095, "epoch": 0.30411877394636017, "grad_norm": 1.7812132835388184, "learning_rate": 4.991457767808494e-06, "loss": 0.3729, "mean_token_accuracy": 0.8709424614906311, "num_tokens": 15604409.0, "step": 1905 }, { "entropy": 0.41177705526351926, "epoch": 0.3049169859514687, "grad_norm": 1.5392616987228394, "learning_rate": 4.991412878556704e-06, "loss": 0.3754, "mean_token_accuracy": 0.8675289511680603, "num_tokens": 15645369.0, "step": 1910 }, { "entropy": 0.4046268105506897, "epoch": 0.30571519795657726, "grad_norm": 1.865882396697998, "learning_rate": 4.99136787193846e-06, "loss": 0.3653, "mean_token_accuracy": 0.8721927642822266, "num_tokens": 15686329.0, "step": 1915 }, { "entropy": 0.46906509399414065, "epoch": 0.3065134099616858, "grad_norm": 1.639910340309143, "learning_rate": 4.991322747956596e-06, "loss": 0.4253, "mean_token_accuracy": 0.8528730273246765, "num_tokens": 15727289.0, "step": 1920 }, { "entropy": 0.38653295040130614, "epoch": 0.3073116219667944, "grad_norm": 1.516993761062622, "learning_rate": 4.991277506613948e-06, "loss": 0.3534, "mean_token_accuracy": 0.8772754549980164, "num_tokens": 15768249.0, "step": 1925 }, { "entropy": 0.42357757687568665, "epoch": 0.30810983397190295, "grad_norm": 1.6272945404052734, "learning_rate": 4.991232147913359e-06, "loss": 0.3923, "mean_token_accuracy": 0.8665747880935669, "num_tokens": 15809209.0, "step": 1930 }, { "entropy": 0.42249606251716615, "epoch": 0.3089080459770115, "grad_norm": 1.7107625007629395, "learning_rate": 4.991186671857683e-06, "loss": 0.3843, "mean_token_accuracy": 0.866193950176239, "num_tokens": 15850169.0, "step": 1935 }, { "entropy": 0.40224887132644654, "epoch": 0.30970625798212004, "grad_norm": 1.562688946723938, "learning_rate": 4.991141078449779e-06, "loss": 0.36, "mean_token_accuracy": 0.8737279891967773, "num_tokens": 15891129.0, "step": 1940 }, { "entropy": 0.3883086383342743, "epoch": 0.3105044699872286, "grad_norm": 1.5433399677276611, "learning_rate": 4.9910953676925165e-06, "loss": 0.3511, "mean_token_accuracy": 0.8784520983695984, "num_tokens": 15931668.0, "step": 1945 }, { "entropy": 0.41734763979911804, "epoch": 0.3113026819923372, "grad_norm": 1.7147794961929321, "learning_rate": 4.991049539588768e-06, "loss": 0.3755, "mean_token_accuracy": 0.8696650862693787, "num_tokens": 15972628.0, "step": 1950 }, { "entropy": 0.40874959230422975, "epoch": 0.3121008939974457, "grad_norm": 1.5410938262939453, "learning_rate": 4.991003594141414e-06, "loss": 0.3749, "mean_token_accuracy": 0.8690267920494079, "num_tokens": 16013588.0, "step": 1955 }, { "entropy": 0.42068531513214114, "epoch": 0.3128991060025543, "grad_norm": 1.5189158916473389, "learning_rate": 4.990957531353346e-06, "loss": 0.3749, "mean_token_accuracy": 0.8713066935539245, "num_tokens": 16054548.0, "step": 1960 }, { "entropy": 0.4159302175045013, "epoch": 0.3136973180076628, "grad_norm": 1.880847454071045, "learning_rate": 4.99091135122746e-06, "loss": 0.3719, "mean_token_accuracy": 0.8719622373580933, "num_tokens": 16095508.0, "step": 1965 }, { "entropy": 0.4221408247947693, "epoch": 0.3144955300127714, "grad_norm": 1.5728129148483276, "learning_rate": 4.990865053766659e-06, "loss": 0.3976, "mean_token_accuracy": 0.8609007358551025, "num_tokens": 16136468.0, "step": 1970 }, { "entropy": 0.438957542181015, "epoch": 0.31529374201787996, "grad_norm": 1.8245769739151, "learning_rate": 4.9908186389738564e-06, "loss": 0.4156, "mean_token_accuracy": 0.8557961821556092, "num_tokens": 16177428.0, "step": 1975 }, { "entropy": 0.441065239906311, "epoch": 0.3160919540229885, "grad_norm": 1.7864265441894531, "learning_rate": 4.9907721068519686e-06, "loss": 0.4018, "mean_token_accuracy": 0.8611894130706788, "num_tokens": 16218388.0, "step": 1980 }, { "entropy": 0.39668573141098024, "epoch": 0.31689016602809705, "grad_norm": 1.5603904724121094, "learning_rate": 4.990725457403923e-06, "loss": 0.3518, "mean_token_accuracy": 0.8762275218963623, "num_tokens": 16259348.0, "step": 1985 }, { "entropy": 0.39811150431632997, "epoch": 0.3176883780332056, "grad_norm": 1.4460902214050293, "learning_rate": 4.990678690632652e-06, "loss": 0.3643, "mean_token_accuracy": 0.8742186784744262, "num_tokens": 16300308.0, "step": 1990 }, { "entropy": 0.4311108887195587, "epoch": 0.3184865900383142, "grad_norm": 1.4956344366073608, "learning_rate": 4.990631806541098e-06, "loss": 0.4036, "mean_token_accuracy": 0.8633324503898621, "num_tokens": 16341268.0, "step": 1995 }, { "entropy": 0.4104701280593872, "epoch": 0.31928480204342274, "grad_norm": 1.7465938329696655, "learning_rate": 4.990584805132208e-06, "loss": 0.3699, "mean_token_accuracy": 0.8737729787826538, "num_tokens": 16382228.0, "step": 2000 }, { "epoch": 0.31928480204342274, "eval_entropy": 0.4038234350681305, "eval_loss": 0.3711216151714325, "eval_mean_token_accuracy": 0.8712776474952698, "eval_num_tokens": 16382228.0, "eval_runtime": 69.2679, "eval_samples_per_second": 14.437, "eval_steps_per_second": 1.805, "step": 2000 }, { "entropy": 0.4014134407043457, "epoch": 0.3200830140485313, "grad_norm": 1.7096283435821533, "learning_rate": 4.990537686408939e-06, "loss": 0.3704, "mean_token_accuracy": 0.8709941625595092, "num_tokens": 16423188.0, "step": 2005 }, { "entropy": 0.3651737213134766, "epoch": 0.32088122605363983, "grad_norm": 1.3931198120117188, "learning_rate": 4.990490450374251e-06, "loss": 0.3298, "mean_token_accuracy": 0.8843650698661805, "num_tokens": 16464148.0, "step": 2010 }, { "entropy": 0.3889296054840088, "epoch": 0.3216794380587484, "grad_norm": 1.6823824644088745, "learning_rate": 4.990443097031118e-06, "loss": 0.3461, "mean_token_accuracy": 0.8789047360420227, "num_tokens": 16505108.0, "step": 2015 }, { "entropy": 0.37964903116226195, "epoch": 0.322477650063857, "grad_norm": 1.5910693407058716, "learning_rate": 4.9903956263825155e-06, "loss": 0.3459, "mean_token_accuracy": 0.8798817157745361, "num_tokens": 16546068.0, "step": 2020 }, { "entropy": 0.4190361201763153, "epoch": 0.3232758620689655, "grad_norm": 1.602673888206482, "learning_rate": 4.99034803843143e-06, "loss": 0.3863, "mean_token_accuracy": 0.8659457564353943, "num_tokens": 16587028.0, "step": 2025 }, { "entropy": 0.41818163394927976, "epoch": 0.32407407407407407, "grad_norm": 1.5723743438720703, "learning_rate": 4.990300333180853e-06, "loss": 0.3812, "mean_token_accuracy": 0.8674409031867981, "num_tokens": 16627988.0, "step": 2030 }, { "entropy": 0.4227443754673004, "epoch": 0.3248722860791826, "grad_norm": 1.5853668451309204, "learning_rate": 4.990252510633785e-06, "loss": 0.3895, "mean_token_accuracy": 0.8644875526428223, "num_tokens": 16668948.0, "step": 2035 }, { "entropy": 0.40144317150115966, "epoch": 0.32567049808429116, "grad_norm": 1.6941038370132446, "learning_rate": 4.9902045707932315e-06, "loss": 0.363, "mean_token_accuracy": 0.8758120179176331, "num_tokens": 16709908.0, "step": 2040 }, { "entropy": 0.4214106798171997, "epoch": 0.32646871008939976, "grad_norm": 1.6031938791275024, "learning_rate": 4.99015651366221e-06, "loss": 0.3895, "mean_token_accuracy": 0.8669756650924683, "num_tokens": 16750868.0, "step": 2045 }, { "entropy": 0.36540584564208983, "epoch": 0.3272669220945083, "grad_norm": 1.5755289793014526, "learning_rate": 4.99010833924374e-06, "loss": 0.3242, "mean_token_accuracy": 0.8872041344642639, "num_tokens": 16791828.0, "step": 2050 }, { "entropy": 0.4037174999713898, "epoch": 0.32806513409961685, "grad_norm": 1.5162173509597778, "learning_rate": 4.990060047540852e-06, "loss": 0.3642, "mean_token_accuracy": 0.8711183547973633, "num_tokens": 16832788.0, "step": 2055 }, { "entropy": 0.39979116916656493, "epoch": 0.3288633461047254, "grad_norm": 1.638429880142212, "learning_rate": 4.9900116385565825e-06, "loss": 0.3606, "mean_token_accuracy": 0.8761036038398743, "num_tokens": 16873748.0, "step": 2060 }, { "entropy": 0.4227651596069336, "epoch": 0.329661558109834, "grad_norm": 1.749085545539856, "learning_rate": 4.989963112293977e-06, "loss": 0.382, "mean_token_accuracy": 0.8673272252082824, "num_tokens": 16914708.0, "step": 2065 }, { "entropy": 0.37639381885528567, "epoch": 0.33045977011494254, "grad_norm": 1.413428544998169, "learning_rate": 4.989914468756084e-06, "loss": 0.3362, "mean_token_accuracy": 0.8843110561370849, "num_tokens": 16955668.0, "step": 2070 }, { "entropy": 0.38432916402816775, "epoch": 0.3312579821200511, "grad_norm": 1.6481380462646484, "learning_rate": 4.989865707945965e-06, "loss": 0.354, "mean_token_accuracy": 0.8767096996307373, "num_tokens": 16996628.0, "step": 2075 }, { "entropy": 0.4439726769924164, "epoch": 0.33205619412515963, "grad_norm": 1.812071681022644, "learning_rate": 4.989816829866686e-06, "loss": 0.4003, "mean_token_accuracy": 0.8621229887008667, "num_tokens": 17037588.0, "step": 2080 }, { "entropy": 0.4180517435073853, "epoch": 0.3328544061302682, "grad_norm": 1.8122986555099487, "learning_rate": 4.989767834521318e-06, "loss": 0.3716, "mean_token_accuracy": 0.8722500562667846, "num_tokens": 17078548.0, "step": 2085 }, { "entropy": 0.37677949070930483, "epoch": 0.3336526181353768, "grad_norm": 1.6600393056869507, "learning_rate": 4.989718721912946e-06, "loss": 0.3461, "mean_token_accuracy": 0.8783737897872925, "num_tokens": 17119508.0, "step": 2090 }, { "entropy": 0.39887595772743223, "epoch": 0.3344508301404853, "grad_norm": 1.576180338859558, "learning_rate": 4.989669492044655e-06, "loss": 0.3704, "mean_token_accuracy": 0.8705180883407593, "num_tokens": 17160468.0, "step": 2095 }, { "entropy": 0.3965007960796356, "epoch": 0.33524904214559387, "grad_norm": 1.63950514793396, "learning_rate": 4.989620144919543e-06, "loss": 0.3674, "mean_token_accuracy": 0.8723701715469361, "num_tokens": 17201428.0, "step": 2100 }, { "entropy": 0.41184504628181456, "epoch": 0.3360472541507024, "grad_norm": 1.544554591178894, "learning_rate": 4.989570680540712e-06, "loss": 0.3653, "mean_token_accuracy": 0.8735958099365234, "num_tokens": 17242388.0, "step": 2105 }, { "entropy": 0.34826286435127257, "epoch": 0.33684546615581096, "grad_norm": 1.3255348205566406, "learning_rate": 4.989521098911272e-06, "loss": 0.3056, "mean_token_accuracy": 0.8921523332595825, "num_tokens": 17283348.0, "step": 2110 }, { "entropy": 0.39912793040275574, "epoch": 0.33764367816091956, "grad_norm": 1.566488265991211, "learning_rate": 4.989471400034343e-06, "loss": 0.3708, "mean_token_accuracy": 0.8726530551910401, "num_tokens": 17324308.0, "step": 2115 }, { "entropy": 0.3988477885723114, "epoch": 0.3384418901660281, "grad_norm": 1.8146026134490967, "learning_rate": 4.989421583913047e-06, "loss": 0.3563, "mean_token_accuracy": 0.8763777494430542, "num_tokens": 17365268.0, "step": 2120 }, { "entropy": 0.4144528448581696, "epoch": 0.33924010217113665, "grad_norm": 1.733222246170044, "learning_rate": 4.989371650550519e-06, "loss": 0.3767, "mean_token_accuracy": 0.8681719303131104, "num_tokens": 17406228.0, "step": 2125 }, { "entropy": 0.3982251286506653, "epoch": 0.3400383141762452, "grad_norm": 1.4397218227386475, "learning_rate": 4.9893215999499e-06, "loss": 0.3621, "mean_token_accuracy": 0.8726215600967407, "num_tokens": 17447188.0, "step": 2130 }, { "entropy": 0.3918505072593689, "epoch": 0.3408365261813538, "grad_norm": 1.5369162559509277, "learning_rate": 4.9892714321143346e-06, "loss": 0.3453, "mean_token_accuracy": 0.8802524089813233, "num_tokens": 17488148.0, "step": 2135 }, { "entropy": 0.4021181404590607, "epoch": 0.34163473818646234, "grad_norm": 1.5608594417572021, "learning_rate": 4.9892211470469775e-06, "loss": 0.3624, "mean_token_accuracy": 0.8733767867088318, "num_tokens": 17529108.0, "step": 2140 }, { "entropy": 0.3666627109050751, "epoch": 0.3424329501915709, "grad_norm": 1.418404459953308, "learning_rate": 4.989170744750993e-06, "loss": 0.3247, "mean_token_accuracy": 0.8857243657112122, "num_tokens": 17570068.0, "step": 2145 }, { "entropy": 0.3634820461273193, "epoch": 0.34323116219667943, "grad_norm": 1.5730950832366943, "learning_rate": 4.9891202252295495e-06, "loss": 0.3284, "mean_token_accuracy": 0.8852996826171875, "num_tokens": 17611028.0, "step": 2150 }, { "entropy": 0.40299826860427856, "epoch": 0.344029374201788, "grad_norm": 1.8429025411605835, "learning_rate": 4.989069588485824e-06, "loss": 0.3727, "mean_token_accuracy": 0.8722189664840698, "num_tokens": 17651988.0, "step": 2155 }, { "entropy": 0.39875529408454896, "epoch": 0.3448275862068966, "grad_norm": 1.7688708305358887, "learning_rate": 4.989018834523001e-06, "loss": 0.3665, "mean_token_accuracy": 0.8719764232635498, "num_tokens": 17692948.0, "step": 2160 }, { "entropy": 0.41312822699546814, "epoch": 0.3456257982120051, "grad_norm": 1.5525833368301392, "learning_rate": 4.9889679633442706e-06, "loss": 0.3629, "mean_token_accuracy": 0.8743491053581238, "num_tokens": 17733908.0, "step": 2165 }, { "entropy": 0.43781871199607847, "epoch": 0.34642401021711366, "grad_norm": 1.7160017490386963, "learning_rate": 4.988916974952833e-06, "loss": 0.4104, "mean_token_accuracy": 0.8564818978309632, "num_tokens": 17774868.0, "step": 2170 }, { "entropy": 0.38255208134651186, "epoch": 0.3472222222222222, "grad_norm": 1.643141508102417, "learning_rate": 4.988865869351895e-06, "loss": 0.3479, "mean_token_accuracy": 0.8778419256210327, "num_tokens": 17815828.0, "step": 2175 }, { "entropy": 0.42076377272605897, "epoch": 0.34802043422733075, "grad_norm": 1.8374202251434326, "learning_rate": 4.988814646544669e-06, "loss": 0.383, "mean_token_accuracy": 0.8678974390029908, "num_tokens": 17856788.0, "step": 2180 }, { "entropy": 0.4139479219913483, "epoch": 0.34881864623243936, "grad_norm": 1.75631582736969, "learning_rate": 4.988763306534376e-06, "loss": 0.3756, "mean_token_accuracy": 0.8708728432655335, "num_tokens": 17897748.0, "step": 2185 }, { "entropy": 0.3780826270580292, "epoch": 0.3496168582375479, "grad_norm": 1.620390772819519, "learning_rate": 4.988711849324247e-06, "loss": 0.3355, "mean_token_accuracy": 0.8815474390983582, "num_tokens": 17938708.0, "step": 2190 }, { "entropy": 0.3495974004268646, "epoch": 0.35041507024265645, "grad_norm": 1.5734292268753052, "learning_rate": 4.988660274917515e-06, "loss": 0.3059, "mean_token_accuracy": 0.8917647361755371, "num_tokens": 17979668.0, "step": 2195 }, { "entropy": 0.3996447205543518, "epoch": 0.351213282247765, "grad_norm": 1.7574753761291504, "learning_rate": 4.988608583317424e-06, "loss": 0.3731, "mean_token_accuracy": 0.8691877007484436, "num_tokens": 18020628.0, "step": 2200 }, { "epoch": 0.351213282247765, "eval_entropy": 0.40308074259758, "eval_loss": 0.3682883381843567, "eval_mean_token_accuracy": 0.8719504637718201, "eval_num_tokens": 18020628.0, "eval_runtime": 69.2047, "eval_samples_per_second": 14.45, "eval_steps_per_second": 1.806, "step": 2200 }, { "entropy": 0.4189161598682404, "epoch": 0.35201149425287354, "grad_norm": 1.8016271591186523, "learning_rate": 4.988556774527226e-06, "loss": 0.3878, "mean_token_accuracy": 0.864763867855072, "num_tokens": 18061368.0, "step": 2205 }, { "entropy": 0.4112794458866119, "epoch": 0.35280970625798214, "grad_norm": 1.630026936531067, "learning_rate": 4.988504848550175e-06, "loss": 0.3535, "mean_token_accuracy": 0.8749266624450683, "num_tokens": 18102328.0, "step": 2210 }, { "entropy": 0.3925338864326477, "epoch": 0.3536079182630907, "grad_norm": 1.639182448387146, "learning_rate": 4.988452805389541e-06, "loss": 0.3642, "mean_token_accuracy": 0.8734994173049927, "num_tokens": 18143288.0, "step": 2215 }, { "entropy": 0.383824622631073, "epoch": 0.3544061302681992, "grad_norm": 1.7463170289993286, "learning_rate": 4.9884006450485935e-06, "loss": 0.3365, "mean_token_accuracy": 0.8832499623298645, "num_tokens": 18184248.0, "step": 2220 }, { "entropy": 0.40624321103096006, "epoch": 0.35520434227330777, "grad_norm": 1.669136881828308, "learning_rate": 4.9883483675306144e-06, "loss": 0.3646, "mean_token_accuracy": 0.872031056880951, "num_tokens": 18225208.0, "step": 2225 }, { "entropy": 0.3683569490909576, "epoch": 0.35600255427841637, "grad_norm": 1.5674257278442383, "learning_rate": 4.98829597283889e-06, "loss": 0.3335, "mean_token_accuracy": 0.8851659893989563, "num_tokens": 18266168.0, "step": 2230 }, { "entropy": 0.39384169578552247, "epoch": 0.3568007662835249, "grad_norm": 1.5700942277908325, "learning_rate": 4.988243460976715e-06, "loss": 0.3556, "mean_token_accuracy": 0.8744176506996155, "num_tokens": 18307128.0, "step": 2235 }, { "entropy": 0.40435452461242677, "epoch": 0.35759897828863346, "grad_norm": 1.5019632577896118, "learning_rate": 4.988190831947391e-06, "loss": 0.3732, "mean_token_accuracy": 0.8701990246772766, "num_tokens": 18348088.0, "step": 2240 }, { "entropy": 0.39732733368873596, "epoch": 0.358397190293742, "grad_norm": 1.7081998586654663, "learning_rate": 4.988138085754229e-06, "loss": 0.3622, "mean_token_accuracy": 0.8721163272857666, "num_tokens": 18389048.0, "step": 2245 }, { "entropy": 0.4120040833950043, "epoch": 0.35919540229885055, "grad_norm": 1.8202104568481445, "learning_rate": 4.988085222400546e-06, "loss": 0.3851, "mean_token_accuracy": 0.8671935796737671, "num_tokens": 18430008.0, "step": 2250 }, { "entropy": 0.39716677069664, "epoch": 0.35999361430395915, "grad_norm": 1.6864914894104004, "learning_rate": 4.988032241889665e-06, "loss": 0.3544, "mean_token_accuracy": 0.8778967261314392, "num_tokens": 18470968.0, "step": 2255 }, { "entropy": 0.4007763683795929, "epoch": 0.3607918263090677, "grad_norm": 1.6153517961502075, "learning_rate": 4.987979144224917e-06, "loss": 0.3646, "mean_token_accuracy": 0.8720175743103027, "num_tokens": 18511928.0, "step": 2260 }, { "entropy": 0.38735673427581785, "epoch": 0.36159003831417624, "grad_norm": 1.4474482536315918, "learning_rate": 4.9879259294096426e-06, "loss": 0.3507, "mean_token_accuracy": 0.8750486254692078, "num_tokens": 18552888.0, "step": 2265 }, { "entropy": 0.4178376317024231, "epoch": 0.3623882503192848, "grad_norm": 1.721549391746521, "learning_rate": 4.987872597447188e-06, "loss": 0.3861, "mean_token_accuracy": 0.8684224009513855, "num_tokens": 18593848.0, "step": 2270 }, { "entropy": 0.39219988584518434, "epoch": 0.36318646232439333, "grad_norm": 1.5142766237258911, "learning_rate": 4.987819148340906e-06, "loss": 0.3427, "mean_token_accuracy": 0.8798442125320435, "num_tokens": 18634808.0, "step": 2275 }, { "entropy": 0.4054730415344238, "epoch": 0.36398467432950193, "grad_norm": 1.5823688507080078, "learning_rate": 4.987765582094158e-06, "loss": 0.3713, "mean_token_accuracy": 0.8720792055130004, "num_tokens": 18675768.0, "step": 2280 }, { "entropy": 0.46094911694526675, "epoch": 0.3647828863346105, "grad_norm": 1.7849924564361572, "learning_rate": 4.987711898710312e-06, "loss": 0.4243, "mean_token_accuracy": 0.8514412760734558, "num_tokens": 18716728.0, "step": 2285 }, { "entropy": 0.38086835741996766, "epoch": 0.365581098339719, "grad_norm": 1.7232701778411865, "learning_rate": 4.987658098192745e-06, "loss": 0.34, "mean_token_accuracy": 0.8816040277481079, "num_tokens": 18757688.0, "step": 2290 }, { "entropy": 0.3888665437698364, "epoch": 0.36637931034482757, "grad_norm": 1.341325044631958, "learning_rate": 4.987604180544839e-06, "loss": 0.3523, "mean_token_accuracy": 0.8755691409111023, "num_tokens": 18798648.0, "step": 2295 }, { "entropy": 0.431248152256012, "epoch": 0.36717752234993617, "grad_norm": 1.6617993116378784, "learning_rate": 4.987550145769986e-06, "loss": 0.393, "mean_token_accuracy": 0.8633793830871582, "num_tokens": 18839608.0, "step": 2300 }, { "entropy": 0.36826140284538267, "epoch": 0.3679757343550447, "grad_norm": 1.32558012008667, "learning_rate": 4.987495993871582e-06, "loss": 0.3238, "mean_token_accuracy": 0.8857893109321594, "num_tokens": 18880568.0, "step": 2305 }, { "entropy": 0.3880879878997803, "epoch": 0.36877394636015326, "grad_norm": 1.639264464378357, "learning_rate": 4.9874417248530325e-06, "loss": 0.3489, "mean_token_accuracy": 0.8788592934608459, "num_tokens": 18921528.0, "step": 2310 }, { "entropy": 0.3850403368473053, "epoch": 0.3695721583652618, "grad_norm": 1.4602205753326416, "learning_rate": 4.9873873387177515e-06, "loss": 0.3384, "mean_token_accuracy": 0.8812838077545166, "num_tokens": 18962146.0, "step": 2315 }, { "entropy": 0.39695783257484435, "epoch": 0.37037037037037035, "grad_norm": 1.3622066974639893, "learning_rate": 4.987332835469158e-06, "loss": 0.3568, "mean_token_accuracy": 0.8771039724349976, "num_tokens": 19003106.0, "step": 2320 }, { "entropy": 0.37232043147087096, "epoch": 0.37116858237547895, "grad_norm": 1.464799404144287, "learning_rate": 4.98727821511068e-06, "loss": 0.3215, "mean_token_accuracy": 0.8875339508056641, "num_tokens": 19044066.0, "step": 2325 }, { "entropy": 0.4204703152179718, "epoch": 0.3719667943805875, "grad_norm": 1.6495721340179443, "learning_rate": 4.9872234776457515e-06, "loss": 0.3876, "mean_token_accuracy": 0.8661399126052857, "num_tokens": 19085026.0, "step": 2330 }, { "entropy": 0.368586528301239, "epoch": 0.37276500638569604, "grad_norm": 1.5902228355407715, "learning_rate": 4.987168623077815e-06, "loss": 0.3275, "mean_token_accuracy": 0.8872491240501403, "num_tokens": 19125986.0, "step": 2335 }, { "entropy": 0.4211663603782654, "epoch": 0.3735632183908046, "grad_norm": 1.6928033828735352, "learning_rate": 4.9871136514103194e-06, "loss": 0.386, "mean_token_accuracy": 0.8665610313415527, "num_tokens": 19166946.0, "step": 2340 }, { "entropy": 0.44752530455589296, "epoch": 0.37436143039591313, "grad_norm": 1.8591210842132568, "learning_rate": 4.987058562646722e-06, "loss": 0.4097, "mean_token_accuracy": 0.8581960439682007, "num_tokens": 19207906.0, "step": 2345 }, { "entropy": 0.3855793416500092, "epoch": 0.37515964240102173, "grad_norm": 1.5569732189178467, "learning_rate": 4.987003356790487e-06, "loss": 0.3484, "mean_token_accuracy": 0.877441143989563, "num_tokens": 19248866.0, "step": 2350 }, { "entropy": 0.37906638383865354, "epoch": 0.3759578544061303, "grad_norm": 1.720017910003662, "learning_rate": 4.986948033845086e-06, "loss": 0.3317, "mean_token_accuracy": 0.8832611203193664, "num_tokens": 19289826.0, "step": 2355 }, { "entropy": 0.4253597676753998, "epoch": 0.3767560664112388, "grad_norm": 1.5680551528930664, "learning_rate": 4.986892593813998e-06, "loss": 0.3888, "mean_token_accuracy": 0.8657567858695984, "num_tokens": 19330786.0, "step": 2360 }, { "entropy": 0.4145656943321228, "epoch": 0.37755427841634737, "grad_norm": 1.678577184677124, "learning_rate": 4.986837036700708e-06, "loss": 0.3807, "mean_token_accuracy": 0.8670534491539001, "num_tokens": 19371746.0, "step": 2365 }, { "entropy": 0.42500597834587095, "epoch": 0.3783524904214559, "grad_norm": 1.7057173252105713, "learning_rate": 4.986781362508711e-06, "loss": 0.3859, "mean_token_accuracy": 0.8676841259002686, "num_tokens": 19412706.0, "step": 2370 }, { "entropy": 0.4140367269515991, "epoch": 0.3791507024265645, "grad_norm": 1.6917656660079956, "learning_rate": 4.986725571241508e-06, "loss": 0.3651, "mean_token_accuracy": 0.8730501174926758, "num_tokens": 19453666.0, "step": 2375 }, { "entropy": 0.4040639877319336, "epoch": 0.37994891443167306, "grad_norm": 1.5331501960754395, "learning_rate": 4.986669662902607e-06, "loss": 0.3637, "mean_token_accuracy": 0.8733095169067383, "num_tokens": 19494626.0, "step": 2380 }, { "entropy": 0.4209416925907135, "epoch": 0.3807471264367816, "grad_norm": 1.7044132947921753, "learning_rate": 4.986613637495524e-06, "loss": 0.3843, "mean_token_accuracy": 0.8655770182609558, "num_tokens": 19535586.0, "step": 2385 }, { "entropy": 0.4252281904220581, "epoch": 0.38154533844189015, "grad_norm": 1.5789233446121216, "learning_rate": 4.986557495023781e-06, "loss": 0.3942, "mean_token_accuracy": 0.8630497813224792, "num_tokens": 19576546.0, "step": 2390 }, { "entropy": 0.3741147518157959, "epoch": 0.38234355044699875, "grad_norm": 1.4773911237716675, "learning_rate": 4.986501235490909e-06, "loss": 0.3329, "mean_token_accuracy": 0.882919716835022, "num_tokens": 19617506.0, "step": 2395 }, { "entropy": 0.4245776295661926, "epoch": 0.3831417624521073, "grad_norm": 1.5666937828063965, "learning_rate": 4.986444858900447e-06, "loss": 0.3799, "mean_token_accuracy": 0.8653708696365356, "num_tokens": 19658466.0, "step": 2400 }, { "epoch": 0.3831417624521073, "eval_entropy": 0.40449087238311765, "eval_loss": 0.3654989004135132, "eval_mean_token_accuracy": 0.8728414220809937, "eval_num_tokens": 19658466.0, "eval_runtime": 69.2913, "eval_samples_per_second": 14.432, "eval_steps_per_second": 1.804, "step": 2400 }, { "entropy": 0.4110782384872437, "epoch": 0.38393997445721584, "grad_norm": 1.9226335287094116, "learning_rate": 4.986388365255937e-06, "loss": 0.38, "mean_token_accuracy": 0.8674784779548645, "num_tokens": 19699426.0, "step": 2405 }, { "entropy": 0.40915406942367555, "epoch": 0.3847381864623244, "grad_norm": 1.647294044494629, "learning_rate": 4.9863317545609355e-06, "loss": 0.3775, "mean_token_accuracy": 0.8719962477684021, "num_tokens": 19740386.0, "step": 2410 }, { "entropy": 0.40981475114822385, "epoch": 0.38553639846743293, "grad_norm": 1.625821590423584, "learning_rate": 4.986275026818999e-06, "loss": 0.3675, "mean_token_accuracy": 0.8711891293525695, "num_tokens": 19781346.0, "step": 2415 }, { "entropy": 0.4291534602642059, "epoch": 0.38633461047254153, "grad_norm": 1.5893938541412354, "learning_rate": 4.986218182033697e-06, "loss": 0.3918, "mean_token_accuracy": 0.8647595286369324, "num_tokens": 19822306.0, "step": 2420 }, { "entropy": 0.4076783239841461, "epoch": 0.3871328224776501, "grad_norm": 1.6474665403366089, "learning_rate": 4.986161220208604e-06, "loss": 0.3806, "mean_token_accuracy": 0.8684025168418884, "num_tokens": 19863266.0, "step": 2425 }, { "entropy": 0.3912591695785522, "epoch": 0.3879310344827586, "grad_norm": 3.1487748622894287, "learning_rate": 4.986104141347301e-06, "loss": 0.3445, "mean_token_accuracy": 0.8792547583580017, "num_tokens": 19904226.0, "step": 2430 }, { "entropy": 0.4334920525550842, "epoch": 0.38872924648786716, "grad_norm": 1.6844035387039185, "learning_rate": 4.9860469454533775e-06, "loss": 0.3975, "mean_token_accuracy": 0.863440215587616, "num_tokens": 19945186.0, "step": 2435 }, { "entropy": 0.3987038731575012, "epoch": 0.3895274584929757, "grad_norm": 1.6433604955673218, "learning_rate": 4.98598963253043e-06, "loss": 0.3577, "mean_token_accuracy": 0.8737675786018372, "num_tokens": 19986146.0, "step": 2440 }, { "entropy": 0.40528036952018737, "epoch": 0.3903256704980843, "grad_norm": 1.5972541570663452, "learning_rate": 4.985932202582062e-06, "loss": 0.3688, "mean_token_accuracy": 0.873537254333496, "num_tokens": 20027106.0, "step": 2445 }, { "entropy": 0.44673908948898317, "epoch": 0.39112388250319285, "grad_norm": 1.6527669429779053, "learning_rate": 4.985874655611887e-06, "loss": 0.4147, "mean_token_accuracy": 0.8543389558792114, "num_tokens": 20068066.0, "step": 2450 }, { "entropy": 0.35718379616737367, "epoch": 0.3919220945083014, "grad_norm": 1.511210560798645, "learning_rate": 4.985816991623521e-06, "loss": 0.3148, "mean_token_accuracy": 0.890525484085083, "num_tokens": 20109026.0, "step": 2455 }, { "entropy": 0.39349877238273623, "epoch": 0.39272030651340994, "grad_norm": 1.6557594537734985, "learning_rate": 4.985759210620593e-06, "loss": 0.3624, "mean_token_accuracy": 0.8725938200950623, "num_tokens": 20149986.0, "step": 2460 }, { "entropy": 0.39194449186325075, "epoch": 0.39351851851851855, "grad_norm": 1.434440016746521, "learning_rate": 4.985701312606735e-06, "loss": 0.3509, "mean_token_accuracy": 0.8777651786804199, "num_tokens": 20190946.0, "step": 2465 }, { "entropy": 0.38448486328125, "epoch": 0.3943167305236271, "grad_norm": 1.5193266868591309, "learning_rate": 4.985643297585587e-06, "loss": 0.3418, "mean_token_accuracy": 0.8788821578025818, "num_tokens": 20231906.0, "step": 2470 }, { "entropy": 0.37055438160896303, "epoch": 0.39511494252873564, "grad_norm": 1.668213129043579, "learning_rate": 4.985585165560798e-06, "loss": 0.3314, "mean_token_accuracy": 0.8830554604530334, "num_tokens": 20272866.0, "step": 2475 }, { "entropy": 0.3725506365299225, "epoch": 0.3959131545338442, "grad_norm": 1.5674769878387451, "learning_rate": 4.985526916536024e-06, "loss": 0.3411, "mean_token_accuracy": 0.8806295394897461, "num_tokens": 20313826.0, "step": 2480 }, { "entropy": 0.41256141662597656, "epoch": 0.3967113665389527, "grad_norm": 1.7599780559539795, "learning_rate": 4.985468550514928e-06, "loss": 0.3762, "mean_token_accuracy": 0.8694740176200867, "num_tokens": 20354786.0, "step": 2485 }, { "entropy": 0.4078804194927216, "epoch": 0.3975095785440613, "grad_norm": 1.5582984685897827, "learning_rate": 4.985410067501178e-06, "loss": 0.3696, "mean_token_accuracy": 0.8715253114700318, "num_tokens": 20395746.0, "step": 2490 }, { "entropy": 0.3821629822254181, "epoch": 0.39830779054916987, "grad_norm": 1.3672593832015991, "learning_rate": 4.985351467498455e-06, "loss": 0.3406, "mean_token_accuracy": 0.8817133903503418, "num_tokens": 20436706.0, "step": 2495 }, { "entropy": 0.36203756332397463, "epoch": 0.3991060025542784, "grad_norm": 1.6370218992233276, "learning_rate": 4.985292750510442e-06, "loss": 0.318, "mean_token_accuracy": 0.8887966752052308, "num_tokens": 20477666.0, "step": 2500 }, { "entropy": 0.4348323345184326, "epoch": 0.39990421455938696, "grad_norm": 1.5410937070846558, "learning_rate": 4.9852339165408305e-06, "loss": 0.3948, "mean_token_accuracy": 0.8647068858146667, "num_tokens": 20518626.0, "step": 2505 }, { "entropy": 0.3871714770793915, "epoch": 0.4007024265644955, "grad_norm": 1.6580190658569336, "learning_rate": 4.985174965593323e-06, "loss": 0.3484, "mean_token_accuracy": 0.8785181164741516, "num_tokens": 20559586.0, "step": 2510 }, { "entropy": 0.4144402027130127, "epoch": 0.4015006385696041, "grad_norm": 1.7754415273666382, "learning_rate": 4.985115897671624e-06, "loss": 0.3727, "mean_token_accuracy": 0.8701366901397705, "num_tokens": 20600546.0, "step": 2515 }, { "entropy": 0.40664401054382326, "epoch": 0.40229885057471265, "grad_norm": 1.7231062650680542, "learning_rate": 4.985056712779449e-06, "loss": 0.3678, "mean_token_accuracy": 0.8717346429824829, "num_tokens": 20641506.0, "step": 2520 }, { "entropy": 0.39899550676345824, "epoch": 0.4030970625798212, "grad_norm": 1.7191085815429688, "learning_rate": 4.984997410920519e-06, "loss": 0.3661, "mean_token_accuracy": 0.873437762260437, "num_tokens": 20682466.0, "step": 2525 }, { "entropy": 0.41406258940696716, "epoch": 0.40389527458492974, "grad_norm": 1.402048945426941, "learning_rate": 4.984937992098563e-06, "loss": 0.3714, "mean_token_accuracy": 0.8696890354156495, "num_tokens": 20723426.0, "step": 2530 }, { "entropy": 0.3955492854118347, "epoch": 0.4046934865900383, "grad_norm": 1.577100396156311, "learning_rate": 4.984878456317319e-06, "loss": 0.3379, "mean_token_accuracy": 0.8827073097229003, "num_tokens": 20764386.0, "step": 2535 }, { "entropy": 0.43836078643798826, "epoch": 0.4054916985951469, "grad_norm": 1.6845039129257202, "learning_rate": 4.98481880358053e-06, "loss": 0.3974, "mean_token_accuracy": 0.8614120006561279, "num_tokens": 20805346.0, "step": 2540 }, { "entropy": 0.39585062861442566, "epoch": 0.40628991060025543, "grad_norm": 1.5522538423538208, "learning_rate": 4.984759033891947e-06, "loss": 0.3672, "mean_token_accuracy": 0.8703001976013184, "num_tokens": 20846306.0, "step": 2545 }, { "entropy": 0.3866327404975891, "epoch": 0.407088122605364, "grad_norm": 1.6172707080841064, "learning_rate": 4.984699147255328e-06, "loss": 0.3555, "mean_token_accuracy": 0.8748281240463257, "num_tokens": 20887266.0, "step": 2550 }, { "entropy": 0.3750840961933136, "epoch": 0.4078863346104725, "grad_norm": 1.4508105516433716, "learning_rate": 4.98463914367444e-06, "loss": 0.3368, "mean_token_accuracy": 0.8814164757728576, "num_tokens": 20928226.0, "step": 2555 }, { "entropy": 0.41147985458374026, "epoch": 0.4086845466155811, "grad_norm": 1.4683401584625244, "learning_rate": 4.984579023153055e-06, "loss": 0.3689, "mean_token_accuracy": 0.8700311422348023, "num_tokens": 20969186.0, "step": 2560 }, { "entropy": 0.392798638343811, "epoch": 0.40948275862068967, "grad_norm": 1.5705111026763916, "learning_rate": 4.984518785694955e-06, "loss": 0.3469, "mean_token_accuracy": 0.8787789344787598, "num_tokens": 21009740.0, "step": 2565 }, { "entropy": 0.3867809295654297, "epoch": 0.4102809706257982, "grad_norm": 1.5291118621826172, "learning_rate": 4.984458431303926e-06, "loss": 0.3485, "mean_token_accuracy": 0.8778688192367554, "num_tokens": 21050700.0, "step": 2570 }, { "entropy": 0.40426079630851747, "epoch": 0.41107918263090676, "grad_norm": 1.7074692249298096, "learning_rate": 4.984397959983767e-06, "loss": 0.3584, "mean_token_accuracy": 0.8738163113594055, "num_tokens": 21091660.0, "step": 2575 }, { "entropy": 0.3680769085884094, "epoch": 0.4118773946360153, "grad_norm": 1.4109336137771606, "learning_rate": 4.984337371738276e-06, "loss": 0.3266, "mean_token_accuracy": 0.8867873907089233, "num_tokens": 21132620.0, "step": 2580 }, { "entropy": 0.4069203794002533, "epoch": 0.4126756066411239, "grad_norm": 1.6039010286331177, "learning_rate": 4.984276666571265e-06, "loss": 0.3654, "mean_token_accuracy": 0.8743149757385253, "num_tokens": 21173580.0, "step": 2585 }, { "entropy": 0.3620123088359833, "epoch": 0.41347381864623245, "grad_norm": 1.5748003721237183, "learning_rate": 4.984215844486552e-06, "loss": 0.3298, "mean_token_accuracy": 0.8843584299087525, "num_tokens": 21214540.0, "step": 2590 }, { "entropy": 0.3968845188617706, "epoch": 0.414272030651341, "grad_norm": 1.5188629627227783, "learning_rate": 4.984154905487961e-06, "loss": 0.3616, "mean_token_accuracy": 0.8736822605133057, "num_tokens": 21255500.0, "step": 2595 }, { "entropy": 0.4000989317893982, "epoch": 0.41507024265644954, "grad_norm": 1.6347826719284058, "learning_rate": 4.984093849579325e-06, "loss": 0.3592, "mean_token_accuracy": 0.8751783609390259, "num_tokens": 21296460.0, "step": 2600 }, { "epoch": 0.41507024265644954, "eval_entropy": 0.4033074338436127, "eval_loss": 0.3634474575519562, "eval_mean_token_accuracy": 0.8728289442062378, "eval_num_tokens": 21296460.0, "eval_runtime": 69.1125, "eval_samples_per_second": 14.469, "eval_steps_per_second": 1.809, "step": 2600 }, { "entropy": 0.38013476729393003, "epoch": 0.4158684546615581, "grad_norm": 1.4854322671890259, "learning_rate": 4.984032676764482e-06, "loss": 0.3342, "mean_token_accuracy": 0.8823701858520507, "num_tokens": 21337420.0, "step": 2605 }, { "entropy": 0.4014547288417816, "epoch": 0.4166666666666667, "grad_norm": 1.5105518102645874, "learning_rate": 4.983971387047279e-06, "loss": 0.3563, "mean_token_accuracy": 0.875412392616272, "num_tokens": 21378380.0, "step": 2610 }, { "entropy": 0.39206184148788453, "epoch": 0.41746487867177523, "grad_norm": 1.7710962295532227, "learning_rate": 4.983909980431572e-06, "loss": 0.3597, "mean_token_accuracy": 0.8740858912467957, "num_tokens": 21419340.0, "step": 2615 }, { "entropy": 0.43426434993743895, "epoch": 0.4182630906768838, "grad_norm": 1.6484614610671997, "learning_rate": 4.9838484569212195e-06, "loss": 0.3927, "mean_token_accuracy": 0.8656102776527405, "num_tokens": 21460300.0, "step": 2620 }, { "entropy": 0.40168466567993166, "epoch": 0.4190613026819923, "grad_norm": 1.6557681560516357, "learning_rate": 4.983786816520092e-06, "loss": 0.36, "mean_token_accuracy": 0.8744718313217164, "num_tokens": 21501260.0, "step": 2625 }, { "entropy": 0.40869095325469973, "epoch": 0.4198595146871009, "grad_norm": 1.9138432741165161, "learning_rate": 4.983725059232066e-06, "loss": 0.3766, "mean_token_accuracy": 0.8678873181343079, "num_tokens": 21542220.0, "step": 2630 }, { "entropy": 0.3851506352424622, "epoch": 0.42065772669220947, "grad_norm": 1.5859148502349854, "learning_rate": 4.983663185061024e-06, "loss": 0.336, "mean_token_accuracy": 0.882533586025238, "num_tokens": 21583180.0, "step": 2635 }, { "entropy": 0.414807003736496, "epoch": 0.421455938697318, "grad_norm": 1.611284613609314, "learning_rate": 4.983601194010857e-06, "loss": 0.3875, "mean_token_accuracy": 0.8661292314529419, "num_tokens": 21624140.0, "step": 2640 }, { "entropy": 0.3500566601753235, "epoch": 0.42225415070242656, "grad_norm": 1.4524935483932495, "learning_rate": 4.983539086085464e-06, "loss": 0.3105, "mean_token_accuracy": 0.8918946385383606, "num_tokens": 21665100.0, "step": 2645 }, { "entropy": 0.4368880867958069, "epoch": 0.4230523627075351, "grad_norm": 1.770884394645691, "learning_rate": 4.983476861288751e-06, "loss": 0.3941, "mean_token_accuracy": 0.8645419955253602, "num_tokens": 21706060.0, "step": 2650 }, { "entropy": 0.40582605004310607, "epoch": 0.4238505747126437, "grad_norm": 1.5398057699203491, "learning_rate": 4.983414519624629e-06, "loss": 0.3567, "mean_token_accuracy": 0.8746888637542725, "num_tokens": 21747020.0, "step": 2655 }, { "entropy": 0.38067981600761414, "epoch": 0.42464878671775225, "grad_norm": 1.5007768869400024, "learning_rate": 4.983352061097018e-06, "loss": 0.3399, "mean_token_accuracy": 0.8799324750900268, "num_tokens": 21787980.0, "step": 2660 }, { "entropy": 0.40576404333114624, "epoch": 0.4254469987228608, "grad_norm": 1.7250927686691284, "learning_rate": 4.9832894857098476e-06, "loss": 0.3693, "mean_token_accuracy": 0.8712058424949646, "num_tokens": 21828940.0, "step": 2665 }, { "entropy": 0.38202984929084777, "epoch": 0.42624521072796934, "grad_norm": 1.3978954553604126, "learning_rate": 4.983226793467053e-06, "loss": 0.3482, "mean_token_accuracy": 0.876945185661316, "num_tokens": 21869900.0, "step": 2670 }, { "entropy": 0.3940341711044312, "epoch": 0.4270434227330779, "grad_norm": 1.5598876476287842, "learning_rate": 4.983163984372575e-06, "loss": 0.3507, "mean_token_accuracy": 0.8790436387062073, "num_tokens": 21910860.0, "step": 2675 }, { "entropy": 0.40419002771377566, "epoch": 0.4278416347381865, "grad_norm": 1.6398168802261353, "learning_rate": 4.983101058430364e-06, "loss": 0.3683, "mean_token_accuracy": 0.870511507987976, "num_tokens": 21951820.0, "step": 2680 }, { "entropy": 0.41107907295227053, "epoch": 0.428639846743295, "grad_norm": 1.6293483972549438, "learning_rate": 4.983038015644376e-06, "loss": 0.3761, "mean_token_accuracy": 0.870405089855194, "num_tokens": 21992780.0, "step": 2685 }, { "entropy": 0.40680499076843263, "epoch": 0.4294380587484036, "grad_norm": 1.5343927145004272, "learning_rate": 4.982974856018576e-06, "loss": 0.3718, "mean_token_accuracy": 0.8700067877769471, "num_tokens": 22033740.0, "step": 2690 }, { "entropy": 0.38571652173995974, "epoch": 0.4302362707535121, "grad_norm": 1.4988517761230469, "learning_rate": 4.982911579556937e-06, "loss": 0.344, "mean_token_accuracy": 0.8792693257331848, "num_tokens": 22074700.0, "step": 2695 }, { "entropy": 0.39872742891311647, "epoch": 0.43103448275862066, "grad_norm": 1.4958524703979492, "learning_rate": 4.982848186263436e-06, "loss": 0.3603, "mean_token_accuracy": 0.873319935798645, "num_tokens": 22115660.0, "step": 2700 }, { "entropy": 0.41141684651374816, "epoch": 0.43183269476372926, "grad_norm": 1.6881943941116333, "learning_rate": 4.98278467614206e-06, "loss": 0.3722, "mean_token_accuracy": 0.870596992969513, "num_tokens": 22156620.0, "step": 2705 }, { "entropy": 0.3768575727939606, "epoch": 0.4326309067688378, "grad_norm": 1.5993489027023315, "learning_rate": 4.982721049196804e-06, "loss": 0.3351, "mean_token_accuracy": 0.8819820642471313, "num_tokens": 22197580.0, "step": 2710 }, { "entropy": 0.3916157424449921, "epoch": 0.43342911877394635, "grad_norm": 1.469970703125, "learning_rate": 4.982657305431668e-06, "loss": 0.3539, "mean_token_accuracy": 0.875360107421875, "num_tokens": 22238540.0, "step": 2715 }, { "entropy": 0.4240422070026398, "epoch": 0.4342273307790549, "grad_norm": 1.6995346546173096, "learning_rate": 4.982593444850658e-06, "loss": 0.3941, "mean_token_accuracy": 0.8621967434883118, "num_tokens": 22279500.0, "step": 2720 }, { "entropy": 0.36775652766227723, "epoch": 0.4350255427841635, "grad_norm": 1.548226237297058, "learning_rate": 4.982529467457795e-06, "loss": 0.3257, "mean_token_accuracy": 0.8852847933769226, "num_tokens": 22320460.0, "step": 2725 }, { "entropy": 0.3609649360179901, "epoch": 0.43582375478927204, "grad_norm": 1.4408965110778809, "learning_rate": 4.982465373257098e-06, "loss": 0.3123, "mean_token_accuracy": 0.891053307056427, "num_tokens": 22361420.0, "step": 2730 }, { "entropy": 0.3790753722190857, "epoch": 0.4366219667943806, "grad_norm": 1.5519766807556152, "learning_rate": 4.982401162252599e-06, "loss": 0.3386, "mean_token_accuracy": 0.8807764530181885, "num_tokens": 22402380.0, "step": 2735 }, { "entropy": 0.43087227940559386, "epoch": 0.43742017879948913, "grad_norm": 1.7707847356796265, "learning_rate": 4.982336834448336e-06, "loss": 0.3945, "mean_token_accuracy": 0.8637638330459595, "num_tokens": 22443340.0, "step": 2740 }, { "entropy": 0.3714826762676239, "epoch": 0.4382183908045977, "grad_norm": 1.6115907430648804, "learning_rate": 4.982272389848354e-06, "loss": 0.3326, "mean_token_accuracy": 0.880763578414917, "num_tokens": 22484300.0, "step": 2745 }, { "entropy": 0.37650308609008787, "epoch": 0.4390166028097063, "grad_norm": 1.711526870727539, "learning_rate": 4.982207828456705e-06, "loss": 0.3369, "mean_token_accuracy": 0.8830351948738098, "num_tokens": 22525260.0, "step": 2750 }, { "entropy": 0.4391146719455719, "epoch": 0.4398148148148148, "grad_norm": 1.8127883672714233, "learning_rate": 4.982143150277448e-06, "loss": 0.3955, "mean_token_accuracy": 0.8630771994590759, "num_tokens": 22566220.0, "step": 2755 }, { "entropy": 0.41789884567260743, "epoch": 0.44061302681992337, "grad_norm": 1.5921564102172852, "learning_rate": 4.982078355314654e-06, "loss": 0.3756, "mean_token_accuracy": 0.8702207565307617, "num_tokens": 22607180.0, "step": 2760 }, { "entropy": 0.38065919280052185, "epoch": 0.4414112388250319, "grad_norm": 1.8245171308517456, "learning_rate": 4.982013443572392e-06, "loss": 0.3422, "mean_token_accuracy": 0.8805180788040161, "num_tokens": 22648140.0, "step": 2765 }, { "entropy": 0.38244200944900514, "epoch": 0.44220945083014046, "grad_norm": 1.4093703031539917, "learning_rate": 4.9819484150547485e-06, "loss": 0.3397, "mean_token_accuracy": 0.8840342402458191, "num_tokens": 22689100.0, "step": 2770 }, { "entropy": 0.4221570134162903, "epoch": 0.44300766283524906, "grad_norm": 1.7415305376052856, "learning_rate": 4.981883269765809e-06, "loss": 0.379, "mean_token_accuracy": 0.8687703728675842, "num_tokens": 22730060.0, "step": 2775 }, { "entropy": 0.3891486167907715, "epoch": 0.4438058748403576, "grad_norm": 1.4292845726013184, "learning_rate": 4.981818007709674e-06, "loss": 0.3474, "mean_token_accuracy": 0.879124915599823, "num_tokens": 22771020.0, "step": 2780 }, { "entropy": 0.4010182499885559, "epoch": 0.44460408684546615, "grad_norm": 1.5577126741409302, "learning_rate": 4.981752628890445e-06, "loss": 0.3628, "mean_token_accuracy": 0.87374427318573, "num_tokens": 22811980.0, "step": 2785 }, { "entropy": 0.35749190449714663, "epoch": 0.4454022988505747, "grad_norm": 1.5466744899749756, "learning_rate": 4.981687133312233e-06, "loss": 0.3208, "mean_token_accuracy": 0.8853784084320069, "num_tokens": 22852940.0, "step": 2790 }, { "entropy": 0.42231556177139284, "epoch": 0.4462005108556833, "grad_norm": 1.9182018041610718, "learning_rate": 4.981621520979157e-06, "loss": 0.3868, "mean_token_accuracy": 0.8668047189712524, "num_tokens": 22893900.0, "step": 2795 }, { "entropy": 0.37705173492431643, "epoch": 0.44699872286079184, "grad_norm": 1.5075479745864868, "learning_rate": 4.9815557918953444e-06, "loss": 0.3417, "mean_token_accuracy": 0.8799205899238587, "num_tokens": 22934860.0, "step": 2800 }, { "epoch": 0.44699872286079184, "eval_entropy": 0.4021283230781555, "eval_loss": 0.3617970049381256, "eval_mean_token_accuracy": 0.8734477977752686, "eval_num_tokens": 22934860.0, "eval_runtime": 69.2427, "eval_samples_per_second": 14.442, "eval_steps_per_second": 1.805, "step": 2800 }, { "entropy": 0.4065449595451355, "epoch": 0.4477969348659004, "grad_norm": 1.647086262702942, "learning_rate": 4.981489946064926e-06, "loss": 0.3633, "mean_token_accuracy": 0.871274995803833, "num_tokens": 22975820.0, "step": 2805 }, { "entropy": 0.3849412977695465, "epoch": 0.44859514687100893, "grad_norm": 1.7600377798080444, "learning_rate": 4.9814239834920445e-06, "loss": 0.3431, "mean_token_accuracy": 0.878504502773285, "num_tokens": 23016780.0, "step": 2810 }, { "entropy": 0.40428113341331484, "epoch": 0.4493933588761175, "grad_norm": 1.734229564666748, "learning_rate": 4.981357904180847e-06, "loss": 0.3679, "mean_token_accuracy": 0.8724266767501831, "num_tokens": 23057740.0, "step": 2815 }, { "entropy": 0.38803368210792544, "epoch": 0.4501915708812261, "grad_norm": 1.6498732566833496, "learning_rate": 4.981291708135488e-06, "loss": 0.3525, "mean_token_accuracy": 0.8783321976661682, "num_tokens": 23098700.0, "step": 2820 }, { "entropy": 0.4112163186073303, "epoch": 0.4509897828863346, "grad_norm": 1.5178463459014893, "learning_rate": 4.981225395360131e-06, "loss": 0.3734, "mean_token_accuracy": 0.8709302544593811, "num_tokens": 23139660.0, "step": 2825 }, { "entropy": 0.4096134901046753, "epoch": 0.45178799489144317, "grad_norm": 1.6412782669067383, "learning_rate": 4.9811589658589464e-06, "loss": 0.3589, "mean_token_accuracy": 0.8746277451515198, "num_tokens": 23180620.0, "step": 2830 }, { "entropy": 0.3961434066295624, "epoch": 0.4525862068965517, "grad_norm": 1.5939996242523193, "learning_rate": 4.981092419636111e-06, "loss": 0.3623, "mean_token_accuracy": 0.8744441866874695, "num_tokens": 23221580.0, "step": 2835 }, { "entropy": 0.40303857922554015, "epoch": 0.45338441890166026, "grad_norm": 1.6157113313674927, "learning_rate": 4.981025756695809e-06, "loss": 0.3635, "mean_token_accuracy": 0.8736197471618652, "num_tokens": 23262540.0, "step": 2840 }, { "entropy": 0.38037262558937074, "epoch": 0.45418263090676886, "grad_norm": 1.565201997756958, "learning_rate": 4.980958977042233e-06, "loss": 0.3347, "mean_token_accuracy": 0.8818344831466675, "num_tokens": 23303500.0, "step": 2845 }, { "entropy": 0.39692175984382627, "epoch": 0.4549808429118774, "grad_norm": 1.5817967653274536, "learning_rate": 4.980892080679582e-06, "loss": 0.3558, "mean_token_accuracy": 0.8763553977012635, "num_tokens": 23344460.0, "step": 2850 }, { "entropy": 0.40519008636474607, "epoch": 0.45577905491698595, "grad_norm": 1.681086778640747, "learning_rate": 4.980825067612063e-06, "loss": 0.3701, "mean_token_accuracy": 0.8710267305374145, "num_tokens": 23385201.0, "step": 2855 }, { "entropy": 0.4324694097042084, "epoch": 0.4565772669220945, "grad_norm": 1.7982279062271118, "learning_rate": 4.9807579378438905e-06, "loss": 0.3926, "mean_token_accuracy": 0.8650486469268799, "num_tokens": 23426161.0, "step": 2860 }, { "entropy": 0.3910203635692596, "epoch": 0.45737547892720304, "grad_norm": 1.6430963277816772, "learning_rate": 4.980690691379284e-06, "loss": 0.3453, "mean_token_accuracy": 0.8780064105987548, "num_tokens": 23467121.0, "step": 2865 }, { "entropy": 0.40927610993385316, "epoch": 0.45817369093231164, "grad_norm": 1.5375142097473145, "learning_rate": 4.980623328222475e-06, "loss": 0.377, "mean_token_accuracy": 0.8677163481712341, "num_tokens": 23508081.0, "step": 2870 }, { "entropy": 0.40993956923484803, "epoch": 0.4589719029374202, "grad_norm": 1.4786747694015503, "learning_rate": 4.980555848377696e-06, "loss": 0.3671, "mean_token_accuracy": 0.8713637351989746, "num_tokens": 23549041.0, "step": 2875 }, { "entropy": 0.3585766851902008, "epoch": 0.45977011494252873, "grad_norm": 1.4331398010253906, "learning_rate": 4.9804882518491936e-06, "loss": 0.3152, "mean_token_accuracy": 0.8895103693008423, "num_tokens": 23590001.0, "step": 2880 }, { "entropy": 0.42248719930648804, "epoch": 0.4605683269476373, "grad_norm": 1.7327814102172852, "learning_rate": 4.980420538641217e-06, "loss": 0.3855, "mean_token_accuracy": 0.8655761480331421, "num_tokens": 23630961.0, "step": 2885 }, { "entropy": 0.39616808891296384, "epoch": 0.4613665389527459, "grad_norm": 1.533501148223877, "learning_rate": 4.980352708758025e-06, "loss": 0.3539, "mean_token_accuracy": 0.8772770881652832, "num_tokens": 23671921.0, "step": 2890 }, { "entropy": 0.4002328336238861, "epoch": 0.4621647509578544, "grad_norm": 1.5775973796844482, "learning_rate": 4.980284762203882e-06, "loss": 0.3529, "mean_token_accuracy": 0.878447163105011, "num_tokens": 23712881.0, "step": 2895 }, { "entropy": 0.38118772506713866, "epoch": 0.46296296296296297, "grad_norm": 1.5659089088439941, "learning_rate": 4.98021669898306e-06, "loss": 0.3376, "mean_token_accuracy": 0.8805113434791565, "num_tokens": 23753841.0, "step": 2900 }, { "entropy": 0.4109515011310577, "epoch": 0.4637611749680715, "grad_norm": 1.481997013092041, "learning_rate": 4.980148519099842e-06, "loss": 0.3751, "mean_token_accuracy": 0.8691876173019409, "num_tokens": 23794801.0, "step": 2905 }, { "entropy": 0.4154976367950439, "epoch": 0.46455938697318006, "grad_norm": 1.6384352445602417, "learning_rate": 4.980080222558512e-06, "loss": 0.383, "mean_token_accuracy": 0.8686699986457824, "num_tokens": 23835761.0, "step": 2910 }, { "entropy": 0.44558030366897583, "epoch": 0.46535759897828866, "grad_norm": 1.8630101680755615, "learning_rate": 4.9800118093633675e-06, "loss": 0.3996, "mean_token_accuracy": 0.8595394015312194, "num_tokens": 23876721.0, "step": 2915 }, { "entropy": 0.46090860962867736, "epoch": 0.4661558109833972, "grad_norm": 1.6846959590911865, "learning_rate": 4.979943279518709e-06, "loss": 0.4232, "mean_token_accuracy": 0.8549802303314209, "num_tokens": 23917681.0, "step": 2920 }, { "entropy": 0.36205923557281494, "epoch": 0.46695402298850575, "grad_norm": 1.501598834991455, "learning_rate": 4.979874633028846e-06, "loss": 0.3113, "mean_token_accuracy": 0.8894065141677856, "num_tokens": 23958641.0, "step": 2925 }, { "entropy": 0.4432215213775635, "epoch": 0.4677522349936143, "grad_norm": 1.6874146461486816, "learning_rate": 4.979805869898095e-06, "loss": 0.4057, "mean_token_accuracy": 0.8603995084762573, "num_tokens": 23999601.0, "step": 2930 }, { "entropy": 0.39056233167648313, "epoch": 0.46855044699872284, "grad_norm": 1.5292900800704956, "learning_rate": 4.9797369901307815e-06, "loss": 0.3562, "mean_token_accuracy": 0.8762812137603759, "num_tokens": 24040561.0, "step": 2935 }, { "entropy": 0.37874605059623717, "epoch": 0.46934865900383144, "grad_norm": 1.7269057035446167, "learning_rate": 4.979667993731235e-06, "loss": 0.3488, "mean_token_accuracy": 0.8776691555976868, "num_tokens": 24081521.0, "step": 2940 }, { "entropy": 0.38322940468788147, "epoch": 0.47014687100894, "grad_norm": 1.7017403841018677, "learning_rate": 4.979598880703796e-06, "loss": 0.3369, "mean_token_accuracy": 0.8816770792007447, "num_tokens": 24122481.0, "step": 2945 }, { "entropy": 0.364909029006958, "epoch": 0.4709450830140485, "grad_norm": 1.543266773223877, "learning_rate": 4.979529651052809e-06, "loss": 0.3262, "mean_token_accuracy": 0.884519624710083, "num_tokens": 24163441.0, "step": 2950 }, { "entropy": 0.34366172552108765, "epoch": 0.47174329501915707, "grad_norm": 1.5701253414154053, "learning_rate": 4.979460304782628e-06, "loss": 0.3083, "mean_token_accuracy": 0.8912683486938476, "num_tokens": 24204401.0, "step": 2955 }, { "entropy": 0.4112046599388123, "epoch": 0.4725415070242657, "grad_norm": 1.6567833423614502, "learning_rate": 4.979390841897615e-06, "loss": 0.3667, "mean_token_accuracy": 0.8730028867721558, "num_tokens": 24245361.0, "step": 2960 }, { "entropy": 0.3741000950336456, "epoch": 0.4733397190293742, "grad_norm": 1.6795862913131714, "learning_rate": 4.979321262402136e-06, "loss": 0.3368, "mean_token_accuracy": 0.8805624365806579, "num_tokens": 24286321.0, "step": 2965 }, { "entropy": 0.3774628400802612, "epoch": 0.47413793103448276, "grad_norm": 1.640445590019226, "learning_rate": 4.979251566300568e-06, "loss": 0.339, "mean_token_accuracy": 0.8825162529945374, "num_tokens": 24327281.0, "step": 2970 }, { "entropy": 0.3802816331386566, "epoch": 0.4749361430395913, "grad_norm": 1.6335704326629639, "learning_rate": 4.979181753597292e-06, "loss": 0.3471, "mean_token_accuracy": 0.8781920194625854, "num_tokens": 24368241.0, "step": 2975 }, { "entropy": 0.3621665060520172, "epoch": 0.47573435504469985, "grad_norm": 1.4962271451950073, "learning_rate": 4.9791118242966996e-06, "loss": 0.3319, "mean_token_accuracy": 0.8808696627616882, "num_tokens": 24409201.0, "step": 2980 }, { "entropy": 0.37013022899627684, "epoch": 0.47653256704980845, "grad_norm": 1.4742448329925537, "learning_rate": 4.9790417784031875e-06, "loss": 0.3305, "mean_token_accuracy": 0.8842559218406677, "num_tokens": 24450161.0, "step": 2985 }, { "entropy": 0.3834933042526245, "epoch": 0.477330779054917, "grad_norm": 1.6567814350128174, "learning_rate": 4.978971615921161e-06, "loss": 0.3368, "mean_token_accuracy": 0.8838761329650879, "num_tokens": 24491121.0, "step": 2990 }, { "entropy": 0.41710891723632815, "epoch": 0.47812899106002554, "grad_norm": 1.7117583751678467, "learning_rate": 4.978901336855031e-06, "loss": 0.3841, "mean_token_accuracy": 0.8669528484344482, "num_tokens": 24532081.0, "step": 2995 }, { "entropy": 0.37902289628982544, "epoch": 0.4789272030651341, "grad_norm": 1.5434080362319946, "learning_rate": 4.978830941209218e-06, "loss": 0.3335, "mean_token_accuracy": 0.8823231101036072, "num_tokens": 24573041.0, "step": 3000 }, { "epoch": 0.4789272030651341, "eval_entropy": 0.4031079981327057, "eval_loss": 0.36067894101142883, "eval_mean_token_accuracy": 0.8737649130821228, "eval_num_tokens": 24573041.0, "eval_runtime": 69.1391, "eval_samples_per_second": 14.464, "eval_steps_per_second": 1.808, "step": 3000 }, { "entropy": 0.3913613796234131, "epoch": 0.47972541507024263, "grad_norm": 1.642884612083435, "learning_rate": 4.978760428988149e-06, "loss": 0.3484, "mean_token_accuracy": 0.8776594400405884, "num_tokens": 24614001.0, "step": 3005 }, { "entropy": 0.39155340790748594, "epoch": 0.48052362707535123, "grad_norm": 1.5776258707046509, "learning_rate": 4.978689800196257e-06, "loss": 0.3501, "mean_token_accuracy": 0.8766720056533813, "num_tokens": 24654961.0, "step": 3010 }, { "entropy": 0.36276376247406006, "epoch": 0.4813218390804598, "grad_norm": 1.6027809381484985, "learning_rate": 4.978619054837984e-06, "loss": 0.3161, "mean_token_accuracy": 0.8871202230453491, "num_tokens": 24695587.0, "step": 3015 }, { "entropy": 0.40595887303352357, "epoch": 0.4821200510855683, "grad_norm": 1.7121602296829224, "learning_rate": 4.978548192917779e-06, "loss": 0.3675, "mean_token_accuracy": 0.8705784201622009, "num_tokens": 24736547.0, "step": 3020 }, { "entropy": 0.39229520559310915, "epoch": 0.48291826309067687, "grad_norm": 1.6675854921340942, "learning_rate": 4.9784772144400976e-06, "loss": 0.3576, "mean_token_accuracy": 0.8745187759399414, "num_tokens": 24777507.0, "step": 3025 }, { "entropy": 0.44495280385017394, "epoch": 0.4837164750957854, "grad_norm": 1.521255612373352, "learning_rate": 4.978406119409403e-06, "loss": 0.4153, "mean_token_accuracy": 0.8569184064865112, "num_tokens": 24818467.0, "step": 3030 }, { "entropy": 0.41049495339393616, "epoch": 0.484514687100894, "grad_norm": 1.782544732093811, "learning_rate": 4.978334907830164e-06, "loss": 0.381, "mean_token_accuracy": 0.8683913707733154, "num_tokens": 24859427.0, "step": 3035 }, { "entropy": 0.3862737715244293, "epoch": 0.48531289910600256, "grad_norm": 1.696000576019287, "learning_rate": 4.978263579706862e-06, "loss": 0.3456, "mean_token_accuracy": 0.8780618906021118, "num_tokens": 24900387.0, "step": 3040 }, { "entropy": 0.3594102025032043, "epoch": 0.4861111111111111, "grad_norm": 1.5718748569488525, "learning_rate": 4.978192135043982e-06, "loss": 0.3142, "mean_token_accuracy": 0.8889271020889282, "num_tokens": 24941347.0, "step": 3045 }, { "entropy": 0.3866430759429932, "epoch": 0.48690932311621965, "grad_norm": 1.7633681297302246, "learning_rate": 4.9781205738460155e-06, "loss": 0.3542, "mean_token_accuracy": 0.8774181127548217, "num_tokens": 24982307.0, "step": 3050 }, { "entropy": 0.40436485409736633, "epoch": 0.48770753512132825, "grad_norm": 1.350527286529541, "learning_rate": 4.978048896117462e-06, "loss": 0.3653, "mean_token_accuracy": 0.8719983696937561, "num_tokens": 25023267.0, "step": 3055 }, { "entropy": 0.40219146609306333, "epoch": 0.4885057471264368, "grad_norm": 1.4634770154953003, "learning_rate": 4.9779771018628296e-06, "loss": 0.3668, "mean_token_accuracy": 0.8723389029502868, "num_tokens": 25064227.0, "step": 3060 }, { "entropy": 0.3927314758300781, "epoch": 0.48930395913154534, "grad_norm": 1.6988699436187744, "learning_rate": 4.977905191086634e-06, "loss": 0.36, "mean_token_accuracy": 0.8752070426940918, "num_tokens": 25105187.0, "step": 3065 }, { "entropy": 0.37555994391441344, "epoch": 0.4901021711366539, "grad_norm": 1.4584192037582397, "learning_rate": 4.977833163793395e-06, "loss": 0.3299, "mean_token_accuracy": 0.8834568738937378, "num_tokens": 25146147.0, "step": 3070 }, { "entropy": 0.40433894395828246, "epoch": 0.49090038314176243, "grad_norm": 1.6292520761489868, "learning_rate": 4.977761019987642e-06, "loss": 0.3651, "mean_token_accuracy": 0.8728869557380676, "num_tokens": 25187107.0, "step": 3075 }, { "entropy": 0.38123475313186644, "epoch": 0.49169859514687103, "grad_norm": 1.3067562580108643, "learning_rate": 4.977688759673916e-06, "loss": 0.344, "mean_token_accuracy": 0.8789713621139527, "num_tokens": 25228067.0, "step": 3080 }, { "entropy": 0.37763266563415526, "epoch": 0.4924968071519796, "grad_norm": 1.6079351902008057, "learning_rate": 4.977616382856755e-06, "loss": 0.3267, "mean_token_accuracy": 0.8850642085075379, "num_tokens": 25269027.0, "step": 3085 }, { "entropy": 0.3939137518405914, "epoch": 0.4932950191570881, "grad_norm": 1.75822913646698, "learning_rate": 4.977543889540713e-06, "loss": 0.3446, "mean_token_accuracy": 0.8792263031005859, "num_tokens": 25309987.0, "step": 3090 }, { "entropy": 0.36902579069137575, "epoch": 0.49409323116219667, "grad_norm": 1.7407400608062744, "learning_rate": 4.977471279730349e-06, "loss": 0.3314, "mean_token_accuracy": 0.8828243494033814, "num_tokens": 25350947.0, "step": 3095 }, { "entropy": 0.4192457675933838, "epoch": 0.4948914431673052, "grad_norm": 1.7239857912063599, "learning_rate": 4.97739855343023e-06, "loss": 0.3784, "mean_token_accuracy": 0.8699460387229919, "num_tokens": 25391907.0, "step": 3100 }, { "entropy": 0.377583783864975, "epoch": 0.4956896551724138, "grad_norm": 1.643020510673523, "learning_rate": 4.977325710644927e-06, "loss": 0.3309, "mean_token_accuracy": 0.8839346528053283, "num_tokens": 25432867.0, "step": 3105 }, { "entropy": 0.37346488833427427, "epoch": 0.49648786717752236, "grad_norm": 1.7278952598571777, "learning_rate": 4.977252751379021e-06, "loss": 0.337, "mean_token_accuracy": 0.8824220895767212, "num_tokens": 25473805.0, "step": 3110 }, { "entropy": 0.3702260494232178, "epoch": 0.4972860791826309, "grad_norm": 1.620824933052063, "learning_rate": 4.9771796756371005e-06, "loss": 0.3344, "mean_token_accuracy": 0.881265652179718, "num_tokens": 25514765.0, "step": 3115 }, { "entropy": 0.3484692215919495, "epoch": 0.49808429118773945, "grad_norm": 1.5499067306518555, "learning_rate": 4.9771064834237605e-06, "loss": 0.3131, "mean_token_accuracy": 0.8905044913291931, "num_tokens": 25555725.0, "step": 3120 }, { "entropy": 0.4137328088283539, "epoch": 0.49888250319284805, "grad_norm": 1.6705187559127808, "learning_rate": 4.977033174743604e-06, "loss": 0.3724, "mean_token_accuracy": 0.8712103247642518, "num_tokens": 25596685.0, "step": 3125 }, { "entropy": 0.4071082234382629, "epoch": 0.4996807151979566, "grad_norm": 1.6194504499435425, "learning_rate": 4.9769597496012405e-06, "loss": 0.3566, "mean_token_accuracy": 0.8755745053291321, "num_tokens": 25637645.0, "step": 3130 }, { "entropy": 0.41910150051116946, "epoch": 0.5004789272030651, "grad_norm": 1.5641275644302368, "learning_rate": 4.976886208001287e-06, "loss": 0.3717, "mean_token_accuracy": 0.8706887483596801, "num_tokens": 25678605.0, "step": 3135 }, { "entropy": 0.4073460757732391, "epoch": 0.5012771392081737, "grad_norm": 1.7469613552093506, "learning_rate": 4.9768125499483695e-06, "loss": 0.3774, "mean_token_accuracy": 0.8685148596763611, "num_tokens": 25719565.0, "step": 3140 }, { "entropy": 0.39724062085151673, "epoch": 0.5020753512132823, "grad_norm": 1.6528843641281128, "learning_rate": 4.976738775447118e-06, "loss": 0.3619, "mean_token_accuracy": 0.8764813065528869, "num_tokens": 25760525.0, "step": 3145 }, { "entropy": 0.36525858044624326, "epoch": 0.5028735632183908, "grad_norm": 1.4947924613952637, "learning_rate": 4.976664884502172e-06, "loss": 0.3251, "mean_token_accuracy": 0.8846045970916748, "num_tokens": 25801485.0, "step": 3150 }, { "entropy": 0.39498506784439086, "epoch": 0.5036717752234994, "grad_norm": 1.5894827842712402, "learning_rate": 4.9765908771181795e-06, "loss": 0.355, "mean_token_accuracy": 0.8751391053199769, "num_tokens": 25842445.0, "step": 3155 }, { "entropy": 0.3828314483165741, "epoch": 0.5044699872286079, "grad_norm": 1.512500524520874, "learning_rate": 4.976516753299793e-06, "loss": 0.3499, "mean_token_accuracy": 0.8800006866455078, "num_tokens": 25883405.0, "step": 3160 }, { "entropy": 0.39771613478660583, "epoch": 0.5052681992337165, "grad_norm": 1.5987226963043213, "learning_rate": 4.976442513051674e-06, "loss": 0.3458, "mean_token_accuracy": 0.8785083174705506, "num_tokens": 25924365.0, "step": 3165 }, { "entropy": 0.44791218638420105, "epoch": 0.506066411238825, "grad_norm": 1.728903889656067, "learning_rate": 4.97636815637849e-06, "loss": 0.4038, "mean_token_accuracy": 0.8605072498321533, "num_tokens": 25965325.0, "step": 3170 }, { "entropy": 0.4219264924526215, "epoch": 0.5068646232439336, "grad_norm": 1.9006085395812988, "learning_rate": 4.976293683284918e-06, "loss": 0.392, "mean_token_accuracy": 0.8648006916046143, "num_tokens": 26006285.0, "step": 3175 }, { "entropy": 0.3821633756160736, "epoch": 0.5076628352490421, "grad_norm": 1.7359493970870972, "learning_rate": 4.976219093775642e-06, "loss": 0.3496, "mean_token_accuracy": 0.8779027462005615, "num_tokens": 26047245.0, "step": 3180 }, { "entropy": 0.4139282822608948, "epoch": 0.5084610472541508, "grad_norm": 1.6639729738235474, "learning_rate": 4.976144387855351e-06, "loss": 0.3751, "mean_token_accuracy": 0.8683338642120362, "num_tokens": 26088205.0, "step": 3185 }, { "entropy": 0.3756455063819885, "epoch": 0.5092592592592593, "grad_norm": 1.600915551185608, "learning_rate": 4.976069565528743e-06, "loss": 0.3329, "mean_token_accuracy": 0.8827805042266845, "num_tokens": 26129165.0, "step": 3190 }, { "entropy": 0.4037862837314606, "epoch": 0.5100574712643678, "grad_norm": 1.614333987236023, "learning_rate": 4.9759946268005224e-06, "loss": 0.3557, "mean_token_accuracy": 0.8748448014259338, "num_tokens": 26170125.0, "step": 3195 }, { "entropy": 0.4018108367919922, "epoch": 0.5108556832694764, "grad_norm": 1.522694706916809, "learning_rate": 4.975919571675403e-06, "loss": 0.3671, "mean_token_accuracy": 0.8725090503692627, "num_tokens": 26211085.0, "step": 3200 }, { "epoch": 0.5108556832694764, "eval_entropy": 0.3950038187503815, "eval_loss": 0.35790205001831055, "eval_mean_token_accuracy": 0.8749810900688172, "eval_num_tokens": 26211085.0, "eval_runtime": 69.1405, "eval_samples_per_second": 14.463, "eval_steps_per_second": 1.808, "step": 3200 }, { "entropy": 0.4304787814617157, "epoch": 0.5116538952745849, "grad_norm": 1.5640943050384521, "learning_rate": 4.975844400158104e-06, "loss": 0.3979, "mean_token_accuracy": 0.8626617193222046, "num_tokens": 26252045.0, "step": 3205 }, { "entropy": 0.38331787586212157, "epoch": 0.5124521072796935, "grad_norm": 1.618471384048462, "learning_rate": 4.975769112253352e-06, "loss": 0.347, "mean_token_accuracy": 0.8774407744407654, "num_tokens": 26293005.0, "step": 3210 }, { "entropy": 0.3924212872982025, "epoch": 0.513250319284802, "grad_norm": 1.6554707288742065, "learning_rate": 4.975693707965882e-06, "loss": 0.35, "mean_token_accuracy": 0.878236734867096, "num_tokens": 26333965.0, "step": 3215 }, { "entropy": 0.3892656922340393, "epoch": 0.5140485312899106, "grad_norm": 1.5585850477218628, "learning_rate": 4.975618187300435e-06, "loss": 0.3503, "mean_token_accuracy": 0.87829430103302, "num_tokens": 26374925.0, "step": 3220 }, { "entropy": 0.42128477096557615, "epoch": 0.5148467432950191, "grad_norm": 1.7279189825057983, "learning_rate": 4.975542550261761e-06, "loss": 0.3763, "mean_token_accuracy": 0.8681336760520935, "num_tokens": 26415885.0, "step": 3225 }, { "entropy": 0.37716048359870913, "epoch": 0.5156449553001277, "grad_norm": 1.4353517293930054, "learning_rate": 4.975466796854615e-06, "loss": 0.3504, "mean_token_accuracy": 0.8789421796798706, "num_tokens": 26456845.0, "step": 3230 }, { "entropy": 0.3674571573734283, "epoch": 0.5164431673052363, "grad_norm": 1.4046391248703003, "learning_rate": 4.975390927083762e-06, "loss": 0.3254, "mean_token_accuracy": 0.8862491250038147, "num_tokens": 26497805.0, "step": 3235 }, { "entropy": 0.41730799078941344, "epoch": 0.5172413793103449, "grad_norm": 1.554059624671936, "learning_rate": 4.975314940953972e-06, "loss": 0.3739, "mean_token_accuracy": 0.8708946228027343, "num_tokens": 26538765.0, "step": 3240 }, { "entropy": 0.363294380903244, "epoch": 0.5180395913154534, "grad_norm": 1.5008974075317383, "learning_rate": 4.9752388384700235e-06, "loss": 0.3261, "mean_token_accuracy": 0.8853865742683411, "num_tokens": 26579725.0, "step": 3245 }, { "entropy": 0.3818063735961914, "epoch": 0.518837803320562, "grad_norm": 1.5581785440444946, "learning_rate": 4.975162619636702e-06, "loss": 0.347, "mean_token_accuracy": 0.8779454112052918, "num_tokens": 26620685.0, "step": 3250 }, { "entropy": 0.4436651051044464, "epoch": 0.5196360153256705, "grad_norm": 1.7735083103179932, "learning_rate": 4.975086284458801e-06, "loss": 0.4083, "mean_token_accuracy": 0.859112286567688, "num_tokens": 26661645.0, "step": 3255 }, { "entropy": 0.4206997752189636, "epoch": 0.520434227330779, "grad_norm": 1.5185121297836304, "learning_rate": 4.97500983294112e-06, "loss": 0.3744, "mean_token_accuracy": 0.868882167339325, "num_tokens": 26702605.0, "step": 3260 }, { "entropy": 0.36768691539764403, "epoch": 0.5212324393358876, "grad_norm": 1.4248138666152954, "learning_rate": 4.974933265088468e-06, "loss": 0.3169, "mean_token_accuracy": 0.8884208679199219, "num_tokens": 26743565.0, "step": 3265 }, { "entropy": 0.37526572942733766, "epoch": 0.5220306513409961, "grad_norm": 1.447968602180481, "learning_rate": 4.974856580905656e-06, "loss": 0.3427, "mean_token_accuracy": 0.8792654156684876, "num_tokens": 26784525.0, "step": 3270 }, { "entropy": 0.35951876640319824, "epoch": 0.5228288633461047, "grad_norm": 1.3812257051467896, "learning_rate": 4.974779780397511e-06, "loss": 0.3239, "mean_token_accuracy": 0.883990478515625, "num_tokens": 26825485.0, "step": 3275 }, { "entropy": 0.3781895101070404, "epoch": 0.5236270753512133, "grad_norm": 1.5094199180603027, "learning_rate": 4.974702863568859e-06, "loss": 0.339, "mean_token_accuracy": 0.8790646076202393, "num_tokens": 26866445.0, "step": 3280 }, { "entropy": 0.41062875390052794, "epoch": 0.5244252873563219, "grad_norm": 1.8724991083145142, "learning_rate": 4.9746258304245385e-06, "loss": 0.3774, "mean_token_accuracy": 0.8707447409629822, "num_tokens": 26907405.0, "step": 3285 }, { "entropy": 0.35251411199569704, "epoch": 0.5252234993614304, "grad_norm": 1.6266639232635498, "learning_rate": 4.974548680969394e-06, "loss": 0.3123, "mean_token_accuracy": 0.8902901887893677, "num_tokens": 26948365.0, "step": 3290 }, { "entropy": 0.39582002758979795, "epoch": 0.526021711366539, "grad_norm": 1.7405627965927124, "learning_rate": 4.974471415208275e-06, "loss": 0.3449, "mean_token_accuracy": 0.8775237441062927, "num_tokens": 26989325.0, "step": 3295 }, { "entropy": 0.4057934045791626, "epoch": 0.5268199233716475, "grad_norm": 1.7901203632354736, "learning_rate": 4.974394033146042e-06, "loss": 0.3707, "mean_token_accuracy": 0.870267128944397, "num_tokens": 27030285.0, "step": 3300 }, { "entropy": 0.3816504716873169, "epoch": 0.5276181353767561, "grad_norm": 1.5628036260604858, "learning_rate": 4.97431653478756e-06, "loss": 0.3441, "mean_token_accuracy": 0.8798413515090943, "num_tokens": 27071245.0, "step": 3305 }, { "entropy": 0.3856058239936829, "epoch": 0.5284163473818646, "grad_norm": 1.5495929718017578, "learning_rate": 4.9742389201377025e-06, "loss": 0.3496, "mean_token_accuracy": 0.8771246552467347, "num_tokens": 27112205.0, "step": 3310 }, { "entropy": 0.4250369966030121, "epoch": 0.5292145593869731, "grad_norm": 1.7381726503372192, "learning_rate": 4.974161189201351e-06, "loss": 0.3816, "mean_token_accuracy": 0.8662177205085755, "num_tokens": 27153165.0, "step": 3315 }, { "entropy": 0.408034086227417, "epoch": 0.5300127713920817, "grad_norm": 1.7087209224700928, "learning_rate": 4.974083341983393e-06, "loss": 0.3661, "mean_token_accuracy": 0.872035276889801, "num_tokens": 27194125.0, "step": 3320 }, { "entropy": 0.41455634832382204, "epoch": 0.5308109833971902, "grad_norm": 1.5264880657196045, "learning_rate": 4.974005378488724e-06, "loss": 0.3761, "mean_token_accuracy": 0.8700628280639648, "num_tokens": 27235085.0, "step": 3325 }, { "entropy": 0.3727712512016296, "epoch": 0.5316091954022989, "grad_norm": 1.5364115238189697, "learning_rate": 4.973927298722247e-06, "loss": 0.3394, "mean_token_accuracy": 0.8815152525901795, "num_tokens": 27276045.0, "step": 3330 }, { "entropy": 0.38355987668037417, "epoch": 0.5324074074074074, "grad_norm": 1.6234145164489746, "learning_rate": 4.973849102688869e-06, "loss": 0.3375, "mean_token_accuracy": 0.879750919342041, "num_tokens": 27317005.0, "step": 3335 }, { "entropy": 0.3729994535446167, "epoch": 0.533205619412516, "grad_norm": 1.6391899585723877, "learning_rate": 4.973770790393511e-06, "loss": 0.335, "mean_token_accuracy": 0.8811629414558411, "num_tokens": 27357965.0, "step": 3340 }, { "entropy": 0.42625300884246825, "epoch": 0.5340038314176245, "grad_norm": 1.614808440208435, "learning_rate": 4.973692361841096e-06, "loss": 0.3815, "mean_token_accuracy": 0.8686782002449036, "num_tokens": 27398925.0, "step": 3345 }, { "entropy": 0.4034900009632111, "epoch": 0.5348020434227331, "grad_norm": 1.578659176826477, "learning_rate": 4.973613817036555e-06, "loss": 0.3678, "mean_token_accuracy": 0.8695542097091675, "num_tokens": 27439885.0, "step": 3350 }, { "entropy": 0.344242250919342, "epoch": 0.5356002554278416, "grad_norm": 1.570183277130127, "learning_rate": 4.973535155984829e-06, "loss": 0.3006, "mean_token_accuracy": 0.8920693874359131, "num_tokens": 27480845.0, "step": 3355 }, { "entropy": 0.3859936952590942, "epoch": 0.5363984674329502, "grad_norm": 1.7761328220367432, "learning_rate": 4.973456378690864e-06, "loss": 0.3549, "mean_token_accuracy": 0.8740076899528504, "num_tokens": 27521805.0, "step": 3360 }, { "entropy": 0.4139319658279419, "epoch": 0.5371966794380587, "grad_norm": 1.5927379131317139, "learning_rate": 4.973377485159612e-06, "loss": 0.379, "mean_token_accuracy": 0.8684604287147522, "num_tokens": 27562765.0, "step": 3365 }, { "entropy": 0.37344526648521426, "epoch": 0.5379948914431673, "grad_norm": 1.5802881717681885, "learning_rate": 4.973298475396037e-06, "loss": 0.3241, "mean_token_accuracy": 0.884388267993927, "num_tokens": 27603725.0, "step": 3370 }, { "entropy": 0.4616575360298157, "epoch": 0.5387931034482759, "grad_norm": 1.5784549713134766, "learning_rate": 4.973219349405104e-06, "loss": 0.4118, "mean_token_accuracy": 0.8549342632293702, "num_tokens": 27644685.0, "step": 3375 }, { "entropy": 0.40048813819885254, "epoch": 0.5395913154533845, "grad_norm": 1.5036301612854004, "learning_rate": 4.973140107191792e-06, "loss": 0.3617, "mean_token_accuracy": 0.8724414825439453, "num_tokens": 27685645.0, "step": 3380 }, { "entropy": 0.38270660638809206, "epoch": 0.540389527458493, "grad_norm": 1.6607155799865723, "learning_rate": 4.973060748761081e-06, "loss": 0.3521, "mean_token_accuracy": 0.8778811931610108, "num_tokens": 27726605.0, "step": 3385 }, { "entropy": 0.35836416482925415, "epoch": 0.5411877394636015, "grad_norm": 1.6206783056259155, "learning_rate": 4.972981274117965e-06, "loss": 0.3169, "mean_token_accuracy": 0.8881120204925537, "num_tokens": 27767565.0, "step": 3390 }, { "entropy": 0.38160343170166017, "epoch": 0.5419859514687101, "grad_norm": 1.6077865362167358, "learning_rate": 4.9729016832674385e-06, "loss": 0.3552, "mean_token_accuracy": 0.8760847568511962, "num_tokens": 27808525.0, "step": 3395 }, { "entropy": 0.40834884643554686, "epoch": 0.5427841634738186, "grad_norm": 2.050952911376953, "learning_rate": 4.972821976214507e-06, "loss": 0.3615, "mean_token_accuracy": 0.87137770652771, "num_tokens": 27849485.0, "step": 3400 }, { "epoch": 0.5427841634738186, "eval_entropy": 0.40098182344436645, "eval_loss": 0.35640591382980347, "eval_mean_token_accuracy": 0.8753156752586365, "eval_num_tokens": 27849485.0, "eval_runtime": 69.2629, "eval_samples_per_second": 14.438, "eval_steps_per_second": 1.805, "step": 3400 }, { "entropy": 0.3933412194252014, "epoch": 0.5435823754789272, "grad_norm": 1.3825479745864868, "learning_rate": 4.972742152964184e-06, "loss": 0.3508, "mean_token_accuracy": 0.8757568240165711, "num_tokens": 27890445.0, "step": 3405 }, { "entropy": 0.3877404689788818, "epoch": 0.5443805874840357, "grad_norm": 1.7192943096160889, "learning_rate": 4.9726622135214876e-06, "loss": 0.3492, "mean_token_accuracy": 0.8776248097419739, "num_tokens": 27931405.0, "step": 3410 }, { "entropy": 0.4037211060523987, "epoch": 0.5451787994891443, "grad_norm": 1.6639846563339233, "learning_rate": 4.9725821578914454e-06, "loss": 0.3684, "mean_token_accuracy": 0.8690607786178589, "num_tokens": 27972365.0, "step": 3415 }, { "entropy": 0.39761547446250917, "epoch": 0.5459770114942529, "grad_norm": 1.7780673503875732, "learning_rate": 4.972501986079093e-06, "loss": 0.3658, "mean_token_accuracy": 0.8701196074485779, "num_tokens": 28013325.0, "step": 3420 }, { "entropy": 0.374272894859314, "epoch": 0.5467752234993615, "grad_norm": 1.5579581260681152, "learning_rate": 4.972421698089469e-06, "loss": 0.3423, "mean_token_accuracy": 0.8811684727668763, "num_tokens": 28054285.0, "step": 3425 }, { "entropy": 0.37306315898895265, "epoch": 0.54757343550447, "grad_norm": 1.443179965019226, "learning_rate": 4.9723412939276235e-06, "loss": 0.3332, "mean_token_accuracy": 0.8828484654426575, "num_tokens": 28095245.0, "step": 3430 }, { "entropy": 0.3839456558227539, "epoch": 0.5483716475095786, "grad_norm": 1.6152342557907104, "learning_rate": 4.972260773598614e-06, "loss": 0.3452, "mean_token_accuracy": 0.8802782416343689, "num_tokens": 28136205.0, "step": 3435 }, { "entropy": 0.34850752353668213, "epoch": 0.5491698595146871, "grad_norm": 1.3699902296066284, "learning_rate": 4.972180137107502e-06, "loss": 0.3023, "mean_token_accuracy": 0.8939025521278381, "num_tokens": 28177165.0, "step": 3440 }, { "entropy": 0.42212756276130675, "epoch": 0.5499680715197957, "grad_norm": 1.4698554277420044, "learning_rate": 4.9720993844593575e-06, "loss": 0.3796, "mean_token_accuracy": 0.8676527976989746, "num_tokens": 28218125.0, "step": 3445 }, { "entropy": 0.3826135993003845, "epoch": 0.5507662835249042, "grad_norm": 1.751415729522705, "learning_rate": 4.972018515659261e-06, "loss": 0.3542, "mean_token_accuracy": 0.8748811244964599, "num_tokens": 28259085.0, "step": 3450 }, { "entropy": 0.3984654664993286, "epoch": 0.5515644955300127, "grad_norm": 1.5529251098632812, "learning_rate": 4.971937530712297e-06, "loss": 0.3665, "mean_token_accuracy": 0.8713862895965576, "num_tokens": 28300045.0, "step": 3455 }, { "entropy": 0.43848037123680117, "epoch": 0.5523627075351213, "grad_norm": 1.7217085361480713, "learning_rate": 4.971856429623557e-06, "loss": 0.3947, "mean_token_accuracy": 0.8617102026939392, "num_tokens": 28341005.0, "step": 3460 }, { "entropy": 0.4245146453380585, "epoch": 0.5531609195402298, "grad_norm": 1.6098260879516602, "learning_rate": 4.971775212398143e-06, "loss": 0.3813, "mean_token_accuracy": 0.8645444631576538, "num_tokens": 28381965.0, "step": 3465 }, { "entropy": 0.3518525302410126, "epoch": 0.5539591315453385, "grad_norm": 1.5915374755859375, "learning_rate": 4.971693879041159e-06, "loss": 0.3178, "mean_token_accuracy": 0.8894816517829895, "num_tokens": 28422925.0, "step": 3470 }, { "entropy": 0.3965813875198364, "epoch": 0.554757343550447, "grad_norm": 1.6396276950836182, "learning_rate": 4.971612429557722e-06, "loss": 0.3604, "mean_token_accuracy": 0.8739223599433898, "num_tokens": 28463885.0, "step": 3475 }, { "entropy": 0.40137404203414917, "epoch": 0.5555555555555556, "grad_norm": 1.5679564476013184, "learning_rate": 4.971530863952952e-06, "loss": 0.3572, "mean_token_accuracy": 0.8753439903259277, "num_tokens": 28504845.0, "step": 3480 }, { "entropy": 0.37675178050994873, "epoch": 0.5563537675606641, "grad_norm": 1.4812265634536743, "learning_rate": 4.971449182231981e-06, "loss": 0.33, "mean_token_accuracy": 0.8853613138198853, "num_tokens": 28545805.0, "step": 3485 }, { "entropy": 0.4472045123577118, "epoch": 0.5571519795657727, "grad_norm": 1.7498054504394531, "learning_rate": 4.971367384399944e-06, "loss": 0.4195, "mean_token_accuracy": 0.8568356394767761, "num_tokens": 28586765.0, "step": 3490 }, { "entropy": 0.3870035171508789, "epoch": 0.5579501915708812, "grad_norm": 1.7072486877441406, "learning_rate": 4.971285470461984e-06, "loss": 0.3442, "mean_token_accuracy": 0.8799459099769592, "num_tokens": 28627725.0, "step": 3495 }, { "entropy": 0.3909365952014923, "epoch": 0.5587484035759898, "grad_norm": 1.6284451484680176, "learning_rate": 4.971203440423252e-06, "loss": 0.3539, "mean_token_accuracy": 0.8747354984283447, "num_tokens": 28668685.0, "step": 3500 }, { "entropy": 0.37294856905937196, "epoch": 0.5595466155810983, "grad_norm": 1.5810928344726562, "learning_rate": 4.971121294288907e-06, "loss": 0.3369, "mean_token_accuracy": 0.8826012730598449, "num_tokens": 28709645.0, "step": 3505 }, { "entropy": 0.381207799911499, "epoch": 0.5603448275862069, "grad_norm": 1.5926982164382935, "learning_rate": 4.971039032064114e-06, "loss": 0.3489, "mean_token_accuracy": 0.8776193737983704, "num_tokens": 28750605.0, "step": 3510 }, { "entropy": 0.36891435384750365, "epoch": 0.5611430395913155, "grad_norm": 1.635933756828308, "learning_rate": 4.970956653754047e-06, "loss": 0.3275, "mean_token_accuracy": 0.8841464638710022, "num_tokens": 28791565.0, "step": 3515 }, { "entropy": 0.40799201130867, "epoch": 0.561941251596424, "grad_norm": 1.3482568264007568, "learning_rate": 4.970874159363886e-06, "loss": 0.3592, "mean_token_accuracy": 0.8733816742897034, "num_tokens": 28832525.0, "step": 3520 }, { "entropy": 0.3592789232730865, "epoch": 0.5627394636015326, "grad_norm": 1.4991364479064941, "learning_rate": 4.970791548898818e-06, "loss": 0.3285, "mean_token_accuracy": 0.8870156288146973, "num_tokens": 28873485.0, "step": 3525 }, { "entropy": 0.37787640690803526, "epoch": 0.5635376756066411, "grad_norm": 1.7729026079177856, "learning_rate": 4.9707088223640375e-06, "loss": 0.3391, "mean_token_accuracy": 0.8828484416007996, "num_tokens": 28914445.0, "step": 3530 }, { "entropy": 0.3990558385848999, "epoch": 0.5643358876117497, "grad_norm": 1.6233949661254883, "learning_rate": 4.970625979764747e-06, "loss": 0.3564, "mean_token_accuracy": 0.873016095161438, "num_tokens": 28955405.0, "step": 3535 }, { "entropy": 0.391368693113327, "epoch": 0.5651340996168582, "grad_norm": 1.5034476518630981, "learning_rate": 4.970543021106156e-06, "loss": 0.3504, "mean_token_accuracy": 0.8756026506423951, "num_tokens": 28996365.0, "step": 3540 }, { "entropy": 0.38302173614501955, "epoch": 0.5659323116219668, "grad_norm": 1.5422619581222534, "learning_rate": 4.970459946393482e-06, "loss": 0.3447, "mean_token_accuracy": 0.879080057144165, "num_tokens": 29037325.0, "step": 3545 }, { "entropy": 0.37263641953468324, "epoch": 0.5667305236270753, "grad_norm": 1.5199742317199707, "learning_rate": 4.970376755631948e-06, "loss": 0.3288, "mean_token_accuracy": 0.8836228370666503, "num_tokens": 29078285.0, "step": 3550 }, { "entropy": 0.32374128699302673, "epoch": 0.5675287356321839, "grad_norm": 1.549601435661316, "learning_rate": 4.970293448826786e-06, "loss": 0.2848, "mean_token_accuracy": 0.8980387449264526, "num_tokens": 29119245.0, "step": 3555 }, { "entropy": 0.36209123730659487, "epoch": 0.5683269476372924, "grad_norm": 1.5335177183151245, "learning_rate": 4.970210025983234e-06, "loss": 0.3257, "mean_token_accuracy": 0.8839428782463074, "num_tokens": 29160205.0, "step": 3560 }, { "entropy": 0.41070149540901185, "epoch": 0.5691251596424011, "grad_norm": 1.475563645362854, "learning_rate": 4.970126487106537e-06, "loss": 0.3738, "mean_token_accuracy": 0.8688075542449951, "num_tokens": 29201165.0, "step": 3565 }, { "entropy": 0.37354364395141604, "epoch": 0.5699233716475096, "grad_norm": 1.6093169450759888, "learning_rate": 4.970042832201951e-06, "loss": 0.3278, "mean_token_accuracy": 0.8846863627433776, "num_tokens": 29242125.0, "step": 3570 }, { "entropy": 0.39829763770103455, "epoch": 0.5707215836526182, "grad_norm": 1.666699767112732, "learning_rate": 4.969959061274734e-06, "loss": 0.3698, "mean_token_accuracy": 0.8715177774429321, "num_tokens": 29283085.0, "step": 3575 }, { "entropy": 0.3605415463447571, "epoch": 0.5715197956577267, "grad_norm": 1.4592363834381104, "learning_rate": 4.969875174330155e-06, "loss": 0.3189, "mean_token_accuracy": 0.8872054219245911, "num_tokens": 29324045.0, "step": 3580 }, { "entropy": 0.38993356823921205, "epoch": 0.5723180076628352, "grad_norm": 1.775950312614441, "learning_rate": 4.969791171373488e-06, "loss": 0.3411, "mean_token_accuracy": 0.8806413054466248, "num_tokens": 29365005.0, "step": 3585 }, { "entropy": 0.3733646094799042, "epoch": 0.5731162196679438, "grad_norm": 1.5953619480133057, "learning_rate": 4.969707052410016e-06, "loss": 0.3304, "mean_token_accuracy": 0.884143841266632, "num_tokens": 29405965.0, "step": 3590 }, { "entropy": 0.39912459850311277, "epoch": 0.5739144316730523, "grad_norm": 1.7145378589630127, "learning_rate": 4.96962281744503e-06, "loss": 0.3734, "mean_token_accuracy": 0.8690350651741028, "num_tokens": 29446925.0, "step": 3595 }, { "entropy": 0.367926698923111, "epoch": 0.5747126436781609, "grad_norm": 1.452913522720337, "learning_rate": 4.969538466483826e-06, "loss": 0.3309, "mean_token_accuracy": 0.8845783591270446, "num_tokens": 29487885.0, "step": 3600 }, { "epoch": 0.5747126436781609, "eval_entropy": 0.39141595339775087, "eval_loss": 0.35446324944496155, "eval_mean_token_accuracy": 0.8758203558921814, "eval_num_tokens": 29487885.0, "eval_runtime": 69.2621, "eval_samples_per_second": 14.438, "eval_steps_per_second": 1.805, "step": 3600 }, { "entropy": 0.3865828216075897, "epoch": 0.5755108556832694, "grad_norm": 1.5825527906417847, "learning_rate": 4.969453999531707e-06, "loss": 0.3452, "mean_token_accuracy": 0.877928352355957, "num_tokens": 29528845.0, "step": 3605 }, { "entropy": 0.35882670879364015, "epoch": 0.5763090676883781, "grad_norm": 1.3376578092575073, "learning_rate": 4.969369416593987e-06, "loss": 0.3239, "mean_token_accuracy": 0.8848472118377686, "num_tokens": 29569805.0, "step": 3610 }, { "entropy": 0.373634397983551, "epoch": 0.5771072796934866, "grad_norm": 1.5386207103729248, "learning_rate": 4.969284717675983e-06, "loss": 0.33, "mean_token_accuracy": 0.8834493279457092, "num_tokens": 29610765.0, "step": 3615 }, { "entropy": 0.35085774064064024, "epoch": 0.5779054916985952, "grad_norm": 1.5583409070968628, "learning_rate": 4.9691999027830215e-06, "loss": 0.3109, "mean_token_accuracy": 0.8899118542671204, "num_tokens": 29651712.0, "step": 3620 }, { "entropy": 0.4144057631492615, "epoch": 0.5787037037037037, "grad_norm": 1.6685035228729248, "learning_rate": 4.969114971920436e-06, "loss": 0.3768, "mean_token_accuracy": 0.8672501444816589, "num_tokens": 29692672.0, "step": 3625 }, { "entropy": 0.39991188049316406, "epoch": 0.5795019157088123, "grad_norm": 1.4954372644424438, "learning_rate": 4.969029925093568e-06, "loss": 0.3599, "mean_token_accuracy": 0.8725775599479675, "num_tokens": 29733632.0, "step": 3630 }, { "entropy": 0.40609376430511473, "epoch": 0.5803001277139208, "grad_norm": 1.5095274448394775, "learning_rate": 4.968944762307764e-06, "loss": 0.3669, "mean_token_accuracy": 0.871772575378418, "num_tokens": 29774592.0, "step": 3635 }, { "entropy": 0.3526857793331146, "epoch": 0.5810983397190294, "grad_norm": 1.5209416151046753, "learning_rate": 4.968859483568382e-06, "loss": 0.3077, "mean_token_accuracy": 0.8915246844291687, "num_tokens": 29815552.0, "step": 3640 }, { "entropy": 0.393011087179184, "epoch": 0.5818965517241379, "grad_norm": 1.6194604635238647, "learning_rate": 4.96877408888078e-06, "loss": 0.3621, "mean_token_accuracy": 0.8710348963737488, "num_tokens": 29856512.0, "step": 3645 }, { "entropy": 0.4141296446323395, "epoch": 0.5826947637292464, "grad_norm": 1.655099868774414, "learning_rate": 4.968688578250333e-06, "loss": 0.3888, "mean_token_accuracy": 0.8657617568969727, "num_tokens": 29897472.0, "step": 3650 }, { "entropy": 0.35265921950340273, "epoch": 0.583492975734355, "grad_norm": 1.6760668754577637, "learning_rate": 4.968602951682415e-06, "loss": 0.3123, "mean_token_accuracy": 0.8901681065559387, "num_tokens": 29938432.0, "step": 3655 }, { "entropy": 0.3939144194126129, "epoch": 0.5842911877394636, "grad_norm": 1.6867501735687256, "learning_rate": 4.968517209182412e-06, "loss": 0.3486, "mean_token_accuracy": 0.8784684062004089, "num_tokens": 29979392.0, "step": 3660 }, { "entropy": 0.4020268201828003, "epoch": 0.5850893997445722, "grad_norm": 1.7138423919677734, "learning_rate": 4.968431350755716e-06, "loss": 0.3567, "mean_token_accuracy": 0.8738431334495544, "num_tokens": 30020352.0, "step": 3665 }, { "entropy": 0.38859901428222654, "epoch": 0.5858876117496807, "grad_norm": 1.564834475517273, "learning_rate": 4.968345376407724e-06, "loss": 0.3511, "mean_token_accuracy": 0.8791564702987671, "num_tokens": 30061312.0, "step": 3670 }, { "entropy": 0.3994689702987671, "epoch": 0.5866858237547893, "grad_norm": 1.8667218685150146, "learning_rate": 4.968259286143844e-06, "loss": 0.3641, "mean_token_accuracy": 0.8723411798477173, "num_tokens": 30102272.0, "step": 3675 }, { "entropy": 0.40553962588310244, "epoch": 0.5874840357598978, "grad_norm": 1.5423719882965088, "learning_rate": 4.96817307996949e-06, "loss": 0.369, "mean_token_accuracy": 0.8713503837585449, "num_tokens": 30143232.0, "step": 3680 }, { "entropy": 0.39541796445846555, "epoch": 0.5882822477650064, "grad_norm": 1.7450207471847534, "learning_rate": 4.968086757890082e-06, "loss": 0.3523, "mean_token_accuracy": 0.8758617758750915, "num_tokens": 30184192.0, "step": 3685 }, { "entropy": 0.4130732357501984, "epoch": 0.5890804597701149, "grad_norm": 1.8434921503067017, "learning_rate": 4.968000319911049e-06, "loss": 0.3868, "mean_token_accuracy": 0.8648181915283203, "num_tokens": 30225152.0, "step": 3690 }, { "entropy": 0.3264343738555908, "epoch": 0.5898786717752235, "grad_norm": 1.4113878011703491, "learning_rate": 4.967913766037825e-06, "loss": 0.2857, "mean_token_accuracy": 0.9002414584159851, "num_tokens": 30266112.0, "step": 3695 }, { "entropy": 0.3778840720653534, "epoch": 0.590676883780332, "grad_norm": 1.3939766883850098, "learning_rate": 4.967827096275854e-06, "loss": 0.3292, "mean_token_accuracy": 0.8849563717842102, "num_tokens": 30307072.0, "step": 3700 }, { "entropy": 0.3920683264732361, "epoch": 0.5914750957854407, "grad_norm": 1.552365779876709, "learning_rate": 4.967740310630587e-06, "loss": 0.3583, "mean_token_accuracy": 0.8735494256019593, "num_tokens": 30348032.0, "step": 3705 }, { "entropy": 0.3946221828460693, "epoch": 0.5922733077905492, "grad_norm": 1.7382375001907349, "learning_rate": 4.96765340910748e-06, "loss": 0.3537, "mean_token_accuracy": 0.8761067390441895, "num_tokens": 30388992.0, "step": 3710 }, { "entropy": 0.390900981426239, "epoch": 0.5930715197956578, "grad_norm": 1.4038467407226562, "learning_rate": 4.9675663917119975e-06, "loss": 0.3513, "mean_token_accuracy": 0.8776496767997741, "num_tokens": 30429952.0, "step": 3715 }, { "entropy": 0.4173464775085449, "epoch": 0.5938697318007663, "grad_norm": 1.4674110412597656, "learning_rate": 4.967479258449612e-06, "loss": 0.3763, "mean_token_accuracy": 0.8679859876632691, "num_tokens": 30470912.0, "step": 3720 }, { "entropy": 0.397634357213974, "epoch": 0.5946679438058748, "grad_norm": 1.523329734802246, "learning_rate": 4.967392009325803e-06, "loss": 0.3572, "mean_token_accuracy": 0.8761757016181946, "num_tokens": 30511872.0, "step": 3725 }, { "entropy": 0.3922095954418182, "epoch": 0.5954661558109834, "grad_norm": 1.778900384902954, "learning_rate": 4.967304644346056e-06, "loss": 0.3566, "mean_token_accuracy": 0.8769558191299438, "num_tokens": 30552832.0, "step": 3730 }, { "entropy": 0.39952392578125, "epoch": 0.5962643678160919, "grad_norm": 1.4663424491882324, "learning_rate": 4.967217163515866e-06, "loss": 0.3646, "mean_token_accuracy": 0.8723309278488159, "num_tokens": 30593792.0, "step": 3735 }, { "entropy": 0.4193924903869629, "epoch": 0.5970625798212005, "grad_norm": 1.5840364694595337, "learning_rate": 4.9671295668407346e-06, "loss": 0.3842, "mean_token_accuracy": 0.8661716938018799, "num_tokens": 30634752.0, "step": 3740 }, { "entropy": 0.3981863796710968, "epoch": 0.597860791826309, "grad_norm": 1.7133623361587524, "learning_rate": 4.9670418543261674e-06, "loss": 0.3693, "mean_token_accuracy": 0.869674825668335, "num_tokens": 30675712.0, "step": 3745 }, { "entropy": 0.4032014787197113, "epoch": 0.5986590038314177, "grad_norm": 1.6495577096939087, "learning_rate": 4.966954025977683e-06, "loss": 0.3577, "mean_token_accuracy": 0.8763142943382263, "num_tokens": 30716672.0, "step": 3750 }, { "entropy": 0.39403932094573973, "epoch": 0.5994572158365262, "grad_norm": 1.4911088943481445, "learning_rate": 4.966866081800803e-06, "loss": 0.35, "mean_token_accuracy": 0.8772829055786133, "num_tokens": 30757632.0, "step": 3755 }, { "entropy": 0.395513778924942, "epoch": 0.6002554278416348, "grad_norm": 1.5269211530685425, "learning_rate": 4.966778021801058e-06, "loss": 0.3532, "mean_token_accuracy": 0.8761088252067566, "num_tokens": 30798592.0, "step": 3760 }, { "entropy": 0.3795560717582703, "epoch": 0.6010536398467433, "grad_norm": 1.6205265522003174, "learning_rate": 4.966689845983985e-06, "loss": 0.3402, "mean_token_accuracy": 0.8801708817481995, "num_tokens": 30839552.0, "step": 3765 }, { "entropy": 0.3631418526172638, "epoch": 0.6018518518518519, "grad_norm": 1.3927897214889526, "learning_rate": 4.966601554355129e-06, "loss": 0.3271, "mean_token_accuracy": 0.884639322757721, "num_tokens": 30880512.0, "step": 3770 }, { "entropy": 0.4569432020187378, "epoch": 0.6026500638569604, "grad_norm": 1.4657988548278809, "learning_rate": 4.966513146920044e-06, "loss": 0.4178, "mean_token_accuracy": 0.8528200626373291, "num_tokens": 30921472.0, "step": 3775 }, { "entropy": 0.3831614673137665, "epoch": 0.603448275862069, "grad_norm": 1.5960118770599365, "learning_rate": 4.966424623684285e-06, "loss": 0.3432, "mean_token_accuracy": 0.8764102578163147, "num_tokens": 30962432.0, "step": 3780 }, { "entropy": 0.393023943901062, "epoch": 0.6042464878671775, "grad_norm": 1.5771610736846924, "learning_rate": 4.966335984653423e-06, "loss": 0.3546, "mean_token_accuracy": 0.8748709559440613, "num_tokens": 31003392.0, "step": 3785 }, { "entropy": 0.3829107344150543, "epoch": 0.605044699872286, "grad_norm": 1.5862095355987549, "learning_rate": 4.966247229833029e-06, "loss": 0.3475, "mean_token_accuracy": 0.8772776603698731, "num_tokens": 31044352.0, "step": 3790 }, { "entropy": 0.40039422512054446, "epoch": 0.6058429118773946, "grad_norm": 1.9182721376419067, "learning_rate": 4.9661583592286864e-06, "loss": 0.3657, "mean_token_accuracy": 0.8720365285873413, "num_tokens": 31085312.0, "step": 3795 }, { "entropy": 0.39912710785865785, "epoch": 0.6066411238825032, "grad_norm": 1.7170418500900269, "learning_rate": 4.966069372845982e-06, "loss": 0.3569, "mean_token_accuracy": 0.8749265432357788, "num_tokens": 31126272.0, "step": 3800 }, { "epoch": 0.6066411238825032, "eval_entropy": 0.3909790225028992, "eval_loss": 0.35294920206069946, "eval_mean_token_accuracy": 0.8761810812950134, "eval_num_tokens": 31126272.0, "eval_runtime": 69.2385, "eval_samples_per_second": 14.443, "eval_steps_per_second": 1.805, "step": 3800 }, { "entropy": 0.420094895362854, "epoch": 0.6074393358876118, "grad_norm": 1.5807826519012451, "learning_rate": 4.9659802706905125e-06, "loss": 0.3874, "mean_token_accuracy": 0.8649389863014221, "num_tokens": 31167232.0, "step": 3805 }, { "entropy": 0.37068371176719667, "epoch": 0.6082375478927203, "grad_norm": 1.5241514444351196, "learning_rate": 4.965891052767881e-06, "loss": 0.3306, "mean_token_accuracy": 0.8824189662933349, "num_tokens": 31208192.0, "step": 3810 }, { "entropy": 0.43431633710861206, "epoch": 0.6090357598978289, "grad_norm": 1.515228509902954, "learning_rate": 4.965801719083697e-06, "loss": 0.392, "mean_token_accuracy": 0.8619320034980774, "num_tokens": 31249152.0, "step": 3815 }, { "entropy": 0.4021922886371613, "epoch": 0.6098339719029374, "grad_norm": 1.5357593297958374, "learning_rate": 4.965712269643578e-06, "loss": 0.3572, "mean_token_accuracy": 0.873773992061615, "num_tokens": 31290112.0, "step": 3820 }, { "entropy": 0.34870944619178773, "epoch": 0.610632183908046, "grad_norm": 1.3976689577102661, "learning_rate": 4.9656227044531505e-06, "loss": 0.3082, "mean_token_accuracy": 0.8884419322013855, "num_tokens": 31331072.0, "step": 3825 }, { "entropy": 0.3720965325832367, "epoch": 0.6114303959131545, "grad_norm": 1.5985755920410156, "learning_rate": 4.965533023518046e-06, "loss": 0.3375, "mean_token_accuracy": 0.8805681109428406, "num_tokens": 31372032.0, "step": 3830 }, { "entropy": 0.35963661074638364, "epoch": 0.6122286079182631, "grad_norm": 1.347583532333374, "learning_rate": 4.965443226843903e-06, "loss": 0.3201, "mean_token_accuracy": 0.8867804765701294, "num_tokens": 31412992.0, "step": 3835 }, { "entropy": 0.38517152070999144, "epoch": 0.6130268199233716, "grad_norm": 1.4462475776672363, "learning_rate": 4.965353314436368e-06, "loss": 0.3383, "mean_token_accuracy": 0.8791717529296875, "num_tokens": 31453952.0, "step": 3840 }, { "entropy": 0.32794575691223143, "epoch": 0.6138250319284803, "grad_norm": 1.6038838624954224, "learning_rate": 4.965263286301097e-06, "loss": 0.2911, "mean_token_accuracy": 0.8971529364585876, "num_tokens": 31494912.0, "step": 3845 }, { "entropy": 0.38298609256744387, "epoch": 0.6146232439335888, "grad_norm": 1.4308631420135498, "learning_rate": 4.96517314244375e-06, "loss": 0.345, "mean_token_accuracy": 0.8804044246673584, "num_tokens": 31535872.0, "step": 3850 }, { "entropy": 0.4029498934745789, "epoch": 0.6154214559386973, "grad_norm": 1.6430120468139648, "learning_rate": 4.965082882869996e-06, "loss": 0.3703, "mean_token_accuracy": 0.8700898885726929, "num_tokens": 31576832.0, "step": 3855 }, { "entropy": 0.3919563353061676, "epoch": 0.6162196679438059, "grad_norm": 1.7798478603363037, "learning_rate": 4.96499250758551e-06, "loss": 0.3523, "mean_token_accuracy": 0.8769108176231384, "num_tokens": 31617792.0, "step": 3860 }, { "entropy": 0.3522378861904144, "epoch": 0.6170178799489144, "grad_norm": 1.5173027515411377, "learning_rate": 4.964902016595976e-06, "loss": 0.3017, "mean_token_accuracy": 0.8938339829444886, "num_tokens": 31658752.0, "step": 3865 }, { "entropy": 0.36898383498191833, "epoch": 0.617816091954023, "grad_norm": 1.6988641023635864, "learning_rate": 4.964811409907084e-06, "loss": 0.3354, "mean_token_accuracy": 0.8814649105072021, "num_tokens": 31699712.0, "step": 3870 }, { "entropy": 0.40858992338180544, "epoch": 0.6186143039591315, "grad_norm": 1.7161165475845337, "learning_rate": 4.9647206875245305e-06, "loss": 0.3685, "mean_token_accuracy": 0.8708085536956787, "num_tokens": 31740672.0, "step": 3875 }, { "entropy": 0.409778368473053, "epoch": 0.6194125159642401, "grad_norm": 1.6546376943588257, "learning_rate": 4.964629849454022e-06, "loss": 0.3706, "mean_token_accuracy": 0.870085597038269, "num_tokens": 31781632.0, "step": 3880 }, { "entropy": 0.4049523532390594, "epoch": 0.6202107279693486, "grad_norm": 1.3990082740783691, "learning_rate": 4.964538895701272e-06, "loss": 0.3557, "mean_token_accuracy": 0.8743181347846984, "num_tokens": 31822592.0, "step": 3885 }, { "entropy": 0.4319458603858948, "epoch": 0.6210089399744572, "grad_norm": 1.7687848806381226, "learning_rate": 4.964447826271997e-06, "loss": 0.3934, "mean_token_accuracy": 0.8653620839118957, "num_tokens": 31863552.0, "step": 3890 }, { "entropy": 0.41185548305511477, "epoch": 0.6218071519795658, "grad_norm": 1.4894161224365234, "learning_rate": 4.964356641171925e-06, "loss": 0.371, "mean_token_accuracy": 0.8705492496490479, "num_tokens": 31904512.0, "step": 3895 }, { "entropy": 0.3748886942863464, "epoch": 0.6226053639846744, "grad_norm": 1.609660029411316, "learning_rate": 4.964265340406789e-06, "loss": 0.3365, "mean_token_accuracy": 0.8815550446510315, "num_tokens": 31945472.0, "step": 3900 }, { "entropy": 0.40155192017555236, "epoch": 0.6234035759897829, "grad_norm": 1.4394019842147827, "learning_rate": 4.964173923982334e-06, "loss": 0.3651, "mean_token_accuracy": 0.8742201566696167, "num_tokens": 31986432.0, "step": 3905 }, { "entropy": 0.4126347959041595, "epoch": 0.6242017879948915, "grad_norm": 1.5783936977386475, "learning_rate": 4.964082391904305e-06, "loss": 0.3635, "mean_token_accuracy": 0.8702286601066589, "num_tokens": 32027392.0, "step": 3910 }, { "entropy": 0.3948510766029358, "epoch": 0.625, "grad_norm": 1.6075055599212646, "learning_rate": 4.963990744178458e-06, "loss": 0.3527, "mean_token_accuracy": 0.874882161617279, "num_tokens": 32068352.0, "step": 3915 }, { "entropy": 0.40940110087394715, "epoch": 0.6257982120051085, "grad_norm": 1.6805075407028198, "learning_rate": 4.963898980810557e-06, "loss": 0.3865, "mean_token_accuracy": 0.864269745349884, "num_tokens": 32109312.0, "step": 3920 }, { "entropy": 0.3619593560695648, "epoch": 0.6265964240102171, "grad_norm": 1.6504465341567993, "learning_rate": 4.963807101806373e-06, "loss": 0.3252, "mean_token_accuracy": 0.8852601170539856, "num_tokens": 32150272.0, "step": 3925 }, { "entropy": 0.38621042370796205, "epoch": 0.6273946360153256, "grad_norm": 1.7133197784423828, "learning_rate": 4.963715107171683e-06, "loss": 0.3428, "mean_token_accuracy": 0.879948091506958, "num_tokens": 32191232.0, "step": 3930 }, { "entropy": 0.38483706712722776, "epoch": 0.6281928480204342, "grad_norm": 1.431302785873413, "learning_rate": 4.963622996912272e-06, "loss": 0.3374, "mean_token_accuracy": 0.8818876624107361, "num_tokens": 32232192.0, "step": 3935 }, { "entropy": 0.37900099754333494, "epoch": 0.6289910600255428, "grad_norm": 1.4894486665725708, "learning_rate": 4.963530771033931e-06, "loss": 0.3394, "mean_token_accuracy": 0.8806081175804138, "num_tokens": 32273152.0, "step": 3940 }, { "entropy": 0.3738178789615631, "epoch": 0.6297892720306514, "grad_norm": 1.5760552883148193, "learning_rate": 4.963438429542461e-06, "loss": 0.3277, "mean_token_accuracy": 0.883822786808014, "num_tokens": 32314112.0, "step": 3945 }, { "entropy": 0.3659499764442444, "epoch": 0.6305874840357599, "grad_norm": 2.1062514781951904, "learning_rate": 4.96334597244367e-06, "loss": 0.324, "mean_token_accuracy": 0.8866788744926453, "num_tokens": 32355072.0, "step": 3950 }, { "entropy": 0.37738183736801145, "epoch": 0.6313856960408685, "grad_norm": 1.7195035219192505, "learning_rate": 4.963253399743368e-06, "loss": 0.3345, "mean_token_accuracy": 0.8807390928268433, "num_tokens": 32396032.0, "step": 3955 }, { "entropy": 0.3469261348247528, "epoch": 0.632183908045977, "grad_norm": 1.5165008306503296, "learning_rate": 4.9631607114473804e-06, "loss": 0.312, "mean_token_accuracy": 0.8905919075012207, "num_tokens": 32436992.0, "step": 3960 }, { "entropy": 0.36719757318496704, "epoch": 0.6329821200510856, "grad_norm": 1.4283252954483032, "learning_rate": 4.963067907561534e-06, "loss": 0.3344, "mean_token_accuracy": 0.881699550151825, "num_tokens": 32477952.0, "step": 3965 }, { "entropy": 0.3427632570266724, "epoch": 0.6337803320561941, "grad_norm": 1.526005506515503, "learning_rate": 4.962974988091664e-06, "loss": 0.3026, "mean_token_accuracy": 0.8933268189430237, "num_tokens": 32518912.0, "step": 3970 }, { "entropy": 0.36962045431137086, "epoch": 0.6345785440613027, "grad_norm": 1.4629669189453125, "learning_rate": 4.962881953043614e-06, "loss": 0.3241, "mean_token_accuracy": 0.8862110733985901, "num_tokens": 32559850.0, "step": 3975 }, { "entropy": 0.3889750599861145, "epoch": 0.6353767560664112, "grad_norm": 1.4501713514328003, "learning_rate": 4.962788802423236e-06, "loss": 0.3402, "mean_token_accuracy": 0.8809821367263794, "num_tokens": 32600746.0, "step": 3980 }, { "entropy": 0.375089293718338, "epoch": 0.6361749680715197, "grad_norm": 1.5705161094665527, "learning_rate": 4.962695536236385e-06, "loss": 0.3342, "mean_token_accuracy": 0.880041527748108, "num_tokens": 32641706.0, "step": 3985 }, { "entropy": 0.31232019066810607, "epoch": 0.6369731800766284, "grad_norm": 1.4807438850402832, "learning_rate": 4.962602154488927e-06, "loss": 0.2771, "mean_token_accuracy": 0.9007895469665528, "num_tokens": 32682666.0, "step": 3990 }, { "entropy": 0.358897465467453, "epoch": 0.6377713920817369, "grad_norm": 1.5279514789581299, "learning_rate": 4.962508657186734e-06, "loss": 0.3251, "mean_token_accuracy": 0.8849632740020752, "num_tokens": 32723626.0, "step": 3995 }, { "entropy": 0.3594439685344696, "epoch": 0.6385696040868455, "grad_norm": 1.6460912227630615, "learning_rate": 4.962415044335687e-06, "loss": 0.3132, "mean_token_accuracy": 0.8889756202697754, "num_tokens": 32764586.0, "step": 4000 }, { "epoch": 0.6385696040868455, "eval_entropy": 0.3917965919971466, "eval_loss": 0.3518121838569641, "eval_mean_token_accuracy": 0.8764794664382934, "eval_num_tokens": 32764586.0, "eval_runtime": 69.2274, "eval_samples_per_second": 14.445, "eval_steps_per_second": 1.806, "step": 4000 }, { "entropy": 0.38373176455497743, "epoch": 0.639367816091954, "grad_norm": 1.583543300628662, "learning_rate": 4.96232131594167e-06, "loss": 0.3448, "mean_token_accuracy": 0.8794745564460754, "num_tokens": 32805546.0, "step": 4005 }, { "entropy": 0.3702171742916107, "epoch": 0.6401660280970626, "grad_norm": 1.4130009412765503, "learning_rate": 4.962227472010579e-06, "loss": 0.3229, "mean_token_accuracy": 0.8848128080368042, "num_tokens": 32846506.0, "step": 4010 }, { "entropy": 0.4183795750141144, "epoch": 0.6409642401021711, "grad_norm": 1.8898104429244995, "learning_rate": 4.962133512548314e-06, "loss": 0.3754, "mean_token_accuracy": 0.8680359721183777, "num_tokens": 32887466.0, "step": 4015 }, { "entropy": 0.34782321453094484, "epoch": 0.6417624521072797, "grad_norm": 1.4358257055282593, "learning_rate": 4.962039437560785e-06, "loss": 0.312, "mean_token_accuracy": 0.8888446807861328, "num_tokens": 32928426.0, "step": 4020 }, { "entropy": 0.38004317283630373, "epoch": 0.6425606641123882, "grad_norm": 1.6532775163650513, "learning_rate": 4.961945247053906e-06, "loss": 0.3478, "mean_token_accuracy": 0.8797279477119446, "num_tokens": 32969386.0, "step": 4025 }, { "entropy": 0.3863183081150055, "epoch": 0.6433588761174968, "grad_norm": 1.4063210487365723, "learning_rate": 4.9618509410336015e-06, "loss": 0.3403, "mean_token_accuracy": 0.8798073172569275, "num_tokens": 33010346.0, "step": 4030 }, { "entropy": 0.37526068091392517, "epoch": 0.6441570881226054, "grad_norm": 1.416603922843933, "learning_rate": 4.961756519505801e-06, "loss": 0.3378, "mean_token_accuracy": 0.8806898117065429, "num_tokens": 33051306.0, "step": 4035 }, { "entropy": 0.3895104885101318, "epoch": 0.644955300127714, "grad_norm": 1.463862419128418, "learning_rate": 4.9616619824764414e-06, "loss": 0.3459, "mean_token_accuracy": 0.8775312304496765, "num_tokens": 33092266.0, "step": 4040 }, { "entropy": 0.35795719623565675, "epoch": 0.6457535121328225, "grad_norm": 1.5325723886489868, "learning_rate": 4.961567329951469e-06, "loss": 0.3193, "mean_token_accuracy": 0.8871722459793091, "num_tokens": 33133226.0, "step": 4045 }, { "entropy": 0.3536506354808807, "epoch": 0.646551724137931, "grad_norm": 1.5098241567611694, "learning_rate": 4.961472561936834e-06, "loss": 0.3071, "mean_token_accuracy": 0.8913549780845642, "num_tokens": 33174186.0, "step": 4050 }, { "entropy": 0.3890280067920685, "epoch": 0.6473499361430396, "grad_norm": 1.5493937730789185, "learning_rate": 4.961377678438498e-06, "loss": 0.3521, "mean_token_accuracy": 0.8763817191123963, "num_tokens": 33215146.0, "step": 4055 }, { "entropy": 0.36107209920883176, "epoch": 0.6481481481481481, "grad_norm": 1.5306475162506104, "learning_rate": 4.961282679462427e-06, "loss": 0.3292, "mean_token_accuracy": 0.8859923839569092, "num_tokens": 33256106.0, "step": 4060 }, { "entropy": 0.3484861731529236, "epoch": 0.6489463601532567, "grad_norm": 1.5191750526428223, "learning_rate": 4.961187565014593e-06, "loss": 0.3095, "mean_token_accuracy": 0.8896730542182922, "num_tokens": 33297066.0, "step": 4065 }, { "entropy": 0.39441166520118714, "epoch": 0.6497445721583652, "grad_norm": 1.5315167903900146, "learning_rate": 4.961092335100979e-06, "loss": 0.3613, "mean_token_accuracy": 0.873190951347351, "num_tokens": 33338026.0, "step": 4070 }, { "entropy": 0.35233972668647767, "epoch": 0.6505427841634738, "grad_norm": 1.319646954536438, "learning_rate": 4.960996989727574e-06, "loss": 0.3026, "mean_token_accuracy": 0.8912659168243409, "num_tokens": 33378986.0, "step": 4075 }, { "entropy": 0.4316498875617981, "epoch": 0.6513409961685823, "grad_norm": 1.7222440242767334, "learning_rate": 4.960901528900371e-06, "loss": 0.3866, "mean_token_accuracy": 0.866232967376709, "num_tokens": 33419946.0, "step": 4080 }, { "entropy": 0.40433109998703004, "epoch": 0.652139208173691, "grad_norm": 1.6289095878601074, "learning_rate": 4.960805952625374e-06, "loss": 0.3664, "mean_token_accuracy": 0.8714369535446167, "num_tokens": 33460906.0, "step": 4085 }, { "entropy": 0.34908521771430967, "epoch": 0.6529374201787995, "grad_norm": 1.2474722862243652, "learning_rate": 4.960710260908595e-06, "loss": 0.3096, "mean_token_accuracy": 0.8889854311943054, "num_tokens": 33501866.0, "step": 4090 }, { "entropy": 0.3515977382659912, "epoch": 0.6537356321839081, "grad_norm": 1.4092243909835815, "learning_rate": 4.96061445375605e-06, "loss": 0.3187, "mean_token_accuracy": 0.8876409888267517, "num_tokens": 33542826.0, "step": 4095 }, { "entropy": 0.39068559408187864, "epoch": 0.6545338441890166, "grad_norm": 1.7305725812911987, "learning_rate": 4.960518531173763e-06, "loss": 0.3491, "mean_token_accuracy": 0.8775494694709778, "num_tokens": 33583786.0, "step": 4100 }, { "entropy": 0.4475154936313629, "epoch": 0.6553320561941252, "grad_norm": 1.5801259279251099, "learning_rate": 4.960422493167767e-06, "loss": 0.4084, "mean_token_accuracy": 0.8554537773132325, "num_tokens": 33624746.0, "step": 4105 }, { "entropy": 0.3612799346446991, "epoch": 0.6561302681992337, "grad_norm": 1.3840538263320923, "learning_rate": 4.9603263397441e-06, "loss": 0.3254, "mean_token_accuracy": 0.885944414138794, "num_tokens": 33665706.0, "step": 4110 }, { "entropy": 0.37169676423072817, "epoch": 0.6569284802043422, "grad_norm": 1.5911798477172852, "learning_rate": 4.96023007090881e-06, "loss": 0.3204, "mean_token_accuracy": 0.8856913924217225, "num_tokens": 33706666.0, "step": 4115 }, { "entropy": 0.3725708842277527, "epoch": 0.6577266922094508, "grad_norm": 1.5351768732070923, "learning_rate": 4.96013368666795e-06, "loss": 0.3295, "mean_token_accuracy": 0.8846683621406555, "num_tokens": 33747626.0, "step": 4120 }, { "entropy": 0.35295900106430056, "epoch": 0.6585249042145593, "grad_norm": 1.6065768003463745, "learning_rate": 4.960037187027581e-06, "loss": 0.3107, "mean_token_accuracy": 0.8901387929916382, "num_tokens": 33788586.0, "step": 4125 }, { "entropy": 0.40923756957054136, "epoch": 0.659323116219668, "grad_norm": 1.6720138788223267, "learning_rate": 4.959940571993771e-06, "loss": 0.3673, "mean_token_accuracy": 0.8710006952285767, "num_tokens": 33829546.0, "step": 4130 }, { "entropy": 0.39302995800971985, "epoch": 0.6601213282247765, "grad_norm": 1.5660480260849, "learning_rate": 4.959843841572596e-06, "loss": 0.3407, "mean_token_accuracy": 0.8789271354675293, "num_tokens": 33870506.0, "step": 4135 }, { "entropy": 0.39711377024650574, "epoch": 0.6609195402298851, "grad_norm": 1.6311522722244263, "learning_rate": 4.959746995770137e-06, "loss": 0.3625, "mean_token_accuracy": 0.8728868365287781, "num_tokens": 33911466.0, "step": 4140 }, { "entropy": 0.38717801570892335, "epoch": 0.6617177522349936, "grad_norm": 1.5107231140136719, "learning_rate": 4.959650034592487e-06, "loss": 0.349, "mean_token_accuracy": 0.8788737893104553, "num_tokens": 33952426.0, "step": 4145 }, { "entropy": 0.39666436314582826, "epoch": 0.6625159642401022, "grad_norm": 1.5432357788085938, "learning_rate": 4.959552958045742e-06, "loss": 0.3581, "mean_token_accuracy": 0.8723564863204956, "num_tokens": 33993386.0, "step": 4150 }, { "entropy": 0.3947413682937622, "epoch": 0.6633141762452107, "grad_norm": 1.6945823431015015, "learning_rate": 4.959455766136005e-06, "loss": 0.3597, "mean_token_accuracy": 0.8749089002609253, "num_tokens": 34034346.0, "step": 4155 }, { "entropy": 0.37710251808166506, "epoch": 0.6641123882503193, "grad_norm": 1.5467009544372559, "learning_rate": 4.95935845886939e-06, "loss": 0.3257, "mean_token_accuracy": 0.885505223274231, "num_tokens": 34075306.0, "step": 4160 }, { "entropy": 0.3872054398059845, "epoch": 0.6649106002554278, "grad_norm": 1.6126255989074707, "learning_rate": 4.959261036252014e-06, "loss": 0.3426, "mean_token_accuracy": 0.880678677558899, "num_tokens": 34116266.0, "step": 4165 }, { "entropy": 0.38808879256248474, "epoch": 0.6657088122605364, "grad_norm": 1.415348768234253, "learning_rate": 4.959163498290004e-06, "loss": 0.3453, "mean_token_accuracy": 0.8778594136238098, "num_tokens": 34157226.0, "step": 4170 }, { "entropy": 0.41391260027885435, "epoch": 0.666507024265645, "grad_norm": 1.713935375213623, "learning_rate": 4.9590658449894944e-06, "loss": 0.3702, "mean_token_accuracy": 0.8717985391616822, "num_tokens": 34198186.0, "step": 4175 }, { "entropy": 0.3717846214771271, "epoch": 0.6673052362707536, "grad_norm": 1.655297875404358, "learning_rate": 4.958968076356625e-06, "loss": 0.3303, "mean_token_accuracy": 0.8843464732170105, "num_tokens": 34239146.0, "step": 4180 }, { "entropy": 0.4275781691074371, "epoch": 0.6681034482758621, "grad_norm": 1.537449598312378, "learning_rate": 4.958870192397544e-06, "loss": 0.3843, "mean_token_accuracy": 0.86600261926651, "num_tokens": 34280106.0, "step": 4185 }, { "entropy": 0.369396436214447, "epoch": 0.6689016602809706, "grad_norm": 1.7176984548568726, "learning_rate": 4.958772193118408e-06, "loss": 0.3319, "mean_token_accuracy": 0.8829890727996826, "num_tokens": 34321066.0, "step": 4190 }, { "entropy": 0.37659146189689635, "epoch": 0.6696998722860792, "grad_norm": 1.522311806678772, "learning_rate": 4.958674078525378e-06, "loss": 0.3318, "mean_token_accuracy": 0.88204345703125, "num_tokens": 34362026.0, "step": 4195 }, { "entropy": 0.40196934938430784, "epoch": 0.6704980842911877, "grad_norm": 1.57537043094635, "learning_rate": 4.958575848624624e-06, "loss": 0.3664, "mean_token_accuracy": 0.8716495633125305, "num_tokens": 34402986.0, "step": 4200 }, { "epoch": 0.6704980842911877, "eval_entropy": 0.39062203240394594, "eval_loss": 0.34974205493927, "eval_mean_token_accuracy": 0.877336347579956, "eval_num_tokens": 34402986.0, "eval_runtime": 69.2668, "eval_samples_per_second": 14.437, "eval_steps_per_second": 1.805, "step": 4200 }, { "entropy": 0.41640692949295044, "epoch": 0.6712962962962963, "grad_norm": 1.7407722473144531, "learning_rate": 4.9584775034223224e-06, "loss": 0.3753, "mean_token_accuracy": 0.8696022033691406, "num_tokens": 34443946.0, "step": 4205 }, { "entropy": 0.4036651015281677, "epoch": 0.6720945083014048, "grad_norm": 1.6700903177261353, "learning_rate": 4.958379042924658e-06, "loss": 0.3506, "mean_token_accuracy": 0.8763973951339722, "num_tokens": 34484906.0, "step": 4210 }, { "entropy": 0.4063118636608124, "epoch": 0.6728927203065134, "grad_norm": 1.66812002658844, "learning_rate": 4.958280467137824e-06, "loss": 0.3646, "mean_token_accuracy": 0.8714771866798401, "num_tokens": 34525866.0, "step": 4215 }, { "entropy": 0.3816158056259155, "epoch": 0.6736909323116219, "grad_norm": 1.613732099533081, "learning_rate": 4.958181776068017e-06, "loss": 0.3478, "mean_token_accuracy": 0.8783071279525757, "num_tokens": 34566826.0, "step": 4220 }, { "entropy": 0.4148737370967865, "epoch": 0.6744891443167306, "grad_norm": 1.5773872137069702, "learning_rate": 4.958082969721444e-06, "loss": 0.3703, "mean_token_accuracy": 0.8704238414764405, "num_tokens": 34607786.0, "step": 4225 }, { "entropy": 0.405037522315979, "epoch": 0.6752873563218391, "grad_norm": 1.681789755821228, "learning_rate": 4.957984048104318e-06, "loss": 0.3659, "mean_token_accuracy": 0.8717342495918274, "num_tokens": 34648746.0, "step": 4230 }, { "entropy": 0.39438945055007935, "epoch": 0.6760855683269477, "grad_norm": 1.8257066011428833, "learning_rate": 4.957885011222859e-06, "loss": 0.3479, "mean_token_accuracy": 0.879536759853363, "num_tokens": 34689706.0, "step": 4235 }, { "entropy": 0.3599630892276764, "epoch": 0.6768837803320562, "grad_norm": 1.4572027921676636, "learning_rate": 4.957785859083297e-06, "loss": 0.3189, "mean_token_accuracy": 0.8867843866348266, "num_tokens": 34730666.0, "step": 4240 }, { "entropy": 0.37403408288955686, "epoch": 0.6776819923371648, "grad_norm": 1.614817500114441, "learning_rate": 4.957686591691864e-06, "loss": 0.324, "mean_token_accuracy": 0.8855644464492798, "num_tokens": 34771626.0, "step": 4245 }, { "entropy": 0.41073269248008726, "epoch": 0.6784802043422733, "grad_norm": 1.6293586492538452, "learning_rate": 4.957587209054804e-06, "loss": 0.3642, "mean_token_accuracy": 0.8730408787727356, "num_tokens": 34812586.0, "step": 4250 }, { "entropy": 0.4003384351730347, "epoch": 0.6792784163473818, "grad_norm": 1.5987569093704224, "learning_rate": 4.957487711178366e-06, "loss": 0.3577, "mean_token_accuracy": 0.8723024964332581, "num_tokens": 34853546.0, "step": 4255 }, { "entropy": 0.39334952235221865, "epoch": 0.6800766283524904, "grad_norm": 1.5379856824874878, "learning_rate": 4.957388098068808e-06, "loss": 0.3472, "mean_token_accuracy": 0.8757344603538513, "num_tokens": 34894312.0, "step": 4260 }, { "entropy": 0.38406250476837156, "epoch": 0.6808748403575989, "grad_norm": 1.5372836589813232, "learning_rate": 4.9572883697323926e-06, "loss": 0.3492, "mean_token_accuracy": 0.8771077871322632, "num_tokens": 34935272.0, "step": 4265 }, { "entropy": 0.38156905174255373, "epoch": 0.6816730523627076, "grad_norm": 1.669527292251587, "learning_rate": 4.957188526175391e-06, "loss": 0.3479, "mean_token_accuracy": 0.8789000630378723, "num_tokens": 34976232.0, "step": 4270 }, { "entropy": 0.36010006070137024, "epoch": 0.6824712643678161, "grad_norm": 1.5515540838241577, "learning_rate": 4.957088567404082e-06, "loss": 0.3156, "mean_token_accuracy": 0.8882894873619079, "num_tokens": 35017192.0, "step": 4275 }, { "entropy": 0.3551903784275055, "epoch": 0.6832694763729247, "grad_norm": 1.6245423555374146, "learning_rate": 4.956988493424753e-06, "loss": 0.3146, "mean_token_accuracy": 0.8872243881225585, "num_tokens": 35058152.0, "step": 4280 }, { "entropy": 0.35940446257591246, "epoch": 0.6840676883780332, "grad_norm": 1.5272520780563354, "learning_rate": 4.956888304243695e-06, "loss": 0.3176, "mean_token_accuracy": 0.8874533534049988, "num_tokens": 35099112.0, "step": 4285 }, { "entropy": 0.3756145477294922, "epoch": 0.6848659003831418, "grad_norm": 1.5808496475219727, "learning_rate": 4.9567879998672075e-06, "loss": 0.3351, "mean_token_accuracy": 0.8830928564071655, "num_tokens": 35140072.0, "step": 4290 }, { "entropy": 0.37835639119148257, "epoch": 0.6856641123882503, "grad_norm": 1.5594112873077393, "learning_rate": 4.9566875803016e-06, "loss": 0.3339, "mean_token_accuracy": 0.8826068878173828, "num_tokens": 35181032.0, "step": 4295 }, { "entropy": 0.39985790848731995, "epoch": 0.6864623243933589, "grad_norm": 1.4914906024932861, "learning_rate": 4.956587045553186e-06, "loss": 0.3645, "mean_token_accuracy": 0.8740491151809693, "num_tokens": 35221992.0, "step": 4300 }, { "entropy": 0.4111567497253418, "epoch": 0.6872605363984674, "grad_norm": 1.6156212091445923, "learning_rate": 4.956486395628289e-06, "loss": 0.3672, "mean_token_accuracy": 0.8734665513038635, "num_tokens": 35262952.0, "step": 4305 }, { "entropy": 0.4254221498966217, "epoch": 0.688058748403576, "grad_norm": 1.5075019598007202, "learning_rate": 4.956385630533236e-06, "loss": 0.374, "mean_token_accuracy": 0.8709608435630798, "num_tokens": 35303912.0, "step": 4310 }, { "entropy": 0.3952524304389954, "epoch": 0.6888569604086845, "grad_norm": 1.6715117692947388, "learning_rate": 4.956284750274366e-06, "loss": 0.3537, "mean_token_accuracy": 0.8758527517318726, "num_tokens": 35344872.0, "step": 4315 }, { "entropy": 0.3389691412448883, "epoch": 0.6896551724137931, "grad_norm": 1.405334711074829, "learning_rate": 4.95618375485802e-06, "loss": 0.3019, "mean_token_accuracy": 0.8922526597976684, "num_tokens": 35385832.0, "step": 4320 }, { "entropy": 0.3625007688999176, "epoch": 0.6904533844189017, "grad_norm": 1.6644231081008911, "learning_rate": 4.956082644290551e-06, "loss": 0.3311, "mean_token_accuracy": 0.8843298196792603, "num_tokens": 35426792.0, "step": 4325 }, { "entropy": 0.3890696942806244, "epoch": 0.6912515964240102, "grad_norm": 1.4922293424606323, "learning_rate": 4.955981418578316e-06, "loss": 0.3521, "mean_token_accuracy": 0.8751874804496765, "num_tokens": 35467752.0, "step": 4330 }, { "entropy": 0.3943796694278717, "epoch": 0.6920498084291188, "grad_norm": 1.3469480276107788, "learning_rate": 4.955880077727681e-06, "loss": 0.3485, "mean_token_accuracy": 0.8783192157745361, "num_tokens": 35508712.0, "step": 4335 }, { "entropy": 0.3815250813961029, "epoch": 0.6928480204342273, "grad_norm": 1.6829841136932373, "learning_rate": 4.955778621745019e-06, "loss": 0.3441, "mean_token_accuracy": 0.878368365764618, "num_tokens": 35549672.0, "step": 4340 }, { "entropy": 0.3641778290271759, "epoch": 0.6936462324393359, "grad_norm": 1.3170735836029053, "learning_rate": 4.955677050636709e-06, "loss": 0.3221, "mean_token_accuracy": 0.8862357020378113, "num_tokens": 35590632.0, "step": 4345 }, { "entropy": 0.3830491900444031, "epoch": 0.6944444444444444, "grad_norm": 1.4673370122909546, "learning_rate": 4.955575364409138e-06, "loss": 0.337, "mean_token_accuracy": 0.8813378095626831, "num_tokens": 35631592.0, "step": 4350 }, { "entropy": 0.3623965919017792, "epoch": 0.695242656449553, "grad_norm": 1.958211898803711, "learning_rate": 4.955473563068702e-06, "loss": 0.3208, "mean_token_accuracy": 0.8858277082443238, "num_tokens": 35672552.0, "step": 4355 }, { "entropy": 0.38546149134635926, "epoch": 0.6960408684546615, "grad_norm": 1.5618401765823364, "learning_rate": 4.955371646621801e-06, "loss": 0.3425, "mean_token_accuracy": 0.8819298505783081, "num_tokens": 35713512.0, "step": 4360 }, { "entropy": 0.37305532693862914, "epoch": 0.6968390804597702, "grad_norm": 1.4023091793060303, "learning_rate": 4.955269615074843e-06, "loss": 0.3292, "mean_token_accuracy": 0.8842706322669983, "num_tokens": 35754472.0, "step": 4365 }, { "entropy": 0.38944450616836546, "epoch": 0.6976372924648787, "grad_norm": 1.6398652791976929, "learning_rate": 4.955167468434247e-06, "loss": 0.352, "mean_token_accuracy": 0.8770965456962585, "num_tokens": 35795432.0, "step": 4370 }, { "entropy": 0.41156930327415464, "epoch": 0.6984355044699873, "grad_norm": 1.5604636669158936, "learning_rate": 4.955065206706435e-06, "loss": 0.368, "mean_token_accuracy": 0.8737661957740783, "num_tokens": 35836392.0, "step": 4375 }, { "entropy": 0.3764748632907867, "epoch": 0.6992337164750958, "grad_norm": 1.3062576055526733, "learning_rate": 4.954962829897838e-06, "loss": 0.3282, "mean_token_accuracy": 0.8827279210090637, "num_tokens": 35877352.0, "step": 4380 }, { "entropy": 0.3629325807094574, "epoch": 0.7000319284802043, "grad_norm": 1.4810214042663574, "learning_rate": 4.954860338014892e-06, "loss": 0.3217, "mean_token_accuracy": 0.8866893887519837, "num_tokens": 35918312.0, "step": 4385 }, { "entropy": 0.3667100667953491, "epoch": 0.7008301404853129, "grad_norm": 1.7354620695114136, "learning_rate": 4.954757731064044e-06, "loss": 0.3301, "mean_token_accuracy": 0.8835196852684021, "num_tokens": 35959272.0, "step": 4390 }, { "entropy": 0.3856470465660095, "epoch": 0.7016283524904214, "grad_norm": 1.5140351057052612, "learning_rate": 4.954655009051745e-06, "loss": 0.3508, "mean_token_accuracy": 0.8756252408027649, "num_tokens": 36000232.0, "step": 4395 }, { "entropy": 0.4114512622356415, "epoch": 0.70242656449553, "grad_norm": 1.543448805809021, "learning_rate": 4.954552171984455e-06, "loss": 0.3737, "mean_token_accuracy": 0.867775559425354, "num_tokens": 36041192.0, "step": 4400 }, { "epoch": 0.70242656449553, "eval_entropy": 0.39369331550598147, "eval_loss": 0.34902307391166687, "eval_mean_token_accuracy": 0.8771976776123047, "eval_num_tokens": 36041192.0, "eval_runtime": 69.12, "eval_samples_per_second": 14.468, "eval_steps_per_second": 1.808, "step": 4400 }, { "entropy": 0.39357762932777407, "epoch": 0.7032247765006385, "grad_norm": 1.480932354927063, "learning_rate": 4.95444921986864e-06, "loss": 0.3478, "mean_token_accuracy": 0.8758068084716797, "num_tokens": 36082152.0, "step": 4405 }, { "entropy": 0.36230148673057555, "epoch": 0.7040229885057471, "grad_norm": 1.4368892908096313, "learning_rate": 4.9543461527107765e-06, "loss": 0.3228, "mean_token_accuracy": 0.8867041110992432, "num_tokens": 36123112.0, "step": 4410 }, { "entropy": 0.4771185517311096, "epoch": 0.7048212005108557, "grad_norm": 1.6853086948394775, "learning_rate": 4.954242970517343e-06, "loss": 0.4349, "mean_token_accuracy": 0.8508485913276672, "num_tokens": 36164072.0, "step": 4415 }, { "entropy": 0.37011151313781737, "epoch": 0.7056194125159643, "grad_norm": 1.3787293434143066, "learning_rate": 4.954139673294828e-06, "loss": 0.3265, "mean_token_accuracy": 0.8846035480499268, "num_tokens": 36205032.0, "step": 4420 }, { "entropy": 0.38793652057647704, "epoch": 0.7064176245210728, "grad_norm": 1.5404571294784546, "learning_rate": 4.95403626104973e-06, "loss": 0.345, "mean_token_accuracy": 0.8788623690605164, "num_tokens": 36245992.0, "step": 4425 }, { "entropy": 0.3592794477939606, "epoch": 0.7072158365261814, "grad_norm": 1.6141693592071533, "learning_rate": 4.95393273378855e-06, "loss": 0.316, "mean_token_accuracy": 0.888990044593811, "num_tokens": 36286952.0, "step": 4430 }, { "entropy": 0.36488319635391236, "epoch": 0.7080140485312899, "grad_norm": 1.6408461332321167, "learning_rate": 4.953829091517797e-06, "loss": 0.3268, "mean_token_accuracy": 0.8837617397308349, "num_tokens": 36327912.0, "step": 4435 }, { "entropy": 0.3899082064628601, "epoch": 0.7088122605363985, "grad_norm": 1.3915435075759888, "learning_rate": 4.95372533424399e-06, "loss": 0.3515, "mean_token_accuracy": 0.8771253943443298, "num_tokens": 36368872.0, "step": 4440 }, { "entropy": 0.3807647466659546, "epoch": 0.709610472541507, "grad_norm": 1.6155505180358887, "learning_rate": 4.953621461973653e-06, "loss": 0.3365, "mean_token_accuracy": 0.8812530994415283, "num_tokens": 36409832.0, "step": 4445 }, { "entropy": 0.41307615041732787, "epoch": 0.7104086845466155, "grad_norm": 1.70521080493927, "learning_rate": 4.953517474713318e-06, "loss": 0.3748, "mean_token_accuracy": 0.8698232889175415, "num_tokens": 36450792.0, "step": 4450 }, { "entropy": 0.3725646257400513, "epoch": 0.7112068965517241, "grad_norm": 1.4554929733276367, "learning_rate": 4.9534133724695244e-06, "loss": 0.3222, "mean_token_accuracy": 0.8859057426452637, "num_tokens": 36491752.0, "step": 4455 }, { "entropy": 0.37655861377716066, "epoch": 0.7120051085568327, "grad_norm": 1.519578456878662, "learning_rate": 4.953309155248818e-06, "loss": 0.3355, "mean_token_accuracy": 0.880521821975708, "num_tokens": 36532712.0, "step": 4460 }, { "entropy": 0.3744112551212311, "epoch": 0.7128033205619413, "grad_norm": 1.5445661544799805, "learning_rate": 4.953204823057752e-06, "loss": 0.3383, "mean_token_accuracy": 0.8822221159934998, "num_tokens": 36573672.0, "step": 4465 }, { "entropy": 0.42416757345199585, "epoch": 0.7136015325670498, "grad_norm": 1.565629005432129, "learning_rate": 4.953100375902889e-06, "loss": 0.3807, "mean_token_accuracy": 0.8664975762367249, "num_tokens": 36614632.0, "step": 4470 }, { "entropy": 0.4182154297828674, "epoch": 0.7143997445721584, "grad_norm": 1.5452251434326172, "learning_rate": 4.952995813790795e-06, "loss": 0.3786, "mean_token_accuracy": 0.8647769689559937, "num_tokens": 36655592.0, "step": 4475 }, { "entropy": 0.3633765935897827, "epoch": 0.7151979565772669, "grad_norm": 1.2990319728851318, "learning_rate": 4.9528911367280465e-06, "loss": 0.3256, "mean_token_accuracy": 0.8850704431533813, "num_tokens": 36696552.0, "step": 4480 }, { "entropy": 0.384858101606369, "epoch": 0.7159961685823755, "grad_norm": 1.4177272319793701, "learning_rate": 4.952786344721225e-06, "loss": 0.3481, "mean_token_accuracy": 0.8788316130638123, "num_tokens": 36737512.0, "step": 4485 }, { "entropy": 0.4165548741817474, "epoch": 0.716794380587484, "grad_norm": 1.6013543605804443, "learning_rate": 4.95268143777692e-06, "loss": 0.3821, "mean_token_accuracy": 0.8671042084693908, "num_tokens": 36778472.0, "step": 4490 }, { "entropy": 0.34506208300590513, "epoch": 0.7175925925925926, "grad_norm": 1.5734069347381592, "learning_rate": 4.95257641590173e-06, "loss": 0.3009, "mean_token_accuracy": 0.8941628456115722, "num_tokens": 36819432.0, "step": 4495 }, { "entropy": 0.3837789297103882, "epoch": 0.7183908045977011, "grad_norm": 1.5752099752426147, "learning_rate": 4.9524712791022565e-06, "loss": 0.3371, "mean_token_accuracy": 0.8817675113677979, "num_tokens": 36860392.0, "step": 4500 }, { "entropy": 0.3931439518928528, "epoch": 0.7191890166028098, "grad_norm": 1.6178399324417114, "learning_rate": 4.952366027385114e-06, "loss": 0.3475, "mean_token_accuracy": 0.8789139747619629, "num_tokens": 36901352.0, "step": 4505 }, { "entropy": 0.39552693963050845, "epoch": 0.7199872286079183, "grad_norm": 1.5380029678344727, "learning_rate": 4.952260660756919e-06, "loss": 0.357, "mean_token_accuracy": 0.8769156217575074, "num_tokens": 36942312.0, "step": 4510 }, { "entropy": 0.4134032607078552, "epoch": 0.7207854406130269, "grad_norm": 1.5812859535217285, "learning_rate": 4.952155179224298e-06, "loss": 0.3711, "mean_token_accuracy": 0.8702471852302551, "num_tokens": 36983272.0, "step": 4515 }, { "entropy": 0.388623321056366, "epoch": 0.7215836526181354, "grad_norm": 1.444392204284668, "learning_rate": 4.952049582793884e-06, "loss": 0.3403, "mean_token_accuracy": 0.8805589318275452, "num_tokens": 37024232.0, "step": 4520 }, { "entropy": 0.357010555267334, "epoch": 0.7223818646232439, "grad_norm": 1.4659650325775146, "learning_rate": 4.951943871472317e-06, "loss": 0.3127, "mean_token_accuracy": 0.8911778092384338, "num_tokens": 37065192.0, "step": 4525 }, { "entropy": 0.379272598028183, "epoch": 0.7231800766283525, "grad_norm": 1.5867300033569336, "learning_rate": 4.951838045266244e-06, "loss": 0.3438, "mean_token_accuracy": 0.8777178883552551, "num_tokens": 37106152.0, "step": 4530 }, { "entropy": 0.37308881282806394, "epoch": 0.723978288633461, "grad_norm": 1.279022455215454, "learning_rate": 4.951732104182321e-06, "loss": 0.3286, "mean_token_accuracy": 0.8849388480186462, "num_tokens": 37147112.0, "step": 4535 }, { "entropy": 0.42798325419425964, "epoch": 0.7247765006385696, "grad_norm": 1.603519082069397, "learning_rate": 4.95162604822721e-06, "loss": 0.3839, "mean_token_accuracy": 0.8669981360435486, "num_tokens": 37188072.0, "step": 4540 }, { "entropy": 0.3764370262622833, "epoch": 0.7255747126436781, "grad_norm": 1.557897686958313, "learning_rate": 4.951519877407579e-06, "loss": 0.3324, "mean_token_accuracy": 0.88344247341156, "num_tokens": 37229032.0, "step": 4545 }, { "entropy": 0.3716597259044647, "epoch": 0.7263729246487867, "grad_norm": 1.5863113403320312, "learning_rate": 4.951413591730104e-06, "loss": 0.3291, "mean_token_accuracy": 0.8820095300674439, "num_tokens": 37269992.0, "step": 4550 }, { "entropy": 0.41372754573822024, "epoch": 0.7271711366538953, "grad_norm": 1.6460305452346802, "learning_rate": 4.95130719120147e-06, "loss": 0.3692, "mean_token_accuracy": 0.871246612071991, "num_tokens": 37310952.0, "step": 4555 }, { "entropy": 0.4125048518180847, "epoch": 0.7279693486590039, "grad_norm": 1.271487832069397, "learning_rate": 4.951200675828368e-06, "loss": 0.3754, "mean_token_accuracy": 0.8682136774063111, "num_tokens": 37351912.0, "step": 4560 }, { "entropy": 0.3915758430957794, "epoch": 0.7287675606641124, "grad_norm": 1.6986738443374634, "learning_rate": 4.951094045617495e-06, "loss": 0.3474, "mean_token_accuracy": 0.8767178058624268, "num_tokens": 37392872.0, "step": 4565 }, { "entropy": 0.3821511447429657, "epoch": 0.729565772669221, "grad_norm": 1.677868366241455, "learning_rate": 4.950987300575557e-06, "loss": 0.3393, "mean_token_accuracy": 0.8818759441375732, "num_tokens": 37433832.0, "step": 4570 }, { "entropy": 0.39576552510261537, "epoch": 0.7303639846743295, "grad_norm": 1.4657864570617676, "learning_rate": 4.950880440709266e-06, "loss": 0.3507, "mean_token_accuracy": 0.8784390687942505, "num_tokens": 37474792.0, "step": 4575 }, { "entropy": 0.34899569153785703, "epoch": 0.731162196679438, "grad_norm": 1.3611983060836792, "learning_rate": 4.950773466025342e-06, "loss": 0.308, "mean_token_accuracy": 0.8884916543960572, "num_tokens": 37515752.0, "step": 4580 }, { "entropy": 0.3783299565315247, "epoch": 0.7319604086845466, "grad_norm": 1.5644665956497192, "learning_rate": 4.950666376530511e-06, "loss": 0.3307, "mean_token_accuracy": 0.884823739528656, "num_tokens": 37556712.0, "step": 4585 }, { "entropy": 0.3779717743396759, "epoch": 0.7327586206896551, "grad_norm": 1.4334276914596558, "learning_rate": 4.950559172231508e-06, "loss": 0.3427, "mean_token_accuracy": 0.878784453868866, "num_tokens": 37597672.0, "step": 4590 }, { "entropy": 0.4076384246349335, "epoch": 0.7335568326947637, "grad_norm": 1.5926692485809326, "learning_rate": 4.950451853135075e-06, "loss": 0.3632, "mean_token_accuracy": 0.8714548945426941, "num_tokens": 37638632.0, "step": 4595 }, { "entropy": 0.4062245607376099, "epoch": 0.7343550446998723, "grad_norm": 1.5608155727386475, "learning_rate": 4.95034441924796e-06, "loss": 0.3646, "mean_token_accuracy": 0.8716461777687072, "num_tokens": 37679592.0, "step": 4600 }, { "epoch": 0.7343550446998723, "eval_entropy": 0.3939617915153503, "eval_loss": 0.34785565733909607, "eval_mean_token_accuracy": 0.8777779283523559, "eval_num_tokens": 37679592.0, "eval_runtime": 69.3074, "eval_samples_per_second": 14.428, "eval_steps_per_second": 1.804, "step": 4600 }, { "entropy": 0.3811860024929047, "epoch": 0.7351532567049809, "grad_norm": 1.516083002090454, "learning_rate": 4.950236870576917e-06, "loss": 0.3343, "mean_token_accuracy": 0.882855236530304, "num_tokens": 37720552.0, "step": 4605 }, { "entropy": 0.4058854401111603, "epoch": 0.7359514687100894, "grad_norm": 1.468159794807434, "learning_rate": 4.9501292071287134e-06, "loss": 0.3676, "mean_token_accuracy": 0.8699978351593017, "num_tokens": 37761512.0, "step": 4610 }, { "entropy": 0.35052935481071473, "epoch": 0.736749680715198, "grad_norm": 1.3547862768173218, "learning_rate": 4.950021428910114e-06, "loss": 0.3138, "mean_token_accuracy": 0.8876534581184388, "num_tokens": 37802472.0, "step": 4615 }, { "entropy": 0.39081133604049684, "epoch": 0.7375478927203065, "grad_norm": 1.598197102546692, "learning_rate": 4.949913535927901e-06, "loss": 0.3558, "mean_token_accuracy": 0.8756549954414368, "num_tokens": 37843432.0, "step": 4620 }, { "entropy": 0.3502376556396484, "epoch": 0.7383461047254151, "grad_norm": 1.2976590394973755, "learning_rate": 4.949805528188857e-06, "loss": 0.3084, "mean_token_accuracy": 0.8896740078926086, "num_tokens": 37884392.0, "step": 4625 }, { "entropy": 0.3819392502307892, "epoch": 0.7391443167305236, "grad_norm": 1.6175434589385986, "learning_rate": 4.949697405699774e-06, "loss": 0.3391, "mean_token_accuracy": 0.8793386459350586, "num_tokens": 37925352.0, "step": 4630 }, { "entropy": 0.3609308242797852, "epoch": 0.7399425287356322, "grad_norm": 1.6114457845687866, "learning_rate": 4.949589168467451e-06, "loss": 0.3201, "mean_token_accuracy": 0.8883061528205871, "num_tokens": 37966312.0, "step": 4635 }, { "entropy": 0.40381277799606324, "epoch": 0.7407407407407407, "grad_norm": 1.5138126611709595, "learning_rate": 4.949480816498694e-06, "loss": 0.3584, "mean_token_accuracy": 0.875266969203949, "num_tokens": 38007272.0, "step": 4640 }, { "entropy": 0.38398600816726686, "epoch": 0.7415389527458492, "grad_norm": 1.5208909511566162, "learning_rate": 4.949372349800317e-06, "loss": 0.3435, "mean_token_accuracy": 0.8778509616851806, "num_tokens": 38048232.0, "step": 4645 }, { "entropy": 0.3965888023376465, "epoch": 0.7423371647509579, "grad_norm": 1.5762202739715576, "learning_rate": 4.949263768379141e-06, "loss": 0.356, "mean_token_accuracy": 0.8777368307113648, "num_tokens": 38089192.0, "step": 4650 }, { "entropy": 0.3549324214458466, "epoch": 0.7431353767560664, "grad_norm": 1.760347843170166, "learning_rate": 4.949155072241994e-06, "loss": 0.3184, "mean_token_accuracy": 0.8862186551094056, "num_tokens": 38130152.0, "step": 4655 }, { "entropy": 0.35094980597496034, "epoch": 0.743933588761175, "grad_norm": 1.3990799188613892, "learning_rate": 4.949046261395711e-06, "loss": 0.308, "mean_token_accuracy": 0.8917679905891418, "num_tokens": 38171112.0, "step": 4660 }, { "entropy": 0.38255144357681276, "epoch": 0.7447318007662835, "grad_norm": 1.5849868059158325, "learning_rate": 4.948937335847135e-06, "loss": 0.3424, "mean_token_accuracy": 0.8809654116630554, "num_tokens": 38212072.0, "step": 4665 }, { "entropy": 0.39698757529258727, "epoch": 0.7455300127713921, "grad_norm": 1.5714181661605835, "learning_rate": 4.948828295603114e-06, "loss": 0.3578, "mean_token_accuracy": 0.8753175854682922, "num_tokens": 38253032.0, "step": 4670 }, { "entropy": 0.4402578055858612, "epoch": 0.7463282247765006, "grad_norm": 1.7324473857879639, "learning_rate": 4.948719140670506e-06, "loss": 0.3999, "mean_token_accuracy": 0.8609963536262513, "num_tokens": 38293992.0, "step": 4675 }, { "entropy": 0.3861755609512329, "epoch": 0.7471264367816092, "grad_norm": 1.4474149942398071, "learning_rate": 4.948609871056175e-06, "loss": 0.3366, "mean_token_accuracy": 0.8831609845161438, "num_tokens": 38334952.0, "step": 4680 }, { "entropy": 0.39228718876838686, "epoch": 0.7479246487867177, "grad_norm": 1.720402717590332, "learning_rate": 4.948500486766991e-06, "loss": 0.3462, "mean_token_accuracy": 0.876913046836853, "num_tokens": 38375577.0, "step": 4685 }, { "entropy": 0.3946023941040039, "epoch": 0.7487228607918263, "grad_norm": 1.6547355651855469, "learning_rate": 4.948390987809836e-06, "loss": 0.3593, "mean_token_accuracy": 0.8741081357002258, "num_tokens": 38416537.0, "step": 4690 }, { "entropy": 0.39651496410369874, "epoch": 0.7495210727969349, "grad_norm": 1.4734386205673218, "learning_rate": 4.9482813741915905e-06, "loss": 0.3676, "mean_token_accuracy": 0.872340977191925, "num_tokens": 38457497.0, "step": 4695 }, { "entropy": 0.36745116114616394, "epoch": 0.7503192848020435, "grad_norm": 1.674511432647705, "learning_rate": 4.948171645919152e-06, "loss": 0.321, "mean_token_accuracy": 0.8847783923149108, "num_tokens": 38498457.0, "step": 4700 }, { "entropy": 0.3797731637954712, "epoch": 0.751117496807152, "grad_norm": 1.4765043258666992, "learning_rate": 4.948061802999418e-06, "loss": 0.3264, "mean_token_accuracy": 0.8845394730567933, "num_tokens": 38539417.0, "step": 4705 }, { "entropy": 0.38238744139671327, "epoch": 0.7519157088122606, "grad_norm": 1.5898044109344482, "learning_rate": 4.947951845439296e-06, "loss": 0.3413, "mean_token_accuracy": 0.8808130502700806, "num_tokens": 38580377.0, "step": 4710 }, { "entropy": 0.430251544713974, "epoch": 0.7527139208173691, "grad_norm": 1.7207558155059814, "learning_rate": 4.9478417732457015e-06, "loss": 0.3806, "mean_token_accuracy": 0.8684833407402038, "num_tokens": 38621337.0, "step": 4715 }, { "entropy": 0.36144039034843445, "epoch": 0.7535121328224776, "grad_norm": 1.4919703006744385, "learning_rate": 4.947731586425555e-06, "loss": 0.3241, "mean_token_accuracy": 0.8856022119522095, "num_tokens": 38662297.0, "step": 4720 }, { "entropy": 0.33562816977500914, "epoch": 0.7543103448275862, "grad_norm": 1.5665706396102905, "learning_rate": 4.9476212849857875e-06, "loss": 0.295, "mean_token_accuracy": 0.8960858225822449, "num_tokens": 38703257.0, "step": 4725 }, { "entropy": 0.3331297695636749, "epoch": 0.7551085568326947, "grad_norm": 1.2354601621627808, "learning_rate": 4.947510868933333e-06, "loss": 0.2909, "mean_token_accuracy": 0.8963336110115051, "num_tokens": 38744217.0, "step": 4730 }, { "entropy": 0.3897834599018097, "epoch": 0.7559067688378033, "grad_norm": 1.7782258987426758, "learning_rate": 4.947400338275135e-06, "loss": 0.353, "mean_token_accuracy": 0.8771867871284484, "num_tokens": 38785177.0, "step": 4735 }, { "entropy": 0.37567186951637266, "epoch": 0.7567049808429118, "grad_norm": 1.7346822023391724, "learning_rate": 4.947289693018145e-06, "loss": 0.334, "mean_token_accuracy": 0.8829320907592774, "num_tokens": 38826137.0, "step": 4740 }, { "entropy": 0.3920513987541199, "epoch": 0.7575031928480205, "grad_norm": 1.7325843572616577, "learning_rate": 4.9471789331693206e-06, "loss": 0.3478, "mean_token_accuracy": 0.8777777433395386, "num_tokens": 38867097.0, "step": 4745 }, { "entropy": 0.36579321026802064, "epoch": 0.758301404853129, "grad_norm": 1.4960813522338867, "learning_rate": 4.9470680587356265e-06, "loss": 0.3191, "mean_token_accuracy": 0.8874636650085449, "num_tokens": 38908057.0, "step": 4750 }, { "entropy": 0.3968794882297516, "epoch": 0.7590996168582376, "grad_norm": 1.4070299863815308, "learning_rate": 4.9469570697240355e-06, "loss": 0.3553, "mean_token_accuracy": 0.8750940442085267, "num_tokens": 38949017.0, "step": 4755 }, { "entropy": 0.3771853268146515, "epoch": 0.7598978288633461, "grad_norm": 1.4297999143600464, "learning_rate": 4.9468459661415255e-06, "loss": 0.3398, "mean_token_accuracy": 0.881128466129303, "num_tokens": 38989977.0, "step": 4760 }, { "entropy": 0.3986180305480957, "epoch": 0.7606960408684547, "grad_norm": 1.511413335800171, "learning_rate": 4.9467347479950845e-06, "loss": 0.3471, "mean_token_accuracy": 0.8775883197784424, "num_tokens": 39030937.0, "step": 4765 }, { "entropy": 0.3891195774078369, "epoch": 0.7614942528735632, "grad_norm": 1.6162978410720825, "learning_rate": 4.9466234152917056e-06, "loss": 0.3545, "mean_token_accuracy": 0.8752928972244263, "num_tokens": 39071897.0, "step": 4770 }, { "entropy": 0.3707997024059296, "epoch": 0.7622924648786717, "grad_norm": 1.4906481504440308, "learning_rate": 4.94651196803839e-06, "loss": 0.3301, "mean_token_accuracy": 0.8805507779121399, "num_tokens": 39112857.0, "step": 4775 }, { "entropy": 0.3961538434028625, "epoch": 0.7630906768837803, "grad_norm": 1.5734567642211914, "learning_rate": 4.946400406242147e-06, "loss": 0.352, "mean_token_accuracy": 0.8761051297187805, "num_tokens": 39153817.0, "step": 4780 }, { "entropy": 0.3866142988204956, "epoch": 0.7638888888888888, "grad_norm": 1.5984771251678467, "learning_rate": 4.946288729909989e-06, "loss": 0.3441, "mean_token_accuracy": 0.8792457580566406, "num_tokens": 39194777.0, "step": 4785 }, { "entropy": 0.41122140884399416, "epoch": 0.7646871008939975, "grad_norm": 1.4738880395889282, "learning_rate": 4.94617693904894e-06, "loss": 0.3666, "mean_token_accuracy": 0.8724663138389588, "num_tokens": 39235737.0, "step": 4790 }, { "entropy": 0.37069701552391054, "epoch": 0.765485312899106, "grad_norm": 1.4634195566177368, "learning_rate": 4.946065033666032e-06, "loss": 0.3298, "mean_token_accuracy": 0.8836962461471558, "num_tokens": 39276697.0, "step": 4795 }, { "entropy": 0.4035445749759674, "epoch": 0.7662835249042146, "grad_norm": 1.467256784439087, "learning_rate": 4.945953013768299e-06, "loss": 0.36, "mean_token_accuracy": 0.8714822649955749, "num_tokens": 39317657.0, "step": 4800 }, { "epoch": 0.7662835249042146, "eval_entropy": 0.3911557722091675, "eval_loss": 0.3467390835285187, "eval_mean_token_accuracy": 0.8777855606079101, "eval_num_tokens": 39317657.0, "eval_runtime": 69.2296, "eval_samples_per_second": 14.445, "eval_steps_per_second": 1.806, "step": 4800 }, { "entropy": 0.35707170963287355, "epoch": 0.7670817369093231, "grad_norm": 1.3541243076324463, "learning_rate": 4.9458408793627875e-06, "loss": 0.3135, "mean_token_accuracy": 0.889268672466278, "num_tokens": 39358617.0, "step": 4805 }, { "entropy": 0.37148754596710204, "epoch": 0.7678799489144317, "grad_norm": 1.415036678314209, "learning_rate": 4.945728630456546e-06, "loss": 0.3248, "mean_token_accuracy": 0.8829362630844116, "num_tokens": 39399577.0, "step": 4810 }, { "entropy": 0.3493272364139557, "epoch": 0.7686781609195402, "grad_norm": 1.5002624988555908, "learning_rate": 4.945616267056636e-06, "loss": 0.3094, "mean_token_accuracy": 0.8897889256477356, "num_tokens": 39440537.0, "step": 4815 }, { "entropy": 0.37543088793754575, "epoch": 0.7694763729246488, "grad_norm": 1.3540980815887451, "learning_rate": 4.945503789170123e-06, "loss": 0.3317, "mean_token_accuracy": 0.8817245364189148, "num_tokens": 39481497.0, "step": 4820 }, { "entropy": 0.39512808322906495, "epoch": 0.7702745849297573, "grad_norm": 1.615538239479065, "learning_rate": 4.945391196804078e-06, "loss": 0.3614, "mean_token_accuracy": 0.8730616807937622, "num_tokens": 39522457.0, "step": 4825 }, { "entropy": 0.3639287889003754, "epoch": 0.7710727969348659, "grad_norm": 1.5440593957901, "learning_rate": 4.945278489965583e-06, "loss": 0.3218, "mean_token_accuracy": 0.887544846534729, "num_tokens": 39563417.0, "step": 4830 }, { "entropy": 0.3264977991580963, "epoch": 0.7718710089399745, "grad_norm": 1.3868415355682373, "learning_rate": 4.945165668661724e-06, "loss": 0.2889, "mean_token_accuracy": 0.8973047494888305, "num_tokens": 39604377.0, "step": 4835 }, { "entropy": 0.3820899188518524, "epoch": 0.7726692209450831, "grad_norm": 1.5096489191055298, "learning_rate": 4.945052732899597e-06, "loss": 0.3385, "mean_token_accuracy": 0.882480239868164, "num_tokens": 39645337.0, "step": 4840 }, { "entropy": 0.3705361783504486, "epoch": 0.7734674329501916, "grad_norm": 1.3227615356445312, "learning_rate": 4.944939682686303e-06, "loss": 0.3282, "mean_token_accuracy": 0.884414803981781, "num_tokens": 39686297.0, "step": 4845 }, { "entropy": 0.3665523946285248, "epoch": 0.7742656449553001, "grad_norm": 1.456167221069336, "learning_rate": 4.94482651802895e-06, "loss": 0.328, "mean_token_accuracy": 0.8854122757911682, "num_tokens": 39727257.0, "step": 4850 }, { "entropy": 0.38584417700767515, "epoch": 0.7750638569604087, "grad_norm": 1.4760644435882568, "learning_rate": 4.944713238934658e-06, "loss": 0.3432, "mean_token_accuracy": 0.8807156682014465, "num_tokens": 39768217.0, "step": 4855 }, { "entropy": 0.3925118625164032, "epoch": 0.7758620689655172, "grad_norm": 1.7033679485321045, "learning_rate": 4.944599845410545e-06, "loss": 0.3552, "mean_token_accuracy": 0.8756734251976013, "num_tokens": 39809177.0, "step": 4860 }, { "entropy": 0.3694494187831879, "epoch": 0.7766602809706258, "grad_norm": 2.6237292289733887, "learning_rate": 4.944486337463745e-06, "loss": 0.3376, "mean_token_accuracy": 0.8807478785514832, "num_tokens": 39850137.0, "step": 4865 }, { "entropy": 0.33902106881141664, "epoch": 0.7774584929757343, "grad_norm": 1.4346832036972046, "learning_rate": 4.944372715101396e-06, "loss": 0.295, "mean_token_accuracy": 0.8959431409835815, "num_tokens": 39891097.0, "step": 4870 }, { "entropy": 0.3760982811450958, "epoch": 0.7782567049808429, "grad_norm": 1.4758082628250122, "learning_rate": 4.944258978330641e-06, "loss": 0.3249, "mean_token_accuracy": 0.8848476409912109, "num_tokens": 39932057.0, "step": 4875 }, { "entropy": 0.35401169061660764, "epoch": 0.7790549169859514, "grad_norm": 1.478041172027588, "learning_rate": 4.944145127158633e-06, "loss": 0.3163, "mean_token_accuracy": 0.8890954971313476, "num_tokens": 39973017.0, "step": 4880 }, { "entropy": 0.37507704496383665, "epoch": 0.7798531289910601, "grad_norm": 1.4558285474777222, "learning_rate": 4.944031161592532e-06, "loss": 0.3316, "mean_token_accuracy": 0.8805190443992614, "num_tokens": 40013977.0, "step": 4885 }, { "entropy": 0.3771682620048523, "epoch": 0.7806513409961686, "grad_norm": 1.4338040351867676, "learning_rate": 4.943917081639505e-06, "loss": 0.3285, "mean_token_accuracy": 0.8837599635124207, "num_tokens": 40054937.0, "step": 4890 }, { "entropy": 0.34878968000411986, "epoch": 0.7814495530012772, "grad_norm": 1.6373964548110962, "learning_rate": 4.943802887306723e-06, "loss": 0.3084, "mean_token_accuracy": 0.890072476863861, "num_tokens": 40095897.0, "step": 4895 }, { "entropy": 0.3822789669036865, "epoch": 0.7822477650063857, "grad_norm": 1.621001958847046, "learning_rate": 4.943688578601369e-06, "loss": 0.3368, "mean_token_accuracy": 0.881649649143219, "num_tokens": 40136857.0, "step": 4900 }, { "entropy": 0.40567703247070314, "epoch": 0.7830459770114943, "grad_norm": 1.345913052558899, "learning_rate": 4.943574155530631e-06, "loss": 0.3622, "mean_token_accuracy": 0.8728673458099365, "num_tokens": 40177817.0, "step": 4905 }, { "entropy": 0.37211456298828127, "epoch": 0.7838441890166028, "grad_norm": 1.4501780271530151, "learning_rate": 4.943459618101706e-06, "loss": 0.3318, "mean_token_accuracy": 0.8833699584007263, "num_tokens": 40218777.0, "step": 4910 }, { "entropy": 0.36408587694168093, "epoch": 0.7846424010217113, "grad_norm": 1.6432631015777588, "learning_rate": 4.9433449663217925e-06, "loss": 0.3268, "mean_token_accuracy": 0.8842684388160705, "num_tokens": 40259737.0, "step": 4915 }, { "entropy": 0.3740302503108978, "epoch": 0.7854406130268199, "grad_norm": 1.705114722251892, "learning_rate": 4.943230200198102e-06, "loss": 0.3338, "mean_token_accuracy": 0.883201253414154, "num_tokens": 40300697.0, "step": 4920 }, { "entropy": 0.37626251578330994, "epoch": 0.7862388250319284, "grad_norm": 1.6285901069641113, "learning_rate": 4.943115319737854e-06, "loss": 0.3238, "mean_token_accuracy": 0.8847034335136413, "num_tokens": 40341657.0, "step": 4925 }, { "entropy": 0.3706506729125977, "epoch": 0.7870370370370371, "grad_norm": 1.4745314121246338, "learning_rate": 4.943000324948269e-06, "loss": 0.3304, "mean_token_accuracy": 0.8840723276138306, "num_tokens": 40382617.0, "step": 4930 }, { "entropy": 0.3797015368938446, "epoch": 0.7878352490421456, "grad_norm": 1.5776362419128418, "learning_rate": 4.9428852158365805e-06, "loss": 0.336, "mean_token_accuracy": 0.8814828038215637, "num_tokens": 40423577.0, "step": 4935 }, { "entropy": 0.4147985756397247, "epoch": 0.7886334610472542, "grad_norm": 1.7615753412246704, "learning_rate": 4.942769992410025e-06, "loss": 0.3697, "mean_token_accuracy": 0.8718114852905273, "num_tokens": 40464537.0, "step": 4940 }, { "entropy": 0.3999458372592926, "epoch": 0.7894316730523627, "grad_norm": 1.6666876077651978, "learning_rate": 4.9426546546758495e-06, "loss": 0.3673, "mean_token_accuracy": 0.8717912793159485, "num_tokens": 40505497.0, "step": 4945 }, { "entropy": 0.38414047956466674, "epoch": 0.7902298850574713, "grad_norm": 1.5749131441116333, "learning_rate": 4.942539202641306e-06, "loss": 0.3506, "mean_token_accuracy": 0.8756724238395691, "num_tokens": 40546457.0, "step": 4950 }, { "entropy": 0.35958670973777773, "epoch": 0.7910280970625798, "grad_norm": 1.5309059619903564, "learning_rate": 4.9424236363136555e-06, "loss": 0.3158, "mean_token_accuracy": 0.886970865726471, "num_tokens": 40587417.0, "step": 4955 }, { "entropy": 0.3650970160961151, "epoch": 0.7918263090676884, "grad_norm": 1.6460506916046143, "learning_rate": 4.942307955700165e-06, "loss": 0.3262, "mean_token_accuracy": 0.8842700004577637, "num_tokens": 40628377.0, "step": 4960 }, { "entropy": 0.4351780354976654, "epoch": 0.7926245210727969, "grad_norm": 1.7340452671051025, "learning_rate": 4.942192160808108e-06, "loss": 0.3949, "mean_token_accuracy": 0.8620139837265015, "num_tokens": 40669337.0, "step": 4965 }, { "entropy": 0.36986536979675294, "epoch": 0.7934227330779055, "grad_norm": 1.5631247758865356, "learning_rate": 4.942076251644767e-06, "loss": 0.3232, "mean_token_accuracy": 0.8860733151435852, "num_tokens": 40710297.0, "step": 4970 }, { "entropy": 0.39304872155189513, "epoch": 0.794220945083014, "grad_norm": 1.3219350576400757, "learning_rate": 4.941960228217431e-06, "loss": 0.348, "mean_token_accuracy": 0.876360547542572, "num_tokens": 40751257.0, "step": 4975 }, { "entropy": 0.37213549613952634, "epoch": 0.7950191570881227, "grad_norm": 1.434804081916809, "learning_rate": 4.941844090533396e-06, "loss": 0.3408, "mean_token_accuracy": 0.8791846871376038, "num_tokens": 40792217.0, "step": 4980 }, { "entropy": 0.37901111245155333, "epoch": 0.7958173690932312, "grad_norm": 1.5676732063293457, "learning_rate": 4.941727838599964e-06, "loss": 0.3379, "mean_token_accuracy": 0.8803433775901794, "num_tokens": 40833177.0, "step": 4985 }, { "entropy": 0.38844203352928164, "epoch": 0.7966155810983397, "grad_norm": 1.5531984567642212, "learning_rate": 4.941611472424445e-06, "loss": 0.3478, "mean_token_accuracy": 0.8778400778770447, "num_tokens": 40874137.0, "step": 4990 }, { "entropy": 0.4182100772857666, "epoch": 0.7974137931034483, "grad_norm": 1.4862117767333984, "learning_rate": 4.941494992014158e-06, "loss": 0.3859, "mean_token_accuracy": 0.8644268989562989, "num_tokens": 40915097.0, "step": 4995 }, { "entropy": 0.38949393630027773, "epoch": 0.7982120051085568, "grad_norm": 1.5576521158218384, "learning_rate": 4.9413783973764275e-06, "loss": 0.3374, "mean_token_accuracy": 0.8797020792961121, "num_tokens": 40956057.0, "step": 5000 }, { "epoch": 0.7982120051085568, "eval_entropy": 0.39517786979675296, "eval_loss": 0.34585344791412354, "eval_mean_token_accuracy": 0.8783624324798583, "eval_num_tokens": 40956057.0, "eval_runtime": 69.2282, "eval_samples_per_second": 14.445, "eval_steps_per_second": 1.806, "step": 5000 }, { "entropy": 0.39468519687652587, "epoch": 0.7990102171136654, "grad_norm": 1.5206162929534912, "learning_rate": 4.9412616885185844e-06, "loss": 0.3573, "mean_token_accuracy": 0.8757318615913391, "num_tokens": 40997017.0, "step": 5005 }, { "entropy": 0.37782185077667235, "epoch": 0.7998084291187739, "grad_norm": 1.4788711071014404, "learning_rate": 4.941144865447969e-06, "loss": 0.3395, "mean_token_accuracy": 0.8795312762260437, "num_tokens": 41037977.0, "step": 5010 }, { "entropy": 0.3883132815361023, "epoch": 0.8006066411238825, "grad_norm": 1.6302030086517334, "learning_rate": 4.941027928171927e-06, "loss": 0.3572, "mean_token_accuracy": 0.8747356057167053, "num_tokens": 41078937.0, "step": 5015 }, { "entropy": 0.3945412039756775, "epoch": 0.801404853128991, "grad_norm": 1.3573029041290283, "learning_rate": 4.94091087669781e-06, "loss": 0.3509, "mean_token_accuracy": 0.8777291893959045, "num_tokens": 41119897.0, "step": 5020 }, { "entropy": 0.36984707713127135, "epoch": 0.8022030651340997, "grad_norm": 1.6052615642547607, "learning_rate": 4.940793711032982e-06, "loss": 0.3335, "mean_token_accuracy": 0.8812803864479065, "num_tokens": 41160857.0, "step": 5025 }, { "entropy": 0.37686986327171323, "epoch": 0.8030012771392082, "grad_norm": 1.5283271074295044, "learning_rate": 4.940676431184808e-06, "loss": 0.3369, "mean_token_accuracy": 0.8817309975624085, "num_tokens": 41201817.0, "step": 5030 }, { "entropy": 0.42779810428619386, "epoch": 0.8037994891443168, "grad_norm": 1.754578709602356, "learning_rate": 4.9405590371606645e-06, "loss": 0.3844, "mean_token_accuracy": 0.8636540412902832, "num_tokens": 41242777.0, "step": 5035 }, { "entropy": 0.3827055513858795, "epoch": 0.8045977011494253, "grad_norm": 1.5831722021102905, "learning_rate": 4.940441528967933e-06, "loss": 0.346, "mean_token_accuracy": 0.8784476161003113, "num_tokens": 41283737.0, "step": 5040 }, { "entropy": 0.3860118448734283, "epoch": 0.8053959131545338, "grad_norm": 1.460633397102356, "learning_rate": 4.940323906614003e-06, "loss": 0.3503, "mean_token_accuracy": 0.8752281665802002, "num_tokens": 41324697.0, "step": 5045 }, { "entropy": 0.3734028100967407, "epoch": 0.8061941251596424, "grad_norm": 1.401058316230774, "learning_rate": 4.940206170106272e-06, "loss": 0.3295, "mean_token_accuracy": 0.8852216362953186, "num_tokens": 41365657.0, "step": 5050 }, { "entropy": 0.41934972405433657, "epoch": 0.8069923371647509, "grad_norm": 1.7835423946380615, "learning_rate": 4.940088319452141e-06, "loss": 0.375, "mean_token_accuracy": 0.8679613947868348, "num_tokens": 41406617.0, "step": 5055 }, { "entropy": 0.39728267788887023, "epoch": 0.8077905491698595, "grad_norm": 1.2957251071929932, "learning_rate": 4.939970354659024e-06, "loss": 0.355, "mean_token_accuracy": 0.874215042591095, "num_tokens": 41447577.0, "step": 5060 }, { "entropy": 0.36827427744865415, "epoch": 0.808588761174968, "grad_norm": 1.5750200748443604, "learning_rate": 4.939852275734336e-06, "loss": 0.3218, "mean_token_accuracy": 0.8860924124717713, "num_tokens": 41488537.0, "step": 5065 }, { "entropy": 0.37820854783058167, "epoch": 0.8093869731800766, "grad_norm": 1.4481803178787231, "learning_rate": 4.939734082685505e-06, "loss": 0.3417, "mean_token_accuracy": 0.8807801008224487, "num_tokens": 41529497.0, "step": 5070 }, { "entropy": 0.38313864469528197, "epoch": 0.8101851851851852, "grad_norm": 1.6074419021606445, "learning_rate": 4.939615775519962e-06, "loss": 0.3432, "mean_token_accuracy": 0.8779990315437317, "num_tokens": 41570457.0, "step": 5075 }, { "entropy": 0.3798495590686798, "epoch": 0.8109833971902938, "grad_norm": 1.6640052795410156, "learning_rate": 4.939497354245146e-06, "loss": 0.3319, "mean_token_accuracy": 0.8838315844535828, "num_tokens": 41611417.0, "step": 5080 }, { "entropy": 0.4095848798751831, "epoch": 0.8117816091954023, "grad_norm": 1.4963005781173706, "learning_rate": 4.939378818868506e-06, "loss": 0.3584, "mean_token_accuracy": 0.875108253955841, "num_tokens": 41652377.0, "step": 5085 }, { "entropy": 0.3914333820343018, "epoch": 0.8125798212005109, "grad_norm": 1.7386914491653442, "learning_rate": 4.9392601693974915e-06, "loss": 0.3504, "mean_token_accuracy": 0.8770987033843994, "num_tokens": 41693337.0, "step": 5090 }, { "entropy": 0.3680845439434052, "epoch": 0.8133780332056194, "grad_norm": 1.5366642475128174, "learning_rate": 4.939141405839569e-06, "loss": 0.3324, "mean_token_accuracy": 0.8843695878982544, "num_tokens": 41734297.0, "step": 5095 }, { "entropy": 0.3906676173210144, "epoch": 0.814176245210728, "grad_norm": 1.6059380769729614, "learning_rate": 4.939022528202203e-06, "loss": 0.3453, "mean_token_accuracy": 0.8771692991256714, "num_tokens": 41775257.0, "step": 5100 }, { "entropy": 0.340567022562027, "epoch": 0.8149744572158365, "grad_norm": 1.5068142414093018, "learning_rate": 4.93890353649287e-06, "loss": 0.2987, "mean_token_accuracy": 0.8926120519638061, "num_tokens": 41816217.0, "step": 5105 }, { "entropy": 0.38606288433074953, "epoch": 0.815772669220945, "grad_norm": 1.6737275123596191, "learning_rate": 4.9387844307190536e-06, "loss": 0.3459, "mean_token_accuracy": 0.8784497022628784, "num_tokens": 41857177.0, "step": 5110 }, { "entropy": 0.3974448382854462, "epoch": 0.8165708812260536, "grad_norm": 1.3572559356689453, "learning_rate": 4.938665210888242e-06, "loss": 0.3586, "mean_token_accuracy": 0.874246883392334, "num_tokens": 41898137.0, "step": 5115 }, { "entropy": 0.3437396764755249, "epoch": 0.8173690932311622, "grad_norm": 1.3938586711883545, "learning_rate": 4.938545877007933e-06, "loss": 0.2994, "mean_token_accuracy": 0.8925269246101379, "num_tokens": 41939097.0, "step": 5120 }, { "entropy": 0.35920992493629456, "epoch": 0.8181673052362708, "grad_norm": 1.4645718336105347, "learning_rate": 4.938426429085631e-06, "loss": 0.3128, "mean_token_accuracy": 0.8877154946327209, "num_tokens": 41980057.0, "step": 5125 }, { "entropy": 0.4538248538970947, "epoch": 0.8189655172413793, "grad_norm": 1.5313849449157715, "learning_rate": 4.938306867128847e-06, "loss": 0.4069, "mean_token_accuracy": 0.8557774543762207, "num_tokens": 42021017.0, "step": 5130 }, { "entropy": 0.3992255091667175, "epoch": 0.8197637292464879, "grad_norm": 1.5350704193115234, "learning_rate": 4.938187191145099e-06, "loss": 0.3585, "mean_token_accuracy": 0.8749038934707641, "num_tokens": 42061977.0, "step": 5135 }, { "entropy": 0.37963120341300965, "epoch": 0.8205619412515964, "grad_norm": 1.4105563163757324, "learning_rate": 4.938067401141912e-06, "loss": 0.3291, "mean_token_accuracy": 0.8834235668182373, "num_tokens": 42102937.0, "step": 5140 }, { "entropy": 0.39865819215774534, "epoch": 0.821360153256705, "grad_norm": 1.4852190017700195, "learning_rate": 4.937947497126821e-06, "loss": 0.3528, "mean_token_accuracy": 0.8768776297569275, "num_tokens": 42143897.0, "step": 5145 }, { "entropy": 0.39312014579772947, "epoch": 0.8221583652618135, "grad_norm": 1.7987556457519531, "learning_rate": 4.937827479107365e-06, "loss": 0.3517, "mean_token_accuracy": 0.8770885109901428, "num_tokens": 42184857.0, "step": 5150 }, { "entropy": 0.4142571449279785, "epoch": 0.8229565772669221, "grad_norm": 1.6814552545547485, "learning_rate": 4.93770734709109e-06, "loss": 0.3748, "mean_token_accuracy": 0.8683848857879639, "num_tokens": 42225817.0, "step": 5155 }, { "entropy": 0.35767839550971986, "epoch": 0.8237547892720306, "grad_norm": 1.4671154022216797, "learning_rate": 4.937587101085551e-06, "loss": 0.3179, "mean_token_accuracy": 0.8885256052017212, "num_tokens": 42266777.0, "step": 5160 }, { "entropy": 0.37379557490348814, "epoch": 0.8245530012771393, "grad_norm": 1.4868617057800293, "learning_rate": 4.93746674109831e-06, "loss": 0.3396, "mean_token_accuracy": 0.8791596293449402, "num_tokens": 42307737.0, "step": 5165 }, { "entropy": 0.33919037580490113, "epoch": 0.8253512132822478, "grad_norm": 1.6345292329788208, "learning_rate": 4.937346267136936e-06, "loss": 0.3007, "mean_token_accuracy": 0.8931122422218323, "num_tokens": 42348697.0, "step": 5170 }, { "entropy": 0.3418751239776611, "epoch": 0.8261494252873564, "grad_norm": 1.6059125661849976, "learning_rate": 4.937225679209003e-06, "loss": 0.2951, "mean_token_accuracy": 0.8965206980705261, "num_tokens": 42389657.0, "step": 5175 }, { "entropy": 0.4018648386001587, "epoch": 0.8269476372924649, "grad_norm": 1.7123645544052124, "learning_rate": 4.937104977322097e-06, "loss": 0.3614, "mean_token_accuracy": 0.8732232570648193, "num_tokens": 42430617.0, "step": 5180 }, { "entropy": 0.38013545870780946, "epoch": 0.8277458492975734, "grad_norm": 1.3969297409057617, "learning_rate": 4.936984161483805e-06, "loss": 0.3432, "mean_token_accuracy": 0.8795966863632202, "num_tokens": 42471577.0, "step": 5185 }, { "entropy": 0.3990564405918121, "epoch": 0.828544061302682, "grad_norm": 1.7368186712265015, "learning_rate": 4.9368632317017255e-06, "loss": 0.3527, "mean_token_accuracy": 0.8751740336418152, "num_tokens": 42512537.0, "step": 5190 }, { "entropy": 0.40104894042015077, "epoch": 0.8293422733077905, "grad_norm": 1.5812550783157349, "learning_rate": 4.936742187983464e-06, "loss": 0.3578, "mean_token_accuracy": 0.87327082157135, "num_tokens": 42553497.0, "step": 5195 }, { "entropy": 0.37812411189079287, "epoch": 0.8301404853128991, "grad_norm": 1.393334984779358, "learning_rate": 4.936621030336631e-06, "loss": 0.3286, "mean_token_accuracy": 0.8836560845375061, "num_tokens": 42594457.0, "step": 5200 }, { "epoch": 0.8301404853128991, "eval_entropy": 0.3916443166732788, "eval_loss": 0.34470275044441223, "eval_mean_token_accuracy": 0.878385350227356, "eval_num_tokens": 42594457.0, "eval_runtime": 69.2115, "eval_samples_per_second": 14.448, "eval_steps_per_second": 1.806, "step": 5200 }, { "entropy": 0.37057461142539977, "epoch": 0.8309386973180076, "grad_norm": 1.6118886470794678, "learning_rate": 4.9364997587688444e-06, "loss": 0.3224, "mean_token_accuracy": 0.8839616656303406, "num_tokens": 42635417.0, "step": 5205 }, { "entropy": 0.37779011130332946, "epoch": 0.8317369093231162, "grad_norm": 1.5371465682983398, "learning_rate": 4.936378373287733e-06, "loss": 0.3412, "mean_token_accuracy": 0.881714928150177, "num_tokens": 42676377.0, "step": 5210 }, { "entropy": 0.36580815315246584, "epoch": 0.8325351213282248, "grad_norm": 1.3122960329055786, "learning_rate": 4.936256873900927e-06, "loss": 0.3256, "mean_token_accuracy": 0.8857713937759399, "num_tokens": 42717337.0, "step": 5215 }, { "entropy": 0.3645298182964325, "epoch": 0.8333333333333334, "grad_norm": 1.441042184829712, "learning_rate": 4.936135260616069e-06, "loss": 0.3264, "mean_token_accuracy": 0.8843689322471618, "num_tokens": 42758297.0, "step": 5220 }, { "entropy": 0.39551939964294436, "epoch": 0.8341315453384419, "grad_norm": 1.3785526752471924, "learning_rate": 4.936013533440804e-06, "loss": 0.3621, "mean_token_accuracy": 0.8716706156730651, "num_tokens": 42799257.0, "step": 5225 }, { "entropy": 0.36737871170043945, "epoch": 0.8349297573435505, "grad_norm": 1.4625060558319092, "learning_rate": 4.935891692382789e-06, "loss": 0.3225, "mean_token_accuracy": 0.8853367805480957, "num_tokens": 42840217.0, "step": 5230 }, { "entropy": 0.35239975452423095, "epoch": 0.835727969348659, "grad_norm": 1.7056565284729004, "learning_rate": 4.935769737449686e-06, "loss": 0.307, "mean_token_accuracy": 0.8921406984329223, "num_tokens": 42881177.0, "step": 5235 }, { "entropy": 0.4111760914325714, "epoch": 0.8365261813537676, "grad_norm": 1.4598015546798706, "learning_rate": 4.9356476686491605e-06, "loss": 0.3734, "mean_token_accuracy": 0.8678424596786499, "num_tokens": 42922137.0, "step": 5240 }, { "entropy": 0.3474355399608612, "epoch": 0.8373243933588761, "grad_norm": 1.505201816558838, "learning_rate": 4.935525485988892e-06, "loss": 0.3024, "mean_token_accuracy": 0.891577672958374, "num_tokens": 42963097.0, "step": 5245 }, { "entropy": 0.41879857778549195, "epoch": 0.8381226053639846, "grad_norm": 1.8078423738479614, "learning_rate": 4.935403189476563e-06, "loss": 0.3733, "mean_token_accuracy": 0.869062352180481, "num_tokens": 43004057.0, "step": 5250 }, { "entropy": 0.38555397391319274, "epoch": 0.8389208173690932, "grad_norm": 1.5440025329589844, "learning_rate": 4.9352807791198635e-06, "loss": 0.3389, "mean_token_accuracy": 0.878856647014618, "num_tokens": 43045017.0, "step": 5255 }, { "entropy": 0.38125547766685486, "epoch": 0.8397190293742018, "grad_norm": 1.4443796873092651, "learning_rate": 4.93515825492649e-06, "loss": 0.3402, "mean_token_accuracy": 0.8804764747619629, "num_tokens": 43085977.0, "step": 5260 }, { "entropy": 0.3209161102771759, "epoch": 0.8405172413793104, "grad_norm": 1.6840972900390625, "learning_rate": 4.935035616904149e-06, "loss": 0.2834, "mean_token_accuracy": 0.8992561101913452, "num_tokens": 43126937.0, "step": 5265 }, { "entropy": 0.3630024969577789, "epoch": 0.8413154533844189, "grad_norm": 1.4851804971694946, "learning_rate": 4.934912865060552e-06, "loss": 0.3259, "mean_token_accuracy": 0.8831697225570678, "num_tokens": 43167897.0, "step": 5270 }, { "entropy": 0.3577308297157288, "epoch": 0.8421136653895275, "grad_norm": 1.7565642595291138, "learning_rate": 4.934789999403418e-06, "loss": 0.3168, "mean_token_accuracy": 0.8880985140800476, "num_tokens": 43208857.0, "step": 5275 }, { "entropy": 0.3566504120826721, "epoch": 0.842911877394636, "grad_norm": 1.513185739517212, "learning_rate": 4.934667019940474e-06, "loss": 0.3077, "mean_token_accuracy": 0.8909970402717591, "num_tokens": 43249817.0, "step": 5280 }, { "entropy": 0.41708238124847413, "epoch": 0.8437100893997446, "grad_norm": 1.550545573234558, "learning_rate": 4.934543926679449e-06, "loss": 0.3708, "mean_token_accuracy": 0.8711596012115479, "num_tokens": 43290777.0, "step": 5285 }, { "entropy": 0.3585968017578125, "epoch": 0.8445083014048531, "grad_norm": 1.507271409034729, "learning_rate": 4.93442071962809e-06, "loss": 0.3241, "mean_token_accuracy": 0.887067461013794, "num_tokens": 43331737.0, "step": 5290 }, { "entropy": 0.3447089433670044, "epoch": 0.8453065134099617, "grad_norm": 1.5838521718978882, "learning_rate": 4.934297398794141e-06, "loss": 0.2981, "mean_token_accuracy": 0.8938217759132385, "num_tokens": 43372697.0, "step": 5295 }, { "entropy": 0.39036492705345155, "epoch": 0.8461047254150702, "grad_norm": 1.4771947860717773, "learning_rate": 4.934173964185357e-06, "loss": 0.3477, "mean_token_accuracy": 0.8789735913276673, "num_tokens": 43413657.0, "step": 5300 }, { "entropy": 0.35201868414878845, "epoch": 0.8469029374201787, "grad_norm": 1.5779476165771484, "learning_rate": 4.934050415809502e-06, "loss": 0.3089, "mean_token_accuracy": 0.8905610561370849, "num_tokens": 43454617.0, "step": 5305 }, { "entropy": 0.4234282970428467, "epoch": 0.8477011494252874, "grad_norm": 1.6354950666427612, "learning_rate": 4.933926753674342e-06, "loss": 0.3809, "mean_token_accuracy": 0.868942129611969, "num_tokens": 43495577.0, "step": 5310 }, { "entropy": 0.36352901458740233, "epoch": 0.848499361430396, "grad_norm": 1.4287358522415161, "learning_rate": 4.933802977787655e-06, "loss": 0.3227, "mean_token_accuracy": 0.8870725750923156, "num_tokens": 43536537.0, "step": 5315 }, { "entropy": 0.4029152154922485, "epoch": 0.8492975734355045, "grad_norm": 1.4122129678726196, "learning_rate": 4.933679088157226e-06, "loss": 0.3558, "mean_token_accuracy": 0.8763118505477905, "num_tokens": 43577497.0, "step": 5320 }, { "entropy": 0.3681135237216949, "epoch": 0.850095785440613, "grad_norm": 1.486761450767517, "learning_rate": 4.933555084790842e-06, "loss": 0.3229, "mean_token_accuracy": 0.886578106880188, "num_tokens": 43618457.0, "step": 5325 }, { "entropy": 0.3565655589103699, "epoch": 0.8508939974457216, "grad_norm": 1.3662333488464355, "learning_rate": 4.933430967696303e-06, "loss": 0.3126, "mean_token_accuracy": 0.8898347854614258, "num_tokens": 43659417.0, "step": 5330 }, { "entropy": 0.3615683376789093, "epoch": 0.8516922094508301, "grad_norm": 1.4146095514297485, "learning_rate": 4.933306736881415e-06, "loss": 0.3164, "mean_token_accuracy": 0.8881414651870727, "num_tokens": 43700377.0, "step": 5335 }, { "entropy": 0.3876829206943512, "epoch": 0.8524904214559387, "grad_norm": 1.4862658977508545, "learning_rate": 4.933182392353988e-06, "loss": 0.3491, "mean_token_accuracy": 0.8772210717201233, "num_tokens": 43741337.0, "step": 5340 }, { "entropy": 0.36353917717933654, "epoch": 0.8532886334610472, "grad_norm": 1.651447057723999, "learning_rate": 4.933057934121842e-06, "loss": 0.3268, "mean_token_accuracy": 0.8850520014762878, "num_tokens": 43782297.0, "step": 5345 }, { "entropy": 0.37396968007087705, "epoch": 0.8540868454661558, "grad_norm": 1.345488429069519, "learning_rate": 4.932933362192804e-06, "loss": 0.3255, "mean_token_accuracy": 0.8870288133621216, "num_tokens": 43823257.0, "step": 5350 }, { "entropy": 0.3930931568145752, "epoch": 0.8548850574712644, "grad_norm": 1.3853410482406616, "learning_rate": 4.932808676574704e-06, "loss": 0.3513, "mean_token_accuracy": 0.8762860059738159, "num_tokens": 43864217.0, "step": 5355 }, { "entropy": 0.38437947630882263, "epoch": 0.855683269476373, "grad_norm": 1.5177319049835205, "learning_rate": 4.932683877275388e-06, "loss": 0.3427, "mean_token_accuracy": 0.8778773427009583, "num_tokens": 43905177.0, "step": 5360 }, { "entropy": 0.402529639005661, "epoch": 0.8564814814814815, "grad_norm": 1.8101162910461426, "learning_rate": 4.932558964302701e-06, "loss": 0.3558, "mean_token_accuracy": 0.8761422276496887, "num_tokens": 43946137.0, "step": 5365 }, { "entropy": 0.392457115650177, "epoch": 0.85727969348659, "grad_norm": 1.7096900939941406, "learning_rate": 4.9324339376644975e-06, "loss": 0.3556, "mean_token_accuracy": 0.8748411536216736, "num_tokens": 43987097.0, "step": 5370 }, { "entropy": 0.35902159214019774, "epoch": 0.8580779054916986, "grad_norm": 1.3886840343475342, "learning_rate": 4.93230879736864e-06, "loss": 0.3092, "mean_token_accuracy": 0.8903650403022766, "num_tokens": 44028057.0, "step": 5375 }, { "entropy": 0.3581184148788452, "epoch": 0.8588761174968071, "grad_norm": 1.6585997343063354, "learning_rate": 4.932183543422999e-06, "loss": 0.316, "mean_token_accuracy": 0.8876467704772949, "num_tokens": 44069017.0, "step": 5380 }, { "entropy": 0.3886809110641479, "epoch": 0.8596743295019157, "grad_norm": 1.4889298677444458, "learning_rate": 4.93205817583545e-06, "loss": 0.3383, "mean_token_accuracy": 0.884128725528717, "num_tokens": 44109977.0, "step": 5385 }, { "entropy": 0.3871449947357178, "epoch": 0.8604725415070242, "grad_norm": 1.4008923768997192, "learning_rate": 4.931932694613876e-06, "loss": 0.3398, "mean_token_accuracy": 0.8785978078842163, "num_tokens": 44150937.0, "step": 5390 }, { "entropy": 0.3678101718425751, "epoch": 0.8612707535121328, "grad_norm": 1.5337679386138916, "learning_rate": 4.931807099766168e-06, "loss": 0.3234, "mean_token_accuracy": 0.885176682472229, "num_tokens": 44191897.0, "step": 5395 }, { "entropy": 0.388889354467392, "epoch": 0.8620689655172413, "grad_norm": 2.2841343879699707, "learning_rate": 4.9316813913002246e-06, "loss": 0.3527, "mean_token_accuracy": 0.8753127932548523, "num_tokens": 44232857.0, "step": 5400 }, { "epoch": 0.8620689655172413, "eval_entropy": 0.3888051829338074, "eval_loss": 0.34354865550994873, "eval_mean_token_accuracy": 0.8788944683074951, "eval_num_tokens": 44232857.0, "eval_runtime": 69.2106, "eval_samples_per_second": 14.449, "eval_steps_per_second": 1.806, "step": 5400 }, { "entropy": 0.35687355399131776, "epoch": 0.86286717752235, "grad_norm": 1.4147300720214844, "learning_rate": 4.93155556922395e-06, "loss": 0.3101, "mean_token_accuracy": 0.88960120677948, "num_tokens": 44273817.0, "step": 5405 }, { "entropy": 0.4274792790412903, "epoch": 0.8636653895274585, "grad_norm": 1.7667378187179565, "learning_rate": 4.931429633545257e-06, "loss": 0.3845, "mean_token_accuracy": 0.8630432486534119, "num_tokens": 44314777.0, "step": 5410 }, { "entropy": 0.37117738127708433, "epoch": 0.8644636015325671, "grad_norm": 1.7041641473770142, "learning_rate": 4.9313035842720644e-06, "loss": 0.3348, "mean_token_accuracy": 0.8814984917640686, "num_tokens": 44355737.0, "step": 5415 }, { "entropy": 0.35860843062400816, "epoch": 0.8652618135376756, "grad_norm": 1.5495671033859253, "learning_rate": 4.931177421412298e-06, "loss": 0.3192, "mean_token_accuracy": 0.8854899287223816, "num_tokens": 44396697.0, "step": 5420 }, { "entropy": 0.38937310576438905, "epoch": 0.8660600255427842, "grad_norm": 1.618882656097412, "learning_rate": 4.931051144973892e-06, "loss": 0.3442, "mean_token_accuracy": 0.8780934572219848, "num_tokens": 44437657.0, "step": 5425 }, { "entropy": 0.35249313712120056, "epoch": 0.8668582375478927, "grad_norm": 1.5120997428894043, "learning_rate": 4.930924754964788e-06, "loss": 0.3114, "mean_token_accuracy": 0.8902511358261108, "num_tokens": 44478617.0, "step": 5430 }, { "entropy": 0.408038991689682, "epoch": 0.8676564495530013, "grad_norm": 1.6065163612365723, "learning_rate": 4.930798251392932e-06, "loss": 0.3618, "mean_token_accuracy": 0.8714466691017151, "num_tokens": 44519577.0, "step": 5435 }, { "entropy": 0.42532867193222046, "epoch": 0.8684546615581098, "grad_norm": 1.576435923576355, "learning_rate": 4.9306716342662795e-06, "loss": 0.3779, "mean_token_accuracy": 0.8677833437919616, "num_tokens": 44560537.0, "step": 5440 }, { "entropy": 0.3784036099910736, "epoch": 0.8692528735632183, "grad_norm": 1.5071748495101929, "learning_rate": 4.930544903592794e-06, "loss": 0.3387, "mean_token_accuracy": 0.8795307159423829, "num_tokens": 44601497.0, "step": 5445 }, { "entropy": 0.37425318360328674, "epoch": 0.870051085568327, "grad_norm": 1.6781151294708252, "learning_rate": 4.930418059380444e-06, "loss": 0.336, "mean_token_accuracy": 0.8800019502639771, "num_tokens": 44642457.0, "step": 5450 }, { "entropy": 0.34272149205207825, "epoch": 0.8708492975734355, "grad_norm": 1.5050232410430908, "learning_rate": 4.930291101637205e-06, "loss": 0.2971, "mean_token_accuracy": 0.8940603494644165, "num_tokens": 44683417.0, "step": 5455 }, { "entropy": 0.3740146219730377, "epoch": 0.8716475095785441, "grad_norm": 1.4340434074401855, "learning_rate": 4.9301640303710606e-06, "loss": 0.3334, "mean_token_accuracy": 0.8837826609611511, "num_tokens": 44724377.0, "step": 5460 }, { "entropy": 0.35212224125862124, "epoch": 0.8724457215836526, "grad_norm": 1.3832968473434448, "learning_rate": 4.9300368455900024e-06, "loss": 0.3087, "mean_token_accuracy": 0.8901192903518677, "num_tokens": 44765337.0, "step": 5465 }, { "entropy": 0.3691311001777649, "epoch": 0.8732439335887612, "grad_norm": 1.5648622512817383, "learning_rate": 4.929909547302028e-06, "loss": 0.3278, "mean_token_accuracy": 0.8848111033439636, "num_tokens": 44806297.0, "step": 5470 }, { "entropy": 0.3800596833229065, "epoch": 0.8740421455938697, "grad_norm": 1.5399657487869263, "learning_rate": 4.929782135515143e-06, "loss": 0.3276, "mean_token_accuracy": 0.882738995552063, "num_tokens": 44847257.0, "step": 5475 }, { "entropy": 0.405544126033783, "epoch": 0.8748403575989783, "grad_norm": 1.5004374980926514, "learning_rate": 4.929654610237359e-06, "loss": 0.3608, "mean_token_accuracy": 0.8739061474800109, "num_tokens": 44888217.0, "step": 5480 }, { "entropy": 0.34500548243522644, "epoch": 0.8756385696040868, "grad_norm": 1.4212785959243774, "learning_rate": 4.929526971476694e-06, "loss": 0.3049, "mean_token_accuracy": 0.8921091079711914, "num_tokens": 44929177.0, "step": 5485 }, { "entropy": 0.3969034910202026, "epoch": 0.8764367816091954, "grad_norm": 1.6262997388839722, "learning_rate": 4.929399219241175e-06, "loss": 0.3412, "mean_token_accuracy": 0.8802304983139038, "num_tokens": 44970137.0, "step": 5490 }, { "entropy": 0.40439229011535643, "epoch": 0.8772349936143039, "grad_norm": 1.5303850173950195, "learning_rate": 4.929271353538837e-06, "loss": 0.348, "mean_token_accuracy": 0.8780505299568176, "num_tokens": 45011097.0, "step": 5495 }, { "entropy": 0.368317711353302, "epoch": 0.8780332056194126, "grad_norm": 1.537245273590088, "learning_rate": 4.92914337437772e-06, "loss": 0.3316, "mean_token_accuracy": 0.8815723538398743, "num_tokens": 45052057.0, "step": 5500 }, { "entropy": 0.3903080582618713, "epoch": 0.8788314176245211, "grad_norm": 1.5359094142913818, "learning_rate": 4.929015281765869e-06, "loss": 0.3479, "mean_token_accuracy": 0.8795314073562622, "num_tokens": 45093017.0, "step": 5505 }, { "entropy": 0.37571442127227783, "epoch": 0.8796296296296297, "grad_norm": 1.6674305200576782, "learning_rate": 4.928887075711343e-06, "loss": 0.3293, "mean_token_accuracy": 0.8841602087020874, "num_tokens": 45133977.0, "step": 5510 }, { "entropy": 0.38340579271316527, "epoch": 0.8804278416347382, "grad_norm": 1.3645216226577759, "learning_rate": 4.9287587562222005e-06, "loss": 0.3393, "mean_token_accuracy": 0.879256546497345, "num_tokens": 45174937.0, "step": 5515 }, { "entropy": 0.3849440336227417, "epoch": 0.8812260536398467, "grad_norm": 1.5876308679580688, "learning_rate": 4.928630323306514e-06, "loss": 0.3413, "mean_token_accuracy": 0.8795434355735778, "num_tokens": 45215897.0, "step": 5520 }, { "entropy": 0.379747611284256, "epoch": 0.8820242656449553, "grad_norm": 1.4216405153274536, "learning_rate": 4.928501776972357e-06, "loss": 0.3322, "mean_token_accuracy": 0.8834027528762818, "num_tokens": 45256857.0, "step": 5525 }, { "entropy": 0.37451350688934326, "epoch": 0.8828224776500638, "grad_norm": 1.4833821058273315, "learning_rate": 4.9283731172278145e-06, "loss": 0.3369, "mean_token_accuracy": 0.8831079363822937, "num_tokens": 45297817.0, "step": 5530 }, { "entropy": 0.40575386881828307, "epoch": 0.8836206896551724, "grad_norm": 1.6087018251419067, "learning_rate": 4.928244344080977e-06, "loss": 0.3725, "mean_token_accuracy": 0.8704880237579345, "num_tokens": 45338777.0, "step": 5535 }, { "entropy": 0.3431255519390106, "epoch": 0.8844189016602809, "grad_norm": 1.4994218349456787, "learning_rate": 4.928115457539941e-06, "loss": 0.298, "mean_token_accuracy": 0.8945744037628174, "num_tokens": 45379737.0, "step": 5540 }, { "entropy": 0.3710679769515991, "epoch": 0.8852171136653896, "grad_norm": 1.5049368143081665, "learning_rate": 4.9279864576128135e-06, "loss": 0.3292, "mean_token_accuracy": 0.8831719040870667, "num_tokens": 45420697.0, "step": 5545 }, { "entropy": 0.3995101869106293, "epoch": 0.8860153256704981, "grad_norm": 1.692454218864441, "learning_rate": 4.927857344307704e-06, "loss": 0.3483, "mean_token_accuracy": 0.8794248223304748, "num_tokens": 45461657.0, "step": 5550 }, { "entropy": 0.4114351987838745, "epoch": 0.8868135376756067, "grad_norm": 1.5567283630371094, "learning_rate": 4.927728117632733e-06, "loss": 0.3646, "mean_token_accuracy": 0.873246169090271, "num_tokens": 45502617.0, "step": 5555 }, { "entropy": 0.40230279564857485, "epoch": 0.8876117496807152, "grad_norm": 1.5717520713806152, "learning_rate": 4.927598777596027e-06, "loss": 0.3585, "mean_token_accuracy": 0.8731892108917236, "num_tokens": 45543577.0, "step": 5560 }, { "entropy": 0.38769946098327634, "epoch": 0.8884099616858238, "grad_norm": 1.5756887197494507, "learning_rate": 4.927469324205719e-06, "loss": 0.3455, "mean_token_accuracy": 0.8776480913162231, "num_tokens": 45584537.0, "step": 5565 }, { "entropy": 0.3639233231544495, "epoch": 0.8892081736909323, "grad_norm": 1.5897159576416016, "learning_rate": 4.927339757469949e-06, "loss": 0.3238, "mean_token_accuracy": 0.8838191628456116, "num_tokens": 45625497.0, "step": 5570 }, { "entropy": 0.3755005121231079, "epoch": 0.8900063856960408, "grad_norm": 1.4302053451538086, "learning_rate": 4.927210077396864e-06, "loss": 0.3343, "mean_token_accuracy": 0.883170211315155, "num_tokens": 45666457.0, "step": 5575 }, { "entropy": 0.36951016783714297, "epoch": 0.8908045977011494, "grad_norm": 1.571903109550476, "learning_rate": 4.9270802839946195e-06, "loss": 0.3204, "mean_token_accuracy": 0.8858746647834778, "num_tokens": 45707417.0, "step": 5580 }, { "entropy": 0.3636231780052185, "epoch": 0.8916028097062579, "grad_norm": 1.8068180084228516, "learning_rate": 4.926950377271379e-06, "loss": 0.32, "mean_token_accuracy": 0.8879209876060485, "num_tokens": 45748377.0, "step": 5585 }, { "entropy": 0.3747856080532074, "epoch": 0.8924010217113666, "grad_norm": 1.4714301824569702, "learning_rate": 4.926820357235309e-06, "loss": 0.3325, "mean_token_accuracy": 0.8826592564582825, "num_tokens": 45789337.0, "step": 5590 }, { "entropy": 0.35584996342659, "epoch": 0.8931992337164751, "grad_norm": 1.4386074542999268, "learning_rate": 4.926690223894587e-06, "loss": 0.3126, "mean_token_accuracy": 0.8903956413269043, "num_tokens": 45830297.0, "step": 5595 }, { "entropy": 0.3578944027423859, "epoch": 0.8939974457215837, "grad_norm": 1.4989902973175049, "learning_rate": 4.926559977257395e-06, "loss": 0.3182, "mean_token_accuracy": 0.8872125625610352, "num_tokens": 45871257.0, "step": 5600 }, { "epoch": 0.8939974457215837, "eval_entropy": 0.3874563302993774, "eval_loss": 0.3424855172634125, "eval_mean_token_accuracy": 0.8790763306617737, "eval_num_tokens": 45871257.0, "eval_runtime": 69.221, "eval_samples_per_second": 14.446, "eval_steps_per_second": 1.806, "step": 5600 }, { "entropy": 0.4076481223106384, "epoch": 0.8947956577266922, "grad_norm": 1.4201501607894897, "learning_rate": 4.9264296173319236e-06, "loss": 0.3624, "mean_token_accuracy": 0.8703855156898499, "num_tokens": 45912217.0, "step": 5605 }, { "entropy": 0.384282773733139, "epoch": 0.8955938697318008, "grad_norm": 1.6850939989089966, "learning_rate": 4.926299144126372e-06, "loss": 0.3461, "mean_token_accuracy": 0.8778869271278381, "num_tokens": 45953177.0, "step": 5610 }, { "entropy": 0.4003660798072815, "epoch": 0.8963920817369093, "grad_norm": 1.6248348951339722, "learning_rate": 4.926168557648943e-06, "loss": 0.3544, "mean_token_accuracy": 0.875260329246521, "num_tokens": 45994137.0, "step": 5615 }, { "entropy": 0.3878970563411713, "epoch": 0.8971902937420179, "grad_norm": 1.6481446027755737, "learning_rate": 4.926037857907849e-06, "loss": 0.3447, "mean_token_accuracy": 0.8781666278839111, "num_tokens": 46035097.0, "step": 5620 }, { "entropy": 0.37414196133613586, "epoch": 0.8979885057471264, "grad_norm": 1.3256474733352661, "learning_rate": 4.92590704491131e-06, "loss": 0.3294, "mean_token_accuracy": 0.884190309047699, "num_tokens": 46076057.0, "step": 5625 }, { "entropy": 0.39382901787757874, "epoch": 0.898786717752235, "grad_norm": 1.5669851303100586, "learning_rate": 4.925776118667549e-06, "loss": 0.3508, "mean_token_accuracy": 0.875950026512146, "num_tokens": 46117017.0, "step": 5630 }, { "entropy": 0.3600032448768616, "epoch": 0.8995849297573435, "grad_norm": 1.3882217407226562, "learning_rate": 4.925645079184802e-06, "loss": 0.3222, "mean_token_accuracy": 0.8872683763504028, "num_tokens": 46157977.0, "step": 5635 }, { "entropy": 0.3779283821582794, "epoch": 0.9003831417624522, "grad_norm": 1.4759502410888672, "learning_rate": 4.925513926471307e-06, "loss": 0.3266, "mean_token_accuracy": 0.885478389263153, "num_tokens": 46198937.0, "step": 5640 }, { "entropy": 0.3804099500179291, "epoch": 0.9011813537675607, "grad_norm": 1.38573157787323, "learning_rate": 4.9253826605353135e-06, "loss": 0.325, "mean_token_accuracy": 0.8853496789932251, "num_tokens": 46239897.0, "step": 5645 }, { "entropy": 0.3461835443973541, "epoch": 0.9019795657726692, "grad_norm": 1.3838683366775513, "learning_rate": 4.925251281385074e-06, "loss": 0.3013, "mean_token_accuracy": 0.8930078864097595, "num_tokens": 46280857.0, "step": 5650 }, { "entropy": 0.3484264135360718, "epoch": 0.9027777777777778, "grad_norm": 1.5145350694656372, "learning_rate": 4.925119789028852e-06, "loss": 0.3024, "mean_token_accuracy": 0.8926341891288757, "num_tokens": 46321817.0, "step": 5655 }, { "entropy": 0.37027230858802795, "epoch": 0.9035759897828863, "grad_norm": 1.5724906921386719, "learning_rate": 4.924988183474915e-06, "loss": 0.3232, "mean_token_accuracy": 0.8843621373176574, "num_tokens": 46362777.0, "step": 5660 }, { "entropy": 0.3780488550662994, "epoch": 0.9043742017879949, "grad_norm": 1.6730051040649414, "learning_rate": 4.924856464731538e-06, "loss": 0.3351, "mean_token_accuracy": 0.8800442934036254, "num_tokens": 46403737.0, "step": 5665 }, { "entropy": 0.3991661310195923, "epoch": 0.9051724137931034, "grad_norm": 1.5829823017120361, "learning_rate": 4.924724632807005e-06, "loss": 0.3605, "mean_token_accuracy": 0.8738561511039734, "num_tokens": 46444697.0, "step": 5670 }, { "entropy": 0.38700241446495054, "epoch": 0.905970625798212, "grad_norm": 1.5838910341262817, "learning_rate": 4.924592687709606e-06, "loss": 0.3468, "mean_token_accuracy": 0.8770587086677551, "num_tokens": 46485657.0, "step": 5675 }, { "entropy": 0.39548492431640625, "epoch": 0.9067688378033205, "grad_norm": 1.6779003143310547, "learning_rate": 4.9244606294476385e-06, "loss": 0.3574, "mean_token_accuracy": 0.8760750770568848, "num_tokens": 46526617.0, "step": 5680 }, { "entropy": 0.37442941069602964, "epoch": 0.9075670498084292, "grad_norm": 1.4389050006866455, "learning_rate": 4.924328458029406e-06, "loss": 0.331, "mean_token_accuracy": 0.8821128606796265, "num_tokens": 46567577.0, "step": 5685 }, { "entropy": 0.3987495958805084, "epoch": 0.9083652618135377, "grad_norm": 1.6939518451690674, "learning_rate": 4.924196173463219e-06, "loss": 0.3495, "mean_token_accuracy": 0.8778408288955688, "num_tokens": 46608537.0, "step": 5690 }, { "entropy": 0.36648149490356446, "epoch": 0.9091634738186463, "grad_norm": 1.3086787462234497, "learning_rate": 4.924063775757399e-06, "loss": 0.318, "mean_token_accuracy": 0.8856496095657349, "num_tokens": 46649497.0, "step": 5695 }, { "entropy": 0.3645837366580963, "epoch": 0.9099616858237548, "grad_norm": 1.6085964441299438, "learning_rate": 4.9239312649202694e-06, "loss": 0.3154, "mean_token_accuracy": 0.8886914491653443, "num_tokens": 46690457.0, "step": 5700 }, { "entropy": 0.40077937245368955, "epoch": 0.9107598978288634, "grad_norm": 1.790521502494812, "learning_rate": 4.923798640960163e-06, "loss": 0.3631, "mean_token_accuracy": 0.871863043308258, "num_tokens": 46731417.0, "step": 5705 }, { "entropy": 0.3492118358612061, "epoch": 0.9115581098339719, "grad_norm": 1.5939642190933228, "learning_rate": 4.92366590388542e-06, "loss": 0.3172, "mean_token_accuracy": 0.8899153947830201, "num_tokens": 46772377.0, "step": 5710 }, { "entropy": 0.3977007269859314, "epoch": 0.9123563218390804, "grad_norm": 1.5747590065002441, "learning_rate": 4.923533053704388e-06, "loss": 0.3564, "mean_token_accuracy": 0.8730810046195984, "num_tokens": 46813337.0, "step": 5715 }, { "entropy": 0.35613282918930056, "epoch": 0.913154533844189, "grad_norm": 1.5095136165618896, "learning_rate": 4.92340009042542e-06, "loss": 0.3125, "mean_token_accuracy": 0.8895549297332763, "num_tokens": 46854297.0, "step": 5720 }, { "entropy": 0.3774961352348328, "epoch": 0.9139527458492975, "grad_norm": 1.7125418186187744, "learning_rate": 4.923267014056878e-06, "loss": 0.3338, "mean_token_accuracy": 0.8814448356628418, "num_tokens": 46895257.0, "step": 5725 }, { "entropy": 0.44346152544021605, "epoch": 0.9147509578544061, "grad_norm": 1.4897512197494507, "learning_rate": 4.92313382460713e-06, "loss": 0.3973, "mean_token_accuracy": 0.8611808538436889, "num_tokens": 46936217.0, "step": 5730 }, { "entropy": 0.3868241310119629, "epoch": 0.9155491698595147, "grad_norm": 1.4248846769332886, "learning_rate": 4.923000522084551e-06, "loss": 0.3361, "mean_token_accuracy": 0.8827209115028382, "num_tokens": 46977177.0, "step": 5735 }, { "entropy": 0.38528451323509216, "epoch": 0.9163473818646233, "grad_norm": 1.7550557851791382, "learning_rate": 4.922867106497524e-06, "loss": 0.3377, "mean_token_accuracy": 0.8812140226364136, "num_tokens": 47018137.0, "step": 5740 }, { "entropy": 0.357247930765152, "epoch": 0.9171455938697318, "grad_norm": 1.3566182851791382, "learning_rate": 4.922733577854438e-06, "loss": 0.3089, "mean_token_accuracy": 0.8896707653999328, "num_tokens": 47059097.0, "step": 5745 }, { "entropy": 0.344160258769989, "epoch": 0.9179438058748404, "grad_norm": 1.5768409967422485, "learning_rate": 4.9225999361636915e-06, "loss": 0.3008, "mean_token_accuracy": 0.8920184731483459, "num_tokens": 47100057.0, "step": 5750 }, { "entropy": 0.35273231863975524, "epoch": 0.9187420178799489, "grad_norm": 1.4575929641723633, "learning_rate": 4.922466181433686e-06, "loss": 0.3069, "mean_token_accuracy": 0.8913999795913696, "num_tokens": 47141017.0, "step": 5755 }, { "entropy": 0.3938547372817993, "epoch": 0.9195402298850575, "grad_norm": 1.5926954746246338, "learning_rate": 4.922332313672834e-06, "loss": 0.3529, "mean_token_accuracy": 0.8751298427581787, "num_tokens": 47181977.0, "step": 5760 }, { "entropy": 0.40632343888282774, "epoch": 0.920338441890166, "grad_norm": 1.456726312637329, "learning_rate": 4.922198332889553e-06, "loss": 0.345, "mean_token_accuracy": 0.876978600025177, "num_tokens": 47222937.0, "step": 5765 }, { "entropy": 0.3740855813026428, "epoch": 0.9211366538952745, "grad_norm": 1.4371447563171387, "learning_rate": 4.922064239092269e-06, "loss": 0.3203, "mean_token_accuracy": 0.885735023021698, "num_tokens": 47263897.0, "step": 5770 }, { "entropy": 0.3803277492523193, "epoch": 0.9219348659003831, "grad_norm": 1.611755609512329, "learning_rate": 4.9219300322894125e-06, "loss": 0.3428, "mean_token_accuracy": 0.8782493829727173, "num_tokens": 47304857.0, "step": 5775 }, { "entropy": 0.34562104344367983, "epoch": 0.9227330779054917, "grad_norm": 1.4883754253387451, "learning_rate": 4.921795712489425e-06, "loss": 0.2992, "mean_token_accuracy": 0.8944500207901, "num_tokens": 47345817.0, "step": 5780 }, { "entropy": 0.3530261814594269, "epoch": 0.9235312899106003, "grad_norm": 1.4684337377548218, "learning_rate": 4.921661279700751e-06, "loss": 0.3091, "mean_token_accuracy": 0.8889976978302002, "num_tokens": 47386777.0, "step": 5785 }, { "entropy": 0.368403947353363, "epoch": 0.9243295019157088, "grad_norm": 1.5497897863388062, "learning_rate": 4.921526733931846e-06, "loss": 0.3298, "mean_token_accuracy": 0.8838266372680664, "num_tokens": 47427737.0, "step": 5790 }, { "entropy": 0.3576655864715576, "epoch": 0.9251277139208174, "grad_norm": 1.334320306777954, "learning_rate": 4.9213920751911696e-06, "loss": 0.3172, "mean_token_accuracy": 0.8881598234176635, "num_tokens": 47468543.0, "step": 5795 }, { "entropy": 0.35151341557502747, "epoch": 0.9259259259259259, "grad_norm": 1.5386505126953125, "learning_rate": 4.921257303487189e-06, "loss": 0.3055, "mean_token_accuracy": 0.8897369027137756, "num_tokens": 47509503.0, "step": 5800 }, { "epoch": 0.9259259259259259, "eval_entropy": 0.39153709745407106, "eval_loss": 0.3416091501712799, "eval_mean_token_accuracy": 0.87915345287323, "eval_num_tokens": 47509503.0, "eval_runtime": 69.2155, "eval_samples_per_second": 14.448, "eval_steps_per_second": 1.806, "step": 5800 }, { "entropy": 0.3985960602760315, "epoch": 0.9267241379310345, "grad_norm": 1.565441370010376, "learning_rate": 4.9211224188283804e-06, "loss": 0.3486, "mean_token_accuracy": 0.8771282076835633, "num_tokens": 47550463.0, "step": 5805 }, { "entropy": 0.3795722007751465, "epoch": 0.927522349936143, "grad_norm": 1.5687583684921265, "learning_rate": 4.9209874212232245e-06, "loss": 0.3338, "mean_token_accuracy": 0.8816348910331726, "num_tokens": 47591423.0, "step": 5810 }, { "entropy": 0.3920009911060333, "epoch": 0.9283205619412516, "grad_norm": 1.607857346534729, "learning_rate": 4.920852310680212e-06, "loss": 0.3595, "mean_token_accuracy": 0.8741375207901001, "num_tokens": 47632383.0, "step": 5815 }, { "entropy": 0.39364256858825686, "epoch": 0.9291187739463601, "grad_norm": 1.6685956716537476, "learning_rate": 4.920717087207838e-06, "loss": 0.341, "mean_token_accuracy": 0.8809046626091004, "num_tokens": 47673343.0, "step": 5820 }, { "entropy": 0.3499061703681946, "epoch": 0.9299169859514687, "grad_norm": 1.3277673721313477, "learning_rate": 4.920581750814606e-06, "loss": 0.3022, "mean_token_accuracy": 0.8917477488517761, "num_tokens": 47714303.0, "step": 5825 }, { "entropy": 0.38631321787834166, "epoch": 0.9307151979565773, "grad_norm": 1.5384690761566162, "learning_rate": 4.9204463015090275e-06, "loss": 0.3461, "mean_token_accuracy": 0.8766877293586731, "num_tokens": 47755263.0, "step": 5830 }, { "entropy": 0.37582285404205323, "epoch": 0.9315134099616859, "grad_norm": 1.4166748523712158, "learning_rate": 4.920310739299619e-06, "loss": 0.3324, "mean_token_accuracy": 0.8838165760040283, "num_tokens": 47796223.0, "step": 5835 }, { "entropy": 0.3942254841327667, "epoch": 0.9323116219667944, "grad_norm": 1.6179897785186768, "learning_rate": 4.920175064194904e-06, "loss": 0.354, "mean_token_accuracy": 0.8769045948982239, "num_tokens": 47837183.0, "step": 5840 }, { "entropy": 0.3526209771633148, "epoch": 0.933109833971903, "grad_norm": 1.6264334917068481, "learning_rate": 4.920039276203416e-06, "loss": 0.3055, "mean_token_accuracy": 0.8918890833854676, "num_tokens": 47878143.0, "step": 5845 }, { "entropy": 0.3778443932533264, "epoch": 0.9339080459770115, "grad_norm": 1.6795600652694702, "learning_rate": 4.919903375333693e-06, "loss": 0.3369, "mean_token_accuracy": 0.8826277613639831, "num_tokens": 47919103.0, "step": 5850 }, { "entropy": 0.3853157699108124, "epoch": 0.93470625798212, "grad_norm": 1.6166398525238037, "learning_rate": 4.919767361594281e-06, "loss": 0.3424, "mean_token_accuracy": 0.8792406916618347, "num_tokens": 47960063.0, "step": 5855 }, { "entropy": 0.4113163113594055, "epoch": 0.9355044699872286, "grad_norm": 1.6133720874786377, "learning_rate": 4.919631234993734e-06, "loss": 0.3678, "mean_token_accuracy": 0.8711008191108703, "num_tokens": 48001023.0, "step": 5860 }, { "entropy": 0.37159796357154845, "epoch": 0.9363026819923371, "grad_norm": 1.6776436567306519, "learning_rate": 4.91949499554061e-06, "loss": 0.3328, "mean_token_accuracy": 0.8851324081420898, "num_tokens": 48041983.0, "step": 5865 }, { "entropy": 0.3699815392494202, "epoch": 0.9371008939974457, "grad_norm": 1.6786829233169556, "learning_rate": 4.919358643243478e-06, "loss": 0.3219, "mean_token_accuracy": 0.8858237981796264, "num_tokens": 48082943.0, "step": 5870 }, { "entropy": 0.3756616234779358, "epoch": 0.9378991060025543, "grad_norm": 1.6404286623001099, "learning_rate": 4.919222178110911e-06, "loss": 0.3302, "mean_token_accuracy": 0.8846615314483642, "num_tokens": 48123903.0, "step": 5875 }, { "entropy": 0.33975174427032473, "epoch": 0.9386973180076629, "grad_norm": 1.3744854927062988, "learning_rate": 4.919085600151493e-06, "loss": 0.2995, "mean_token_accuracy": 0.8948165059089661, "num_tokens": 48164863.0, "step": 5880 }, { "entropy": 0.38145941495895386, "epoch": 0.9394955300127714, "grad_norm": 1.4947644472122192, "learning_rate": 4.91894890937381e-06, "loss": 0.3412, "mean_token_accuracy": 0.8807326078414917, "num_tokens": 48205823.0, "step": 5885 }, { "entropy": 0.361408406496048, "epoch": 0.94029374201788, "grad_norm": 1.532977819442749, "learning_rate": 4.918812105786457e-06, "loss": 0.3105, "mean_token_accuracy": 0.8895909547805786, "num_tokens": 48246783.0, "step": 5890 }, { "entropy": 0.35269999504089355, "epoch": 0.9410919540229885, "grad_norm": 1.704317331314087, "learning_rate": 4.918675189398039e-06, "loss": 0.3234, "mean_token_accuracy": 0.8849400162696839, "num_tokens": 48287743.0, "step": 5895 }, { "entropy": 0.4164234399795532, "epoch": 0.941890166028097, "grad_norm": 1.43415105342865, "learning_rate": 4.918538160217165e-06, "loss": 0.3731, "mean_token_accuracy": 0.8696430563926697, "num_tokens": 48328703.0, "step": 5900 }, { "entropy": 0.40871328115463257, "epoch": 0.9426883780332056, "grad_norm": 1.314746379852295, "learning_rate": 4.91840101825245e-06, "loss": 0.3572, "mean_token_accuracy": 0.8742193460464478, "num_tokens": 48369663.0, "step": 5905 }, { "entropy": 0.37529211640357973, "epoch": 0.9434865900383141, "grad_norm": 1.604024052619934, "learning_rate": 4.918263763512521e-06, "loss": 0.3312, "mean_token_accuracy": 0.8838613867759705, "num_tokens": 48410623.0, "step": 5910 }, { "entropy": 0.3645563364028931, "epoch": 0.9442848020434227, "grad_norm": 1.5141392946243286, "learning_rate": 4.918126396006006e-06, "loss": 0.321, "mean_token_accuracy": 0.885771381855011, "num_tokens": 48451583.0, "step": 5915 }, { "entropy": 0.38649824261665344, "epoch": 0.9450830140485313, "grad_norm": 1.6629687547683716, "learning_rate": 4.917988915741546e-06, "loss": 0.3447, "mean_token_accuracy": 0.8791480541229248, "num_tokens": 48492543.0, "step": 5920 }, { "entropy": 0.34921335577964785, "epoch": 0.9458812260536399, "grad_norm": 1.4933561086654663, "learning_rate": 4.9178513227277845e-06, "loss": 0.303, "mean_token_accuracy": 0.8922902464866638, "num_tokens": 48533503.0, "step": 5925 }, { "entropy": 0.3478983163833618, "epoch": 0.9466794380587484, "grad_norm": 1.5103713274002075, "learning_rate": 4.9177136169733745e-06, "loss": 0.3061, "mean_token_accuracy": 0.890423309803009, "num_tokens": 48574463.0, "step": 5930 }, { "entropy": 0.4012789189815521, "epoch": 0.947477650063857, "grad_norm": 1.7785980701446533, "learning_rate": 4.917575798486975e-06, "loss": 0.3597, "mean_token_accuracy": 0.8741880536079407, "num_tokens": 48615423.0, "step": 5935 }, { "entropy": 0.3763855755329132, "epoch": 0.9482758620689655, "grad_norm": 1.6077227592468262, "learning_rate": 4.9174378672772525e-06, "loss": 0.3293, "mean_token_accuracy": 0.8826831579208374, "num_tokens": 48656383.0, "step": 5940 }, { "entropy": 0.3697520852088928, "epoch": 0.9490740740740741, "grad_norm": 1.4580429792404175, "learning_rate": 4.91729982335288e-06, "loss": 0.3287, "mean_token_accuracy": 0.8852989912033081, "num_tokens": 48697343.0, "step": 5945 }, { "entropy": 0.37058043479919434, "epoch": 0.9498722860791826, "grad_norm": 1.444654107093811, "learning_rate": 4.91716166672254e-06, "loss": 0.3246, "mean_token_accuracy": 0.8842874765396118, "num_tokens": 48738303.0, "step": 5950 }, { "entropy": 0.34524454474449157, "epoch": 0.9506704980842912, "grad_norm": 1.448460578918457, "learning_rate": 4.917023397394919e-06, "loss": 0.2962, "mean_token_accuracy": 0.8934790849685669, "num_tokens": 48779263.0, "step": 5955 }, { "entropy": 0.37682254910469054, "epoch": 0.9514687100893997, "grad_norm": 1.5126111507415771, "learning_rate": 4.916885015378712e-06, "loss": 0.3412, "mean_token_accuracy": 0.8782764315605164, "num_tokens": 48820223.0, "step": 5960 }, { "entropy": 0.3758767068386078, "epoch": 0.9522669220945083, "grad_norm": 1.451427936553955, "learning_rate": 4.9167465206826205e-06, "loss": 0.3394, "mean_token_accuracy": 0.8802876353263855, "num_tokens": 48861183.0, "step": 5965 }, { "entropy": 0.3713228702545166, "epoch": 0.9530651340996169, "grad_norm": 1.577522873878479, "learning_rate": 4.9166079133153545e-06, "loss": 0.3305, "mean_token_accuracy": 0.8837321996688843, "num_tokens": 48902143.0, "step": 5970 }, { "entropy": 0.36461412310600283, "epoch": 0.9538633461047255, "grad_norm": 1.4041016101837158, "learning_rate": 4.916469193285629e-06, "loss": 0.3227, "mean_token_accuracy": 0.8864709377288819, "num_tokens": 48943103.0, "step": 5975 }, { "entropy": 0.3806270360946655, "epoch": 0.954661558109834, "grad_norm": 1.4749987125396729, "learning_rate": 4.916330360602168e-06, "loss": 0.3232, "mean_token_accuracy": 0.8856003403663635, "num_tokens": 48984063.0, "step": 5980 }, { "entropy": 0.3979497790336609, "epoch": 0.9554597701149425, "grad_norm": 1.6750006675720215, "learning_rate": 4.916191415273702e-06, "loss": 0.3476, "mean_token_accuracy": 0.8770854830741882, "num_tokens": 49025023.0, "step": 5985 }, { "entropy": 0.39571131467819215, "epoch": 0.9562579821200511, "grad_norm": 1.5523713827133179, "learning_rate": 4.916052357308968e-06, "loss": 0.3455, "mean_token_accuracy": 0.8779788494110108, "num_tokens": 49065983.0, "step": 5990 }, { "entropy": 0.3467148780822754, "epoch": 0.9570561941251596, "grad_norm": 1.393660306930542, "learning_rate": 4.91591318671671e-06, "loss": 0.2973, "mean_token_accuracy": 0.8936139225959778, "num_tokens": 49106943.0, "step": 5995 }, { "entropy": 0.3784030795097351, "epoch": 0.9578544061302682, "grad_norm": 1.6584866046905518, "learning_rate": 4.91577390350568e-06, "loss": 0.3391, "mean_token_accuracy": 0.8783522844314575, "num_tokens": 49147903.0, "step": 6000 }, { "epoch": 0.9578544061302682, "eval_entropy": 0.38134945607185367, "eval_loss": 0.34127527475357056, "eval_mean_token_accuracy": 0.8793023900985718, "eval_num_tokens": 49147903.0, "eval_runtime": 69.2337, "eval_samples_per_second": 14.444, "eval_steps_per_second": 1.805, "step": 6000 }, { "entropy": 0.3653820753097534, "epoch": 0.9586526181353767, "grad_norm": 1.5888646841049194, "learning_rate": 4.9156345076846355e-06, "loss": 0.3216, "mean_token_accuracy": 0.8857032895088196, "num_tokens": 49188863.0, "step": 6005 }, { "entropy": 0.38212187886238097, "epoch": 0.9594508301404853, "grad_norm": 1.6170986890792847, "learning_rate": 4.915494999262345e-06, "loss": 0.3391, "mean_token_accuracy": 0.878742003440857, "num_tokens": 49229823.0, "step": 6010 }, { "entropy": 0.41390907764434814, "epoch": 0.9602490421455939, "grad_norm": 1.6242729425430298, "learning_rate": 4.9153553782475785e-06, "loss": 0.372, "mean_token_accuracy": 0.870467746257782, "num_tokens": 49270783.0, "step": 6015 }, { "entropy": 0.41197664141654966, "epoch": 0.9610472541507025, "grad_norm": 1.510725498199463, "learning_rate": 4.9152156446491165e-06, "loss": 0.3798, "mean_token_accuracy": 0.8672952890396118, "num_tokens": 49311743.0, "step": 6020 }, { "entropy": 0.3467439472675323, "epoch": 0.961845466155811, "grad_norm": 1.3229261636734009, "learning_rate": 4.9150757984757465e-06, "loss": 0.301, "mean_token_accuracy": 0.893600058555603, "num_tokens": 49352703.0, "step": 6025 }, { "entropy": 0.41660587191581727, "epoch": 0.9626436781609196, "grad_norm": 1.7598029375076294, "learning_rate": 4.9149358397362625e-06, "loss": 0.3625, "mean_token_accuracy": 0.8731543898582459, "num_tokens": 49393663.0, "step": 6030 }, { "entropy": 0.34480465650558473, "epoch": 0.9634418901660281, "grad_norm": 1.4702306985855103, "learning_rate": 4.914795768439465e-06, "loss": 0.3007, "mean_token_accuracy": 0.8925211548805236, "num_tokens": 49434623.0, "step": 6035 }, { "entropy": 0.4240105390548706, "epoch": 0.9642401021711366, "grad_norm": 1.5750335454940796, "learning_rate": 4.914655584594163e-06, "loss": 0.3867, "mean_token_accuracy": 0.8653579473495483, "num_tokens": 49475583.0, "step": 6040 }, { "entropy": 0.3583949863910675, "epoch": 0.9650383141762452, "grad_norm": 1.5241199731826782, "learning_rate": 4.91451528820917e-06, "loss": 0.3261, "mean_token_accuracy": 0.8832314372062683, "num_tokens": 49516543.0, "step": 6045 }, { "entropy": 0.39159261584281924, "epoch": 0.9658365261813537, "grad_norm": 1.463606595993042, "learning_rate": 4.91437487929331e-06, "loss": 0.3373, "mean_token_accuracy": 0.8794732451438904, "num_tokens": 49557503.0, "step": 6050 }, { "entropy": 0.34033394455909727, "epoch": 0.9666347381864623, "grad_norm": 1.3643134832382202, "learning_rate": 4.914234357855413e-06, "loss": 0.2878, "mean_token_accuracy": 0.8985254287719726, "num_tokens": 49598463.0, "step": 6055 }, { "entropy": 0.38294048309326173, "epoch": 0.9674329501915708, "grad_norm": 1.5533177852630615, "learning_rate": 4.914093723904313e-06, "loss": 0.3502, "mean_token_accuracy": 0.8760846972465515, "num_tokens": 49639423.0, "step": 6060 }, { "entropy": 0.38406974673271177, "epoch": 0.9682311621966795, "grad_norm": 1.681125521659851, "learning_rate": 4.913952977448856e-06, "loss": 0.3526, "mean_token_accuracy": 0.8754361867904663, "num_tokens": 49680383.0, "step": 6065 }, { "entropy": 0.3886194467544556, "epoch": 0.969029374201788, "grad_norm": 1.4837889671325684, "learning_rate": 4.9138121184978915e-06, "loss": 0.338, "mean_token_accuracy": 0.8797842979431152, "num_tokens": 49721343.0, "step": 6070 }, { "entropy": 0.3772536039352417, "epoch": 0.9698275862068966, "grad_norm": 1.5158101320266724, "learning_rate": 4.913671147060276e-06, "loss": 0.3335, "mean_token_accuracy": 0.8815980911254883, "num_tokens": 49762303.0, "step": 6075 }, { "entropy": 0.37093993425369265, "epoch": 0.9706257982120051, "grad_norm": 1.58279550075531, "learning_rate": 4.9135300631448765e-06, "loss": 0.3286, "mean_token_accuracy": 0.8813050985336304, "num_tokens": 49803263.0, "step": 6080 }, { "entropy": 0.38539561033248904, "epoch": 0.9714240102171137, "grad_norm": 1.4010246992111206, "learning_rate": 4.913388866760565e-06, "loss": 0.3413, "mean_token_accuracy": 0.8799989581108093, "num_tokens": 49844223.0, "step": 6085 }, { "entropy": 0.37510964274406433, "epoch": 0.9722222222222222, "grad_norm": 1.5841959714889526, "learning_rate": 4.913247557916217e-06, "loss": 0.3314, "mean_token_accuracy": 0.8827460169792175, "num_tokens": 49885183.0, "step": 6090 }, { "entropy": 0.33448782563209534, "epoch": 0.9730204342273308, "grad_norm": 1.6392887830734253, "learning_rate": 4.9131061366207225e-06, "loss": 0.2976, "mean_token_accuracy": 0.8942730307579041, "num_tokens": 49926143.0, "step": 6095 }, { "entropy": 0.3597862184047699, "epoch": 0.9738186462324393, "grad_norm": 1.2924385070800781, "learning_rate": 4.912964602882973e-06, "loss": 0.3148, "mean_token_accuracy": 0.8883277893066406, "num_tokens": 49967103.0, "step": 6100 }, { "entropy": 0.36224318742752076, "epoch": 0.9746168582375478, "grad_norm": 1.5063951015472412, "learning_rate": 4.9128229567118664e-06, "loss": 0.3221, "mean_token_accuracy": 0.8854564070701599, "num_tokens": 50008063.0, "step": 6105 }, { "entropy": 0.3745680212974548, "epoch": 0.9754150702426565, "grad_norm": 1.2758285999298096, "learning_rate": 4.912681198116314e-06, "loss": 0.3343, "mean_token_accuracy": 0.88257395029068, "num_tokens": 50049023.0, "step": 6110 }, { "entropy": 0.3942704081535339, "epoch": 0.976213282247765, "grad_norm": 1.7089884281158447, "learning_rate": 4.912539327105228e-06, "loss": 0.3548, "mean_token_accuracy": 0.8745615720748902, "num_tokens": 50089983.0, "step": 6115 }, { "entropy": 0.3974311351776123, "epoch": 0.9770114942528736, "grad_norm": 1.4831726551055908, "learning_rate": 4.912397343687528e-06, "loss": 0.354, "mean_token_accuracy": 0.8768528819084167, "num_tokens": 50130943.0, "step": 6120 }, { "entropy": 0.40303121209144593, "epoch": 0.9778097062579821, "grad_norm": 1.7240445613861084, "learning_rate": 4.912255247872145e-06, "loss": 0.368, "mean_token_accuracy": 0.8710102081298828, "num_tokens": 50171903.0, "step": 6125 }, { "entropy": 0.41418219804763795, "epoch": 0.9786079182630907, "grad_norm": 1.6801888942718506, "learning_rate": 4.9121130396680146e-06, "loss": 0.3704, "mean_token_accuracy": 0.8712199687957763, "num_tokens": 50212863.0, "step": 6130 }, { "entropy": 0.37163949608802793, "epoch": 0.9794061302681992, "grad_norm": 1.6908286809921265, "learning_rate": 4.911970719084077e-06, "loss": 0.3295, "mean_token_accuracy": 0.8853805422782898, "num_tokens": 50253823.0, "step": 6135 }, { "entropy": 0.4254068911075592, "epoch": 0.9802043422733078, "grad_norm": 1.6032263040542603, "learning_rate": 4.911828286129284e-06, "loss": 0.3765, "mean_token_accuracy": 0.8682022571563721, "num_tokens": 50294783.0, "step": 6140 }, { "entropy": 0.38155118823051454, "epoch": 0.9810025542784163, "grad_norm": 1.4514435529708862, "learning_rate": 4.911685740812592e-06, "loss": 0.3406, "mean_token_accuracy": 0.8800020813941956, "num_tokens": 50335743.0, "step": 6145 }, { "entropy": 0.38690811991691587, "epoch": 0.9818007662835249, "grad_norm": 1.5835672616958618, "learning_rate": 4.911543083142963e-06, "loss": 0.3393, "mean_token_accuracy": 0.882764458656311, "num_tokens": 50376703.0, "step": 6150 }, { "entropy": 0.40217992663383484, "epoch": 0.9825989782886334, "grad_norm": 1.369538426399231, "learning_rate": 4.91140031312937e-06, "loss": 0.3573, "mean_token_accuracy": 0.8751148462295533, "num_tokens": 50417663.0, "step": 6155 }, { "entropy": 0.4109044909477234, "epoch": 0.9833971902937421, "grad_norm": 1.5834952592849731, "learning_rate": 4.911257430780789e-06, "loss": 0.3662, "mean_token_accuracy": 0.8720200419425964, "num_tokens": 50458623.0, "step": 6160 }, { "entropy": 0.3819911301136017, "epoch": 0.9841954022988506, "grad_norm": 1.4536290168762207, "learning_rate": 4.911114436106207e-06, "loss": 0.343, "mean_token_accuracy": 0.8796203851699829, "num_tokens": 50499583.0, "step": 6165 }, { "entropy": 0.38514203429222105, "epoch": 0.9849936143039592, "grad_norm": 1.4666593074798584, "learning_rate": 4.9109713291146134e-06, "loss": 0.3422, "mean_token_accuracy": 0.8806746363639831, "num_tokens": 50540543.0, "step": 6170 }, { "entropy": 0.39998855590820315, "epoch": 0.9857918263090677, "grad_norm": 1.4971364736557007, "learning_rate": 4.910828109815009e-06, "loss": 0.3564, "mean_token_accuracy": 0.8744049787521362, "num_tokens": 50581503.0, "step": 6175 }, { "entropy": 0.3624740481376648, "epoch": 0.9865900383141762, "grad_norm": 1.5488523244857788, "learning_rate": 4.9106847782164e-06, "loss": 0.3136, "mean_token_accuracy": 0.8884861707687378, "num_tokens": 50622463.0, "step": 6180 }, { "entropy": 0.40431431531906126, "epoch": 0.9873882503192848, "grad_norm": 1.4721726179122925, "learning_rate": 4.910541334327798e-06, "loss": 0.363, "mean_token_accuracy": 0.8711833119392395, "num_tokens": 50663423.0, "step": 6185 }, { "entropy": 0.3674971103668213, "epoch": 0.9881864623243933, "grad_norm": 1.5480612516403198, "learning_rate": 4.910397778158226e-06, "loss": 0.3296, "mean_token_accuracy": 0.882783031463623, "num_tokens": 50704383.0, "step": 6190 }, { "entropy": 0.4075448215007782, "epoch": 0.9889846743295019, "grad_norm": 1.640990138053894, "learning_rate": 4.91025410971671e-06, "loss": 0.3717, "mean_token_accuracy": 0.8722677826881409, "num_tokens": 50745343.0, "step": 6195 }, { "entropy": 0.380616694688797, "epoch": 0.9897828863346104, "grad_norm": 1.3555705547332764, "learning_rate": 4.910110329012282e-06, "loss": 0.3366, "mean_token_accuracy": 0.8813807725906372, "num_tokens": 50786303.0, "step": 6200 }, { "epoch": 0.9897828863346104, "eval_entropy": 0.39050144743919374, "eval_loss": 0.3402261435985565, "eval_mean_token_accuracy": 0.8799290285110474, "eval_num_tokens": 50786303.0, "eval_runtime": 69.2249, "eval_samples_per_second": 14.446, "eval_steps_per_second": 1.806, "step": 6200 } ], "logging_steps": 5, "max_steps": 62640, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.1071958288148756e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }