{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.03443437239052022, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 2.413330078125, "epoch": 1.721718619526011e-05, "grad_norm": 0.6972781419754028, "learning_rate": 0.0, "loss": 2.3998, "mean_token_accuracy": 0.4832073478028178, "num_tokens": 152243.0, "step": 1 }, { "entropy": 2.4014892578125, "epoch": 3.443437239052022e-05, "grad_norm": 0.703504204750061, "learning_rate": 1e-08, "loss": 2.3565, "mean_token_accuracy": 0.48620754200965166, "num_tokens": 302755.0, "step": 2 }, { "entropy": 2.3984375, "epoch": 5.165155858578032e-05, "grad_norm": 0.7933295369148254, "learning_rate": 2e-08, "loss": 2.3735, "mean_token_accuracy": 0.4893745076842606, "num_tokens": 430137.0, "step": 3 }, { "entropy": 2.4783935546875, "epoch": 6.886874478104043e-05, "grad_norm": 0.6833076477050781, "learning_rate": 3.0000000000000004e-08, "loss": 2.4365, "mean_token_accuracy": 0.4767027348279953, "num_tokens": 575284.0, "step": 4 }, { "entropy": 2.457275390625, "epoch": 8.608593097630054e-05, "grad_norm": 0.7900882363319397, "learning_rate": 4e-08, "loss": 2.4671, "mean_token_accuracy": 0.47980545135214925, "num_tokens": 714418.0, "step": 5 }, { "entropy": 2.367431640625, "epoch": 0.00010330311717156065, "grad_norm": 0.6312107443809509, "learning_rate": 5.0000000000000004e-08, "loss": 2.3371, "mean_token_accuracy": 0.49468297231942415, "num_tokens": 883298.0, "step": 6 }, { "entropy": 2.45263671875, "epoch": 0.00012052030336682076, "grad_norm": 0.6607802510261536, "learning_rate": 6.000000000000001e-08, "loss": 2.4411, "mean_token_accuracy": 0.4778139302507043, "num_tokens": 1034855.0, "step": 7 }, { "entropy": 2.482421875, "epoch": 0.00013773748956208087, "grad_norm": 0.6800277829170227, "learning_rate": 7e-08, "loss": 2.439, "mean_token_accuracy": 0.4751331675797701, "num_tokens": 1176490.0, "step": 8 }, { "entropy": 2.391357421875, "epoch": 0.000154954675757341, "grad_norm": 0.7818512916564941, "learning_rate": 8e-08, "loss": 2.3771, "mean_token_accuracy": 0.48402665881440043, "num_tokens": 1311861.0, "step": 9 }, { "entropy": 2.39453125, "epoch": 0.00017217186195260108, "grad_norm": 0.7182415127754211, "learning_rate": 9e-08, "loss": 2.3644, "mean_token_accuracy": 0.4823771519586444, "num_tokens": 1454916.0, "step": 10 }, { "entropy": 2.477783203125, "epoch": 0.0001893890481478612, "grad_norm": 0.7052203416824341, "learning_rate": 1.0000000000000001e-07, "loss": 2.4729, "mean_token_accuracy": 0.4706865563057363, "num_tokens": 1587290.0, "step": 11 }, { "entropy": 2.4482421875, "epoch": 0.0002066062343431213, "grad_norm": 0.6999819278717041, "learning_rate": 1.1e-07, "loss": 2.4037, "mean_token_accuracy": 0.47885329788550735, "num_tokens": 1724163.0, "step": 12 }, { "entropy": 2.4818115234375, "epoch": 0.0002238234205383814, "grad_norm": 0.7237532734870911, "learning_rate": 1.2000000000000002e-07, "loss": 2.5056, "mean_token_accuracy": 0.46921027079224586, "num_tokens": 1865666.0, "step": 13 }, { "entropy": 2.392333984375, "epoch": 0.00024104060673364153, "grad_norm": 0.6929540634155273, "learning_rate": 1.3e-07, "loss": 2.3854, "mean_token_accuracy": 0.4831690890714526, "num_tokens": 2020018.0, "step": 14 }, { "entropy": 2.4205322265625, "epoch": 0.00025825779292890165, "grad_norm": 0.6675299406051636, "learning_rate": 1.4e-07, "loss": 2.3398, "mean_token_accuracy": 0.48172238236293197, "num_tokens": 2158990.0, "step": 15 }, { "entropy": 2.385498046875, "epoch": 0.00027547497912416174, "grad_norm": 0.7503660321235657, "learning_rate": 1.5000000000000002e-07, "loss": 2.3352, "mean_token_accuracy": 0.4925760827027261, "num_tokens": 2296401.0, "step": 16 }, { "entropy": 2.4197998046875, "epoch": 0.00029269216531942183, "grad_norm": 0.6798411011695862, "learning_rate": 1.6e-07, "loss": 2.4002, "mean_token_accuracy": 0.4798423429019749, "num_tokens": 2445270.0, "step": 17 }, { "entropy": 2.474609375, "epoch": 0.000309909351514682, "grad_norm": 0.6871540546417236, "learning_rate": 1.7000000000000001e-07, "loss": 2.4938, "mean_token_accuracy": 0.47186582954600453, "num_tokens": 2586867.0, "step": 18 }, { "entropy": 2.50732421875, "epoch": 0.00032712653770994207, "grad_norm": 0.717059314250946, "learning_rate": 1.8e-07, "loss": 2.525, "mean_token_accuracy": 0.46879610791802406, "num_tokens": 2720721.0, "step": 19 }, { "entropy": 2.424560546875, "epoch": 0.00034434372390520216, "grad_norm": 0.6196162700653076, "learning_rate": 1.9e-07, "loss": 2.4176, "mean_token_accuracy": 0.4790610708296299, "num_tokens": 2876790.0, "step": 20 }, { "entropy": 2.4759521484375, "epoch": 0.0003615609101004623, "grad_norm": 0.6419414281845093, "learning_rate": 2.0000000000000002e-07, "loss": 2.4874, "mean_token_accuracy": 0.471744445618242, "num_tokens": 3033198.0, "step": 21 }, { "entropy": 2.383056640625, "epoch": 0.0003787780962957224, "grad_norm": 0.6465677618980408, "learning_rate": 2.1000000000000003e-07, "loss": 2.3503, "mean_token_accuracy": 0.48874560045078397, "num_tokens": 3181718.0, "step": 22 }, { "entropy": 2.3697509765625, "epoch": 0.0003959952824909825, "grad_norm": 0.7614319920539856, "learning_rate": 2.2e-07, "loss": 2.3713, "mean_token_accuracy": 0.48694683285430074, "num_tokens": 3328269.0, "step": 23 }, { "entropy": 2.3336181640625, "epoch": 0.0004132124686862426, "grad_norm": 0.7275253534317017, "learning_rate": 2.3000000000000002e-07, "loss": 2.3167, "mean_token_accuracy": 0.4986211028881371, "num_tokens": 3482113.0, "step": 24 }, { "entropy": 2.44775390625, "epoch": 0.0004304296548815027, "grad_norm": 0.6546138525009155, "learning_rate": 2.4000000000000003e-07, "loss": 2.4172, "mean_token_accuracy": 0.48186044162139297, "num_tokens": 3634079.0, "step": 25 }, { "entropy": 2.495849609375, "epoch": 0.0004476468410767628, "grad_norm": 0.6819922924041748, "learning_rate": 2.5000000000000004e-07, "loss": 2.4707, "mean_token_accuracy": 0.47281224094331264, "num_tokens": 3784902.0, "step": 26 }, { "entropy": 2.490966796875, "epoch": 0.0004648640272720229, "grad_norm": 0.6698691248893738, "learning_rate": 2.6e-07, "loss": 2.4847, "mean_token_accuracy": 0.46762992488220334, "num_tokens": 3923281.0, "step": 27 }, { "entropy": 2.4964599609375, "epoch": 0.00048208121346728306, "grad_norm": 0.6502507925033569, "learning_rate": 2.7e-07, "loss": 2.4646, "mean_token_accuracy": 0.46740976348519325, "num_tokens": 4074956.0, "step": 28 }, { "entropy": 2.4283447265625, "epoch": 0.0004992983996625432, "grad_norm": 0.7344250679016113, "learning_rate": 2.8e-07, "loss": 2.425, "mean_token_accuracy": 0.47232619673013687, "num_tokens": 4218108.0, "step": 29 }, { "entropy": 2.453857421875, "epoch": 0.0005165155858578033, "grad_norm": 0.7299566268920898, "learning_rate": 2.9000000000000003e-07, "loss": 2.4514, "mean_token_accuracy": 0.477730430662632, "num_tokens": 4352208.0, "step": 30 }, { "entropy": 2.39404296875, "epoch": 0.0005337327720530634, "grad_norm": 0.6726910471916199, "learning_rate": 3.0000000000000004e-07, "loss": 2.3704, "mean_token_accuracy": 0.4892354430630803, "num_tokens": 4510862.0, "step": 31 }, { "entropy": 2.37158203125, "epoch": 0.0005509499582483235, "grad_norm": 0.6483345627784729, "learning_rate": 3.1000000000000005e-07, "loss": 2.3378, "mean_token_accuracy": 0.4861680665053427, "num_tokens": 4655812.0, "step": 32 }, { "entropy": 2.4970703125, "epoch": 0.0005681671444435836, "grad_norm": 0.7244667410850525, "learning_rate": 3.2e-07, "loss": 2.4361, "mean_token_accuracy": 0.4727164036594331, "num_tokens": 4796670.0, "step": 33 }, { "entropy": 2.4234619140625, "epoch": 0.0005853843306388437, "grad_norm": 0.6697008013725281, "learning_rate": 3.3e-07, "loss": 2.4176, "mean_token_accuracy": 0.48080282052978873, "num_tokens": 4946476.0, "step": 34 }, { "entropy": 2.4354248046875, "epoch": 0.0006026015168341038, "grad_norm": 0.6681280732154846, "learning_rate": 3.4000000000000003e-07, "loss": 2.3967, "mean_token_accuracy": 0.48242951929569244, "num_tokens": 5097062.0, "step": 35 }, { "entropy": 2.4337158203125, "epoch": 0.000619818703029364, "grad_norm": 0.6784984469413757, "learning_rate": 3.5000000000000004e-07, "loss": 2.4061, "mean_token_accuracy": 0.4787406297400594, "num_tokens": 5241121.0, "step": 36 }, { "entropy": 2.3486328125, "epoch": 0.000637035889224624, "grad_norm": 0.7190965414047241, "learning_rate": 3.6e-07, "loss": 2.3033, "mean_token_accuracy": 0.4977131159976125, "num_tokens": 5379636.0, "step": 37 }, { "entropy": 2.400390625, "epoch": 0.0006542530754198841, "grad_norm": 0.7516965866088867, "learning_rate": 3.7e-07, "loss": 2.4019, "mean_token_accuracy": 0.48473000153899193, "num_tokens": 5513259.0, "step": 38 }, { "entropy": 2.4234619140625, "epoch": 0.0006714702616151442, "grad_norm": 0.6816972494125366, "learning_rate": 3.8e-07, "loss": 2.4078, "mean_token_accuracy": 0.4816413172520697, "num_tokens": 5657625.0, "step": 39 }, { "entropy": 2.4327392578125, "epoch": 0.0006886874478104043, "grad_norm": 0.7355881929397583, "learning_rate": 3.9e-07, "loss": 2.4284, "mean_token_accuracy": 0.4844762939028442, "num_tokens": 5801203.0, "step": 40 }, { "entropy": 2.4520263671875, "epoch": 0.0007059046340056644, "grad_norm": 0.6229036450386047, "learning_rate": 4.0000000000000003e-07, "loss": 2.4646, "mean_token_accuracy": 0.47269141068682075, "num_tokens": 5953949.0, "step": 41 }, { "entropy": 2.4000244140625, "epoch": 0.0007231218202009246, "grad_norm": 1.3853771686553955, "learning_rate": 4.1000000000000004e-07, "loss": 2.415, "mean_token_accuracy": 0.48673709062859416, "num_tokens": 6124847.0, "step": 42 }, { "entropy": 2.443603515625, "epoch": 0.0007403390063961847, "grad_norm": 0.6690321564674377, "learning_rate": 4.2000000000000006e-07, "loss": 2.4083, "mean_token_accuracy": 0.4795906525105238, "num_tokens": 6277763.0, "step": 43 }, { "entropy": 2.38916015625, "epoch": 0.0007575561925914448, "grad_norm": 0.6857286691665649, "learning_rate": 4.3e-07, "loss": 2.3703, "mean_token_accuracy": 0.4900118997320533, "num_tokens": 6426281.0, "step": 44 }, { "entropy": 2.4334716796875, "epoch": 0.0007747733787867049, "grad_norm": 0.7215378284454346, "learning_rate": 4.4e-07, "loss": 2.4171, "mean_token_accuracy": 0.48149433452636003, "num_tokens": 6569929.0, "step": 45 }, { "entropy": 2.4718017578125, "epoch": 0.000791990564981965, "grad_norm": 0.6353131532669067, "learning_rate": 4.5000000000000003e-07, "loss": 2.4404, "mean_token_accuracy": 0.4716165652498603, "num_tokens": 6715254.0, "step": 46 }, { "entropy": 2.4588623046875, "epoch": 0.0008092077511772251, "grad_norm": 0.7047929763793945, "learning_rate": 4.6000000000000004e-07, "loss": 2.4363, "mean_token_accuracy": 0.47845502756536007, "num_tokens": 6858114.0, "step": 47 }, { "entropy": 2.4326171875, "epoch": 0.0008264249373724852, "grad_norm": 0.6554984450340271, "learning_rate": 4.7000000000000005e-07, "loss": 2.4103, "mean_token_accuracy": 0.48302431078627706, "num_tokens": 7003888.0, "step": 48 }, { "entropy": 2.424072265625, "epoch": 0.0008436421235677454, "grad_norm": 0.615967333316803, "learning_rate": 4.800000000000001e-07, "loss": 2.3426, "mean_token_accuracy": 0.48605213360860944, "num_tokens": 7178232.0, "step": 49 }, { "entropy": 2.3641357421875, "epoch": 0.0008608593097630055, "grad_norm": 0.7089155316352844, "learning_rate": 4.900000000000001e-07, "loss": 2.3231, "mean_token_accuracy": 0.49523815233260393, "num_tokens": 7323317.0, "step": 50 }, { "entropy": 2.355224609375, "epoch": 0.0008780764959582655, "grad_norm": 0.6622390151023865, "learning_rate": 5.000000000000001e-07, "loss": 2.328, "mean_token_accuracy": 0.49393986631184816, "num_tokens": 7473982.0, "step": 51 }, { "entropy": 2.41357421875, "epoch": 0.0008952936821535256, "grad_norm": 0.7162524461746216, "learning_rate": 5.1e-07, "loss": 2.3915, "mean_token_accuracy": 0.4822638025507331, "num_tokens": 7618445.0, "step": 52 }, { "entropy": 2.4310302734375, "epoch": 0.0009125108683487857, "grad_norm": 0.7647550106048584, "learning_rate": 5.2e-07, "loss": 2.3974, "mean_token_accuracy": 0.4842324573546648, "num_tokens": 7759148.0, "step": 53 }, { "entropy": 2.4063720703125, "epoch": 0.0009297280545440458, "grad_norm": 0.6380776166915894, "learning_rate": 5.3e-07, "loss": 2.4044, "mean_token_accuracy": 0.48621372459456325, "num_tokens": 7909476.0, "step": 54 }, { "entropy": 2.40234375, "epoch": 0.000946945240739306, "grad_norm": 0.6949607729911804, "learning_rate": 5.4e-07, "loss": 2.3508, "mean_token_accuracy": 0.48716961592435837, "num_tokens": 8052542.0, "step": 55 }, { "entropy": 2.361328125, "epoch": 0.0009641624269345661, "grad_norm": 0.6398380398750305, "learning_rate": 5.5e-07, "loss": 2.298, "mean_token_accuracy": 0.49046063888818026, "num_tokens": 8198359.0, "step": 56 }, { "entropy": 2.424560546875, "epoch": 0.000981379613129826, "grad_norm": 0.6661210060119629, "learning_rate": 5.6e-07, "loss": 2.3812, "mean_token_accuracy": 0.4842396741732955, "num_tokens": 8350561.0, "step": 57 }, { "entropy": 2.42333984375, "epoch": 0.0009985967993250864, "grad_norm": 0.6568598747253418, "learning_rate": 5.7e-07, "loss": 2.386, "mean_token_accuracy": 0.4811908514238894, "num_tokens": 8498421.0, "step": 58 }, { "entropy": 2.4390869140625, "epoch": 0.0010158139855203465, "grad_norm": 0.6521676778793335, "learning_rate": 5.800000000000001e-07, "loss": 2.4007, "mean_token_accuracy": 0.47653205832466483, "num_tokens": 8645838.0, "step": 59 }, { "entropy": 2.4642333984375, "epoch": 0.0010330311717156066, "grad_norm": 0.6818094849586487, "learning_rate": 5.900000000000001e-07, "loss": 2.4183, "mean_token_accuracy": 0.47624633833765984, "num_tokens": 8781213.0, "step": 60 }, { "entropy": 2.4544677734375, "epoch": 0.0010502483579108667, "grad_norm": 0.7099837064743042, "learning_rate": 6.000000000000001e-07, "loss": 2.429, "mean_token_accuracy": 0.4788547111675143, "num_tokens": 8932181.0, "step": 61 }, { "entropy": 2.475830078125, "epoch": 0.0010674655441061268, "grad_norm": 0.6400408744812012, "learning_rate": 6.100000000000001e-07, "loss": 2.452, "mean_token_accuracy": 0.4709730679169297, "num_tokens": 9069468.0, "step": 62 }, { "entropy": 2.4625244140625, "epoch": 0.0010846827303013869, "grad_norm": 0.6099865436553955, "learning_rate": 6.200000000000001e-07, "loss": 2.4152, "mean_token_accuracy": 0.4845339651219547, "num_tokens": 9238211.0, "step": 63 }, { "entropy": 2.4390869140625, "epoch": 0.001101899916496647, "grad_norm": 0.6318409442901611, "learning_rate": 6.3e-07, "loss": 2.402, "mean_token_accuracy": 0.4825730072334409, "num_tokens": 9393830.0, "step": 64 }, { "entropy": 2.3837890625, "epoch": 0.001119117102691907, "grad_norm": 0.6888275742530823, "learning_rate": 6.4e-07, "loss": 2.3289, "mean_token_accuracy": 0.4893337092362344, "num_tokens": 9536691.0, "step": 65 }, { "entropy": 2.476806640625, "epoch": 0.0011363342888871671, "grad_norm": 0.6647155284881592, "learning_rate": 6.5e-07, "loss": 2.4071, "mean_token_accuracy": 0.47626868123188615, "num_tokens": 9672776.0, "step": 66 }, { "entropy": 2.4517822265625, "epoch": 0.0011535514750824272, "grad_norm": 0.6950869560241699, "learning_rate": 6.6e-07, "loss": 2.4153, "mean_token_accuracy": 0.4799360786564648, "num_tokens": 9823719.0, "step": 67 }, { "entropy": 2.4583740234375, "epoch": 0.0011707686612776873, "grad_norm": 0.5642852783203125, "learning_rate": 6.7e-07, "loss": 2.4354, "mean_token_accuracy": 0.47825055569410324, "num_tokens": 9979166.0, "step": 68 }, { "entropy": 2.503173828125, "epoch": 0.0011879858474729474, "grad_norm": 0.6830074787139893, "learning_rate": 6.800000000000001e-07, "loss": 2.4759, "mean_token_accuracy": 0.46751530282199383, "num_tokens": 10123771.0, "step": 69 }, { "entropy": 2.4609375, "epoch": 0.0012052030336682075, "grad_norm": 0.611301600933075, "learning_rate": 6.900000000000001e-07, "loss": 2.3415, "mean_token_accuracy": 0.4828020860441029, "num_tokens": 10266978.0, "step": 70 }, { "entropy": 2.393798828125, "epoch": 0.0012224202198634678, "grad_norm": 0.6329925060272217, "learning_rate": 7.000000000000001e-07, "loss": 2.3527, "mean_token_accuracy": 0.4885860946960747, "num_tokens": 10411005.0, "step": 71 }, { "entropy": 2.4456787109375, "epoch": 0.001239637406058728, "grad_norm": 0.6201856732368469, "learning_rate": 7.1e-07, "loss": 2.3725, "mean_token_accuracy": 0.4878034754656255, "num_tokens": 10560620.0, "step": 72 }, { "entropy": 2.50439453125, "epoch": 0.001256854592253988, "grad_norm": 0.6518511772155762, "learning_rate": 7.2e-07, "loss": 2.4772, "mean_token_accuracy": 0.47327897092327476, "num_tokens": 10692258.0, "step": 73 }, { "entropy": 2.5009765625, "epoch": 0.001274071778449248, "grad_norm": 0.6756422519683838, "learning_rate": 7.3e-07, "loss": 2.5074, "mean_token_accuracy": 0.4726545801386237, "num_tokens": 10841410.0, "step": 74 }, { "entropy": 2.47998046875, "epoch": 0.0012912889646445082, "grad_norm": 0.5928777456283569, "learning_rate": 7.4e-07, "loss": 2.4521, "mean_token_accuracy": 0.4727823534049094, "num_tokens": 10996060.0, "step": 75 }, { "entropy": 2.499755859375, "epoch": 0.0013085061508397683, "grad_norm": 0.6060748100280762, "learning_rate": 7.5e-07, "loss": 2.4569, "mean_token_accuracy": 0.4745932733640075, "num_tokens": 11146252.0, "step": 76 }, { "entropy": 2.429443359375, "epoch": 0.0013257233370350284, "grad_norm": 0.633307933807373, "learning_rate": 7.6e-07, "loss": 2.3987, "mean_token_accuracy": 0.4819001527503133, "num_tokens": 11287713.0, "step": 77 }, { "entropy": 2.38232421875, "epoch": 0.0013429405232302885, "grad_norm": 0.6448901295661926, "learning_rate": 7.7e-07, "loss": 2.3849, "mean_token_accuracy": 0.4902081396430731, "num_tokens": 11427731.0, "step": 78 }, { "entropy": 2.4320068359375, "epoch": 0.0013601577094255485, "grad_norm": 0.7015244364738464, "learning_rate": 7.8e-07, "loss": 2.3898, "mean_token_accuracy": 0.48666849778965116, "num_tokens": 11570107.0, "step": 79 }, { "entropy": 2.47998046875, "epoch": 0.0013773748956208086, "grad_norm": 0.6528029441833496, "learning_rate": 7.900000000000001e-07, "loss": 2.4978, "mean_token_accuracy": 0.4753390052355826, "num_tokens": 11704020.0, "step": 80 }, { "entropy": 2.433837890625, "epoch": 0.0013945920818160687, "grad_norm": 0.7266194820404053, "learning_rate": 8.000000000000001e-07, "loss": 2.3828, "mean_token_accuracy": 0.479493273422122, "num_tokens": 11849459.0, "step": 81 }, { "entropy": 2.4395751953125, "epoch": 0.0014118092680113288, "grad_norm": 0.6236125230789185, "learning_rate": 8.100000000000001e-07, "loss": 2.3505, "mean_token_accuracy": 0.48662899900227785, "num_tokens": 12010828.0, "step": 82 }, { "entropy": 2.475830078125, "epoch": 0.001429026454206589, "grad_norm": 0.7149572968482971, "learning_rate": 8.200000000000001e-07, "loss": 2.4566, "mean_token_accuracy": 0.47693332051858306, "num_tokens": 12157399.0, "step": 83 }, { "entropy": 2.438720703125, "epoch": 0.0014462436404018492, "grad_norm": 0.6647018790245056, "learning_rate": 8.300000000000001e-07, "loss": 2.3986, "mean_token_accuracy": 0.4824375621974468, "num_tokens": 12287842.0, "step": 84 }, { "entropy": 2.41455078125, "epoch": 0.0014634608265971093, "grad_norm": 0.6005454063415527, "learning_rate": 8.400000000000001e-07, "loss": 2.3891, "mean_token_accuracy": 0.4884789031930268, "num_tokens": 12436104.0, "step": 85 }, { "entropy": 2.4703369140625, "epoch": 0.0014806780127923694, "grad_norm": 0.6212813854217529, "learning_rate": 8.500000000000001e-07, "loss": 2.34, "mean_token_accuracy": 0.4844899824820459, "num_tokens": 12575067.0, "step": 86 }, { "entropy": 2.6005859375, "epoch": 0.0014978951989876295, "grad_norm": 0.8042929172515869, "learning_rate": 8.6e-07, "loss": 2.6196, "mean_token_accuracy": 0.4688438312150538, "num_tokens": 12730536.0, "step": 87 }, { "entropy": 2.4884033203125, "epoch": 0.0015151123851828896, "grad_norm": 0.5991901755332947, "learning_rate": 8.7e-07, "loss": 2.4611, "mean_token_accuracy": 0.4766750931739807, "num_tokens": 12882514.0, "step": 88 }, { "entropy": 2.4810791015625, "epoch": 0.0015323295713781497, "grad_norm": 0.6494613289833069, "learning_rate": 8.8e-07, "loss": 2.4435, "mean_token_accuracy": 0.47679867735132575, "num_tokens": 13023006.0, "step": 89 }, { "entropy": 2.4412841796875, "epoch": 0.0015495467575734098, "grad_norm": 0.6427425146102905, "learning_rate": 8.900000000000001e-07, "loss": 2.3905, "mean_token_accuracy": 0.4838052117265761, "num_tokens": 13156996.0, "step": 90 }, { "entropy": 2.394775390625, "epoch": 0.0015667639437686699, "grad_norm": 0.6279881596565247, "learning_rate": 9.000000000000001e-07, "loss": 2.3753, "mean_token_accuracy": 0.48632694967091084, "num_tokens": 13299631.0, "step": 91 }, { "entropy": 2.4176025390625, "epoch": 0.00158398112996393, "grad_norm": 0.6744757294654846, "learning_rate": 9.100000000000001e-07, "loss": 2.3854, "mean_token_accuracy": 0.4846075074747205, "num_tokens": 13440366.0, "step": 92 }, { "entropy": 2.4541015625, "epoch": 0.00160119831615919, "grad_norm": 0.6708775758743286, "learning_rate": 9.200000000000001e-07, "loss": 2.4069, "mean_token_accuracy": 0.481810312718153, "num_tokens": 13566615.0, "step": 93 }, { "entropy": 2.443115234375, "epoch": 0.0016184155023544501, "grad_norm": 0.5979477167129517, "learning_rate": 9.300000000000001e-07, "loss": 2.3846, "mean_token_accuracy": 0.48044557217508554, "num_tokens": 13719522.0, "step": 94 }, { "entropy": 2.45263671875, "epoch": 0.0016356326885497102, "grad_norm": 0.6026092171669006, "learning_rate": 9.400000000000001e-07, "loss": 2.4051, "mean_token_accuracy": 0.4736237172037363, "num_tokens": 13864674.0, "step": 95 }, { "entropy": 2.3526611328125, "epoch": 0.0016528498747449703, "grad_norm": 0.5873211622238159, "learning_rate": 9.500000000000001e-07, "loss": 2.2743, "mean_token_accuracy": 0.49419589480385184, "num_tokens": 14027711.0, "step": 96 }, { "entropy": 2.460693359375, "epoch": 0.0016700670609402306, "grad_norm": 0.5936735272407532, "learning_rate": 9.600000000000001e-07, "loss": 2.3992, "mean_token_accuracy": 0.4806458824314177, "num_tokens": 14172381.0, "step": 97 }, { "entropy": 2.45458984375, "epoch": 0.0016872842471354907, "grad_norm": 1.1826030015945435, "learning_rate": 9.7e-07, "loss": 2.4255, "mean_token_accuracy": 0.4772787392139435, "num_tokens": 14309500.0, "step": 98 }, { "entropy": 2.51806640625, "epoch": 0.0017045014333307508, "grad_norm": 0.6550461053848267, "learning_rate": 9.800000000000001e-07, "loss": 2.4692, "mean_token_accuracy": 0.47163511207327247, "num_tokens": 14449340.0, "step": 99 }, { "entropy": 2.446533203125, "epoch": 0.001721718619526011, "grad_norm": 0.5825348496437073, "learning_rate": 9.9e-07, "loss": 2.4099, "mean_token_accuracy": 0.47821634402498603, "num_tokens": 14600293.0, "step": 100 }, { "entropy": 2.454833984375, "epoch": 0.001738935805721271, "grad_norm": 0.5967729091644287, "learning_rate": 1.0000000000000002e-06, "loss": 2.4471, "mean_token_accuracy": 0.4838520339690149, "num_tokens": 14754154.0, "step": 101 }, { "entropy": 2.470703125, "epoch": 0.001756152991916531, "grad_norm": 0.7432575225830078, "learning_rate": 1.01e-06, "loss": 2.4286, "mean_token_accuracy": 0.48514684848487377, "num_tokens": 14912580.0, "step": 102 }, { "entropy": 2.5068359375, "epoch": 0.0017733701781117912, "grad_norm": 0.6083717942237854, "learning_rate": 1.02e-06, "loss": 2.4618, "mean_token_accuracy": 0.47291735280305147, "num_tokens": 15054619.0, "step": 103 }, { "entropy": 2.37841796875, "epoch": 0.0017905873643070513, "grad_norm": 0.664107084274292, "learning_rate": 1.03e-06, "loss": 2.3513, "mean_token_accuracy": 0.49140653293579817, "num_tokens": 15184259.0, "step": 104 }, { "entropy": 2.408935546875, "epoch": 0.0018078045505023114, "grad_norm": 0.6227862238883972, "learning_rate": 1.04e-06, "loss": 2.3885, "mean_token_accuracy": 0.4819969036616385, "num_tokens": 15328191.0, "step": 105 }, { "entropy": 2.4210205078125, "epoch": 0.0018250217366975715, "grad_norm": 0.5873227119445801, "learning_rate": 1.0500000000000001e-06, "loss": 2.3571, "mean_token_accuracy": 0.4860111135058105, "num_tokens": 15484579.0, "step": 106 }, { "entropy": 2.4600830078125, "epoch": 0.0018422389228928315, "grad_norm": 0.7568690776824951, "learning_rate": 1.06e-06, "loss": 2.4228, "mean_token_accuracy": 0.4871084992773831, "num_tokens": 15621836.0, "step": 107 }, { "entropy": 2.443603515625, "epoch": 0.0018594561090880916, "grad_norm": 0.5804514288902283, "learning_rate": 1.0700000000000001e-06, "loss": 2.4028, "mean_token_accuracy": 0.4824648411013186, "num_tokens": 15778932.0, "step": 108 }, { "entropy": 2.455810546875, "epoch": 0.001876673295283352, "grad_norm": 0.5958721041679382, "learning_rate": 1.08e-06, "loss": 2.3833, "mean_token_accuracy": 0.48080389108508825, "num_tokens": 15924525.0, "step": 109 }, { "entropy": 2.3779296875, "epoch": 0.001893890481478612, "grad_norm": 0.6376572251319885, "learning_rate": 1.0900000000000002e-06, "loss": 2.3314, "mean_token_accuracy": 0.4905442167073488, "num_tokens": 16080275.0, "step": 110 }, { "entropy": 2.4273681640625, "epoch": 0.0019111076676738721, "grad_norm": 0.6554363369941711, "learning_rate": 1.1e-06, "loss": 2.4002, "mean_token_accuracy": 0.4839071067981422, "num_tokens": 16212132.0, "step": 111 }, { "entropy": 2.4451904296875, "epoch": 0.0019283248538691322, "grad_norm": 0.6245486736297607, "learning_rate": 1.1100000000000002e-06, "loss": 2.4013, "mean_token_accuracy": 0.4785764031112194, "num_tokens": 16346241.0, "step": 112 }, { "entropy": 2.44677734375, "epoch": 0.0019455420400643923, "grad_norm": 0.6356763243675232, "learning_rate": 1.12e-06, "loss": 2.3992, "mean_token_accuracy": 0.4817821686156094, "num_tokens": 16477490.0, "step": 113 }, { "entropy": 2.480224609375, "epoch": 0.001962759226259652, "grad_norm": 0.7369113564491272, "learning_rate": 1.1300000000000002e-06, "loss": 2.4604, "mean_token_accuracy": 0.4817192433401942, "num_tokens": 16635712.0, "step": 114 }, { "entropy": 2.446533203125, "epoch": 0.0019799764124549123, "grad_norm": 0.5834375023841858, "learning_rate": 1.14e-06, "loss": 2.3876, "mean_token_accuracy": 0.48327695531770587, "num_tokens": 16790029.0, "step": 115 }, { "entropy": 2.43994140625, "epoch": 0.001997193598650173, "grad_norm": 0.589958906173706, "learning_rate": 1.1500000000000002e-06, "loss": 2.3869, "mean_token_accuracy": 0.4807235752232373, "num_tokens": 16939941.0, "step": 116 }, { "entropy": 2.4541015625, "epoch": 0.002014410784845433, "grad_norm": 0.5870490074157715, "learning_rate": 1.1600000000000001e-06, "loss": 2.3979, "mean_token_accuracy": 0.47932128235697746, "num_tokens": 17090653.0, "step": 117 }, { "entropy": 2.4052734375, "epoch": 0.002031627971040693, "grad_norm": 0.6636145710945129, "learning_rate": 1.1700000000000002e-06, "loss": 2.3499, "mean_token_accuracy": 0.48607511818408966, "num_tokens": 17225388.0, "step": 118 }, { "entropy": 2.4976806640625, "epoch": 0.002048845157235953, "grad_norm": 0.8917343020439148, "learning_rate": 1.1800000000000001e-06, "loss": 2.4271, "mean_token_accuracy": 0.47680498752743006, "num_tokens": 17365012.0, "step": 119 }, { "entropy": 2.4691162109375, "epoch": 0.002066062343431213, "grad_norm": 0.623699963092804, "learning_rate": 1.19e-06, "loss": 2.4145, "mean_token_accuracy": 0.4758805222809315, "num_tokens": 17515672.0, "step": 120 }, { "entropy": 2.508544921875, "epoch": 0.0020832795296264733, "grad_norm": 0.5833603143692017, "learning_rate": 1.2000000000000002e-06, "loss": 2.4093, "mean_token_accuracy": 0.48172463616356254, "num_tokens": 17664187.0, "step": 121 }, { "entropy": 2.4619140625, "epoch": 0.0021004967158217334, "grad_norm": 0.6505944728851318, "learning_rate": 1.21e-06, "loss": 2.4608, "mean_token_accuracy": 0.47443893272429705, "num_tokens": 17806861.0, "step": 122 }, { "entropy": 2.5152587890625, "epoch": 0.0021177139020169935, "grad_norm": 0.6009451150894165, "learning_rate": 1.2200000000000002e-06, "loss": 2.4738, "mean_token_accuracy": 0.4736114493571222, "num_tokens": 17949930.0, "step": 123 }, { "entropy": 2.415771484375, "epoch": 0.0021349310882122535, "grad_norm": 0.5856963992118835, "learning_rate": 1.23e-06, "loss": 2.3834, "mean_token_accuracy": 0.4865727727301419, "num_tokens": 18096555.0, "step": 124 }, { "entropy": 2.3616943359375, "epoch": 0.0021521482744075136, "grad_norm": 0.5739309787750244, "learning_rate": 1.2400000000000002e-06, "loss": 2.3385, "mean_token_accuracy": 0.49280355405062437, "num_tokens": 18249345.0, "step": 125 }, { "entropy": 2.4732666015625, "epoch": 0.0021693654606027737, "grad_norm": 0.6043084859848022, "learning_rate": 1.25e-06, "loss": 2.416, "mean_token_accuracy": 0.47766703460365534, "num_tokens": 18385525.0, "step": 126 }, { "entropy": 2.5040283203125, "epoch": 0.002186582646798034, "grad_norm": 0.6954610347747803, "learning_rate": 1.26e-06, "loss": 2.4997, "mean_token_accuracy": 0.4837381485849619, "num_tokens": 18522666.0, "step": 127 }, { "entropy": 2.45166015625, "epoch": 0.002203799832993294, "grad_norm": 0.5906988382339478, "learning_rate": 1.2700000000000001e-06, "loss": 2.4099, "mean_token_accuracy": 0.47865421138703823, "num_tokens": 18662132.0, "step": 128 }, { "entropy": 2.395263671875, "epoch": 0.002221017019188554, "grad_norm": 0.5896300077438354, "learning_rate": 1.28e-06, "loss": 2.3787, "mean_token_accuracy": 0.49431150034070015, "num_tokens": 18803105.0, "step": 129 }, { "entropy": 2.435546875, "epoch": 0.002238234205383814, "grad_norm": 0.618240475654602, "learning_rate": 1.2900000000000001e-06, "loss": 2.3837, "mean_token_accuracy": 0.48449931014329195, "num_tokens": 18942086.0, "step": 130 }, { "entropy": 2.387451171875, "epoch": 0.002255451391579074, "grad_norm": 0.5633107423782349, "learning_rate": 1.3e-06, "loss": 2.3315, "mean_token_accuracy": 0.4937907229177654, "num_tokens": 19097977.0, "step": 131 }, { "entropy": 2.431640625, "epoch": 0.0022726685777743343, "grad_norm": 0.5887622833251953, "learning_rate": 1.3100000000000002e-06, "loss": 2.381, "mean_token_accuracy": 0.48834379855543375, "num_tokens": 19242582.0, "step": 132 }, { "entropy": 2.476806640625, "epoch": 0.0022898857639695944, "grad_norm": 0.633418083190918, "learning_rate": 1.32e-06, "loss": 2.4313, "mean_token_accuracy": 0.4792938116006553, "num_tokens": 19379757.0, "step": 133 }, { "entropy": 2.42333984375, "epoch": 0.0023071029501648545, "grad_norm": 0.6043598651885986, "learning_rate": 1.3300000000000002e-06, "loss": 2.343, "mean_token_accuracy": 0.4908184530213475, "num_tokens": 19524622.0, "step": 134 }, { "entropy": 2.46484375, "epoch": 0.0023243201363601146, "grad_norm": 0.6375739574432373, "learning_rate": 1.34e-06, "loss": 2.4083, "mean_token_accuracy": 0.4828670499846339, "num_tokens": 19663378.0, "step": 135 }, { "entropy": 2.428955078125, "epoch": 0.0023415373225553746, "grad_norm": 0.5688341856002808, "learning_rate": 1.3500000000000002e-06, "loss": 2.3791, "mean_token_accuracy": 0.48436517268419266, "num_tokens": 19816124.0, "step": 136 }, { "entropy": 2.4317626953125, "epoch": 0.0023587545087506347, "grad_norm": 0.5994829535484314, "learning_rate": 1.3600000000000001e-06, "loss": 2.3781, "mean_token_accuracy": 0.4828225467354059, "num_tokens": 19958536.0, "step": 137 }, { "entropy": 2.4664306640625, "epoch": 0.002375971694945895, "grad_norm": 0.5764389634132385, "learning_rate": 1.3700000000000002e-06, "loss": 2.4281, "mean_token_accuracy": 0.47715017944574356, "num_tokens": 20101195.0, "step": 138 }, { "entropy": 2.515869140625, "epoch": 0.002393188881141155, "grad_norm": 0.5928436517715454, "learning_rate": 1.3800000000000001e-06, "loss": 2.4664, "mean_token_accuracy": 0.4714601272717118, "num_tokens": 20250847.0, "step": 139 }, { "entropy": 2.47509765625, "epoch": 0.002410406067336415, "grad_norm": 0.7260881662368774, "learning_rate": 1.3900000000000002e-06, "loss": 2.4494, "mean_token_accuracy": 0.47532156156376004, "num_tokens": 20391934.0, "step": 140 }, { "entropy": 2.42236328125, "epoch": 0.002427623253531675, "grad_norm": 0.5777806043624878, "learning_rate": 1.4000000000000001e-06, "loss": 2.3889, "mean_token_accuracy": 0.4810917223803699, "num_tokens": 20542473.0, "step": 141 }, { "entropy": 2.426025390625, "epoch": 0.0024448404397269356, "grad_norm": 0.5795040130615234, "learning_rate": 1.41e-06, "loss": 2.357, "mean_token_accuracy": 0.4965799758210778, "num_tokens": 20692496.0, "step": 142 }, { "entropy": 2.446044921875, "epoch": 0.0024620576259221957, "grad_norm": 0.584563672542572, "learning_rate": 1.42e-06, "loss": 2.3614, "mean_token_accuracy": 0.4806175325065851, "num_tokens": 20829269.0, "step": 143 }, { "entropy": 2.431640625, "epoch": 0.002479274812117456, "grad_norm": 0.5803564190864563, "learning_rate": 1.43e-06, "loss": 2.3908, "mean_token_accuracy": 0.4852483980357647, "num_tokens": 20976041.0, "step": 144 }, { "entropy": 2.4169921875, "epoch": 0.002496491998312716, "grad_norm": 0.5736103057861328, "learning_rate": 1.44e-06, "loss": 2.407, "mean_token_accuracy": 0.48149813804775476, "num_tokens": 21123880.0, "step": 145 }, { "entropy": 2.501708984375, "epoch": 0.002513709184507976, "grad_norm": 0.5667904615402222, "learning_rate": 1.45e-06, "loss": 2.4569, "mean_token_accuracy": 0.470580879598856, "num_tokens": 21267685.0, "step": 146 }, { "entropy": 2.431396484375, "epoch": 0.002530926370703236, "grad_norm": 0.6545902490615845, "learning_rate": 1.46e-06, "loss": 2.368, "mean_token_accuracy": 0.4863086869008839, "num_tokens": 21415641.0, "step": 147 }, { "entropy": 2.4876708984375, "epoch": 0.002548143556898496, "grad_norm": 0.6045504808425903, "learning_rate": 1.4700000000000001e-06, "loss": 2.4502, "mean_token_accuracy": 0.47358084423467517, "num_tokens": 21568579.0, "step": 148 }, { "entropy": 2.4735107421875, "epoch": 0.0025653607430937563, "grad_norm": 0.5392025113105774, "learning_rate": 1.48e-06, "loss": 2.3939, "mean_token_accuracy": 0.47306955326348543, "num_tokens": 21725147.0, "step": 149 }, { "entropy": 2.38037109375, "epoch": 0.0025825779292890164, "grad_norm": 0.6125035285949707, "learning_rate": 1.4900000000000001e-06, "loss": 2.3062, "mean_token_accuracy": 0.49953836342319846, "num_tokens": 21861243.0, "step": 150 }, { "entropy": 2.40234375, "epoch": 0.0025997951154842765, "grad_norm": 0.6005491614341736, "learning_rate": 1.5e-06, "loss": 2.3803, "mean_token_accuracy": 0.483843975700438, "num_tokens": 21996041.0, "step": 151 }, { "entropy": 2.51171875, "epoch": 0.0026170123016795365, "grad_norm": 0.6047178506851196, "learning_rate": 1.5100000000000002e-06, "loss": 2.4551, "mean_token_accuracy": 0.4724902934394777, "num_tokens": 22124115.0, "step": 152 }, { "entropy": 2.4654541015625, "epoch": 0.0026342294878747966, "grad_norm": 0.5984538197517395, "learning_rate": 1.52e-06, "loss": 2.4066, "mean_token_accuracy": 0.47982197999954224, "num_tokens": 22268397.0, "step": 153 }, { "entropy": 2.5594482421875, "epoch": 0.0026514466740700567, "grad_norm": 0.5999759435653687, "learning_rate": 1.5300000000000002e-06, "loss": 2.5766, "mean_token_accuracy": 0.46560019347816706, "num_tokens": 22417806.0, "step": 154 }, { "entropy": 2.468017578125, "epoch": 0.002668663860265317, "grad_norm": 0.5409292578697205, "learning_rate": 1.54e-06, "loss": 2.4253, "mean_token_accuracy": 0.471966958604753, "num_tokens": 22581097.0, "step": 155 }, { "entropy": 2.4951171875, "epoch": 0.002685881046460577, "grad_norm": 0.5459778904914856, "learning_rate": 1.5500000000000002e-06, "loss": 2.4459, "mean_token_accuracy": 0.47298973286524415, "num_tokens": 22735616.0, "step": 156 }, { "entropy": 2.4017333984375, "epoch": 0.002703098232655837, "grad_norm": 0.5974356532096863, "learning_rate": 1.56e-06, "loss": 2.3332, "mean_token_accuracy": 0.49170317640528083, "num_tokens": 22876476.0, "step": 157 }, { "entropy": 2.447021484375, "epoch": 0.002720315418851097, "grad_norm": 0.6058854460716248, "learning_rate": 1.5700000000000002e-06, "loss": 2.3759, "mean_token_accuracy": 0.48308438109233975, "num_tokens": 23000697.0, "step": 158 }, { "entropy": 2.4227294921875, "epoch": 0.002737532605046357, "grad_norm": 0.6649252772331238, "learning_rate": 1.5800000000000001e-06, "loss": 2.3364, "mean_token_accuracy": 0.49115608306601644, "num_tokens": 23128424.0, "step": 159 }, { "entropy": 2.459716796875, "epoch": 0.0027547497912416173, "grad_norm": 0.571075439453125, "learning_rate": 1.5900000000000002e-06, "loss": 2.4113, "mean_token_accuracy": 0.4818022232502699, "num_tokens": 23269190.0, "step": 160 }, { "entropy": 2.45654296875, "epoch": 0.0027719669774368774, "grad_norm": 0.5999649167060852, "learning_rate": 1.6000000000000001e-06, "loss": 2.4138, "mean_token_accuracy": 0.4814398717135191, "num_tokens": 23423206.0, "step": 161 }, { "entropy": 2.4156494140625, "epoch": 0.0027891841636321375, "grad_norm": 0.5621162056922913, "learning_rate": 1.6100000000000003e-06, "loss": 2.3751, "mean_token_accuracy": 0.48285360960289836, "num_tokens": 23572561.0, "step": 162 }, { "entropy": 2.3802490234375, "epoch": 0.0028064013498273976, "grad_norm": 0.6032527089118958, "learning_rate": 1.6200000000000002e-06, "loss": 2.3422, "mean_token_accuracy": 0.4902229546569288, "num_tokens": 23718051.0, "step": 163 }, { "entropy": 2.370849609375, "epoch": 0.0028236185360226576, "grad_norm": 0.55767822265625, "learning_rate": 1.6300000000000003e-06, "loss": 2.3473, "mean_token_accuracy": 0.49248579889535904, "num_tokens": 23877563.0, "step": 164 }, { "entropy": 2.5054931640625, "epoch": 0.0028408357222179177, "grad_norm": 0.5944464206695557, "learning_rate": 1.6400000000000002e-06, "loss": 2.4374, "mean_token_accuracy": 0.47586293099448085, "num_tokens": 24013173.0, "step": 165 }, { "entropy": 2.4613037109375, "epoch": 0.002858052908413178, "grad_norm": 0.6158614754676819, "learning_rate": 1.6500000000000003e-06, "loss": 2.4078, "mean_token_accuracy": 0.48460292909294367, "num_tokens": 24150366.0, "step": 166 }, { "entropy": 2.4307861328125, "epoch": 0.0028752700946084384, "grad_norm": 0.562500536441803, "learning_rate": 1.6600000000000002e-06, "loss": 2.3867, "mean_token_accuracy": 0.4834507182240486, "num_tokens": 24300680.0, "step": 167 }, { "entropy": 2.4783935546875, "epoch": 0.0028924872808036984, "grad_norm": 0.5674195289611816, "learning_rate": 1.6700000000000003e-06, "loss": 2.4497, "mean_token_accuracy": 0.47510121995583177, "num_tokens": 24445534.0, "step": 168 }, { "entropy": 2.4825439453125, "epoch": 0.0029097044669989585, "grad_norm": 0.6282191276550293, "learning_rate": 1.6800000000000002e-06, "loss": 2.4444, "mean_token_accuracy": 0.4739877316169441, "num_tokens": 24568649.0, "step": 169 }, { "entropy": 2.4176025390625, "epoch": 0.0029269216531942186, "grad_norm": 0.6034136414527893, "learning_rate": 1.6900000000000003e-06, "loss": 2.3804, "mean_token_accuracy": 0.48952830489724874, "num_tokens": 24710505.0, "step": 170 }, { "entropy": 2.441650390625, "epoch": 0.0029441388393894787, "grad_norm": 0.579363226890564, "learning_rate": 1.7000000000000002e-06, "loss": 2.3715, "mean_token_accuracy": 0.4852260472252965, "num_tokens": 24860551.0, "step": 171 }, { "entropy": 2.4251708984375, "epoch": 0.002961356025584739, "grad_norm": 0.6410456895828247, "learning_rate": 1.7100000000000004e-06, "loss": 2.38, "mean_token_accuracy": 0.4843108947388828, "num_tokens": 24997729.0, "step": 172 }, { "entropy": 2.430419921875, "epoch": 0.002978573211779999, "grad_norm": 0.5755802392959595, "learning_rate": 1.72e-06, "loss": 2.3886, "mean_token_accuracy": 0.4809368369169533, "num_tokens": 25140050.0, "step": 173 }, { "entropy": 2.485595703125, "epoch": 0.002995790397975259, "grad_norm": 0.606526792049408, "learning_rate": 1.73e-06, "loss": 2.4497, "mean_token_accuracy": 0.48244015080854297, "num_tokens": 25280791.0, "step": 174 }, { "entropy": 2.48046875, "epoch": 0.003013007584170519, "grad_norm": 0.6201883554458618, "learning_rate": 1.74e-06, "loss": 2.4686, "mean_token_accuracy": 0.4720367449335754, "num_tokens": 25429327.0, "step": 175 }, { "entropy": 2.40380859375, "epoch": 0.003030224770365779, "grad_norm": 0.6322990655899048, "learning_rate": 1.75e-06, "loss": 2.3775, "mean_token_accuracy": 0.48563892720267177, "num_tokens": 25556902.0, "step": 176 }, { "entropy": 2.4178466796875, "epoch": 0.0030474419565610393, "grad_norm": 0.5754444003105164, "learning_rate": 1.76e-06, "loss": 2.3881, "mean_token_accuracy": 0.4864646405912936, "num_tokens": 25706350.0, "step": 177 }, { "entropy": 2.4537353515625, "epoch": 0.0030646591427562994, "grad_norm": 0.5913891792297363, "learning_rate": 1.77e-06, "loss": 2.4168, "mean_token_accuracy": 0.47908906172960997, "num_tokens": 25845686.0, "step": 178 }, { "entropy": 2.442138671875, "epoch": 0.0030818763289515595, "grad_norm": 0.5810282826423645, "learning_rate": 1.7800000000000001e-06, "loss": 2.3687, "mean_token_accuracy": 0.48843948962166905, "num_tokens": 25991169.0, "step": 179 }, { "entropy": 2.430908203125, "epoch": 0.0030990935151468195, "grad_norm": 0.59124356508255, "learning_rate": 1.79e-06, "loss": 2.345, "mean_token_accuracy": 0.49094062810763717, "num_tokens": 26137613.0, "step": 180 }, { "entropy": 2.43798828125, "epoch": 0.0031163107013420796, "grad_norm": 0.5589233636856079, "learning_rate": 1.8000000000000001e-06, "loss": 2.3647, "mean_token_accuracy": 0.47995236283168197, "num_tokens": 26304122.0, "step": 181 }, { "entropy": 2.43017578125, "epoch": 0.0031335278875373397, "grad_norm": 0.6091126203536987, "learning_rate": 1.81e-06, "loss": 2.3233, "mean_token_accuracy": 0.49118568608537316, "num_tokens": 26439950.0, "step": 182 }, { "entropy": 2.39697265625, "epoch": 0.0031507450737326, "grad_norm": 0.6027998328208923, "learning_rate": 1.8200000000000002e-06, "loss": 2.332, "mean_token_accuracy": 0.4913685843348503, "num_tokens": 26579726.0, "step": 183 }, { "entropy": 2.404296875, "epoch": 0.00316796225992786, "grad_norm": 0.5764933228492737, "learning_rate": 1.83e-06, "loss": 2.3598, "mean_token_accuracy": 0.48573345225304365, "num_tokens": 26724853.0, "step": 184 }, { "entropy": 2.4344482421875, "epoch": 0.00318517944612312, "grad_norm": 0.5961573123931885, "learning_rate": 1.8400000000000002e-06, "loss": 2.3914, "mean_token_accuracy": 0.48558472096920013, "num_tokens": 26863674.0, "step": 185 }, { "entropy": 2.446044921875, "epoch": 0.00320239663231838, "grad_norm": 0.570022702217102, "learning_rate": 1.85e-06, "loss": 2.4299, "mean_token_accuracy": 0.48616287065669894, "num_tokens": 27016137.0, "step": 186 }, { "entropy": 2.4267578125, "epoch": 0.00321961381851364, "grad_norm": 0.5620612502098083, "learning_rate": 1.8600000000000002e-06, "loss": 2.3965, "mean_token_accuracy": 0.4951572152785957, "num_tokens": 27168668.0, "step": 187 }, { "entropy": 2.5091552734375, "epoch": 0.0032368310047089003, "grad_norm": 0.5910755395889282, "learning_rate": 1.87e-06, "loss": 2.4964, "mean_token_accuracy": 0.47218859009444714, "num_tokens": 27303758.0, "step": 188 }, { "entropy": 2.41259765625, "epoch": 0.0032540481909041604, "grad_norm": 0.5817851424217224, "learning_rate": 1.8800000000000002e-06, "loss": 2.3692, "mean_token_accuracy": 0.48497994616627693, "num_tokens": 27442544.0, "step": 189 }, { "entropy": 2.4501953125, "epoch": 0.0032712653770994205, "grad_norm": 0.565247654914856, "learning_rate": 1.8900000000000001e-06, "loss": 2.4424, "mean_token_accuracy": 0.4795730533078313, "num_tokens": 27596457.0, "step": 190 }, { "entropy": 2.4776611328125, "epoch": 0.0032884825632946806, "grad_norm": 0.5709455609321594, "learning_rate": 1.9000000000000002e-06, "loss": 2.4314, "mean_token_accuracy": 0.47365658916532993, "num_tokens": 27743357.0, "step": 191 }, { "entropy": 2.5281982421875, "epoch": 0.0033056997494899406, "grad_norm": 0.629690945148468, "learning_rate": 1.9100000000000003e-06, "loss": 2.4736, "mean_token_accuracy": 0.47410575672984123, "num_tokens": 27876803.0, "step": 192 }, { "entropy": 2.4635009765625, "epoch": 0.003322916935685201, "grad_norm": 0.637241780757904, "learning_rate": 1.9200000000000003e-06, "loss": 2.4301, "mean_token_accuracy": 0.48486198624596, "num_tokens": 28028469.0, "step": 193 }, { "entropy": 2.486083984375, "epoch": 0.0033401341218804613, "grad_norm": 0.5691868662834167, "learning_rate": 1.93e-06, "loss": 2.437, "mean_token_accuracy": 0.4730471963994205, "num_tokens": 28169802.0, "step": 194 }, { "entropy": 2.43017578125, "epoch": 0.0033573513080757214, "grad_norm": 0.6118927597999573, "learning_rate": 1.94e-06, "loss": 2.3835, "mean_token_accuracy": 0.4860619972459972, "num_tokens": 28299767.0, "step": 195 }, { "entropy": 2.4813232421875, "epoch": 0.0033745684942709814, "grad_norm": 0.564987063407898, "learning_rate": 1.9500000000000004e-06, "loss": 2.4519, "mean_token_accuracy": 0.4765043603256345, "num_tokens": 28442455.0, "step": 196 }, { "entropy": 2.4971923828125, "epoch": 0.0033917856804662415, "grad_norm": 0.5519808530807495, "learning_rate": 1.9600000000000003e-06, "loss": 2.4937, "mean_token_accuracy": 0.4625990390777588, "num_tokens": 28585079.0, "step": 197 }, { "entropy": 2.464599609375, "epoch": 0.0034090028666615016, "grad_norm": 0.6350486874580383, "learning_rate": 1.97e-06, "loss": 2.4561, "mean_token_accuracy": 0.4764754744246602, "num_tokens": 28722871.0, "step": 198 }, { "entropy": 2.4639892578125, "epoch": 0.0034262200528567617, "grad_norm": 0.5707213878631592, "learning_rate": 1.98e-06, "loss": 2.4378, "mean_token_accuracy": 0.48255998734384775, "num_tokens": 28868682.0, "step": 199 }, { "entropy": 2.415771484375, "epoch": 0.003443437239052022, "grad_norm": 0.593158483505249, "learning_rate": 1.9900000000000004e-06, "loss": 2.3004, "mean_token_accuracy": 0.49145969236269593, "num_tokens": 29016713.0, "step": 200 }, { "entropy": 2.462890625, "epoch": 0.003460654425247282, "grad_norm": 0.6505548357963562, "learning_rate": 2.0000000000000003e-06, "loss": 2.3531, "mean_token_accuracy": 0.48328409856185317, "num_tokens": 29143923.0, "step": 201 }, { "entropy": 2.3577880859375, "epoch": 0.003477871611442542, "grad_norm": 0.6153655052185059, "learning_rate": 2.0100000000000002e-06, "loss": 2.3246, "mean_token_accuracy": 0.4954985845834017, "num_tokens": 29285239.0, "step": 202 }, { "entropy": 2.51025390625, "epoch": 0.003495088797637802, "grad_norm": 0.5901440978050232, "learning_rate": 2.02e-06, "loss": 2.4444, "mean_token_accuracy": 0.47322959266602993, "num_tokens": 29423789.0, "step": 203 }, { "entropy": 2.473388671875, "epoch": 0.003512305983833062, "grad_norm": 0.6093935370445251, "learning_rate": 2.0300000000000005e-06, "loss": 2.4143, "mean_token_accuracy": 0.4813940548337996, "num_tokens": 29571521.0, "step": 204 }, { "entropy": 2.423828125, "epoch": 0.0035295231700283223, "grad_norm": 0.5486546754837036, "learning_rate": 2.04e-06, "loss": 2.3655, "mean_token_accuracy": 0.4821597971022129, "num_tokens": 29726314.0, "step": 205 }, { "entropy": 2.4498291015625, "epoch": 0.0035467403562235824, "grad_norm": 0.57234126329422, "learning_rate": 2.05e-06, "loss": 2.425, "mean_token_accuracy": 0.47626253589987755, "num_tokens": 29880000.0, "step": 206 }, { "entropy": 2.416015625, "epoch": 0.0035639575424188425, "grad_norm": 0.561396062374115, "learning_rate": 2.06e-06, "loss": 2.3439, "mean_token_accuracy": 0.4864202947355807, "num_tokens": 30027628.0, "step": 207 }, { "entropy": 2.4002685546875, "epoch": 0.0035811747286141025, "grad_norm": 0.569572389125824, "learning_rate": 2.07e-06, "loss": 2.3673, "mean_token_accuracy": 0.4875691821798682, "num_tokens": 30169974.0, "step": 208 }, { "entropy": 2.455078125, "epoch": 0.0035983919148093626, "grad_norm": 0.5601251721382141, "learning_rate": 2.08e-06, "loss": 2.4015, "mean_token_accuracy": 0.4847674574702978, "num_tokens": 30317198.0, "step": 209 }, { "entropy": 2.53271484375, "epoch": 0.0036156091010046227, "grad_norm": 0.5616832375526428, "learning_rate": 2.09e-06, "loss": 2.4877, "mean_token_accuracy": 0.47258848743513227, "num_tokens": 30460614.0, "step": 210 }, { "entropy": 2.454345703125, "epoch": 0.003632826287199883, "grad_norm": 0.5962870717048645, "learning_rate": 2.1000000000000002e-06, "loss": 2.4077, "mean_token_accuracy": 0.47984306002035737, "num_tokens": 30595270.0, "step": 211 }, { "entropy": 2.414306640625, "epoch": 0.003650043473395143, "grad_norm": 0.5458055734634399, "learning_rate": 2.11e-06, "loss": 2.349, "mean_token_accuracy": 0.48367989249527454, "num_tokens": 30754164.0, "step": 212 }, { "entropy": 2.3519287109375, "epoch": 0.003667260659590403, "grad_norm": 0.6218618154525757, "learning_rate": 2.12e-06, "loss": 2.2976, "mean_token_accuracy": 0.49590098252519965, "num_tokens": 30910624.0, "step": 213 }, { "entropy": 2.4219970703125, "epoch": 0.003684477845785663, "grad_norm": 0.6083913445472717, "learning_rate": 2.13e-06, "loss": 2.3687, "mean_token_accuracy": 0.4871506607159972, "num_tokens": 31045412.0, "step": 214 }, { "entropy": 2.3848876953125, "epoch": 0.003701695031980923, "grad_norm": 0.5669545531272888, "learning_rate": 2.1400000000000003e-06, "loss": 2.3481, "mean_token_accuracy": 0.4880279768258333, "num_tokens": 31198584.0, "step": 215 }, { "entropy": 2.497314453125, "epoch": 0.0037189122181761833, "grad_norm": 0.5737752318382263, "learning_rate": 2.15e-06, "loss": 2.4598, "mean_token_accuracy": 0.47354970779269934, "num_tokens": 31341345.0, "step": 216 }, { "entropy": 2.4847412109375, "epoch": 0.0037361294043714434, "grad_norm": 0.5978350639343262, "learning_rate": 2.16e-06, "loss": 2.4907, "mean_token_accuracy": 0.4729965156875551, "num_tokens": 31503528.0, "step": 217 }, { "entropy": 2.4783935546875, "epoch": 0.003753346590566704, "grad_norm": 0.5699813961982727, "learning_rate": 2.17e-06, "loss": 2.3885, "mean_token_accuracy": 0.4826196124777198, "num_tokens": 31641779.0, "step": 218 }, { "entropy": 2.436279296875, "epoch": 0.003770563776761964, "grad_norm": 0.5500178337097168, "learning_rate": 2.1800000000000003e-06, "loss": 2.4012, "mean_token_accuracy": 0.4823254165239632, "num_tokens": 31811688.0, "step": 219 }, { "entropy": 2.431884765625, "epoch": 0.003787780962957224, "grad_norm": 0.5497094392776489, "learning_rate": 2.19e-06, "loss": 2.4188, "mean_token_accuracy": 0.48219793336465955, "num_tokens": 31964463.0, "step": 220 }, { "entropy": 2.4559326171875, "epoch": 0.003804998149152484, "grad_norm": 0.5953599810600281, "learning_rate": 2.2e-06, "loss": 2.4074, "mean_token_accuracy": 0.4810604937374592, "num_tokens": 32106174.0, "step": 221 }, { "entropy": 2.4549560546875, "epoch": 0.0038222153353477443, "grad_norm": 0.5802156329154968, "learning_rate": 2.21e-06, "loss": 2.401, "mean_token_accuracy": 0.4832776212133467, "num_tokens": 32240256.0, "step": 222 }, { "entropy": 2.5172119140625, "epoch": 0.0038394325215430044, "grad_norm": 0.7379570007324219, "learning_rate": 2.2200000000000003e-06, "loss": 2.4759, "mean_token_accuracy": 0.47404406825080514, "num_tokens": 32384376.0, "step": 223 }, { "entropy": 2.4268798828125, "epoch": 0.0038566497077382644, "grad_norm": 0.535229504108429, "learning_rate": 2.2300000000000002e-06, "loss": 2.3825, "mean_token_accuracy": 0.48500062711536884, "num_tokens": 32547276.0, "step": 224 }, { "entropy": 2.4552001953125, "epoch": 0.0038738668939335245, "grad_norm": 0.5618354082107544, "learning_rate": 2.24e-06, "loss": 2.4164, "mean_token_accuracy": 0.4798359959386289, "num_tokens": 32695574.0, "step": 225 }, { "entropy": 2.491455078125, "epoch": 0.0038910840801287846, "grad_norm": 0.5642921924591064, "learning_rate": 2.25e-06, "loss": 2.4474, "mean_token_accuracy": 0.4762350879609585, "num_tokens": 32836847.0, "step": 226 }, { "entropy": 2.415283203125, "epoch": 0.003908301266324045, "grad_norm": 0.614703893661499, "learning_rate": 2.2600000000000004e-06, "loss": 2.3591, "mean_token_accuracy": 0.48611282790079713, "num_tokens": 32970328.0, "step": 227 }, { "entropy": 2.4599609375, "epoch": 0.003925518452519304, "grad_norm": 0.5725888013839722, "learning_rate": 2.2700000000000003e-06, "loss": 2.4113, "mean_token_accuracy": 0.4802963142283261, "num_tokens": 33112724.0, "step": 228 }, { "entropy": 2.4117431640625, "epoch": 0.003942735638714565, "grad_norm": 0.6599273681640625, "learning_rate": 2.28e-06, "loss": 2.3471, "mean_token_accuracy": 0.4909554719924927, "num_tokens": 33250365.0, "step": 229 }, { "entropy": 2.417236328125, "epoch": 0.0039599528249098246, "grad_norm": 0.5717114806175232, "learning_rate": 2.29e-06, "loss": 2.3673, "mean_token_accuracy": 0.4863877324387431, "num_tokens": 33402602.0, "step": 230 }, { "entropy": 2.4541015625, "epoch": 0.003977170011105085, "grad_norm": 0.5801478028297424, "learning_rate": 2.3000000000000004e-06, "loss": 2.4395, "mean_token_accuracy": 0.47902765218168497, "num_tokens": 33544113.0, "step": 231 }, { "entropy": 2.454345703125, "epoch": 0.003994387197300346, "grad_norm": 0.6313830614089966, "learning_rate": 2.3100000000000003e-06, "loss": 2.368, "mean_token_accuracy": 0.48253711173310876, "num_tokens": 33693199.0, "step": 232 }, { "entropy": 2.411376953125, "epoch": 0.004011604383495605, "grad_norm": 0.5773042440414429, "learning_rate": 2.3200000000000002e-06, "loss": 2.3673, "mean_token_accuracy": 0.4857380697503686, "num_tokens": 33834160.0, "step": 233 }, { "entropy": 2.4261474609375, "epoch": 0.004028821569690866, "grad_norm": 0.6097912788391113, "learning_rate": 2.33e-06, "loss": 2.4526, "mean_token_accuracy": 0.4798373435623944, "num_tokens": 33966216.0, "step": 234 }, { "entropy": 2.47802734375, "epoch": 0.0040460387558861255, "grad_norm": 0.5512007474899292, "learning_rate": 2.3400000000000005e-06, "loss": 2.3984, "mean_token_accuracy": 0.47808168828487396, "num_tokens": 34113686.0, "step": 235 }, { "entropy": 2.4539794921875, "epoch": 0.004063255942081386, "grad_norm": 0.5286883115768433, "learning_rate": 2.35e-06, "loss": 2.412, "mean_token_accuracy": 0.48627515137195587, "num_tokens": 34282616.0, "step": 236 }, { "entropy": 2.4539794921875, "epoch": 0.004080473128276646, "grad_norm": 0.564912736415863, "learning_rate": 2.3600000000000003e-06, "loss": 2.3459, "mean_token_accuracy": 0.48682177206501365, "num_tokens": 34436498.0, "step": 237 }, { "entropy": 2.4637451171875, "epoch": 0.004097690314471906, "grad_norm": 0.6081207990646362, "learning_rate": 2.37e-06, "loss": 2.4327, "mean_token_accuracy": 0.4862173437140882, "num_tokens": 34579113.0, "step": 238 }, { "entropy": 2.3974609375, "epoch": 0.004114907500667166, "grad_norm": 0.5634726285934448, "learning_rate": 2.38e-06, "loss": 2.3362, "mean_token_accuracy": 0.49311843886971474, "num_tokens": 34723287.0, "step": 239 }, { "entropy": 2.3797607421875, "epoch": 0.004132124686862426, "grad_norm": 0.5402041673660278, "learning_rate": 2.39e-06, "loss": 2.3051, "mean_token_accuracy": 0.4971098625101149, "num_tokens": 34879330.0, "step": 240 }, { "entropy": 2.4957275390625, "epoch": 0.004149341873057686, "grad_norm": 0.6134316325187683, "learning_rate": 2.4000000000000003e-06, "loss": 2.4642, "mean_token_accuracy": 0.4705090904608369, "num_tokens": 35007660.0, "step": 241 }, { "entropy": 2.5213623046875, "epoch": 0.0041665590592529465, "grad_norm": 0.5660735368728638, "learning_rate": 2.4100000000000002e-06, "loss": 2.503, "mean_token_accuracy": 0.4669585754163563, "num_tokens": 35155284.0, "step": 242 }, { "entropy": 2.4287109375, "epoch": 0.004183776245448206, "grad_norm": 0.5641495585441589, "learning_rate": 2.42e-06, "loss": 2.411, "mean_token_accuracy": 0.4803205137141049, "num_tokens": 35306435.0, "step": 243 }, { "entropy": 2.47705078125, "epoch": 0.004200993431643467, "grad_norm": 0.5801253914833069, "learning_rate": 2.43e-06, "loss": 2.4781, "mean_token_accuracy": 0.47467044508084655, "num_tokens": 35458606.0, "step": 244 }, { "entropy": 2.3427734375, "epoch": 0.004218210617838726, "grad_norm": 0.5665391683578491, "learning_rate": 2.4400000000000004e-06, "loss": 2.2603, "mean_token_accuracy": 0.5001557641662657, "num_tokens": 35599151.0, "step": 245 }, { "entropy": 2.4097900390625, "epoch": 0.004235427804033987, "grad_norm": 0.5967923998832703, "learning_rate": 2.4500000000000003e-06, "loss": 2.3661, "mean_token_accuracy": 0.4944393504410982, "num_tokens": 35753307.0, "step": 246 }, { "entropy": 2.3837890625, "epoch": 0.0042526449902292466, "grad_norm": 0.5936510562896729, "learning_rate": 2.46e-06, "loss": 2.3321, "mean_token_accuracy": 0.49646031484007835, "num_tokens": 35892908.0, "step": 247 }, { "entropy": 2.4852294921875, "epoch": 0.004269862176424507, "grad_norm": 0.5804598927497864, "learning_rate": 2.47e-06, "loss": 2.3769, "mean_token_accuracy": 0.47953970776870847, "num_tokens": 36042857.0, "step": 248 }, { "entropy": 2.4976806640625, "epoch": 0.004287079362619767, "grad_norm": 0.6185864806175232, "learning_rate": 2.4800000000000004e-06, "loss": 2.4387, "mean_token_accuracy": 0.4731791294179857, "num_tokens": 36194862.0, "step": 249 }, { "entropy": 2.4356689453125, "epoch": 0.004304296548815027, "grad_norm": 0.6290936470031738, "learning_rate": 2.4900000000000003e-06, "loss": 2.3616, "mean_token_accuracy": 0.4869615500792861, "num_tokens": 36347599.0, "step": 250 }, { "entropy": 2.4132080078125, "epoch": 0.004321513735010287, "grad_norm": 0.6308562755584717, "learning_rate": 2.5e-06, "loss": 2.355, "mean_token_accuracy": 0.48826425010338426, "num_tokens": 36506643.0, "step": 251 }, { "entropy": 2.4609375, "epoch": 0.0043387309212055475, "grad_norm": 0.5634401440620422, "learning_rate": 2.51e-06, "loss": 2.4154, "mean_token_accuracy": 0.4824108784087002, "num_tokens": 36652239.0, "step": 252 }, { "entropy": 2.360595703125, "epoch": 0.004355948107400807, "grad_norm": 0.574517548084259, "learning_rate": 2.52e-06, "loss": 2.3125, "mean_token_accuracy": 0.49683515494689345, "num_tokens": 36803342.0, "step": 253 }, { "entropy": 2.440185546875, "epoch": 0.004373165293596068, "grad_norm": 0.6065932512283325, "learning_rate": 2.5300000000000003e-06, "loss": 2.4206, "mean_token_accuracy": 0.48550984309986234, "num_tokens": 36942721.0, "step": 254 }, { "entropy": 2.485107421875, "epoch": 0.004390382479791327, "grad_norm": 0.8562254905700684, "learning_rate": 2.5400000000000002e-06, "loss": 2.4961, "mean_token_accuracy": 0.47687816014513373, "num_tokens": 37068241.0, "step": 255 }, { "entropy": 2.3944091796875, "epoch": 0.004407599665986588, "grad_norm": 0.547681987285614, "learning_rate": 2.55e-06, "loss": 2.3469, "mean_token_accuracy": 0.49269043607637286, "num_tokens": 37221499.0, "step": 256 }, { "entropy": 2.4814453125, "epoch": 0.004424816852181848, "grad_norm": 0.5839248299598694, "learning_rate": 2.56e-06, "loss": 2.4229, "mean_token_accuracy": 0.47095031198114157, "num_tokens": 37373368.0, "step": 257 }, { "entropy": 2.354736328125, "epoch": 0.004442034038377108, "grad_norm": 0.587020993232727, "learning_rate": 2.5700000000000004e-06, "loss": 2.3414, "mean_token_accuracy": 0.4924391624517739, "num_tokens": 37518300.0, "step": 258 }, { "entropy": 2.37255859375, "epoch": 0.0044592512245723685, "grad_norm": 0.5921294093132019, "learning_rate": 2.5800000000000003e-06, "loss": 2.3288, "mean_token_accuracy": 0.4891428491100669, "num_tokens": 37668963.0, "step": 259 }, { "entropy": 2.461181640625, "epoch": 0.004476468410767628, "grad_norm": 0.6068063378334045, "learning_rate": 2.59e-06, "loss": 2.4455, "mean_token_accuracy": 0.47976093366742134, "num_tokens": 37805995.0, "step": 260 }, { "entropy": 2.4947509765625, "epoch": 0.004493685596962889, "grad_norm": 0.6007203459739685, "learning_rate": 2.6e-06, "loss": 2.4731, "mean_token_accuracy": 0.4695629784837365, "num_tokens": 37953374.0, "step": 261 }, { "entropy": 2.4515380859375, "epoch": 0.004510902783158148, "grad_norm": 0.6087055206298828, "learning_rate": 2.6100000000000004e-06, "loss": 2.3859, "mean_token_accuracy": 0.48078236635774374, "num_tokens": 38090268.0, "step": 262 }, { "entropy": 2.405029296875, "epoch": 0.004528119969353409, "grad_norm": 0.6398684978485107, "learning_rate": 2.6200000000000003e-06, "loss": 2.3682, "mean_token_accuracy": 0.4855854455381632, "num_tokens": 38230536.0, "step": 263 }, { "entropy": 2.484375, "epoch": 0.0045453371555486686, "grad_norm": 0.5696126818656921, "learning_rate": 2.6300000000000002e-06, "loss": 2.4688, "mean_token_accuracy": 0.47722396487370133, "num_tokens": 38372741.0, "step": 264 }, { "entropy": 2.3275146484375, "epoch": 0.004562554341743929, "grad_norm": 0.5986506342887878, "learning_rate": 2.64e-06, "loss": 2.2601, "mean_token_accuracy": 0.5040428307838738, "num_tokens": 38510771.0, "step": 265 }, { "entropy": 2.4368896484375, "epoch": 0.004579771527939189, "grad_norm": 0.5909532904624939, "learning_rate": 2.6500000000000005e-06, "loss": 2.3905, "mean_token_accuracy": 0.48478930070996284, "num_tokens": 38656604.0, "step": 266 }, { "entropy": 2.4736328125, "epoch": 0.004596988714134449, "grad_norm": 0.604336142539978, "learning_rate": 2.6600000000000004e-06, "loss": 2.47, "mean_token_accuracy": 0.47080791695043445, "num_tokens": 38790002.0, "step": 267 }, { "entropy": 2.44287109375, "epoch": 0.004614205900329709, "grad_norm": 1.2783533334732056, "learning_rate": 2.6700000000000003e-06, "loss": 2.353, "mean_token_accuracy": 0.4904564335010946, "num_tokens": 38926583.0, "step": 268 }, { "entropy": 2.474365234375, "epoch": 0.0046314230865249694, "grad_norm": 0.6569477915763855, "learning_rate": 2.68e-06, "loss": 2.443, "mean_token_accuracy": 0.47721053985878825, "num_tokens": 39070295.0, "step": 269 }, { "entropy": 2.4146728515625, "epoch": 0.004648640272720229, "grad_norm": 0.6075790524482727, "learning_rate": 2.6900000000000005e-06, "loss": 2.3758, "mean_token_accuracy": 0.4894418097101152, "num_tokens": 39224913.0, "step": 270 }, { "entropy": 2.42333984375, "epoch": 0.00466585745891549, "grad_norm": 0.6399396657943726, "learning_rate": 2.7000000000000004e-06, "loss": 2.3292, "mean_token_accuracy": 0.4931036913767457, "num_tokens": 39357252.0, "step": 271 }, { "entropy": 2.407958984375, "epoch": 0.004683074645110749, "grad_norm": 0.5720593333244324, "learning_rate": 2.7100000000000003e-06, "loss": 2.3628, "mean_token_accuracy": 0.4841322088614106, "num_tokens": 39493624.0, "step": 272 }, { "entropy": 2.5147705078125, "epoch": 0.00470029183130601, "grad_norm": 0.6194799542427063, "learning_rate": 2.7200000000000002e-06, "loss": 2.4367, "mean_token_accuracy": 0.4774662428535521, "num_tokens": 39629017.0, "step": 273 }, { "entropy": 2.461181640625, "epoch": 0.0047175090175012695, "grad_norm": 0.5498484373092651, "learning_rate": 2.7300000000000005e-06, "loss": 2.4471, "mean_token_accuracy": 0.4761566431261599, "num_tokens": 39770313.0, "step": 274 }, { "entropy": 2.458984375, "epoch": 0.00473472620369653, "grad_norm": 0.6001895070075989, "learning_rate": 2.7400000000000004e-06, "loss": 2.4156, "mean_token_accuracy": 0.47761010052636266, "num_tokens": 39921189.0, "step": 275 }, { "entropy": 2.3966064453125, "epoch": 0.00475194338989179, "grad_norm": 0.555188000202179, "learning_rate": 2.7500000000000004e-06, "loss": 2.3301, "mean_token_accuracy": 0.4860731796361506, "num_tokens": 40076244.0, "step": 276 }, { "entropy": 2.404541015625, "epoch": 0.00476916057608705, "grad_norm": 0.5978058576583862, "learning_rate": 2.7600000000000003e-06, "loss": 2.3391, "mean_token_accuracy": 0.49592722998932004, "num_tokens": 40212416.0, "step": 277 }, { "entropy": 2.437744140625, "epoch": 0.00478637776228231, "grad_norm": 0.6279250383377075, "learning_rate": 2.7700000000000006e-06, "loss": 2.3922, "mean_token_accuracy": 0.48303127055987716, "num_tokens": 40349633.0, "step": 278 }, { "entropy": 2.50244140625, "epoch": 0.00480359494847757, "grad_norm": 0.5651630759239197, "learning_rate": 2.7800000000000005e-06, "loss": 2.4524, "mean_token_accuracy": 0.47481566993519664, "num_tokens": 40493744.0, "step": 279 }, { "entropy": 2.450439453125, "epoch": 0.00482081213467283, "grad_norm": 0.5598726272583008, "learning_rate": 2.7900000000000004e-06, "loss": 2.4624, "mean_token_accuracy": 0.476602204144001, "num_tokens": 40643755.0, "step": 280 }, { "entropy": 2.46826171875, "epoch": 0.0048380293208680905, "grad_norm": 0.5888033509254456, "learning_rate": 2.8000000000000003e-06, "loss": 2.4159, "mean_token_accuracy": 0.4830831168219447, "num_tokens": 40784807.0, "step": 281 }, { "entropy": 2.379638671875, "epoch": 0.00485524650706335, "grad_norm": 0.5815398693084717, "learning_rate": 2.8100000000000006e-06, "loss": 2.351, "mean_token_accuracy": 0.49111180379986763, "num_tokens": 40934466.0, "step": 282 }, { "entropy": 2.4080810546875, "epoch": 0.004872463693258611, "grad_norm": 0.5862876176834106, "learning_rate": 2.82e-06, "loss": 2.3539, "mean_token_accuracy": 0.49174999026581645, "num_tokens": 41069116.0, "step": 283 }, { "entropy": 2.40478515625, "epoch": 0.004889680879453871, "grad_norm": 0.5889655947685242, "learning_rate": 2.83e-06, "loss": 2.3569, "mean_token_accuracy": 0.4852382456883788, "num_tokens": 41201155.0, "step": 284 }, { "entropy": 2.4091796875, "epoch": 0.004906898065649131, "grad_norm": 0.6074087619781494, "learning_rate": 2.84e-06, "loss": 2.3607, "mean_token_accuracy": 0.48959261691197753, "num_tokens": 41332480.0, "step": 285 }, { "entropy": 2.451416015625, "epoch": 0.0049241152518443914, "grad_norm": 0.6012029051780701, "learning_rate": 2.85e-06, "loss": 2.4236, "mean_token_accuracy": 0.486750605981797, "num_tokens": 41480685.0, "step": 286 }, { "entropy": 2.412841796875, "epoch": 0.004941332438039651, "grad_norm": 0.6248612999916077, "learning_rate": 2.86e-06, "loss": 2.3435, "mean_token_accuracy": 0.49190296651795506, "num_tokens": 41623635.0, "step": 287 }, { "entropy": 2.4788818359375, "epoch": 0.004958549624234912, "grad_norm": 0.5725988745689392, "learning_rate": 2.87e-06, "loss": 2.4732, "mean_token_accuracy": 0.47983552562072873, "num_tokens": 41781312.0, "step": 288 }, { "entropy": 2.446533203125, "epoch": 0.004975766810430171, "grad_norm": 0.6067764163017273, "learning_rate": 2.88e-06, "loss": 2.3832, "mean_token_accuracy": 0.48594531044363976, "num_tokens": 41913646.0, "step": 289 }, { "entropy": 2.4150390625, "epoch": 0.004992983996625432, "grad_norm": 0.6115426421165466, "learning_rate": 2.89e-06, "loss": 2.3762, "mean_token_accuracy": 0.4852985511533916, "num_tokens": 42048939.0, "step": 290 }, { "entropy": 2.5048828125, "epoch": 0.0050102011828206915, "grad_norm": 0.5820481777191162, "learning_rate": 2.9e-06, "loss": 2.4655, "mean_token_accuracy": 0.473651675041765, "num_tokens": 42187764.0, "step": 291 }, { "entropy": 2.4854736328125, "epoch": 0.005027418369015952, "grad_norm": 0.5613899827003479, "learning_rate": 2.91e-06, "loss": 2.4199, "mean_token_accuracy": 0.48045311588793993, "num_tokens": 42335423.0, "step": 292 }, { "entropy": 2.472412109375, "epoch": 0.005044635555211212, "grad_norm": 0.5480543375015259, "learning_rate": 2.92e-06, "loss": 2.4305, "mean_token_accuracy": 0.47681946912780404, "num_tokens": 42489078.0, "step": 293 }, { "entropy": 2.3948974609375, "epoch": 0.005061852741406472, "grad_norm": 0.554536759853363, "learning_rate": 2.93e-06, "loss": 2.3142, "mean_token_accuracy": 0.4870352731086314, "num_tokens": 42638769.0, "step": 294 }, { "entropy": 2.4532470703125, "epoch": 0.005079069927601732, "grad_norm": 0.5379908084869385, "learning_rate": 2.9400000000000002e-06, "loss": 2.4019, "mean_token_accuracy": 0.4769473564811051, "num_tokens": 42784366.0, "step": 295 }, { "entropy": 2.442626953125, "epoch": 0.005096287113796992, "grad_norm": 0.5843129754066467, "learning_rate": 2.95e-06, "loss": 2.4068, "mean_token_accuracy": 0.4839549297466874, "num_tokens": 42931723.0, "step": 296 }, { "entropy": 2.4779052734375, "epoch": 0.005113504299992252, "grad_norm": 0.5816124081611633, "learning_rate": 2.96e-06, "loss": 2.4102, "mean_token_accuracy": 0.47800721740350127, "num_tokens": 43068668.0, "step": 297 }, { "entropy": 2.5050048828125, "epoch": 0.0051307214861875125, "grad_norm": 0.5763994455337524, "learning_rate": 2.97e-06, "loss": 2.4308, "mean_token_accuracy": 0.47180112451314926, "num_tokens": 43212264.0, "step": 298 }, { "entropy": 2.4345703125, "epoch": 0.005147938672382772, "grad_norm": 0.5906177759170532, "learning_rate": 2.9800000000000003e-06, "loss": 2.44, "mean_token_accuracy": 0.47890339232981205, "num_tokens": 43358872.0, "step": 299 }, { "entropy": 2.361328125, "epoch": 0.005165155858578033, "grad_norm": 0.6285274028778076, "learning_rate": 2.99e-06, "loss": 2.3159, "mean_token_accuracy": 0.49288657307624817, "num_tokens": 43505769.0, "step": 300 }, { "entropy": 2.418701171875, "epoch": 0.005182373044773292, "grad_norm": 0.5737318396568298, "learning_rate": 3e-06, "loss": 2.3905, "mean_token_accuracy": 0.48132931999862194, "num_tokens": 43655664.0, "step": 301 }, { "entropy": 2.40673828125, "epoch": 0.005199590230968553, "grad_norm": 0.6073116660118103, "learning_rate": 3.01e-06, "loss": 2.3336, "mean_token_accuracy": 0.4943325803615153, "num_tokens": 43792452.0, "step": 302 }, { "entropy": 2.3916015625, "epoch": 0.0052168074171638126, "grad_norm": 0.5638276934623718, "learning_rate": 3.0200000000000003e-06, "loss": 2.3604, "mean_token_accuracy": 0.4889626274816692, "num_tokens": 43950674.0, "step": 303 }, { "entropy": 2.45849609375, "epoch": 0.005234024603359073, "grad_norm": 0.5540956854820251, "learning_rate": 3.0300000000000002e-06, "loss": 2.4143, "mean_token_accuracy": 0.47294150246307254, "num_tokens": 44104574.0, "step": 304 }, { "entropy": 2.4903564453125, "epoch": 0.005251241789554333, "grad_norm": 0.6057919263839722, "learning_rate": 3.04e-06, "loss": 2.4804, "mean_token_accuracy": 0.4774297019466758, "num_tokens": 44268375.0, "step": 305 }, { "entropy": 2.42431640625, "epoch": 0.005268458975749593, "grad_norm": 0.5463052988052368, "learning_rate": 3.05e-06, "loss": 2.3637, "mean_token_accuracy": 0.4869398158043623, "num_tokens": 44421122.0, "step": 306 }, { "entropy": 2.4371337890625, "epoch": 0.005285676161944853, "grad_norm": 0.581701934337616, "learning_rate": 3.0600000000000003e-06, "loss": 2.3915, "mean_token_accuracy": 0.4823682149872184, "num_tokens": 44580278.0, "step": 307 }, { "entropy": 2.412841796875, "epoch": 0.0053028933481401135, "grad_norm": 0.6069723963737488, "learning_rate": 3.0700000000000003e-06, "loss": 2.3847, "mean_token_accuracy": 0.4826384102925658, "num_tokens": 44722347.0, "step": 308 }, { "entropy": 2.4246826171875, "epoch": 0.005320110534335374, "grad_norm": 0.6269805431365967, "learning_rate": 3.08e-06, "loss": 2.3817, "mean_token_accuracy": 0.4870366188697517, "num_tokens": 44859344.0, "step": 309 }, { "entropy": 2.5079345703125, "epoch": 0.005337327720530634, "grad_norm": 0.5604663491249084, "learning_rate": 3.09e-06, "loss": 2.4903, "mean_token_accuracy": 0.4686946659348905, "num_tokens": 45007467.0, "step": 310 }, { "entropy": 2.53173828125, "epoch": 0.005354544906725894, "grad_norm": 0.6524129509925842, "learning_rate": 3.1000000000000004e-06, "loss": 2.5078, "mean_token_accuracy": 0.4713136567734182, "num_tokens": 45151205.0, "step": 311 }, { "entropy": 2.4884033203125, "epoch": 0.005371762092921154, "grad_norm": 0.6845608353614807, "learning_rate": 3.1100000000000003e-06, "loss": 2.4559, "mean_token_accuracy": 0.47201906703412533, "num_tokens": 45283726.0, "step": 312 }, { "entropy": 2.4666748046875, "epoch": 0.005388979279116414, "grad_norm": 0.5663162469863892, "learning_rate": 3.12e-06, "loss": 2.4598, "mean_token_accuracy": 0.4775085118599236, "num_tokens": 45431733.0, "step": 313 }, { "entropy": 2.43603515625, "epoch": 0.005406196465311674, "grad_norm": 0.6073246002197266, "learning_rate": 3.13e-06, "loss": 2.3789, "mean_token_accuracy": 0.48635248839855194, "num_tokens": 45564227.0, "step": 314 }, { "entropy": 2.403564453125, "epoch": 0.0054234136515069345, "grad_norm": 0.6272099018096924, "learning_rate": 3.1400000000000004e-06, "loss": 2.3457, "mean_token_accuracy": 0.49023490585386753, "num_tokens": 45694763.0, "step": 315 }, { "entropy": 2.45947265625, "epoch": 0.005440630837702194, "grad_norm": 0.6056774258613586, "learning_rate": 3.1500000000000003e-06, "loss": 2.4134, "mean_token_accuracy": 0.48205672251060605, "num_tokens": 45822589.0, "step": 316 }, { "entropy": 2.479248046875, "epoch": 0.005457848023897455, "grad_norm": 0.5762467384338379, "learning_rate": 3.1600000000000002e-06, "loss": 2.4066, "mean_token_accuracy": 0.474984189029783, "num_tokens": 45963018.0, "step": 317 }, { "entropy": 2.45849609375, "epoch": 0.005475065210092714, "grad_norm": 0.5697450637817383, "learning_rate": 3.17e-06, "loss": 2.4098, "mean_token_accuracy": 0.481595104560256, "num_tokens": 46116597.0, "step": 318 }, { "entropy": 2.455322265625, "epoch": 0.005492282396287975, "grad_norm": 0.5741132497787476, "learning_rate": 3.1800000000000005e-06, "loss": 2.4058, "mean_token_accuracy": 0.48317222855985165, "num_tokens": 46269799.0, "step": 319 }, { "entropy": 2.4736328125, "epoch": 0.0055094995824832346, "grad_norm": 0.5584312677383423, "learning_rate": 3.1900000000000004e-06, "loss": 2.4214, "mean_token_accuracy": 0.47542993212118745, "num_tokens": 46413009.0, "step": 320 }, { "entropy": 2.345703125, "epoch": 0.005526716768678495, "grad_norm": 0.585615336894989, "learning_rate": 3.2000000000000003e-06, "loss": 2.3015, "mean_token_accuracy": 0.5034409500658512, "num_tokens": 46553428.0, "step": 321 }, { "entropy": 2.476318359375, "epoch": 0.005543933954873755, "grad_norm": 0.5882828235626221, "learning_rate": 3.21e-06, "loss": 2.4505, "mean_token_accuracy": 0.4761179364286363, "num_tokens": 46694050.0, "step": 322 }, { "entropy": 2.5, "epoch": 0.005561151141069015, "grad_norm": 0.5832546949386597, "learning_rate": 3.2200000000000005e-06, "loss": 2.4788, "mean_token_accuracy": 0.4718233407475054, "num_tokens": 46834894.0, "step": 323 }, { "entropy": 2.4697265625, "epoch": 0.005578368327264275, "grad_norm": 0.5746425986289978, "learning_rate": 3.2300000000000004e-06, "loss": 2.437, "mean_token_accuracy": 0.473955764900893, "num_tokens": 46971269.0, "step": 324 }, { "entropy": 2.493896484375, "epoch": 0.0055955855134595354, "grad_norm": 0.5796445608139038, "learning_rate": 3.2400000000000003e-06, "loss": 2.485, "mean_token_accuracy": 0.4741158653050661, "num_tokens": 47120409.0, "step": 325 }, { "entropy": 2.455078125, "epoch": 0.005612802699654795, "grad_norm": 0.5498818159103394, "learning_rate": 3.2500000000000002e-06, "loss": 2.4014, "mean_token_accuracy": 0.48121104296296835, "num_tokens": 47268071.0, "step": 326 }, { "entropy": 2.36669921875, "epoch": 0.005630019885850056, "grad_norm": 0.5759862065315247, "learning_rate": 3.2600000000000006e-06, "loss": 2.3247, "mean_token_accuracy": 0.4973863451741636, "num_tokens": 47417393.0, "step": 327 }, { "entropy": 2.4267578125, "epoch": 0.005647237072045315, "grad_norm": 0.5729436874389648, "learning_rate": 3.2700000000000005e-06, "loss": 2.3774, "mean_token_accuracy": 0.48343247501179576, "num_tokens": 47557540.0, "step": 328 }, { "entropy": 2.4736328125, "epoch": 0.005664454258240576, "grad_norm": 0.5836823582649231, "learning_rate": 3.2800000000000004e-06, "loss": 2.4552, "mean_token_accuracy": 0.4818938923999667, "num_tokens": 47698283.0, "step": 329 }, { "entropy": 2.47900390625, "epoch": 0.0056816714444358355, "grad_norm": 0.5725101828575134, "learning_rate": 3.2900000000000003e-06, "loss": 2.4444, "mean_token_accuracy": 0.4746799021959305, "num_tokens": 47841703.0, "step": 330 }, { "entropy": 2.480712890625, "epoch": 0.005698888630631096, "grad_norm": 0.573144257068634, "learning_rate": 3.3000000000000006e-06, "loss": 2.427, "mean_token_accuracy": 0.47849088767543435, "num_tokens": 47992998.0, "step": 331 }, { "entropy": 2.496337890625, "epoch": 0.005716105816826356, "grad_norm": 0.5374743938446045, "learning_rate": 3.3100000000000005e-06, "loss": 2.4603, "mean_token_accuracy": 0.4686804707162082, "num_tokens": 48155349.0, "step": 332 }, { "entropy": 2.3896484375, "epoch": 0.005733323003021616, "grad_norm": 0.5805611610412598, "learning_rate": 3.3200000000000004e-06, "loss": 2.3428, "mean_token_accuracy": 0.49433091934770346, "num_tokens": 48301544.0, "step": 333 }, { "entropy": 2.4912109375, "epoch": 0.005750540189216877, "grad_norm": 0.5827764868736267, "learning_rate": 3.3300000000000003e-06, "loss": 2.439, "mean_token_accuracy": 0.4705883339047432, "num_tokens": 48443261.0, "step": 334 }, { "entropy": 2.476318359375, "epoch": 0.005767757375412136, "grad_norm": 0.5864250063896179, "learning_rate": 3.3400000000000006e-06, "loss": 2.4157, "mean_token_accuracy": 0.4782019346021116, "num_tokens": 48581146.0, "step": 335 }, { "entropy": 2.538818359375, "epoch": 0.005784974561607397, "grad_norm": 0.55199134349823, "learning_rate": 3.3500000000000005e-06, "loss": 2.4757, "mean_token_accuracy": 0.46869856445118785, "num_tokens": 48736815.0, "step": 336 }, { "entropy": 2.4351806640625, "epoch": 0.0058021917478026565, "grad_norm": 0.574338436126709, "learning_rate": 3.3600000000000004e-06, "loss": 2.3759, "mean_token_accuracy": 0.4860638175159693, "num_tokens": 48897927.0, "step": 337 }, { "entropy": 2.4539794921875, "epoch": 0.005819408933997917, "grad_norm": 0.5789890289306641, "learning_rate": 3.3700000000000003e-06, "loss": 2.3891, "mean_token_accuracy": 0.4830233994871378, "num_tokens": 49041318.0, "step": 338 }, { "entropy": 2.3941650390625, "epoch": 0.005836626120193177, "grad_norm": 0.6387662887573242, "learning_rate": 3.3800000000000007e-06, "loss": 2.3505, "mean_token_accuracy": 0.49095042794942856, "num_tokens": 49176213.0, "step": 339 }, { "entropy": 2.441162109375, "epoch": 0.005853843306388437, "grad_norm": 0.6007640957832336, "learning_rate": 3.3900000000000006e-06, "loss": 2.3682, "mean_token_accuracy": 0.4886678601615131, "num_tokens": 49305639.0, "step": 340 }, { "entropy": 2.51904296875, "epoch": 0.005871060492583697, "grad_norm": 0.5937355160713196, "learning_rate": 3.4000000000000005e-06, "loss": 2.4649, "mean_token_accuracy": 0.47522739693522453, "num_tokens": 49445884.0, "step": 341 }, { "entropy": 2.5255126953125, "epoch": 0.0058882776787789574, "grad_norm": 0.6529424786567688, "learning_rate": 3.4100000000000004e-06, "loss": 2.4718, "mean_token_accuracy": 0.4746531555429101, "num_tokens": 49585678.0, "step": 342 }, { "entropy": 2.4620361328125, "epoch": 0.005905494864974217, "grad_norm": 0.5634227991104126, "learning_rate": 3.4200000000000007e-06, "loss": 2.4171, "mean_token_accuracy": 0.47586130583658814, "num_tokens": 49733283.0, "step": 343 }, { "entropy": 2.4451904296875, "epoch": 0.005922712051169478, "grad_norm": 0.5977884531021118, "learning_rate": 3.4300000000000006e-06, "loss": 2.3813, "mean_token_accuracy": 0.47802019072696567, "num_tokens": 49872559.0, "step": 344 }, { "entropy": 2.4005126953125, "epoch": 0.005939929237364737, "grad_norm": 0.5835461020469666, "learning_rate": 3.44e-06, "loss": 2.351, "mean_token_accuracy": 0.4916755324229598, "num_tokens": 50014503.0, "step": 345 }, { "entropy": 2.414306640625, "epoch": 0.005957146423559998, "grad_norm": 0.616673469543457, "learning_rate": 3.45e-06, "loss": 2.3877, "mean_token_accuracy": 0.4860090627335012, "num_tokens": 50167107.0, "step": 346 }, { "entropy": 2.537353515625, "epoch": 0.0059743636097552575, "grad_norm": 0.6396327018737793, "learning_rate": 3.46e-06, "loss": 2.4815, "mean_token_accuracy": 0.4667340232990682, "num_tokens": 50299354.0, "step": 347 }, { "entropy": 2.424560546875, "epoch": 0.005991580795950518, "grad_norm": 0.5635019540786743, "learning_rate": 3.4700000000000002e-06, "loss": 2.3938, "mean_token_accuracy": 0.48614488868042827, "num_tokens": 50448841.0, "step": 348 }, { "entropy": 2.446044921875, "epoch": 0.006008797982145778, "grad_norm": 0.5684136152267456, "learning_rate": 3.48e-06, "loss": 2.3895, "mean_token_accuracy": 0.48185077449306846, "num_tokens": 50591226.0, "step": 349 }, { "entropy": 2.4525146484375, "epoch": 0.006026015168341038, "grad_norm": 0.614551842212677, "learning_rate": 3.49e-06, "loss": 2.4292, "mean_token_accuracy": 0.47555977012962103, "num_tokens": 50736325.0, "step": 350 }, { "entropy": 2.432373046875, "epoch": 0.006043232354536298, "grad_norm": 0.5730347633361816, "learning_rate": 3.5e-06, "loss": 2.402, "mean_token_accuracy": 0.4838436101563275, "num_tokens": 50882008.0, "step": 351 }, { "entropy": 2.40478515625, "epoch": 0.006060449540731558, "grad_norm": 0.5943769812583923, "learning_rate": 3.5100000000000003e-06, "loss": 2.3651, "mean_token_accuracy": 0.4828978287987411, "num_tokens": 51022802.0, "step": 352 }, { "entropy": 2.4420166015625, "epoch": 0.006077666726926818, "grad_norm": 0.5597745180130005, "learning_rate": 3.52e-06, "loss": 2.3646, "mean_token_accuracy": 0.4837341457605362, "num_tokens": 51165806.0, "step": 353 }, { "entropy": 2.487060546875, "epoch": 0.0060948839131220785, "grad_norm": 0.5304551124572754, "learning_rate": 3.53e-06, "loss": 2.4595, "mean_token_accuracy": 0.4816332710906863, "num_tokens": 51325353.0, "step": 354 }, { "entropy": 2.4630126953125, "epoch": 0.006112101099317338, "grad_norm": 0.6318809390068054, "learning_rate": 3.54e-06, "loss": 2.4102, "mean_token_accuracy": 0.4823563848622143, "num_tokens": 51447642.0, "step": 355 }, { "entropy": 2.4342041015625, "epoch": 0.006129318285512599, "grad_norm": 0.571685791015625, "learning_rate": 3.5500000000000003e-06, "loss": 2.3455, "mean_token_accuracy": 0.48439886793494225, "num_tokens": 51593840.0, "step": 356 }, { "entropy": 2.42138671875, "epoch": 0.006146535471707858, "grad_norm": 0.6026922464370728, "learning_rate": 3.5600000000000002e-06, "loss": 2.3275, "mean_token_accuracy": 0.4946357752196491, "num_tokens": 51727430.0, "step": 357 }, { "entropy": 2.3724365234375, "epoch": 0.006163752657903119, "grad_norm": 0.584361732006073, "learning_rate": 3.57e-06, "loss": 2.3214, "mean_token_accuracy": 0.48994710063561797, "num_tokens": 51873095.0, "step": 358 }, { "entropy": 2.47265625, "epoch": 0.0061809698440983794, "grad_norm": 0.5443347692489624, "learning_rate": 3.58e-06, "loss": 2.4081, "mean_token_accuracy": 0.4771101255901158, "num_tokens": 52022705.0, "step": 359 }, { "entropy": 2.44677734375, "epoch": 0.006198187030293639, "grad_norm": 0.5630244016647339, "learning_rate": 3.5900000000000004e-06, "loss": 2.4228, "mean_token_accuracy": 0.4784181611612439, "num_tokens": 52168321.0, "step": 360 }, { "entropy": 2.467529296875, "epoch": 0.0062154042164889, "grad_norm": 0.5848613977432251, "learning_rate": 3.6000000000000003e-06, "loss": 2.4734, "mean_token_accuracy": 0.47760110441595316, "num_tokens": 52321812.0, "step": 361 }, { "entropy": 2.431396484375, "epoch": 0.006232621402684159, "grad_norm": 0.5731722712516785, "learning_rate": 3.61e-06, "loss": 2.3849, "mean_token_accuracy": 0.48051671124994755, "num_tokens": 52477181.0, "step": 362 }, { "entropy": 2.435546875, "epoch": 0.00624983858887942, "grad_norm": 0.5230873227119446, "learning_rate": 3.62e-06, "loss": 2.361, "mean_token_accuracy": 0.4862850420176983, "num_tokens": 52638308.0, "step": 363 }, { "entropy": 2.3856201171875, "epoch": 0.0062670557750746795, "grad_norm": 0.5921328067779541, "learning_rate": 3.6300000000000004e-06, "loss": 2.3105, "mean_token_accuracy": 0.496675749309361, "num_tokens": 52778098.0, "step": 364 }, { "entropy": 2.486083984375, "epoch": 0.00628427296126994, "grad_norm": 0.5676338076591492, "learning_rate": 3.6400000000000003e-06, "loss": 2.4375, "mean_token_accuracy": 0.4777340483851731, "num_tokens": 52926007.0, "step": 365 }, { "entropy": 2.5103759765625, "epoch": 0.0063014901474652, "grad_norm": 0.5867769718170166, "learning_rate": 3.65e-06, "loss": 2.486, "mean_token_accuracy": 0.47535310545936227, "num_tokens": 53067977.0, "step": 366 }, { "entropy": 2.443115234375, "epoch": 0.00631870733366046, "grad_norm": 0.5718576312065125, "learning_rate": 3.66e-06, "loss": 2.4122, "mean_token_accuracy": 0.48204088117927313, "num_tokens": 53214899.0, "step": 367 }, { "entropy": 2.4334716796875, "epoch": 0.00633592451985572, "grad_norm": 0.5422816872596741, "learning_rate": 3.6700000000000004e-06, "loss": 2.4062, "mean_token_accuracy": 0.48176093958318233, "num_tokens": 53376856.0, "step": 368 }, { "entropy": 2.44580078125, "epoch": 0.00635314170605098, "grad_norm": 0.5711188912391663, "learning_rate": 3.6800000000000003e-06, "loss": 2.393, "mean_token_accuracy": 0.48084082640707493, "num_tokens": 53512787.0, "step": 369 }, { "entropy": 2.394287109375, "epoch": 0.00637035889224624, "grad_norm": 0.5846443772315979, "learning_rate": 3.6900000000000002e-06, "loss": 2.3919, "mean_token_accuracy": 0.49098140047863126, "num_tokens": 53658651.0, "step": 370 }, { "entropy": 2.4273681640625, "epoch": 0.0063875760784415005, "grad_norm": 0.5670855641365051, "learning_rate": 3.7e-06, "loss": 2.3645, "mean_token_accuracy": 0.48496190132573247, "num_tokens": 53799071.0, "step": 371 }, { "entropy": 2.559814453125, "epoch": 0.00640479326463676, "grad_norm": 0.5919318199157715, "learning_rate": 3.7100000000000005e-06, "loss": 2.5396, "mean_token_accuracy": 0.46628884179517627, "num_tokens": 53932932.0, "step": 372 }, { "entropy": 2.4007568359375, "epoch": 0.006422010450832021, "grad_norm": 0.5672026872634888, "learning_rate": 3.7200000000000004e-06, "loss": 2.3322, "mean_token_accuracy": 0.48820391669869423, "num_tokens": 54086169.0, "step": 373 }, { "entropy": 2.3978271484375, "epoch": 0.00643922763702728, "grad_norm": 0.5757989883422852, "learning_rate": 3.7300000000000003e-06, "loss": 2.3164, "mean_token_accuracy": 0.4853466097265482, "num_tokens": 54241964.0, "step": 374 }, { "entropy": 2.44580078125, "epoch": 0.006456444823222541, "grad_norm": 0.562664806842804, "learning_rate": 3.74e-06, "loss": 2.4101, "mean_token_accuracy": 0.47567897848784924, "num_tokens": 54386084.0, "step": 375 }, { "entropy": 2.4031982421875, "epoch": 0.0064736620094178006, "grad_norm": 0.5849329829216003, "learning_rate": 3.7500000000000005e-06, "loss": 2.3482, "mean_token_accuracy": 0.48964470298960805, "num_tokens": 54523676.0, "step": 376 }, { "entropy": 2.4454345703125, "epoch": 0.006490879195613061, "grad_norm": 0.6109877824783325, "learning_rate": 3.7600000000000004e-06, "loss": 2.4117, "mean_token_accuracy": 0.48503589117899537, "num_tokens": 54662310.0, "step": 377 }, { "entropy": 2.4466552734375, "epoch": 0.006508096381808321, "grad_norm": 0.5700381398200989, "learning_rate": 3.7700000000000003e-06, "loss": 2.4555, "mean_token_accuracy": 0.477643181104213, "num_tokens": 54809183.0, "step": 378 }, { "entropy": 2.4498291015625, "epoch": 0.006525313568003581, "grad_norm": 0.5381008386611938, "learning_rate": 3.7800000000000002e-06, "loss": 2.4346, "mean_token_accuracy": 0.47657692804932594, "num_tokens": 54968533.0, "step": 379 }, { "entropy": 2.4263916015625, "epoch": 0.006542530754198841, "grad_norm": 0.5504759550094604, "learning_rate": 3.79e-06, "loss": 2.3587, "mean_token_accuracy": 0.48776144767180085, "num_tokens": 55117824.0, "step": 380 }, { "entropy": 2.4163818359375, "epoch": 0.0065597479403941015, "grad_norm": 0.5289463400840759, "learning_rate": 3.8000000000000005e-06, "loss": 2.3674, "mean_token_accuracy": 0.48521781573072076, "num_tokens": 55269929.0, "step": 381 }, { "entropy": 2.4097900390625, "epoch": 0.006576965126589361, "grad_norm": 0.55353182554245, "learning_rate": 3.8100000000000004e-06, "loss": 2.3691, "mean_token_accuracy": 0.48239677073433995, "num_tokens": 55423948.0, "step": 382 }, { "entropy": 2.5047607421875, "epoch": 0.006594182312784622, "grad_norm": 0.5586856603622437, "learning_rate": 3.820000000000001e-06, "loss": 2.4676, "mean_token_accuracy": 0.47292688954621553, "num_tokens": 55567887.0, "step": 383 }, { "entropy": 2.47802734375, "epoch": 0.006611399498979881, "grad_norm": 0.5976067781448364, "learning_rate": 3.830000000000001e-06, "loss": 2.4577, "mean_token_accuracy": 0.4735474893823266, "num_tokens": 55700170.0, "step": 384 }, { "entropy": 2.4603271484375, "epoch": 0.006628616685175142, "grad_norm": 0.5445942878723145, "learning_rate": 3.8400000000000005e-06, "loss": 2.4459, "mean_token_accuracy": 0.4747904692776501, "num_tokens": 55852413.0, "step": 385 }, { "entropy": 2.494873046875, "epoch": 0.006645833871370402, "grad_norm": 0.5769667625427246, "learning_rate": 3.85e-06, "loss": 2.4465, "mean_token_accuracy": 0.4768332834355533, "num_tokens": 56010312.0, "step": 386 }, { "entropy": 2.4306640625, "epoch": 0.006663051057565662, "grad_norm": 0.5875312685966492, "learning_rate": 3.86e-06, "loss": 2.3771, "mean_token_accuracy": 0.4805051935836673, "num_tokens": 56149079.0, "step": 387 }, { "entropy": 2.406494140625, "epoch": 0.0066802682437609225, "grad_norm": 0.5804882049560547, "learning_rate": 3.87e-06, "loss": 2.38, "mean_token_accuracy": 0.48818252328783274, "num_tokens": 56292210.0, "step": 388 }, { "entropy": 2.4931640625, "epoch": 0.006697485429956182, "grad_norm": 0.6065754890441895, "learning_rate": 3.88e-06, "loss": 2.5048, "mean_token_accuracy": 0.47188919549807906, "num_tokens": 56425512.0, "step": 389 }, { "entropy": 2.528076171875, "epoch": 0.006714702616151443, "grad_norm": 0.5695645213127136, "learning_rate": 3.89e-06, "loss": 2.4927, "mean_token_accuracy": 0.47105303034186363, "num_tokens": 56567538.0, "step": 390 }, { "entropy": 2.4716796875, "epoch": 0.006731919802346702, "grad_norm": 0.5461863279342651, "learning_rate": 3.900000000000001e-06, "loss": 2.4607, "mean_token_accuracy": 0.47544049797579646, "num_tokens": 56723718.0, "step": 391 }, { "entropy": 2.42236328125, "epoch": 0.006749136988541963, "grad_norm": 0.7229442000389099, "learning_rate": 3.910000000000001e-06, "loss": 2.322, "mean_token_accuracy": 0.49011519411578774, "num_tokens": 56881653.0, "step": 392 }, { "entropy": 2.4364013671875, "epoch": 0.0067663541747372226, "grad_norm": 0.5672553181648254, "learning_rate": 3.920000000000001e-06, "loss": 2.418, "mean_token_accuracy": 0.4815367963165045, "num_tokens": 57026712.0, "step": 393 }, { "entropy": 2.4490966796875, "epoch": 0.006783571360932483, "grad_norm": 0.6193339824676514, "learning_rate": 3.9300000000000005e-06, "loss": 2.324, "mean_token_accuracy": 0.4879821529611945, "num_tokens": 57154864.0, "step": 394 }, { "entropy": 2.424072265625, "epoch": 0.006800788547127743, "grad_norm": 0.6047675609588623, "learning_rate": 3.94e-06, "loss": 2.343, "mean_token_accuracy": 0.49066451471298933, "num_tokens": 57313059.0, "step": 395 }, { "entropy": 2.529052734375, "epoch": 0.006818005733323003, "grad_norm": 0.6238451600074768, "learning_rate": 3.95e-06, "loss": 2.5103, "mean_token_accuracy": 0.4740439150482416, "num_tokens": 57456573.0, "step": 396 }, { "entropy": 2.4873046875, "epoch": 0.006835222919518263, "grad_norm": 0.6354604363441467, "learning_rate": 3.96e-06, "loss": 2.4319, "mean_token_accuracy": 0.475927259773016, "num_tokens": 57601572.0, "step": 397 }, { "entropy": 2.527587890625, "epoch": 0.0068524401057135234, "grad_norm": 0.6518051624298096, "learning_rate": 3.97e-06, "loss": 2.4717, "mean_token_accuracy": 0.4759511463344097, "num_tokens": 57737985.0, "step": 398 }, { "entropy": 2.400146484375, "epoch": 0.006869657291908783, "grad_norm": 0.5659388899803162, "learning_rate": 3.980000000000001e-06, "loss": 2.3422, "mean_token_accuracy": 0.4919633981771767, "num_tokens": 57886268.0, "step": 399 }, { "entropy": 2.4190673828125, "epoch": 0.006886874478104044, "grad_norm": 0.6125981211662292, "learning_rate": 3.990000000000001e-06, "loss": 2.3588, "mean_token_accuracy": 0.49498999677598476, "num_tokens": 58020557.0, "step": 400 }, { "entropy": 2.4425048828125, "epoch": 0.006904091664299303, "grad_norm": 0.5755429267883301, "learning_rate": 4.000000000000001e-06, "loss": 2.4063, "mean_token_accuracy": 0.4781268546357751, "num_tokens": 58157132.0, "step": 401 }, { "entropy": 2.4521484375, "epoch": 0.006921308850494564, "grad_norm": 0.5530086159706116, "learning_rate": 4.0100000000000006e-06, "loss": 2.3836, "mean_token_accuracy": 0.4830552595667541, "num_tokens": 58306412.0, "step": 402 }, { "entropy": 2.4962158203125, "epoch": 0.0069385260366898235, "grad_norm": 0.6048458218574524, "learning_rate": 4.0200000000000005e-06, "loss": 2.4414, "mean_token_accuracy": 0.4728825897909701, "num_tokens": 58441138.0, "step": 403 }, { "entropy": 2.456298828125, "epoch": 0.006955743222885084, "grad_norm": 0.5381326079368591, "learning_rate": 4.03e-06, "loss": 2.4554, "mean_token_accuracy": 0.474265918135643, "num_tokens": 58601535.0, "step": 404 }, { "entropy": 2.4443359375, "epoch": 0.006972960409080344, "grad_norm": 0.5713069438934326, "learning_rate": 4.04e-06, "loss": 2.3849, "mean_token_accuracy": 0.4821310769766569, "num_tokens": 58732975.0, "step": 405 }, { "entropy": 2.421142578125, "epoch": 0.006990177595275604, "grad_norm": 0.6063797473907471, "learning_rate": 4.05e-06, "loss": 2.3477, "mean_token_accuracy": 0.4892558893188834, "num_tokens": 58863667.0, "step": 406 }, { "entropy": 2.4732666015625, "epoch": 0.007007394781470864, "grad_norm": 0.5941247940063477, "learning_rate": 4.060000000000001e-06, "loss": 2.4404, "mean_token_accuracy": 0.4769346718676388, "num_tokens": 59001984.0, "step": 407 }, { "entropy": 2.4593505859375, "epoch": 0.007024611967666124, "grad_norm": 0.5566405653953552, "learning_rate": 4.07e-06, "loss": 2.4064, "mean_token_accuracy": 0.480907566845417, "num_tokens": 59155387.0, "step": 408 }, { "entropy": 2.40966796875, "epoch": 0.007041829153861384, "grad_norm": 0.5935348868370056, "learning_rate": 4.08e-06, "loss": 2.3289, "mean_token_accuracy": 0.49064510269090533, "num_tokens": 59311521.0, "step": 409 }, { "entropy": 2.457275390625, "epoch": 0.0070590463400566445, "grad_norm": 0.581543505191803, "learning_rate": 4.09e-06, "loss": 2.4144, "mean_token_accuracy": 0.4777227705344558, "num_tokens": 59461508.0, "step": 410 }, { "entropy": 2.46240234375, "epoch": 0.007076263526251905, "grad_norm": 0.5600920915603638, "learning_rate": 4.1e-06, "loss": 2.4242, "mean_token_accuracy": 0.47735275235027075, "num_tokens": 59611309.0, "step": 411 }, { "entropy": 2.3861083984375, "epoch": 0.007093480712447165, "grad_norm": 0.645194411277771, "learning_rate": 4.1100000000000005e-06, "loss": 2.3315, "mean_token_accuracy": 0.4974592626094818, "num_tokens": 59739837.0, "step": 412 }, { "entropy": 2.42822265625, "epoch": 0.007110697898642425, "grad_norm": 0.5792311429977417, "learning_rate": 4.12e-06, "loss": 2.3767, "mean_token_accuracy": 0.4854038432240486, "num_tokens": 59874636.0, "step": 413 }, { "entropy": 2.527587890625, "epoch": 0.007127915084837685, "grad_norm": 0.5748518109321594, "learning_rate": 4.13e-06, "loss": 2.4767, "mean_token_accuracy": 0.4697956978343427, "num_tokens": 60006386.0, "step": 414 }, { "entropy": 2.4110107421875, "epoch": 0.0071451322710329454, "grad_norm": 0.5458788871765137, "learning_rate": 4.14e-06, "loss": 2.3364, "mean_token_accuracy": 0.4888159199617803, "num_tokens": 60161574.0, "step": 415 }, { "entropy": 2.4501953125, "epoch": 0.007162349457228205, "grad_norm": 0.5602844953536987, "learning_rate": 4.15e-06, "loss": 2.4069, "mean_token_accuracy": 0.4802040630020201, "num_tokens": 60315301.0, "step": 416 }, { "entropy": 2.420166015625, "epoch": 0.007179566643423466, "grad_norm": 0.5277621150016785, "learning_rate": 4.16e-06, "loss": 2.3738, "mean_token_accuracy": 0.4825376900844276, "num_tokens": 60475723.0, "step": 417 }, { "entropy": 2.371337890625, "epoch": 0.007196783829618725, "grad_norm": 0.6119918823242188, "learning_rate": 4.17e-06, "loss": 2.3183, "mean_token_accuracy": 0.49627504125237465, "num_tokens": 60624809.0, "step": 418 }, { "entropy": 2.43359375, "epoch": 0.007214001015813986, "grad_norm": 0.5752723217010498, "learning_rate": 4.18e-06, "loss": 2.4064, "mean_token_accuracy": 0.48068576911464334, "num_tokens": 60766487.0, "step": 419 }, { "entropy": 2.55322265625, "epoch": 0.0072312182020092455, "grad_norm": 0.591503918170929, "learning_rate": 4.1900000000000005e-06, "loss": 2.5353, "mean_token_accuracy": 0.46583034889772534, "num_tokens": 60899696.0, "step": 420 }, { "entropy": 2.482666015625, "epoch": 0.007248435388204506, "grad_norm": 0.584340512752533, "learning_rate": 4.2000000000000004e-06, "loss": 2.4115, "mean_token_accuracy": 0.47717864625155926, "num_tokens": 61036187.0, "step": 421 }, { "entropy": 2.4241943359375, "epoch": 0.007265652574399766, "grad_norm": 0.5605193376541138, "learning_rate": 4.21e-06, "loss": 2.4071, "mean_token_accuracy": 0.4835399743169546, "num_tokens": 61193032.0, "step": 422 }, { "entropy": 2.447021484375, "epoch": 0.007282869760595026, "grad_norm": 0.5565392374992371, "learning_rate": 4.22e-06, "loss": 2.3779, "mean_token_accuracy": 0.479055879637599, "num_tokens": 61336014.0, "step": 423 }, { "entropy": 2.4150390625, "epoch": 0.007300086946790286, "grad_norm": 0.5690925717353821, "learning_rate": 4.23e-06, "loss": 2.3403, "mean_token_accuracy": 0.4891307670623064, "num_tokens": 61482900.0, "step": 424 }, { "entropy": 2.3863525390625, "epoch": 0.007317304132985546, "grad_norm": 0.5693207383155823, "learning_rate": 4.24e-06, "loss": 2.3509, "mean_token_accuracy": 0.4905586871318519, "num_tokens": 61629105.0, "step": 425 }, { "entropy": 2.547119140625, "epoch": 0.007334521319180806, "grad_norm": 0.5351988077163696, "learning_rate": 4.25e-06, "loss": 2.4922, "mean_token_accuracy": 0.4689330547116697, "num_tokens": 61780626.0, "step": 426 }, { "entropy": 2.47998046875, "epoch": 0.0073517385053760665, "grad_norm": 0.5875465273857117, "learning_rate": 4.26e-06, "loss": 2.4184, "mean_token_accuracy": 0.4823840647004545, "num_tokens": 61915045.0, "step": 427 }, { "entropy": 2.479248046875, "epoch": 0.007368955691571326, "grad_norm": 0.5979102253913879, "learning_rate": 4.270000000000001e-06, "loss": 2.4622, "mean_token_accuracy": 0.4727623569779098, "num_tokens": 62050386.0, "step": 428 }, { "entropy": 2.4766845703125, "epoch": 0.007386172877766587, "grad_norm": 0.5433111190795898, "learning_rate": 4.2800000000000005e-06, "loss": 2.3951, "mean_token_accuracy": 0.4800742859952152, "num_tokens": 62212530.0, "step": 429 }, { "entropy": 2.4046630859375, "epoch": 0.007403390063961846, "grad_norm": 0.5350104570388794, "learning_rate": 4.2900000000000004e-06, "loss": 2.3617, "mean_token_accuracy": 0.4864892913028598, "num_tokens": 62370301.0, "step": 430 }, { "entropy": 2.4168701171875, "epoch": 0.007420607250157107, "grad_norm": 0.5780488848686218, "learning_rate": 4.3e-06, "loss": 2.3724, "mean_token_accuracy": 0.48195584304630756, "num_tokens": 62520205.0, "step": 431 }, { "entropy": 2.5218505859375, "epoch": 0.0074378244363523666, "grad_norm": 0.5742896795272827, "learning_rate": 4.31e-06, "loss": 2.4678, "mean_token_accuracy": 0.47272730339318514, "num_tokens": 62661467.0, "step": 432 }, { "entropy": 2.519775390625, "epoch": 0.007455041622547627, "grad_norm": 0.5965112447738647, "learning_rate": 4.32e-06, "loss": 2.5211, "mean_token_accuracy": 0.46558596193790436, "num_tokens": 62798987.0, "step": 433 }, { "entropy": 2.4462890625, "epoch": 0.007472258808742887, "grad_norm": 0.5964140295982361, "learning_rate": 4.33e-06, "loss": 2.3445, "mean_token_accuracy": 0.4845765414647758, "num_tokens": 62941022.0, "step": 434 }, { "entropy": 2.403076171875, "epoch": 0.007489475994938147, "grad_norm": 0.5478980541229248, "learning_rate": 4.34e-06, "loss": 2.3149, "mean_token_accuracy": 0.4858265924267471, "num_tokens": 63106699.0, "step": 435 }, { "entropy": 2.486328125, "epoch": 0.007506693181133408, "grad_norm": 0.5869223475456238, "learning_rate": 4.350000000000001e-06, "loss": 2.4946, "mean_token_accuracy": 0.472647019661963, "num_tokens": 63247500.0, "step": 436 }, { "entropy": 2.389404296875, "epoch": 0.0075239103673286675, "grad_norm": 0.583699107170105, "learning_rate": 4.360000000000001e-06, "loss": 2.38, "mean_token_accuracy": 0.4875432150438428, "num_tokens": 63397717.0, "step": 437 }, { "entropy": 2.4312744140625, "epoch": 0.007541127553523928, "grad_norm": 0.6965104341506958, "learning_rate": 4.3700000000000005e-06, "loss": 2.3725, "mean_token_accuracy": 0.4779693940654397, "num_tokens": 63532729.0, "step": 438 }, { "entropy": 2.510498046875, "epoch": 0.007558344739719188, "grad_norm": 0.5973443984985352, "learning_rate": 4.38e-06, "loss": 2.5009, "mean_token_accuracy": 0.469982345122844, "num_tokens": 63673621.0, "step": 439 }, { "entropy": 2.414794921875, "epoch": 0.007575561925914448, "grad_norm": 0.6061626672744751, "learning_rate": 4.39e-06, "loss": 2.3686, "mean_token_accuracy": 0.4842392741702497, "num_tokens": 63811912.0, "step": 440 }, { "entropy": 2.4686279296875, "epoch": 0.007592779112109708, "grad_norm": 0.5435473322868347, "learning_rate": 4.4e-06, "loss": 2.4623, "mean_token_accuracy": 0.47148632165044546, "num_tokens": 63969367.0, "step": 441 }, { "entropy": 2.380859375, "epoch": 0.007609996298304968, "grad_norm": 0.6280810832977295, "learning_rate": 4.41e-06, "loss": 2.3327, "mean_token_accuracy": 0.4927673670463264, "num_tokens": 64106475.0, "step": 442 }, { "entropy": 2.431884765625, "epoch": 0.007627213484500228, "grad_norm": 0.60051429271698, "learning_rate": 4.42e-06, "loss": 2.3611, "mean_token_accuracy": 0.48384140338748693, "num_tokens": 64240694.0, "step": 443 }, { "entropy": 2.482421875, "epoch": 0.0076444306706954885, "grad_norm": 0.5727265477180481, "learning_rate": 4.430000000000001e-06, "loss": 2.4215, "mean_token_accuracy": 0.4743316164240241, "num_tokens": 64379463.0, "step": 444 }, { "entropy": 2.4876708984375, "epoch": 0.007661647856890748, "grad_norm": 0.6397287845611572, "learning_rate": 4.440000000000001e-06, "loss": 2.4762, "mean_token_accuracy": 0.47825985960662365, "num_tokens": 64533020.0, "step": 445 }, { "entropy": 2.498046875, "epoch": 0.007678865043086009, "grad_norm": 0.5948851704597473, "learning_rate": 4.450000000000001e-06, "loss": 2.4374, "mean_token_accuracy": 0.4723509643226862, "num_tokens": 64669487.0, "step": 446 }, { "entropy": 2.45556640625, "epoch": 0.007696082229281268, "grad_norm": 0.5741977691650391, "learning_rate": 4.4600000000000005e-06, "loss": 2.4201, "mean_token_accuracy": 0.4786538486368954, "num_tokens": 64821796.0, "step": 447 }, { "entropy": 2.421875, "epoch": 0.007713299415476529, "grad_norm": 0.5873377323150635, "learning_rate": 4.47e-06, "loss": 2.3954, "mean_token_accuracy": 0.48342282278463244, "num_tokens": 64959422.0, "step": 448 }, { "entropy": 2.4324951171875, "epoch": 0.0077305166016717886, "grad_norm": 0.6045364737510681, "learning_rate": 4.48e-06, "loss": 2.378, "mean_token_accuracy": 0.48433305928483605, "num_tokens": 65103972.0, "step": 449 }, { "entropy": 2.5223388671875, "epoch": 0.007747733787867049, "grad_norm": 0.5803834795951843, "learning_rate": 4.49e-06, "loss": 2.4753, "mean_token_accuracy": 0.469526968896389, "num_tokens": 65240634.0, "step": 450 }, { "entropy": 2.517822265625, "epoch": 0.007764950974062309, "grad_norm": 0.5813508033752441, "learning_rate": 4.5e-06, "loss": 2.5203, "mean_token_accuracy": 0.472479164134711, "num_tokens": 65396335.0, "step": 451 }, { "entropy": 2.4752197265625, "epoch": 0.007782168160257569, "grad_norm": 0.8954483270645142, "learning_rate": 4.510000000000001e-06, "loss": 2.428, "mean_token_accuracy": 0.4781260257586837, "num_tokens": 65551212.0, "step": 452 }, { "entropy": 2.38916015625, "epoch": 0.007799385346452829, "grad_norm": 0.5381391644477844, "learning_rate": 4.520000000000001e-06, "loss": 2.3428, "mean_token_accuracy": 0.4869826496578753, "num_tokens": 65706488.0, "step": 453 }, { "entropy": 2.3837890625, "epoch": 0.00781660253264809, "grad_norm": 0.6232318878173828, "learning_rate": 4.530000000000001e-06, "loss": 2.3453, "mean_token_accuracy": 0.4916538861580193, "num_tokens": 65848771.0, "step": 454 }, { "entropy": 2.39892578125, "epoch": 0.00783381971884335, "grad_norm": 0.5655352473258972, "learning_rate": 4.540000000000001e-06, "loss": 2.3439, "mean_token_accuracy": 0.48888020450249314, "num_tokens": 66002377.0, "step": 455 }, { "entropy": 2.50732421875, "epoch": 0.007851036905038609, "grad_norm": 0.5934751033782959, "learning_rate": 4.5500000000000005e-06, "loss": 2.452, "mean_token_accuracy": 0.4774050717242062, "num_tokens": 66140992.0, "step": 456 }, { "entropy": 2.486572265625, "epoch": 0.00786825409123387, "grad_norm": 0.5821974873542786, "learning_rate": 4.56e-06, "loss": 2.4434, "mean_token_accuracy": 0.47727885795757174, "num_tokens": 66283307.0, "step": 457 }, { "entropy": 2.4403076171875, "epoch": 0.00788547127742913, "grad_norm": 0.6021990180015564, "learning_rate": 4.57e-06, "loss": 2.3891, "mean_token_accuracy": 0.48405969655141234, "num_tokens": 66430065.0, "step": 458 }, { "entropy": 2.4219970703125, "epoch": 0.00790268846362439, "grad_norm": 0.5609884858131409, "learning_rate": 4.58e-06, "loss": 2.3585, "mean_token_accuracy": 0.4830049378797412, "num_tokens": 66580625.0, "step": 459 }, { "entropy": 2.4365234375, "epoch": 0.007919905649819649, "grad_norm": 0.6351988911628723, "learning_rate": 4.590000000000001e-06, "loss": 2.3914, "mean_token_accuracy": 0.4825670407153666, "num_tokens": 66710819.0, "step": 460 }, { "entropy": 2.388916015625, "epoch": 0.00793712283601491, "grad_norm": 0.5797377824783325, "learning_rate": 4.600000000000001e-06, "loss": 2.3401, "mean_token_accuracy": 0.49362843204289675, "num_tokens": 66855860.0, "step": 461 }, { "entropy": 2.502685546875, "epoch": 0.00795434002221017, "grad_norm": 0.5734979510307312, "learning_rate": 4.610000000000001e-06, "loss": 2.475, "mean_token_accuracy": 0.468968971632421, "num_tokens": 66991905.0, "step": 462 }, { "entropy": 2.4248046875, "epoch": 0.00797155720840543, "grad_norm": 0.5948538184165955, "learning_rate": 4.620000000000001e-06, "loss": 2.3821, "mean_token_accuracy": 0.49015933787450194, "num_tokens": 67140600.0, "step": 463 }, { "entropy": 2.4041748046875, "epoch": 0.007988774394600691, "grad_norm": 0.5927978754043579, "learning_rate": 4.6300000000000006e-06, "loss": 2.3563, "mean_token_accuracy": 0.4928512079641223, "num_tokens": 67296282.0, "step": 464 }, { "entropy": 2.44189453125, "epoch": 0.00800599158079595, "grad_norm": 0.5638139247894287, "learning_rate": 4.6400000000000005e-06, "loss": 2.4232, "mean_token_accuracy": 0.47663756739348173, "num_tokens": 67445364.0, "step": 465 }, { "entropy": 2.440673828125, "epoch": 0.00802320876699121, "grad_norm": 0.6092103123664856, "learning_rate": 4.65e-06, "loss": 2.3328, "mean_token_accuracy": 0.487199897877872, "num_tokens": 67574552.0, "step": 466 }, { "entropy": 2.431396484375, "epoch": 0.008040425953186471, "grad_norm": 0.5823683738708496, "learning_rate": 4.66e-06, "loss": 2.3507, "mean_token_accuracy": 0.4836606332100928, "num_tokens": 67715593.0, "step": 467 }, { "entropy": 2.4288330078125, "epoch": 0.008057643139381732, "grad_norm": 0.571380615234375, "learning_rate": 4.670000000000001e-06, "loss": 2.376, "mean_token_accuracy": 0.4848336656577885, "num_tokens": 67851959.0, "step": 468 }, { "entropy": 2.4405517578125, "epoch": 0.00807486032557699, "grad_norm": 0.5497409105300903, "learning_rate": 4.680000000000001e-06, "loss": 2.3331, "mean_token_accuracy": 0.49043108662590384, "num_tokens": 67999315.0, "step": 469 }, { "entropy": 2.4580078125, "epoch": 0.008092077511772251, "grad_norm": 0.6131438612937927, "learning_rate": 4.69e-06, "loss": 2.4471, "mean_token_accuracy": 0.48354413686320186, "num_tokens": 68151679.0, "step": 470 }, { "entropy": 2.45361328125, "epoch": 0.008109294697967511, "grad_norm": 0.5572087168693542, "learning_rate": 4.7e-06, "loss": 2.408, "mean_token_accuracy": 0.4743777387775481, "num_tokens": 68303962.0, "step": 471 }, { "entropy": 2.4327392578125, "epoch": 0.008126511884162772, "grad_norm": 0.6229270696640015, "learning_rate": 4.71e-06, "loss": 2.4372, "mean_token_accuracy": 0.4852800811640918, "num_tokens": 68435424.0, "step": 472 }, { "entropy": 2.44580078125, "epoch": 0.00814372907035803, "grad_norm": 0.5797957181930542, "learning_rate": 4.7200000000000005e-06, "loss": 2.4088, "mean_token_accuracy": 0.48404247080907226, "num_tokens": 68579820.0, "step": 473 }, { "entropy": 2.49560546875, "epoch": 0.008160946256553291, "grad_norm": 0.5527267456054688, "learning_rate": 4.7300000000000005e-06, "loss": 2.3781, "mean_token_accuracy": 0.47309215646237135, "num_tokens": 68720632.0, "step": 474 }, { "entropy": 2.4736328125, "epoch": 0.008178163442748552, "grad_norm": 0.6314601302146912, "learning_rate": 4.74e-06, "loss": 2.4292, "mean_token_accuracy": 0.47931129625067115, "num_tokens": 68847385.0, "step": 475 }, { "entropy": 2.43603515625, "epoch": 0.008195380628943812, "grad_norm": 0.5509300827980042, "learning_rate": 4.75e-06, "loss": 2.4099, "mean_token_accuracy": 0.4842258528806269, "num_tokens": 68997389.0, "step": 476 }, { "entropy": 2.41748046875, "epoch": 0.008212597815139071, "grad_norm": 0.5300650000572205, "learning_rate": 4.76e-06, "loss": 2.3382, "mean_token_accuracy": 0.48574389703571796, "num_tokens": 69155404.0, "step": 477 }, { "entropy": 2.4625244140625, "epoch": 0.008229815001334332, "grad_norm": 0.6269882321357727, "learning_rate": 4.77e-06, "loss": 2.3776, "mean_token_accuracy": 0.4838248719461262, "num_tokens": 69290738.0, "step": 478 }, { "entropy": 2.437255859375, "epoch": 0.008247032187529592, "grad_norm": 0.6751134395599365, "learning_rate": 4.78e-06, "loss": 2.4007, "mean_token_accuracy": 0.48909331811591983, "num_tokens": 69408062.0, "step": 479 }, { "entropy": 2.4910888671875, "epoch": 0.008264249373724853, "grad_norm": 0.596396267414093, "learning_rate": 4.79e-06, "loss": 2.4679, "mean_token_accuracy": 0.47261510556563735, "num_tokens": 69539526.0, "step": 480 }, { "entropy": 2.4178466796875, "epoch": 0.008281466559920111, "grad_norm": 0.6338604688644409, "learning_rate": 4.800000000000001e-06, "loss": 2.3809, "mean_token_accuracy": 0.4872106113471091, "num_tokens": 69691415.0, "step": 481 }, { "entropy": 2.419189453125, "epoch": 0.008298683746115372, "grad_norm": 0.5384358167648315, "learning_rate": 4.8100000000000005e-06, "loss": 2.3549, "mean_token_accuracy": 0.49019210133701563, "num_tokens": 69848010.0, "step": 482 }, { "entropy": 2.4853515625, "epoch": 0.008315900932310633, "grad_norm": 0.5779227614402771, "learning_rate": 4.8200000000000004e-06, "loss": 2.4459, "mean_token_accuracy": 0.4744481286033988, "num_tokens": 69997662.0, "step": 483 }, { "entropy": 2.41064453125, "epoch": 0.008333118118505893, "grad_norm": 0.5691106915473938, "learning_rate": 4.83e-06, "loss": 2.3619, "mean_token_accuracy": 0.4926013760268688, "num_tokens": 70147997.0, "step": 484 }, { "entropy": 2.382080078125, "epoch": 0.008350335304701152, "grad_norm": 0.5725884437561035, "learning_rate": 4.84e-06, "loss": 2.3575, "mean_token_accuracy": 0.489425728097558, "num_tokens": 70294184.0, "step": 485 }, { "entropy": 2.47314453125, "epoch": 0.008367552490896412, "grad_norm": 0.5658232569694519, "learning_rate": 4.85e-06, "loss": 2.426, "mean_token_accuracy": 0.4749414478428662, "num_tokens": 70442248.0, "step": 486 }, { "entropy": 2.483642578125, "epoch": 0.008384769677091673, "grad_norm": 0.6129222512245178, "learning_rate": 4.86e-06, "loss": 2.4917, "mean_token_accuracy": 0.4728327440097928, "num_tokens": 70579740.0, "step": 487 }, { "entropy": 2.4320068359375, "epoch": 0.008401986863286933, "grad_norm": 0.9568486213684082, "learning_rate": 4.87e-06, "loss": 2.3935, "mean_token_accuracy": 0.4829396088607609, "num_tokens": 70721524.0, "step": 488 }, { "entropy": 2.459228515625, "epoch": 0.008419204049482194, "grad_norm": 0.544103741645813, "learning_rate": 4.880000000000001e-06, "loss": 2.4176, "mean_token_accuracy": 0.476782136131078, "num_tokens": 70877087.0, "step": 489 }, { "entropy": 2.405517578125, "epoch": 0.008436421235677453, "grad_norm": 0.5833263397216797, "learning_rate": 4.890000000000001e-06, "loss": 2.3254, "mean_token_accuracy": 0.4887930774129927, "num_tokens": 71009717.0, "step": 490 }, { "entropy": 2.45263671875, "epoch": 0.008453638421872713, "grad_norm": 0.5471597909927368, "learning_rate": 4.9000000000000005e-06, "loss": 2.4063, "mean_token_accuracy": 0.48418434290215373, "num_tokens": 71166661.0, "step": 491 }, { "entropy": 2.4708251953125, "epoch": 0.008470855608067974, "grad_norm": 0.5708027482032776, "learning_rate": 4.9100000000000004e-06, "loss": 2.3853, "mean_token_accuracy": 0.4815034563653171, "num_tokens": 71313337.0, "step": 492 }, { "entropy": 2.4544677734375, "epoch": 0.008488072794263234, "grad_norm": 0.5583227872848511, "learning_rate": 4.92e-06, "loss": 2.4402, "mean_token_accuracy": 0.47839572792872787, "num_tokens": 71472447.0, "step": 493 }, { "entropy": 2.5404052734375, "epoch": 0.008505289980458493, "grad_norm": 0.578138530254364, "learning_rate": 4.93e-06, "loss": 2.4715, "mean_token_accuracy": 0.47347771003842354, "num_tokens": 71615228.0, "step": 494 }, { "entropy": 2.3878173828125, "epoch": 0.008522507166653754, "grad_norm": 0.5488659143447876, "learning_rate": 4.94e-06, "loss": 2.3633, "mean_token_accuracy": 0.49038203712552786, "num_tokens": 71773695.0, "step": 495 }, { "entropy": 2.4010009765625, "epoch": 0.008539724352849014, "grad_norm": 0.61408531665802, "learning_rate": 4.95e-06, "loss": 2.3474, "mean_token_accuracy": 0.48974387207999825, "num_tokens": 71924349.0, "step": 496 }, { "entropy": 2.4000244140625, "epoch": 0.008556941539044275, "grad_norm": 0.5874980688095093, "learning_rate": 4.960000000000001e-06, "loss": 2.3611, "mean_token_accuracy": 0.49134851479902864, "num_tokens": 72064362.0, "step": 497 }, { "entropy": 2.412353515625, "epoch": 0.008574158725239533, "grad_norm": 0.6588267087936401, "learning_rate": 4.970000000000001e-06, "loss": 2.3615, "mean_token_accuracy": 0.48920353641733527, "num_tokens": 72195808.0, "step": 498 }, { "entropy": 2.4034423828125, "epoch": 0.008591375911434794, "grad_norm": 0.5799132585525513, "learning_rate": 4.980000000000001e-06, "loss": 2.3648, "mean_token_accuracy": 0.4861590703949332, "num_tokens": 72329610.0, "step": 499 }, { "entropy": 2.4595947265625, "epoch": 0.008608593097630055, "grad_norm": 0.5722876787185669, "learning_rate": 4.9900000000000005e-06, "loss": 2.4214, "mean_token_accuracy": 0.47663868917152286, "num_tokens": 72472581.0, "step": 500 }, { "entropy": 2.491943359375, "epoch": 0.008625810283825315, "grad_norm": 0.5580865740776062, "learning_rate": 5e-06, "loss": 2.4629, "mean_token_accuracy": 0.4767118990421295, "num_tokens": 72616609.0, "step": 501 }, { "entropy": 2.3966064453125, "epoch": 0.008643027470020574, "grad_norm": 0.5624485015869141, "learning_rate": 5.01e-06, "loss": 2.3162, "mean_token_accuracy": 0.4926113812252879, "num_tokens": 72772528.0, "step": 502 }, { "entropy": 2.401611328125, "epoch": 0.008660244656215834, "grad_norm": 0.5372104644775391, "learning_rate": 5.02e-06, "loss": 2.3705, "mean_token_accuracy": 0.4896852090023458, "num_tokens": 72936711.0, "step": 503 }, { "entropy": 2.5235595703125, "epoch": 0.008677461842411095, "grad_norm": 0.6084150075912476, "learning_rate": 5.03e-06, "loss": 2.4556, "mean_token_accuracy": 0.47214356577023864, "num_tokens": 73065595.0, "step": 504 }, { "entropy": 2.3858642578125, "epoch": 0.008694679028606355, "grad_norm": 0.555059015750885, "learning_rate": 5.04e-06, "loss": 2.3394, "mean_token_accuracy": 0.4875369444489479, "num_tokens": 73214645.0, "step": 505 }, { "entropy": 2.473876953125, "epoch": 0.008711896214801614, "grad_norm": 0.5269579887390137, "learning_rate": 5.050000000000001e-06, "loss": 2.3993, "mean_token_accuracy": 0.4812088548205793, "num_tokens": 73376592.0, "step": 506 }, { "entropy": 2.408935546875, "epoch": 0.008729113400996875, "grad_norm": 0.5673913955688477, "learning_rate": 5.060000000000001e-06, "loss": 2.3398, "mean_token_accuracy": 0.4860687367618084, "num_tokens": 73524487.0, "step": 507 }, { "entropy": 2.4530029296875, "epoch": 0.008746330587192135, "grad_norm": 0.6308383345603943, "learning_rate": 5.070000000000001e-06, "loss": 2.4261, "mean_token_accuracy": 0.4800183614715934, "num_tokens": 73662683.0, "step": 508 }, { "entropy": 2.451171875, "epoch": 0.008763547773387396, "grad_norm": 0.5882863998413086, "learning_rate": 5.0800000000000005e-06, "loss": 2.3984, "mean_token_accuracy": 0.48627386754378676, "num_tokens": 73798089.0, "step": 509 }, { "entropy": 2.4722900390625, "epoch": 0.008780764959582655, "grad_norm": 0.5577517747879028, "learning_rate": 5.09e-06, "loss": 2.4251, "mean_token_accuracy": 0.472780239302665, "num_tokens": 73951527.0, "step": 510 }, { "entropy": 2.439208984375, "epoch": 0.008797982145777915, "grad_norm": 0.5598065257072449, "learning_rate": 5.1e-06, "loss": 2.3522, "mean_token_accuracy": 0.4847842915914953, "num_tokens": 74106602.0, "step": 511 }, { "entropy": 2.4395751953125, "epoch": 0.008815199331973176, "grad_norm": 0.6455389857292175, "learning_rate": 5.11e-06, "loss": 2.4265, "mean_token_accuracy": 0.47678500413894653, "num_tokens": 74248860.0, "step": 512 }, { "entropy": 2.429443359375, "epoch": 0.008832416518168436, "grad_norm": 0.5287806391716003, "learning_rate": 5.12e-06, "loss": 2.3812, "mean_token_accuracy": 0.4817400835454464, "num_tokens": 74406358.0, "step": 513 }, { "entropy": 2.409423828125, "epoch": 0.008849633704363697, "grad_norm": 0.5710276961326599, "learning_rate": 5.130000000000001e-06, "loss": 2.3643, "mean_token_accuracy": 0.4914928264915943, "num_tokens": 74556098.0, "step": 514 }, { "entropy": 2.4224853515625, "epoch": 0.008866850890558955, "grad_norm": 0.5766054391860962, "learning_rate": 5.140000000000001e-06, "loss": 2.372, "mean_token_accuracy": 0.48493917658925056, "num_tokens": 74692650.0, "step": 515 }, { "entropy": 2.424072265625, "epoch": 0.008884068076754216, "grad_norm": 0.6063751578330994, "learning_rate": 5.150000000000001e-06, "loss": 2.3885, "mean_token_accuracy": 0.48274309001863003, "num_tokens": 74834650.0, "step": 516 }, { "entropy": 2.4222412109375, "epoch": 0.008901285262949477, "grad_norm": 0.5876448750495911, "learning_rate": 5.1600000000000006e-06, "loss": 2.3979, "mean_token_accuracy": 0.48454813193529844, "num_tokens": 74975623.0, "step": 517 }, { "entropy": 2.4189453125, "epoch": 0.008918502449144737, "grad_norm": 0.5868141055107117, "learning_rate": 5.1700000000000005e-06, "loss": 2.3741, "mean_token_accuracy": 0.48601944325491786, "num_tokens": 75124642.0, "step": 518 }, { "entropy": 2.47705078125, "epoch": 0.008935719635339996, "grad_norm": 0.5537886023521423, "learning_rate": 5.18e-06, "loss": 2.4012, "mean_token_accuracy": 0.47799683222547174, "num_tokens": 75268718.0, "step": 519 }, { "entropy": 2.400390625, "epoch": 0.008952936821535256, "grad_norm": 0.573029637336731, "learning_rate": 5.19e-06, "loss": 2.3484, "mean_token_accuracy": 0.48853049660101533, "num_tokens": 75407285.0, "step": 520 }, { "entropy": 2.458251953125, "epoch": 0.008970154007730517, "grad_norm": 0.6153673529624939, "learning_rate": 5.2e-06, "loss": 2.4012, "mean_token_accuracy": 0.4795549106784165, "num_tokens": 75544196.0, "step": 521 }, { "entropy": 2.474853515625, "epoch": 0.008987371193925777, "grad_norm": 0.6100261807441711, "learning_rate": 5.210000000000001e-06, "loss": 2.4373, "mean_token_accuracy": 0.47731659887358546, "num_tokens": 75675964.0, "step": 522 }, { "entropy": 2.486328125, "epoch": 0.009004588380121036, "grad_norm": 0.5929141044616699, "learning_rate": 5.220000000000001e-06, "loss": 2.4467, "mean_token_accuracy": 0.4785361369140446, "num_tokens": 75820140.0, "step": 523 }, { "entropy": 2.472900390625, "epoch": 0.009021805566316297, "grad_norm": 0.5526888370513916, "learning_rate": 5.230000000000001e-06, "loss": 2.393, "mean_token_accuracy": 0.4825805053114891, "num_tokens": 75975261.0, "step": 524 }, { "entropy": 2.382080078125, "epoch": 0.009039022752511557, "grad_norm": 0.5640392303466797, "learning_rate": 5.240000000000001e-06, "loss": 2.3364, "mean_token_accuracy": 0.48861693032085896, "num_tokens": 76121140.0, "step": 525 }, { "entropy": 2.4207763671875, "epoch": 0.009056239938706818, "grad_norm": 0.626315712928772, "learning_rate": 5.2500000000000006e-06, "loss": 2.3691, "mean_token_accuracy": 0.4876149254851043, "num_tokens": 76249826.0, "step": 526 }, { "entropy": 2.468505859375, "epoch": 0.009073457124902077, "grad_norm": 0.5884665846824646, "learning_rate": 5.2600000000000005e-06, "loss": 2.4473, "mean_token_accuracy": 0.479018023237586, "num_tokens": 76382197.0, "step": 527 }, { "entropy": 2.551513671875, "epoch": 0.009090674311097337, "grad_norm": 0.6032463908195496, "learning_rate": 5.27e-06, "loss": 2.5197, "mean_token_accuracy": 0.4695024830289185, "num_tokens": 76523850.0, "step": 528 }, { "entropy": 2.5267333984375, "epoch": 0.009107891497292598, "grad_norm": 0.5680440068244934, "learning_rate": 5.28e-06, "loss": 2.4922, "mean_token_accuracy": 0.4694391922093928, "num_tokens": 76675341.0, "step": 529 }, { "entropy": 2.45654296875, "epoch": 0.009125108683487858, "grad_norm": 0.5584273338317871, "learning_rate": 5.290000000000001e-06, "loss": 2.4558, "mean_token_accuracy": 0.4753922396339476, "num_tokens": 76839148.0, "step": 530 }, { "entropy": 2.447998046875, "epoch": 0.009142325869683117, "grad_norm": 0.5700913667678833, "learning_rate": 5.300000000000001e-06, "loss": 2.3795, "mean_token_accuracy": 0.4812642466276884, "num_tokens": 76972890.0, "step": 531 }, { "entropy": 2.39208984375, "epoch": 0.009159543055878377, "grad_norm": 0.5814207196235657, "learning_rate": 5.310000000000001e-06, "loss": 2.3494, "mean_token_accuracy": 0.4927141284570098, "num_tokens": 77116305.0, "step": 532 }, { "entropy": 2.5166015625, "epoch": 0.009176760242073638, "grad_norm": 0.6363142728805542, "learning_rate": 5.320000000000001e-06, "loss": 2.4734, "mean_token_accuracy": 0.46889354614540935, "num_tokens": 77241586.0, "step": 533 }, { "entropy": 2.561279296875, "epoch": 0.009193977428268899, "grad_norm": 0.5870820879936218, "learning_rate": 5.330000000000001e-06, "loss": 2.4981, "mean_token_accuracy": 0.4720300040207803, "num_tokens": 77382449.0, "step": 534 }, { "entropy": 2.4193115234375, "epoch": 0.009211194614464157, "grad_norm": 0.5664909482002258, "learning_rate": 5.3400000000000005e-06, "loss": 2.3611, "mean_token_accuracy": 0.48326091514900327, "num_tokens": 77519348.0, "step": 535 }, { "entropy": 2.3780517578125, "epoch": 0.009228411800659418, "grad_norm": 0.677797794342041, "learning_rate": 5.3500000000000004e-06, "loss": 2.3674, "mean_token_accuracy": 0.49126658914610744, "num_tokens": 77672043.0, "step": 536 }, { "entropy": 2.447021484375, "epoch": 0.009245628986854678, "grad_norm": 0.5524245500564575, "learning_rate": 5.36e-06, "loss": 2.4252, "mean_token_accuracy": 0.47922995453700423, "num_tokens": 77815854.0, "step": 537 }, { "entropy": 2.3614501953125, "epoch": 0.009262846173049939, "grad_norm": 0.5834474563598633, "learning_rate": 5.370000000000001e-06, "loss": 2.3147, "mean_token_accuracy": 0.4956310335546732, "num_tokens": 77971110.0, "step": 538 }, { "entropy": 2.4951171875, "epoch": 0.0092800633592452, "grad_norm": 0.5798099040985107, "learning_rate": 5.380000000000001e-06, "loss": 2.4241, "mean_token_accuracy": 0.4767613257281482, "num_tokens": 78111794.0, "step": 539 }, { "entropy": 2.502197265625, "epoch": 0.009297280545440458, "grad_norm": 0.6156012415885925, "learning_rate": 5.390000000000001e-06, "loss": 2.4368, "mean_token_accuracy": 0.4805396613664925, "num_tokens": 78259210.0, "step": 540 }, { "entropy": 2.4141845703125, "epoch": 0.009314497731635719, "grad_norm": 0.5888620615005493, "learning_rate": 5.400000000000001e-06, "loss": 2.3621, "mean_token_accuracy": 0.4862367841415107, "num_tokens": 78400181.0, "step": 541 }, { "entropy": 2.41845703125, "epoch": 0.00933171491783098, "grad_norm": 0.5387586355209351, "learning_rate": 5.410000000000001e-06, "loss": 2.3808, "mean_token_accuracy": 0.48267924739047885, "num_tokens": 78558105.0, "step": 542 }, { "entropy": 2.4661865234375, "epoch": 0.00934893210402624, "grad_norm": 0.5678828954696655, "learning_rate": 5.420000000000001e-06, "loss": 2.4037, "mean_token_accuracy": 0.4831194179132581, "num_tokens": 78706162.0, "step": 543 }, { "entropy": 2.3319091796875, "epoch": 0.009366149290221499, "grad_norm": 0.5745298862457275, "learning_rate": 5.4300000000000005e-06, "loss": 2.2771, "mean_token_accuracy": 0.5001829275861382, "num_tokens": 78850606.0, "step": 544 }, { "entropy": 2.492919921875, "epoch": 0.009383366476416759, "grad_norm": 0.5902231931686401, "learning_rate": 5.4400000000000004e-06, "loss": 2.4254, "mean_token_accuracy": 0.4752844786271453, "num_tokens": 78990742.0, "step": 545 }, { "entropy": 2.5008544921875, "epoch": 0.00940058366261202, "grad_norm": 0.6180260181427002, "learning_rate": 5.450000000000001e-06, "loss": 2.5212, "mean_token_accuracy": 0.4779182830825448, "num_tokens": 79147975.0, "step": 546 }, { "entropy": 2.451171875, "epoch": 0.00941780084880728, "grad_norm": 0.571796715259552, "learning_rate": 5.460000000000001e-06, "loss": 2.3884, "mean_token_accuracy": 0.47992962412536144, "num_tokens": 79295374.0, "step": 547 }, { "entropy": 2.451416015625, "epoch": 0.009435018035002539, "grad_norm": 0.6627357006072998, "learning_rate": 5.470000000000001e-06, "loss": 2.4386, "mean_token_accuracy": 0.4789085192605853, "num_tokens": 79435365.0, "step": 548 }, { "entropy": 2.44091796875, "epoch": 0.0094522352211978, "grad_norm": 0.5701068639755249, "learning_rate": 5.480000000000001e-06, "loss": 2.3711, "mean_token_accuracy": 0.48519957158714533, "num_tokens": 79583578.0, "step": 549 }, { "entropy": 2.4254150390625, "epoch": 0.00946945240739306, "grad_norm": 0.5793823599815369, "learning_rate": 5.490000000000001e-06, "loss": 2.3977, "mean_token_accuracy": 0.4862471236847341, "num_tokens": 79731666.0, "step": 550 }, { "entropy": 2.4296875, "epoch": 0.00948666959358832, "grad_norm": 0.538820743560791, "learning_rate": 5.500000000000001e-06, "loss": 2.3655, "mean_token_accuracy": 0.487091563642025, "num_tokens": 79879814.0, "step": 551 }, { "entropy": 2.5390625, "epoch": 0.00950388677978358, "grad_norm": 0.5948156714439392, "learning_rate": 5.510000000000001e-06, "loss": 2.4975, "mean_token_accuracy": 0.46823509922251105, "num_tokens": 80021770.0, "step": 552 }, { "entropy": 2.455322265625, "epoch": 0.00952110396597884, "grad_norm": 0.5883845686912537, "learning_rate": 5.5200000000000005e-06, "loss": 2.3802, "mean_token_accuracy": 0.47493234230205417, "num_tokens": 80165630.0, "step": 553 }, { "entropy": 2.474365234375, "epoch": 0.0095383211521741, "grad_norm": 0.6061490774154663, "learning_rate": 5.530000000000001e-06, "loss": 2.4588, "mean_token_accuracy": 0.4762619035318494, "num_tokens": 80320803.0, "step": 554 }, { "entropy": 2.3839111328125, "epoch": 0.009555538338369361, "grad_norm": 0.5892252922058105, "learning_rate": 5.540000000000001e-06, "loss": 2.3663, "mean_token_accuracy": 0.48988539073616266, "num_tokens": 80455211.0, "step": 555 }, { "entropy": 2.3966064453125, "epoch": 0.00957275552456462, "grad_norm": 0.5676023960113525, "learning_rate": 5.550000000000001e-06, "loss": 2.3757, "mean_token_accuracy": 0.4864460411481559, "num_tokens": 80600140.0, "step": 556 }, { "entropy": 2.411865234375, "epoch": 0.00958997271075988, "grad_norm": 0.6370622515678406, "learning_rate": 5.560000000000001e-06, "loss": 2.4048, "mean_token_accuracy": 0.4831241965293884, "num_tokens": 80733228.0, "step": 557 }, { "entropy": 2.453369140625, "epoch": 0.00960718989695514, "grad_norm": 0.6127820611000061, "learning_rate": 5.570000000000001e-06, "loss": 2.4203, "mean_token_accuracy": 0.48309980193153024, "num_tokens": 80870174.0, "step": 558 }, { "entropy": 2.4246826171875, "epoch": 0.009624407083150401, "grad_norm": 0.6448102593421936, "learning_rate": 5.580000000000001e-06, "loss": 2.424, "mean_token_accuracy": 0.4830983644351363, "num_tokens": 81027247.0, "step": 559 }, { "entropy": 2.4490966796875, "epoch": 0.00964162426934566, "grad_norm": 0.6132236123085022, "learning_rate": 5.590000000000001e-06, "loss": 2.3908, "mean_token_accuracy": 0.48710555862635374, "num_tokens": 81181289.0, "step": 560 }, { "entropy": 2.4490966796875, "epoch": 0.00965884145554092, "grad_norm": 0.5913870334625244, "learning_rate": 5.600000000000001e-06, "loss": 2.4276, "mean_token_accuracy": 0.47844883892685175, "num_tokens": 81317554.0, "step": 561 }, { "entropy": 2.3934326171875, "epoch": 0.009676058641736181, "grad_norm": 0.5637186169624329, "learning_rate": 5.610000000000001e-06, "loss": 2.386, "mean_token_accuracy": 0.49396452866494656, "num_tokens": 81464385.0, "step": 562 }, { "entropy": 2.446044921875, "epoch": 0.009693275827931442, "grad_norm": 0.6086047887802124, "learning_rate": 5.620000000000001e-06, "loss": 2.3744, "mean_token_accuracy": 0.4851002893410623, "num_tokens": 81595954.0, "step": 563 }, { "entropy": 2.514404296875, "epoch": 0.0097104930141267, "grad_norm": 0.5994987487792969, "learning_rate": 5.63e-06, "loss": 2.445, "mean_token_accuracy": 0.47357298992574215, "num_tokens": 81731590.0, "step": 564 }, { "entropy": 2.452392578125, "epoch": 0.009727710200321961, "grad_norm": 0.5923967361450195, "learning_rate": 5.64e-06, "loss": 2.4243, "mean_token_accuracy": 0.48147726710885763, "num_tokens": 81874631.0, "step": 565 }, { "entropy": 2.49462890625, "epoch": 0.009744927386517221, "grad_norm": 0.5803849697113037, "learning_rate": 5.65e-06, "loss": 2.4918, "mean_token_accuracy": 0.4728530729189515, "num_tokens": 82032208.0, "step": 566 }, { "entropy": 2.5015869140625, "epoch": 0.009762144572712482, "grad_norm": 0.5941510200500488, "learning_rate": 5.66e-06, "loss": 2.4491, "mean_token_accuracy": 0.4712064489722252, "num_tokens": 82175645.0, "step": 567 }, { "entropy": 2.453125, "epoch": 0.009779361758907743, "grad_norm": 0.5803316831588745, "learning_rate": 5.67e-06, "loss": 2.407, "mean_token_accuracy": 0.4795740279369056, "num_tokens": 82314242.0, "step": 568 }, { "entropy": 2.4854736328125, "epoch": 0.009796578945103001, "grad_norm": 0.6026585698127747, "learning_rate": 5.68e-06, "loss": 2.4803, "mean_token_accuracy": 0.4731423007324338, "num_tokens": 82441744.0, "step": 569 }, { "entropy": 2.469482421875, "epoch": 0.009813796131298262, "grad_norm": 0.5523576140403748, "learning_rate": 5.69e-06, "loss": 2.4117, "mean_token_accuracy": 0.4742597243748605, "num_tokens": 82586104.0, "step": 570 }, { "entropy": 2.420166015625, "epoch": 0.009831013317493522, "grad_norm": 0.5471956133842468, "learning_rate": 5.7e-06, "loss": 2.3751, "mean_token_accuracy": 0.48100833920761943, "num_tokens": 82736506.0, "step": 571 }, { "entropy": 2.5037841796875, "epoch": 0.009848230503688783, "grad_norm": 0.57561856508255, "learning_rate": 5.71e-06, "loss": 2.4685, "mean_token_accuracy": 0.46707925060763955, "num_tokens": 82889738.0, "step": 572 }, { "entropy": 2.450927734375, "epoch": 0.009865447689884042, "grad_norm": 0.5832479596138, "learning_rate": 5.72e-06, "loss": 2.4341, "mean_token_accuracy": 0.4823454241268337, "num_tokens": 83033413.0, "step": 573 }, { "entropy": 2.460205078125, "epoch": 0.009882664876079302, "grad_norm": 0.546006977558136, "learning_rate": 5.73e-06, "loss": 2.4431, "mean_token_accuracy": 0.4789656042121351, "num_tokens": 83185297.0, "step": 574 }, { "entropy": 2.4942626953125, "epoch": 0.009899882062274563, "grad_norm": 0.5814852118492126, "learning_rate": 5.74e-06, "loss": 2.4655, "mean_token_accuracy": 0.469060396309942, "num_tokens": 83323076.0, "step": 575 }, { "entropy": 2.448486328125, "epoch": 0.009917099248469823, "grad_norm": 0.6251148581504822, "learning_rate": 5.75e-06, "loss": 2.3796, "mean_token_accuracy": 0.48163892794400454, "num_tokens": 83476832.0, "step": 576 }, { "entropy": 2.43505859375, "epoch": 0.009934316434665082, "grad_norm": 0.5571739077568054, "learning_rate": 5.76e-06, "loss": 2.4166, "mean_token_accuracy": 0.47773123253136873, "num_tokens": 83636406.0, "step": 577 }, { "entropy": 2.4129638671875, "epoch": 0.009951533620860343, "grad_norm": 0.5774782299995422, "learning_rate": 5.77e-06, "loss": 2.3531, "mean_token_accuracy": 0.4910357999615371, "num_tokens": 83784239.0, "step": 578 }, { "entropy": 2.395263671875, "epoch": 0.009968750807055603, "grad_norm": 0.5799167156219482, "learning_rate": 5.78e-06, "loss": 2.3157, "mean_token_accuracy": 0.49086295487359166, "num_tokens": 83930135.0, "step": 579 }, { "entropy": 2.43603515625, "epoch": 0.009985967993250864, "grad_norm": 0.9486396312713623, "learning_rate": 5.7900000000000005e-06, "loss": 2.3956, "mean_token_accuracy": 0.4829951082356274, "num_tokens": 84069466.0, "step": 580 }, { "entropy": 2.456298828125, "epoch": 0.010003185179446122, "grad_norm": 0.5900285243988037, "learning_rate": 5.8e-06, "loss": 2.4223, "mean_token_accuracy": 0.47999299177899957, "num_tokens": 84204231.0, "step": 581 }, { "entropy": 2.4512939453125, "epoch": 0.010020402365641383, "grad_norm": 0.612040638923645, "learning_rate": 5.81e-06, "loss": 2.4226, "mean_token_accuracy": 0.4809495103545487, "num_tokens": 84341875.0, "step": 582 }, { "entropy": 2.44140625, "epoch": 0.010037619551836643, "grad_norm": 0.5587046146392822, "learning_rate": 5.82e-06, "loss": 2.3725, "mean_token_accuracy": 0.4807244557887316, "num_tokens": 84495749.0, "step": 583 }, { "entropy": 2.453125, "epoch": 0.010054836738031904, "grad_norm": 0.5574972629547119, "learning_rate": 5.83e-06, "loss": 2.4249, "mean_token_accuracy": 0.47708367416635156, "num_tokens": 84642208.0, "step": 584 }, { "entropy": 2.4827880859375, "epoch": 0.010072053924227163, "grad_norm": 0.5825822353363037, "learning_rate": 5.84e-06, "loss": 2.4421, "mean_token_accuracy": 0.47600942524150014, "num_tokens": 84798422.0, "step": 585 }, { "entropy": 2.4530029296875, "epoch": 0.010089271110422423, "grad_norm": 0.6338469386100769, "learning_rate": 5.85e-06, "loss": 2.3441, "mean_token_accuracy": 0.48570866556838155, "num_tokens": 84932951.0, "step": 586 }, { "entropy": 2.51806640625, "epoch": 0.010106488296617684, "grad_norm": 0.5964832305908203, "learning_rate": 5.86e-06, "loss": 2.4671, "mean_token_accuracy": 0.47985436767339706, "num_tokens": 85070544.0, "step": 587 }, { "entropy": 2.40576171875, "epoch": 0.010123705482812944, "grad_norm": 0.5808175206184387, "learning_rate": 5.8700000000000005e-06, "loss": 2.3715, "mean_token_accuracy": 0.4833093252964318, "num_tokens": 85216000.0, "step": 588 }, { "entropy": 2.3857421875, "epoch": 0.010140922669008203, "grad_norm": 0.5820797681808472, "learning_rate": 5.8800000000000005e-06, "loss": 2.3318, "mean_token_accuracy": 0.4918548730202019, "num_tokens": 85367592.0, "step": 589 }, { "entropy": 2.4697265625, "epoch": 0.010158139855203464, "grad_norm": 0.5501261353492737, "learning_rate": 5.89e-06, "loss": 2.4305, "mean_token_accuracy": 0.47398777306079865, "num_tokens": 85507669.0, "step": 590 }, { "entropy": 2.3985595703125, "epoch": 0.010175357041398724, "grad_norm": 0.5922814607620239, "learning_rate": 5.9e-06, "loss": 2.3907, "mean_token_accuracy": 0.49256287002936006, "num_tokens": 85657583.0, "step": 591 }, { "entropy": 2.4813232421875, "epoch": 0.010192574227593985, "grad_norm": 0.6180179715156555, "learning_rate": 5.91e-06, "loss": 2.4081, "mean_token_accuracy": 0.47511684615164995, "num_tokens": 85796530.0, "step": 592 }, { "entropy": 2.548583984375, "epoch": 0.010209791413789245, "grad_norm": 0.586715817451477, "learning_rate": 5.92e-06, "loss": 2.4952, "mean_token_accuracy": 0.4641613829880953, "num_tokens": 85935315.0, "step": 593 }, { "entropy": 2.479736328125, "epoch": 0.010227008599984504, "grad_norm": 0.5589325428009033, "learning_rate": 5.93e-06, "loss": 2.4362, "mean_token_accuracy": 0.4785456941463053, "num_tokens": 86083670.0, "step": 594 }, { "entropy": 2.485107421875, "epoch": 0.010244225786179765, "grad_norm": 0.5777843594551086, "learning_rate": 5.94e-06, "loss": 2.4487, "mean_token_accuracy": 0.47643798124045134, "num_tokens": 86226759.0, "step": 595 }, { "entropy": 2.4775390625, "epoch": 0.010261442972375025, "grad_norm": 0.5679555535316467, "learning_rate": 5.950000000000001e-06, "loss": 2.4196, "mean_token_accuracy": 0.48259661020711064, "num_tokens": 86374974.0, "step": 596 }, { "entropy": 2.423095703125, "epoch": 0.010278660158570286, "grad_norm": 0.5793547034263611, "learning_rate": 5.9600000000000005e-06, "loss": 2.418, "mean_token_accuracy": 0.48791359877213836, "num_tokens": 86517518.0, "step": 597 }, { "entropy": 2.4588623046875, "epoch": 0.010295877344765544, "grad_norm": 0.6030251979827881, "learning_rate": 5.9700000000000004e-06, "loss": 2.4074, "mean_token_accuracy": 0.48480180045589805, "num_tokens": 86647346.0, "step": 598 }, { "entropy": 2.570068359375, "epoch": 0.010313094530960805, "grad_norm": 0.5821578502655029, "learning_rate": 5.98e-06, "loss": 2.5287, "mean_token_accuracy": 0.4660195717588067, "num_tokens": 86790455.0, "step": 599 }, { "entropy": 2.4483642578125, "epoch": 0.010330311717156065, "grad_norm": 0.5719680786132812, "learning_rate": 5.99e-06, "loss": 2.357, "mean_token_accuracy": 0.48361520702019334, "num_tokens": 86939582.0, "step": 600 }, { "entropy": 2.5018310546875, "epoch": 0.010347528903351326, "grad_norm": 0.6109415888786316, "learning_rate": 6e-06, "loss": 2.4647, "mean_token_accuracy": 0.47726114513352513, "num_tokens": 87097332.0, "step": 601 }, { "entropy": 2.4161376953125, "epoch": 0.010364746089546585, "grad_norm": 0.565535306930542, "learning_rate": 6.01e-06, "loss": 2.3436, "mean_token_accuracy": 0.4876682967878878, "num_tokens": 87246475.0, "step": 602 }, { "entropy": 2.3807373046875, "epoch": 0.010381963275741845, "grad_norm": 0.594390332698822, "learning_rate": 6.02e-06, "loss": 2.3292, "mean_token_accuracy": 0.49000673089176416, "num_tokens": 87402167.0, "step": 603 }, { "entropy": 2.489501953125, "epoch": 0.010399180461937106, "grad_norm": 0.5601812601089478, "learning_rate": 6.030000000000001e-06, "loss": 2.4547, "mean_token_accuracy": 0.470840523019433, "num_tokens": 87546749.0, "step": 604 }, { "entropy": 2.491455078125, "epoch": 0.010416397648132366, "grad_norm": 0.6872953772544861, "learning_rate": 6.040000000000001e-06, "loss": 2.4646, "mean_token_accuracy": 0.47835929365828633, "num_tokens": 87680924.0, "step": 605 }, { "entropy": 2.499267578125, "epoch": 0.010433614834327625, "grad_norm": 0.5995067954063416, "learning_rate": 6.0500000000000005e-06, "loss": 2.4872, "mean_token_accuracy": 0.4722044039517641, "num_tokens": 87818407.0, "step": 606 }, { "entropy": 2.4085693359375, "epoch": 0.010450832020522886, "grad_norm": 0.7130571007728577, "learning_rate": 6.0600000000000004e-06, "loss": 2.3356, "mean_token_accuracy": 0.48175712302327156, "num_tokens": 87964354.0, "step": 607 }, { "entropy": 2.40869140625, "epoch": 0.010468049206718146, "grad_norm": 0.5531951189041138, "learning_rate": 6.07e-06, "loss": 2.396, "mean_token_accuracy": 0.4883636045269668, "num_tokens": 88108908.0, "step": 608 }, { "entropy": 2.4320068359375, "epoch": 0.010485266392913407, "grad_norm": 0.6004809737205505, "learning_rate": 6.08e-06, "loss": 2.3704, "mean_token_accuracy": 0.47937911935150623, "num_tokens": 88252541.0, "step": 609 }, { "entropy": 2.447021484375, "epoch": 0.010502483579108665, "grad_norm": 0.5746989846229553, "learning_rate": 6.09e-06, "loss": 2.3784, "mean_token_accuracy": 0.4842778267338872, "num_tokens": 88387113.0, "step": 610 }, { "entropy": 2.43603515625, "epoch": 0.010519700765303926, "grad_norm": 0.6954113245010376, "learning_rate": 6.1e-06, "loss": 2.4565, "mean_token_accuracy": 0.4831416751258075, "num_tokens": 88538988.0, "step": 611 }, { "entropy": 2.4976806640625, "epoch": 0.010536917951499187, "grad_norm": 0.5616413950920105, "learning_rate": 6.110000000000001e-06, "loss": 2.4658, "mean_token_accuracy": 0.47561208764091134, "num_tokens": 88697407.0, "step": 612 }, { "entropy": 2.39599609375, "epoch": 0.010554135137694447, "grad_norm": 0.5953220129013062, "learning_rate": 6.120000000000001e-06, "loss": 2.3488, "mean_token_accuracy": 0.4882765398360789, "num_tokens": 88843276.0, "step": 613 }, { "entropy": 2.5074462890625, "epoch": 0.010571352323889706, "grad_norm": 0.5962725281715393, "learning_rate": 6.130000000000001e-06, "loss": 2.4542, "mean_token_accuracy": 0.4768844782374799, "num_tokens": 88973725.0, "step": 614 }, { "entropy": 2.416748046875, "epoch": 0.010588569510084966, "grad_norm": 0.5971546769142151, "learning_rate": 6.1400000000000005e-06, "loss": 2.384, "mean_token_accuracy": 0.4932720325887203, "num_tokens": 89112891.0, "step": 615 }, { "entropy": 2.4598388671875, "epoch": 0.010605786696280227, "grad_norm": 0.5331761837005615, "learning_rate": 6.15e-06, "loss": 2.3966, "mean_token_accuracy": 0.4843857139348984, "num_tokens": 89261669.0, "step": 616 }, { "entropy": 2.5048828125, "epoch": 0.010623003882475487, "grad_norm": 0.5298826694488525, "learning_rate": 6.16e-06, "loss": 2.4497, "mean_token_accuracy": 0.47414906043559313, "num_tokens": 89424640.0, "step": 617 }, { "entropy": 2.53369140625, "epoch": 0.010640221068670748, "grad_norm": 0.5918136835098267, "learning_rate": 6.17e-06, "loss": 2.4991, "mean_token_accuracy": 0.4715537903830409, "num_tokens": 89565111.0, "step": 618 }, { "entropy": 2.502685546875, "epoch": 0.010657438254866007, "grad_norm": 0.5514756441116333, "learning_rate": 6.18e-06, "loss": 2.4796, "mean_token_accuracy": 0.47532700607553124, "num_tokens": 89720834.0, "step": 619 }, { "entropy": 2.4874267578125, "epoch": 0.010674655441061267, "grad_norm": 0.6075375080108643, "learning_rate": 6.190000000000001e-06, "loss": 2.434, "mean_token_accuracy": 0.48047855822369456, "num_tokens": 89852579.0, "step": 620 }, { "entropy": 2.4002685546875, "epoch": 0.010691872627256528, "grad_norm": 0.5313543081283569, "learning_rate": 6.200000000000001e-06, "loss": 2.3702, "mean_token_accuracy": 0.4890581676736474, "num_tokens": 90014576.0, "step": 621 }, { "entropy": 2.5064697265625, "epoch": 0.010709089813451788, "grad_norm": 0.5929763317108154, "learning_rate": 6.210000000000001e-06, "loss": 2.4956, "mean_token_accuracy": 0.47517322562634945, "num_tokens": 90141327.0, "step": 622 }, { "entropy": 2.4847412109375, "epoch": 0.010726306999647047, "grad_norm": 0.6306544542312622, "learning_rate": 6.220000000000001e-06, "loss": 2.4664, "mean_token_accuracy": 0.4766456396318972, "num_tokens": 90289805.0, "step": 623 }, { "entropy": 2.434326171875, "epoch": 0.010743524185842308, "grad_norm": 0.6184057593345642, "learning_rate": 6.2300000000000005e-06, "loss": 2.4083, "mean_token_accuracy": 0.48298388347029686, "num_tokens": 90436897.0, "step": 624 }, { "entropy": 2.4671630859375, "epoch": 0.010760741372037568, "grad_norm": 2.9163689613342285, "learning_rate": 6.24e-06, "loss": 2.4082, "mean_token_accuracy": 0.48466121684759855, "num_tokens": 90587723.0, "step": 625 }, { "entropy": 2.4171142578125, "epoch": 0.010777958558232829, "grad_norm": 0.6109045743942261, "learning_rate": 6.25e-06, "loss": 2.3642, "mean_token_accuracy": 0.48781463131308556, "num_tokens": 90722550.0, "step": 626 }, { "entropy": 2.3975830078125, "epoch": 0.010795175744428087, "grad_norm": 0.5503095388412476, "learning_rate": 6.26e-06, "loss": 2.3723, "mean_token_accuracy": 0.484878970310092, "num_tokens": 90883612.0, "step": 627 }, { "entropy": 2.435302734375, "epoch": 0.010812392930623348, "grad_norm": 0.5557761192321777, "learning_rate": 6.27e-06, "loss": 2.4024, "mean_token_accuracy": 0.482346317730844, "num_tokens": 91020885.0, "step": 628 }, { "entropy": 2.454833984375, "epoch": 0.010829610116818609, "grad_norm": 0.5977523326873779, "learning_rate": 6.280000000000001e-06, "loss": 2.3842, "mean_token_accuracy": 0.4845750341191888, "num_tokens": 91183050.0, "step": 629 }, { "entropy": 2.536376953125, "epoch": 0.010846827303013869, "grad_norm": 0.6027834415435791, "learning_rate": 6.290000000000001e-06, "loss": 2.5204, "mean_token_accuracy": 0.46805495163425803, "num_tokens": 91320603.0, "step": 630 }, { "entropy": 2.42431640625, "epoch": 0.010864044489209128, "grad_norm": 0.569492518901825, "learning_rate": 6.300000000000001e-06, "loss": 2.3634, "mean_token_accuracy": 0.484631123021245, "num_tokens": 91466774.0, "step": 631 }, { "entropy": 2.501953125, "epoch": 0.010881261675404388, "grad_norm": 0.6085266470909119, "learning_rate": 6.3100000000000006e-06, "loss": 2.4384, "mean_token_accuracy": 0.47073780838400126, "num_tokens": 91599571.0, "step": 632 }, { "entropy": 2.454833984375, "epoch": 0.010898478861599649, "grad_norm": 0.5897052884101868, "learning_rate": 6.3200000000000005e-06, "loss": 2.3718, "mean_token_accuracy": 0.4813077808357775, "num_tokens": 91736850.0, "step": 633 }, { "entropy": 2.5152587890625, "epoch": 0.01091569604779491, "grad_norm": 0.5574195384979248, "learning_rate": 6.33e-06, "loss": 2.5027, "mean_token_accuracy": 0.4707440370693803, "num_tokens": 91883052.0, "step": 634 }, { "entropy": 2.4981689453125, "epoch": 0.010932913233990168, "grad_norm": 0.5665111541748047, "learning_rate": 6.34e-06, "loss": 2.4967, "mean_token_accuracy": 0.4747963696718216, "num_tokens": 92038624.0, "step": 635 }, { "entropy": 2.5341796875, "epoch": 0.010950130420185429, "grad_norm": 0.6099963784217834, "learning_rate": 6.35e-06, "loss": 2.4812, "mean_token_accuracy": 0.4711012118496001, "num_tokens": 92172927.0, "step": 636 }, { "entropy": 2.4072265625, "epoch": 0.01096734760638069, "grad_norm": 0.5225919485092163, "learning_rate": 6.360000000000001e-06, "loss": 2.3695, "mean_token_accuracy": 0.4833218730054796, "num_tokens": 92340412.0, "step": 637 }, { "entropy": 2.373779296875, "epoch": 0.01098456479257595, "grad_norm": 0.5803242921829224, "learning_rate": 6.370000000000001e-06, "loss": 2.3518, "mean_token_accuracy": 0.4897113349288702, "num_tokens": 92491100.0, "step": 638 }, { "entropy": 2.465576171875, "epoch": 0.011001781978771209, "grad_norm": 0.6239297986030579, "learning_rate": 6.380000000000001e-06, "loss": 2.4732, "mean_token_accuracy": 0.4763565128669143, "num_tokens": 92624388.0, "step": 639 }, { "entropy": 2.521728515625, "epoch": 0.011018999164966469, "grad_norm": 0.6022891998291016, "learning_rate": 6.390000000000001e-06, "loss": 2.5116, "mean_token_accuracy": 0.4692745329812169, "num_tokens": 92749943.0, "step": 640 }, { "entropy": 2.4610595703125, "epoch": 0.01103621635116173, "grad_norm": 0.5882202386856079, "learning_rate": 6.4000000000000006e-06, "loss": 2.4055, "mean_token_accuracy": 0.479529797565192, "num_tokens": 92901869.0, "step": 641 }, { "entropy": 2.42626953125, "epoch": 0.01105343353735699, "grad_norm": 0.6111239790916443, "learning_rate": 6.4100000000000005e-06, "loss": 2.337, "mean_token_accuracy": 0.4874786385335028, "num_tokens": 93049790.0, "step": 642 }, { "entropy": 2.459716796875, "epoch": 0.01107065072355225, "grad_norm": 0.5824291706085205, "learning_rate": 6.42e-06, "loss": 2.3972, "mean_token_accuracy": 0.4820692026987672, "num_tokens": 93190066.0, "step": 643 }, { "entropy": 2.393798828125, "epoch": 0.01108786790974751, "grad_norm": 0.6439885497093201, "learning_rate": 6.43e-06, "loss": 2.2805, "mean_token_accuracy": 0.500021081417799, "num_tokens": 93323041.0, "step": 644 }, { "entropy": 2.5235595703125, "epoch": 0.01110508509594277, "grad_norm": 0.5925695300102234, "learning_rate": 6.440000000000001e-06, "loss": 2.5175, "mean_token_accuracy": 0.46626862324774265, "num_tokens": 93454663.0, "step": 645 }, { "entropy": 2.5662841796875, "epoch": 0.01112230228213803, "grad_norm": 0.5767838954925537, "learning_rate": 6.450000000000001e-06, "loss": 2.5828, "mean_token_accuracy": 0.46684390073642135, "num_tokens": 93600987.0, "step": 646 }, { "entropy": 2.445068359375, "epoch": 0.011139519468333291, "grad_norm": 0.5589511394500732, "learning_rate": 6.460000000000001e-06, "loss": 2.3973, "mean_token_accuracy": 0.48293518042191863, "num_tokens": 93764681.0, "step": 647 }, { "entropy": 2.487548828125, "epoch": 0.01115673665452855, "grad_norm": 0.6247239112854004, "learning_rate": 6.470000000000001e-06, "loss": 2.4299, "mean_token_accuracy": 0.47755016293376684, "num_tokens": 93908163.0, "step": 648 }, { "entropy": 2.44873046875, "epoch": 0.01117395384072381, "grad_norm": 0.5852839350700378, "learning_rate": 6.480000000000001e-06, "loss": 2.3616, "mean_token_accuracy": 0.48453861800953746, "num_tokens": 94044102.0, "step": 649 }, { "entropy": 2.401611328125, "epoch": 0.011191171026919071, "grad_norm": 0.5801386833190918, "learning_rate": 6.4900000000000005e-06, "loss": 2.3961, "mean_token_accuracy": 0.4849433288909495, "num_tokens": 94186933.0, "step": 650 }, { "entropy": 2.474853515625, "epoch": 0.011208388213114331, "grad_norm": 0.5861256718635559, "learning_rate": 6.5000000000000004e-06, "loss": 2.4425, "mean_token_accuracy": 0.47974030720070004, "num_tokens": 94318045.0, "step": 651 }, { "entropy": 2.49755859375, "epoch": 0.01122560539930959, "grad_norm": 0.5657753348350525, "learning_rate": 6.51e-06, "loss": 2.4787, "mean_token_accuracy": 0.4704547990113497, "num_tokens": 94470298.0, "step": 652 }, { "entropy": 2.4617919921875, "epoch": 0.01124282258550485, "grad_norm": 0.5523881912231445, "learning_rate": 6.520000000000001e-06, "loss": 2.4404, "mean_token_accuracy": 0.47263912204653025, "num_tokens": 94617182.0, "step": 653 }, { "entropy": 2.462646484375, "epoch": 0.011260039771700111, "grad_norm": 0.5891516804695129, "learning_rate": 6.530000000000001e-06, "loss": 2.4207, "mean_token_accuracy": 0.4738515946082771, "num_tokens": 94764738.0, "step": 654 }, { "entropy": 2.43310546875, "epoch": 0.011277256957895372, "grad_norm": 0.5879467129707336, "learning_rate": 6.540000000000001e-06, "loss": 2.4553, "mean_token_accuracy": 0.4857351207174361, "num_tokens": 94911680.0, "step": 655 }, { "entropy": 2.3919677734375, "epoch": 0.01129447414409063, "grad_norm": 0.5698356032371521, "learning_rate": 6.550000000000001e-06, "loss": 2.3241, "mean_token_accuracy": 0.49458998907357454, "num_tokens": 95063211.0, "step": 656 }, { "entropy": 2.4443359375, "epoch": 0.011311691330285891, "grad_norm": 0.5569403767585754, "learning_rate": 6.560000000000001e-06, "loss": 2.4115, "mean_token_accuracy": 0.4799956767819822, "num_tokens": 95205223.0, "step": 657 }, { "entropy": 2.4415283203125, "epoch": 0.011328908516481152, "grad_norm": 0.6046442985534668, "learning_rate": 6.570000000000001e-06, "loss": 2.3966, "mean_token_accuracy": 0.4849221669137478, "num_tokens": 95353465.0, "step": 658 }, { "entropy": 2.4443359375, "epoch": 0.011346125702676412, "grad_norm": 0.7441838979721069, "learning_rate": 6.5800000000000005e-06, "loss": 2.4102, "mean_token_accuracy": 0.47888694843277335, "num_tokens": 95519881.0, "step": 659 }, { "entropy": 2.47998046875, "epoch": 0.011363342888871671, "grad_norm": 0.5905322432518005, "learning_rate": 6.5900000000000004e-06, "loss": 2.457, "mean_token_accuracy": 0.47512120427563787, "num_tokens": 95664332.0, "step": 660 }, { "entropy": 2.43408203125, "epoch": 0.011380560075066931, "grad_norm": 0.5880439281463623, "learning_rate": 6.600000000000001e-06, "loss": 2.3549, "mean_token_accuracy": 0.4882641164585948, "num_tokens": 95811290.0, "step": 661 }, { "entropy": 2.44677734375, "epoch": 0.011397777261262192, "grad_norm": 0.6042869091033936, "learning_rate": 6.610000000000001e-06, "loss": 2.4018, "mean_token_accuracy": 0.48763177869841456, "num_tokens": 95950564.0, "step": 662 }, { "entropy": 2.4495849609375, "epoch": 0.011414994447457453, "grad_norm": 0.6228047013282776, "learning_rate": 6.620000000000001e-06, "loss": 2.4301, "mean_token_accuracy": 0.4780231602489948, "num_tokens": 96088537.0, "step": 663 }, { "entropy": 2.398681640625, "epoch": 0.011432211633652711, "grad_norm": 0.6014442443847656, "learning_rate": 6.630000000000001e-06, "loss": 2.4022, "mean_token_accuracy": 0.4849638855084777, "num_tokens": 96222517.0, "step": 664 }, { "entropy": 2.395751953125, "epoch": 0.011449428819847972, "grad_norm": 0.5773961544036865, "learning_rate": 6.640000000000001e-06, "loss": 2.3727, "mean_token_accuracy": 0.48979697469621897, "num_tokens": 96371664.0, "step": 665 }, { "entropy": 2.443603515625, "epoch": 0.011466646006043232, "grad_norm": 0.5553280711174011, "learning_rate": 6.650000000000001e-06, "loss": 2.4178, "mean_token_accuracy": 0.4752015918493271, "num_tokens": 96533336.0, "step": 666 }, { "entropy": 2.4317626953125, "epoch": 0.011483863192238493, "grad_norm": 0.5981423258781433, "learning_rate": 6.660000000000001e-06, "loss": 2.3897, "mean_token_accuracy": 0.48191826045513153, "num_tokens": 96671339.0, "step": 667 }, { "entropy": 2.4635009765625, "epoch": 0.011501080378433753, "grad_norm": 0.5966627597808838, "learning_rate": 6.6700000000000005e-06, "loss": 2.4742, "mean_token_accuracy": 0.4762999969534576, "num_tokens": 96814601.0, "step": 668 }, { "entropy": 2.5224609375, "epoch": 0.011518297564629012, "grad_norm": 0.6274897456169128, "learning_rate": 6.680000000000001e-06, "loss": 2.499, "mean_token_accuracy": 0.4678979804739356, "num_tokens": 96956119.0, "step": 669 }, { "entropy": 2.422607421875, "epoch": 0.011535514750824273, "grad_norm": 0.5479171276092529, "learning_rate": 6.690000000000001e-06, "loss": 2.4028, "mean_token_accuracy": 0.48214570991694927, "num_tokens": 97106907.0, "step": 670 }, { "entropy": 2.450439453125, "epoch": 0.011552731937019533, "grad_norm": 0.5556632876396179, "learning_rate": 6.700000000000001e-06, "loss": 2.4311, "mean_token_accuracy": 0.4770152415148914, "num_tokens": 97251230.0, "step": 671 }, { "entropy": 2.4766845703125, "epoch": 0.011569949123214794, "grad_norm": 0.5444607734680176, "learning_rate": 6.710000000000001e-06, "loss": 2.4384, "mean_token_accuracy": 0.48311482975259423, "num_tokens": 97404310.0, "step": 672 }, { "entropy": 2.4534912109375, "epoch": 0.011587166309410053, "grad_norm": 0.5732424259185791, "learning_rate": 6.720000000000001e-06, "loss": 2.3888, "mean_token_accuracy": 0.48798013804480433, "num_tokens": 97557441.0, "step": 673 }, { "entropy": 2.443603515625, "epoch": 0.011604383495605313, "grad_norm": 0.5952773094177246, "learning_rate": 6.730000000000001e-06, "loss": 2.3956, "mean_token_accuracy": 0.47982571227476, "num_tokens": 97696645.0, "step": 674 }, { "entropy": 2.4781494140625, "epoch": 0.011621600681800574, "grad_norm": 0.6089589595794678, "learning_rate": 6.740000000000001e-06, "loss": 2.3921, "mean_token_accuracy": 0.48180749313905835, "num_tokens": 97836331.0, "step": 675 }, { "entropy": 2.4674072265625, "epoch": 0.011638817867995834, "grad_norm": 0.5993421077728271, "learning_rate": 6.750000000000001e-06, "loss": 2.4598, "mean_token_accuracy": 0.4802471627481282, "num_tokens": 97988494.0, "step": 676 }, { "entropy": 2.42919921875, "epoch": 0.011656035054191093, "grad_norm": 0.5687212944030762, "learning_rate": 6.760000000000001e-06, "loss": 2.382, "mean_token_accuracy": 0.48455388378351927, "num_tokens": 98148161.0, "step": 677 }, { "entropy": 2.43896484375, "epoch": 0.011673252240386353, "grad_norm": 0.5404056906700134, "learning_rate": 6.770000000000001e-06, "loss": 2.378, "mean_token_accuracy": 0.48297660844400525, "num_tokens": 98303865.0, "step": 678 }, { "entropy": 2.41552734375, "epoch": 0.011690469426581614, "grad_norm": 0.6042897701263428, "learning_rate": 6.780000000000001e-06, "loss": 2.3817, "mean_token_accuracy": 0.4834853089414537, "num_tokens": 98442926.0, "step": 679 }, { "entropy": 2.4442138671875, "epoch": 0.011707686612776875, "grad_norm": 0.5709783434867859, "learning_rate": 6.790000000000001e-06, "loss": 2.389, "mean_token_accuracy": 0.4857576950453222, "num_tokens": 98585693.0, "step": 680 }, { "entropy": 2.452392578125, "epoch": 0.011724903798972133, "grad_norm": 0.5298141837120056, "learning_rate": 6.800000000000001e-06, "loss": 2.3874, "mean_token_accuracy": 0.486653549131006, "num_tokens": 98748357.0, "step": 681 }, { "entropy": 2.470703125, "epoch": 0.011742120985167394, "grad_norm": 0.5651002526283264, "learning_rate": 6.810000000000001e-06, "loss": 2.4578, "mean_token_accuracy": 0.4745705393142998, "num_tokens": 98894643.0, "step": 682 }, { "entropy": 2.4425048828125, "epoch": 0.011759338171362654, "grad_norm": 0.5605524182319641, "learning_rate": 6.820000000000001e-06, "loss": 2.4011, "mean_token_accuracy": 0.4801498386077583, "num_tokens": 99057594.0, "step": 683 }, { "entropy": 2.457275390625, "epoch": 0.011776555357557915, "grad_norm": 0.594336748123169, "learning_rate": 6.830000000000001e-06, "loss": 2.4241, "mean_token_accuracy": 0.4828556412830949, "num_tokens": 99191074.0, "step": 684 }, { "entropy": 2.4183349609375, "epoch": 0.011793772543753174, "grad_norm": 0.5907600522041321, "learning_rate": 6.8400000000000014e-06, "loss": 2.3646, "mean_token_accuracy": 0.4930391958914697, "num_tokens": 99328908.0, "step": 685 }, { "entropy": 2.5035400390625, "epoch": 0.011810989729948434, "grad_norm": 0.5865106582641602, "learning_rate": 6.850000000000001e-06, "loss": 2.481, "mean_token_accuracy": 0.47163935378193855, "num_tokens": 99456705.0, "step": 686 }, { "entropy": 2.4417724609375, "epoch": 0.011828206916143695, "grad_norm": 0.677649736404419, "learning_rate": 6.860000000000001e-06, "loss": 2.4363, "mean_token_accuracy": 0.48826620541512966, "num_tokens": 99591855.0, "step": 687 }, { "entropy": 2.4449462890625, "epoch": 0.011845424102338955, "grad_norm": 0.5924775004386902, "learning_rate": 6.870000000000001e-06, "loss": 2.3789, "mean_token_accuracy": 0.48261339543387294, "num_tokens": 99733016.0, "step": 688 }, { "entropy": 2.420654296875, "epoch": 0.011862641288534214, "grad_norm": 0.5597655177116394, "learning_rate": 6.88e-06, "loss": 2.4079, "mean_token_accuracy": 0.4812637008726597, "num_tokens": 99884912.0, "step": 689 }, { "entropy": 2.4306640625, "epoch": 0.011879858474729475, "grad_norm": 0.5902450680732727, "learning_rate": 6.89e-06, "loss": 2.3469, "mean_token_accuracy": 0.4851334313862026, "num_tokens": 100033610.0, "step": 690 }, { "entropy": 2.40673828125, "epoch": 0.011897075660924735, "grad_norm": 0.5616133213043213, "learning_rate": 6.9e-06, "loss": 2.3481, "mean_token_accuracy": 0.4861144246533513, "num_tokens": 100175721.0, "step": 691 }, { "entropy": 2.4476318359375, "epoch": 0.011914292847119996, "grad_norm": 0.5511784553527832, "learning_rate": 6.91e-06, "loss": 2.4008, "mean_token_accuracy": 0.4786239666864276, "num_tokens": 100317239.0, "step": 692 }, { "entropy": 2.4410400390625, "epoch": 0.011931510033315256, "grad_norm": 0.5769883394241333, "learning_rate": 6.92e-06, "loss": 2.3883, "mean_token_accuracy": 0.48180802492424846, "num_tokens": 100472196.0, "step": 693 }, { "entropy": 2.423828125, "epoch": 0.011948727219510515, "grad_norm": 0.5468559265136719, "learning_rate": 6.93e-06, "loss": 2.3532, "mean_token_accuracy": 0.4820295791141689, "num_tokens": 100630584.0, "step": 694 }, { "entropy": 2.484619140625, "epoch": 0.011965944405705775, "grad_norm": 0.5945432782173157, "learning_rate": 6.9400000000000005e-06, "loss": 2.4283, "mean_token_accuracy": 0.4779118075966835, "num_tokens": 100786824.0, "step": 695 }, { "entropy": 2.4324951171875, "epoch": 0.011983161591901036, "grad_norm": 0.5941588878631592, "learning_rate": 6.95e-06, "loss": 2.3637, "mean_token_accuracy": 0.4826899361796677, "num_tokens": 100927601.0, "step": 696 }, { "entropy": 2.46240234375, "epoch": 0.012000378778096297, "grad_norm": 0.5858972668647766, "learning_rate": 6.96e-06, "loss": 2.4147, "mean_token_accuracy": 0.47940469440072775, "num_tokens": 101064043.0, "step": 697 }, { "entropy": 2.44091796875, "epoch": 0.012017595964291555, "grad_norm": 0.5900692939758301, "learning_rate": 6.97e-06, "loss": 2.3959, "mean_token_accuracy": 0.48975384049117565, "num_tokens": 101209114.0, "step": 698 }, { "entropy": 2.400146484375, "epoch": 0.012034813150486816, "grad_norm": 0.5515392422676086, "learning_rate": 6.98e-06, "loss": 2.3642, "mean_token_accuracy": 0.4900199566036463, "num_tokens": 101358773.0, "step": 699 }, { "entropy": 2.3948974609375, "epoch": 0.012052030336682076, "grad_norm": 0.5985309481620789, "learning_rate": 6.99e-06, "loss": 2.3383, "mean_token_accuracy": 0.49027787847444415, "num_tokens": 101507432.0, "step": 700 }, { "entropy": 2.45751953125, "epoch": 0.012069247522877337, "grad_norm": 0.5618374943733215, "learning_rate": 7e-06, "loss": 2.434, "mean_token_accuracy": 0.47642564633861184, "num_tokens": 101645197.0, "step": 701 }, { "entropy": 2.405517578125, "epoch": 0.012086464709072596, "grad_norm": 0.5900906920433044, "learning_rate": 7.01e-06, "loss": 2.3832, "mean_token_accuracy": 0.48692094907164574, "num_tokens": 101786084.0, "step": 702 }, { "entropy": 2.4290771484375, "epoch": 0.012103681895267856, "grad_norm": 0.5847712755203247, "learning_rate": 7.0200000000000006e-06, "loss": 2.4093, "mean_token_accuracy": 0.48114079609513283, "num_tokens": 101929991.0, "step": 703 }, { "entropy": 2.4827880859375, "epoch": 0.012120899081463117, "grad_norm": 0.6014482378959656, "learning_rate": 7.0300000000000005e-06, "loss": 2.4566, "mean_token_accuracy": 0.4787940843962133, "num_tokens": 102069231.0, "step": 704 }, { "entropy": 2.5064697265625, "epoch": 0.012138116267658377, "grad_norm": 0.5504627823829651, "learning_rate": 7.04e-06, "loss": 2.4534, "mean_token_accuracy": 0.46901731938123703, "num_tokens": 102218938.0, "step": 705 }, { "entropy": 2.401123046875, "epoch": 0.012155333453853636, "grad_norm": 0.6932315826416016, "learning_rate": 7.05e-06, "loss": 2.3427, "mean_token_accuracy": 0.4926096093840897, "num_tokens": 102353532.0, "step": 706 }, { "entropy": 2.46923828125, "epoch": 0.012172550640048897, "grad_norm": 0.5833364725112915, "learning_rate": 7.06e-06, "loss": 2.4615, "mean_token_accuracy": 0.47761310590431094, "num_tokens": 102490402.0, "step": 707 }, { "entropy": 2.439697265625, "epoch": 0.012189767826244157, "grad_norm": 0.61143559217453, "learning_rate": 7.07e-06, "loss": 2.3579, "mean_token_accuracy": 0.4858525595627725, "num_tokens": 102627917.0, "step": 708 }, { "entropy": 2.49169921875, "epoch": 0.012206985012439418, "grad_norm": 0.601938009262085, "learning_rate": 7.08e-06, "loss": 2.4313, "mean_token_accuracy": 0.4729892536997795, "num_tokens": 102759385.0, "step": 709 }, { "entropy": 2.4722900390625, "epoch": 0.012224202198634676, "grad_norm": 0.5767009854316711, "learning_rate": 7.09e-06, "loss": 2.4482, "mean_token_accuracy": 0.4772743955254555, "num_tokens": 102915430.0, "step": 710 }, { "entropy": 2.46142578125, "epoch": 0.012241419384829937, "grad_norm": 0.546136200428009, "learning_rate": 7.100000000000001e-06, "loss": 2.3961, "mean_token_accuracy": 0.4819467253983021, "num_tokens": 103066691.0, "step": 711 }, { "entropy": 2.48583984375, "epoch": 0.012258636571025197, "grad_norm": 0.5756231546401978, "learning_rate": 7.1100000000000005e-06, "loss": 2.4676, "mean_token_accuracy": 0.4714622185565531, "num_tokens": 103217176.0, "step": 712 }, { "entropy": 2.4246826171875, "epoch": 0.012275853757220458, "grad_norm": 0.5694965720176697, "learning_rate": 7.1200000000000004e-06, "loss": 2.3603, "mean_token_accuracy": 0.4863223168067634, "num_tokens": 103368786.0, "step": 713 }, { "entropy": 2.447265625, "epoch": 0.012293070943415717, "grad_norm": 0.5844082236289978, "learning_rate": 7.13e-06, "loss": 2.4418, "mean_token_accuracy": 0.482551914639771, "num_tokens": 103515573.0, "step": 714 }, { "entropy": 2.509765625, "epoch": 0.012310288129610977, "grad_norm": 0.5859200358390808, "learning_rate": 7.14e-06, "loss": 2.4894, "mean_token_accuracy": 0.4717716183513403, "num_tokens": 103649277.0, "step": 715 }, { "entropy": 2.45263671875, "epoch": 0.012327505315806238, "grad_norm": 0.5378598570823669, "learning_rate": 7.15e-06, "loss": 2.4289, "mean_token_accuracy": 0.47941537760198116, "num_tokens": 103802169.0, "step": 716 }, { "entropy": 2.4493408203125, "epoch": 0.012344722502001498, "grad_norm": 0.6121527552604675, "learning_rate": 7.16e-06, "loss": 2.3747, "mean_token_accuracy": 0.4810113995335996, "num_tokens": 103939625.0, "step": 717 }, { "entropy": 2.4144287109375, "epoch": 0.012361939688196759, "grad_norm": 0.5776681900024414, "learning_rate": 7.17e-06, "loss": 2.3531, "mean_token_accuracy": 0.4864069065079093, "num_tokens": 104080617.0, "step": 718 }, { "entropy": 2.3646240234375, "epoch": 0.012379156874392018, "grad_norm": 0.5905261635780334, "learning_rate": 7.180000000000001e-06, "loss": 2.3327, "mean_token_accuracy": 0.49934633634984493, "num_tokens": 104220572.0, "step": 719 }, { "entropy": 2.4034423828125, "epoch": 0.012396374060587278, "grad_norm": 0.5707198977470398, "learning_rate": 7.190000000000001e-06, "loss": 2.3433, "mean_token_accuracy": 0.49225129559636116, "num_tokens": 104361519.0, "step": 720 }, { "entropy": 2.4696044921875, "epoch": 0.012413591246782539, "grad_norm": 0.5869008302688599, "learning_rate": 7.2000000000000005e-06, "loss": 2.4365, "mean_token_accuracy": 0.47933553624898195, "num_tokens": 104506297.0, "step": 721 }, { "entropy": 2.4625244140625, "epoch": 0.0124308084329778, "grad_norm": 0.5947871208190918, "learning_rate": 7.2100000000000004e-06, "loss": 2.4067, "mean_token_accuracy": 0.48690010188147426, "num_tokens": 104647015.0, "step": 722 }, { "entropy": 2.450439453125, "epoch": 0.012448025619173058, "grad_norm": 0.6154801845550537, "learning_rate": 7.22e-06, "loss": 2.4108, "mean_token_accuracy": 0.47685753647238016, "num_tokens": 104802411.0, "step": 723 }, { "entropy": 2.419921875, "epoch": 0.012465242805368319, "grad_norm": 0.6330761313438416, "learning_rate": 7.23e-06, "loss": 2.3759, "mean_token_accuracy": 0.4840613235719502, "num_tokens": 104951305.0, "step": 724 }, { "entropy": 2.43359375, "epoch": 0.012482459991563579, "grad_norm": 0.6040731072425842, "learning_rate": 7.24e-06, "loss": 2.3609, "mean_token_accuracy": 0.4850413934327662, "num_tokens": 105091990.0, "step": 725 }, { "entropy": 2.44482421875, "epoch": 0.01249967717775884, "grad_norm": 0.6250449419021606, "learning_rate": 7.25e-06, "loss": 2.4137, "mean_token_accuracy": 0.4755057515576482, "num_tokens": 105212920.0, "step": 726 }, { "entropy": 2.4415283203125, "epoch": 0.012516894363954098, "grad_norm": 0.6397738456726074, "learning_rate": 7.260000000000001e-06, "loss": 2.4425, "mean_token_accuracy": 0.4787881104275584, "num_tokens": 105329271.0, "step": 727 }, { "entropy": 2.3997802734375, "epoch": 0.012534111550149359, "grad_norm": 0.5615900158882141, "learning_rate": 7.270000000000001e-06, "loss": 2.4034, "mean_token_accuracy": 0.4808120485395193, "num_tokens": 105487622.0, "step": 728 }, { "entropy": 2.464111328125, "epoch": 0.01255132873634462, "grad_norm": 0.5766611695289612, "learning_rate": 7.280000000000001e-06, "loss": 2.4341, "mean_token_accuracy": 0.47967397794127464, "num_tokens": 105633781.0, "step": 729 }, { "entropy": 2.482421875, "epoch": 0.01256854592253988, "grad_norm": 0.557733416557312, "learning_rate": 7.2900000000000005e-06, "loss": 2.4198, "mean_token_accuracy": 0.4778105691075325, "num_tokens": 105783888.0, "step": 730 }, { "entropy": 2.3619384765625, "epoch": 0.012585763108735139, "grad_norm": 0.5741136074066162, "learning_rate": 7.3e-06, "loss": 2.3078, "mean_token_accuracy": 0.49396718852221966, "num_tokens": 105927244.0, "step": 731 }, { "entropy": 2.431396484375, "epoch": 0.0126029802949304, "grad_norm": 0.5635918974876404, "learning_rate": 7.31e-06, "loss": 2.3871, "mean_token_accuracy": 0.48639540281146765, "num_tokens": 106070617.0, "step": 732 }, { "entropy": 2.4696044921875, "epoch": 0.01262019748112566, "grad_norm": 0.578965425491333, "learning_rate": 7.32e-06, "loss": 2.3938, "mean_token_accuracy": 0.4847126523964107, "num_tokens": 106230425.0, "step": 733 }, { "entropy": 2.458251953125, "epoch": 0.01263741466732092, "grad_norm": 0.9026923775672913, "learning_rate": 7.33e-06, "loss": 2.4071, "mean_token_accuracy": 0.4807370454072952, "num_tokens": 106375429.0, "step": 734 }, { "entropy": 2.4134521484375, "epoch": 0.012654631853516179, "grad_norm": 0.6249892711639404, "learning_rate": 7.340000000000001e-06, "loss": 2.3442, "mean_token_accuracy": 0.48796508787199855, "num_tokens": 106523980.0, "step": 735 }, { "entropy": 2.403076171875, "epoch": 0.01267184903971144, "grad_norm": 0.5887693166732788, "learning_rate": 7.350000000000001e-06, "loss": 2.3872, "mean_token_accuracy": 0.48691826686263084, "num_tokens": 106669149.0, "step": 736 }, { "entropy": 2.4166259765625, "epoch": 0.0126890662259067, "grad_norm": 0.617521345615387, "learning_rate": 7.360000000000001e-06, "loss": 2.3293, "mean_token_accuracy": 0.48858933337032795, "num_tokens": 106798139.0, "step": 737 }, { "entropy": 2.44921875, "epoch": 0.01270628341210196, "grad_norm": 0.5710563659667969, "learning_rate": 7.370000000000001e-06, "loss": 2.4245, "mean_token_accuracy": 0.47817583242431283, "num_tokens": 106937567.0, "step": 738 }, { "entropy": 2.4063720703125, "epoch": 0.01272350059829722, "grad_norm": 1.2993676662445068, "learning_rate": 7.3800000000000005e-06, "loss": 2.3553, "mean_token_accuracy": 0.48716708505526185, "num_tokens": 107069215.0, "step": 739 }, { "entropy": 2.417236328125, "epoch": 0.01274071778449248, "grad_norm": 0.5727201104164124, "learning_rate": 7.39e-06, "loss": 2.3806, "mean_token_accuracy": 0.4816734539344907, "num_tokens": 107216177.0, "step": 740 }, { "entropy": 2.470947265625, "epoch": 0.01275793497068774, "grad_norm": 0.568335771560669, "learning_rate": 7.4e-06, "loss": 2.4227, "mean_token_accuracy": 0.4798375847749412, "num_tokens": 107367907.0, "step": 741 }, { "entropy": 2.4110107421875, "epoch": 0.012775152156883001, "grad_norm": 0.6011884808540344, "learning_rate": 7.41e-06, "loss": 2.3219, "mean_token_accuracy": 0.49019240494817495, "num_tokens": 107504131.0, "step": 742 }, { "entropy": 2.41845703125, "epoch": 0.01279236934307826, "grad_norm": 0.5796740055084229, "learning_rate": 7.420000000000001e-06, "loss": 2.3559, "mean_token_accuracy": 0.48748622741550207, "num_tokens": 107645959.0, "step": 743 }, { "entropy": 2.4940185546875, "epoch": 0.01280958652927352, "grad_norm": 0.5444216132164001, "learning_rate": 7.430000000000001e-06, "loss": 2.4531, "mean_token_accuracy": 0.4708722811192274, "num_tokens": 107795485.0, "step": 744 }, { "entropy": 2.413818359375, "epoch": 0.012826803715468781, "grad_norm": 0.5786703824996948, "learning_rate": 7.440000000000001e-06, "loss": 2.3548, "mean_token_accuracy": 0.49165352433919907, "num_tokens": 107937289.0, "step": 745 }, { "entropy": 2.411865234375, "epoch": 0.012844020901664041, "grad_norm": 0.6046903729438782, "learning_rate": 7.450000000000001e-06, "loss": 2.3753, "mean_token_accuracy": 0.4823318342678249, "num_tokens": 108093121.0, "step": 746 }, { "entropy": 2.4716796875, "epoch": 0.012861238087859302, "grad_norm": 0.536321222782135, "learning_rate": 7.4600000000000006e-06, "loss": 2.4663, "mean_token_accuracy": 0.4771149712614715, "num_tokens": 108257267.0, "step": 747 }, { "entropy": 2.4007568359375, "epoch": 0.01287845527405456, "grad_norm": 0.6158970594406128, "learning_rate": 7.4700000000000005e-06, "loss": 2.3579, "mean_token_accuracy": 0.4832296408712864, "num_tokens": 108396635.0, "step": 748 }, { "entropy": 2.4298095703125, "epoch": 0.012895672460249821, "grad_norm": 0.5570418834686279, "learning_rate": 7.48e-06, "loss": 2.4086, "mean_token_accuracy": 0.48002893943339586, "num_tokens": 108544662.0, "step": 749 }, { "entropy": 2.54150390625, "epoch": 0.012912889646445082, "grad_norm": 0.6132979393005371, "learning_rate": 7.49e-06, "loss": 2.4981, "mean_token_accuracy": 0.46807813877239823, "num_tokens": 108709034.0, "step": 750 }, { "entropy": 2.438232421875, "epoch": 0.012930106832640342, "grad_norm": 0.5490818023681641, "learning_rate": 7.500000000000001e-06, "loss": 2.3753, "mean_token_accuracy": 0.48799073603004217, "num_tokens": 108859610.0, "step": 751 }, { "entropy": 2.4285888671875, "epoch": 0.012947324018835601, "grad_norm": 0.5972912311553955, "learning_rate": 7.510000000000001e-06, "loss": 2.4225, "mean_token_accuracy": 0.48296895902603865, "num_tokens": 109009475.0, "step": 752 }, { "entropy": 2.4571533203125, "epoch": 0.012964541205030862, "grad_norm": 0.5518878102302551, "learning_rate": 7.520000000000001e-06, "loss": 2.3751, "mean_token_accuracy": 0.4834339157678187, "num_tokens": 109158002.0, "step": 753 }, { "entropy": 2.41162109375, "epoch": 0.012981758391226122, "grad_norm": 0.602249801158905, "learning_rate": 7.530000000000001e-06, "loss": 2.3587, "mean_token_accuracy": 0.4893317800015211, "num_tokens": 109293690.0, "step": 754 }, { "entropy": 2.4407958984375, "epoch": 0.012998975577421383, "grad_norm": 0.5610491633415222, "learning_rate": 7.540000000000001e-06, "loss": 2.4023, "mean_token_accuracy": 0.4784908280707896, "num_tokens": 109439590.0, "step": 755 }, { "entropy": 2.4281005859375, "epoch": 0.013016192763616641, "grad_norm": 0.5914566516876221, "learning_rate": 7.5500000000000006e-06, "loss": 2.4052, "mean_token_accuracy": 0.4888386274687946, "num_tokens": 109582432.0, "step": 756 }, { "entropy": 2.504638671875, "epoch": 0.013033409949811902, "grad_norm": 0.5595380067825317, "learning_rate": 7.5600000000000005e-06, "loss": 2.4702, "mean_token_accuracy": 0.4686399414204061, "num_tokens": 109735418.0, "step": 757 }, { "entropy": 2.4144287109375, "epoch": 0.013050627136007163, "grad_norm": 0.5793854594230652, "learning_rate": 7.57e-06, "loss": 2.3714, "mean_token_accuracy": 0.4886330468580127, "num_tokens": 109891701.0, "step": 758 }, { "entropy": 2.4320068359375, "epoch": 0.013067844322202423, "grad_norm": 0.6832471489906311, "learning_rate": 7.58e-06, "loss": 2.3694, "mean_token_accuracy": 0.48566275043413043, "num_tokens": 110039268.0, "step": 759 }, { "entropy": 2.4044189453125, "epoch": 0.013085061508397682, "grad_norm": 0.6120168566703796, "learning_rate": 7.590000000000001e-06, "loss": 2.3659, "mean_token_accuracy": 0.4929888774640858, "num_tokens": 110182050.0, "step": 760 }, { "entropy": 2.491943359375, "epoch": 0.013102278694592942, "grad_norm": 0.6201086044311523, "learning_rate": 7.600000000000001e-06, "loss": 2.4232, "mean_token_accuracy": 0.4787399894557893, "num_tokens": 110303657.0, "step": 761 }, { "entropy": 2.499755859375, "epoch": 0.013119495880788203, "grad_norm": 0.5904567837715149, "learning_rate": 7.610000000000001e-06, "loss": 2.4735, "mean_token_accuracy": 0.47595866583287716, "num_tokens": 110445376.0, "step": 762 }, { "entropy": 2.4344482421875, "epoch": 0.013136713066983463, "grad_norm": 0.7313216924667358, "learning_rate": 7.620000000000001e-06, "loss": 2.3898, "mean_token_accuracy": 0.4856723416596651, "num_tokens": 110599091.0, "step": 763 }, { "entropy": 2.412353515625, "epoch": 0.013153930253178722, "grad_norm": 0.582177460193634, "learning_rate": 7.630000000000001e-06, "loss": 2.3465, "mean_token_accuracy": 0.49040036741644144, "num_tokens": 110767359.0, "step": 764 }, { "entropy": 2.4423828125, "epoch": 0.013171147439373983, "grad_norm": 0.5971955060958862, "learning_rate": 7.640000000000001e-06, "loss": 2.4243, "mean_token_accuracy": 0.47924549924209714, "num_tokens": 110904749.0, "step": 765 }, { "entropy": 2.370361328125, "epoch": 0.013188364625569243, "grad_norm": 0.5825552940368652, "learning_rate": 7.650000000000001e-06, "loss": 2.337, "mean_token_accuracy": 0.48746788455173373, "num_tokens": 111041416.0, "step": 766 }, { "entropy": 2.485107421875, "epoch": 0.013205581811764504, "grad_norm": 0.5713107585906982, "learning_rate": 7.660000000000001e-06, "loss": 2.4491, "mean_token_accuracy": 0.4754057708196342, "num_tokens": 111179069.0, "step": 767 }, { "entropy": 2.412353515625, "epoch": 0.013222798997959763, "grad_norm": 0.5602453947067261, "learning_rate": 7.670000000000001e-06, "loss": 2.3753, "mean_token_accuracy": 0.49099403340369463, "num_tokens": 111333973.0, "step": 768 }, { "entropy": 2.40283203125, "epoch": 0.013240016184155023, "grad_norm": 0.5362405180931091, "learning_rate": 7.680000000000001e-06, "loss": 2.3705, "mean_token_accuracy": 0.48479648493230343, "num_tokens": 111483914.0, "step": 769 }, { "entropy": 2.4000244140625, "epoch": 0.013257233370350284, "grad_norm": 0.5498298406600952, "learning_rate": 7.690000000000001e-06, "loss": 2.332, "mean_token_accuracy": 0.49140041740611196, "num_tokens": 111637688.0, "step": 770 }, { "entropy": 2.3616943359375, "epoch": 0.013274450556545544, "grad_norm": 0.5781952142715454, "learning_rate": 7.7e-06, "loss": 2.3117, "mean_token_accuracy": 0.49226083187386394, "num_tokens": 111777176.0, "step": 771 }, { "entropy": 2.45654296875, "epoch": 0.013291667742740805, "grad_norm": 0.5963894724845886, "learning_rate": 7.71e-06, "loss": 2.4077, "mean_token_accuracy": 0.4808903872035444, "num_tokens": 111907633.0, "step": 772 }, { "entropy": 2.47265625, "epoch": 0.013308884928936063, "grad_norm": 0.5909532904624939, "learning_rate": 7.72e-06, "loss": 2.4344, "mean_token_accuracy": 0.48116163862869143, "num_tokens": 112047691.0, "step": 773 }, { "entropy": 2.4112548828125, "epoch": 0.013326102115131324, "grad_norm": 0.6000514626502991, "learning_rate": 7.73e-06, "loss": 2.3594, "mean_token_accuracy": 0.4857400543987751, "num_tokens": 112185378.0, "step": 774 }, { "entropy": 2.41943359375, "epoch": 0.013343319301326585, "grad_norm": 0.586081326007843, "learning_rate": 7.74e-06, "loss": 2.3978, "mean_token_accuracy": 0.4904695344157517, "num_tokens": 112341351.0, "step": 775 }, { "entropy": 2.4359130859375, "epoch": 0.013360536487521845, "grad_norm": 0.5683102607727051, "learning_rate": 7.75e-06, "loss": 2.3903, "mean_token_accuracy": 0.4875342110171914, "num_tokens": 112487164.0, "step": 776 }, { "entropy": 2.4697265625, "epoch": 0.013377753673717104, "grad_norm": 0.5665966272354126, "learning_rate": 7.76e-06, "loss": 2.4298, "mean_token_accuracy": 0.47403282299637794, "num_tokens": 112629741.0, "step": 777 }, { "entropy": 2.45703125, "epoch": 0.013394970859912364, "grad_norm": 0.5655020475387573, "learning_rate": 7.77e-06, "loss": 2.402, "mean_token_accuracy": 0.48069310747087, "num_tokens": 112767619.0, "step": 778 }, { "entropy": 2.48095703125, "epoch": 0.013412188046107625, "grad_norm": 0.5934699177742004, "learning_rate": 7.78e-06, "loss": 2.4576, "mean_token_accuracy": 0.4850818943232298, "num_tokens": 112913870.0, "step": 779 }, { "entropy": 2.4310302734375, "epoch": 0.013429405232302885, "grad_norm": 0.7897031307220459, "learning_rate": 7.790000000000002e-06, "loss": 2.3135, "mean_token_accuracy": 0.4851987957954407, "num_tokens": 113047948.0, "step": 780 }, { "entropy": 2.3919677734375, "epoch": 0.013446622418498144, "grad_norm": 0.5672735571861267, "learning_rate": 7.800000000000002e-06, "loss": 2.3579, "mean_token_accuracy": 0.48478852584958076, "num_tokens": 113210948.0, "step": 781 }, { "entropy": 2.4381103515625, "epoch": 0.013463839604693405, "grad_norm": 0.6065455079078674, "learning_rate": 7.810000000000001e-06, "loss": 2.3854, "mean_token_accuracy": 0.4848202792927623, "num_tokens": 113361592.0, "step": 782 }, { "entropy": 2.4210205078125, "epoch": 0.013481056790888665, "grad_norm": 0.5307328701019287, "learning_rate": 7.820000000000001e-06, "loss": 2.3612, "mean_token_accuracy": 0.4887157790362835, "num_tokens": 113524797.0, "step": 783 }, { "entropy": 2.4393310546875, "epoch": 0.013498273977083926, "grad_norm": 0.6069521307945251, "learning_rate": 7.830000000000001e-06, "loss": 2.4034, "mean_token_accuracy": 0.4795961854979396, "num_tokens": 113659620.0, "step": 784 }, { "entropy": 2.443603515625, "epoch": 0.013515491163279185, "grad_norm": 0.5770717263221741, "learning_rate": 7.840000000000001e-06, "loss": 2.4369, "mean_token_accuracy": 0.48013802990317345, "num_tokens": 113803890.0, "step": 785 }, { "entropy": 2.4228515625, "epoch": 0.013532708349474445, "grad_norm": 0.6027282476425171, "learning_rate": 7.850000000000001e-06, "loss": 2.406, "mean_token_accuracy": 0.48077098093926907, "num_tokens": 113948731.0, "step": 786 }, { "entropy": 2.439208984375, "epoch": 0.013549925535669706, "grad_norm": 0.6242732405662537, "learning_rate": 7.860000000000001e-06, "loss": 2.3891, "mean_token_accuracy": 0.4767256425693631, "num_tokens": 114095401.0, "step": 787 }, { "entropy": 2.507568359375, "epoch": 0.013567142721864966, "grad_norm": 0.5736910104751587, "learning_rate": 7.870000000000001e-06, "loss": 2.4509, "mean_token_accuracy": 0.4732041200622916, "num_tokens": 114245869.0, "step": 788 }, { "entropy": 2.485595703125, "epoch": 0.013584359908060225, "grad_norm": 0.5609222054481506, "learning_rate": 7.88e-06, "loss": 2.4312, "mean_token_accuracy": 0.4742095875553787, "num_tokens": 114388858.0, "step": 789 }, { "entropy": 2.56689453125, "epoch": 0.013601577094255485, "grad_norm": 0.5870925784111023, "learning_rate": 7.89e-06, "loss": 2.5389, "mean_token_accuracy": 0.46493083937093616, "num_tokens": 114523322.0, "step": 790 }, { "entropy": 2.49267578125, "epoch": 0.013618794280450746, "grad_norm": 0.5677596926689148, "learning_rate": 7.9e-06, "loss": 2.4385, "mean_token_accuracy": 0.4769080653786659, "num_tokens": 114663505.0, "step": 791 }, { "entropy": 2.485595703125, "epoch": 0.013636011466646007, "grad_norm": 0.5597982406616211, "learning_rate": 7.91e-06, "loss": 2.4263, "mean_token_accuracy": 0.4761881032027304, "num_tokens": 114813922.0, "step": 792 }, { "entropy": 2.416259765625, "epoch": 0.013653228652841265, "grad_norm": 0.5396745204925537, "learning_rate": 7.92e-06, "loss": 2.3336, "mean_token_accuracy": 0.49072846584022045, "num_tokens": 114962105.0, "step": 793 }, { "entropy": 2.3951416015625, "epoch": 0.013670445839036526, "grad_norm": 0.6175518035888672, "learning_rate": 7.93e-06, "loss": 2.339, "mean_token_accuracy": 0.4950829269364476, "num_tokens": 115103571.0, "step": 794 }, { "entropy": 2.4580078125, "epoch": 0.013687663025231786, "grad_norm": 0.5724807977676392, "learning_rate": 7.94e-06, "loss": 2.3744, "mean_token_accuracy": 0.47975975926965475, "num_tokens": 115243719.0, "step": 795 }, { "entropy": 2.5086669921875, "epoch": 0.013704880211427047, "grad_norm": 0.5798308849334717, "learning_rate": 7.950000000000002e-06, "loss": 2.4594, "mean_token_accuracy": 0.4762448235414922, "num_tokens": 115383378.0, "step": 796 }, { "entropy": 2.4212646484375, "epoch": 0.013722097397622307, "grad_norm": 0.5839712023735046, "learning_rate": 7.960000000000002e-06, "loss": 2.3884, "mean_token_accuracy": 0.48001448903232813, "num_tokens": 115527648.0, "step": 797 }, { "entropy": 2.388671875, "epoch": 0.013739314583817566, "grad_norm": 0.630893349647522, "learning_rate": 7.970000000000002e-06, "loss": 2.3391, "mean_token_accuracy": 0.48810708662495017, "num_tokens": 115673489.0, "step": 798 }, { "entropy": 2.475341796875, "epoch": 0.013756531770012827, "grad_norm": 0.590587854385376, "learning_rate": 7.980000000000002e-06, "loss": 2.4425, "mean_token_accuracy": 0.4739443711005151, "num_tokens": 115814268.0, "step": 799 }, { "entropy": 2.380615234375, "epoch": 0.013773748956208087, "grad_norm": 0.566112220287323, "learning_rate": 7.990000000000001e-06, "loss": 2.3345, "mean_token_accuracy": 0.49479513335973024, "num_tokens": 115962087.0, "step": 800 }, { "entropy": 2.53515625, "epoch": 0.013790966142403348, "grad_norm": 0.5729132890701294, "learning_rate": 8.000000000000001e-06, "loss": 2.478, "mean_token_accuracy": 0.46622643573209643, "num_tokens": 116096894.0, "step": 801 }, { "entropy": 2.43701171875, "epoch": 0.013808183328598607, "grad_norm": 0.5780159831047058, "learning_rate": 8.010000000000001e-06, "loss": 2.4245, "mean_token_accuracy": 0.48138847574591637, "num_tokens": 116248222.0, "step": 802 }, { "entropy": 2.44482421875, "epoch": 0.013825400514793867, "grad_norm": 0.5970706343650818, "learning_rate": 8.020000000000001e-06, "loss": 2.4229, "mean_token_accuracy": 0.4804395758546889, "num_tokens": 116405644.0, "step": 803 }, { "entropy": 2.43505859375, "epoch": 0.013842617700989128, "grad_norm": 0.5705693364143372, "learning_rate": 8.030000000000001e-06, "loss": 2.3441, "mean_token_accuracy": 0.48369065998122096, "num_tokens": 116548738.0, "step": 804 }, { "entropy": 2.4342041015625, "epoch": 0.013859834887184388, "grad_norm": 0.5916334390640259, "learning_rate": 8.040000000000001e-06, "loss": 2.3664, "mean_token_accuracy": 0.4856993416324258, "num_tokens": 116691888.0, "step": 805 }, { "entropy": 2.4691162109375, "epoch": 0.013877052073379647, "grad_norm": 0.579868495464325, "learning_rate": 8.050000000000001e-06, "loss": 2.3724, "mean_token_accuracy": 0.4843080313876271, "num_tokens": 116830242.0, "step": 806 }, { "entropy": 2.40966796875, "epoch": 0.013894269259574907, "grad_norm": 0.5866817831993103, "learning_rate": 8.06e-06, "loss": 2.3801, "mean_token_accuracy": 0.48936720937490463, "num_tokens": 116965900.0, "step": 807 }, { "entropy": 2.4056396484375, "epoch": 0.013911486445770168, "grad_norm": 0.5471423864364624, "learning_rate": 8.07e-06, "loss": 2.3545, "mean_token_accuracy": 0.4835878200829029, "num_tokens": 117118489.0, "step": 808 }, { "entropy": 2.42529296875, "epoch": 0.013928703631965429, "grad_norm": 0.577364981174469, "learning_rate": 8.08e-06, "loss": 2.3581, "mean_token_accuracy": 0.48418712290003896, "num_tokens": 117264043.0, "step": 809 }, { "entropy": 2.43310546875, "epoch": 0.013945920818160687, "grad_norm": 0.5444287657737732, "learning_rate": 8.09e-06, "loss": 2.4178, "mean_token_accuracy": 0.476511531509459, "num_tokens": 117419082.0, "step": 810 }, { "entropy": 2.435302734375, "epoch": 0.013963138004355948, "grad_norm": 0.6165118217468262, "learning_rate": 8.1e-06, "loss": 2.4043, "mean_token_accuracy": 0.48205558583140373, "num_tokens": 117552496.0, "step": 811 }, { "entropy": 2.462158203125, "epoch": 0.013980355190551208, "grad_norm": 0.5636385679244995, "learning_rate": 8.110000000000002e-06, "loss": 2.4252, "mean_token_accuracy": 0.4745056303218007, "num_tokens": 117702578.0, "step": 812 }, { "entropy": 2.542724609375, "epoch": 0.013997572376746469, "grad_norm": 0.5832167863845825, "learning_rate": 8.120000000000002e-06, "loss": 2.5385, "mean_token_accuracy": 0.4675266365520656, "num_tokens": 117850701.0, "step": 813 }, { "entropy": 2.489990234375, "epoch": 0.014014789562941728, "grad_norm": 0.5839970111846924, "learning_rate": 8.13e-06, "loss": 2.4495, "mean_token_accuracy": 0.47388132382184267, "num_tokens": 118001878.0, "step": 814 }, { "entropy": 2.429931640625, "epoch": 0.014032006749136988, "grad_norm": 0.6004050970077515, "learning_rate": 8.14e-06, "loss": 2.3689, "mean_token_accuracy": 0.48473711824044585, "num_tokens": 118146781.0, "step": 815 }, { "entropy": 2.517333984375, "epoch": 0.014049223935332249, "grad_norm": 0.5721548795700073, "learning_rate": 8.15e-06, "loss": 2.5005, "mean_token_accuracy": 0.47145770117640495, "num_tokens": 118281715.0, "step": 816 }, { "entropy": 2.41259765625, "epoch": 0.01406644112152751, "grad_norm": 0.6043224930763245, "learning_rate": 8.16e-06, "loss": 2.3785, "mean_token_accuracy": 0.48464843491092324, "num_tokens": 118420852.0, "step": 817 }, { "entropy": 2.46435546875, "epoch": 0.014083658307722768, "grad_norm": 0.5578818321228027, "learning_rate": 8.17e-06, "loss": 2.4389, "mean_token_accuracy": 0.4739728611893952, "num_tokens": 118561047.0, "step": 818 }, { "entropy": 2.46435546875, "epoch": 0.014100875493918029, "grad_norm": 0.5339409112930298, "learning_rate": 8.18e-06, "loss": 2.4303, "mean_token_accuracy": 0.478867762722075, "num_tokens": 118724226.0, "step": 819 }, { "entropy": 2.4332275390625, "epoch": 0.014118092680113289, "grad_norm": 0.5819604396820068, "learning_rate": 8.19e-06, "loss": 2.3808, "mean_token_accuracy": 0.4880368346348405, "num_tokens": 118869937.0, "step": 820 }, { "entropy": 2.465576171875, "epoch": 0.01413530986630855, "grad_norm": 0.571780800819397, "learning_rate": 8.2e-06, "loss": 2.4314, "mean_token_accuracy": 0.4772975808009505, "num_tokens": 119022262.0, "step": 821 }, { "entropy": 2.4794921875, "epoch": 0.01415252705250381, "grad_norm": 0.5831077694892883, "learning_rate": 8.210000000000001e-06, "loss": 2.4701, "mean_token_accuracy": 0.47361723706126213, "num_tokens": 119165669.0, "step": 822 }, { "entropy": 2.4208984375, "epoch": 0.014169744238699069, "grad_norm": 0.5717747211456299, "learning_rate": 8.220000000000001e-06, "loss": 2.3679, "mean_token_accuracy": 0.48527460684999824, "num_tokens": 119301111.0, "step": 823 }, { "entropy": 2.45947265625, "epoch": 0.01418696142489433, "grad_norm": 0.6002344489097595, "learning_rate": 8.23e-06, "loss": 2.3749, "mean_token_accuracy": 0.48156877839937806, "num_tokens": 119443072.0, "step": 824 }, { "entropy": 2.4619140625, "epoch": 0.01420417861108959, "grad_norm": 0.5510353446006775, "learning_rate": 8.24e-06, "loss": 2.4012, "mean_token_accuracy": 0.47862795926630497, "num_tokens": 119599588.0, "step": 825 }, { "entropy": 2.426513671875, "epoch": 0.01422139579728485, "grad_norm": 0.6013565063476562, "learning_rate": 8.25e-06, "loss": 2.345, "mean_token_accuracy": 0.48915665224194527, "num_tokens": 119745982.0, "step": 826 }, { "entropy": 2.506103515625, "epoch": 0.01423861298348011, "grad_norm": 0.5756415724754333, "learning_rate": 8.26e-06, "loss": 2.4707, "mean_token_accuracy": 0.4742961646988988, "num_tokens": 119883211.0, "step": 827 }, { "entropy": 2.427734375, "epoch": 0.01425583016967537, "grad_norm": 0.6262539625167847, "learning_rate": 8.27e-06, "loss": 2.3734, "mean_token_accuracy": 0.4855869854800403, "num_tokens": 120007564.0, "step": 828 }, { "entropy": 2.447998046875, "epoch": 0.01427304735587063, "grad_norm": 0.577958345413208, "learning_rate": 8.28e-06, "loss": 2.4136, "mean_token_accuracy": 0.4838542784564197, "num_tokens": 120147199.0, "step": 829 }, { "entropy": 2.4615478515625, "epoch": 0.014290264542065891, "grad_norm": 0.6253990530967712, "learning_rate": 8.29e-06, "loss": 2.4558, "mean_token_accuracy": 0.4775591907091439, "num_tokens": 120294626.0, "step": 830 }, { "entropy": 2.3575439453125, "epoch": 0.01430748172826115, "grad_norm": 0.5368080139160156, "learning_rate": 8.3e-06, "loss": 2.3092, "mean_token_accuracy": 0.4943057978525758, "num_tokens": 120451476.0, "step": 831 }, { "entropy": 2.447265625, "epoch": 0.01432469891445641, "grad_norm": 0.6116092801094055, "learning_rate": 8.31e-06, "loss": 2.4052, "mean_token_accuracy": 0.47798394318670034, "num_tokens": 120605020.0, "step": 832 }, { "entropy": 2.3782958984375, "epoch": 0.01434191610065167, "grad_norm": 0.5678116083145142, "learning_rate": 8.32e-06, "loss": 2.3034, "mean_token_accuracy": 0.4946139776147902, "num_tokens": 120752495.0, "step": 833 }, { "entropy": 2.363037109375, "epoch": 0.014359133286846931, "grad_norm": 0.5678343772888184, "learning_rate": 8.33e-06, "loss": 2.3141, "mean_token_accuracy": 0.49313395330682397, "num_tokens": 120900694.0, "step": 834 }, { "entropy": 2.4739990234375, "epoch": 0.01437635047304219, "grad_norm": 0.5975309014320374, "learning_rate": 8.34e-06, "loss": 2.4388, "mean_token_accuracy": 0.4748355788178742, "num_tokens": 121032159.0, "step": 835 }, { "entropy": 2.51806640625, "epoch": 0.01439356765923745, "grad_norm": 0.5592584013938904, "learning_rate": 8.35e-06, "loss": 2.4939, "mean_token_accuracy": 0.4737435169517994, "num_tokens": 121177988.0, "step": 836 }, { "entropy": 2.433837890625, "epoch": 0.014410784845432711, "grad_norm": 0.5620052218437195, "learning_rate": 8.36e-06, "loss": 2.3528, "mean_token_accuracy": 0.484672705642879, "num_tokens": 121333599.0, "step": 837 }, { "entropy": 2.4949951171875, "epoch": 0.014428002031627972, "grad_norm": 0.6018322706222534, "learning_rate": 8.370000000000001e-06, "loss": 2.4874, "mean_token_accuracy": 0.471315645147115, "num_tokens": 121483935.0, "step": 838 }, { "entropy": 2.4498291015625, "epoch": 0.01444521921782323, "grad_norm": 0.5906838774681091, "learning_rate": 8.380000000000001e-06, "loss": 2.4314, "mean_token_accuracy": 0.48000403633341193, "num_tokens": 121626405.0, "step": 839 }, { "entropy": 2.3984375, "epoch": 0.014462436404018491, "grad_norm": 0.6163339614868164, "learning_rate": 8.390000000000001e-06, "loss": 2.3408, "mean_token_accuracy": 0.4879218554124236, "num_tokens": 121755719.0, "step": 840 }, { "entropy": 2.465576171875, "epoch": 0.014479653590213751, "grad_norm": 0.6231170892715454, "learning_rate": 8.400000000000001e-06, "loss": 2.4589, "mean_token_accuracy": 0.47292218124493957, "num_tokens": 121876833.0, "step": 841 }, { "entropy": 2.4288330078125, "epoch": 0.014496870776409012, "grad_norm": 0.6095725893974304, "learning_rate": 8.41e-06, "loss": 2.3584, "mean_token_accuracy": 0.4825339512899518, "num_tokens": 122029110.0, "step": 842 }, { "entropy": 2.479248046875, "epoch": 0.01451408796260427, "grad_norm": 0.5682320594787598, "learning_rate": 8.42e-06, "loss": 2.4085, "mean_token_accuracy": 0.47804021881893277, "num_tokens": 122177885.0, "step": 843 }, { "entropy": 2.4327392578125, "epoch": 0.014531305148799531, "grad_norm": 0.5729749202728271, "learning_rate": 8.43e-06, "loss": 2.365, "mean_token_accuracy": 0.4850345575250685, "num_tokens": 122326681.0, "step": 844 }, { "entropy": 2.3778076171875, "epoch": 0.014548522334994792, "grad_norm": 0.5983383059501648, "learning_rate": 8.44e-06, "loss": 2.3205, "mean_token_accuracy": 0.4961178773082793, "num_tokens": 122468050.0, "step": 845 }, { "entropy": 2.498779296875, "epoch": 0.014565739521190052, "grad_norm": 0.5855973958969116, "learning_rate": 8.45e-06, "loss": 2.4899, "mean_token_accuracy": 0.4722827118821442, "num_tokens": 122605022.0, "step": 846 }, { "entropy": 2.51318359375, "epoch": 0.014582956707385313, "grad_norm": 0.7350115180015564, "learning_rate": 8.46e-06, "loss": 2.4897, "mean_token_accuracy": 0.4719462259672582, "num_tokens": 122756763.0, "step": 847 }, { "entropy": 2.453125, "epoch": 0.014600173893580572, "grad_norm": 0.5812132954597473, "learning_rate": 8.47e-06, "loss": 2.3954, "mean_token_accuracy": 0.48041255166754127, "num_tokens": 122903879.0, "step": 848 }, { "entropy": 2.479736328125, "epoch": 0.014617391079775832, "grad_norm": 0.5762138962745667, "learning_rate": 8.48e-06, "loss": 2.4433, "mean_token_accuracy": 0.4698569756001234, "num_tokens": 123041198.0, "step": 849 }, { "entropy": 2.420654296875, "epoch": 0.014634608265971093, "grad_norm": 0.5930407643318176, "learning_rate": 8.49e-06, "loss": 2.3483, "mean_token_accuracy": 0.4922699723392725, "num_tokens": 123183506.0, "step": 850 }, { "entropy": 2.3505859375, "epoch": 0.014651825452166353, "grad_norm": 0.5760442614555359, "learning_rate": 8.5e-06, "loss": 2.2918, "mean_token_accuracy": 0.4991521080955863, "num_tokens": 123324526.0, "step": 851 }, { "entropy": 2.53662109375, "epoch": 0.014669042638361612, "grad_norm": 0.6048057079315186, "learning_rate": 8.51e-06, "loss": 2.4955, "mean_token_accuracy": 0.46434704307466745, "num_tokens": 123453650.0, "step": 852 }, { "entropy": 2.4481201171875, "epoch": 0.014686259824556873, "grad_norm": 0.5807772874832153, "learning_rate": 8.52e-06, "loss": 2.4092, "mean_token_accuracy": 0.4818004462867975, "num_tokens": 123600203.0, "step": 853 }, { "entropy": 2.42041015625, "epoch": 0.014703477010752133, "grad_norm": 0.5653759837150574, "learning_rate": 8.530000000000001e-06, "loss": 2.3675, "mean_token_accuracy": 0.4863137351348996, "num_tokens": 123750776.0, "step": 854 }, { "entropy": 2.3917236328125, "epoch": 0.014720694196947394, "grad_norm": 0.5585445761680603, "learning_rate": 8.540000000000001e-06, "loss": 2.36, "mean_token_accuracy": 0.4850917439907789, "num_tokens": 123908995.0, "step": 855 }, { "entropy": 2.408203125, "epoch": 0.014737911383142652, "grad_norm": 0.6110696196556091, "learning_rate": 8.550000000000001e-06, "loss": 2.3376, "mean_token_accuracy": 0.4934307490475476, "num_tokens": 124048260.0, "step": 856 }, { "entropy": 2.4285888671875, "epoch": 0.014755128569337913, "grad_norm": 0.5494508147239685, "learning_rate": 8.560000000000001e-06, "loss": 2.3685, "mean_token_accuracy": 0.485230874735862, "num_tokens": 124195711.0, "step": 857 }, { "entropy": 2.49462890625, "epoch": 0.014772345755533173, "grad_norm": 0.5444778203964233, "learning_rate": 8.570000000000001e-06, "loss": 2.4763, "mean_token_accuracy": 0.4732412826269865, "num_tokens": 124345089.0, "step": 858 }, { "entropy": 2.4862060546875, "epoch": 0.014789562941728434, "grad_norm": 0.5632426738739014, "learning_rate": 8.580000000000001e-06, "loss": 2.447, "mean_token_accuracy": 0.47163996985182166, "num_tokens": 124491147.0, "step": 859 }, { "entropy": 2.492919921875, "epoch": 0.014806780127923693, "grad_norm": 0.5277729034423828, "learning_rate": 8.59e-06, "loss": 2.4818, "mean_token_accuracy": 0.47168840002268553, "num_tokens": 124653263.0, "step": 860 }, { "entropy": 2.5084228515625, "epoch": 0.014823997314118953, "grad_norm": 0.5971653461456299, "learning_rate": 8.6e-06, "loss": 2.4695, "mean_token_accuracy": 0.4728867751546204, "num_tokens": 124790799.0, "step": 861 }, { "entropy": 2.415771484375, "epoch": 0.014841214500314214, "grad_norm": 0.6263266801834106, "learning_rate": 8.61e-06, "loss": 2.3963, "mean_token_accuracy": 0.4838090669363737, "num_tokens": 124936295.0, "step": 862 }, { "entropy": 2.46142578125, "epoch": 0.014858431686509474, "grad_norm": 0.592215895652771, "learning_rate": 8.62e-06, "loss": 2.4003, "mean_token_accuracy": 0.4809744218364358, "num_tokens": 125067834.0, "step": 863 }, { "entropy": 2.4461669921875, "epoch": 0.014875648872704733, "grad_norm": 0.5886532664299011, "learning_rate": 8.63e-06, "loss": 2.4247, "mean_token_accuracy": 0.48370860423892736, "num_tokens": 125208751.0, "step": 864 }, { "entropy": 2.451904296875, "epoch": 0.014892866058899994, "grad_norm": 0.544989824295044, "learning_rate": 8.64e-06, "loss": 2.4477, "mean_token_accuracy": 0.4746030508540571, "num_tokens": 125362283.0, "step": 865 }, { "entropy": 2.47802734375, "epoch": 0.014910083245095254, "grad_norm": 0.5210621356964111, "learning_rate": 8.65e-06, "loss": 2.4684, "mean_token_accuracy": 0.4717202326282859, "num_tokens": 125535484.0, "step": 866 }, { "entropy": 2.4478759765625, "epoch": 0.014927300431290515, "grad_norm": 0.5993431806564331, "learning_rate": 8.66e-06, "loss": 2.3869, "mean_token_accuracy": 0.47974208323284984, "num_tokens": 125670144.0, "step": 867 }, { "entropy": 2.3865966796875, "epoch": 0.014944517617485773, "grad_norm": 0.5767236948013306, "learning_rate": 8.67e-06, "loss": 2.3517, "mean_token_accuracy": 0.4869615617208183, "num_tokens": 125807478.0, "step": 868 }, { "entropy": 2.526611328125, "epoch": 0.014961734803681034, "grad_norm": 0.6212364435195923, "learning_rate": 8.68e-06, "loss": 2.5197, "mean_token_accuracy": 0.4605614240281284, "num_tokens": 125927918.0, "step": 869 }, { "entropy": 2.4381103515625, "epoch": 0.014978951989876295, "grad_norm": 0.6073437929153442, "learning_rate": 8.690000000000002e-06, "loss": 2.3433, "mean_token_accuracy": 0.4832576513290405, "num_tokens": 126070123.0, "step": 870 }, { "entropy": 2.5057373046875, "epoch": 0.014996169176071555, "grad_norm": 0.5757139325141907, "learning_rate": 8.700000000000001e-06, "loss": 2.4574, "mean_token_accuracy": 0.4752332870848477, "num_tokens": 126221119.0, "step": 871 }, { "entropy": 2.47509765625, "epoch": 0.015013386362266816, "grad_norm": 0.5421658754348755, "learning_rate": 8.710000000000001e-06, "loss": 2.4344, "mean_token_accuracy": 0.48030671663582325, "num_tokens": 126376014.0, "step": 872 }, { "entropy": 2.404052734375, "epoch": 0.015030603548462074, "grad_norm": 0.6064536571502686, "learning_rate": 8.720000000000001e-06, "loss": 2.3335, "mean_token_accuracy": 0.4924787334166467, "num_tokens": 126511198.0, "step": 873 }, { "entropy": 2.507080078125, "epoch": 0.015047820734657335, "grad_norm": 0.5994257926940918, "learning_rate": 8.730000000000001e-06, "loss": 2.4807, "mean_token_accuracy": 0.4721558247692883, "num_tokens": 126647001.0, "step": 874 }, { "entropy": 2.5140380859375, "epoch": 0.015065037920852595, "grad_norm": 0.5920009016990662, "learning_rate": 8.740000000000001e-06, "loss": 2.479, "mean_token_accuracy": 0.47505279863253236, "num_tokens": 126795132.0, "step": 875 }, { "entropy": 2.4464111328125, "epoch": 0.015082255107047856, "grad_norm": 0.6077152490615845, "learning_rate": 8.750000000000001e-06, "loss": 2.4088, "mean_token_accuracy": 0.48258367320522666, "num_tokens": 126943201.0, "step": 876 }, { "entropy": 2.421630859375, "epoch": 0.015099472293243115, "grad_norm": 0.5878798365592957, "learning_rate": 8.76e-06, "loss": 2.3334, "mean_token_accuracy": 0.48565139481797814, "num_tokens": 127089107.0, "step": 877 }, { "entropy": 2.4451904296875, "epoch": 0.015116689479438375, "grad_norm": 0.6210505962371826, "learning_rate": 8.77e-06, "loss": 2.3817, "mean_token_accuracy": 0.48540265718474984, "num_tokens": 127229750.0, "step": 878 }, { "entropy": 2.404541015625, "epoch": 0.015133906665633636, "grad_norm": 0.6188786029815674, "learning_rate": 8.78e-06, "loss": 2.3565, "mean_token_accuracy": 0.4887020909227431, "num_tokens": 127362276.0, "step": 879 }, { "entropy": 2.43212890625, "epoch": 0.015151123851828896, "grad_norm": 0.587872326374054, "learning_rate": 8.79e-06, "loss": 2.4133, "mean_token_accuracy": 0.48197965091094375, "num_tokens": 127499990.0, "step": 880 }, { "entropy": 2.486572265625, "epoch": 0.015168341038024155, "grad_norm": 0.6304271817207336, "learning_rate": 8.8e-06, "loss": 2.4624, "mean_token_accuracy": 0.47782522393390536, "num_tokens": 127642780.0, "step": 881 }, { "entropy": 2.51318359375, "epoch": 0.015185558224219416, "grad_norm": 0.5970308780670166, "learning_rate": 8.81e-06, "loss": 2.4563, "mean_token_accuracy": 0.47266413597390056, "num_tokens": 127792589.0, "step": 882 }, { "entropy": 2.486083984375, "epoch": 0.015202775410414676, "grad_norm": 0.558076024055481, "learning_rate": 8.82e-06, "loss": 2.4215, "mean_token_accuracy": 0.4797168541699648, "num_tokens": 127940654.0, "step": 883 }, { "entropy": 2.39892578125, "epoch": 0.015219992596609937, "grad_norm": 0.569231390953064, "learning_rate": 8.83e-06, "loss": 2.3439, "mean_token_accuracy": 0.48988029221072793, "num_tokens": 128082621.0, "step": 884 }, { "entropy": 2.4755859375, "epoch": 0.015237209782805195, "grad_norm": 0.5827202796936035, "learning_rate": 8.84e-06, "loss": 2.4142, "mean_token_accuracy": 0.4810488768853247, "num_tokens": 128215498.0, "step": 885 }, { "entropy": 2.5076904296875, "epoch": 0.015254426969000456, "grad_norm": 0.5866194367408752, "learning_rate": 8.85e-06, "loss": 2.4625, "mean_token_accuracy": 0.4744989378377795, "num_tokens": 128353838.0, "step": 886 }, { "entropy": 2.479736328125, "epoch": 0.015271644155195717, "grad_norm": 0.5743751525878906, "learning_rate": 8.860000000000002e-06, "loss": 2.4037, "mean_token_accuracy": 0.4828067999333143, "num_tokens": 128499102.0, "step": 887 }, { "entropy": 2.485595703125, "epoch": 0.015288861341390977, "grad_norm": 0.5429300665855408, "learning_rate": 8.870000000000001e-06, "loss": 2.4312, "mean_token_accuracy": 0.4705997440032661, "num_tokens": 128658894.0, "step": 888 }, { "entropy": 2.4361572265625, "epoch": 0.015306078527586236, "grad_norm": 0.5508224368095398, "learning_rate": 8.880000000000001e-06, "loss": 2.4386, "mean_token_accuracy": 0.48232552874833345, "num_tokens": 128803595.0, "step": 889 }, { "entropy": 2.453369140625, "epoch": 0.015323295713781496, "grad_norm": 0.5834963917732239, "learning_rate": 8.890000000000001e-06, "loss": 2.4298, "mean_token_accuracy": 0.4826282048597932, "num_tokens": 128939226.0, "step": 890 }, { "entropy": 2.508056640625, "epoch": 0.015340512899976757, "grad_norm": 0.5773516297340393, "learning_rate": 8.900000000000001e-06, "loss": 2.4476, "mean_token_accuracy": 0.4731542756780982, "num_tokens": 129073231.0, "step": 891 }, { "entropy": 2.526611328125, "epoch": 0.015357730086172017, "grad_norm": 0.6158662438392639, "learning_rate": 8.910000000000001e-06, "loss": 2.5126, "mean_token_accuracy": 0.47055464377626777, "num_tokens": 129212971.0, "step": 892 }, { "entropy": 2.49365234375, "epoch": 0.015374947272367276, "grad_norm": 0.571234941482544, "learning_rate": 8.920000000000001e-06, "loss": 2.4735, "mean_token_accuracy": 0.47626718133687973, "num_tokens": 129364837.0, "step": 893 }, { "entropy": 2.50927734375, "epoch": 0.015392164458562537, "grad_norm": 0.5566508173942566, "learning_rate": 8.930000000000001e-06, "loss": 2.4622, "mean_token_accuracy": 0.475893325638026, "num_tokens": 129513090.0, "step": 894 }, { "entropy": 2.4493408203125, "epoch": 0.015409381644757797, "grad_norm": 0.6352636814117432, "learning_rate": 8.94e-06, "loss": 2.4011, "mean_token_accuracy": 0.48302224883809686, "num_tokens": 129639016.0, "step": 895 }, { "entropy": 2.4149169921875, "epoch": 0.015426598830953058, "grad_norm": 0.6071867942810059, "learning_rate": 8.95e-06, "loss": 2.3388, "mean_token_accuracy": 0.48970347130671144, "num_tokens": 129775367.0, "step": 896 }, { "entropy": 2.477783203125, "epoch": 0.015443816017148317, "grad_norm": 0.6278470754623413, "learning_rate": 8.96e-06, "loss": 2.475, "mean_token_accuracy": 0.4722928828559816, "num_tokens": 129899393.0, "step": 897 }, { "entropy": 2.461669921875, "epoch": 0.015461033203343577, "grad_norm": 0.5411078929901123, "learning_rate": 8.97e-06, "loss": 2.4289, "mean_token_accuracy": 0.4794709859415889, "num_tokens": 130049335.0, "step": 898 }, { "entropy": 2.46728515625, "epoch": 0.015478250389538838, "grad_norm": 0.6227301955223083, "learning_rate": 8.98e-06, "loss": 2.4675, "mean_token_accuracy": 0.48002155777066946, "num_tokens": 130191241.0, "step": 899 }, { "entropy": 2.513916015625, "epoch": 0.015495467575734098, "grad_norm": 0.553110659122467, "learning_rate": 8.99e-06, "loss": 2.4838, "mean_token_accuracy": 0.47137136245146394, "num_tokens": 130348446.0, "step": 900 }, { "entropy": 2.462890625, "epoch": 0.015512684761929359, "grad_norm": 0.596051037311554, "learning_rate": 9e-06, "loss": 2.4365, "mean_token_accuracy": 0.48188950633630157, "num_tokens": 130487415.0, "step": 901 }, { "entropy": 2.442138671875, "epoch": 0.015529901948124617, "grad_norm": 0.6585226058959961, "learning_rate": 9.01e-06, "loss": 2.4141, "mean_token_accuracy": 0.4807861549779773, "num_tokens": 130614941.0, "step": 902 }, { "entropy": 2.4937744140625, "epoch": 0.015547119134319878, "grad_norm": 0.6055911779403687, "learning_rate": 9.020000000000002e-06, "loss": 2.4229, "mean_token_accuracy": 0.4837539931759238, "num_tokens": 130754310.0, "step": 903 }, { "entropy": 2.43115234375, "epoch": 0.015564336320515139, "grad_norm": 0.5959157347679138, "learning_rate": 9.030000000000002e-06, "loss": 2.3802, "mean_token_accuracy": 0.4859010446816683, "num_tokens": 130906157.0, "step": 904 }, { "entropy": 2.461181640625, "epoch": 0.015581553506710399, "grad_norm": 0.5708449482917786, "learning_rate": 9.040000000000002e-06, "loss": 2.4052, "mean_token_accuracy": 0.4815267431549728, "num_tokens": 131059316.0, "step": 905 }, { "entropy": 2.448974609375, "epoch": 0.015598770692905658, "grad_norm": 0.577296257019043, "learning_rate": 9.050000000000001e-06, "loss": 2.3894, "mean_token_accuracy": 0.4845569171011448, "num_tokens": 131205522.0, "step": 906 }, { "entropy": 2.41845703125, "epoch": 0.015615987879100918, "grad_norm": 0.60146164894104, "learning_rate": 9.060000000000001e-06, "loss": 2.3876, "mean_token_accuracy": 0.4812913998030126, "num_tokens": 131352873.0, "step": 907 }, { "entropy": 2.4771728515625, "epoch": 0.01563320506529618, "grad_norm": 0.6240298748016357, "learning_rate": 9.070000000000001e-06, "loss": 2.4327, "mean_token_accuracy": 0.4764832267537713, "num_tokens": 131486068.0, "step": 908 }, { "entropy": 2.4599609375, "epoch": 0.01565042225149144, "grad_norm": 1.2479300498962402, "learning_rate": 9.080000000000001e-06, "loss": 2.4198, "mean_token_accuracy": 0.47274223202839494, "num_tokens": 131625641.0, "step": 909 }, { "entropy": 2.379638671875, "epoch": 0.0156676394376867, "grad_norm": 0.6223964095115662, "learning_rate": 9.090000000000001e-06, "loss": 2.3421, "mean_token_accuracy": 0.4910487150773406, "num_tokens": 131774897.0, "step": 910 }, { "entropy": 2.4305419921875, "epoch": 0.01568485662388196, "grad_norm": 0.6342669725418091, "learning_rate": 9.100000000000001e-06, "loss": 2.4116, "mean_token_accuracy": 0.48323542159050703, "num_tokens": 131918941.0, "step": 911 }, { "entropy": 2.3966064453125, "epoch": 0.015702073810077218, "grad_norm": 0.5977234244346619, "learning_rate": 9.110000000000001e-06, "loss": 2.3407, "mean_token_accuracy": 0.4932427750900388, "num_tokens": 132058255.0, "step": 912 }, { "entropy": 2.4349365234375, "epoch": 0.015719290996272478, "grad_norm": 0.5736342072486877, "learning_rate": 9.12e-06, "loss": 2.3943, "mean_token_accuracy": 0.4837222811765969, "num_tokens": 132197276.0, "step": 913 }, { "entropy": 2.4132080078125, "epoch": 0.01573650818246774, "grad_norm": 0.6241025924682617, "learning_rate": 9.13e-06, "loss": 2.3856, "mean_token_accuracy": 0.48743872344493866, "num_tokens": 132337325.0, "step": 914 }, { "entropy": 2.41748046875, "epoch": 0.015753725368663, "grad_norm": 0.6098312735557556, "learning_rate": 9.14e-06, "loss": 2.374, "mean_token_accuracy": 0.4855101592838764, "num_tokens": 132486321.0, "step": 915 }, { "entropy": 2.479248046875, "epoch": 0.01577094255485826, "grad_norm": 0.5596804022789001, "learning_rate": 9.15e-06, "loss": 2.3876, "mean_token_accuracy": 0.47576976707205176, "num_tokens": 132647149.0, "step": 916 }, { "entropy": 2.4208984375, "epoch": 0.01578815974105352, "grad_norm": 0.5681569576263428, "learning_rate": 9.16e-06, "loss": 2.3797, "mean_token_accuracy": 0.48612832371145487, "num_tokens": 132795123.0, "step": 917 }, { "entropy": 2.4420166015625, "epoch": 0.01580537692724878, "grad_norm": 0.6165553331375122, "learning_rate": 9.17e-06, "loss": 2.4223, "mean_token_accuracy": 0.4839365719817579, "num_tokens": 132930289.0, "step": 918 }, { "entropy": 2.458740234375, "epoch": 0.01582259411344404, "grad_norm": 0.6469439268112183, "learning_rate": 9.180000000000002e-06, "loss": 2.4158, "mean_token_accuracy": 0.48115426022559404, "num_tokens": 133057834.0, "step": 919 }, { "entropy": 2.4097900390625, "epoch": 0.015839811299639298, "grad_norm": 0.5866466164588928, "learning_rate": 9.190000000000002e-06, "loss": 2.3269, "mean_token_accuracy": 0.48917456716299057, "num_tokens": 133200644.0, "step": 920 }, { "entropy": 2.463623046875, "epoch": 0.01585702848583456, "grad_norm": 0.5503515601158142, "learning_rate": 9.200000000000002e-06, "loss": 2.4136, "mean_token_accuracy": 0.478064242284745, "num_tokens": 133352710.0, "step": 921 }, { "entropy": 2.4775390625, "epoch": 0.01587424567202982, "grad_norm": 0.5962862968444824, "learning_rate": 9.210000000000002e-06, "loss": 2.454, "mean_token_accuracy": 0.47544111032038927, "num_tokens": 133501140.0, "step": 922 }, { "entropy": 2.502197265625, "epoch": 0.01589146285822508, "grad_norm": 0.6619735360145569, "learning_rate": 9.220000000000002e-06, "loss": 2.4128, "mean_token_accuracy": 0.47879257379099727, "num_tokens": 133618064.0, "step": 923 }, { "entropy": 2.3917236328125, "epoch": 0.01590868004442034, "grad_norm": 0.6243993639945984, "learning_rate": 9.230000000000001e-06, "loss": 2.3601, "mean_token_accuracy": 0.49083237862214446, "num_tokens": 133746355.0, "step": 924 }, { "entropy": 2.4593505859375, "epoch": 0.0159258972306156, "grad_norm": 0.5959250926971436, "learning_rate": 9.240000000000001e-06, "loss": 2.3893, "mean_token_accuracy": 0.487882892601192, "num_tokens": 133887812.0, "step": 925 }, { "entropy": 2.4609375, "epoch": 0.01594311441681086, "grad_norm": 0.6122800707817078, "learning_rate": 9.250000000000001e-06, "loss": 2.432, "mean_token_accuracy": 0.47797348722815514, "num_tokens": 134038997.0, "step": 926 }, { "entropy": 2.46875, "epoch": 0.015960331603006122, "grad_norm": 0.547160267829895, "learning_rate": 9.260000000000001e-06, "loss": 2.4407, "mean_token_accuracy": 0.47520409850403666, "num_tokens": 134198302.0, "step": 927 }, { "entropy": 2.48974609375, "epoch": 0.015977548789201382, "grad_norm": 0.5570476651191711, "learning_rate": 9.270000000000001e-06, "loss": 2.4886, "mean_token_accuracy": 0.47539124358445406, "num_tokens": 134342027.0, "step": 928 }, { "entropy": 2.41064453125, "epoch": 0.01599476597539664, "grad_norm": 0.5551185011863708, "learning_rate": 9.280000000000001e-06, "loss": 2.3801, "mean_token_accuracy": 0.48781104385852814, "num_tokens": 134495434.0, "step": 929 }, { "entropy": 2.4661865234375, "epoch": 0.0160119831615919, "grad_norm": 0.6195770502090454, "learning_rate": 9.29e-06, "loss": 2.4388, "mean_token_accuracy": 0.4786299457773566, "num_tokens": 134626002.0, "step": 930 }, { "entropy": 2.4622802734375, "epoch": 0.01602920034778716, "grad_norm": 0.6120036244392395, "learning_rate": 9.3e-06, "loss": 2.4435, "mean_token_accuracy": 0.47908638790249825, "num_tokens": 134778634.0, "step": 931 }, { "entropy": 2.5621337890625, "epoch": 0.01604641753398242, "grad_norm": 0.5915412306785583, "learning_rate": 9.31e-06, "loss": 2.5866, "mean_token_accuracy": 0.462941222358495, "num_tokens": 134929621.0, "step": 932 }, { "entropy": 2.41796875, "epoch": 0.01606363472017768, "grad_norm": 0.5830768346786499, "learning_rate": 9.32e-06, "loss": 2.355, "mean_token_accuracy": 0.4878848767839372, "num_tokens": 135086585.0, "step": 933 }, { "entropy": 2.353759765625, "epoch": 0.016080851906372942, "grad_norm": 0.6241956949234009, "learning_rate": 9.33e-06, "loss": 2.3049, "mean_token_accuracy": 0.49892251333221793, "num_tokens": 135231782.0, "step": 934 }, { "entropy": 2.406005859375, "epoch": 0.016098069092568203, "grad_norm": 0.5853713750839233, "learning_rate": 9.340000000000002e-06, "loss": 2.4124, "mean_token_accuracy": 0.4829076291061938, "num_tokens": 135378790.0, "step": 935 }, { "entropy": 2.5213623046875, "epoch": 0.016115286278763463, "grad_norm": 0.5981717109680176, "learning_rate": 9.350000000000002e-06, "loss": 2.5113, "mean_token_accuracy": 0.4728419524617493, "num_tokens": 135520311.0, "step": 936 }, { "entropy": 2.5010986328125, "epoch": 0.01613250346495872, "grad_norm": 0.5917856693267822, "learning_rate": 9.360000000000002e-06, "loss": 2.4518, "mean_token_accuracy": 0.46924112644046545, "num_tokens": 135669776.0, "step": 937 }, { "entropy": 2.42724609375, "epoch": 0.01614972065115398, "grad_norm": 0.5660749673843384, "learning_rate": 9.370000000000002e-06, "loss": 2.3848, "mean_token_accuracy": 0.4782256758771837, "num_tokens": 135821760.0, "step": 938 }, { "entropy": 2.498779296875, "epoch": 0.01616693783734924, "grad_norm": 0.5981243252754211, "learning_rate": 9.38e-06, "loss": 2.4628, "mean_token_accuracy": 0.4730893294326961, "num_tokens": 135956326.0, "step": 939 }, { "entropy": 2.3973388671875, "epoch": 0.016184155023544502, "grad_norm": 0.580276608467102, "learning_rate": 9.39e-06, "loss": 2.3663, "mean_token_accuracy": 0.4926355588249862, "num_tokens": 136101164.0, "step": 940 }, { "entropy": 2.470947265625, "epoch": 0.016201372209739762, "grad_norm": 0.5627194046974182, "learning_rate": 9.4e-06, "loss": 2.416, "mean_token_accuracy": 0.4773054295219481, "num_tokens": 136251505.0, "step": 941 }, { "entropy": 2.519775390625, "epoch": 0.016218589395935023, "grad_norm": 0.6778742671012878, "learning_rate": 9.41e-06, "loss": 2.4862, "mean_token_accuracy": 0.4745273180305958, "num_tokens": 136391079.0, "step": 942 }, { "entropy": 2.4498291015625, "epoch": 0.016235806582130283, "grad_norm": 0.582788348197937, "learning_rate": 9.42e-06, "loss": 2.3946, "mean_token_accuracy": 0.48375819250941277, "num_tokens": 136540299.0, "step": 943 }, { "entropy": 2.4471435546875, "epoch": 0.016253023768325544, "grad_norm": 0.5528161525726318, "learning_rate": 9.43e-06, "loss": 2.3405, "mean_token_accuracy": 0.47900286270305514, "num_tokens": 136688027.0, "step": 944 }, { "entropy": 2.429931640625, "epoch": 0.0162702409545208, "grad_norm": 0.6193938851356506, "learning_rate": 9.440000000000001e-06, "loss": 2.3861, "mean_token_accuracy": 0.4864586624316871, "num_tokens": 136823997.0, "step": 945 }, { "entropy": 2.4169921875, "epoch": 0.01628745814071606, "grad_norm": 0.5639959573745728, "learning_rate": 9.450000000000001e-06, "loss": 2.3965, "mean_token_accuracy": 0.4871432662475854, "num_tokens": 136967987.0, "step": 946 }, { "entropy": 2.4097900390625, "epoch": 0.016304675326911322, "grad_norm": 0.593999981880188, "learning_rate": 9.460000000000001e-06, "loss": 2.3956, "mean_token_accuracy": 0.48685387754812837, "num_tokens": 137120260.0, "step": 947 }, { "entropy": 2.4072265625, "epoch": 0.016321892513106583, "grad_norm": 0.5643085837364197, "learning_rate": 9.47e-06, "loss": 2.387, "mean_token_accuracy": 0.4881442333571613, "num_tokens": 137280017.0, "step": 948 }, { "entropy": 2.474853515625, "epoch": 0.016339109699301843, "grad_norm": 0.602178156375885, "learning_rate": 9.48e-06, "loss": 2.4429, "mean_token_accuracy": 0.4794820128008723, "num_tokens": 137431493.0, "step": 949 }, { "entropy": 2.43505859375, "epoch": 0.016356326885497104, "grad_norm": 0.5669793486595154, "learning_rate": 9.49e-06, "loss": 2.4303, "mean_token_accuracy": 0.4830532097257674, "num_tokens": 137575150.0, "step": 950 }, { "entropy": 2.4486083984375, "epoch": 0.016373544071692364, "grad_norm": 0.5908523201942444, "learning_rate": 9.5e-06, "loss": 2.4052, "mean_token_accuracy": 0.4807011899538338, "num_tokens": 137718243.0, "step": 951 }, { "entropy": 2.466552734375, "epoch": 0.016390761257887625, "grad_norm": 0.5942707061767578, "learning_rate": 9.51e-06, "loss": 2.4347, "mean_token_accuracy": 0.47939926059916615, "num_tokens": 137855181.0, "step": 952 }, { "entropy": 2.43359375, "epoch": 0.016407978444082885, "grad_norm": 0.6280327439308167, "learning_rate": 9.52e-06, "loss": 2.4106, "mean_token_accuracy": 0.48289193166419864, "num_tokens": 137984833.0, "step": 953 }, { "entropy": 2.4107666015625, "epoch": 0.016425195630278142, "grad_norm": 0.6055025458335876, "learning_rate": 9.53e-06, "loss": 2.3538, "mean_token_accuracy": 0.48937113396823406, "num_tokens": 138132043.0, "step": 954 }, { "entropy": 2.403076171875, "epoch": 0.016442412816473403, "grad_norm": 0.557295024394989, "learning_rate": 9.54e-06, "loss": 2.3975, "mean_token_accuracy": 0.48123540729284286, "num_tokens": 138295818.0, "step": 955 }, { "entropy": 2.475830078125, "epoch": 0.016459630002668663, "grad_norm": 0.6141370534896851, "learning_rate": 9.55e-06, "loss": 2.4164, "mean_token_accuracy": 0.47790815867483616, "num_tokens": 138437598.0, "step": 956 }, { "entropy": 2.3380126953125, "epoch": 0.016476847188863924, "grad_norm": 0.5739216208457947, "learning_rate": 9.56e-06, "loss": 2.277, "mean_token_accuracy": 0.5020720390602946, "num_tokens": 138579286.0, "step": 957 }, { "entropy": 2.482177734375, "epoch": 0.016494064375059184, "grad_norm": 0.5679910778999329, "learning_rate": 9.57e-06, "loss": 2.4916, "mean_token_accuracy": 0.4723518299870193, "num_tokens": 138737250.0, "step": 958 }, { "entropy": 2.428466796875, "epoch": 0.016511281561254445, "grad_norm": 0.5608857274055481, "learning_rate": 9.58e-06, "loss": 2.3651, "mean_token_accuracy": 0.48081721225753427, "num_tokens": 138887535.0, "step": 959 }, { "entropy": 2.45068359375, "epoch": 0.016528498747449705, "grad_norm": 0.5733979344367981, "learning_rate": 9.59e-06, "loss": 2.3793, "mean_token_accuracy": 0.4807218159548938, "num_tokens": 139046347.0, "step": 960 }, { "entropy": 2.4381103515625, "epoch": 0.016545715933644966, "grad_norm": 0.5601542592048645, "learning_rate": 9.600000000000001e-06, "loss": 2.3687, "mean_token_accuracy": 0.48160395165905356, "num_tokens": 139201178.0, "step": 961 }, { "entropy": 2.4107666015625, "epoch": 0.016562933119840223, "grad_norm": 0.5981694459915161, "learning_rate": 9.610000000000001e-06, "loss": 2.3668, "mean_token_accuracy": 0.48300402937456965, "num_tokens": 139337844.0, "step": 962 }, { "entropy": 2.4951171875, "epoch": 0.016580150306035484, "grad_norm": 0.5836125016212463, "learning_rate": 9.620000000000001e-06, "loss": 2.4706, "mean_token_accuracy": 0.467432489618659, "num_tokens": 139469993.0, "step": 963 }, { "entropy": 2.443603515625, "epoch": 0.016597367492230744, "grad_norm": 0.5777595043182373, "learning_rate": 9.630000000000001e-06, "loss": 2.3732, "mean_token_accuracy": 0.48406707495450974, "num_tokens": 139628918.0, "step": 964 }, { "entropy": 2.448974609375, "epoch": 0.016614584678426005, "grad_norm": 0.5910548567771912, "learning_rate": 9.640000000000001e-06, "loss": 2.4025, "mean_token_accuracy": 0.48476587794721127, "num_tokens": 139779064.0, "step": 965 }, { "entropy": 2.4569091796875, "epoch": 0.016631801864621265, "grad_norm": 0.6000712513923645, "learning_rate": 9.65e-06, "loss": 2.4165, "mean_token_accuracy": 0.4791805609129369, "num_tokens": 139944530.0, "step": 966 }, { "entropy": 2.45849609375, "epoch": 0.016649019050816526, "grad_norm": 0.5988768935203552, "learning_rate": 9.66e-06, "loss": 2.3822, "mean_token_accuracy": 0.4866671049967408, "num_tokens": 140083921.0, "step": 967 }, { "entropy": 2.46728515625, "epoch": 0.016666236237011786, "grad_norm": 0.5854329466819763, "learning_rate": 9.67e-06, "loss": 2.3907, "mean_token_accuracy": 0.47991981683298945, "num_tokens": 140224971.0, "step": 968 }, { "entropy": 2.533447265625, "epoch": 0.016683453423207047, "grad_norm": 0.6449822187423706, "learning_rate": 9.68e-06, "loss": 2.5346, "mean_token_accuracy": 0.4715711669996381, "num_tokens": 140353765.0, "step": 969 }, { "entropy": 2.4517822265625, "epoch": 0.016700670609402304, "grad_norm": 0.562483549118042, "learning_rate": 9.69e-06, "loss": 2.4227, "mean_token_accuracy": 0.47965225437656045, "num_tokens": 140498830.0, "step": 970 }, { "entropy": 2.4345703125, "epoch": 0.016717887795597564, "grad_norm": 0.5308300852775574, "learning_rate": 9.7e-06, "loss": 2.4248, "mean_token_accuracy": 0.47759495256468654, "num_tokens": 140665634.0, "step": 971 }, { "entropy": 2.492919921875, "epoch": 0.016735104981792825, "grad_norm": 0.9326145052909851, "learning_rate": 9.71e-06, "loss": 2.4872, "mean_token_accuracy": 0.468785522505641, "num_tokens": 140802529.0, "step": 972 }, { "entropy": 2.37744140625, "epoch": 0.016752322167988085, "grad_norm": 0.608925998210907, "learning_rate": 9.72e-06, "loss": 2.3317, "mean_token_accuracy": 0.4957911465317011, "num_tokens": 140941381.0, "step": 973 }, { "entropy": 2.4649658203125, "epoch": 0.016769539354183346, "grad_norm": 0.5866920948028564, "learning_rate": 9.73e-06, "loss": 2.4661, "mean_token_accuracy": 0.4737902185879648, "num_tokens": 141074013.0, "step": 974 }, { "entropy": 2.52001953125, "epoch": 0.016786756540378606, "grad_norm": 0.6548328399658203, "learning_rate": 9.74e-06, "loss": 2.5289, "mean_token_accuracy": 0.4689825112000108, "num_tokens": 141211732.0, "step": 975 }, { "entropy": 2.457275390625, "epoch": 0.016803973726573867, "grad_norm": 0.5622747540473938, "learning_rate": 9.75e-06, "loss": 2.4627, "mean_token_accuracy": 0.474064817186445, "num_tokens": 141361499.0, "step": 976 }, { "entropy": 2.4368896484375, "epoch": 0.016821190912769127, "grad_norm": 0.5637742280960083, "learning_rate": 9.760000000000001e-06, "loss": 2.4061, "mean_token_accuracy": 0.4773513269610703, "num_tokens": 141516957.0, "step": 977 }, { "entropy": 2.357666015625, "epoch": 0.016838408098964388, "grad_norm": 0.6168224215507507, "learning_rate": 9.770000000000001e-06, "loss": 2.3441, "mean_token_accuracy": 0.4971775123849511, "num_tokens": 141659725.0, "step": 978 }, { "entropy": 2.482666015625, "epoch": 0.016855625285159645, "grad_norm": 0.5854248404502869, "learning_rate": 9.780000000000001e-06, "loss": 2.4517, "mean_token_accuracy": 0.4778465088456869, "num_tokens": 141812273.0, "step": 979 }, { "entropy": 2.5281982421875, "epoch": 0.016872842471354906, "grad_norm": 0.5742073059082031, "learning_rate": 9.790000000000001e-06, "loss": 2.5098, "mean_token_accuracy": 0.47218647226691246, "num_tokens": 141952803.0, "step": 980 }, { "entropy": 2.4508056640625, "epoch": 0.016890059657550166, "grad_norm": 0.7346315383911133, "learning_rate": 9.800000000000001e-06, "loss": 2.3992, "mean_token_accuracy": 0.4813456032425165, "num_tokens": 142105077.0, "step": 981 }, { "entropy": 2.3946533203125, "epoch": 0.016907276843745427, "grad_norm": 0.613284170627594, "learning_rate": 9.810000000000001e-06, "loss": 2.3475, "mean_token_accuracy": 0.49409126583486795, "num_tokens": 142252173.0, "step": 982 }, { "entropy": 2.473876953125, "epoch": 0.016924494029940687, "grad_norm": 0.5658174753189087, "learning_rate": 9.820000000000001e-06, "loss": 2.459, "mean_token_accuracy": 0.47720590187236667, "num_tokens": 142396038.0, "step": 983 }, { "entropy": 2.395751953125, "epoch": 0.016941711216135948, "grad_norm": 0.5382007956504822, "learning_rate": 9.83e-06, "loss": 2.373, "mean_token_accuracy": 0.4882864346727729, "num_tokens": 142564149.0, "step": 984 }, { "entropy": 2.482666015625, "epoch": 0.016958928402331208, "grad_norm": 0.5813350677490234, "learning_rate": 9.84e-06, "loss": 2.4836, "mean_token_accuracy": 0.4725424610078335, "num_tokens": 142701037.0, "step": 985 }, { "entropy": 2.487060546875, "epoch": 0.01697614558852647, "grad_norm": 0.5810776948928833, "learning_rate": 9.85e-06, "loss": 2.4691, "mean_token_accuracy": 0.48016467317938805, "num_tokens": 142855073.0, "step": 986 }, { "entropy": 2.460693359375, "epoch": 0.016993362774721726, "grad_norm": 0.5684233903884888, "learning_rate": 9.86e-06, "loss": 2.4004, "mean_token_accuracy": 0.47402910608798265, "num_tokens": 143006703.0, "step": 987 }, { "entropy": 2.426513671875, "epoch": 0.017010579960916986, "grad_norm": 0.5875155925750732, "learning_rate": 9.87e-06, "loss": 2.4032, "mean_token_accuracy": 0.4833544669672847, "num_tokens": 143158319.0, "step": 988 }, { "entropy": 2.3924560546875, "epoch": 0.017027797147112247, "grad_norm": 0.9405050873756409, "learning_rate": 9.88e-06, "loss": 2.3455, "mean_token_accuracy": 0.4880230622366071, "num_tokens": 143281917.0, "step": 989 }, { "entropy": 2.43017578125, "epoch": 0.017045014333307507, "grad_norm": 0.593075156211853, "learning_rate": 9.89e-06, "loss": 2.3675, "mean_token_accuracy": 0.48371881106868386, "num_tokens": 143424058.0, "step": 990 }, { "entropy": 2.4437255859375, "epoch": 0.017062231519502768, "grad_norm": 0.5774204134941101, "learning_rate": 9.9e-06, "loss": 2.4004, "mean_token_accuracy": 0.47859969455748796, "num_tokens": 143567199.0, "step": 991 }, { "entropy": 2.468505859375, "epoch": 0.01707944870569803, "grad_norm": 0.6035756468772888, "learning_rate": 9.91e-06, "loss": 2.3898, "mean_token_accuracy": 0.48759705713018775, "num_tokens": 143712988.0, "step": 992 }, { "entropy": 2.4527587890625, "epoch": 0.01709666589189329, "grad_norm": 0.5580205321311951, "learning_rate": 9.920000000000002e-06, "loss": 2.4491, "mean_token_accuracy": 0.47088390588760376, "num_tokens": 143861050.0, "step": 993 }, { "entropy": 2.4556884765625, "epoch": 0.01711388307808855, "grad_norm": 0.5672869086265564, "learning_rate": 9.930000000000001e-06, "loss": 2.4092, "mean_token_accuracy": 0.48082907358184457, "num_tokens": 144017646.0, "step": 994 }, { "entropy": 2.445556640625, "epoch": 0.017131100264283806, "grad_norm": 0.6213098764419556, "learning_rate": 9.940000000000001e-06, "loss": 2.4035, "mean_token_accuracy": 0.48345539439469576, "num_tokens": 144150604.0, "step": 995 }, { "entropy": 2.489990234375, "epoch": 0.017148317450479067, "grad_norm": 0.5691782832145691, "learning_rate": 9.950000000000001e-06, "loss": 2.4575, "mean_token_accuracy": 0.4700373010709882, "num_tokens": 144292661.0, "step": 996 }, { "entropy": 2.454833984375, "epoch": 0.017165534636674327, "grad_norm": 0.5717594027519226, "learning_rate": 9.960000000000001e-06, "loss": 2.418, "mean_token_accuracy": 0.48314423533156514, "num_tokens": 144445246.0, "step": 997 }, { "entropy": 2.4835205078125, "epoch": 0.017182751822869588, "grad_norm": 0.5863724946975708, "learning_rate": 9.970000000000001e-06, "loss": 2.4627, "mean_token_accuracy": 0.4698511818423867, "num_tokens": 144589498.0, "step": 998 }, { "entropy": 2.441650390625, "epoch": 0.01719996900906485, "grad_norm": 0.770044207572937, "learning_rate": 9.980000000000001e-06, "loss": 2.3511, "mean_token_accuracy": 0.4848445672541857, "num_tokens": 144744013.0, "step": 999 }, { "entropy": 2.443115234375, "epoch": 0.01721718619526011, "grad_norm": 0.6044683456420898, "learning_rate": 9.990000000000001e-06, "loss": 2.411, "mean_token_accuracy": 0.4802071452140808, "num_tokens": 144878720.0, "step": 1000 }, { "entropy": 2.4459228515625, "epoch": 0.01723440338145537, "grad_norm": 0.5749707221984863, "learning_rate": 1e-05, "loss": 2.3881, "mean_token_accuracy": 0.480140985455364, "num_tokens": 145028328.0, "step": 1001 }, { "entropy": 2.55908203125, "epoch": 0.01725162056765063, "grad_norm": 0.6056835651397705, "learning_rate": 9.999999992427464e-06, "loss": 2.5261, "mean_token_accuracy": 0.46429841918870807, "num_tokens": 145177909.0, "step": 1002 }, { "entropy": 2.475830078125, "epoch": 0.01726883775384589, "grad_norm": 0.6159217357635498, "learning_rate": 9.999999969709854e-06, "loss": 2.4367, "mean_token_accuracy": 0.4752111411653459, "num_tokens": 145331073.0, "step": 1003 }, { "entropy": 2.4053955078125, "epoch": 0.017286054940041148, "grad_norm": 0.5820682644844055, "learning_rate": 9.999999931847169e-06, "loss": 2.3818, "mean_token_accuracy": 0.4844290711916983, "num_tokens": 145480948.0, "step": 1004 }, { "entropy": 2.432861328125, "epoch": 0.017303272126236408, "grad_norm": 0.5615769624710083, "learning_rate": 9.999999878839412e-06, "loss": 2.3797, "mean_token_accuracy": 0.4817391145043075, "num_tokens": 145615367.0, "step": 1005 }, { "entropy": 2.3687744140625, "epoch": 0.01732048931243167, "grad_norm": 0.5944392085075378, "learning_rate": 9.999999810686582e-06, "loss": 2.3464, "mean_token_accuracy": 0.4950461401604116, "num_tokens": 145751257.0, "step": 1006 }, { "entropy": 2.491455078125, "epoch": 0.01733770649862693, "grad_norm": 0.5785252451896667, "learning_rate": 9.99999972738868e-06, "loss": 2.4419, "mean_token_accuracy": 0.4759130119346082, "num_tokens": 145881989.0, "step": 1007 }, { "entropy": 2.45263671875, "epoch": 0.01735492368482219, "grad_norm": 0.573020339012146, "learning_rate": 9.999999628945702e-06, "loss": 2.4126, "mean_token_accuracy": 0.47999184019863605, "num_tokens": 146033406.0, "step": 1008 }, { "entropy": 2.474365234375, "epoch": 0.01737214087101745, "grad_norm": 0.5616711378097534, "learning_rate": 9.999999515357654e-06, "loss": 2.4179, "mean_token_accuracy": 0.47331848414614797, "num_tokens": 146185163.0, "step": 1009 }, { "entropy": 2.3839111328125, "epoch": 0.01738935805721271, "grad_norm": 0.5724861025810242, "learning_rate": 9.999999386624534e-06, "loss": 2.338, "mean_token_accuracy": 0.4900155235081911, "num_tokens": 146335979.0, "step": 1010 }, { "entropy": 2.397216796875, "epoch": 0.01740657524340797, "grad_norm": 0.566051185131073, "learning_rate": 9.99999924274634e-06, "loss": 2.3623, "mean_token_accuracy": 0.49173881486058235, "num_tokens": 146492297.0, "step": 1011 }, { "entropy": 2.425537109375, "epoch": 0.01742379242960323, "grad_norm": 0.5698406100273132, "learning_rate": 9.999999083723077e-06, "loss": 2.3803, "mean_token_accuracy": 0.4822213868610561, "num_tokens": 146638961.0, "step": 1012 }, { "entropy": 2.457763671875, "epoch": 0.01744100961579849, "grad_norm": 0.5576664805412292, "learning_rate": 9.999998909554743e-06, "loss": 2.4175, "mean_token_accuracy": 0.47678716061636806, "num_tokens": 146785199.0, "step": 1013 }, { "entropy": 2.45947265625, "epoch": 0.01745822680199375, "grad_norm": 0.5309212803840637, "learning_rate": 9.99999872024134e-06, "loss": 2.4368, "mean_token_accuracy": 0.4784322748892009, "num_tokens": 146940920.0, "step": 1014 }, { "entropy": 2.443603515625, "epoch": 0.01747544398818901, "grad_norm": 0.6436007022857666, "learning_rate": 9.999998515782865e-06, "loss": 2.3972, "mean_token_accuracy": 0.4809753801673651, "num_tokens": 147096034.0, "step": 1015 }, { "entropy": 2.4503173828125, "epoch": 0.01749266117438427, "grad_norm": 0.5794939398765564, "learning_rate": 9.99999829617932e-06, "loss": 2.4099, "mean_token_accuracy": 0.47518764482811093, "num_tokens": 147241716.0, "step": 1016 }, { "entropy": 2.4049072265625, "epoch": 0.01750987836057953, "grad_norm": 0.6116197109222412, "learning_rate": 9.999998061430709e-06, "loss": 2.3468, "mean_token_accuracy": 0.4926842013373971, "num_tokens": 147378764.0, "step": 1017 }, { "entropy": 2.47314453125, "epoch": 0.01752709554677479, "grad_norm": 0.5858633518218994, "learning_rate": 9.99999781153703e-06, "loss": 2.3979, "mean_token_accuracy": 0.47995937149971724, "num_tokens": 147517319.0, "step": 1018 }, { "entropy": 2.5513916015625, "epoch": 0.017544312732970052, "grad_norm": 0.6136896014213562, "learning_rate": 9.999997546498284e-06, "loss": 2.5054, "mean_token_accuracy": 0.46979639306664467, "num_tokens": 147657307.0, "step": 1019 }, { "entropy": 2.4195556640625, "epoch": 0.01756152991916531, "grad_norm": 0.593510091304779, "learning_rate": 9.999997266314471e-06, "loss": 2.3881, "mean_token_accuracy": 0.48458396503701806, "num_tokens": 147806252.0, "step": 1020 }, { "entropy": 2.41064453125, "epoch": 0.01757874710536057, "grad_norm": 0.675787091255188, "learning_rate": 9.999996970985592e-06, "loss": 2.3654, "mean_token_accuracy": 0.48551547806710005, "num_tokens": 147979427.0, "step": 1021 }, { "entropy": 2.46533203125, "epoch": 0.01759596429155583, "grad_norm": 0.6314930319786072, "learning_rate": 9.99999666051165e-06, "loss": 2.4691, "mean_token_accuracy": 0.4804490995593369, "num_tokens": 148114782.0, "step": 1022 }, { "entropy": 2.4417724609375, "epoch": 0.01761318147775109, "grad_norm": 0.579401969909668, "learning_rate": 9.999996334892646e-06, "loss": 2.4204, "mean_token_accuracy": 0.482807548251003, "num_tokens": 148259581.0, "step": 1023 }, { "entropy": 2.452880859375, "epoch": 0.01763039866394635, "grad_norm": 0.5902701616287231, "learning_rate": 9.999995994128578e-06, "loss": 2.4434, "mean_token_accuracy": 0.47913903277367353, "num_tokens": 148400114.0, "step": 1024 }, { "entropy": 2.482666015625, "epoch": 0.017647615850141612, "grad_norm": 0.5950527787208557, "learning_rate": 9.999995638219448e-06, "loss": 2.4566, "mean_token_accuracy": 0.4763036798685789, "num_tokens": 148540440.0, "step": 1025 }, { "entropy": 2.446533203125, "epoch": 0.017664833036336872, "grad_norm": 0.6330975294113159, "learning_rate": 9.999995267165256e-06, "loss": 2.3904, "mean_token_accuracy": 0.4806701922789216, "num_tokens": 148692789.0, "step": 1026 }, { "entropy": 2.427978515625, "epoch": 0.017682050222532133, "grad_norm": 0.617744505405426, "learning_rate": 9.999994880966008e-06, "loss": 2.369, "mean_token_accuracy": 0.4830102613195777, "num_tokens": 148836617.0, "step": 1027 }, { "entropy": 2.42822265625, "epoch": 0.017699267408727393, "grad_norm": 0.6210005879402161, "learning_rate": 9.999994479621703e-06, "loss": 2.3569, "mean_token_accuracy": 0.49285168340429664, "num_tokens": 148982058.0, "step": 1028 }, { "entropy": 2.52392578125, "epoch": 0.01771648459492265, "grad_norm": 0.5944798588752747, "learning_rate": 9.999994063132336e-06, "loss": 2.4881, "mean_token_accuracy": 0.46862922469154, "num_tokens": 149117257.0, "step": 1029 }, { "entropy": 2.4248046875, "epoch": 0.01773370178111791, "grad_norm": 0.603406548500061, "learning_rate": 9.999993631497918e-06, "loss": 2.3968, "mean_token_accuracy": 0.4805483613163233, "num_tokens": 149253194.0, "step": 1030 }, { "entropy": 2.48681640625, "epoch": 0.01775091896731317, "grad_norm": 0.5487124919891357, "learning_rate": 9.999993184718445e-06, "loss": 2.4613, "mean_token_accuracy": 0.4695818484760821, "num_tokens": 149418815.0, "step": 1031 }, { "entropy": 2.3785400390625, "epoch": 0.017768136153508432, "grad_norm": 0.5934000015258789, "learning_rate": 9.999992722793916e-06, "loss": 2.3197, "mean_token_accuracy": 0.4929351657629013, "num_tokens": 149565302.0, "step": 1032 }, { "entropy": 2.41943359375, "epoch": 0.017785353339703693, "grad_norm": 0.5832322835922241, "learning_rate": 9.999992245724338e-06, "loss": 2.3851, "mean_token_accuracy": 0.4843961731530726, "num_tokens": 149701523.0, "step": 1033 }, { "entropy": 2.477294921875, "epoch": 0.017802570525898953, "grad_norm": 0.578316330909729, "learning_rate": 9.99999175350971e-06, "loss": 2.4705, "mean_token_accuracy": 0.4734199936501682, "num_tokens": 149841173.0, "step": 1034 }, { "entropy": 2.496337890625, "epoch": 0.017819787712094214, "grad_norm": 1.1719509363174438, "learning_rate": 9.999991246150034e-06, "loss": 2.469, "mean_token_accuracy": 0.4775820942595601, "num_tokens": 149977715.0, "step": 1035 }, { "entropy": 2.458251953125, "epoch": 0.017837004898289474, "grad_norm": 0.5495033860206604, "learning_rate": 9.999990723645309e-06, "loss": 2.4407, "mean_token_accuracy": 0.47903554420918226, "num_tokens": 150124141.0, "step": 1036 }, { "entropy": 2.453857421875, "epoch": 0.01785422208448473, "grad_norm": 0.6319074034690857, "learning_rate": 9.99999018599554e-06, "loss": 2.4007, "mean_token_accuracy": 0.4762126957066357, "num_tokens": 150278398.0, "step": 1037 }, { "entropy": 2.435791015625, "epoch": 0.01787143927067999, "grad_norm": 0.6012912392616272, "learning_rate": 9.999989633200726e-06, "loss": 2.3693, "mean_token_accuracy": 0.490336746443063, "num_tokens": 150415689.0, "step": 1038 }, { "entropy": 2.431640625, "epoch": 0.017888656456875252, "grad_norm": 0.5937034487724304, "learning_rate": 9.99998906526087e-06, "loss": 2.381, "mean_token_accuracy": 0.48564466880634427, "num_tokens": 150565533.0, "step": 1039 }, { "entropy": 2.541259765625, "epoch": 0.017905873643070513, "grad_norm": 0.5868061780929565, "learning_rate": 9.999988482175975e-06, "loss": 2.485, "mean_token_accuracy": 0.4694917304441333, "num_tokens": 150703215.0, "step": 1040 }, { "entropy": 2.51025390625, "epoch": 0.017923090829265773, "grad_norm": 0.5923919677734375, "learning_rate": 9.99998788394604e-06, "loss": 2.5, "mean_token_accuracy": 0.47092792578041553, "num_tokens": 150846937.0, "step": 1041 }, { "entropy": 2.44384765625, "epoch": 0.017940308015461034, "grad_norm": 0.6263927817344666, "learning_rate": 9.999987270571067e-06, "loss": 2.4557, "mean_token_accuracy": 0.4760148832574487, "num_tokens": 150988381.0, "step": 1042 }, { "entropy": 2.388427734375, "epoch": 0.017957525201656294, "grad_norm": 0.5355521440505981, "learning_rate": 9.999986642051061e-06, "loss": 2.3634, "mean_token_accuracy": 0.4859662549570203, "num_tokens": 151137551.0, "step": 1043 }, { "entropy": 2.41552734375, "epoch": 0.017974742387851555, "grad_norm": 0.615512490272522, "learning_rate": 9.999985998386022e-06, "loss": 2.4254, "mean_token_accuracy": 0.4817372509278357, "num_tokens": 151280397.0, "step": 1044 }, { "entropy": 2.496826171875, "epoch": 0.017991959574046812, "grad_norm": 0.543178379535675, "learning_rate": 9.99998533957595e-06, "loss": 2.4526, "mean_token_accuracy": 0.4679850875400007, "num_tokens": 151423217.0, "step": 1045 }, { "entropy": 2.487548828125, "epoch": 0.018009176760242072, "grad_norm": 0.6349004507064819, "learning_rate": 9.999984665620852e-06, "loss": 2.4205, "mean_token_accuracy": 0.4785635150037706, "num_tokens": 151566681.0, "step": 1046 }, { "entropy": 2.467041015625, "epoch": 0.018026393946437333, "grad_norm": 0.5757213234901428, "learning_rate": 9.999983976520725e-06, "loss": 2.3992, "mean_token_accuracy": 0.48239869019016623, "num_tokens": 151722733.0, "step": 1047 }, { "entropy": 2.3770751953125, "epoch": 0.018043611132632593, "grad_norm": 0.5888438820838928, "learning_rate": 9.999983272275572e-06, "loss": 2.3367, "mean_token_accuracy": 0.488128668628633, "num_tokens": 151878057.0, "step": 1048 }, { "entropy": 2.40380859375, "epoch": 0.018060828318827854, "grad_norm": 0.6084235906600952, "learning_rate": 9.999982552885396e-06, "loss": 2.4194, "mean_token_accuracy": 0.48174417577683926, "num_tokens": 152013813.0, "step": 1049 }, { "entropy": 2.525390625, "epoch": 0.018078045505023115, "grad_norm": 0.6073904633522034, "learning_rate": 9.999981818350201e-06, "loss": 2.4913, "mean_token_accuracy": 0.470255627296865, "num_tokens": 152145871.0, "step": 1050 }, { "entropy": 2.4578857421875, "epoch": 0.018095262691218375, "grad_norm": 0.5444018840789795, "learning_rate": 9.999981068669986e-06, "loss": 2.4527, "mean_token_accuracy": 0.47973277885466814, "num_tokens": 152304784.0, "step": 1051 }, { "entropy": 2.47509765625, "epoch": 0.018112479877413636, "grad_norm": 0.5837482213973999, "learning_rate": 9.999980303844756e-06, "loss": 2.4489, "mean_token_accuracy": 0.47514383727684617, "num_tokens": 152453018.0, "step": 1052 }, { "entropy": 2.4283447265625, "epoch": 0.018129697063608896, "grad_norm": 2.452045202255249, "learning_rate": 9.999979523874513e-06, "loss": 2.3956, "mean_token_accuracy": 0.48897374235093594, "num_tokens": 152612683.0, "step": 1053 }, { "entropy": 2.404541015625, "epoch": 0.018146914249804153, "grad_norm": 0.5417919754981995, "learning_rate": 9.999978728759256e-06, "loss": 2.3388, "mean_token_accuracy": 0.4870688715018332, "num_tokens": 152762264.0, "step": 1054 }, { "entropy": 2.4012451171875, "epoch": 0.018164131435999414, "grad_norm": 0.5944674611091614, "learning_rate": 9.999977918498992e-06, "loss": 2.3714, "mean_token_accuracy": 0.4950359701178968, "num_tokens": 152906647.0, "step": 1055 }, { "entropy": 2.453857421875, "epoch": 0.018181348622194674, "grad_norm": 0.5601978302001953, "learning_rate": 9.99997709309372e-06, "loss": 2.3945, "mean_token_accuracy": 0.4819331760518253, "num_tokens": 153047278.0, "step": 1056 }, { "entropy": 2.5194091796875, "epoch": 0.018198565808389935, "grad_norm": 0.6144431233406067, "learning_rate": 9.999976252543444e-06, "loss": 2.4835, "mean_token_accuracy": 0.4768884154036641, "num_tokens": 153186932.0, "step": 1057 }, { "entropy": 2.403564453125, "epoch": 0.018215782994585195, "grad_norm": 0.9560090899467468, "learning_rate": 9.999975396848167e-06, "loss": 2.3291, "mean_token_accuracy": 0.4874680512584746, "num_tokens": 153341046.0, "step": 1058 }, { "entropy": 2.431884765625, "epoch": 0.018233000180780456, "grad_norm": 0.6547354459762573, "learning_rate": 9.99997452600789e-06, "loss": 2.4077, "mean_token_accuracy": 0.48102515283972025, "num_tokens": 153486117.0, "step": 1059 }, { "entropy": 2.381103515625, "epoch": 0.018250217366975716, "grad_norm": 0.5701848268508911, "learning_rate": 9.999973640022618e-06, "loss": 2.3703, "mean_token_accuracy": 0.4909746255725622, "num_tokens": 153635642.0, "step": 1060 }, { "entropy": 2.427978515625, "epoch": 0.018267434553170977, "grad_norm": 0.5640613436698914, "learning_rate": 9.99997273889235e-06, "loss": 2.3833, "mean_token_accuracy": 0.483435302041471, "num_tokens": 153778267.0, "step": 1061 }, { "entropy": 2.54736328125, "epoch": 0.018284651739366234, "grad_norm": 0.5851112008094788, "learning_rate": 9.999971822617094e-06, "loss": 2.5209, "mean_token_accuracy": 0.4660449046641588, "num_tokens": 153920352.0, "step": 1062 }, { "entropy": 2.47998046875, "epoch": 0.018301868925561494, "grad_norm": 0.5928635001182556, "learning_rate": 9.999970891196847e-06, "loss": 2.4422, "mean_token_accuracy": 0.47202530410140753, "num_tokens": 154064892.0, "step": 1063 }, { "entropy": 2.507080078125, "epoch": 0.018319086111756755, "grad_norm": 0.6197124719619751, "learning_rate": 9.999969944631615e-06, "loss": 2.4772, "mean_token_accuracy": 0.4731879197061062, "num_tokens": 154215041.0, "step": 1064 }, { "entropy": 2.46923828125, "epoch": 0.018336303297952015, "grad_norm": 0.5558657646179199, "learning_rate": 9.999968982921403e-06, "loss": 2.4352, "mean_token_accuracy": 0.47735925391316414, "num_tokens": 154367730.0, "step": 1065 }, { "entropy": 2.430419921875, "epoch": 0.018353520484147276, "grad_norm": 0.6059717535972595, "learning_rate": 9.99996800606621e-06, "loss": 2.3824, "mean_token_accuracy": 0.4844053997658193, "num_tokens": 154505107.0, "step": 1066 }, { "entropy": 2.4879150390625, "epoch": 0.018370737670342537, "grad_norm": 0.5660464763641357, "learning_rate": 9.999967014066038e-06, "loss": 2.4352, "mean_token_accuracy": 0.476345878560096, "num_tokens": 154643818.0, "step": 1067 }, { "entropy": 2.4345703125, "epoch": 0.018387954856537797, "grad_norm": 0.701174795627594, "learning_rate": 9.999966006920896e-06, "loss": 2.416, "mean_token_accuracy": 0.48350341338664293, "num_tokens": 154785623.0, "step": 1068 }, { "entropy": 2.48486328125, "epoch": 0.018405172042733058, "grad_norm": 0.6571546196937561, "learning_rate": 9.99996498463078e-06, "loss": 2.4509, "mean_token_accuracy": 0.47686226665973663, "num_tokens": 154947202.0, "step": 1069 }, { "entropy": 2.4830322265625, "epoch": 0.018422389228928315, "grad_norm": 0.6001805067062378, "learning_rate": 9.999963947195699e-06, "loss": 2.4316, "mean_token_accuracy": 0.47862283093854785, "num_tokens": 155097645.0, "step": 1070 }, { "entropy": 2.4422607421875, "epoch": 0.018439606415123575, "grad_norm": 0.6199787259101868, "learning_rate": 9.999962894615653e-06, "loss": 2.4132, "mean_token_accuracy": 0.48402009485289454, "num_tokens": 155229538.0, "step": 1071 }, { "entropy": 2.408203125, "epoch": 0.018456823601318836, "grad_norm": 0.5521693825721741, "learning_rate": 9.999961826890645e-06, "loss": 2.3365, "mean_token_accuracy": 0.4847987382672727, "num_tokens": 155376732.0, "step": 1072 }, { "entropy": 2.4617919921875, "epoch": 0.018474040787514096, "grad_norm": 0.6697084903717041, "learning_rate": 9.999960744020681e-06, "loss": 2.4263, "mean_token_accuracy": 0.48609263403341174, "num_tokens": 155514355.0, "step": 1073 }, { "entropy": 2.443603515625, "epoch": 0.018491257973709357, "grad_norm": 0.6446971297264099, "learning_rate": 9.999959646005761e-06, "loss": 2.448, "mean_token_accuracy": 0.4798265271820128, "num_tokens": 155652470.0, "step": 1074 }, { "entropy": 2.47412109375, "epoch": 0.018508475159904617, "grad_norm": 0.597260057926178, "learning_rate": 9.999958532845889e-06, "loss": 2.4575, "mean_token_accuracy": 0.4776256578043103, "num_tokens": 155809404.0, "step": 1075 }, { "entropy": 2.388916015625, "epoch": 0.018525692346099878, "grad_norm": 0.5906361937522888, "learning_rate": 9.99995740454107e-06, "loss": 2.3603, "mean_token_accuracy": 0.4867819882929325, "num_tokens": 155949820.0, "step": 1076 }, { "entropy": 2.4091796875, "epoch": 0.01854290953229514, "grad_norm": 0.5962374806404114, "learning_rate": 9.999956261091306e-06, "loss": 2.3754, "mean_token_accuracy": 0.486242544837296, "num_tokens": 156083865.0, "step": 1077 }, { "entropy": 2.466064453125, "epoch": 0.0185601267184904, "grad_norm": 0.5743368864059448, "learning_rate": 9.999955102496602e-06, "loss": 2.4522, "mean_token_accuracy": 0.4817331531085074, "num_tokens": 156230750.0, "step": 1078 }, { "entropy": 2.4454345703125, "epoch": 0.018577343904685656, "grad_norm": 0.5727860927581787, "learning_rate": 9.999953928756958e-06, "loss": 2.424, "mean_token_accuracy": 0.47826589411124587, "num_tokens": 156377631.0, "step": 1079 }, { "entropy": 2.433837890625, "epoch": 0.018594561090880916, "grad_norm": 0.6403164863586426, "learning_rate": 9.999952739872383e-06, "loss": 2.379, "mean_token_accuracy": 0.4838769477792084, "num_tokens": 156535199.0, "step": 1080 }, { "entropy": 2.504150390625, "epoch": 0.018611778277076177, "grad_norm": 0.5877081751823425, "learning_rate": 9.999951535842875e-06, "loss": 2.4479, "mean_token_accuracy": 0.4734333767555654, "num_tokens": 156667471.0, "step": 1081 }, { "entropy": 2.421630859375, "epoch": 0.018628995463271437, "grad_norm": 0.5537156462669373, "learning_rate": 9.999950316668443e-06, "loss": 2.3707, "mean_token_accuracy": 0.48230436397716403, "num_tokens": 156819504.0, "step": 1082 }, { "entropy": 2.5322265625, "epoch": 0.018646212649466698, "grad_norm": 0.603832483291626, "learning_rate": 9.999949082349085e-06, "loss": 2.4499, "mean_token_accuracy": 0.4676863308995962, "num_tokens": 156944926.0, "step": 1083 }, { "entropy": 2.473388671875, "epoch": 0.01866342983566196, "grad_norm": 0.5896692872047424, "learning_rate": 9.99994783288481e-06, "loss": 2.4796, "mean_token_accuracy": 0.4778019469231367, "num_tokens": 157090650.0, "step": 1084 }, { "entropy": 2.4200439453125, "epoch": 0.01868064702185722, "grad_norm": 0.5518651008605957, "learning_rate": 9.999946568275619e-06, "loss": 2.4286, "mean_token_accuracy": 0.47666720813140273, "num_tokens": 157252639.0, "step": 1085 }, { "entropy": 2.4140625, "epoch": 0.01869786420805248, "grad_norm": 0.5450928211212158, "learning_rate": 9.999945288521516e-06, "loss": 2.3944, "mean_token_accuracy": 0.48317190259695053, "num_tokens": 157409995.0, "step": 1086 }, { "entropy": 2.4267578125, "epoch": 0.018715081394247737, "grad_norm": 0.5875595211982727, "learning_rate": 9.999943993622504e-06, "loss": 2.4209, "mean_token_accuracy": 0.4857109053991735, "num_tokens": 157551609.0, "step": 1087 }, { "entropy": 2.43603515625, "epoch": 0.018732298580442997, "grad_norm": 0.5621595978736877, "learning_rate": 9.99994268357859e-06, "loss": 2.4107, "mean_token_accuracy": 0.4831027192994952, "num_tokens": 157699503.0, "step": 1088 }, { "entropy": 2.497802734375, "epoch": 0.018749515766638258, "grad_norm": 0.6029961109161377, "learning_rate": 9.999941358389775e-06, "loss": 2.4699, "mean_token_accuracy": 0.4773255423642695, "num_tokens": 157839741.0, "step": 1089 }, { "entropy": 2.4298095703125, "epoch": 0.018766732952833518, "grad_norm": 0.5884754061698914, "learning_rate": 9.999940018056062e-06, "loss": 2.4153, "mean_token_accuracy": 0.4817211125046015, "num_tokens": 157988371.0, "step": 1090 }, { "entropy": 2.417724609375, "epoch": 0.01878395013902878, "grad_norm": 0.5795974135398865, "learning_rate": 9.99993866257746e-06, "loss": 2.3917, "mean_token_accuracy": 0.4811216425150633, "num_tokens": 158133019.0, "step": 1091 }, { "entropy": 2.404541015625, "epoch": 0.01880116732522404, "grad_norm": 0.5938680768013, "learning_rate": 9.99993729195397e-06, "loss": 2.3589, "mean_token_accuracy": 0.4899957957677543, "num_tokens": 158276428.0, "step": 1092 }, { "entropy": 2.453857421875, "epoch": 0.0188183845114193, "grad_norm": 0.6011793613433838, "learning_rate": 9.999935906185596e-06, "loss": 2.3976, "mean_token_accuracy": 0.48242352809756994, "num_tokens": 158418256.0, "step": 1093 }, { "entropy": 2.4844970703125, "epoch": 0.01883560169761456, "grad_norm": 0.6211831569671631, "learning_rate": 9.999934505272341e-06, "loss": 2.4357, "mean_token_accuracy": 0.47488651471212506, "num_tokens": 158538103.0, "step": 1094 }, { "entropy": 2.4769287109375, "epoch": 0.018852818883809817, "grad_norm": 0.5956041812896729, "learning_rate": 9.999933089214214e-06, "loss": 2.4364, "mean_token_accuracy": 0.4817122952081263, "num_tokens": 158684837.0, "step": 1095 }, { "entropy": 2.4451904296875, "epoch": 0.018870036070005078, "grad_norm": 0.5776230692863464, "learning_rate": 9.999931658011213e-06, "loss": 2.4465, "mean_token_accuracy": 0.48071191366761923, "num_tokens": 158841114.0, "step": 1096 }, { "entropy": 2.4998779296875, "epoch": 0.01888725325620034, "grad_norm": 0.5656998753547668, "learning_rate": 9.999930211663346e-06, "loss": 2.4445, "mean_token_accuracy": 0.47156142350286245, "num_tokens": 158982072.0, "step": 1097 }, { "entropy": 2.410400390625, "epoch": 0.0189044704423956, "grad_norm": 0.5946214199066162, "learning_rate": 9.999928750170619e-06, "loss": 2.3455, "mean_token_accuracy": 0.4895977326668799, "num_tokens": 159115070.0, "step": 1098 }, { "entropy": 2.4327392578125, "epoch": 0.01892168762859086, "grad_norm": 0.594409167766571, "learning_rate": 9.999927273533032e-06, "loss": 2.3842, "mean_token_accuracy": 0.48081815196201205, "num_tokens": 159253866.0, "step": 1099 }, { "entropy": 2.450927734375, "epoch": 0.01893890481478612, "grad_norm": 0.5916756987571716, "learning_rate": 9.999925781750594e-06, "loss": 2.3838, "mean_token_accuracy": 0.4844925357028842, "num_tokens": 159415802.0, "step": 1100 }, { "entropy": 2.5107421875, "epoch": 0.01895612200098138, "grad_norm": 0.6757313013076782, "learning_rate": 9.999924274823305e-06, "loss": 2.4928, "mean_token_accuracy": 0.4722679229453206, "num_tokens": 159567360.0, "step": 1101 }, { "entropy": 2.44482421875, "epoch": 0.01897333918717664, "grad_norm": 0.5762059092521667, "learning_rate": 9.999922752751173e-06, "loss": 2.4003, "mean_token_accuracy": 0.4811086105182767, "num_tokens": 159709631.0, "step": 1102 }, { "entropy": 2.425537109375, "epoch": 0.0189905563733719, "grad_norm": 0.5835311412811279, "learning_rate": 9.999921215534201e-06, "loss": 2.377, "mean_token_accuracy": 0.4846574803814292, "num_tokens": 159847869.0, "step": 1103 }, { "entropy": 2.3896484375, "epoch": 0.01900777355956716, "grad_norm": 0.610913872718811, "learning_rate": 9.999919663172394e-06, "loss": 2.3431, "mean_token_accuracy": 0.4912629318423569, "num_tokens": 159976268.0, "step": 1104 }, { "entropy": 2.4576416015625, "epoch": 0.01902499074576242, "grad_norm": 0.6417136192321777, "learning_rate": 9.999918095665758e-06, "loss": 2.4618, "mean_token_accuracy": 0.4749236977659166, "num_tokens": 160114362.0, "step": 1105 }, { "entropy": 2.4779052734375, "epoch": 0.01904220793195768, "grad_norm": 0.5750072598457336, "learning_rate": 9.999916513014294e-06, "loss": 2.4414, "mean_token_accuracy": 0.4833888350985944, "num_tokens": 160272635.0, "step": 1106 }, { "entropy": 2.4361572265625, "epoch": 0.01905942511815294, "grad_norm": 0.6341116428375244, "learning_rate": 9.999914915218012e-06, "loss": 2.3878, "mean_token_accuracy": 0.47810999071225524, "num_tokens": 160430906.0, "step": 1107 }, { "entropy": 2.3680419921875, "epoch": 0.0190766423043482, "grad_norm": 0.575920581817627, "learning_rate": 9.999913302276912e-06, "loss": 2.3544, "mean_token_accuracy": 0.4951640688814223, "num_tokens": 160569906.0, "step": 1108 }, { "entropy": 2.431884765625, "epoch": 0.01909385949054346, "grad_norm": 0.5743590593338013, "learning_rate": 9.999911674191001e-06, "loss": 2.382, "mean_token_accuracy": 0.48963714949786663, "num_tokens": 160710393.0, "step": 1109 }, { "entropy": 2.5234375, "epoch": 0.019111076676738722, "grad_norm": 0.6044170260429382, "learning_rate": 9.999910030960285e-06, "loss": 2.467, "mean_token_accuracy": 0.4708155104890466, "num_tokens": 160837634.0, "step": 1110 }, { "entropy": 2.498046875, "epoch": 0.019128293862933982, "grad_norm": 0.641613245010376, "learning_rate": 9.99990837258477e-06, "loss": 2.4, "mean_token_accuracy": 0.474128358066082, "num_tokens": 160981820.0, "step": 1111 }, { "entropy": 2.4156494140625, "epoch": 0.01914551104912924, "grad_norm": 0.7928540110588074, "learning_rate": 9.999906699064455e-06, "loss": 2.369, "mean_token_accuracy": 0.4871662328951061, "num_tokens": 161128688.0, "step": 1112 }, { "entropy": 2.45849609375, "epoch": 0.0191627282353245, "grad_norm": 0.6063829064369202, "learning_rate": 9.999905010399351e-06, "loss": 2.3988, "mean_token_accuracy": 0.4813544964417815, "num_tokens": 161265040.0, "step": 1113 }, { "entropy": 2.40625, "epoch": 0.01917994542151976, "grad_norm": 0.5939781069755554, "learning_rate": 9.999903306589463e-06, "loss": 2.3918, "mean_token_accuracy": 0.48077729996293783, "num_tokens": 161418528.0, "step": 1114 }, { "entropy": 2.444580078125, "epoch": 0.01919716260771502, "grad_norm": 0.6092802882194519, "learning_rate": 9.99990158763479e-06, "loss": 2.4441, "mean_token_accuracy": 0.4797166772186756, "num_tokens": 161557877.0, "step": 1115 }, { "entropy": 2.457763671875, "epoch": 0.01921437979391028, "grad_norm": 0.6434066295623779, "learning_rate": 9.999899853535344e-06, "loss": 2.4275, "mean_token_accuracy": 0.48657548474147916, "num_tokens": 161698007.0, "step": 1116 }, { "entropy": 2.4796142578125, "epoch": 0.019231596980105542, "grad_norm": 0.5764423608779907, "learning_rate": 9.999898104291128e-06, "loss": 2.4679, "mean_token_accuracy": 0.4779613367281854, "num_tokens": 161848916.0, "step": 1117 }, { "entropy": 2.4736328125, "epoch": 0.019248814166300802, "grad_norm": 0.8059520721435547, "learning_rate": 9.999896339902148e-06, "loss": 2.4523, "mean_token_accuracy": 0.4732037871144712, "num_tokens": 162016077.0, "step": 1118 }, { "entropy": 2.46240234375, "epoch": 0.019266031352496063, "grad_norm": 0.5723803639411926, "learning_rate": 9.999894560368406e-06, "loss": 2.4095, "mean_token_accuracy": 0.48319359589368105, "num_tokens": 162166218.0, "step": 1119 }, { "entropy": 2.4815673828125, "epoch": 0.01928324853869132, "grad_norm": 0.5882191061973572, "learning_rate": 9.999892765689912e-06, "loss": 2.4345, "mean_token_accuracy": 0.47986175399273634, "num_tokens": 162305491.0, "step": 1120 }, { "entropy": 2.4931640625, "epoch": 0.01930046572488658, "grad_norm": 0.6278371214866638, "learning_rate": 9.999890955866667e-06, "loss": 2.3912, "mean_token_accuracy": 0.47920875577256083, "num_tokens": 162426796.0, "step": 1121 }, { "entropy": 2.485595703125, "epoch": 0.01931768291108184, "grad_norm": 0.6226006746292114, "learning_rate": 9.999889130898682e-06, "loss": 2.4591, "mean_token_accuracy": 0.47889810614287853, "num_tokens": 162575000.0, "step": 1122 }, { "entropy": 2.508544921875, "epoch": 0.0193349000972771, "grad_norm": 0.675370454788208, "learning_rate": 9.999887290785957e-06, "loss": 2.4308, "mean_token_accuracy": 0.47410700796172023, "num_tokens": 162713512.0, "step": 1123 }, { "entropy": 2.420166015625, "epoch": 0.019352117283472362, "grad_norm": 0.5244258642196655, "learning_rate": 9.9998854355285e-06, "loss": 2.3653, "mean_token_accuracy": 0.48809623531997204, "num_tokens": 162866946.0, "step": 1124 }, { "entropy": 2.420654296875, "epoch": 0.019369334469667623, "grad_norm": 0.5886077284812927, "learning_rate": 9.999883565126316e-06, "loss": 2.381, "mean_token_accuracy": 0.4897596640512347, "num_tokens": 163012916.0, "step": 1125 }, { "entropy": 2.4671630859375, "epoch": 0.019386551655862883, "grad_norm": 0.6227119565010071, "learning_rate": 9.999881679579411e-06, "loss": 2.4095, "mean_token_accuracy": 0.4798518419265747, "num_tokens": 163152736.0, "step": 1126 }, { "entropy": 2.4190673828125, "epoch": 0.019403768842058144, "grad_norm": 0.6482838988304138, "learning_rate": 9.999879778887793e-06, "loss": 2.3961, "mean_token_accuracy": 0.48625363502651453, "num_tokens": 163284418.0, "step": 1127 }, { "entropy": 2.404296875, "epoch": 0.0194209860282534, "grad_norm": 0.7466304302215576, "learning_rate": 9.999877863051464e-06, "loss": 2.3752, "mean_token_accuracy": 0.4825763679109514, "num_tokens": 163435314.0, "step": 1128 }, { "entropy": 2.39111328125, "epoch": 0.01943820321444866, "grad_norm": 0.5774152278900146, "learning_rate": 9.999875932070431e-06, "loss": 2.3536, "mean_token_accuracy": 0.4831749680452049, "num_tokens": 163590841.0, "step": 1129 }, { "entropy": 2.47607421875, "epoch": 0.019455420400643922, "grad_norm": 0.6369538307189941, "learning_rate": 9.9998739859447e-06, "loss": 2.4949, "mean_token_accuracy": 0.47335345949977636, "num_tokens": 163724787.0, "step": 1130 }, { "entropy": 2.528076171875, "epoch": 0.019472637586839182, "grad_norm": 0.5823352932929993, "learning_rate": 9.999872024674277e-06, "loss": 2.5034, "mean_token_accuracy": 0.4663214525207877, "num_tokens": 163854370.0, "step": 1131 }, { "entropy": 2.518310546875, "epoch": 0.019489854773034443, "grad_norm": 0.6219056844711304, "learning_rate": 9.99987004825917e-06, "loss": 2.5005, "mean_token_accuracy": 0.47132935794070363, "num_tokens": 163999338.0, "step": 1132 }, { "entropy": 2.46630859375, "epoch": 0.019507071959229703, "grad_norm": 0.5910077691078186, "learning_rate": 9.999868056699382e-06, "loss": 2.4295, "mean_token_accuracy": 0.4800408352166414, "num_tokens": 164145130.0, "step": 1133 }, { "entropy": 2.49267578125, "epoch": 0.019524289145424964, "grad_norm": 0.6001760959625244, "learning_rate": 9.99986604999492e-06, "loss": 2.4397, "mean_token_accuracy": 0.4783216747455299, "num_tokens": 164283826.0, "step": 1134 }, { "entropy": 2.46240234375, "epoch": 0.019541506331620224, "grad_norm": 0.6204310655593872, "learning_rate": 9.99986402814579e-06, "loss": 2.4157, "mean_token_accuracy": 0.48227256163954735, "num_tokens": 164413337.0, "step": 1135 }, { "entropy": 2.3477783203125, "epoch": 0.019558723517815485, "grad_norm": 0.5665948987007141, "learning_rate": 9.999861991151999e-06, "loss": 2.2835, "mean_token_accuracy": 0.5003114375285804, "num_tokens": 164568720.0, "step": 1136 }, { "entropy": 2.46435546875, "epoch": 0.019575940704010742, "grad_norm": 0.5802145004272461, "learning_rate": 9.999859939013553e-06, "loss": 2.3899, "mean_token_accuracy": 0.484941388014704, "num_tokens": 164705235.0, "step": 1137 }, { "entropy": 2.4754638671875, "epoch": 0.019593157890206003, "grad_norm": 0.6445546746253967, "learning_rate": 9.999857871730456e-06, "loss": 2.4595, "mean_token_accuracy": 0.4826706210151315, "num_tokens": 164834818.0, "step": 1138 }, { "entropy": 2.490478515625, "epoch": 0.019610375076401263, "grad_norm": 0.6412522196769714, "learning_rate": 9.999855789302716e-06, "loss": 2.4606, "mean_token_accuracy": 0.473591239657253, "num_tokens": 164978552.0, "step": 1139 }, { "entropy": 2.3907470703125, "epoch": 0.019627592262596524, "grad_norm": 0.5629065036773682, "learning_rate": 9.99985369173034e-06, "loss": 2.3848, "mean_token_accuracy": 0.4844861035235226, "num_tokens": 165121299.0, "step": 1140 }, { "entropy": 2.4473876953125, "epoch": 0.019644809448791784, "grad_norm": 0.6042686104774475, "learning_rate": 9.999851579013334e-06, "loss": 2.4534, "mean_token_accuracy": 0.48099326388910413, "num_tokens": 165256223.0, "step": 1141 }, { "entropy": 2.4193115234375, "epoch": 0.019662026634987045, "grad_norm": 0.5502400994300842, "learning_rate": 9.999849451151702e-06, "loss": 2.4046, "mean_token_accuracy": 0.4827784230001271, "num_tokens": 165406097.0, "step": 1142 }, { "entropy": 2.4688720703125, "epoch": 0.019679243821182305, "grad_norm": 0.5841612815856934, "learning_rate": 9.999847308145456e-06, "loss": 2.4319, "mean_token_accuracy": 0.48387880716472864, "num_tokens": 165549274.0, "step": 1143 }, { "entropy": 2.417724609375, "epoch": 0.019696461007377566, "grad_norm": 0.5570199489593506, "learning_rate": 9.999845149994595e-06, "loss": 2.3172, "mean_token_accuracy": 0.4893231294117868, "num_tokens": 165709510.0, "step": 1144 }, { "entropy": 2.4190673828125, "epoch": 0.019713678193572823, "grad_norm": 0.6449766159057617, "learning_rate": 9.999842976699133e-06, "loss": 2.3729, "mean_token_accuracy": 0.48674700502306223, "num_tokens": 165858965.0, "step": 1145 }, { "entropy": 2.448974609375, "epoch": 0.019730895379768083, "grad_norm": 0.5931810736656189, "learning_rate": 9.99984078825907e-06, "loss": 2.4001, "mean_token_accuracy": 0.4783625756390393, "num_tokens": 165996304.0, "step": 1146 }, { "entropy": 2.46533203125, "epoch": 0.019748112565963344, "grad_norm": 0.543803870677948, "learning_rate": 9.999838584674417e-06, "loss": 2.4305, "mean_token_accuracy": 0.4785708379931748, "num_tokens": 166150884.0, "step": 1147 }, { "entropy": 2.55517578125, "epoch": 0.019765329752158604, "grad_norm": 0.59067302942276, "learning_rate": 9.99983636594518e-06, "loss": 2.5251, "mean_token_accuracy": 0.4674039273522794, "num_tokens": 166283288.0, "step": 1148 }, { "entropy": 2.45947265625, "epoch": 0.019782546938353865, "grad_norm": 0.56581711769104, "learning_rate": 9.999834132071364e-06, "loss": 2.4188, "mean_token_accuracy": 0.48025199631229043, "num_tokens": 166428168.0, "step": 1149 }, { "entropy": 2.4490966796875, "epoch": 0.019799764124549125, "grad_norm": 0.6398937106132507, "learning_rate": 9.999831883052978e-06, "loss": 2.4001, "mean_token_accuracy": 0.48107053339481354, "num_tokens": 166579098.0, "step": 1150 }, { "entropy": 2.396484375, "epoch": 0.019816981310744386, "grad_norm": 0.5707962512969971, "learning_rate": 9.999829618890028e-06, "loss": 2.3725, "mean_token_accuracy": 0.4916608249768615, "num_tokens": 166724897.0, "step": 1151 }, { "entropy": 2.3883056640625, "epoch": 0.019834198496939646, "grad_norm": 0.5890077352523804, "learning_rate": 9.99982733958252e-06, "loss": 2.3406, "mean_token_accuracy": 0.49726316425949335, "num_tokens": 166869333.0, "step": 1152 }, { "entropy": 2.41796875, "epoch": 0.019851415683134904, "grad_norm": 0.5820266604423523, "learning_rate": 9.99982504513046e-06, "loss": 2.3695, "mean_token_accuracy": 0.4875667071901262, "num_tokens": 167009538.0, "step": 1153 }, { "entropy": 2.4931640625, "epoch": 0.019868632869330164, "grad_norm": 0.5474282503128052, "learning_rate": 9.999822735533857e-06, "loss": 2.4836, "mean_token_accuracy": 0.47462243027985096, "num_tokens": 167165134.0, "step": 1154 }, { "entropy": 2.48193359375, "epoch": 0.019885850055525425, "grad_norm": 0.5690717101097107, "learning_rate": 9.999820410792717e-06, "loss": 2.4011, "mean_token_accuracy": 0.47985804779455066, "num_tokens": 167308042.0, "step": 1155 }, { "entropy": 2.46240234375, "epoch": 0.019903067241720685, "grad_norm": 0.6280803680419922, "learning_rate": 9.99981807090705e-06, "loss": 2.4093, "mean_token_accuracy": 0.4808740792796016, "num_tokens": 167463555.0, "step": 1156 }, { "entropy": 2.43505859375, "epoch": 0.019920284427915946, "grad_norm": 0.5713448524475098, "learning_rate": 9.999815715876858e-06, "loss": 2.4033, "mean_token_accuracy": 0.48702497407794, "num_tokens": 167617300.0, "step": 1157 }, { "entropy": 2.41650390625, "epoch": 0.019937501614111206, "grad_norm": 0.5802963376045227, "learning_rate": 9.999813345702151e-06, "loss": 2.3546, "mean_token_accuracy": 0.4853116972371936, "num_tokens": 167751762.0, "step": 1158 }, { "entropy": 2.54541015625, "epoch": 0.019954718800306467, "grad_norm": 0.5789119005203247, "learning_rate": 9.999810960382936e-06, "loss": 2.5738, "mean_token_accuracy": 0.46945408172905445, "num_tokens": 167894886.0, "step": 1159 }, { "entropy": 2.4666748046875, "epoch": 0.019971935986501727, "grad_norm": 0.5464440584182739, "learning_rate": 9.999808559919221e-06, "loss": 2.4415, "mean_token_accuracy": 0.4792809300124645, "num_tokens": 168052147.0, "step": 1160 }, { "entropy": 2.450927734375, "epoch": 0.019989153172696988, "grad_norm": 0.5954328179359436, "learning_rate": 9.999806144311013e-06, "loss": 2.3982, "mean_token_accuracy": 0.4799165716394782, "num_tokens": 168188013.0, "step": 1161 }, { "entropy": 2.483154296875, "epoch": 0.020006370358892245, "grad_norm": 0.6072090268135071, "learning_rate": 9.999803713558316e-06, "loss": 2.4523, "mean_token_accuracy": 0.48105273954570293, "num_tokens": 168330752.0, "step": 1162 }, { "entropy": 2.472900390625, "epoch": 0.020023587545087505, "grad_norm": 0.5980669260025024, "learning_rate": 9.99980126766114e-06, "loss": 2.4257, "mean_token_accuracy": 0.47724353708326817, "num_tokens": 168466830.0, "step": 1163 }, { "entropy": 2.526123046875, "epoch": 0.020040804731282766, "grad_norm": 0.6172068119049072, "learning_rate": 9.999798806619494e-06, "loss": 2.5131, "mean_token_accuracy": 0.47623227164149284, "num_tokens": 168600305.0, "step": 1164 }, { "entropy": 2.515380859375, "epoch": 0.020058021917478026, "grad_norm": 0.6749062538146973, "learning_rate": 9.999796330433385e-06, "loss": 2.4829, "mean_token_accuracy": 0.47597271809354424, "num_tokens": 168758522.0, "step": 1165 }, { "entropy": 2.50341796875, "epoch": 0.020075239103673287, "grad_norm": 0.6174373626708984, "learning_rate": 9.999793839102816e-06, "loss": 2.4201, "mean_token_accuracy": 0.47675873059779406, "num_tokens": 168908687.0, "step": 1166 }, { "entropy": 2.4525146484375, "epoch": 0.020092456289868547, "grad_norm": 0.5919591188430786, "learning_rate": 9.9997913326278e-06, "loss": 2.4458, "mean_token_accuracy": 0.48330248473212123, "num_tokens": 169051058.0, "step": 1167 }, { "entropy": 2.4176025390625, "epoch": 0.020109673476063808, "grad_norm": 0.6027262210845947, "learning_rate": 9.999788811008342e-06, "loss": 2.3923, "mean_token_accuracy": 0.48441468458622694, "num_tokens": 169197102.0, "step": 1168 }, { "entropy": 2.500244140625, "epoch": 0.02012689066225907, "grad_norm": 0.5818685293197632, "learning_rate": 9.99978627424445e-06, "loss": 2.4626, "mean_token_accuracy": 0.47057256614789367, "num_tokens": 169324225.0, "step": 1169 }, { "entropy": 2.364501953125, "epoch": 0.020144107848454326, "grad_norm": 0.6910699605941772, "learning_rate": 9.999783722336132e-06, "loss": 2.3512, "mean_token_accuracy": 0.48974316706880927, "num_tokens": 169484740.0, "step": 1170 }, { "entropy": 2.423828125, "epoch": 0.020161325034649586, "grad_norm": 0.583166778087616, "learning_rate": 9.999781155283396e-06, "loss": 2.3721, "mean_token_accuracy": 0.4812298850156367, "num_tokens": 169629618.0, "step": 1171 }, { "entropy": 2.403076171875, "epoch": 0.020178542220844847, "grad_norm": 0.5646472573280334, "learning_rate": 9.99977857308625e-06, "loss": 2.3559, "mean_token_accuracy": 0.4837103271856904, "num_tokens": 169781342.0, "step": 1172 }, { "entropy": 2.3782958984375, "epoch": 0.020195759407040107, "grad_norm": 0.5860679149627686, "learning_rate": 9.9997759757447e-06, "loss": 2.3599, "mean_token_accuracy": 0.49117877380922437, "num_tokens": 169922222.0, "step": 1173 }, { "entropy": 2.416748046875, "epoch": 0.020212976593235368, "grad_norm": 0.6000187993049622, "learning_rate": 9.999773363258755e-06, "loss": 2.4166, "mean_token_accuracy": 0.4860631856136024, "num_tokens": 170059673.0, "step": 1174 }, { "entropy": 2.488037109375, "epoch": 0.020230193779430628, "grad_norm": 0.5900315046310425, "learning_rate": 9.999770735628423e-06, "loss": 2.4818, "mean_token_accuracy": 0.47089498909190297, "num_tokens": 170199601.0, "step": 1175 }, { "entropy": 2.4609375, "epoch": 0.02024741096562589, "grad_norm": 0.7553943991661072, "learning_rate": 9.999768092853711e-06, "loss": 2.3982, "mean_token_accuracy": 0.48237905837595463, "num_tokens": 170339217.0, "step": 1176 }, { "entropy": 2.489013671875, "epoch": 0.02026462815182115, "grad_norm": 0.5692632794380188, "learning_rate": 9.99976543493463e-06, "loss": 2.4907, "mean_token_accuracy": 0.4684693585149944, "num_tokens": 170487890.0, "step": 1177 }, { "entropy": 2.477294921875, "epoch": 0.020281845338016406, "grad_norm": 0.5685938000679016, "learning_rate": 9.999762761871184e-06, "loss": 2.403, "mean_token_accuracy": 0.48181371157988906, "num_tokens": 170641505.0, "step": 1178 }, { "entropy": 2.43310546875, "epoch": 0.020299062524211667, "grad_norm": 0.6160794496536255, "learning_rate": 9.999760073663383e-06, "loss": 2.3919, "mean_token_accuracy": 0.48729625064879656, "num_tokens": 170769357.0, "step": 1179 }, { "entropy": 2.5234375, "epoch": 0.020316279710406927, "grad_norm": 0.6029751300811768, "learning_rate": 9.999757370311237e-06, "loss": 2.5087, "mean_token_accuracy": 0.46955224964767694, "num_tokens": 170893993.0, "step": 1180 }, { "entropy": 2.463134765625, "epoch": 0.020333496896602188, "grad_norm": 0.635032057762146, "learning_rate": 9.999754651814751e-06, "loss": 2.4153, "mean_token_accuracy": 0.4809435624629259, "num_tokens": 171043577.0, "step": 1181 }, { "entropy": 2.3951416015625, "epoch": 0.02035071408279745, "grad_norm": 0.5716371536254883, "learning_rate": 9.999751918173935e-06, "loss": 2.3335, "mean_token_accuracy": 0.49141963897272944, "num_tokens": 171198846.0, "step": 1182 }, { "entropy": 2.39892578125, "epoch": 0.02036793126899271, "grad_norm": 0.5712882876396179, "learning_rate": 9.999749169388798e-06, "loss": 2.3149, "mean_token_accuracy": 0.49085350101813674, "num_tokens": 171351472.0, "step": 1183 }, { "entropy": 2.412353515625, "epoch": 0.02038514845518797, "grad_norm": 0.5964887142181396, "learning_rate": 9.999746405459345e-06, "loss": 2.3524, "mean_token_accuracy": 0.4884929973632097, "num_tokens": 171488905.0, "step": 1184 }, { "entropy": 2.5108642578125, "epoch": 0.02040236564138323, "grad_norm": 0.5682933330535889, "learning_rate": 9.999743626385587e-06, "loss": 2.4582, "mean_token_accuracy": 0.4702299786731601, "num_tokens": 171634563.0, "step": 1185 }, { "entropy": 2.4671630859375, "epoch": 0.02041958282757849, "grad_norm": 0.6103932857513428, "learning_rate": 9.999740832167532e-06, "loss": 2.4564, "mean_token_accuracy": 0.4751875218935311, "num_tokens": 171777207.0, "step": 1186 }, { "entropy": 2.49267578125, "epoch": 0.020436800013773748, "grad_norm": 1.0117425918579102, "learning_rate": 9.99973802280519e-06, "loss": 2.4774, "mean_token_accuracy": 0.4714944367296994, "num_tokens": 171934632.0, "step": 1187 }, { "entropy": 2.45458984375, "epoch": 0.020454017199969008, "grad_norm": 0.5842525959014893, "learning_rate": 9.999735198298566e-06, "loss": 2.4146, "mean_token_accuracy": 0.47794078243896365, "num_tokens": 172076052.0, "step": 1188 }, { "entropy": 2.43310546875, "epoch": 0.02047123438616427, "grad_norm": 0.594760000705719, "learning_rate": 9.999732358647671e-06, "loss": 2.3923, "mean_token_accuracy": 0.4838462113402784, "num_tokens": 172214297.0, "step": 1189 }, { "entropy": 2.46826171875, "epoch": 0.02048845157235953, "grad_norm": 0.5831640958786011, "learning_rate": 9.999729503852515e-06, "loss": 2.3907, "mean_token_accuracy": 0.48116701701655984, "num_tokens": 172354024.0, "step": 1190 }, { "entropy": 2.4698486328125, "epoch": 0.02050566875855479, "grad_norm": 0.612289547920227, "learning_rate": 9.999726633913103e-06, "loss": 2.4473, "mean_token_accuracy": 0.48014617431908846, "num_tokens": 172496977.0, "step": 1191 }, { "entropy": 2.462890625, "epoch": 0.02052288594475005, "grad_norm": 0.5079575777053833, "learning_rate": 9.999723748829446e-06, "loss": 2.4118, "mean_token_accuracy": 0.4765968224965036, "num_tokens": 172664155.0, "step": 1192 }, { "entropy": 2.4237060546875, "epoch": 0.02054010313094531, "grad_norm": 0.5485002398490906, "learning_rate": 9.999720848601552e-06, "loss": 2.374, "mean_token_accuracy": 0.4809511392377317, "num_tokens": 172824727.0, "step": 1193 }, { "entropy": 2.4146728515625, "epoch": 0.02055732031714057, "grad_norm": 0.5650107264518738, "learning_rate": 9.999717933229429e-06, "loss": 2.3705, "mean_token_accuracy": 0.49346699519082904, "num_tokens": 172981026.0, "step": 1194 }, { "entropy": 2.47900390625, "epoch": 0.020574537503335828, "grad_norm": 0.5556767582893372, "learning_rate": 9.999715002713088e-06, "loss": 2.4417, "mean_token_accuracy": 0.4759158226661384, "num_tokens": 173123752.0, "step": 1195 }, { "entropy": 2.505859375, "epoch": 0.02059175468953109, "grad_norm": 0.5984048843383789, "learning_rate": 9.999712057052537e-06, "loss": 2.4571, "mean_token_accuracy": 0.4764223378151655, "num_tokens": 173259038.0, "step": 1196 }, { "entropy": 2.4449462890625, "epoch": 0.02060897187572635, "grad_norm": 0.6215763688087463, "learning_rate": 9.999709096247784e-06, "loss": 2.4247, "mean_token_accuracy": 0.48310248367488384, "num_tokens": 173401014.0, "step": 1197 }, { "entropy": 2.391845703125, "epoch": 0.02062618906192161, "grad_norm": 0.6615683436393738, "learning_rate": 9.99970612029884e-06, "loss": 2.368, "mean_token_accuracy": 0.49283183040097356, "num_tokens": 173553153.0, "step": 1198 }, { "entropy": 2.4296875, "epoch": 0.02064340624811687, "grad_norm": 0.5707510113716125, "learning_rate": 9.999703129205711e-06, "loss": 2.3997, "mean_token_accuracy": 0.4811532562598586, "num_tokens": 173705473.0, "step": 1199 }, { "entropy": 2.3585205078125, "epoch": 0.02066062343431213, "grad_norm": 0.5650596618652344, "learning_rate": 9.999700122968408e-06, "loss": 2.2966, "mean_token_accuracy": 0.4981447677128017, "num_tokens": 173849665.0, "step": 1200 }, { "entropy": 2.46142578125, "epoch": 0.02067784062050739, "grad_norm": 0.5815020203590393, "learning_rate": 9.99969710158694e-06, "loss": 2.4193, "mean_token_accuracy": 0.4771373653784394, "num_tokens": 173988108.0, "step": 1201 }, { "entropy": 2.4542236328125, "epoch": 0.020695057806702652, "grad_norm": 0.5876851677894592, "learning_rate": 9.999694065061316e-06, "loss": 2.4786, "mean_token_accuracy": 0.4749302426353097, "num_tokens": 174126294.0, "step": 1202 }, { "entropy": 2.43505859375, "epoch": 0.02071227499289791, "grad_norm": 0.562313437461853, "learning_rate": 9.999691013391544e-06, "loss": 2.4206, "mean_token_accuracy": 0.4778692005202174, "num_tokens": 174269124.0, "step": 1203 }, { "entropy": 2.443115234375, "epoch": 0.02072949217909317, "grad_norm": 0.5638776421546936, "learning_rate": 9.999687946577636e-06, "loss": 2.3994, "mean_token_accuracy": 0.48797021247446537, "num_tokens": 174427015.0, "step": 1204 }, { "entropy": 2.4337158203125, "epoch": 0.02074670936528843, "grad_norm": 0.5831981301307678, "learning_rate": 9.999684864619599e-06, "loss": 2.3989, "mean_token_accuracy": 0.4862150545231998, "num_tokens": 174569995.0, "step": 1205 }, { "entropy": 2.449951171875, "epoch": 0.02076392655148369, "grad_norm": 0.5815305709838867, "learning_rate": 9.999681767517441e-06, "loss": 2.3913, "mean_token_accuracy": 0.4784506266005337, "num_tokens": 174706593.0, "step": 1206 }, { "entropy": 2.487060546875, "epoch": 0.02078114373767895, "grad_norm": 0.5909605026245117, "learning_rate": 9.999678655271176e-06, "loss": 2.4765, "mean_token_accuracy": 0.47097450820729136, "num_tokens": 174846238.0, "step": 1207 }, { "entropy": 2.44287109375, "epoch": 0.02079836092387421, "grad_norm": 0.5870411992073059, "learning_rate": 9.99967552788081e-06, "loss": 2.3842, "mean_token_accuracy": 0.48391013080254197, "num_tokens": 174992749.0, "step": 1208 }, { "entropy": 2.4306640625, "epoch": 0.020815578110069472, "grad_norm": 0.5437948703765869, "learning_rate": 9.999672385346352e-06, "loss": 2.3978, "mean_token_accuracy": 0.4776941388845444, "num_tokens": 175147217.0, "step": 1209 }, { "entropy": 2.428955078125, "epoch": 0.020832795296264733, "grad_norm": 0.5859825611114502, "learning_rate": 9.999669227667815e-06, "loss": 2.3921, "mean_token_accuracy": 0.4817708395421505, "num_tokens": 175293722.0, "step": 1210 }, { "entropy": 2.43603515625, "epoch": 0.020850012482459993, "grad_norm": 0.5771918892860413, "learning_rate": 9.999666054845206e-06, "loss": 2.3826, "mean_token_accuracy": 0.4837156576104462, "num_tokens": 175434663.0, "step": 1211 }, { "entropy": 2.4981689453125, "epoch": 0.02086722966865525, "grad_norm": 0.6018497943878174, "learning_rate": 9.999662866878534e-06, "loss": 2.4908, "mean_token_accuracy": 0.4697205852717161, "num_tokens": 175582504.0, "step": 1212 }, { "entropy": 2.51611328125, "epoch": 0.02088444685485051, "grad_norm": 0.5939236283302307, "learning_rate": 9.999659663767811e-06, "loss": 2.4614, "mean_token_accuracy": 0.47295166924595833, "num_tokens": 175721585.0, "step": 1213 }, { "entropy": 2.399169921875, "epoch": 0.02090166404104577, "grad_norm": 0.5899982452392578, "learning_rate": 9.999656445513043e-06, "loss": 2.3798, "mean_token_accuracy": 0.4900959865190089, "num_tokens": 175867778.0, "step": 1214 }, { "entropy": 2.4320068359375, "epoch": 0.020918881227241032, "grad_norm": 0.5718798041343689, "learning_rate": 9.999653212114245e-06, "loss": 2.395, "mean_token_accuracy": 0.48120944295078516, "num_tokens": 176013170.0, "step": 1215 }, { "entropy": 2.39794921875, "epoch": 0.020936098413436292, "grad_norm": 0.603789746761322, "learning_rate": 9.99964996357142e-06, "loss": 2.3539, "mean_token_accuracy": 0.4898811914026737, "num_tokens": 176154005.0, "step": 1216 }, { "entropy": 2.54248046875, "epoch": 0.020953315599631553, "grad_norm": 0.5883952379226685, "learning_rate": 9.999646699884585e-06, "loss": 2.4972, "mean_token_accuracy": 0.46428748965263367, "num_tokens": 176294756.0, "step": 1217 }, { "entropy": 2.4794921875, "epoch": 0.020970532785826813, "grad_norm": 0.5553483963012695, "learning_rate": 9.999643421053747e-06, "loss": 2.4403, "mean_token_accuracy": 0.4781416282057762, "num_tokens": 176450378.0, "step": 1218 }, { "entropy": 2.402099609375, "epoch": 0.020987749972022074, "grad_norm": 0.5453605055809021, "learning_rate": 9.999640127078914e-06, "loss": 2.3609, "mean_token_accuracy": 0.4892904283478856, "num_tokens": 176605732.0, "step": 1219 }, { "entropy": 2.425048828125, "epoch": 0.02100496715821733, "grad_norm": 0.5747688412666321, "learning_rate": 9.999636817960097e-06, "loss": 2.4142, "mean_token_accuracy": 0.4863899489864707, "num_tokens": 176749939.0, "step": 1220 }, { "entropy": 2.408203125, "epoch": 0.02102218434441259, "grad_norm": 0.5680667757987976, "learning_rate": 9.999633493697307e-06, "loss": 2.3843, "mean_token_accuracy": 0.4866964789107442, "num_tokens": 176896720.0, "step": 1221 }, { "entropy": 2.4571533203125, "epoch": 0.021039401530607852, "grad_norm": 0.5690086483955383, "learning_rate": 9.999630154290553e-06, "loss": 2.3909, "mean_token_accuracy": 0.48753285547718406, "num_tokens": 177039495.0, "step": 1222 }, { "entropy": 2.43115234375, "epoch": 0.021056618716803113, "grad_norm": 0.6062940359115601, "learning_rate": 9.999626799739846e-06, "loss": 2.4407, "mean_token_accuracy": 0.4770724857226014, "num_tokens": 177188199.0, "step": 1223 }, { "entropy": 2.490478515625, "epoch": 0.021073835902998373, "grad_norm": 0.6283055543899536, "learning_rate": 9.999623430045196e-06, "loss": 2.4805, "mean_token_accuracy": 0.4742564079351723, "num_tokens": 177323708.0, "step": 1224 }, { "entropy": 2.48046875, "epoch": 0.021091053089193634, "grad_norm": 0.5583751201629639, "learning_rate": 9.999620045206614e-06, "loss": 2.4941, "mean_token_accuracy": 0.47078192001208663, "num_tokens": 177479672.0, "step": 1225 }, { "entropy": 2.4630126953125, "epoch": 0.021108270275388894, "grad_norm": 0.5786680579185486, "learning_rate": 9.999616645224109e-06, "loss": 2.4421, "mean_token_accuracy": 0.4788076733238995, "num_tokens": 177624243.0, "step": 1226 }, { "entropy": 2.41796875, "epoch": 0.021125487461584155, "grad_norm": 0.5827528834342957, "learning_rate": 9.999613230097692e-06, "loss": 2.3804, "mean_token_accuracy": 0.48857234325259924, "num_tokens": 177764756.0, "step": 1227 }, { "entropy": 2.4376220703125, "epoch": 0.02114270464777941, "grad_norm": 0.5747877955436707, "learning_rate": 9.99960979982737e-06, "loss": 2.4097, "mean_token_accuracy": 0.4837777316570282, "num_tokens": 177898426.0, "step": 1228 }, { "entropy": 2.466064453125, "epoch": 0.021159921833974672, "grad_norm": 0.5510834455490112, "learning_rate": 9.999606354413159e-06, "loss": 2.398, "mean_token_accuracy": 0.4762152610346675, "num_tokens": 178050984.0, "step": 1229 }, { "entropy": 2.40283203125, "epoch": 0.021177139020169933, "grad_norm": 0.5941503643989563, "learning_rate": 9.999602893855067e-06, "loss": 2.3841, "mean_token_accuracy": 0.48818005435168743, "num_tokens": 178189354.0, "step": 1230 }, { "entropy": 2.438720703125, "epoch": 0.021194356206365193, "grad_norm": 0.5585294961929321, "learning_rate": 9.999599418153104e-06, "loss": 2.3762, "mean_token_accuracy": 0.4838878009468317, "num_tokens": 178331962.0, "step": 1231 }, { "entropy": 2.4259033203125, "epoch": 0.021211573392560454, "grad_norm": 0.6388170123100281, "learning_rate": 9.999595927307279e-06, "loss": 2.3748, "mean_token_accuracy": 0.48712681280449033, "num_tokens": 178482059.0, "step": 1232 }, { "entropy": 2.4521484375, "epoch": 0.021228790578755714, "grad_norm": 0.5800935626029968, "learning_rate": 9.999592421317606e-06, "loss": 2.4034, "mean_token_accuracy": 0.48077769484370947, "num_tokens": 178614560.0, "step": 1233 }, { "entropy": 2.4222412109375, "epoch": 0.021246007764950975, "grad_norm": 0.5996948480606079, "learning_rate": 9.999588900184094e-06, "loss": 2.3708, "mean_token_accuracy": 0.4873070912435651, "num_tokens": 178766038.0, "step": 1234 }, { "entropy": 2.4306640625, "epoch": 0.021263224951146235, "grad_norm": 0.5760281682014465, "learning_rate": 9.999585363906754e-06, "loss": 2.3985, "mean_token_accuracy": 0.4887649049051106, "num_tokens": 178911786.0, "step": 1235 }, { "entropy": 2.4580078125, "epoch": 0.021280442137341496, "grad_norm": 0.5570631623268127, "learning_rate": 9.999581812485595e-06, "loss": 2.4024, "mean_token_accuracy": 0.4858241407200694, "num_tokens": 179059130.0, "step": 1236 }, { "entropy": 2.43017578125, "epoch": 0.021297659323536753, "grad_norm": 0.6132498979568481, "learning_rate": 9.999578245920632e-06, "loss": 2.413, "mean_token_accuracy": 0.4828817122615874, "num_tokens": 179193557.0, "step": 1237 }, { "entropy": 2.518310546875, "epoch": 0.021314876509732014, "grad_norm": 0.6096269488334656, "learning_rate": 9.99957466421187e-06, "loss": 2.4572, "mean_token_accuracy": 0.4700988312251866, "num_tokens": 179318893.0, "step": 1238 }, { "entropy": 2.4744873046875, "epoch": 0.021332093695927274, "grad_norm": 0.5955691337585449, "learning_rate": 9.999571067359323e-06, "loss": 2.4467, "mean_token_accuracy": 0.475006652995944, "num_tokens": 179459850.0, "step": 1239 }, { "entropy": 2.498046875, "epoch": 0.021349310882122535, "grad_norm": 0.5774043798446655, "learning_rate": 9.999567455363003e-06, "loss": 2.4671, "mean_token_accuracy": 0.4733973373658955, "num_tokens": 179610624.0, "step": 1240 }, { "entropy": 2.450439453125, "epoch": 0.021366528068317795, "grad_norm": 0.5889162421226501, "learning_rate": 9.99956382822292e-06, "loss": 2.4152, "mean_token_accuracy": 0.4829295091331005, "num_tokens": 179743518.0, "step": 1241 }, { "entropy": 2.4434814453125, "epoch": 0.021383745254513056, "grad_norm": 0.5916315913200378, "learning_rate": 9.999560185939082e-06, "loss": 2.3796, "mean_token_accuracy": 0.48776215221732855, "num_tokens": 179888613.0, "step": 1242 }, { "entropy": 2.44091796875, "epoch": 0.021400962440708316, "grad_norm": 0.6239886283874512, "learning_rate": 9.999556528511504e-06, "loss": 2.4542, "mean_token_accuracy": 0.48557718843221664, "num_tokens": 180029182.0, "step": 1243 }, { "entropy": 2.452880859375, "epoch": 0.021418179626903577, "grad_norm": 0.5450658202171326, "learning_rate": 9.999552855940197e-06, "loss": 2.4064, "mean_token_accuracy": 0.48464760929346085, "num_tokens": 180184347.0, "step": 1244 }, { "entropy": 2.46923828125, "epoch": 0.021435396813098834, "grad_norm": 0.5909526944160461, "learning_rate": 9.999549168225169e-06, "loss": 2.3883, "mean_token_accuracy": 0.47910354332998395, "num_tokens": 180333887.0, "step": 1245 }, { "entropy": 2.4539794921875, "epoch": 0.021452613999294094, "grad_norm": 0.581224799156189, "learning_rate": 9.999545465366433e-06, "loss": 2.4123, "mean_token_accuracy": 0.4825686193071306, "num_tokens": 180490238.0, "step": 1246 }, { "entropy": 2.439697265625, "epoch": 0.021469831185489355, "grad_norm": 0.6227947473526001, "learning_rate": 9.999541747364002e-06, "loss": 2.4026, "mean_token_accuracy": 0.4787580128759146, "num_tokens": 180625744.0, "step": 1247 }, { "entropy": 2.4227294921875, "epoch": 0.021487048371684615, "grad_norm": 0.5937827825546265, "learning_rate": 9.999538014217884e-06, "loss": 2.359, "mean_token_accuracy": 0.4844051315449178, "num_tokens": 180761591.0, "step": 1248 }, { "entropy": 2.46337890625, "epoch": 0.021504265557879876, "grad_norm": 0.5774495601654053, "learning_rate": 9.999534265928092e-06, "loss": 2.4378, "mean_token_accuracy": 0.4757684310898185, "num_tokens": 180903465.0, "step": 1249 }, { "entropy": 2.5347900390625, "epoch": 0.021521482744075136, "grad_norm": 0.5800737738609314, "learning_rate": 9.999530502494639e-06, "loss": 2.4917, "mean_token_accuracy": 0.4692853162996471, "num_tokens": 181053233.0, "step": 1250 }, { "entropy": 2.4166259765625, "epoch": 0.021538699930270397, "grad_norm": 0.5999194383621216, "learning_rate": 9.999526723917533e-06, "loss": 2.372, "mean_token_accuracy": 0.4842418390326202, "num_tokens": 181202187.0, "step": 1251 }, { "entropy": 2.3966064453125, "epoch": 0.021555917116465657, "grad_norm": 0.561681866645813, "learning_rate": 9.999522930196787e-06, "loss": 2.3211, "mean_token_accuracy": 0.48686323687434196, "num_tokens": 181345964.0, "step": 1252 }, { "entropy": 2.350830078125, "epoch": 0.021573134302660914, "grad_norm": 0.6096113920211792, "learning_rate": 9.999519121332413e-06, "loss": 2.3158, "mean_token_accuracy": 0.49966981168836355, "num_tokens": 181493986.0, "step": 1253 }, { "entropy": 2.466064453125, "epoch": 0.021590351488856175, "grad_norm": 0.5756585001945496, "learning_rate": 9.99951529732442e-06, "loss": 2.4099, "mean_token_accuracy": 0.4760406082496047, "num_tokens": 181643446.0, "step": 1254 }, { "entropy": 2.4566650390625, "epoch": 0.021607568675051435, "grad_norm": 0.6152567267417908, "learning_rate": 9.999511458172823e-06, "loss": 2.4127, "mean_token_accuracy": 0.48100624280050397, "num_tokens": 181798482.0, "step": 1255 }, { "entropy": 2.42578125, "epoch": 0.021624785861246696, "grad_norm": 0.5952988266944885, "learning_rate": 9.999507603877634e-06, "loss": 2.3729, "mean_token_accuracy": 0.48960635205730796, "num_tokens": 181947283.0, "step": 1256 }, { "entropy": 2.489013671875, "epoch": 0.021642003047441957, "grad_norm": 0.5729377865791321, "learning_rate": 9.99950373443886e-06, "loss": 2.4575, "mean_token_accuracy": 0.47360460739582777, "num_tokens": 182103948.0, "step": 1257 }, { "entropy": 2.367919921875, "epoch": 0.021659220233637217, "grad_norm": 0.5920243859291077, "learning_rate": 9.999499849856518e-06, "loss": 2.3522, "mean_token_accuracy": 0.4938758905045688, "num_tokens": 182244306.0, "step": 1258 }, { "entropy": 2.4716796875, "epoch": 0.021676437419832478, "grad_norm": 0.6242178678512573, "learning_rate": 9.999495950130616e-06, "loss": 2.4071, "mean_token_accuracy": 0.4799223360605538, "num_tokens": 182370840.0, "step": 1259 }, { "entropy": 2.4951171875, "epoch": 0.021693654606027738, "grad_norm": 0.7572094798088074, "learning_rate": 9.999492035261166e-06, "loss": 2.466, "mean_token_accuracy": 0.48017187928780913, "num_tokens": 182517478.0, "step": 1260 }, { "entropy": 2.4541015625, "epoch": 0.021710871792223, "grad_norm": 0.6314413547515869, "learning_rate": 9.999488105248184e-06, "loss": 2.4002, "mean_token_accuracy": 0.4775316468439996, "num_tokens": 182661585.0, "step": 1261 }, { "entropy": 2.4586181640625, "epoch": 0.021728088978418256, "grad_norm": 0.6284291744232178, "learning_rate": 9.999484160091678e-06, "loss": 2.4529, "mean_token_accuracy": 0.4805305516347289, "num_tokens": 182804645.0, "step": 1262 }, { "entropy": 2.479248046875, "epoch": 0.021745306164613516, "grad_norm": 0.6105698347091675, "learning_rate": 9.99948019979166e-06, "loss": 2.4449, "mean_token_accuracy": 0.4798229462467134, "num_tokens": 182947450.0, "step": 1263 }, { "entropy": 2.4501953125, "epoch": 0.021762523350808777, "grad_norm": 0.5790930390357971, "learning_rate": 9.999476224348144e-06, "loss": 2.4185, "mean_token_accuracy": 0.4775314489379525, "num_tokens": 183089455.0, "step": 1264 }, { "entropy": 2.5020751953125, "epoch": 0.021779740537004037, "grad_norm": 0.6524637341499329, "learning_rate": 9.99947223376114e-06, "loss": 2.4855, "mean_token_accuracy": 0.480673139449209, "num_tokens": 183241120.0, "step": 1265 }, { "entropy": 2.4835205078125, "epoch": 0.021796957723199298, "grad_norm": 0.5516358613967896, "learning_rate": 9.99946822803066e-06, "loss": 2.414, "mean_token_accuracy": 0.47934063244611025, "num_tokens": 183390072.0, "step": 1266 }, { "entropy": 2.506591796875, "epoch": 0.02181417490939456, "grad_norm": 0.5939943194389343, "learning_rate": 9.99946420715672e-06, "loss": 2.4367, "mean_token_accuracy": 0.47747283428907394, "num_tokens": 183539529.0, "step": 1267 }, { "entropy": 2.4515380859375, "epoch": 0.02183139209558982, "grad_norm": 0.5539537072181702, "learning_rate": 9.999460171139328e-06, "loss": 2.3763, "mean_token_accuracy": 0.4820182635448873, "num_tokens": 183688585.0, "step": 1268 }, { "entropy": 2.404541015625, "epoch": 0.02184860928178508, "grad_norm": 0.5919788479804993, "learning_rate": 9.999456119978496e-06, "loss": 2.4045, "mean_token_accuracy": 0.4871440161950886, "num_tokens": 183821506.0, "step": 1269 }, { "entropy": 2.472412109375, "epoch": 0.021865826467980336, "grad_norm": 0.6141915917396545, "learning_rate": 9.999452053674242e-06, "loss": 2.438, "mean_token_accuracy": 0.4749964135698974, "num_tokens": 183954051.0, "step": 1270 }, { "entropy": 2.4482421875, "epoch": 0.021883043654175597, "grad_norm": 0.5740827322006226, "learning_rate": 9.99944797222657e-06, "loss": 2.4203, "mean_token_accuracy": 0.47882386669516563, "num_tokens": 184097023.0, "step": 1271 }, { "entropy": 2.4649658203125, "epoch": 0.021900260840370857, "grad_norm": 0.6000681519508362, "learning_rate": 9.999443875635499e-06, "loss": 2.3876, "mean_token_accuracy": 0.48048424860462546, "num_tokens": 184244536.0, "step": 1272 }, { "entropy": 2.448486328125, "epoch": 0.021917478026566118, "grad_norm": 0.5612481236457825, "learning_rate": 9.999439763901037e-06, "loss": 2.375, "mean_token_accuracy": 0.48185514099895954, "num_tokens": 184392474.0, "step": 1273 }, { "entropy": 2.397705078125, "epoch": 0.02193469521276138, "grad_norm": 0.593385636806488, "learning_rate": 9.9994356370232e-06, "loss": 2.3472, "mean_token_accuracy": 0.4966060775332153, "num_tokens": 184541344.0, "step": 1274 }, { "entropy": 2.4896240234375, "epoch": 0.02195191239895664, "grad_norm": 0.5971876978874207, "learning_rate": 9.999431495001998e-06, "loss": 2.4195, "mean_token_accuracy": 0.4794482118450105, "num_tokens": 184672977.0, "step": 1275 }, { "entropy": 2.4857177734375, "epoch": 0.0219691295851519, "grad_norm": 0.6552586555480957, "learning_rate": 9.999427337837444e-06, "loss": 2.4895, "mean_token_accuracy": 0.47428874857723713, "num_tokens": 184826101.0, "step": 1276 }, { "entropy": 2.42529296875, "epoch": 0.02198634677134716, "grad_norm": 0.5778135061264038, "learning_rate": 9.999423165529554e-06, "loss": 2.4422, "mean_token_accuracy": 0.4804799151606858, "num_tokens": 184970321.0, "step": 1277 }, { "entropy": 2.4859619140625, "epoch": 0.022003563957542417, "grad_norm": 0.537326455116272, "learning_rate": 9.999418978078335e-06, "loss": 2.4407, "mean_token_accuracy": 0.47214187448844314, "num_tokens": 185128581.0, "step": 1278 }, { "entropy": 2.39697265625, "epoch": 0.022020781143737678, "grad_norm": 0.6056032776832581, "learning_rate": 9.999414775483803e-06, "loss": 2.381, "mean_token_accuracy": 0.4875910747796297, "num_tokens": 185266968.0, "step": 1279 }, { "entropy": 2.50537109375, "epoch": 0.022037998329932938, "grad_norm": 0.5535710453987122, "learning_rate": 9.99941055774597e-06, "loss": 2.4915, "mean_token_accuracy": 0.4735433543100953, "num_tokens": 185416068.0, "step": 1280 }, { "entropy": 2.4454345703125, "epoch": 0.0220552155161282, "grad_norm": 0.6215351223945618, "learning_rate": 9.99940632486485e-06, "loss": 2.3778, "mean_token_accuracy": 0.4865972097031772, "num_tokens": 185558293.0, "step": 1281 }, { "entropy": 2.510498046875, "epoch": 0.02207243270232346, "grad_norm": 0.5621281862258911, "learning_rate": 9.999402076840452e-06, "loss": 2.4886, "mean_token_accuracy": 0.4751009796746075, "num_tokens": 185707903.0, "step": 1282 }, { "entropy": 2.439208984375, "epoch": 0.02208964988851872, "grad_norm": 0.5782500505447388, "learning_rate": 9.999397813672793e-06, "loss": 2.4012, "mean_token_accuracy": 0.47963299648836255, "num_tokens": 185864974.0, "step": 1283 }, { "entropy": 2.452880859375, "epoch": 0.02210686707471398, "grad_norm": 0.584717333316803, "learning_rate": 9.999393535361884e-06, "loss": 2.4252, "mean_token_accuracy": 0.4816917534917593, "num_tokens": 186005584.0, "step": 1284 }, { "entropy": 2.43310546875, "epoch": 0.02212408426090924, "grad_norm": 0.5956825613975525, "learning_rate": 9.99938924190774e-06, "loss": 2.3782, "mean_token_accuracy": 0.48892744118347764, "num_tokens": 186143411.0, "step": 1285 }, { "entropy": 2.4464111328125, "epoch": 0.0221413014471045, "grad_norm": 0.5437039136886597, "learning_rate": 9.99938493331037e-06, "loss": 2.4353, "mean_token_accuracy": 0.4839400313794613, "num_tokens": 186299707.0, "step": 1286 }, { "entropy": 2.498046875, "epoch": 0.02215851863329976, "grad_norm": 0.6034572720527649, "learning_rate": 9.999380609569791e-06, "loss": 2.4795, "mean_token_accuracy": 0.47284478275105357, "num_tokens": 186447393.0, "step": 1287 }, { "entropy": 2.4296875, "epoch": 0.02217573581949502, "grad_norm": 0.5920705795288086, "learning_rate": 9.999376270686015e-06, "loss": 2.4214, "mean_token_accuracy": 0.4788710060529411, "num_tokens": 186590786.0, "step": 1288 }, { "entropy": 2.429443359375, "epoch": 0.02219295300569028, "grad_norm": 0.5723591446876526, "learning_rate": 9.999371916659054e-06, "loss": 2.385, "mean_token_accuracy": 0.48299159901216626, "num_tokens": 186739554.0, "step": 1289 }, { "entropy": 2.39111328125, "epoch": 0.02221017019188554, "grad_norm": 0.5848551988601685, "learning_rate": 9.999367547488923e-06, "loss": 2.2912, "mean_token_accuracy": 0.49383825389668345, "num_tokens": 186887370.0, "step": 1290 }, { "entropy": 2.467529296875, "epoch": 0.0222273873780808, "grad_norm": 0.5705557465553284, "learning_rate": 9.999363163175632e-06, "loss": 2.4211, "mean_token_accuracy": 0.4749067425727844, "num_tokens": 187028206.0, "step": 1291 }, { "entropy": 2.5201416015625, "epoch": 0.02224460456427606, "grad_norm": 0.6075615882873535, "learning_rate": 9.999358763719198e-06, "loss": 2.4732, "mean_token_accuracy": 0.4724921630695462, "num_tokens": 187178625.0, "step": 1292 }, { "entropy": 2.46142578125, "epoch": 0.02226182175047132, "grad_norm": 0.5848605036735535, "learning_rate": 9.999354349119632e-06, "loss": 2.4225, "mean_token_accuracy": 0.47803614335134625, "num_tokens": 187317792.0, "step": 1293 }, { "entropy": 2.443359375, "epoch": 0.022279038936666582, "grad_norm": 0.5687076449394226, "learning_rate": 9.999349919376949e-06, "loss": 2.3935, "mean_token_accuracy": 0.48158882977440953, "num_tokens": 187468854.0, "step": 1294 }, { "entropy": 2.4393310546875, "epoch": 0.02229625612286184, "grad_norm": 0.5710064768791199, "learning_rate": 9.999345474491161e-06, "loss": 2.3597, "mean_token_accuracy": 0.4866549982689321, "num_tokens": 187617503.0, "step": 1295 }, { "entropy": 2.4757080078125, "epoch": 0.0223134733090571, "grad_norm": 0.6155255436897278, "learning_rate": 9.999341014462283e-06, "loss": 2.4421, "mean_token_accuracy": 0.47693472215905786, "num_tokens": 187745930.0, "step": 1296 }, { "entropy": 2.4345703125, "epoch": 0.02233069049525236, "grad_norm": 0.5629665851593018, "learning_rate": 9.999336539290325e-06, "loss": 2.4156, "mean_token_accuracy": 0.4816053519025445, "num_tokens": 187892581.0, "step": 1297 }, { "entropy": 2.4129638671875, "epoch": 0.02234790768144762, "grad_norm": 0.5968115329742432, "learning_rate": 9.999332048975305e-06, "loss": 2.3635, "mean_token_accuracy": 0.48733530612662435, "num_tokens": 188044827.0, "step": 1298 }, { "entropy": 2.4227294921875, "epoch": 0.02236512486764288, "grad_norm": 0.5534366965293884, "learning_rate": 9.999327543517234e-06, "loss": 2.4033, "mean_token_accuracy": 0.4784287307411432, "num_tokens": 188208828.0, "step": 1299 }, { "entropy": 2.42578125, "epoch": 0.022382342053838142, "grad_norm": 0.5828860402107239, "learning_rate": 9.999323022916128e-06, "loss": 2.3879, "mean_token_accuracy": 0.4846663847565651, "num_tokens": 188357779.0, "step": 1300 }, { "entropy": 2.4833984375, "epoch": 0.022399559240033402, "grad_norm": 0.6039474606513977, "learning_rate": 9.999318487171998e-06, "loss": 2.4643, "mean_token_accuracy": 0.47219090024009347, "num_tokens": 188488042.0, "step": 1301 }, { "entropy": 2.443359375, "epoch": 0.022416776426228663, "grad_norm": 0.6493925452232361, "learning_rate": 9.999313936284858e-06, "loss": 2.4156, "mean_token_accuracy": 0.4812625157646835, "num_tokens": 188631539.0, "step": 1302 }, { "entropy": 2.439453125, "epoch": 0.02243399361242392, "grad_norm": 0.590706467628479, "learning_rate": 9.999309370254722e-06, "loss": 2.4059, "mean_token_accuracy": 0.482134644407779, "num_tokens": 188778873.0, "step": 1303 }, { "entropy": 2.376708984375, "epoch": 0.02245121079861918, "grad_norm": 0.7171503305435181, "learning_rate": 9.999304789081604e-06, "loss": 2.3167, "mean_token_accuracy": 0.49248863477259874, "num_tokens": 188929372.0, "step": 1304 }, { "entropy": 2.47119140625, "epoch": 0.02246842798481444, "grad_norm": 0.603790283203125, "learning_rate": 9.999300192765521e-06, "loss": 2.4484, "mean_token_accuracy": 0.4733037226833403, "num_tokens": 189068047.0, "step": 1305 }, { "entropy": 2.41162109375, "epoch": 0.0224856451710097, "grad_norm": 0.5595695376396179, "learning_rate": 9.999295581306483e-06, "loss": 2.3642, "mean_token_accuracy": 0.4862880017608404, "num_tokens": 189213830.0, "step": 1306 }, { "entropy": 2.4998779296875, "epoch": 0.022502862357204962, "grad_norm": 0.616247296333313, "learning_rate": 9.999290954704505e-06, "loss": 2.4518, "mean_token_accuracy": 0.48123760567978024, "num_tokens": 189351112.0, "step": 1307 }, { "entropy": 2.48583984375, "epoch": 0.022520079543400223, "grad_norm": 0.5927817821502686, "learning_rate": 9.999286312959602e-06, "loss": 2.4738, "mean_token_accuracy": 0.476222672034055, "num_tokens": 189488488.0, "step": 1308 }, { "entropy": 2.479736328125, "epoch": 0.022537296729595483, "grad_norm": 0.5709447860717773, "learning_rate": 9.999281656071784e-06, "loss": 2.4563, "mean_token_accuracy": 0.47250251518562436, "num_tokens": 189630709.0, "step": 1309 }, { "entropy": 2.4727783203125, "epoch": 0.022554513915790744, "grad_norm": 0.5687850713729858, "learning_rate": 9.99927698404107e-06, "loss": 2.4368, "mean_token_accuracy": 0.4784710453823209, "num_tokens": 189772213.0, "step": 1310 }, { "entropy": 2.3721923828125, "epoch": 0.022571731101986004, "grad_norm": 0.6119619607925415, "learning_rate": 9.999272296867475e-06, "loss": 2.2997, "mean_token_accuracy": 0.49369402416050434, "num_tokens": 189913660.0, "step": 1311 }, { "entropy": 2.4296875, "epoch": 0.02258894828818126, "grad_norm": 0.61922687292099, "learning_rate": 9.999267594551007e-06, "loss": 2.4161, "mean_token_accuracy": 0.4793846160173416, "num_tokens": 190047956.0, "step": 1312 }, { "entropy": 2.5008544921875, "epoch": 0.02260616547437652, "grad_norm": 0.8448692560195923, "learning_rate": 9.999262877091687e-06, "loss": 2.4671, "mean_token_accuracy": 0.4724547420628369, "num_tokens": 190193511.0, "step": 1313 }, { "entropy": 2.43505859375, "epoch": 0.022623382660571782, "grad_norm": 0.592045247554779, "learning_rate": 9.999258144489524e-06, "loss": 2.3928, "mean_token_accuracy": 0.47932835714891553, "num_tokens": 190339476.0, "step": 1314 }, { "entropy": 2.49169921875, "epoch": 0.022640599846767043, "grad_norm": 0.6020418405532837, "learning_rate": 9.999253396744534e-06, "loss": 2.4387, "mean_token_accuracy": 0.4741725055500865, "num_tokens": 190473944.0, "step": 1315 }, { "entropy": 2.437744140625, "epoch": 0.022657817032962303, "grad_norm": 0.5783674120903015, "learning_rate": 9.999248633856735e-06, "loss": 2.3754, "mean_token_accuracy": 0.48019342171028256, "num_tokens": 190621619.0, "step": 1316 }, { "entropy": 2.5028076171875, "epoch": 0.022675034219157564, "grad_norm": 0.6043087840080261, "learning_rate": 9.999243855826137e-06, "loss": 2.4531, "mean_token_accuracy": 0.4730870760977268, "num_tokens": 190753516.0, "step": 1317 }, { "entropy": 2.45654296875, "epoch": 0.022692251405352824, "grad_norm": 0.646530270576477, "learning_rate": 9.999239062652754e-06, "loss": 2.3895, "mean_token_accuracy": 0.48305864399299026, "num_tokens": 190898210.0, "step": 1318 }, { "entropy": 2.519775390625, "epoch": 0.022709468591548085, "grad_norm": 0.5441105961799622, "learning_rate": 9.999234254336603e-06, "loss": 2.5006, "mean_token_accuracy": 0.465965838637203, "num_tokens": 191072262.0, "step": 1319 }, { "entropy": 2.4124755859375, "epoch": 0.022726685777743342, "grad_norm": 0.5883780121803284, "learning_rate": 9.9992294308777e-06, "loss": 2.3936, "mean_token_accuracy": 0.4880047831684351, "num_tokens": 191216422.0, "step": 1320 }, { "entropy": 2.4327392578125, "epoch": 0.022743902963938602, "grad_norm": 0.5934004783630371, "learning_rate": 9.999224592276055e-06, "loss": 2.3924, "mean_token_accuracy": 0.48111487692222, "num_tokens": 191352211.0, "step": 1321 }, { "entropy": 2.497314453125, "epoch": 0.022761120150133863, "grad_norm": 0.5587040781974792, "learning_rate": 9.999219738531687e-06, "loss": 2.4294, "mean_token_accuracy": 0.4762568627484143, "num_tokens": 191496965.0, "step": 1322 }, { "entropy": 2.4332275390625, "epoch": 0.022778337336329123, "grad_norm": 0.5939592719078064, "learning_rate": 9.999214869644608e-06, "loss": 2.3724, "mean_token_accuracy": 0.4876466835848987, "num_tokens": 191642816.0, "step": 1323 }, { "entropy": 2.3785400390625, "epoch": 0.022795554522524384, "grad_norm": 0.5794064402580261, "learning_rate": 9.999209985614832e-06, "loss": 2.4016, "mean_token_accuracy": 0.48678439343348145, "num_tokens": 191792676.0, "step": 1324 }, { "entropy": 2.4501953125, "epoch": 0.022812771708719645, "grad_norm": 0.5635095238685608, "learning_rate": 9.999205086442378e-06, "loss": 2.4489, "mean_token_accuracy": 0.47591904271394014, "num_tokens": 191943634.0, "step": 1325 }, { "entropy": 2.454833984375, "epoch": 0.022829988894914905, "grad_norm": 0.5465742349624634, "learning_rate": 9.999200172127257e-06, "loss": 2.4428, "mean_token_accuracy": 0.4766951338388026, "num_tokens": 192097281.0, "step": 1326 }, { "entropy": 2.490478515625, "epoch": 0.022847206081110166, "grad_norm": 0.5686846375465393, "learning_rate": 9.999195242669487e-06, "loss": 2.4461, "mean_token_accuracy": 0.4775790839921683, "num_tokens": 192238225.0, "step": 1327 }, { "entropy": 2.546630859375, "epoch": 0.022864423267305423, "grad_norm": 0.5859907865524292, "learning_rate": 9.99919029806908e-06, "loss": 2.5229, "mean_token_accuracy": 0.4657089659012854, "num_tokens": 192380707.0, "step": 1328 }, { "entropy": 2.48095703125, "epoch": 0.022881640453500683, "grad_norm": 0.5355405807495117, "learning_rate": 9.99918533832605e-06, "loss": 2.4508, "mean_token_accuracy": 0.47027599764987826, "num_tokens": 192539763.0, "step": 1329 }, { "entropy": 2.449462890625, "epoch": 0.022898857639695944, "grad_norm": 0.5785843133926392, "learning_rate": 9.999180363440416e-06, "loss": 2.4106, "mean_token_accuracy": 0.48218734934926033, "num_tokens": 192687897.0, "step": 1330 }, { "entropy": 2.4329833984375, "epoch": 0.022916074825891204, "grad_norm": 0.5569692850112915, "learning_rate": 9.99917537341219e-06, "loss": 2.4042, "mean_token_accuracy": 0.4766626372002065, "num_tokens": 192834922.0, "step": 1331 }, { "entropy": 2.3797607421875, "epoch": 0.022933292012086465, "grad_norm": 0.5599532723426819, "learning_rate": 9.999170368241389e-06, "loss": 2.3352, "mean_token_accuracy": 0.4913885644637048, "num_tokens": 192977065.0, "step": 1332 }, { "entropy": 2.492431640625, "epoch": 0.022950509198281725, "grad_norm": 0.5502541661262512, "learning_rate": 9.999165347928028e-06, "loss": 2.4414, "mean_token_accuracy": 0.4727879990823567, "num_tokens": 193116265.0, "step": 1333 }, { "entropy": 2.4232177734375, "epoch": 0.022967726384476986, "grad_norm": 0.5547073483467102, "learning_rate": 9.999160312472121e-06, "loss": 2.3541, "mean_token_accuracy": 0.48737787129357457, "num_tokens": 193276114.0, "step": 1334 }, { "entropy": 2.4656982421875, "epoch": 0.022984943570672246, "grad_norm": 0.5852370858192444, "learning_rate": 9.999155261873682e-06, "loss": 2.446, "mean_token_accuracy": 0.475621965713799, "num_tokens": 193440455.0, "step": 1335 }, { "entropy": 2.421630859375, "epoch": 0.023002160756867507, "grad_norm": 0.5449553728103638, "learning_rate": 9.999150196132731e-06, "loss": 2.3845, "mean_token_accuracy": 0.4841674976050854, "num_tokens": 193607915.0, "step": 1336 }, { "entropy": 2.4388427734375, "epoch": 0.023019377943062764, "grad_norm": 0.5830256342887878, "learning_rate": 9.999145115249278e-06, "loss": 2.4171, "mean_token_accuracy": 0.4808310712687671, "num_tokens": 193747035.0, "step": 1337 }, { "entropy": 2.4093017578125, "epoch": 0.023036595129258024, "grad_norm": 0.5849303603172302, "learning_rate": 9.999140019223343e-06, "loss": 2.3965, "mean_token_accuracy": 0.48324496299028397, "num_tokens": 193882217.0, "step": 1338 }, { "entropy": 2.457275390625, "epoch": 0.023053812315453285, "grad_norm": 0.5599417686462402, "learning_rate": 9.999134908054939e-06, "loss": 2.4355, "mean_token_accuracy": 0.47678020503371954, "num_tokens": 194037678.0, "step": 1339 }, { "entropy": 2.40771484375, "epoch": 0.023071029501648545, "grad_norm": 0.635410726070404, "learning_rate": 9.999129781744081e-06, "loss": 2.3913, "mean_token_accuracy": 0.48584868013858795, "num_tokens": 194181984.0, "step": 1340 }, { "entropy": 2.4732666015625, "epoch": 0.023088246687843806, "grad_norm": 0.6021797060966492, "learning_rate": 9.999124640290787e-06, "loss": 2.4765, "mean_token_accuracy": 0.47223041532561183, "num_tokens": 194330357.0, "step": 1341 }, { "entropy": 2.4163818359375, "epoch": 0.023105463874039067, "grad_norm": 0.6050888895988464, "learning_rate": 9.99911948369507e-06, "loss": 2.4004, "mean_token_accuracy": 0.4854390062391758, "num_tokens": 194466254.0, "step": 1342 }, { "entropy": 2.4708251953125, "epoch": 0.023122681060234327, "grad_norm": 0.5668588876724243, "learning_rate": 9.999114311956946e-06, "loss": 2.4037, "mean_token_accuracy": 0.48454291047528386, "num_tokens": 194616186.0, "step": 1343 }, { "entropy": 2.50146484375, "epoch": 0.023139898246429588, "grad_norm": 0.5625669956207275, "learning_rate": 9.99910912507643e-06, "loss": 2.5026, "mean_token_accuracy": 0.4760167011991143, "num_tokens": 194772091.0, "step": 1344 }, { "entropy": 2.4669189453125, "epoch": 0.023157115432624845, "grad_norm": 0.6059160828590393, "learning_rate": 9.999103923053541e-06, "loss": 2.4043, "mean_token_accuracy": 0.4812000300735235, "num_tokens": 194918670.0, "step": 1345 }, { "entropy": 2.41552734375, "epoch": 0.023174332618820105, "grad_norm": 0.5639379620552063, "learning_rate": 9.999098705888293e-06, "loss": 2.3719, "mean_token_accuracy": 0.4815536108799279, "num_tokens": 195073659.0, "step": 1346 }, { "entropy": 2.4085693359375, "epoch": 0.023191549805015366, "grad_norm": 0.5673343539237976, "learning_rate": 9.9990934735807e-06, "loss": 2.3906, "mean_token_accuracy": 0.4839661535806954, "num_tokens": 195215626.0, "step": 1347 }, { "entropy": 2.357421875, "epoch": 0.023208766991210626, "grad_norm": 0.54830002784729, "learning_rate": 9.99908822613078e-06, "loss": 2.3375, "mean_token_accuracy": 0.49586250027641654, "num_tokens": 195371848.0, "step": 1348 }, { "entropy": 2.517578125, "epoch": 0.023225984177405887, "grad_norm": 0.5844534039497375, "learning_rate": 9.999082963538548e-06, "loss": 2.4989, "mean_token_accuracy": 0.46372106671333313, "num_tokens": 195524114.0, "step": 1349 }, { "entropy": 2.4599609375, "epoch": 0.023243201363601147, "grad_norm": 0.5597494840621948, "learning_rate": 9.99907768580402e-06, "loss": 2.4202, "mean_token_accuracy": 0.4795773741789162, "num_tokens": 195668084.0, "step": 1350 }, { "entropy": 2.4263916015625, "epoch": 0.023260418549796408, "grad_norm": 0.5759093165397644, "learning_rate": 9.999072392927213e-06, "loss": 2.3863, "mean_token_accuracy": 0.48264182545244694, "num_tokens": 195819421.0, "step": 1351 }, { "entropy": 2.4315185546875, "epoch": 0.02327763573599167, "grad_norm": 0.5471842288970947, "learning_rate": 9.999067084908141e-06, "loss": 2.3874, "mean_token_accuracy": 0.48090588115155697, "num_tokens": 195965275.0, "step": 1352 }, { "entropy": 2.430908203125, "epoch": 0.023294852922186925, "grad_norm": 0.5952867865562439, "learning_rate": 9.999061761746822e-06, "loss": 2.3695, "mean_token_accuracy": 0.48542801523581147, "num_tokens": 196100273.0, "step": 1353 }, { "entropy": 2.4898681640625, "epoch": 0.023312070108382186, "grad_norm": 0.5692234635353088, "learning_rate": 9.999056423443272e-06, "loss": 2.4315, "mean_token_accuracy": 0.47685773856937885, "num_tokens": 196241976.0, "step": 1354 }, { "entropy": 2.411376953125, "epoch": 0.023329287294577446, "grad_norm": 0.5885135531425476, "learning_rate": 9.999051069997505e-06, "loss": 2.3328, "mean_token_accuracy": 0.48680604388937354, "num_tokens": 196397855.0, "step": 1355 }, { "entropy": 2.439208984375, "epoch": 0.023346504480772707, "grad_norm": 0.5604063272476196, "learning_rate": 9.999045701409539e-06, "loss": 2.3956, "mean_token_accuracy": 0.48355547012761235, "num_tokens": 196557337.0, "step": 1356 }, { "entropy": 2.4569091796875, "epoch": 0.023363721666967967, "grad_norm": 0.5709223747253418, "learning_rate": 9.99904031767939e-06, "loss": 2.4283, "mean_token_accuracy": 0.47822416480630636, "num_tokens": 196698366.0, "step": 1357 }, { "entropy": 2.3804931640625, "epoch": 0.023380938853163228, "grad_norm": 0.5742472410202026, "learning_rate": 9.999034918807075e-06, "loss": 2.3166, "mean_token_accuracy": 0.49618813674896955, "num_tokens": 196847854.0, "step": 1358 }, { "entropy": 2.4959716796875, "epoch": 0.02339815603935849, "grad_norm": 0.5980573892593384, "learning_rate": 9.99902950479261e-06, "loss": 2.4141, "mean_token_accuracy": 0.4758835444226861, "num_tokens": 196971667.0, "step": 1359 }, { "entropy": 2.532470703125, "epoch": 0.02341537322555375, "grad_norm": 0.5753028988838196, "learning_rate": 9.99902407563601e-06, "loss": 2.5277, "mean_token_accuracy": 0.473021827172488, "num_tokens": 197117509.0, "step": 1360 }, { "entropy": 2.3953857421875, "epoch": 0.02343259041174901, "grad_norm": 0.6161779761314392, "learning_rate": 9.999018631337294e-06, "loss": 2.3512, "mean_token_accuracy": 0.4856503955088556, "num_tokens": 197251978.0, "step": 1361 }, { "entropy": 2.5050048828125, "epoch": 0.023449807597944267, "grad_norm": 0.5542529225349426, "learning_rate": 9.999013171896476e-06, "loss": 2.5021, "mean_token_accuracy": 0.46956229070201516, "num_tokens": 197391278.0, "step": 1362 }, { "entropy": 2.420654296875, "epoch": 0.023467024784139527, "grad_norm": 0.5800600051879883, "learning_rate": 9.999007697313573e-06, "loss": 2.3979, "mean_token_accuracy": 0.48410065239295363, "num_tokens": 197531015.0, "step": 1363 }, { "entropy": 2.4874267578125, "epoch": 0.023484241970334788, "grad_norm": 0.6007982492446899, "learning_rate": 9.999002207588603e-06, "loss": 2.423, "mean_token_accuracy": 0.4762545581907034, "num_tokens": 197661883.0, "step": 1364 }, { "entropy": 2.4798583984375, "epoch": 0.023501459156530048, "grad_norm": 0.6128295660018921, "learning_rate": 9.998996702721582e-06, "loss": 2.4135, "mean_token_accuracy": 0.4799572662450373, "num_tokens": 197799526.0, "step": 1365 }, { "entropy": 2.499267578125, "epoch": 0.02351867634272531, "grad_norm": 0.574564516544342, "learning_rate": 9.998991182712526e-06, "loss": 2.4331, "mean_token_accuracy": 0.4757503070868552, "num_tokens": 197966743.0, "step": 1366 }, { "entropy": 2.4637451171875, "epoch": 0.02353589352892057, "grad_norm": 0.5752230882644653, "learning_rate": 9.998985647561453e-06, "loss": 2.438, "mean_token_accuracy": 0.4770435835234821, "num_tokens": 198124743.0, "step": 1367 }, { "entropy": 2.42333984375, "epoch": 0.02355311071511583, "grad_norm": 0.5870989561080933, "learning_rate": 9.998980097268375e-06, "loss": 2.3783, "mean_token_accuracy": 0.487036875449121, "num_tokens": 198265217.0, "step": 1368 }, { "entropy": 2.399169921875, "epoch": 0.02357032790131109, "grad_norm": 0.5645302534103394, "learning_rate": 9.998974531833316e-06, "loss": 2.3897, "mean_token_accuracy": 0.487074441742152, "num_tokens": 198402224.0, "step": 1369 }, { "entropy": 2.40234375, "epoch": 0.023587545087506347, "grad_norm": 0.6568628549575806, "learning_rate": 9.99896895125629e-06, "loss": 2.3445, "mean_token_accuracy": 0.4943762863986194, "num_tokens": 198536773.0, "step": 1370 }, { "entropy": 2.462890625, "epoch": 0.023604762273701608, "grad_norm": 0.5660800337791443, "learning_rate": 9.998963355537313e-06, "loss": 2.4593, "mean_token_accuracy": 0.4756892016157508, "num_tokens": 198690958.0, "step": 1371 }, { "entropy": 2.48193359375, "epoch": 0.02362197945989687, "grad_norm": 0.6144444346427917, "learning_rate": 9.998957744676403e-06, "loss": 2.4622, "mean_token_accuracy": 0.4751414889469743, "num_tokens": 198845530.0, "step": 1372 }, { "entropy": 2.463623046875, "epoch": 0.02363919664609213, "grad_norm": 0.6004127264022827, "learning_rate": 9.998952118673575e-06, "loss": 2.4163, "mean_token_accuracy": 0.4740082244388759, "num_tokens": 198973843.0, "step": 1373 }, { "entropy": 2.4368896484375, "epoch": 0.02365641383228739, "grad_norm": 0.5763023495674133, "learning_rate": 9.998946477528848e-06, "loss": 2.4106, "mean_token_accuracy": 0.4760644896887243, "num_tokens": 199120869.0, "step": 1374 }, { "entropy": 2.455078125, "epoch": 0.02367363101848265, "grad_norm": 0.5937883257865906, "learning_rate": 9.99894082124224e-06, "loss": 2.4061, "mean_token_accuracy": 0.47505834326148033, "num_tokens": 199271029.0, "step": 1375 }, { "entropy": 2.4976806640625, "epoch": 0.02369084820467791, "grad_norm": 0.5722872614860535, "learning_rate": 9.998935149813766e-06, "loss": 2.4279, "mean_token_accuracy": 0.4803337692283094, "num_tokens": 199421781.0, "step": 1376 }, { "entropy": 2.4547119140625, "epoch": 0.02370806539087317, "grad_norm": 0.5956291556358337, "learning_rate": 9.998929463243443e-06, "loss": 2.4151, "mean_token_accuracy": 0.478096229955554, "num_tokens": 199575267.0, "step": 1377 }, { "entropy": 2.51513671875, "epoch": 0.023725282577068428, "grad_norm": 0.5729405879974365, "learning_rate": 9.99892376153129e-06, "loss": 2.4828, "mean_token_accuracy": 0.4753085137344897, "num_tokens": 199718432.0, "step": 1378 }, { "entropy": 2.4134521484375, "epoch": 0.02374249976326369, "grad_norm": 0.5974603295326233, "learning_rate": 9.998918044677321e-06, "loss": 2.3474, "mean_token_accuracy": 0.4921872243285179, "num_tokens": 199860520.0, "step": 1379 }, { "entropy": 2.4600830078125, "epoch": 0.02375971694945895, "grad_norm": 0.5969933271408081, "learning_rate": 9.99891231268156e-06, "loss": 2.4926, "mean_token_accuracy": 0.4793005189858377, "num_tokens": 200000679.0, "step": 1380 }, { "entropy": 2.45556640625, "epoch": 0.02377693413565421, "grad_norm": 0.5951429605484009, "learning_rate": 9.998906565544017e-06, "loss": 2.4085, "mean_token_accuracy": 0.4794029123149812, "num_tokens": 200145478.0, "step": 1381 }, { "entropy": 2.4609375, "epoch": 0.02379415132184947, "grad_norm": 0.6001779437065125, "learning_rate": 9.998900803264714e-06, "loss": 2.4392, "mean_token_accuracy": 0.4742898163385689, "num_tokens": 200297504.0, "step": 1382 }, { "entropy": 2.4384765625, "epoch": 0.02381136850804473, "grad_norm": 0.5844825506210327, "learning_rate": 9.998895025843664e-06, "loss": 2.3799, "mean_token_accuracy": 0.4815810131840408, "num_tokens": 200438128.0, "step": 1383 }, { "entropy": 2.439453125, "epoch": 0.02382858569423999, "grad_norm": 0.5872356295585632, "learning_rate": 9.99888923328089e-06, "loss": 2.375, "mean_token_accuracy": 0.48686960944905877, "num_tokens": 200576616.0, "step": 1384 }, { "entropy": 2.4556884765625, "epoch": 0.023845802880435252, "grad_norm": 0.5676760077476501, "learning_rate": 9.998883425576407e-06, "loss": 2.4051, "mean_token_accuracy": 0.47929678950458765, "num_tokens": 200715807.0, "step": 1385 }, { "entropy": 2.5262451171875, "epoch": 0.023863020066630512, "grad_norm": 0.6212140321731567, "learning_rate": 9.998877602730231e-06, "loss": 2.5013, "mean_token_accuracy": 0.46635109139606357, "num_tokens": 200852036.0, "step": 1386 }, { "entropy": 2.466796875, "epoch": 0.02388023725282577, "grad_norm": 0.5637140870094299, "learning_rate": 9.998871764742383e-06, "loss": 2.4031, "mean_token_accuracy": 0.4789075516164303, "num_tokens": 201005894.0, "step": 1387 }, { "entropy": 2.432373046875, "epoch": 0.02389745443902103, "grad_norm": 0.554351806640625, "learning_rate": 9.998865911612878e-06, "loss": 2.334, "mean_token_accuracy": 0.48602917743846774, "num_tokens": 201163254.0, "step": 1388 }, { "entropy": 2.4622802734375, "epoch": 0.02391467162521629, "grad_norm": 0.5873886346817017, "learning_rate": 9.998860043341733e-06, "loss": 2.4122, "mean_token_accuracy": 0.4796907198615372, "num_tokens": 201311034.0, "step": 1389 }, { "entropy": 2.5067138671875, "epoch": 0.02393188881141155, "grad_norm": 0.589946985244751, "learning_rate": 9.99885415992897e-06, "loss": 2.4737, "mean_token_accuracy": 0.46914456551894546, "num_tokens": 201439256.0, "step": 1390 }, { "entropy": 2.5040283203125, "epoch": 0.02394910599760681, "grad_norm": 0.5907305479049683, "learning_rate": 9.998848261374602e-06, "loss": 2.5056, "mean_token_accuracy": 0.4672775506041944, "num_tokens": 201575176.0, "step": 1391 }, { "entropy": 2.432373046875, "epoch": 0.023966323183802072, "grad_norm": 0.5816705822944641, "learning_rate": 9.998842347678652e-06, "loss": 2.4009, "mean_token_accuracy": 0.48051235545426607, "num_tokens": 201741972.0, "step": 1392 }, { "entropy": 2.532470703125, "epoch": 0.023983540369997332, "grad_norm": 0.5787369012832642, "learning_rate": 9.998836418841133e-06, "loss": 2.5186, "mean_token_accuracy": 0.46860673651099205, "num_tokens": 201887196.0, "step": 1393 }, { "entropy": 2.3990478515625, "epoch": 0.024000757556192593, "grad_norm": 0.6086745262145996, "learning_rate": 9.998830474862064e-06, "loss": 2.3506, "mean_token_accuracy": 0.4886458469554782, "num_tokens": 202021347.0, "step": 1394 }, { "entropy": 2.41748046875, "epoch": 0.02401797474238785, "grad_norm": 0.5796844363212585, "learning_rate": 9.998824515741467e-06, "loss": 2.3866, "mean_token_accuracy": 0.4884433811530471, "num_tokens": 202160896.0, "step": 1395 }, { "entropy": 2.5753173828125, "epoch": 0.02403519192858311, "grad_norm": 0.5776175856590271, "learning_rate": 9.998818541479355e-06, "loss": 2.5233, "mean_token_accuracy": 0.4616355444304645, "num_tokens": 202292953.0, "step": 1396 }, { "entropy": 2.39892578125, "epoch": 0.02405240911477837, "grad_norm": 0.5363594889640808, "learning_rate": 9.998812552075747e-06, "loss": 2.3689, "mean_token_accuracy": 0.4840633156709373, "num_tokens": 202460982.0, "step": 1397 }, { "entropy": 2.39208984375, "epoch": 0.02406962630097363, "grad_norm": 0.5426232218742371, "learning_rate": 9.998806547530664e-06, "loss": 2.3481, "mean_token_accuracy": 0.48746701143682003, "num_tokens": 202621458.0, "step": 1398 }, { "entropy": 2.4622802734375, "epoch": 0.024086843487168892, "grad_norm": 0.5721414089202881, "learning_rate": 9.998800527844122e-06, "loss": 2.4094, "mean_token_accuracy": 0.4796240641735494, "num_tokens": 202756740.0, "step": 1399 }, { "entropy": 2.363037109375, "epoch": 0.024104060673364153, "grad_norm": 0.5773559212684631, "learning_rate": 9.99879449301614e-06, "loss": 2.3559, "mean_token_accuracy": 0.49749248987063766, "num_tokens": 202900096.0, "step": 1400 }, { "entropy": 2.53125, "epoch": 0.024121277859559413, "grad_norm": 0.611416220664978, "learning_rate": 9.998788443046735e-06, "loss": 2.5107, "mean_token_accuracy": 0.4666637545451522, "num_tokens": 203038362.0, "step": 1401 }, { "entropy": 2.4573974609375, "epoch": 0.024138495045754674, "grad_norm": 0.6409509778022766, "learning_rate": 9.998782377935927e-06, "loss": 2.423, "mean_token_accuracy": 0.4764982839114964, "num_tokens": 203176454.0, "step": 1402 }, { "entropy": 2.419921875, "epoch": 0.02415571223194993, "grad_norm": 0.5751169323921204, "learning_rate": 9.998776297683733e-06, "loss": 2.3627, "mean_token_accuracy": 0.48058157647028565, "num_tokens": 203311141.0, "step": 1403 }, { "entropy": 2.4736328125, "epoch": 0.02417292941814519, "grad_norm": 0.580534040927887, "learning_rate": 9.998770202290173e-06, "loss": 2.449, "mean_token_accuracy": 0.4753856221213937, "num_tokens": 203452951.0, "step": 1404 }, { "entropy": 2.426513671875, "epoch": 0.024190146604340452, "grad_norm": 0.5762086510658264, "learning_rate": 9.998764091755264e-06, "loss": 2.3869, "mean_token_accuracy": 0.4834116967394948, "num_tokens": 203600387.0, "step": 1405 }, { "entropy": 2.509521484375, "epoch": 0.024207363790535712, "grad_norm": 0.5431259870529175, "learning_rate": 9.998757966079024e-06, "loss": 2.486, "mean_token_accuracy": 0.4661334496922791, "num_tokens": 203751350.0, "step": 1406 }, { "entropy": 2.499267578125, "epoch": 0.024224580976730973, "grad_norm": 0.6089497804641724, "learning_rate": 9.998751825261474e-06, "loss": 2.4389, "mean_token_accuracy": 0.4734781333245337, "num_tokens": 203902498.0, "step": 1407 }, { "entropy": 2.48583984375, "epoch": 0.024241798162926233, "grad_norm": 0.5681830048561096, "learning_rate": 9.998745669302632e-06, "loss": 2.4445, "mean_token_accuracy": 0.4708978096023202, "num_tokens": 204044975.0, "step": 1408 }, { "entropy": 2.4814453125, "epoch": 0.024259015349121494, "grad_norm": 0.5554722547531128, "learning_rate": 9.998739498202514e-06, "loss": 2.4522, "mean_token_accuracy": 0.47262357315048575, "num_tokens": 204205348.0, "step": 1409 }, { "entropy": 2.444580078125, "epoch": 0.024276232535316754, "grad_norm": 0.5820813775062561, "learning_rate": 9.99873331196114e-06, "loss": 2.3785, "mean_token_accuracy": 0.4880591509863734, "num_tokens": 204345568.0, "step": 1410 }, { "entropy": 2.45556640625, "epoch": 0.024293449721512015, "grad_norm": 0.5614075660705566, "learning_rate": 9.99872711057853e-06, "loss": 2.4034, "mean_token_accuracy": 0.4788000574335456, "num_tokens": 204494358.0, "step": 1411 }, { "entropy": 2.4716796875, "epoch": 0.024310666907707272, "grad_norm": 0.5851388573646545, "learning_rate": 9.998720894054703e-06, "loss": 2.4773, "mean_token_accuracy": 0.4750785259529948, "num_tokens": 204626789.0, "step": 1412 }, { "entropy": 2.45458984375, "epoch": 0.024327884093902533, "grad_norm": 0.7529696822166443, "learning_rate": 9.998714662389676e-06, "loss": 2.4149, "mean_token_accuracy": 0.4763760110363364, "num_tokens": 204774398.0, "step": 1413 }, { "entropy": 2.4195556640625, "epoch": 0.024345101280097793, "grad_norm": 0.6168255805969238, "learning_rate": 9.99870841558347e-06, "loss": 2.3811, "mean_token_accuracy": 0.4846136327832937, "num_tokens": 204924934.0, "step": 1414 }, { "entropy": 2.4605712890625, "epoch": 0.024362318466293054, "grad_norm": 0.6109985113143921, "learning_rate": 9.998702153636102e-06, "loss": 2.4489, "mean_token_accuracy": 0.4767269352450967, "num_tokens": 205059906.0, "step": 1415 }, { "entropy": 2.436767578125, "epoch": 0.024379535652488314, "grad_norm": 0.5300309658050537, "learning_rate": 9.998695876547591e-06, "loss": 2.3666, "mean_token_accuracy": 0.4840262867510319, "num_tokens": 205220596.0, "step": 1416 }, { "entropy": 2.429931640625, "epoch": 0.024396752838683575, "grad_norm": 0.5640124678611755, "learning_rate": 9.998689584317955e-06, "loss": 2.3631, "mean_token_accuracy": 0.49288097163662314, "num_tokens": 205386977.0, "step": 1417 }, { "entropy": 2.46435546875, "epoch": 0.024413970024878835, "grad_norm": 0.5903874635696411, "learning_rate": 9.998683276947219e-06, "loss": 2.4291, "mean_token_accuracy": 0.47703268751502037, "num_tokens": 205526617.0, "step": 1418 }, { "entropy": 2.4453125, "epoch": 0.024431187211074096, "grad_norm": 0.567484438419342, "learning_rate": 9.998676954435394e-06, "loss": 2.3968, "mean_token_accuracy": 0.4830042072571814, "num_tokens": 205669600.0, "step": 1419 }, { "entropy": 2.4515380859375, "epoch": 0.024448404397269353, "grad_norm": 0.5405850410461426, "learning_rate": 9.998670616782505e-06, "loss": 2.3772, "mean_token_accuracy": 0.4838403551839292, "num_tokens": 205821251.0, "step": 1420 }, { "entropy": 2.4334716796875, "epoch": 0.024465621583464613, "grad_norm": 0.5751288533210754, "learning_rate": 9.99866426398857e-06, "loss": 2.3644, "mean_token_accuracy": 0.48748740553855896, "num_tokens": 205964418.0, "step": 1421 }, { "entropy": 2.474853515625, "epoch": 0.024482838769659874, "grad_norm": 0.5722588896751404, "learning_rate": 9.998657896053604e-06, "loss": 2.4274, "mean_token_accuracy": 0.47551520401611924, "num_tokens": 206095207.0, "step": 1422 }, { "entropy": 2.470458984375, "epoch": 0.024500055955855134, "grad_norm": 0.6179440021514893, "learning_rate": 9.998651512977631e-06, "loss": 2.4267, "mean_token_accuracy": 0.47643746761605144, "num_tokens": 206240116.0, "step": 1423 }, { "entropy": 2.4114990234375, "epoch": 0.024517273142050395, "grad_norm": 0.608542799949646, "learning_rate": 9.99864511476067e-06, "loss": 2.3299, "mean_token_accuracy": 0.4850447475910187, "num_tokens": 206389712.0, "step": 1424 }, { "entropy": 2.45263671875, "epoch": 0.024534490328245655, "grad_norm": 0.5260599851608276, "learning_rate": 9.998638701402739e-06, "loss": 2.4158, "mean_token_accuracy": 0.4785124696791172, "num_tokens": 206548123.0, "step": 1425 }, { "entropy": 2.39892578125, "epoch": 0.024551707514440916, "grad_norm": 0.6076765060424805, "learning_rate": 9.998632272903858e-06, "loss": 2.3619, "mean_token_accuracy": 0.49168099416419864, "num_tokens": 206687271.0, "step": 1426 }, { "entropy": 2.499755859375, "epoch": 0.024568924700636176, "grad_norm": 0.6022489070892334, "learning_rate": 9.998625829264045e-06, "loss": 2.487, "mean_token_accuracy": 0.46968710515648127, "num_tokens": 206824438.0, "step": 1427 }, { "entropy": 2.4072265625, "epoch": 0.024586141886831434, "grad_norm": 0.5461668372154236, "learning_rate": 9.998619370483324e-06, "loss": 2.3735, "mean_token_accuracy": 0.49147024331614375, "num_tokens": 206981519.0, "step": 1428 }, { "entropy": 2.4703369140625, "epoch": 0.024603359073026694, "grad_norm": 0.5643055438995361, "learning_rate": 9.998612896561709e-06, "loss": 2.4402, "mean_token_accuracy": 0.47540956176817417, "num_tokens": 207129316.0, "step": 1429 }, { "entropy": 2.430908203125, "epoch": 0.024620576259221955, "grad_norm": 0.5822505950927734, "learning_rate": 9.998606407499222e-06, "loss": 2.3844, "mean_token_accuracy": 0.4864038904197514, "num_tokens": 207277904.0, "step": 1430 }, { "entropy": 2.41015625, "epoch": 0.024637793445417215, "grad_norm": 0.5898188352584839, "learning_rate": 9.998599903295883e-06, "loss": 2.3588, "mean_token_accuracy": 0.4909462621435523, "num_tokens": 207422167.0, "step": 1431 }, { "entropy": 2.4686279296875, "epoch": 0.024655010631612476, "grad_norm": 0.5768987536430359, "learning_rate": 9.99859338395171e-06, "loss": 2.4193, "mean_token_accuracy": 0.47896814439445734, "num_tokens": 207560662.0, "step": 1432 }, { "entropy": 2.5447998046875, "epoch": 0.024672227817807736, "grad_norm": 0.582497239112854, "learning_rate": 9.998586849466727e-06, "loss": 2.5383, "mean_token_accuracy": 0.46700617065653205, "num_tokens": 207702226.0, "step": 1433 }, { "entropy": 2.4814453125, "epoch": 0.024689445004002997, "grad_norm": 0.6393965482711792, "learning_rate": 9.99858029984095e-06, "loss": 2.438, "mean_token_accuracy": 0.47804489312693477, "num_tokens": 207827846.0, "step": 1434 }, { "entropy": 2.3828125, "epoch": 0.024706662190198257, "grad_norm": 0.618248701095581, "learning_rate": 9.998573735074398e-06, "loss": 2.3306, "mean_token_accuracy": 0.48714121896773577, "num_tokens": 207996679.0, "step": 1435 }, { "entropy": 2.452880859375, "epoch": 0.024723879376393518, "grad_norm": 0.6095260381698608, "learning_rate": 9.998567155167094e-06, "loss": 2.3982, "mean_token_accuracy": 0.48710576351732016, "num_tokens": 208129844.0, "step": 1436 }, { "entropy": 2.4764404296875, "epoch": 0.024741096562588775, "grad_norm": 0.5430065393447876, "learning_rate": 9.998560560119058e-06, "loss": 2.4219, "mean_token_accuracy": 0.47808440774679184, "num_tokens": 208273997.0, "step": 1437 }, { "entropy": 2.447021484375, "epoch": 0.024758313748784035, "grad_norm": 0.6290302276611328, "learning_rate": 9.998553949930306e-06, "loss": 2.3994, "mean_token_accuracy": 0.484294559340924, "num_tokens": 208429606.0, "step": 1438 }, { "entropy": 2.431640625, "epoch": 0.024775530934979296, "grad_norm": 0.5840831398963928, "learning_rate": 9.99854732460086e-06, "loss": 2.3615, "mean_token_accuracy": 0.4878291576169431, "num_tokens": 208577096.0, "step": 1439 }, { "entropy": 2.4404296875, "epoch": 0.024792748121174556, "grad_norm": 0.5737127065658569, "learning_rate": 9.998540684130743e-06, "loss": 2.4082, "mean_token_accuracy": 0.48507948173210025, "num_tokens": 208744507.0, "step": 1440 }, { "entropy": 2.380126953125, "epoch": 0.024809965307369817, "grad_norm": 0.5842684507369995, "learning_rate": 9.998534028519971e-06, "loss": 2.3265, "mean_token_accuracy": 0.49405472399666905, "num_tokens": 208887356.0, "step": 1441 }, { "entropy": 2.447265625, "epoch": 0.024827182493565077, "grad_norm": 0.5664902329444885, "learning_rate": 9.998527357768566e-06, "loss": 2.4049, "mean_token_accuracy": 0.4856785982847214, "num_tokens": 209035653.0, "step": 1442 }, { "entropy": 2.5068359375, "epoch": 0.024844399679760338, "grad_norm": 0.5968948602676392, "learning_rate": 9.99852067187655e-06, "loss": 2.4616, "mean_token_accuracy": 0.4748954540118575, "num_tokens": 209174761.0, "step": 1443 }, { "entropy": 2.4140625, "epoch": 0.0248616168659556, "grad_norm": 0.5464600920677185, "learning_rate": 9.99851397084394e-06, "loss": 2.391, "mean_token_accuracy": 0.4841625513508916, "num_tokens": 209335128.0, "step": 1444 }, { "entropy": 2.406494140625, "epoch": 0.024878834052150856, "grad_norm": 0.5924455523490906, "learning_rate": 9.998507254670757e-06, "loss": 2.3687, "mean_token_accuracy": 0.4873992637731135, "num_tokens": 209494921.0, "step": 1445 }, { "entropy": 2.4461669921875, "epoch": 0.024896051238346116, "grad_norm": 0.5587389469146729, "learning_rate": 9.998500523357022e-06, "loss": 2.4399, "mean_token_accuracy": 0.4791180179454386, "num_tokens": 209636823.0, "step": 1446 }, { "entropy": 2.4124755859375, "epoch": 0.024913268424541377, "grad_norm": 0.5826576948165894, "learning_rate": 9.998493776902756e-06, "loss": 2.3668, "mean_token_accuracy": 0.4858208396472037, "num_tokens": 209782197.0, "step": 1447 }, { "entropy": 2.4521484375, "epoch": 0.024930485610736637, "grad_norm": 0.6207579374313354, "learning_rate": 9.998487015307978e-06, "loss": 2.392, "mean_token_accuracy": 0.4869683226570487, "num_tokens": 209916075.0, "step": 1448 }, { "entropy": 2.4453125, "epoch": 0.024947702796931898, "grad_norm": 0.640599250793457, "learning_rate": 9.998480238572711e-06, "loss": 2.383, "mean_token_accuracy": 0.480794875882566, "num_tokens": 210062565.0, "step": 1449 }, { "entropy": 2.4508056640625, "epoch": 0.024964919983127158, "grad_norm": 0.5667426586151123, "learning_rate": 9.998473446696972e-06, "loss": 2.4082, "mean_token_accuracy": 0.4857399258762598, "num_tokens": 210214070.0, "step": 1450 }, { "entropy": 2.4837646484375, "epoch": 0.02498213716932242, "grad_norm": 0.5633426904678345, "learning_rate": 9.998466639680786e-06, "loss": 2.4637, "mean_token_accuracy": 0.47439560387283564, "num_tokens": 210356264.0, "step": 1451 }, { "entropy": 2.3912353515625, "epoch": 0.02499935435551768, "grad_norm": 0.621870219707489, "learning_rate": 9.998459817524168e-06, "loss": 2.3623, "mean_token_accuracy": 0.4901335104368627, "num_tokens": 210509728.0, "step": 1452 }, { "entropy": 2.4949951171875, "epoch": 0.025016571541712936, "grad_norm": 0.5852912664413452, "learning_rate": 9.998452980227143e-06, "loss": 2.4597, "mean_token_accuracy": 0.4733400009572506, "num_tokens": 210644493.0, "step": 1453 }, { "entropy": 2.434326171875, "epoch": 0.025033788727908197, "grad_norm": 0.5888333320617676, "learning_rate": 9.998446127789731e-06, "loss": 2.4475, "mean_token_accuracy": 0.4807679671794176, "num_tokens": 210791477.0, "step": 1454 }, { "entropy": 2.44189453125, "epoch": 0.025051005914103457, "grad_norm": 0.5960559248924255, "learning_rate": 9.998439260211953e-06, "loss": 2.4365, "mean_token_accuracy": 0.4799190401099622, "num_tokens": 210939578.0, "step": 1455 }, { "entropy": 2.432373046875, "epoch": 0.025068223100298718, "grad_norm": 0.5698879957199097, "learning_rate": 9.998432377493826e-06, "loss": 2.3899, "mean_token_accuracy": 0.4848698596470058, "num_tokens": 211096780.0, "step": 1456 }, { "entropy": 2.3997802734375, "epoch": 0.02508544028649398, "grad_norm": 0.5995359420776367, "learning_rate": 9.998425479635373e-06, "loss": 2.383, "mean_token_accuracy": 0.4919665399938822, "num_tokens": 211234392.0, "step": 1457 }, { "entropy": 2.497802734375, "epoch": 0.02510265747268924, "grad_norm": 0.5807369351387024, "learning_rate": 9.99841856663662e-06, "loss": 2.4674, "mean_token_accuracy": 0.47135368920862675, "num_tokens": 211366377.0, "step": 1458 }, { "entropy": 2.4554443359375, "epoch": 0.0251198746588845, "grad_norm": 0.5541589260101318, "learning_rate": 9.99841163849758e-06, "loss": 2.4065, "mean_token_accuracy": 0.48336717346683145, "num_tokens": 211517256.0, "step": 1459 }, { "entropy": 2.473876953125, "epoch": 0.02513709184507976, "grad_norm": 0.597100555896759, "learning_rate": 9.99840469521828e-06, "loss": 2.4374, "mean_token_accuracy": 0.4758867691271007, "num_tokens": 211656203.0, "step": 1460 }, { "entropy": 2.459228515625, "epoch": 0.025154309031275017, "grad_norm": 0.5845798850059509, "learning_rate": 9.998397736798737e-06, "loss": 2.4206, "mean_token_accuracy": 0.4736228142865002, "num_tokens": 211798384.0, "step": 1461 }, { "entropy": 2.455078125, "epoch": 0.025171526217470278, "grad_norm": 0.6050572395324707, "learning_rate": 9.998390763238975e-06, "loss": 2.4037, "mean_token_accuracy": 0.48261626344174147, "num_tokens": 211932728.0, "step": 1462 }, { "entropy": 2.4322509765625, "epoch": 0.025188743403665538, "grad_norm": 0.5519930124282837, "learning_rate": 9.998383774539013e-06, "loss": 2.3796, "mean_token_accuracy": 0.48328032717108727, "num_tokens": 212075313.0, "step": 1463 }, { "entropy": 2.4688720703125, "epoch": 0.0252059605898608, "grad_norm": 0.5802918672561646, "learning_rate": 9.998376770698875e-06, "loss": 2.4487, "mean_token_accuracy": 0.4819605387747288, "num_tokens": 212220989.0, "step": 1464 }, { "entropy": 2.432373046875, "epoch": 0.02522317777605606, "grad_norm": 0.5660965442657471, "learning_rate": 9.998369751718577e-06, "loss": 2.3826, "mean_token_accuracy": 0.48062465619295835, "num_tokens": 212377908.0, "step": 1465 }, { "entropy": 2.5208740234375, "epoch": 0.02524039496225132, "grad_norm": 0.6206576228141785, "learning_rate": 9.998362717598144e-06, "loss": 2.5238, "mean_token_accuracy": 0.46971421781927347, "num_tokens": 212521169.0, "step": 1466 }, { "entropy": 2.45263671875, "epoch": 0.02525761214844658, "grad_norm": 0.5654458403587341, "learning_rate": 9.998355668337599e-06, "loss": 2.4136, "mean_token_accuracy": 0.4812098955735564, "num_tokens": 212661516.0, "step": 1467 }, { "entropy": 2.4288330078125, "epoch": 0.02527482933464184, "grad_norm": 0.5742049217224121, "learning_rate": 9.99834860393696e-06, "loss": 2.3939, "mean_token_accuracy": 0.4802519157528877, "num_tokens": 212802767.0, "step": 1468 }, { "entropy": 2.47021484375, "epoch": 0.0252920465208371, "grad_norm": 0.5584784150123596, "learning_rate": 9.998341524396249e-06, "loss": 2.4361, "mean_token_accuracy": 0.47756797447800636, "num_tokens": 212937639.0, "step": 1469 }, { "entropy": 2.4468994140625, "epoch": 0.025309263707032358, "grad_norm": 0.7078640460968018, "learning_rate": 9.998334429715488e-06, "loss": 2.3877, "mean_token_accuracy": 0.4793696654960513, "num_tokens": 213079562.0, "step": 1470 }, { "entropy": 2.39892578125, "epoch": 0.02532648089322762, "grad_norm": 0.5251427292823792, "learning_rate": 9.998327319894699e-06, "loss": 2.3489, "mean_token_accuracy": 0.4894181271083653, "num_tokens": 213245711.0, "step": 1471 }, { "entropy": 2.3956298828125, "epoch": 0.02534369807942288, "grad_norm": 0.5619721412658691, "learning_rate": 9.998320194933904e-06, "loss": 2.3521, "mean_token_accuracy": 0.49081948725506663, "num_tokens": 213387544.0, "step": 1472 }, { "entropy": 2.41552734375, "epoch": 0.02536091526561814, "grad_norm": 0.5867946147918701, "learning_rate": 9.998313054833123e-06, "loss": 2.3628, "mean_token_accuracy": 0.48890325892716646, "num_tokens": 213525323.0, "step": 1473 }, { "entropy": 2.5234375, "epoch": 0.0253781324518134, "grad_norm": 0.5271015763282776, "learning_rate": 9.998305899592378e-06, "loss": 2.4971, "mean_token_accuracy": 0.4719184576533735, "num_tokens": 213676017.0, "step": 1474 }, { "entropy": 2.4503173828125, "epoch": 0.02539534963800866, "grad_norm": 0.5921691060066223, "learning_rate": 9.99829872921169e-06, "loss": 2.4328, "mean_token_accuracy": 0.4846241660416126, "num_tokens": 213819688.0, "step": 1475 }, { "entropy": 2.4017333984375, "epoch": 0.02541256682420392, "grad_norm": 0.5926265716552734, "learning_rate": 9.998291543691081e-06, "loss": 2.3141, "mean_token_accuracy": 0.4890713025815785, "num_tokens": 213966213.0, "step": 1476 }, { "entropy": 2.418212890625, "epoch": 0.025429784010399182, "grad_norm": 0.5591694116592407, "learning_rate": 9.998284343030575e-06, "loss": 2.3925, "mean_token_accuracy": 0.4819628051482141, "num_tokens": 214097650.0, "step": 1477 }, { "entropy": 2.39453125, "epoch": 0.02544700119659444, "grad_norm": 0.5935323238372803, "learning_rate": 9.99827712723019e-06, "loss": 2.3999, "mean_token_accuracy": 0.48563890950754285, "num_tokens": 214233345.0, "step": 1478 }, { "entropy": 2.5748291015625, "epoch": 0.0254642183827897, "grad_norm": 0.5952027440071106, "learning_rate": 9.998269896289953e-06, "loss": 2.531, "mean_token_accuracy": 0.46888946322724223, "num_tokens": 214371139.0, "step": 1479 }, { "entropy": 2.436767578125, "epoch": 0.02548143556898496, "grad_norm": 0.5706344842910767, "learning_rate": 9.998262650209882e-06, "loss": 2.4313, "mean_token_accuracy": 0.4844216462224722, "num_tokens": 214524344.0, "step": 1480 }, { "entropy": 2.444580078125, "epoch": 0.02549865275518022, "grad_norm": 0.6236752271652222, "learning_rate": 9.99825538899e-06, "loss": 2.4011, "mean_token_accuracy": 0.48665000684559345, "num_tokens": 214685683.0, "step": 1481 }, { "entropy": 2.4854736328125, "epoch": 0.02551586994137548, "grad_norm": 0.56282639503479, "learning_rate": 9.998248112630328e-06, "loss": 2.414, "mean_token_accuracy": 0.47536311158910394, "num_tokens": 214833325.0, "step": 1482 }, { "entropy": 2.44873046875, "epoch": 0.02553308712757074, "grad_norm": 0.5949015617370605, "learning_rate": 9.99824082113089e-06, "loss": 2.4348, "mean_token_accuracy": 0.4832215076312423, "num_tokens": 214974294.0, "step": 1483 }, { "entropy": 2.509765625, "epoch": 0.025550304313766002, "grad_norm": 0.6240081191062927, "learning_rate": 9.998233514491706e-06, "loss": 2.4456, "mean_token_accuracy": 0.4775039851665497, "num_tokens": 215107075.0, "step": 1484 }, { "entropy": 2.48486328125, "epoch": 0.025567521499961263, "grad_norm": 0.5603934526443481, "learning_rate": 9.9982261927128e-06, "loss": 2.4404, "mean_token_accuracy": 0.4763817982748151, "num_tokens": 215257991.0, "step": 1485 }, { "entropy": 2.472412109375, "epoch": 0.02558473868615652, "grad_norm": 0.5318784117698669, "learning_rate": 9.998218855794193e-06, "loss": 2.4512, "mean_token_accuracy": 0.4735376350581646, "num_tokens": 215424684.0, "step": 1486 }, { "entropy": 2.4873046875, "epoch": 0.02560195587235178, "grad_norm": 0.5659030079841614, "learning_rate": 9.998211503735908e-06, "loss": 2.4298, "mean_token_accuracy": 0.47843836853280663, "num_tokens": 215573454.0, "step": 1487 }, { "entropy": 2.3856201171875, "epoch": 0.02561917305854704, "grad_norm": 0.6163880228996277, "learning_rate": 9.998204136537965e-06, "loss": 2.3667, "mean_token_accuracy": 0.49307617312297225, "num_tokens": 215709724.0, "step": 1488 }, { "entropy": 2.4710693359375, "epoch": 0.0256363902447423, "grad_norm": 0.5732681751251221, "learning_rate": 9.99819675420039e-06, "loss": 2.4201, "mean_token_accuracy": 0.4794590980745852, "num_tokens": 215844176.0, "step": 1489 }, { "entropy": 2.4678955078125, "epoch": 0.025653607430937562, "grad_norm": 0.8923822045326233, "learning_rate": 9.998189356723203e-06, "loss": 2.4845, "mean_token_accuracy": 0.47123232623562217, "num_tokens": 215988104.0, "step": 1490 }, { "entropy": 2.5465087890625, "epoch": 0.025670824617132822, "grad_norm": 0.5661553740501404, "learning_rate": 9.998181944106428e-06, "loss": 2.5037, "mean_token_accuracy": 0.4655986800789833, "num_tokens": 216131658.0, "step": 1491 }, { "entropy": 2.486572265625, "epoch": 0.025688041803328083, "grad_norm": 0.5814360976219177, "learning_rate": 9.998174516350086e-06, "loss": 2.4732, "mean_token_accuracy": 0.4705640790052712, "num_tokens": 216274055.0, "step": 1492 }, { "entropy": 2.3916015625, "epoch": 0.025705258989523343, "grad_norm": 0.5932287573814392, "learning_rate": 9.998167073454197e-06, "loss": 2.3721, "mean_token_accuracy": 0.49608813878148794, "num_tokens": 216414995.0, "step": 1493 }, { "entropy": 2.4781494140625, "epoch": 0.025722476175718604, "grad_norm": 0.5934662222862244, "learning_rate": 9.99815961541879e-06, "loss": 2.4183, "mean_token_accuracy": 0.4743193374015391, "num_tokens": 216559579.0, "step": 1494 }, { "entropy": 2.4559326171875, "epoch": 0.02573969336191386, "grad_norm": 0.5569154620170593, "learning_rate": 9.998152142243882e-06, "loss": 2.3922, "mean_token_accuracy": 0.48210062831640244, "num_tokens": 216706806.0, "step": 1495 }, { "entropy": 2.4129638671875, "epoch": 0.02575691054810912, "grad_norm": 0.5600568056106567, "learning_rate": 9.998144653929499e-06, "loss": 2.371, "mean_token_accuracy": 0.4889267375692725, "num_tokens": 216861317.0, "step": 1496 }, { "entropy": 2.4239501953125, "epoch": 0.025774127734304382, "grad_norm": 0.5837308168411255, "learning_rate": 9.99813715047566e-06, "loss": 2.3703, "mean_token_accuracy": 0.48130458453670144, "num_tokens": 217013439.0, "step": 1497 }, { "entropy": 2.4178466796875, "epoch": 0.025791344920499643, "grad_norm": 0.5723254084587097, "learning_rate": 9.998129631882392e-06, "loss": 2.3257, "mean_token_accuracy": 0.4927904959768057, "num_tokens": 217161046.0, "step": 1498 }, { "entropy": 2.4444580078125, "epoch": 0.025808562106694903, "grad_norm": 0.6308748126029968, "learning_rate": 9.998122098149714e-06, "loss": 2.3964, "mean_token_accuracy": 0.4866189956665039, "num_tokens": 217301151.0, "step": 1499 }, { "entropy": 2.422607421875, "epoch": 0.025825779292890164, "grad_norm": 0.5735398530960083, "learning_rate": 9.998114549277653e-06, "loss": 2.4083, "mean_token_accuracy": 0.4805009118281305, "num_tokens": 217444559.0, "step": 1500 }, { "entropy": 2.4912109375, "epoch": 0.025842996479085424, "grad_norm": 0.5808705687522888, "learning_rate": 9.998106985266229e-06, "loss": 2.4692, "mean_token_accuracy": 0.47182623157277703, "num_tokens": 217596025.0, "step": 1501 }, { "entropy": 2.4501953125, "epoch": 0.025860213665280685, "grad_norm": 0.5593245625495911, "learning_rate": 9.998099406115465e-06, "loss": 2.4285, "mean_token_accuracy": 0.4782219841144979, "num_tokens": 217747354.0, "step": 1502 }, { "entropy": 2.429931640625, "epoch": 0.02587743085147594, "grad_norm": 0.5433899760246277, "learning_rate": 9.998091811825383e-06, "loss": 2.4056, "mean_token_accuracy": 0.4807021920569241, "num_tokens": 217895004.0, "step": 1503 }, { "entropy": 2.4007568359375, "epoch": 0.025894648037671202, "grad_norm": 0.5361343026161194, "learning_rate": 9.998084202396008e-06, "loss": 2.3451, "mean_token_accuracy": 0.4870217302814126, "num_tokens": 218041797.0, "step": 1504 }, { "entropy": 2.5289306640625, "epoch": 0.025911865223866463, "grad_norm": 0.5812287330627441, "learning_rate": 9.998076577827363e-06, "loss": 2.466, "mean_token_accuracy": 0.4721836093813181, "num_tokens": 218184823.0, "step": 1505 }, { "entropy": 2.4560546875, "epoch": 0.025929082410061723, "grad_norm": 0.667839527130127, "learning_rate": 9.998068938119471e-06, "loss": 2.4359, "mean_token_accuracy": 0.4753602985292673, "num_tokens": 218322885.0, "step": 1506 }, { "entropy": 2.56201171875, "epoch": 0.025946299596256984, "grad_norm": 0.5627098083496094, "learning_rate": 9.998061283272353e-06, "loss": 2.4754, "mean_token_accuracy": 0.4772064401768148, "num_tokens": 218485130.0, "step": 1507 }, { "entropy": 2.4844970703125, "epoch": 0.025963516782452244, "grad_norm": 0.5547756552696228, "learning_rate": 9.998053613286036e-06, "loss": 2.4126, "mean_token_accuracy": 0.4796820618212223, "num_tokens": 218631229.0, "step": 1508 }, { "entropy": 2.4619140625, "epoch": 0.025980733968647505, "grad_norm": 0.5786131024360657, "learning_rate": 9.99804592816054e-06, "loss": 2.435, "mean_token_accuracy": 0.48094206769019365, "num_tokens": 218784443.0, "step": 1509 }, { "entropy": 2.4591064453125, "epoch": 0.025997951154842765, "grad_norm": 0.5810285806655884, "learning_rate": 9.99803822789589e-06, "loss": 2.419, "mean_token_accuracy": 0.47504253312945366, "num_tokens": 218923714.0, "step": 1510 }, { "entropy": 2.465087890625, "epoch": 0.026015168341038022, "grad_norm": 0.570573091506958, "learning_rate": 9.998030512492108e-06, "loss": 2.4312, "mean_token_accuracy": 0.4744260283187032, "num_tokens": 219076633.0, "step": 1511 }, { "entropy": 2.4339599609375, "epoch": 0.026032385527233283, "grad_norm": 0.5941885709762573, "learning_rate": 9.998022781949217e-06, "loss": 2.4043, "mean_token_accuracy": 0.48197115818038583, "num_tokens": 219219462.0, "step": 1512 }, { "entropy": 2.4404296875, "epoch": 0.026049602713428543, "grad_norm": 0.6012705564498901, "learning_rate": 9.998015036267243e-06, "loss": 2.4462, "mean_token_accuracy": 0.4794664392247796, "num_tokens": 219366857.0, "step": 1513 }, { "entropy": 2.44775390625, "epoch": 0.026066819899623804, "grad_norm": 0.6487138271331787, "learning_rate": 9.998007275446206e-06, "loss": 2.4562, "mean_token_accuracy": 0.48240925278514624, "num_tokens": 219501713.0, "step": 1514 }, { "entropy": 2.4669189453125, "epoch": 0.026084037085819065, "grad_norm": 0.5475379228591919, "learning_rate": 9.997999499486134e-06, "loss": 2.4614, "mean_token_accuracy": 0.4733142964541912, "num_tokens": 219651285.0, "step": 1515 }, { "entropy": 2.4892578125, "epoch": 0.026101254272014325, "grad_norm": 0.5989498496055603, "learning_rate": 9.997991708387047e-06, "loss": 2.4537, "mean_token_accuracy": 0.4730882477015257, "num_tokens": 219781732.0, "step": 1516 }, { "entropy": 2.48681640625, "epoch": 0.026118471458209586, "grad_norm": 0.6295292973518372, "learning_rate": 9.99798390214897e-06, "loss": 2.5009, "mean_token_accuracy": 0.4708400582894683, "num_tokens": 219923239.0, "step": 1517 }, { "entropy": 2.525390625, "epoch": 0.026135688644404846, "grad_norm": 0.5331100225448608, "learning_rate": 9.997976080771924e-06, "loss": 2.5217, "mean_token_accuracy": 0.4696351978927851, "num_tokens": 220085911.0, "step": 1518 }, { "entropy": 2.3858642578125, "epoch": 0.026152905830600107, "grad_norm": 0.5755723118782043, "learning_rate": 9.997968244255937e-06, "loss": 2.3168, "mean_token_accuracy": 0.4918542858213186, "num_tokens": 220225909.0, "step": 1519 }, { "entropy": 2.41943359375, "epoch": 0.026170123016795364, "grad_norm": 0.583797812461853, "learning_rate": 9.99796039260103e-06, "loss": 2.4, "mean_token_accuracy": 0.4865543758496642, "num_tokens": 220360506.0, "step": 1520 }, { "entropy": 2.4930419921875, "epoch": 0.026187340202990624, "grad_norm": 0.5480349659919739, "learning_rate": 9.997952525807229e-06, "loss": 2.4718, "mean_token_accuracy": 0.4783789166249335, "num_tokens": 220518785.0, "step": 1521 }, { "entropy": 2.446533203125, "epoch": 0.026204557389185885, "grad_norm": 0.565557062625885, "learning_rate": 9.997944643874553e-06, "loss": 2.4001, "mean_token_accuracy": 0.48282787948846817, "num_tokens": 220669933.0, "step": 1522 }, { "entropy": 2.398681640625, "epoch": 0.026221774575381145, "grad_norm": 0.5465447902679443, "learning_rate": 9.997936746803032e-06, "loss": 2.3459, "mean_token_accuracy": 0.4924741433933377, "num_tokens": 220817234.0, "step": 1523 }, { "entropy": 2.4630126953125, "epoch": 0.026238991761576406, "grad_norm": 0.6321006417274475, "learning_rate": 9.997928834592686e-06, "loss": 2.4352, "mean_token_accuracy": 0.47752982610836625, "num_tokens": 220954570.0, "step": 1524 }, { "entropy": 2.4580078125, "epoch": 0.026256208947771666, "grad_norm": 0.56505286693573, "learning_rate": 9.99792090724354e-06, "loss": 2.4511, "mean_token_accuracy": 0.48103819927200675, "num_tokens": 221097668.0, "step": 1525 }, { "entropy": 2.50634765625, "epoch": 0.026273426133966927, "grad_norm": 0.5791621208190918, "learning_rate": 9.997912964755618e-06, "loss": 2.4866, "mean_token_accuracy": 0.46793483989313245, "num_tokens": 221236494.0, "step": 1526 }, { "entropy": 2.4517822265625, "epoch": 0.026290643320162187, "grad_norm": 0.6186292171478271, "learning_rate": 9.997905007128946e-06, "loss": 2.4148, "mean_token_accuracy": 0.47660065814852715, "num_tokens": 221371798.0, "step": 1527 }, { "entropy": 2.457763671875, "epoch": 0.026307860506357444, "grad_norm": 0.5822423100471497, "learning_rate": 9.997897034363544e-06, "loss": 2.4246, "mean_token_accuracy": 0.4802300548180938, "num_tokens": 221514663.0, "step": 1528 }, { "entropy": 2.4844970703125, "epoch": 0.026325077692552705, "grad_norm": 0.5579072833061218, "learning_rate": 9.997889046459438e-06, "loss": 2.4567, "mean_token_accuracy": 0.47267806623131037, "num_tokens": 221654782.0, "step": 1529 }, { "entropy": 2.482666015625, "epoch": 0.026342294878747965, "grad_norm": 0.5542824864387512, "learning_rate": 9.997881043416653e-06, "loss": 2.4276, "mean_token_accuracy": 0.47997074108570814, "num_tokens": 221806130.0, "step": 1530 }, { "entropy": 2.4971923828125, "epoch": 0.026359512064943226, "grad_norm": 0.5979059934616089, "learning_rate": 9.997873025235215e-06, "loss": 2.4411, "mean_token_accuracy": 0.4823180711828172, "num_tokens": 221951263.0, "step": 1531 }, { "entropy": 2.483154296875, "epoch": 0.026376729251138487, "grad_norm": 1.1579712629318237, "learning_rate": 9.997864991915142e-06, "loss": 2.4429, "mean_token_accuracy": 0.4803687371313572, "num_tokens": 222106221.0, "step": 1532 }, { "entropy": 2.437255859375, "epoch": 0.026393946437333747, "grad_norm": 0.585996687412262, "learning_rate": 9.997856943456465e-06, "loss": 2.4413, "mean_token_accuracy": 0.4799509192816913, "num_tokens": 222278140.0, "step": 1533 }, { "entropy": 2.478759765625, "epoch": 0.026411163623529008, "grad_norm": 0.5700007081031799, "learning_rate": 9.997848879859205e-06, "loss": 2.3974, "mean_token_accuracy": 0.48377181869000196, "num_tokens": 222420219.0, "step": 1534 }, { "entropy": 2.4041748046875, "epoch": 0.026428380809724268, "grad_norm": 0.5667886137962341, "learning_rate": 9.99784080112339e-06, "loss": 2.387, "mean_token_accuracy": 0.48956062365323305, "num_tokens": 222562430.0, "step": 1535 }, { "entropy": 2.5068359375, "epoch": 0.026445597995919525, "grad_norm": 0.5823205709457397, "learning_rate": 9.997832707249038e-06, "loss": 2.4595, "mean_token_accuracy": 0.47320169396698475, "num_tokens": 222700414.0, "step": 1536 }, { "entropy": 2.4405517578125, "epoch": 0.026462815182114786, "grad_norm": 0.546593427658081, "learning_rate": 9.997824598236179e-06, "loss": 2.3965, "mean_token_accuracy": 0.48351220693439245, "num_tokens": 222856133.0, "step": 1537 }, { "entropy": 2.4632568359375, "epoch": 0.026480032368310046, "grad_norm": 0.5839020013809204, "learning_rate": 9.997816474084833e-06, "loss": 2.4677, "mean_token_accuracy": 0.4769261754117906, "num_tokens": 223003734.0, "step": 1538 }, { "entropy": 2.3848876953125, "epoch": 0.026497249554505307, "grad_norm": 0.5713815093040466, "learning_rate": 9.997808334795032e-06, "loss": 2.3267, "mean_token_accuracy": 0.4928825213573873, "num_tokens": 223150812.0, "step": 1539 }, { "entropy": 2.4129638671875, "epoch": 0.026514466740700567, "grad_norm": 0.5621561408042908, "learning_rate": 9.997800180366792e-06, "loss": 2.369, "mean_token_accuracy": 0.486449159681797, "num_tokens": 223302261.0, "step": 1540 }, { "entropy": 2.4468994140625, "epoch": 0.026531683926895828, "grad_norm": 0.5728362202644348, "learning_rate": 9.997792010800144e-06, "loss": 2.3988, "mean_token_accuracy": 0.4837730205617845, "num_tokens": 223448432.0, "step": 1541 }, { "entropy": 2.4149169921875, "epoch": 0.02654890111309109, "grad_norm": 0.6072005033493042, "learning_rate": 9.99778382609511e-06, "loss": 2.3772, "mean_token_accuracy": 0.48883628472685814, "num_tokens": 223586866.0, "step": 1542 }, { "entropy": 2.4757080078125, "epoch": 0.02656611829928635, "grad_norm": 0.5731198191642761, "learning_rate": 9.997775626251715e-06, "loss": 2.4887, "mean_token_accuracy": 0.4729891321621835, "num_tokens": 223739874.0, "step": 1543 }, { "entropy": 2.4337158203125, "epoch": 0.02658333548548161, "grad_norm": 0.5194607377052307, "learning_rate": 9.997767411269984e-06, "loss": 2.4039, "mean_token_accuracy": 0.47655164077878, "num_tokens": 223889106.0, "step": 1544 }, { "entropy": 2.372314453125, "epoch": 0.026600552671676866, "grad_norm": 0.5741761922836304, "learning_rate": 9.997759181149941e-06, "loss": 2.3325, "mean_token_accuracy": 0.49444987904280424, "num_tokens": 224029055.0, "step": 1545 }, { "entropy": 2.479736328125, "epoch": 0.026617769857872127, "grad_norm": 0.5601452589035034, "learning_rate": 9.997750935891615e-06, "loss": 2.4167, "mean_token_accuracy": 0.47284849267452955, "num_tokens": 224166339.0, "step": 1546 }, { "entropy": 2.44091796875, "epoch": 0.026634987044067387, "grad_norm": 0.6984487771987915, "learning_rate": 9.997742675495025e-06, "loss": 2.3943, "mean_token_accuracy": 0.4849527506157756, "num_tokens": 224314144.0, "step": 1547 }, { "entropy": 2.42138671875, "epoch": 0.026652204230262648, "grad_norm": 0.6234527230262756, "learning_rate": 9.9977343999602e-06, "loss": 2.3438, "mean_token_accuracy": 0.49563004495576024, "num_tokens": 224465327.0, "step": 1548 }, { "entropy": 2.462158203125, "epoch": 0.02666942141645791, "grad_norm": 0.5658462047576904, "learning_rate": 9.997726109287164e-06, "loss": 2.4079, "mean_token_accuracy": 0.48139845160767436, "num_tokens": 224608026.0, "step": 1549 }, { "entropy": 2.40087890625, "epoch": 0.02668663860265317, "grad_norm": 0.57075035572052, "learning_rate": 9.997717803475942e-06, "loss": 2.3665, "mean_token_accuracy": 0.4883747296407819, "num_tokens": 224758446.0, "step": 1550 }, { "entropy": 2.45947265625, "epoch": 0.02670385578884843, "grad_norm": 0.5546119809150696, "learning_rate": 9.99770948252656e-06, "loss": 2.4301, "mean_token_accuracy": 0.47884270129725337, "num_tokens": 224900091.0, "step": 1551 }, { "entropy": 2.4818115234375, "epoch": 0.02672107297504369, "grad_norm": 0.584319531917572, "learning_rate": 9.99770114643904e-06, "loss": 2.444, "mean_token_accuracy": 0.4730917243286967, "num_tokens": 225040740.0, "step": 1552 }, { "entropy": 2.4044189453125, "epoch": 0.026738290161238947, "grad_norm": 0.6092365980148315, "learning_rate": 9.997692795213412e-06, "loss": 2.3605, "mean_token_accuracy": 0.4851328986696899, "num_tokens": 225186755.0, "step": 1553 }, { "entropy": 2.4434814453125, "epoch": 0.026755507347434208, "grad_norm": 0.6220741868019104, "learning_rate": 9.997684428849698e-06, "loss": 2.4281, "mean_token_accuracy": 0.4839205415919423, "num_tokens": 225334853.0, "step": 1554 }, { "entropy": 2.4833984375, "epoch": 0.026772724533629468, "grad_norm": 0.6230896711349487, "learning_rate": 9.997676047347926e-06, "loss": 2.4525, "mean_token_accuracy": 0.4806727943941951, "num_tokens": 225463554.0, "step": 1555 }, { "entropy": 2.4664306640625, "epoch": 0.02678994171982473, "grad_norm": 0.5725734233856201, "learning_rate": 9.997667650708117e-06, "loss": 2.4501, "mean_token_accuracy": 0.4741324451752007, "num_tokens": 225606325.0, "step": 1556 }, { "entropy": 2.511962890625, "epoch": 0.02680715890601999, "grad_norm": 0.5844495296478271, "learning_rate": 9.9976592389303e-06, "loss": 2.4653, "mean_token_accuracy": 0.47366986563429236, "num_tokens": 225755308.0, "step": 1557 }, { "entropy": 2.454345703125, "epoch": 0.02682437609221525, "grad_norm": 0.5523918867111206, "learning_rate": 9.9976508120145e-06, "loss": 2.4185, "mean_token_accuracy": 0.4829486352391541, "num_tokens": 225916539.0, "step": 1558 }, { "entropy": 2.47607421875, "epoch": 0.02684159327841051, "grad_norm": 0.5556900501251221, "learning_rate": 9.997642369960743e-06, "loss": 2.4678, "mean_token_accuracy": 0.47704968182370067, "num_tokens": 226065388.0, "step": 1559 }, { "entropy": 2.4385986328125, "epoch": 0.02685881046460577, "grad_norm": 0.5409045219421387, "learning_rate": 9.997633912769054e-06, "loss": 2.3615, "mean_token_accuracy": 0.4902169401757419, "num_tokens": 226221599.0, "step": 1560 }, { "entropy": 2.439208984375, "epoch": 0.026876027650801028, "grad_norm": 0.612868070602417, "learning_rate": 9.997625440439457e-06, "loss": 2.4594, "mean_token_accuracy": 0.47914552967995405, "num_tokens": 226362147.0, "step": 1561 }, { "entropy": 2.46142578125, "epoch": 0.02689324483699629, "grad_norm": 0.5608747601509094, "learning_rate": 9.997616952971979e-06, "loss": 2.4424, "mean_token_accuracy": 0.4790024384856224, "num_tokens": 226510293.0, "step": 1562 }, { "entropy": 2.47900390625, "epoch": 0.02691046202319155, "grad_norm": 0.5777642726898193, "learning_rate": 9.997608450366646e-06, "loss": 2.4461, "mean_token_accuracy": 0.4724403969012201, "num_tokens": 226654656.0, "step": 1563 }, { "entropy": 2.42529296875, "epoch": 0.02692767920938681, "grad_norm": 0.5585452914237976, "learning_rate": 9.997599932623485e-06, "loss": 2.3716, "mean_token_accuracy": 0.4799959072843194, "num_tokens": 226810305.0, "step": 1564 }, { "entropy": 2.391357421875, "epoch": 0.02694489639558207, "grad_norm": 0.6446172595024109, "learning_rate": 9.997591399742518e-06, "loss": 2.3486, "mean_token_accuracy": 0.4916059426032007, "num_tokens": 226969167.0, "step": 1565 }, { "entropy": 2.4498291015625, "epoch": 0.02696211358177733, "grad_norm": 0.5548785924911499, "learning_rate": 9.997582851723773e-06, "loss": 2.4116, "mean_token_accuracy": 0.4788562678731978, "num_tokens": 227115813.0, "step": 1566 }, { "entropy": 2.5760498046875, "epoch": 0.02697933076797259, "grad_norm": 0.5751841068267822, "learning_rate": 9.997574288567277e-06, "loss": 2.5083, "mean_token_accuracy": 0.4620812046341598, "num_tokens": 227265495.0, "step": 1567 }, { "entropy": 2.470947265625, "epoch": 0.02699654795416785, "grad_norm": 1.4221575260162354, "learning_rate": 9.997565710273056e-06, "loss": 2.4116, "mean_token_accuracy": 0.4814313040114939, "num_tokens": 227406243.0, "step": 1568 }, { "entropy": 2.461669921875, "epoch": 0.027013765140363112, "grad_norm": 0.5775115489959717, "learning_rate": 9.997557116841134e-06, "loss": 2.4235, "mean_token_accuracy": 0.4807606884278357, "num_tokens": 227566411.0, "step": 1569 }, { "entropy": 2.447509765625, "epoch": 0.02703098232655837, "grad_norm": 0.5913234353065491, "learning_rate": 9.997548508271537e-06, "loss": 2.3974, "mean_token_accuracy": 0.48274747421965003, "num_tokens": 227695596.0, "step": 1570 }, { "entropy": 2.4384765625, "epoch": 0.02704819951275363, "grad_norm": 0.5852504968643188, "learning_rate": 9.997539884564293e-06, "loss": 2.3895, "mean_token_accuracy": 0.4838076075538993, "num_tokens": 227834644.0, "step": 1571 }, { "entropy": 2.4619140625, "epoch": 0.02706541669894889, "grad_norm": 0.5741419196128845, "learning_rate": 9.997531245719427e-06, "loss": 2.4267, "mean_token_accuracy": 0.4829978961497545, "num_tokens": 227981607.0, "step": 1572 }, { "entropy": 2.4593505859375, "epoch": 0.02708263388514415, "grad_norm": 0.5879804491996765, "learning_rate": 9.997522591736965e-06, "loss": 2.4387, "mean_token_accuracy": 0.47719143191352487, "num_tokens": 228129360.0, "step": 1573 }, { "entropy": 2.425048828125, "epoch": 0.02709985107133941, "grad_norm": 0.6101236343383789, "learning_rate": 9.997513922616935e-06, "loss": 2.3956, "mean_token_accuracy": 0.4810835770331323, "num_tokens": 228271608.0, "step": 1574 }, { "entropy": 2.447021484375, "epoch": 0.027117068257534672, "grad_norm": 0.5525056719779968, "learning_rate": 9.997505238359362e-06, "loss": 2.4221, "mean_token_accuracy": 0.47596909245476127, "num_tokens": 228425263.0, "step": 1575 }, { "entropy": 2.4608154296875, "epoch": 0.027134285443729932, "grad_norm": 0.5748025178909302, "learning_rate": 9.997496538964269e-06, "loss": 2.4484, "mean_token_accuracy": 0.4767248681746423, "num_tokens": 228577817.0, "step": 1576 }, { "entropy": 2.52978515625, "epoch": 0.027151502629925193, "grad_norm": 0.556822657585144, "learning_rate": 9.997487824431687e-06, "loss": 2.4771, "mean_token_accuracy": 0.4696510974317789, "num_tokens": 228731761.0, "step": 1577 }, { "entropy": 2.517578125, "epoch": 0.02716871981612045, "grad_norm": 0.6445668339729309, "learning_rate": 9.997479094761641e-06, "loss": 2.4696, "mean_token_accuracy": 0.47059842152521014, "num_tokens": 228882447.0, "step": 1578 }, { "entropy": 2.465576171875, "epoch": 0.02718593700231571, "grad_norm": 0.5520902276039124, "learning_rate": 9.997470349954158e-06, "loss": 2.4349, "mean_token_accuracy": 0.4763860795646906, "num_tokens": 229032651.0, "step": 1579 }, { "entropy": 2.477294921875, "epoch": 0.02720315418851097, "grad_norm": 0.6009482145309448, "learning_rate": 9.997461590009263e-06, "loss": 2.4098, "mean_token_accuracy": 0.47861299896612763, "num_tokens": 229180126.0, "step": 1580 }, { "entropy": 2.380615234375, "epoch": 0.02722037137470623, "grad_norm": 0.5625278353691101, "learning_rate": 9.997452814926984e-06, "loss": 2.3496, "mean_token_accuracy": 0.4916512775234878, "num_tokens": 229322174.0, "step": 1581 }, { "entropy": 2.5185546875, "epoch": 0.027237588560901492, "grad_norm": 0.6114506721496582, "learning_rate": 9.997444024707345e-06, "loss": 2.4779, "mean_token_accuracy": 0.4617612757720053, "num_tokens": 229454480.0, "step": 1582 }, { "entropy": 2.508544921875, "epoch": 0.027254805747096753, "grad_norm": 0.6254023909568787, "learning_rate": 9.997435219350377e-06, "loss": 2.4818, "mean_token_accuracy": 0.47184726875275373, "num_tokens": 229588770.0, "step": 1583 }, { "entropy": 2.37744140625, "epoch": 0.027272022933292013, "grad_norm": 0.5333112478256226, "learning_rate": 9.997426398856103e-06, "loss": 2.3411, "mean_token_accuracy": 0.4888959531672299, "num_tokens": 229754120.0, "step": 1584 }, { "entropy": 2.42529296875, "epoch": 0.027289240119487274, "grad_norm": 0.5652498602867126, "learning_rate": 9.997417563224551e-06, "loss": 2.3861, "mean_token_accuracy": 0.4848038759082556, "num_tokens": 229894455.0, "step": 1585 }, { "entropy": 2.4168701171875, "epoch": 0.02730645730568253, "grad_norm": 0.5742363929748535, "learning_rate": 9.997408712455748e-06, "loss": 2.3838, "mean_token_accuracy": 0.4861548813059926, "num_tokens": 230044890.0, "step": 1586 }, { "entropy": 2.4842529296875, "epoch": 0.02732367449187779, "grad_norm": 0.5573915839195251, "learning_rate": 9.99739984654972e-06, "loss": 2.4365, "mean_token_accuracy": 0.4722730196081102, "num_tokens": 230180808.0, "step": 1587 }, { "entropy": 2.497802734375, "epoch": 0.02734089167807305, "grad_norm": 0.6639450192451477, "learning_rate": 9.997390965506495e-06, "loss": 2.4367, "mean_token_accuracy": 0.47327509289607406, "num_tokens": 230325006.0, "step": 1588 }, { "entropy": 2.514404296875, "epoch": 0.027358108864268312, "grad_norm": 0.5561137795448303, "learning_rate": 9.997382069326099e-06, "loss": 2.4852, "mean_token_accuracy": 0.4662863416597247, "num_tokens": 230483748.0, "step": 1589 }, { "entropy": 2.4310302734375, "epoch": 0.027375326050463573, "grad_norm": 0.5391974449157715, "learning_rate": 9.997373158008558e-06, "loss": 2.4237, "mean_token_accuracy": 0.4804062177427113, "num_tokens": 230645992.0, "step": 1590 }, { "entropy": 2.4122314453125, "epoch": 0.027392543236658833, "grad_norm": 0.577002763748169, "learning_rate": 9.997364231553902e-06, "loss": 2.3422, "mean_token_accuracy": 0.4947265670634806, "num_tokens": 230788584.0, "step": 1591 }, { "entropy": 2.4708251953125, "epoch": 0.027409760422854094, "grad_norm": 0.5783287286758423, "learning_rate": 9.997355289962157e-06, "loss": 2.4332, "mean_token_accuracy": 0.47268676944077015, "num_tokens": 230920875.0, "step": 1592 }, { "entropy": 2.495361328125, "epoch": 0.027426977609049354, "grad_norm": 0.5736862421035767, "learning_rate": 9.997346333233347e-06, "loss": 2.4839, "mean_token_accuracy": 0.47372210351750255, "num_tokens": 231066994.0, "step": 1593 }, { "entropy": 2.414306640625, "epoch": 0.027444194795244615, "grad_norm": 0.6175742149353027, "learning_rate": 9.9973373613675e-06, "loss": 2.3682, "mean_token_accuracy": 0.481872939504683, "num_tokens": 231204186.0, "step": 1594 }, { "entropy": 2.45068359375, "epoch": 0.027461411981439872, "grad_norm": 0.6151363253593445, "learning_rate": 9.997328374364647e-06, "loss": 2.4043, "mean_token_accuracy": 0.4820513529703021, "num_tokens": 231352849.0, "step": 1595 }, { "entropy": 2.406005859375, "epoch": 0.027478629167635132, "grad_norm": 0.591463565826416, "learning_rate": 9.997319372224814e-06, "loss": 2.3365, "mean_token_accuracy": 0.4920554677955806, "num_tokens": 231492437.0, "step": 1596 }, { "entropy": 2.4417724609375, "epoch": 0.027495846353830393, "grad_norm": 0.531035304069519, "learning_rate": 9.997310354948026e-06, "loss": 2.4092, "mean_token_accuracy": 0.4780901288613677, "num_tokens": 231644457.0, "step": 1597 }, { "entropy": 2.4251708984375, "epoch": 0.027513063540025653, "grad_norm": 0.5585000514984131, "learning_rate": 9.99730132253431e-06, "loss": 2.3906, "mean_token_accuracy": 0.4829976833425462, "num_tokens": 231800143.0, "step": 1598 }, { "entropy": 2.36767578125, "epoch": 0.027530280726220914, "grad_norm": 0.5619704127311707, "learning_rate": 9.997292274983696e-06, "loss": 2.3234, "mean_token_accuracy": 0.49107312550768256, "num_tokens": 231959373.0, "step": 1599 }, { "entropy": 2.4547119140625, "epoch": 0.027547497912416175, "grad_norm": 0.6032162308692932, "learning_rate": 9.997283212296211e-06, "loss": 2.4341, "mean_token_accuracy": 0.4722294365055859, "num_tokens": 232110415.0, "step": 1600 }, { "entropy": 2.40576171875, "epoch": 0.027564715098611435, "grad_norm": 0.6166012287139893, "learning_rate": 9.99727413447188e-06, "loss": 2.3535, "mean_token_accuracy": 0.489782630931586, "num_tokens": 232247894.0, "step": 1601 }, { "entropy": 2.4820556640625, "epoch": 0.027581932284806696, "grad_norm": 0.5851189494132996, "learning_rate": 9.997265041510733e-06, "loss": 2.4721, "mean_token_accuracy": 0.4751852685585618, "num_tokens": 232388779.0, "step": 1602 }, { "entropy": 2.4580078125, "epoch": 0.027599149471001953, "grad_norm": 0.5721387267112732, "learning_rate": 9.997255933412797e-06, "loss": 2.4279, "mean_token_accuracy": 0.48017449537292123, "num_tokens": 232534833.0, "step": 1603 }, { "entropy": 2.4854736328125, "epoch": 0.027616366657197213, "grad_norm": 0.5649420022964478, "learning_rate": 9.997246810178099e-06, "loss": 2.4592, "mean_token_accuracy": 0.4723159852437675, "num_tokens": 232685772.0, "step": 1604 }, { "entropy": 2.4073486328125, "epoch": 0.027633583843392474, "grad_norm": 1.3038853406906128, "learning_rate": 9.997237671806665e-06, "loss": 2.3341, "mean_token_accuracy": 0.4938134755939245, "num_tokens": 232844671.0, "step": 1605 }, { "entropy": 2.388671875, "epoch": 0.027650801029587734, "grad_norm": 0.5900284647941589, "learning_rate": 9.997228518298527e-06, "loss": 2.338, "mean_token_accuracy": 0.4900298644788563, "num_tokens": 232985544.0, "step": 1606 }, { "entropy": 2.411376953125, "epoch": 0.027668018215782995, "grad_norm": 0.5831287503242493, "learning_rate": 9.997219349653708e-06, "loss": 2.3888, "mean_token_accuracy": 0.48680589720606804, "num_tokens": 233120888.0, "step": 1607 }, { "entropy": 2.5101318359375, "epoch": 0.027685235401978255, "grad_norm": 0.5937960743904114, "learning_rate": 9.99721016587224e-06, "loss": 2.4551, "mean_token_accuracy": 0.47399421967566013, "num_tokens": 233260349.0, "step": 1608 }, { "entropy": 2.4505615234375, "epoch": 0.027702452588173516, "grad_norm": 0.5459933876991272, "learning_rate": 9.997200966954149e-06, "loss": 2.4176, "mean_token_accuracy": 0.47501937579363585, "num_tokens": 233405845.0, "step": 1609 }, { "entropy": 2.4483642578125, "epoch": 0.027719669774368776, "grad_norm": 0.5332619547843933, "learning_rate": 9.997191752899462e-06, "loss": 2.3866, "mean_token_accuracy": 0.47997900983318686, "num_tokens": 233570774.0, "step": 1610 }, { "entropy": 2.4091796875, "epoch": 0.027736886960564033, "grad_norm": 0.57257479429245, "learning_rate": 9.997182523708208e-06, "loss": 2.3707, "mean_token_accuracy": 0.4883900200948119, "num_tokens": 233720314.0, "step": 1611 }, { "entropy": 2.4427490234375, "epoch": 0.027754104146759294, "grad_norm": 0.5707916617393494, "learning_rate": 9.997173279380415e-06, "loss": 2.4081, "mean_token_accuracy": 0.48100787540897727, "num_tokens": 233865434.0, "step": 1612 }, { "entropy": 2.4808349609375, "epoch": 0.027771321332954554, "grad_norm": 0.6137810945510864, "learning_rate": 9.997164019916109e-06, "loss": 2.4036, "mean_token_accuracy": 0.4781500454992056, "num_tokens": 234000908.0, "step": 1613 }, { "entropy": 2.44140625, "epoch": 0.027788538519149815, "grad_norm": 0.569161593914032, "learning_rate": 9.997154745315321e-06, "loss": 2.4136, "mean_token_accuracy": 0.48161053331568837, "num_tokens": 234154080.0, "step": 1614 }, { "entropy": 2.43212890625, "epoch": 0.027805755705345075, "grad_norm": 0.5593316555023193, "learning_rate": 9.997145455578076e-06, "loss": 2.3662, "mean_token_accuracy": 0.4847466191276908, "num_tokens": 234304655.0, "step": 1615 }, { "entropy": 2.4293212890625, "epoch": 0.027822972891540336, "grad_norm": 0.6409513354301453, "learning_rate": 9.997136150704407e-06, "loss": 2.3821, "mean_token_accuracy": 0.48784798802807927, "num_tokens": 234428782.0, "step": 1616 }, { "entropy": 2.4359130859375, "epoch": 0.027840190077735597, "grad_norm": 0.5953328013420105, "learning_rate": 9.997126830694336e-06, "loss": 2.3689, "mean_token_accuracy": 0.48606695234775543, "num_tokens": 234563589.0, "step": 1617 }, { "entropy": 2.4755859375, "epoch": 0.027857407263930857, "grad_norm": 1.202374815940857, "learning_rate": 9.997117495547897e-06, "loss": 2.4366, "mean_token_accuracy": 0.47866149339824915, "num_tokens": 234713984.0, "step": 1618 }, { "entropy": 2.4501953125, "epoch": 0.027874624450126118, "grad_norm": 0.6060045957565308, "learning_rate": 9.997108145265113e-06, "loss": 2.4131, "mean_token_accuracy": 0.48174894973635674, "num_tokens": 234856641.0, "step": 1619 }, { "entropy": 2.4287109375, "epoch": 0.027891841636321375, "grad_norm": 0.5465239882469177, "learning_rate": 9.997098779846017e-06, "loss": 2.3764, "mean_token_accuracy": 0.48386607645079494, "num_tokens": 235015161.0, "step": 1620 }, { "entropy": 2.4818115234375, "epoch": 0.027909058822516635, "grad_norm": 0.5660187602043152, "learning_rate": 9.997089399290635e-06, "loss": 2.4657, "mean_token_accuracy": 0.47468016063794494, "num_tokens": 235151596.0, "step": 1621 }, { "entropy": 2.471435546875, "epoch": 0.027926276008711896, "grad_norm": 0.572928786277771, "learning_rate": 9.997080003598995e-06, "loss": 2.4231, "mean_token_accuracy": 0.47549346927553415, "num_tokens": 235291268.0, "step": 1622 }, { "entropy": 2.498046875, "epoch": 0.027943493194907156, "grad_norm": 0.566953182220459, "learning_rate": 9.997070592771127e-06, "loss": 2.4507, "mean_token_accuracy": 0.47433658177033067, "num_tokens": 235435246.0, "step": 1623 }, { "entropy": 2.448486328125, "epoch": 0.027960710381102417, "grad_norm": 0.6662017703056335, "learning_rate": 9.997061166807057e-06, "loss": 2.4008, "mean_token_accuracy": 0.48391238413751125, "num_tokens": 235570223.0, "step": 1624 }, { "entropy": 2.416015625, "epoch": 0.027977927567297677, "grad_norm": 0.5774509906768799, "learning_rate": 9.997051725706816e-06, "loss": 2.3538, "mean_token_accuracy": 0.48118720296770334, "num_tokens": 235725981.0, "step": 1625 }, { "entropy": 2.4285888671875, "epoch": 0.027995144753492938, "grad_norm": 0.5478702783584595, "learning_rate": 9.997042269470431e-06, "loss": 2.3242, "mean_token_accuracy": 0.49387524649500847, "num_tokens": 235876462.0, "step": 1626 }, { "entropy": 2.462158203125, "epoch": 0.0280123619396882, "grad_norm": 0.573614776134491, "learning_rate": 9.997032798097935e-06, "loss": 2.425, "mean_token_accuracy": 0.48535694507882, "num_tokens": 236015336.0, "step": 1627 }, { "entropy": 2.50634765625, "epoch": 0.028029579125883455, "grad_norm": 0.6228254437446594, "learning_rate": 9.997023311589349e-06, "loss": 2.4627, "mean_token_accuracy": 0.47959386138245463, "num_tokens": 236155028.0, "step": 1628 }, { "entropy": 2.5274658203125, "epoch": 0.028046796312078716, "grad_norm": 0.6030795574188232, "learning_rate": 9.997013809944708e-06, "loss": 2.4979, "mean_token_accuracy": 0.4653917569667101, "num_tokens": 236309148.0, "step": 1629 }, { "entropy": 2.388916015625, "epoch": 0.028064013498273976, "grad_norm": 0.558562695980072, "learning_rate": 9.997004293164038e-06, "loss": 2.3689, "mean_token_accuracy": 0.48823840310797095, "num_tokens": 236475648.0, "step": 1630 }, { "entropy": 2.43798828125, "epoch": 0.028081230684469237, "grad_norm": 0.5895214080810547, "learning_rate": 9.996994761247368e-06, "loss": 2.4171, "mean_token_accuracy": 0.4803608972579241, "num_tokens": 236615733.0, "step": 1631 }, { "entropy": 2.421875, "epoch": 0.028098447870664497, "grad_norm": 0.5724160075187683, "learning_rate": 9.996985214194727e-06, "loss": 2.3814, "mean_token_accuracy": 0.483953685965389, "num_tokens": 236756289.0, "step": 1632 }, { "entropy": 2.47021484375, "epoch": 0.028115665056859758, "grad_norm": 0.5840807557106018, "learning_rate": 9.996975652006146e-06, "loss": 2.4387, "mean_token_accuracy": 0.4743587113916874, "num_tokens": 236887859.0, "step": 1633 }, { "entropy": 2.46826171875, "epoch": 0.02813288224305502, "grad_norm": 0.5316832065582275, "learning_rate": 9.996966074681651e-06, "loss": 2.4542, "mean_token_accuracy": 0.4724106374196708, "num_tokens": 237042538.0, "step": 1634 }, { "entropy": 2.4212646484375, "epoch": 0.02815009942925028, "grad_norm": 0.5952966809272766, "learning_rate": 9.996956482221273e-06, "loss": 2.4116, "mean_token_accuracy": 0.4807650912553072, "num_tokens": 237190659.0, "step": 1635 }, { "entropy": 2.4124755859375, "epoch": 0.028167316615445536, "grad_norm": 0.5247029066085815, "learning_rate": 9.99694687462504e-06, "loss": 2.359, "mean_token_accuracy": 0.48549360083416104, "num_tokens": 237358115.0, "step": 1636 }, { "entropy": 2.387939453125, "epoch": 0.028184533801640797, "grad_norm": 0.5859695672988892, "learning_rate": 9.996937251892982e-06, "loss": 2.3148, "mean_token_accuracy": 0.49537815153598785, "num_tokens": 237496922.0, "step": 1637 }, { "entropy": 2.4844970703125, "epoch": 0.028201750987836057, "grad_norm": 0.6078894734382629, "learning_rate": 9.996927614025127e-06, "loss": 2.443, "mean_token_accuracy": 0.4838123098015785, "num_tokens": 237623117.0, "step": 1638 }, { "entropy": 2.533203125, "epoch": 0.028218968174031318, "grad_norm": 0.5760785341262817, "learning_rate": 9.996917961021504e-06, "loss": 2.4848, "mean_token_accuracy": 0.4731389367952943, "num_tokens": 237763918.0, "step": 1639 }, { "entropy": 2.46826171875, "epoch": 0.028236185360226578, "grad_norm": 0.539117157459259, "learning_rate": 9.996908292882144e-06, "loss": 2.4377, "mean_token_accuracy": 0.47789577953517437, "num_tokens": 237926377.0, "step": 1640 }, { "entropy": 2.465087890625, "epoch": 0.02825340254642184, "grad_norm": 0.5144519209861755, "learning_rate": 9.996898609607075e-06, "loss": 2.4543, "mean_token_accuracy": 0.474421392660588, "num_tokens": 238089528.0, "step": 1641 }, { "entropy": 2.4031982421875, "epoch": 0.0282706197326171, "grad_norm": 0.5754570960998535, "learning_rate": 9.996888911196326e-06, "loss": 2.3766, "mean_token_accuracy": 0.4901612875983119, "num_tokens": 238231433.0, "step": 1642 }, { "entropy": 2.4429931640625, "epoch": 0.02828783691881236, "grad_norm": 0.6072561740875244, "learning_rate": 9.996879197649927e-06, "loss": 2.394, "mean_token_accuracy": 0.4848616924136877, "num_tokens": 238354749.0, "step": 1643 }, { "entropy": 2.43603515625, "epoch": 0.02830505410500762, "grad_norm": 0.70046466588974, "learning_rate": 9.996869468967909e-06, "loss": 2.3639, "mean_token_accuracy": 0.48499562544748187, "num_tokens": 238506100.0, "step": 1644 }, { "entropy": 2.43603515625, "epoch": 0.028322271291202877, "grad_norm": 0.6182661652565002, "learning_rate": 9.996859725150299e-06, "loss": 2.3903, "mean_token_accuracy": 0.4828766668215394, "num_tokens": 238651208.0, "step": 1645 }, { "entropy": 2.39111328125, "epoch": 0.028339488477398138, "grad_norm": 0.5814700126647949, "learning_rate": 9.996849966197127e-06, "loss": 2.3526, "mean_token_accuracy": 0.48803521366789937, "num_tokens": 238805878.0, "step": 1646 }, { "entropy": 2.37841796875, "epoch": 0.0283567056635934, "grad_norm": 0.5634573698043823, "learning_rate": 9.996840192108421e-06, "loss": 2.3943, "mean_token_accuracy": 0.48795266123488545, "num_tokens": 238958546.0, "step": 1647 }, { "entropy": 2.5108642578125, "epoch": 0.02837392284978866, "grad_norm": 0.6036639213562012, "learning_rate": 9.996830402884216e-06, "loss": 2.4941, "mean_token_accuracy": 0.4679912384599447, "num_tokens": 239087208.0, "step": 1648 }, { "entropy": 2.4544677734375, "epoch": 0.02839114003598392, "grad_norm": 0.5786367654800415, "learning_rate": 9.996820598524536e-06, "loss": 2.4108, "mean_token_accuracy": 0.4794202446937561, "num_tokens": 239230718.0, "step": 1649 }, { "entropy": 2.427490234375, "epoch": 0.02840835722217918, "grad_norm": 0.5587934255599976, "learning_rate": 9.996810779029413e-06, "loss": 2.4071, "mean_token_accuracy": 0.48648894764482975, "num_tokens": 239388616.0, "step": 1650 }, { "entropy": 2.488037109375, "epoch": 0.02842557440837444, "grad_norm": 0.5868260860443115, "learning_rate": 9.996800944398879e-06, "loss": 2.4666, "mean_token_accuracy": 0.47458827355876565, "num_tokens": 239532563.0, "step": 1651 }, { "entropy": 2.3829345703125, "epoch": 0.0284427915945697, "grad_norm": 0.5640127658843994, "learning_rate": 9.996791094632958e-06, "loss": 2.3417, "mean_token_accuracy": 0.4937547417357564, "num_tokens": 239675698.0, "step": 1652 }, { "entropy": 2.542724609375, "epoch": 0.028460008780764958, "grad_norm": 0.5434319972991943, "learning_rate": 9.996781229731685e-06, "loss": 2.4758, "mean_token_accuracy": 0.4686996676027775, "num_tokens": 239821208.0, "step": 1653 }, { "entropy": 2.4918212890625, "epoch": 0.02847722596696022, "grad_norm": 0.5542386174201965, "learning_rate": 9.996771349695089e-06, "loss": 2.4299, "mean_token_accuracy": 0.47332891169935465, "num_tokens": 239965661.0, "step": 1654 }, { "entropy": 2.4259033203125, "epoch": 0.02849444315315548, "grad_norm": 0.5533227324485779, "learning_rate": 9.996761454523198e-06, "loss": 2.4194, "mean_token_accuracy": 0.48256733594462276, "num_tokens": 240122225.0, "step": 1655 }, { "entropy": 2.398193359375, "epoch": 0.02851166033935074, "grad_norm": 0.5755099058151245, "learning_rate": 9.996751544216043e-06, "loss": 2.3681, "mean_token_accuracy": 0.48602569615468383, "num_tokens": 240255685.0, "step": 1656 }, { "entropy": 2.336669921875, "epoch": 0.028528877525546, "grad_norm": 0.573204517364502, "learning_rate": 9.996741618773654e-06, "loss": 2.2821, "mean_token_accuracy": 0.5057984632439911, "num_tokens": 240412754.0, "step": 1657 }, { "entropy": 2.4609375, "epoch": 0.02854609471174126, "grad_norm": 0.6060733199119568, "learning_rate": 9.99673167819606e-06, "loss": 2.4604, "mean_token_accuracy": 0.47678601182997227, "num_tokens": 240540902.0, "step": 1658 }, { "entropy": 2.4752197265625, "epoch": 0.02856331189793652, "grad_norm": 0.5993353724479675, "learning_rate": 9.996721722483296e-06, "loss": 2.4133, "mean_token_accuracy": 0.483869846444577, "num_tokens": 240689582.0, "step": 1659 }, { "entropy": 2.38134765625, "epoch": 0.028580529084131782, "grad_norm": 0.5666155815124512, "learning_rate": 9.996711751635384e-06, "loss": 2.3604, "mean_token_accuracy": 0.491028075106442, "num_tokens": 240844782.0, "step": 1660 }, { "entropy": 2.441650390625, "epoch": 0.02859774627032704, "grad_norm": 0.5586746335029602, "learning_rate": 9.996701765652361e-06, "loss": 2.413, "mean_token_accuracy": 0.4783767703920603, "num_tokens": 240991388.0, "step": 1661 }, { "entropy": 2.455322265625, "epoch": 0.0286149634565223, "grad_norm": 0.5903511047363281, "learning_rate": 9.996691764534255e-06, "loss": 2.4062, "mean_token_accuracy": 0.48205051850527525, "num_tokens": 241124952.0, "step": 1662 }, { "entropy": 2.498046875, "epoch": 0.02863218064271756, "grad_norm": 0.5668911933898926, "learning_rate": 9.996681748281094e-06, "loss": 2.4549, "mean_token_accuracy": 0.4747361782938242, "num_tokens": 241262036.0, "step": 1663 }, { "entropy": 2.5220947265625, "epoch": 0.02864939782891282, "grad_norm": 0.6033514738082886, "learning_rate": 9.996671716892914e-06, "loss": 2.5395, "mean_token_accuracy": 0.46681966073811054, "num_tokens": 241416917.0, "step": 1664 }, { "entropy": 2.4417724609375, "epoch": 0.02866661501510808, "grad_norm": 0.6681655645370483, "learning_rate": 9.996661670369739e-06, "loss": 2.3974, "mean_token_accuracy": 0.48373745242133737, "num_tokens": 241554068.0, "step": 1665 }, { "entropy": 2.4580078125, "epoch": 0.02868383220130334, "grad_norm": 0.5739055275917053, "learning_rate": 9.996651608711603e-06, "loss": 2.361, "mean_token_accuracy": 0.48040412552654743, "num_tokens": 241714285.0, "step": 1666 }, { "entropy": 2.4683837890625, "epoch": 0.028701049387498602, "grad_norm": 0.6406212449073792, "learning_rate": 9.996641531918536e-06, "loss": 2.43, "mean_token_accuracy": 0.48026127368211746, "num_tokens": 241841614.0, "step": 1667 }, { "entropy": 2.5107421875, "epoch": 0.028718266573693862, "grad_norm": 0.5852003693580627, "learning_rate": 9.996631439990568e-06, "loss": 2.4992, "mean_token_accuracy": 0.4704901375807822, "num_tokens": 241980442.0, "step": 1668 }, { "entropy": 2.4345703125, "epoch": 0.028735483759889123, "grad_norm": 0.6459202170372009, "learning_rate": 9.996621332927729e-06, "loss": 2.4125, "mean_token_accuracy": 0.4840882923454046, "num_tokens": 242131950.0, "step": 1669 }, { "entropy": 2.4384765625, "epoch": 0.02875270094608438, "grad_norm": 0.6423004269599915, "learning_rate": 9.996611210730053e-06, "loss": 2.4433, "mean_token_accuracy": 0.48514924151822925, "num_tokens": 242274852.0, "step": 1670 }, { "entropy": 2.502685546875, "epoch": 0.02876991813227964, "grad_norm": 0.5465385913848877, "learning_rate": 9.996601073397568e-06, "loss": 2.4691, "mean_token_accuracy": 0.4714298346079886, "num_tokens": 242426188.0, "step": 1671 }, { "entropy": 2.405517578125, "epoch": 0.0287871353184749, "grad_norm": 0.5957438945770264, "learning_rate": 9.996590920930303e-06, "loss": 2.3951, "mean_token_accuracy": 0.48868590872734785, "num_tokens": 242571018.0, "step": 1672 }, { "entropy": 2.550537109375, "epoch": 0.02880435250467016, "grad_norm": 0.5585090517997742, "learning_rate": 9.99658075332829e-06, "loss": 2.5181, "mean_token_accuracy": 0.46476193610578775, "num_tokens": 242721700.0, "step": 1673 }, { "entropy": 2.4349365234375, "epoch": 0.028821569690865422, "grad_norm": 0.6010538935661316, "learning_rate": 9.996570570591561e-06, "loss": 2.3535, "mean_token_accuracy": 0.48848696844652295, "num_tokens": 242858693.0, "step": 1674 }, { "entropy": 2.4791259765625, "epoch": 0.028838786877060683, "grad_norm": 0.5733229517936707, "learning_rate": 9.996560372720147e-06, "loss": 2.4825, "mean_token_accuracy": 0.4683424327522516, "num_tokens": 243006868.0, "step": 1675 }, { "entropy": 2.4215087890625, "epoch": 0.028856004063255943, "grad_norm": 0.5834994316101074, "learning_rate": 9.996550159714078e-06, "loss": 2.3763, "mean_token_accuracy": 0.4828924615867436, "num_tokens": 243154886.0, "step": 1676 }, { "entropy": 2.44775390625, "epoch": 0.028873221249451204, "grad_norm": 0.6166858077049255, "learning_rate": 9.996539931573385e-06, "loss": 2.3781, "mean_token_accuracy": 0.4847164931707084, "num_tokens": 243288516.0, "step": 1677 }, { "entropy": 2.49072265625, "epoch": 0.02889043843564646, "grad_norm": 0.5641864538192749, "learning_rate": 9.996529688298098e-06, "loss": 2.4465, "mean_token_accuracy": 0.473747827578336, "num_tokens": 243441352.0, "step": 1678 }, { "entropy": 2.451171875, "epoch": 0.02890765562184172, "grad_norm": 0.6030487418174744, "learning_rate": 9.996519429888249e-06, "loss": 2.4481, "mean_token_accuracy": 0.4830109137110412, "num_tokens": 243588150.0, "step": 1679 }, { "entropy": 2.4708251953125, "epoch": 0.028924872808036982, "grad_norm": 0.5804004073143005, "learning_rate": 9.99650915634387e-06, "loss": 2.4297, "mean_token_accuracy": 0.47798701329156756, "num_tokens": 243737262.0, "step": 1680 }, { "entropy": 2.4835205078125, "epoch": 0.028942089994232242, "grad_norm": 0.5946245789527893, "learning_rate": 9.996498867664992e-06, "loss": 2.4639, "mean_token_accuracy": 0.47278458066284657, "num_tokens": 243872204.0, "step": 1681 }, { "entropy": 2.450439453125, "epoch": 0.028959307180427503, "grad_norm": 0.6240355372428894, "learning_rate": 9.996488563851646e-06, "loss": 2.3891, "mean_token_accuracy": 0.48081753915175796, "num_tokens": 243998841.0, "step": 1682 }, { "entropy": 2.385009765625, "epoch": 0.028976524366622763, "grad_norm": 0.5424374341964722, "learning_rate": 9.99647824490386e-06, "loss": 2.3166, "mean_token_accuracy": 0.490779772400856, "num_tokens": 244149478.0, "step": 1683 }, { "entropy": 2.4439697265625, "epoch": 0.028993741552818024, "grad_norm": 0.718917727470398, "learning_rate": 9.99646791082167e-06, "loss": 2.4061, "mean_token_accuracy": 0.4811778524890542, "num_tokens": 244291993.0, "step": 1684 }, { "entropy": 2.394287109375, "epoch": 0.029010958739013284, "grad_norm": 0.6151554584503174, "learning_rate": 9.996457561605105e-06, "loss": 2.3617, "mean_token_accuracy": 0.48714847723022103, "num_tokens": 244435849.0, "step": 1685 }, { "entropy": 2.4591064453125, "epoch": 0.02902817592520854, "grad_norm": 0.5844502449035645, "learning_rate": 9.996447197254195e-06, "loss": 2.4036, "mean_token_accuracy": 0.48159506963565946, "num_tokens": 244580376.0, "step": 1686 }, { "entropy": 2.40625, "epoch": 0.029045393111403802, "grad_norm": 0.5591570734977722, "learning_rate": 9.996436817768974e-06, "loss": 2.3592, "mean_token_accuracy": 0.49357689963653684, "num_tokens": 244729260.0, "step": 1687 }, { "entropy": 2.4224853515625, "epoch": 0.029062610297599063, "grad_norm": 0.5616257190704346, "learning_rate": 9.996426423149472e-06, "loss": 2.3885, "mean_token_accuracy": 0.482600009534508, "num_tokens": 244872812.0, "step": 1688 }, { "entropy": 2.500244140625, "epoch": 0.029079827483794323, "grad_norm": 0.5827319622039795, "learning_rate": 9.996416013395721e-06, "loss": 2.4213, "mean_token_accuracy": 0.47733526350930333, "num_tokens": 245013545.0, "step": 1689 }, { "entropy": 2.4273681640625, "epoch": 0.029097044669989584, "grad_norm": 0.5836253762245178, "learning_rate": 9.996405588507753e-06, "loss": 2.4129, "mean_token_accuracy": 0.48227705620229244, "num_tokens": 245158448.0, "step": 1690 }, { "entropy": 2.41015625, "epoch": 0.029114261856184844, "grad_norm": 0.5588595271110535, "learning_rate": 9.996395148485598e-06, "loss": 2.3954, "mean_token_accuracy": 0.48199733812361956, "num_tokens": 245323743.0, "step": 1691 }, { "entropy": 2.431396484375, "epoch": 0.029131479042380105, "grad_norm": 0.585265576839447, "learning_rate": 9.996384693329289e-06, "loss": 2.3807, "mean_token_accuracy": 0.4879599893465638, "num_tokens": 245479102.0, "step": 1692 }, { "entropy": 2.47900390625, "epoch": 0.029148696228575365, "grad_norm": 0.5371046662330627, "learning_rate": 9.996374223038858e-06, "loss": 2.4331, "mean_token_accuracy": 0.47392465313896537, "num_tokens": 245627290.0, "step": 1693 }, { "entropy": 2.4324951171875, "epoch": 0.029165913414770626, "grad_norm": 0.608191967010498, "learning_rate": 9.996363737614334e-06, "loss": 2.3786, "mean_token_accuracy": 0.4849713812582195, "num_tokens": 245761478.0, "step": 1694 }, { "entropy": 2.5067138671875, "epoch": 0.029183130600965883, "grad_norm": 0.5639045834541321, "learning_rate": 9.996353237055753e-06, "loss": 2.4584, "mean_token_accuracy": 0.47028431948274374, "num_tokens": 245900244.0, "step": 1695 }, { "entropy": 2.4661865234375, "epoch": 0.029200347787161143, "grad_norm": 0.5640761852264404, "learning_rate": 9.996342721363144e-06, "loss": 2.4086, "mean_token_accuracy": 0.47613369347527623, "num_tokens": 246061900.0, "step": 1696 }, { "entropy": 2.413818359375, "epoch": 0.029217564973356404, "grad_norm": 0.5837756395339966, "learning_rate": 9.996332190536538e-06, "loss": 2.3655, "mean_token_accuracy": 0.48909167293459177, "num_tokens": 246203266.0, "step": 1697 }, { "entropy": 2.466064453125, "epoch": 0.029234782159551664, "grad_norm": 0.6410163044929504, "learning_rate": 9.99632164457597e-06, "loss": 2.4183, "mean_token_accuracy": 0.47384981345385313, "num_tokens": 246345295.0, "step": 1698 }, { "entropy": 2.474365234375, "epoch": 0.029251999345746925, "grad_norm": 0.5565051436424255, "learning_rate": 9.99631108348147e-06, "loss": 2.452, "mean_token_accuracy": 0.4777741697616875, "num_tokens": 246501317.0, "step": 1699 }, { "entropy": 2.4288330078125, "epoch": 0.029269216531942185, "grad_norm": 0.5428564548492432, "learning_rate": 9.996300507253068e-06, "loss": 2.3949, "mean_token_accuracy": 0.48448564764112234, "num_tokens": 246654551.0, "step": 1700 }, { "entropy": 2.4349365234375, "epoch": 0.029286433718137446, "grad_norm": 0.5566284656524658, "learning_rate": 9.9962899158908e-06, "loss": 2.4152, "mean_token_accuracy": 0.4823923450894654, "num_tokens": 246816439.0, "step": 1701 }, { "entropy": 2.48681640625, "epoch": 0.029303650904332706, "grad_norm": 0.5799583792686462, "learning_rate": 9.996279309394697e-06, "loss": 2.4778, "mean_token_accuracy": 0.4676981116645038, "num_tokens": 246961368.0, "step": 1702 }, { "entropy": 2.494140625, "epoch": 0.029320868090527964, "grad_norm": 0.5828601121902466, "learning_rate": 9.99626868776479e-06, "loss": 2.491, "mean_token_accuracy": 0.4712865217588842, "num_tokens": 247098743.0, "step": 1703 }, { "entropy": 2.4791259765625, "epoch": 0.029338085276723224, "grad_norm": 0.6346420645713806, "learning_rate": 9.99625805100111e-06, "loss": 2.462, "mean_token_accuracy": 0.4806337282061577, "num_tokens": 247245594.0, "step": 1704 }, { "entropy": 2.4534912109375, "epoch": 0.029355302462918485, "grad_norm": 0.5871755480766296, "learning_rate": 9.996247399103693e-06, "loss": 2.4171, "mean_token_accuracy": 0.4820070252753794, "num_tokens": 247385625.0, "step": 1705 }, { "entropy": 2.427001953125, "epoch": 0.029372519649113745, "grad_norm": 0.570570170879364, "learning_rate": 9.996236732072568e-06, "loss": 2.4239, "mean_token_accuracy": 0.4824762148782611, "num_tokens": 247535659.0, "step": 1706 }, { "entropy": 2.4500732421875, "epoch": 0.029389736835309006, "grad_norm": 0.5755301713943481, "learning_rate": 9.99622604990777e-06, "loss": 2.4254, "mean_token_accuracy": 0.48341484228149056, "num_tokens": 247679680.0, "step": 1707 }, { "entropy": 2.4388427734375, "epoch": 0.029406954021504266, "grad_norm": 0.5620051622390747, "learning_rate": 9.996215352609327e-06, "loss": 2.4294, "mean_token_accuracy": 0.4801658443175256, "num_tokens": 247823282.0, "step": 1708 }, { "entropy": 2.495361328125, "epoch": 0.029424171207699527, "grad_norm": 0.5927258133888245, "learning_rate": 9.996204640177276e-06, "loss": 2.4746, "mean_token_accuracy": 0.47673043003305793, "num_tokens": 247963838.0, "step": 1709 }, { "entropy": 2.48583984375, "epoch": 0.029441388393894787, "grad_norm": 0.620323657989502, "learning_rate": 9.996193912611648e-06, "loss": 2.4201, "mean_token_accuracy": 0.4740843581967056, "num_tokens": 248101252.0, "step": 1710 }, { "entropy": 2.49658203125, "epoch": 0.029458605580090044, "grad_norm": 0.5656887888908386, "learning_rate": 9.996183169912474e-06, "loss": 2.4453, "mean_token_accuracy": 0.4755016630515456, "num_tokens": 248251659.0, "step": 1711 }, { "entropy": 2.44482421875, "epoch": 0.029475822766285305, "grad_norm": 0.5744235515594482, "learning_rate": 9.996172412079788e-06, "loss": 2.4092, "mean_token_accuracy": 0.4831536212004721, "num_tokens": 248394514.0, "step": 1712 }, { "entropy": 2.4427490234375, "epoch": 0.029493039952480565, "grad_norm": 0.5855416059494019, "learning_rate": 9.996161639113622e-06, "loss": 2.399, "mean_token_accuracy": 0.48080876702442765, "num_tokens": 248535417.0, "step": 1713 }, { "entropy": 2.4171142578125, "epoch": 0.029510257138675826, "grad_norm": 0.6481823325157166, "learning_rate": 9.99615085101401e-06, "loss": 2.3865, "mean_token_accuracy": 0.48705816362053156, "num_tokens": 248672896.0, "step": 1714 }, { "entropy": 2.397216796875, "epoch": 0.029527474324871086, "grad_norm": 0.525701642036438, "learning_rate": 9.996140047780983e-06, "loss": 2.3877, "mean_token_accuracy": 0.48106732219457626, "num_tokens": 248838851.0, "step": 1715 }, { "entropy": 2.412353515625, "epoch": 0.029544691511066347, "grad_norm": 0.5457813143730164, "learning_rate": 9.996129229414573e-06, "loss": 2.362, "mean_token_accuracy": 0.4910212284885347, "num_tokens": 249000921.0, "step": 1716 }, { "entropy": 2.3861083984375, "epoch": 0.029561908697261607, "grad_norm": 0.5792080163955688, "learning_rate": 9.996118395914816e-06, "loss": 2.3344, "mean_token_accuracy": 0.4931281958706677, "num_tokens": 249142323.0, "step": 1717 }, { "entropy": 2.4461669921875, "epoch": 0.029579125883456868, "grad_norm": 0.5751327872276306, "learning_rate": 9.996107547281741e-06, "loss": 2.4195, "mean_token_accuracy": 0.4737996533513069, "num_tokens": 249273907.0, "step": 1718 }, { "entropy": 2.50390625, "epoch": 0.02959634306965213, "grad_norm": 0.5305837988853455, "learning_rate": 9.996096683515385e-06, "loss": 2.4582, "mean_token_accuracy": 0.47129624895751476, "num_tokens": 249420011.0, "step": 1719 }, { "entropy": 2.50146484375, "epoch": 0.029613560255847386, "grad_norm": 0.5615363121032715, "learning_rate": 9.996085804615776e-06, "loss": 2.4453, "mean_token_accuracy": 0.4726092144846916, "num_tokens": 249577874.0, "step": 1720 }, { "entropy": 2.4356689453125, "epoch": 0.029630777442042646, "grad_norm": 0.584879457950592, "learning_rate": 9.99607491058295e-06, "loss": 2.3874, "mean_token_accuracy": 0.4838989768177271, "num_tokens": 249733995.0, "step": 1721 }, { "entropy": 2.4276123046875, "epoch": 0.029647994628237907, "grad_norm": 0.6611423492431641, "learning_rate": 9.996064001416943e-06, "loss": 2.3991, "mean_token_accuracy": 0.4866844713687897, "num_tokens": 249863834.0, "step": 1722 }, { "entropy": 2.387939453125, "epoch": 0.029665211814433167, "grad_norm": 0.5828907489776611, "learning_rate": 9.996053077117781e-06, "loss": 2.35, "mean_token_accuracy": 0.48920594388619065, "num_tokens": 249999433.0, "step": 1723 }, { "entropy": 2.5263671875, "epoch": 0.029682429000628428, "grad_norm": 0.6122536063194275, "learning_rate": 9.996042137685502e-06, "loss": 2.5203, "mean_token_accuracy": 0.4699005661532283, "num_tokens": 250134401.0, "step": 1724 }, { "entropy": 2.44482421875, "epoch": 0.029699646186823688, "grad_norm": 0.6737586259841919, "learning_rate": 9.996031183120138e-06, "loss": 2.3536, "mean_token_accuracy": 0.48844635486602783, "num_tokens": 250292079.0, "step": 1725 }, { "entropy": 2.47119140625, "epoch": 0.02971686337301895, "grad_norm": 0.5378682613372803, "learning_rate": 9.996020213421722e-06, "loss": 2.4471, "mean_token_accuracy": 0.48123562429100275, "num_tokens": 250449143.0, "step": 1726 }, { "entropy": 2.448974609375, "epoch": 0.02973408055921421, "grad_norm": 0.5978697538375854, "learning_rate": 9.996009228590286e-06, "loss": 2.4195, "mean_token_accuracy": 0.48353995755314827, "num_tokens": 250581816.0, "step": 1727 }, { "entropy": 2.515869140625, "epoch": 0.029751297745409466, "grad_norm": 0.5631060004234314, "learning_rate": 9.995998228625868e-06, "loss": 2.4653, "mean_token_accuracy": 0.473801807500422, "num_tokens": 250717401.0, "step": 1728 }, { "entropy": 2.482421875, "epoch": 0.029768514931604727, "grad_norm": 0.5445279479026794, "learning_rate": 9.995987213528493e-06, "loss": 2.4938, "mean_token_accuracy": 0.47592513309791684, "num_tokens": 250867240.0, "step": 1729 }, { "entropy": 2.5250244140625, "epoch": 0.029785732117799987, "grad_norm": 0.5898825526237488, "learning_rate": 9.995976183298204e-06, "loss": 2.4803, "mean_token_accuracy": 0.4719364200718701, "num_tokens": 251016870.0, "step": 1730 }, { "entropy": 2.4178466796875, "epoch": 0.029802949303995248, "grad_norm": 0.5892136096954346, "learning_rate": 9.995965137935026e-06, "loss": 2.3497, "mean_token_accuracy": 0.48639260046184063, "num_tokens": 251159320.0, "step": 1731 }, { "entropy": 2.41259765625, "epoch": 0.02982016649019051, "grad_norm": 0.6107653379440308, "learning_rate": 9.995954077439e-06, "loss": 2.3769, "mean_token_accuracy": 0.4900634833611548, "num_tokens": 251294730.0, "step": 1732 }, { "entropy": 2.41552734375, "epoch": 0.02983738367638577, "grad_norm": 0.5585425496101379, "learning_rate": 9.995943001810151e-06, "loss": 2.3654, "mean_token_accuracy": 0.48663356387987733, "num_tokens": 251438812.0, "step": 1733 }, { "entropy": 2.45751953125, "epoch": 0.02985460086258103, "grad_norm": 0.5406437516212463, "learning_rate": 9.99593191104852e-06, "loss": 2.4438, "mean_token_accuracy": 0.4822356994263828, "num_tokens": 251597411.0, "step": 1734 }, { "entropy": 2.44091796875, "epoch": 0.02987181804877629, "grad_norm": 0.5275185704231262, "learning_rate": 9.995920805154136e-06, "loss": 2.3614, "mean_token_accuracy": 0.4821869912557304, "num_tokens": 251751924.0, "step": 1735 }, { "entropy": 2.424560546875, "epoch": 0.029889035234971547, "grad_norm": 0.5398510694503784, "learning_rate": 9.995909684127036e-06, "loss": 2.3199, "mean_token_accuracy": 0.4816825478337705, "num_tokens": 251897737.0, "step": 1736 }, { "entropy": 2.421142578125, "epoch": 0.029906252421166808, "grad_norm": 0.5590670704841614, "learning_rate": 9.995898547967252e-06, "loss": 2.3626, "mean_token_accuracy": 0.49043199233710766, "num_tokens": 252047294.0, "step": 1737 }, { "entropy": 2.443359375, "epoch": 0.029923469607362068, "grad_norm": 0.5771318078041077, "learning_rate": 9.995887396674816e-06, "loss": 2.4432, "mean_token_accuracy": 0.4781417637132108, "num_tokens": 252192301.0, "step": 1738 }, { "entropy": 2.5240478515625, "epoch": 0.02994068679355733, "grad_norm": 0.5523571968078613, "learning_rate": 9.995876230249765e-06, "loss": 2.4913, "mean_token_accuracy": 0.4711290653795004, "num_tokens": 252343478.0, "step": 1739 }, { "entropy": 2.466796875, "epoch": 0.02995790397975259, "grad_norm": 0.5873834490776062, "learning_rate": 9.99586504869213e-06, "loss": 2.4351, "mean_token_accuracy": 0.4716539951041341, "num_tokens": 252494029.0, "step": 1740 }, { "entropy": 2.42822265625, "epoch": 0.02997512116594785, "grad_norm": 0.6091635227203369, "learning_rate": 9.99585385200195e-06, "loss": 2.3937, "mean_token_accuracy": 0.4848588118329644, "num_tokens": 252644102.0, "step": 1741 }, { "entropy": 2.36083984375, "epoch": 0.02999233835214311, "grad_norm": 0.6105889678001404, "learning_rate": 9.995842640179251e-06, "loss": 2.3327, "mean_token_accuracy": 0.4923085803166032, "num_tokens": 252785353.0, "step": 1742 }, { "entropy": 2.4542236328125, "epoch": 0.03000955553833837, "grad_norm": 0.5387972593307495, "learning_rate": 9.995831413224073e-06, "loss": 2.3902, "mean_token_accuracy": 0.48074496537446976, "num_tokens": 252931252.0, "step": 1743 }, { "entropy": 2.4627685546875, "epoch": 0.03002677272453363, "grad_norm": 0.588346540927887, "learning_rate": 9.995820171136447e-06, "loss": 2.4444, "mean_token_accuracy": 0.4807250229641795, "num_tokens": 253081891.0, "step": 1744 }, { "entropy": 2.404052734375, "epoch": 0.030043989910728888, "grad_norm": 0.6005964279174805, "learning_rate": 9.995808913916409e-06, "loss": 2.3296, "mean_token_accuracy": 0.4894480253569782, "num_tokens": 253227000.0, "step": 1745 }, { "entropy": 2.4176025390625, "epoch": 0.03006120709692415, "grad_norm": 0.5213177800178528, "learning_rate": 9.995797641563993e-06, "loss": 2.3511, "mean_token_accuracy": 0.4866582774557173, "num_tokens": 253382573.0, "step": 1746 }, { "entropy": 2.4058837890625, "epoch": 0.03007842428311941, "grad_norm": 0.6120131015777588, "learning_rate": 9.995786354079232e-06, "loss": 2.3611, "mean_token_accuracy": 0.48763601342216134, "num_tokens": 253516221.0, "step": 1747 }, { "entropy": 2.4691162109375, "epoch": 0.03009564146931467, "grad_norm": 0.5826076865196228, "learning_rate": 9.99577505146216e-06, "loss": 2.4718, "mean_token_accuracy": 0.4789294474758208, "num_tokens": 253656220.0, "step": 1748 }, { "entropy": 2.5205078125, "epoch": 0.03011285865550993, "grad_norm": 0.6555241346359253, "learning_rate": 9.995763733712813e-06, "loss": 2.4926, "mean_token_accuracy": 0.47109587537124753, "num_tokens": 253792424.0, "step": 1749 }, { "entropy": 2.447998046875, "epoch": 0.03013007584170519, "grad_norm": 1.2172026634216309, "learning_rate": 9.995752400831224e-06, "loss": 2.3486, "mean_token_accuracy": 0.4864461640827358, "num_tokens": 253936774.0, "step": 1750 }, { "entropy": 2.4464111328125, "epoch": 0.03014729302790045, "grad_norm": 0.5619203448295593, "learning_rate": 9.995741052817426e-06, "loss": 2.4002, "mean_token_accuracy": 0.4801822490990162, "num_tokens": 254079519.0, "step": 1751 }, { "entropy": 2.40087890625, "epoch": 0.030164510214095712, "grad_norm": 0.6061223745346069, "learning_rate": 9.995729689671457e-06, "loss": 2.3576, "mean_token_accuracy": 0.49237477174028754, "num_tokens": 254218671.0, "step": 1752 }, { "entropy": 2.4599609375, "epoch": 0.03018172740029097, "grad_norm": 0.5428354740142822, "learning_rate": 9.995718311393348e-06, "loss": 2.4407, "mean_token_accuracy": 0.47417072020471096, "num_tokens": 254366027.0, "step": 1753 }, { "entropy": 2.444580078125, "epoch": 0.03019894458648623, "grad_norm": 0.6137836575508118, "learning_rate": 9.995706917983135e-06, "loss": 2.3982, "mean_token_accuracy": 0.4815092282369733, "num_tokens": 254500348.0, "step": 1754 }, { "entropy": 2.4688720703125, "epoch": 0.03021616177268149, "grad_norm": 0.5226655602455139, "learning_rate": 9.99569550944085e-06, "loss": 2.4489, "mean_token_accuracy": 0.47875166358426213, "num_tokens": 254665949.0, "step": 1755 }, { "entropy": 2.4661865234375, "epoch": 0.03023337895887675, "grad_norm": 0.5874465107917786, "learning_rate": 9.995684085766532e-06, "loss": 2.4685, "mean_token_accuracy": 0.47104707127436996, "num_tokens": 254821526.0, "step": 1756 }, { "entropy": 2.3651123046875, "epoch": 0.03025059614507201, "grad_norm": 0.5636343955993652, "learning_rate": 9.995672646960214e-06, "loss": 2.3245, "mean_token_accuracy": 0.49339101929217577, "num_tokens": 254965888.0, "step": 1757 }, { "entropy": 2.449951171875, "epoch": 0.03026781333126727, "grad_norm": 0.5968552231788635, "learning_rate": 9.995661193021929e-06, "loss": 2.4269, "mean_token_accuracy": 0.47896120324730873, "num_tokens": 255097221.0, "step": 1758 }, { "entropy": 2.50341796875, "epoch": 0.030285030517462532, "grad_norm": 0.6104995012283325, "learning_rate": 9.995649723951713e-06, "loss": 2.4343, "mean_token_accuracy": 0.48089546570554376, "num_tokens": 255240193.0, "step": 1759 }, { "entropy": 2.516357421875, "epoch": 0.030302247703657793, "grad_norm": 0.6157956719398499, "learning_rate": 9.995638239749601e-06, "loss": 2.4065, "mean_token_accuracy": 0.47861207462847233, "num_tokens": 255377884.0, "step": 1760 }, { "entropy": 2.451416015625, "epoch": 0.03031946488985305, "grad_norm": 0.5781038403511047, "learning_rate": 9.995626740415627e-06, "loss": 2.3953, "mean_token_accuracy": 0.4827661495655775, "num_tokens": 255545815.0, "step": 1761 }, { "entropy": 2.41162109375, "epoch": 0.03033668207604831, "grad_norm": 0.5962122678756714, "learning_rate": 9.995615225949826e-06, "loss": 2.3384, "mean_token_accuracy": 0.48970350436866283, "num_tokens": 255687102.0, "step": 1762 }, { "entropy": 2.56103515625, "epoch": 0.03035389926224357, "grad_norm": 0.556058943271637, "learning_rate": 9.995603696352231e-06, "loss": 2.5805, "mean_token_accuracy": 0.46257754508405924, "num_tokens": 255832845.0, "step": 1763 }, { "entropy": 2.5029296875, "epoch": 0.03037111644843883, "grad_norm": 0.5889034867286682, "learning_rate": 9.995592151622881e-06, "loss": 2.4863, "mean_token_accuracy": 0.4708904637955129, "num_tokens": 255966812.0, "step": 1764 }, { "entropy": 2.4873046875, "epoch": 0.030388333634634092, "grad_norm": 0.564799964427948, "learning_rate": 9.99558059176181e-06, "loss": 2.4345, "mean_token_accuracy": 0.47310470743104815, "num_tokens": 256115280.0, "step": 1765 }, { "entropy": 2.46533203125, "epoch": 0.030405550820829352, "grad_norm": 0.5584237575531006, "learning_rate": 9.995569016769052e-06, "loss": 2.4093, "mean_token_accuracy": 0.48134846799075603, "num_tokens": 256262835.0, "step": 1766 }, { "entropy": 2.4100341796875, "epoch": 0.030422768007024613, "grad_norm": 0.6054108142852783, "learning_rate": 9.99555742664464e-06, "loss": 2.3842, "mean_token_accuracy": 0.4840676193125546, "num_tokens": 256412903.0, "step": 1767 }, { "entropy": 2.4122314453125, "epoch": 0.030439985193219873, "grad_norm": 0.5872666835784912, "learning_rate": 9.99554582138861e-06, "loss": 2.4073, "mean_token_accuracy": 0.4819220928475261, "num_tokens": 256556383.0, "step": 1768 }, { "entropy": 2.4541015625, "epoch": 0.030457202379415134, "grad_norm": 0.6415526270866394, "learning_rate": 9.995534201001e-06, "loss": 2.4326, "mean_token_accuracy": 0.48358411993831396, "num_tokens": 256701332.0, "step": 1769 }, { "entropy": 2.5523681640625, "epoch": 0.03047441956561039, "grad_norm": 0.5595957040786743, "learning_rate": 9.995522565481843e-06, "loss": 2.5369, "mean_token_accuracy": 0.46628848975524306, "num_tokens": 256846217.0, "step": 1770 }, { "entropy": 2.4697265625, "epoch": 0.03049163675180565, "grad_norm": 0.5512428879737854, "learning_rate": 9.995510914831175e-06, "loss": 2.4261, "mean_token_accuracy": 0.4768143934197724, "num_tokens": 256989853.0, "step": 1771 }, { "entropy": 2.4036865234375, "epoch": 0.030508853938000912, "grad_norm": 0.5823377370834351, "learning_rate": 9.995499249049032e-06, "loss": 2.3555, "mean_token_accuracy": 0.4956060196273029, "num_tokens": 257135041.0, "step": 1772 }, { "entropy": 2.4488525390625, "epoch": 0.030526071124196173, "grad_norm": 0.555253803730011, "learning_rate": 9.995487568135447e-06, "loss": 2.4463, "mean_token_accuracy": 0.4722318626008928, "num_tokens": 257277086.0, "step": 1773 }, { "entropy": 2.419921875, "epoch": 0.030543288310391433, "grad_norm": 0.6303548216819763, "learning_rate": 9.995475872090456e-06, "loss": 2.3485, "mean_token_accuracy": 0.48873788258060813, "num_tokens": 257419673.0, "step": 1774 }, { "entropy": 2.552490234375, "epoch": 0.030560505496586694, "grad_norm": 0.5881249904632568, "learning_rate": 9.995464160914096e-06, "loss": 2.5034, "mean_token_accuracy": 0.4676412958651781, "num_tokens": 257552483.0, "step": 1775 }, { "entropy": 2.4627685546875, "epoch": 0.030577722682781954, "grad_norm": 0.5897058248519897, "learning_rate": 9.995452434606401e-06, "loss": 2.3792, "mean_token_accuracy": 0.4818587265908718, "num_tokens": 257696825.0, "step": 1776 }, { "entropy": 2.431396484375, "epoch": 0.030594939868977215, "grad_norm": 0.5501959919929504, "learning_rate": 9.995440693167407e-06, "loss": 2.3885, "mean_token_accuracy": 0.4862223519012332, "num_tokens": 257852558.0, "step": 1777 }, { "entropy": 2.48681640625, "epoch": 0.03061215705517247, "grad_norm": 0.5968089699745178, "learning_rate": 9.99542893659715e-06, "loss": 2.4541, "mean_token_accuracy": 0.4725658977404237, "num_tokens": 257972949.0, "step": 1778 }, { "entropy": 2.490966796875, "epoch": 0.030629374241367732, "grad_norm": 0.5689330101013184, "learning_rate": 9.995417164895665e-06, "loss": 2.4556, "mean_token_accuracy": 0.4732924662530422, "num_tokens": 258118477.0, "step": 1779 }, { "entropy": 2.4151611328125, "epoch": 0.030646591427562993, "grad_norm": 0.6031442880630493, "learning_rate": 9.99540537806299e-06, "loss": 2.4075, "mean_token_accuracy": 0.47733619809150696, "num_tokens": 258262583.0, "step": 1780 }, { "entropy": 2.418701171875, "epoch": 0.030663808613758253, "grad_norm": 0.5307676792144775, "learning_rate": 9.995393576099156e-06, "loss": 2.4052, "mean_token_accuracy": 0.48428564984351397, "num_tokens": 258415703.0, "step": 1781 }, { "entropy": 2.466552734375, "epoch": 0.030681025799953514, "grad_norm": 0.613979697227478, "learning_rate": 9.995381759004202e-06, "loss": 2.4284, "mean_token_accuracy": 0.48056693421676755, "num_tokens": 258546742.0, "step": 1782 }, { "entropy": 2.5028076171875, "epoch": 0.030698242986148774, "grad_norm": 0.6007286906242371, "learning_rate": 9.995369926778163e-06, "loss": 2.4515, "mean_token_accuracy": 0.4713759426958859, "num_tokens": 258667720.0, "step": 1783 }, { "entropy": 2.459228515625, "epoch": 0.030715460172344035, "grad_norm": 0.5262390375137329, "learning_rate": 9.995358079421076e-06, "loss": 2.4149, "mean_token_accuracy": 0.476927958894521, "num_tokens": 258818608.0, "step": 1784 }, { "entropy": 2.476318359375, "epoch": 0.030732677358539295, "grad_norm": 0.6184121370315552, "learning_rate": 9.995346216932974e-06, "loss": 2.4888, "mean_token_accuracy": 0.47189527517184615, "num_tokens": 258947687.0, "step": 1785 }, { "entropy": 2.421630859375, "epoch": 0.030749894544734552, "grad_norm": 0.641552209854126, "learning_rate": 9.995334339313898e-06, "loss": 2.3801, "mean_token_accuracy": 0.48330466262996197, "num_tokens": 259089154.0, "step": 1786 }, { "entropy": 2.3883056640625, "epoch": 0.030767111730929813, "grad_norm": 0.573010265827179, "learning_rate": 9.995322446563878e-06, "loss": 2.3846, "mean_token_accuracy": 0.48760603182017803, "num_tokens": 259231925.0, "step": 1787 }, { "entropy": 2.4847412109375, "epoch": 0.030784328917125073, "grad_norm": 0.5725507736206055, "learning_rate": 9.995310538682954e-06, "loss": 2.4799, "mean_token_accuracy": 0.4728474370203912, "num_tokens": 259374698.0, "step": 1788 }, { "entropy": 2.4398193359375, "epoch": 0.030801546103320334, "grad_norm": 0.681101381778717, "learning_rate": 9.995298615671161e-06, "loss": 2.4197, "mean_token_accuracy": 0.4821738447062671, "num_tokens": 259524367.0, "step": 1789 }, { "entropy": 2.400146484375, "epoch": 0.030818763289515595, "grad_norm": 0.5721313953399658, "learning_rate": 9.995286677528533e-06, "loss": 2.363, "mean_token_accuracy": 0.48788571590557694, "num_tokens": 259678466.0, "step": 1790 }, { "entropy": 2.4361572265625, "epoch": 0.030835980475710855, "grad_norm": 0.6100559234619141, "learning_rate": 9.995274724255111e-06, "loss": 2.4359, "mean_token_accuracy": 0.48224303452298045, "num_tokens": 259822604.0, "step": 1791 }, { "entropy": 2.4278564453125, "epoch": 0.030853197661906116, "grad_norm": 0.5751356482505798, "learning_rate": 9.995262755850926e-06, "loss": 2.4072, "mean_token_accuracy": 0.48218475840985775, "num_tokens": 259965240.0, "step": 1792 }, { "entropy": 2.448974609375, "epoch": 0.030870414848101376, "grad_norm": 0.6143491864204407, "learning_rate": 9.995250772316019e-06, "loss": 2.4289, "mean_token_accuracy": 0.4808902507647872, "num_tokens": 260105881.0, "step": 1793 }, { "entropy": 2.53466796875, "epoch": 0.030887632034296633, "grad_norm": 0.6435692310333252, "learning_rate": 9.995238773650422e-06, "loss": 2.5116, "mean_token_accuracy": 0.46682596672326326, "num_tokens": 260232986.0, "step": 1794 }, { "entropy": 2.459716796875, "epoch": 0.030904849220491894, "grad_norm": 0.5735113620758057, "learning_rate": 9.995226759854173e-06, "loss": 2.4054, "mean_token_accuracy": 0.4811546648852527, "num_tokens": 260366767.0, "step": 1795 }, { "entropy": 2.465576171875, "epoch": 0.030922066406687154, "grad_norm": 1.1088193655014038, "learning_rate": 9.995214730927309e-06, "loss": 2.4319, "mean_token_accuracy": 0.47814410272985697, "num_tokens": 260518664.0, "step": 1796 }, { "entropy": 2.4351806640625, "epoch": 0.030939283592882415, "grad_norm": 0.6060841083526611, "learning_rate": 9.995202686869867e-06, "loss": 2.4261, "mean_token_accuracy": 0.48954142862930894, "num_tokens": 260665493.0, "step": 1797 }, { "entropy": 2.4580078125, "epoch": 0.030956500779077675, "grad_norm": 0.5547589659690857, "learning_rate": 9.995190627681883e-06, "loss": 2.4379, "mean_token_accuracy": 0.476859278511256, "num_tokens": 260813391.0, "step": 1798 }, { "entropy": 2.43603515625, "epoch": 0.030973717965272936, "grad_norm": 0.6191805601119995, "learning_rate": 9.995178553363392e-06, "loss": 2.4344, "mean_token_accuracy": 0.47961972700431943, "num_tokens": 260955219.0, "step": 1799 }, { "entropy": 2.4512939453125, "epoch": 0.030990935151468196, "grad_norm": 0.5580746531486511, "learning_rate": 9.995166463914432e-06, "loss": 2.3881, "mean_token_accuracy": 0.4819506905041635, "num_tokens": 261107398.0, "step": 1800 }, { "entropy": 2.4354248046875, "epoch": 0.031008152337663457, "grad_norm": 0.5572928190231323, "learning_rate": 9.99515435933504e-06, "loss": 2.4291, "mean_token_accuracy": 0.47748745791614056, "num_tokens": 261258885.0, "step": 1801 }, { "entropy": 2.4228515625, "epoch": 0.031025369523858717, "grad_norm": 0.5744470953941345, "learning_rate": 9.99514223962525e-06, "loss": 2.4143, "mean_token_accuracy": 0.4853782169520855, "num_tokens": 261408577.0, "step": 1802 }, { "entropy": 2.5040283203125, "epoch": 0.031042586710053974, "grad_norm": 0.5782204866409302, "learning_rate": 9.995130104785103e-06, "loss": 2.4765, "mean_token_accuracy": 0.47027752734720707, "num_tokens": 261547340.0, "step": 1803 }, { "entropy": 2.4500732421875, "epoch": 0.031059803896249235, "grad_norm": 0.6056774258613586, "learning_rate": 9.995117954814632e-06, "loss": 2.4059, "mean_token_accuracy": 0.47932664630934596, "num_tokens": 261680759.0, "step": 1804 }, { "entropy": 2.4530029296875, "epoch": 0.031077021082444495, "grad_norm": 0.5747044682502747, "learning_rate": 9.995105789713874e-06, "loss": 2.407, "mean_token_accuracy": 0.4791347337886691, "num_tokens": 261839318.0, "step": 1805 }, { "entropy": 2.420654296875, "epoch": 0.031094238268639756, "grad_norm": 0.5714244246482849, "learning_rate": 9.995093609482868e-06, "loss": 2.3137, "mean_token_accuracy": 0.4866795940324664, "num_tokens": 261983899.0, "step": 1806 }, { "entropy": 2.45361328125, "epoch": 0.031111455454835017, "grad_norm": 0.6132124066352844, "learning_rate": 9.995081414121652e-06, "loss": 2.3875, "mean_token_accuracy": 0.4830195610411465, "num_tokens": 262123814.0, "step": 1807 }, { "entropy": 2.4935302734375, "epoch": 0.031128672641030277, "grad_norm": 0.5878351330757141, "learning_rate": 9.995069203630258e-06, "loss": 2.4513, "mean_token_accuracy": 0.4727471759542823, "num_tokens": 262264324.0, "step": 1808 }, { "entropy": 2.50146484375, "epoch": 0.031145889827225538, "grad_norm": 0.8043968081474304, "learning_rate": 9.995056978008728e-06, "loss": 2.4545, "mean_token_accuracy": 0.47300353879109025, "num_tokens": 262404423.0, "step": 1809 }, { "entropy": 2.419677734375, "epoch": 0.031163107013420798, "grad_norm": 0.5692616105079651, "learning_rate": 9.995044737257097e-06, "loss": 2.3788, "mean_token_accuracy": 0.4819628787226975, "num_tokens": 262554058.0, "step": 1810 }, { "entropy": 2.4549560546875, "epoch": 0.031180324199616055, "grad_norm": 0.6038571000099182, "learning_rate": 9.995032481375401e-06, "loss": 2.4808, "mean_token_accuracy": 0.4799831509590149, "num_tokens": 262687962.0, "step": 1811 }, { "entropy": 2.389404296875, "epoch": 0.031197541385811316, "grad_norm": 0.6061134338378906, "learning_rate": 9.995020210363678e-06, "loss": 2.3422, "mean_token_accuracy": 0.49405871424824, "num_tokens": 262821281.0, "step": 1812 }, { "entropy": 2.488037109375, "epoch": 0.031214758572006576, "grad_norm": 0.5849748849868774, "learning_rate": 9.995007924221966e-06, "loss": 2.4825, "mean_token_accuracy": 0.4694899981841445, "num_tokens": 262963950.0, "step": 1813 }, { "entropy": 2.513671875, "epoch": 0.031231975758201837, "grad_norm": 0.5811004638671875, "learning_rate": 9.994995622950301e-06, "loss": 2.4641, "mean_token_accuracy": 0.4680064986459911, "num_tokens": 263089602.0, "step": 1814 }, { "entropy": 2.4034423828125, "epoch": 0.031249192944397097, "grad_norm": 0.5289223790168762, "learning_rate": 9.994983306548722e-06, "loss": 2.3548, "mean_token_accuracy": 0.48990146769210696, "num_tokens": 263262419.0, "step": 1815 }, { "entropy": 2.536865234375, "epoch": 0.03126641013059236, "grad_norm": 0.5391739010810852, "learning_rate": 9.994970975017264e-06, "loss": 2.5102, "mean_token_accuracy": 0.46705460641533136, "num_tokens": 263415410.0, "step": 1816 }, { "entropy": 2.4627685546875, "epoch": 0.03128362731678762, "grad_norm": 0.5562330484390259, "learning_rate": 9.994958628355964e-06, "loss": 2.4007, "mean_token_accuracy": 0.4822265561670065, "num_tokens": 263560708.0, "step": 1817 }, { "entropy": 2.5174560546875, "epoch": 0.03130084450298288, "grad_norm": 0.589061975479126, "learning_rate": 9.994946266564862e-06, "loss": 2.4517, "mean_token_accuracy": 0.4671993814408779, "num_tokens": 263706338.0, "step": 1818 }, { "entropy": 2.431396484375, "epoch": 0.03131806168917814, "grad_norm": 0.5534610152244568, "learning_rate": 9.994933889643995e-06, "loss": 2.4091, "mean_token_accuracy": 0.4794508134946227, "num_tokens": 263846694.0, "step": 1819 }, { "entropy": 2.5390625, "epoch": 0.0313352788753734, "grad_norm": 0.691927969455719, "learning_rate": 9.9949214975934e-06, "loss": 2.5067, "mean_token_accuracy": 0.46535237692296505, "num_tokens": 263993842.0, "step": 1820 }, { "entropy": 2.4317626953125, "epoch": 0.03135249606156866, "grad_norm": 0.5267771482467651, "learning_rate": 9.994909090413113e-06, "loss": 2.3885, "mean_token_accuracy": 0.48383715003728867, "num_tokens": 264149235.0, "step": 1821 }, { "entropy": 2.4241943359375, "epoch": 0.03136971324776392, "grad_norm": 0.5582144856452942, "learning_rate": 9.994896668103174e-06, "loss": 2.4032, "mean_token_accuracy": 0.48607371421530843, "num_tokens": 264305264.0, "step": 1822 }, { "entropy": 2.4718017578125, "epoch": 0.03138693043395918, "grad_norm": 0.5681629180908203, "learning_rate": 9.994884230663619e-06, "loss": 2.4183, "mean_token_accuracy": 0.47495387122035027, "num_tokens": 264445168.0, "step": 1823 }, { "entropy": 2.498779296875, "epoch": 0.031404147620154435, "grad_norm": 0.5814370512962341, "learning_rate": 9.994871778094486e-06, "loss": 2.4998, "mean_token_accuracy": 0.475694193970412, "num_tokens": 264584998.0, "step": 1824 }, { "entropy": 2.4818115234375, "epoch": 0.031421364806349696, "grad_norm": 0.5599236488342285, "learning_rate": 9.994859310395812e-06, "loss": 2.4159, "mean_token_accuracy": 0.47769313864409924, "num_tokens": 264735369.0, "step": 1825 }, { "entropy": 2.4952392578125, "epoch": 0.031438581992544956, "grad_norm": 0.5620182156562805, "learning_rate": 9.994846827567638e-06, "loss": 2.481, "mean_token_accuracy": 0.4710625964216888, "num_tokens": 264876244.0, "step": 1826 }, { "entropy": 2.4227294921875, "epoch": 0.03145579917874022, "grad_norm": 0.5782105326652527, "learning_rate": 9.994834329609997e-06, "loss": 2.3754, "mean_token_accuracy": 0.4855298362672329, "num_tokens": 265015068.0, "step": 1827 }, { "entropy": 2.49755859375, "epoch": 0.03147301636493548, "grad_norm": 0.578761637210846, "learning_rate": 9.994821816522931e-06, "loss": 2.4221, "mean_token_accuracy": 0.47399685718119144, "num_tokens": 265142077.0, "step": 1828 }, { "entropy": 2.4862060546875, "epoch": 0.03149023355113074, "grad_norm": 0.5534428954124451, "learning_rate": 9.994809288306475e-06, "loss": 2.4853, "mean_token_accuracy": 0.47513916390016675, "num_tokens": 265286452.0, "step": 1829 }, { "entropy": 2.52734375, "epoch": 0.031507450737326, "grad_norm": 0.5408220887184143, "learning_rate": 9.99479674496067e-06, "loss": 2.4523, "mean_token_accuracy": 0.47395798983052373, "num_tokens": 265441215.0, "step": 1830 }, { "entropy": 2.4600830078125, "epoch": 0.03152466792352126, "grad_norm": 0.5792752504348755, "learning_rate": 9.994784186485551e-06, "loss": 2.4759, "mean_token_accuracy": 0.47542623011395335, "num_tokens": 265586593.0, "step": 1831 }, { "entropy": 2.4188232421875, "epoch": 0.03154188510971652, "grad_norm": 0.5839810371398926, "learning_rate": 9.994771612881157e-06, "loss": 2.3782, "mean_token_accuracy": 0.48431128449738026, "num_tokens": 265725310.0, "step": 1832 }, { "entropy": 2.454345703125, "epoch": 0.03155910229591178, "grad_norm": 0.5647464394569397, "learning_rate": 9.994759024147526e-06, "loss": 2.4133, "mean_token_accuracy": 0.47814659122377634, "num_tokens": 265858415.0, "step": 1833 }, { "entropy": 2.45654296875, "epoch": 0.03157631948210704, "grad_norm": 0.5568596124649048, "learning_rate": 9.994746420284698e-06, "loss": 2.4259, "mean_token_accuracy": 0.4771571112796664, "num_tokens": 266011950.0, "step": 1834 }, { "entropy": 2.413330078125, "epoch": 0.0315935366683023, "grad_norm": 0.5861586332321167, "learning_rate": 9.994733801292709e-06, "loss": 2.3792, "mean_token_accuracy": 0.4801113181747496, "num_tokens": 266157685.0, "step": 1835 }, { "entropy": 2.4085693359375, "epoch": 0.03161075385449756, "grad_norm": 0.5990592837333679, "learning_rate": 9.994721167171597e-06, "loss": 2.3922, "mean_token_accuracy": 0.48092158418148756, "num_tokens": 266293266.0, "step": 1836 }, { "entropy": 2.432373046875, "epoch": 0.03162797104069282, "grad_norm": 0.5652367472648621, "learning_rate": 9.9947085179214e-06, "loss": 2.3796, "mean_token_accuracy": 0.48524676635861397, "num_tokens": 266448458.0, "step": 1837 }, { "entropy": 2.5150146484375, "epoch": 0.03164518822688808, "grad_norm": 0.581610918045044, "learning_rate": 9.99469585354216e-06, "loss": 2.4421, "mean_token_accuracy": 0.4794493457302451, "num_tokens": 266588403.0, "step": 1838 }, { "entropy": 2.479248046875, "epoch": 0.03166240541308334, "grad_norm": 0.5524604320526123, "learning_rate": 9.994683174033913e-06, "loss": 2.409, "mean_token_accuracy": 0.4818028402514756, "num_tokens": 266748122.0, "step": 1839 }, { "entropy": 2.4130859375, "epoch": 0.031679622599278597, "grad_norm": 0.5744349360466003, "learning_rate": 9.994670479396695e-06, "loss": 2.3541, "mean_token_accuracy": 0.4899553945288062, "num_tokens": 266889574.0, "step": 1840 }, { "entropy": 2.50341796875, "epoch": 0.03169683978547386, "grad_norm": 0.5297801494598389, "learning_rate": 9.994657769630546e-06, "loss": 2.4784, "mean_token_accuracy": 0.4729268723167479, "num_tokens": 267038916.0, "step": 1841 }, { "entropy": 2.42041015625, "epoch": 0.03171405697166912, "grad_norm": 0.570783257484436, "learning_rate": 9.994645044735507e-06, "loss": 2.377, "mean_token_accuracy": 0.4859691094607115, "num_tokens": 267183760.0, "step": 1842 }, { "entropy": 2.4842529296875, "epoch": 0.03173127415786438, "grad_norm": 0.5840896368026733, "learning_rate": 9.994632304711616e-06, "loss": 2.4273, "mean_token_accuracy": 0.4811637466773391, "num_tokens": 267324455.0, "step": 1843 }, { "entropy": 2.4864501953125, "epoch": 0.03174849134405964, "grad_norm": 0.5744589567184448, "learning_rate": 9.994619549558908e-06, "loss": 2.4713, "mean_token_accuracy": 0.4727841066196561, "num_tokens": 267472269.0, "step": 1844 }, { "entropy": 2.46923828125, "epoch": 0.0317657085302549, "grad_norm": 0.5857048630714417, "learning_rate": 9.994606779277425e-06, "loss": 2.4301, "mean_token_accuracy": 0.48071943665854633, "num_tokens": 267624717.0, "step": 1845 }, { "entropy": 2.420654296875, "epoch": 0.03178292571645016, "grad_norm": 0.5835974812507629, "learning_rate": 9.994593993867203e-06, "loss": 2.3782, "mean_token_accuracy": 0.4824722218327224, "num_tokens": 267762034.0, "step": 1846 }, { "entropy": 2.503662109375, "epoch": 0.03180014290264542, "grad_norm": 0.5819142460823059, "learning_rate": 9.994581193328283e-06, "loss": 2.5042, "mean_token_accuracy": 0.47309408662840724, "num_tokens": 267897855.0, "step": 1847 }, { "entropy": 2.4564208984375, "epoch": 0.03181736008884068, "grad_norm": 0.6381955742835999, "learning_rate": 9.994568377660703e-06, "loss": 2.4308, "mean_token_accuracy": 0.48583648912608624, "num_tokens": 268025165.0, "step": 1848 }, { "entropy": 2.439453125, "epoch": 0.03183457727503594, "grad_norm": 0.5733355283737183, "learning_rate": 9.994555546864504e-06, "loss": 2.418, "mean_token_accuracy": 0.48078050184994936, "num_tokens": 268172403.0, "step": 1849 }, { "entropy": 2.3912353515625, "epoch": 0.0318517944612312, "grad_norm": 0.5635650157928467, "learning_rate": 9.99454270093972e-06, "loss": 2.3733, "mean_token_accuracy": 0.4842658434063196, "num_tokens": 268337839.0, "step": 1850 }, { "entropy": 2.4051513671875, "epoch": 0.03186901164742646, "grad_norm": 0.5738821625709534, "learning_rate": 9.994529839886395e-06, "loss": 2.332, "mean_token_accuracy": 0.49162986455485225, "num_tokens": 268490293.0, "step": 1851 }, { "entropy": 2.4752197265625, "epoch": 0.03188622883362172, "grad_norm": 0.5878849029541016, "learning_rate": 9.994516963704564e-06, "loss": 2.4238, "mean_token_accuracy": 0.47905471734702587, "num_tokens": 268627952.0, "step": 1852 }, { "entropy": 2.494384765625, "epoch": 0.03190344601981698, "grad_norm": 0.5611072182655334, "learning_rate": 9.994504072394268e-06, "loss": 2.4512, "mean_token_accuracy": 0.4697739346884191, "num_tokens": 268771756.0, "step": 1853 }, { "entropy": 2.4912109375, "epoch": 0.031920663206012244, "grad_norm": 0.5786696672439575, "learning_rate": 9.994491165955546e-06, "loss": 2.4426, "mean_token_accuracy": 0.47656662855297327, "num_tokens": 268906129.0, "step": 1854 }, { "entropy": 2.48193359375, "epoch": 0.031937880392207504, "grad_norm": 0.5850675106048584, "learning_rate": 9.994478244388437e-06, "loss": 2.4469, "mean_token_accuracy": 0.47550681745633483, "num_tokens": 269056386.0, "step": 1855 }, { "entropy": 2.499755859375, "epoch": 0.031955097578402765, "grad_norm": 0.5520582795143127, "learning_rate": 9.994465307692978e-06, "loss": 2.4417, "mean_token_accuracy": 0.47765777353197336, "num_tokens": 269195599.0, "step": 1856 }, { "entropy": 2.4632568359375, "epoch": 0.03197231476459802, "grad_norm": 0.5770227313041687, "learning_rate": 9.994452355869211e-06, "loss": 2.4842, "mean_token_accuracy": 0.4763926393352449, "num_tokens": 269336161.0, "step": 1857 }, { "entropy": 2.4757080078125, "epoch": 0.03198953195079328, "grad_norm": 0.6584348678588867, "learning_rate": 9.994439388917177e-06, "loss": 2.3984, "mean_token_accuracy": 0.4819053406827152, "num_tokens": 269461648.0, "step": 1858 }, { "entropy": 2.427001953125, "epoch": 0.03200674913698854, "grad_norm": 0.5825558304786682, "learning_rate": 9.99442640683691e-06, "loss": 2.3939, "mean_token_accuracy": 0.4860147815197706, "num_tokens": 269614043.0, "step": 1859 }, { "entropy": 2.5079345703125, "epoch": 0.0320239663231838, "grad_norm": 0.5787304043769836, "learning_rate": 9.994413409628451e-06, "loss": 2.526, "mean_token_accuracy": 0.4718983927741647, "num_tokens": 269763008.0, "step": 1860 }, { "entropy": 2.4483642578125, "epoch": 0.03204118350937906, "grad_norm": 0.5473356246948242, "learning_rate": 9.994400397291843e-06, "loss": 2.4192, "mean_token_accuracy": 0.4761442271992564, "num_tokens": 269916961.0, "step": 1861 }, { "entropy": 2.347412109375, "epoch": 0.03205840069557432, "grad_norm": 0.5818413496017456, "learning_rate": 9.99438736982712e-06, "loss": 2.3176, "mean_token_accuracy": 0.49792582634836435, "num_tokens": 270053323.0, "step": 1862 }, { "entropy": 2.522216796875, "epoch": 0.03207561788176958, "grad_norm": 0.5837598443031311, "learning_rate": 9.994374327234326e-06, "loss": 2.5254, "mean_token_accuracy": 0.4689773670397699, "num_tokens": 270183004.0, "step": 1863 }, { "entropy": 2.424560546875, "epoch": 0.03209283506796484, "grad_norm": 0.5816727876663208, "learning_rate": 9.994361269513499e-06, "loss": 2.3983, "mean_token_accuracy": 0.4842468095012009, "num_tokens": 270337766.0, "step": 1864 }, { "entropy": 2.4515380859375, "epoch": 0.0321100522541601, "grad_norm": 0.5595372319221497, "learning_rate": 9.994348196664679e-06, "loss": 2.4535, "mean_token_accuracy": 0.4758255537599325, "num_tokens": 270501411.0, "step": 1865 }, { "entropy": 2.4014892578125, "epoch": 0.03212726944035536, "grad_norm": 0.5381937623023987, "learning_rate": 9.994335108687903e-06, "loss": 2.3243, "mean_token_accuracy": 0.4866001163609326, "num_tokens": 270652865.0, "step": 1866 }, { "entropy": 2.427734375, "epoch": 0.032144486626550624, "grad_norm": 0.5453839302062988, "learning_rate": 9.994322005583213e-06, "loss": 2.3798, "mean_token_accuracy": 0.4782120902091265, "num_tokens": 270798390.0, "step": 1867 }, { "entropy": 2.5419921875, "epoch": 0.032161703812745884, "grad_norm": 0.6079609394073486, "learning_rate": 9.994308887350647e-06, "loss": 2.5727, "mean_token_accuracy": 0.46313366387039423, "num_tokens": 270943300.0, "step": 1868 }, { "entropy": 2.486083984375, "epoch": 0.032178920998941145, "grad_norm": 0.597481906414032, "learning_rate": 9.994295753990247e-06, "loss": 2.4698, "mean_token_accuracy": 0.46938649471849203, "num_tokens": 271077719.0, "step": 1869 }, { "entropy": 2.40771484375, "epoch": 0.032196138185136405, "grad_norm": 1.105528473854065, "learning_rate": 9.99428260550205e-06, "loss": 2.3737, "mean_token_accuracy": 0.4825026970356703, "num_tokens": 271237964.0, "step": 1870 }, { "entropy": 2.4666748046875, "epoch": 0.032213355371331666, "grad_norm": 0.5856791138648987, "learning_rate": 9.9942694418861e-06, "loss": 2.4256, "mean_token_accuracy": 0.47253927774727345, "num_tokens": 271380338.0, "step": 1871 }, { "entropy": 2.3770751953125, "epoch": 0.032230572557526926, "grad_norm": 0.5685958862304688, "learning_rate": 9.994256263142433e-06, "loss": 2.3337, "mean_token_accuracy": 0.4951976570300758, "num_tokens": 271518289.0, "step": 1872 }, { "entropy": 2.5079345703125, "epoch": 0.03224778974372219, "grad_norm": 0.5702466368675232, "learning_rate": 9.99424306927109e-06, "loss": 2.4513, "mean_token_accuracy": 0.4732716833241284, "num_tokens": 271660591.0, "step": 1873 }, { "entropy": 2.501953125, "epoch": 0.03226500692991744, "grad_norm": 0.5536970496177673, "learning_rate": 9.994229860272114e-06, "loss": 2.4598, "mean_token_accuracy": 0.47078709537163377, "num_tokens": 271815738.0, "step": 1874 }, { "entropy": 2.433349609375, "epoch": 0.0322822241161127, "grad_norm": 0.5575986504554749, "learning_rate": 9.99421663614554e-06, "loss": 2.3716, "mean_token_accuracy": 0.4846403314732015, "num_tokens": 271964724.0, "step": 1875 }, { "entropy": 2.439208984375, "epoch": 0.03229944130230796, "grad_norm": 0.594307541847229, "learning_rate": 9.99420339689141e-06, "loss": 2.3674, "mean_token_accuracy": 0.48272377625107765, "num_tokens": 272103280.0, "step": 1876 }, { "entropy": 2.382080078125, "epoch": 0.03231665848850322, "grad_norm": 0.5934799909591675, "learning_rate": 9.994190142509766e-06, "loss": 2.3689, "mean_token_accuracy": 0.4859785996377468, "num_tokens": 272237744.0, "step": 1877 }, { "entropy": 2.453369140625, "epoch": 0.03233387567469848, "grad_norm": 0.5809702277183533, "learning_rate": 9.994176873000646e-06, "loss": 2.4275, "mean_token_accuracy": 0.4772698413580656, "num_tokens": 272384323.0, "step": 1878 }, { "entropy": 2.457275390625, "epoch": 0.03235109286089374, "grad_norm": 0.5598036646842957, "learning_rate": 9.99416358836409e-06, "loss": 2.4166, "mean_token_accuracy": 0.47385495621711016, "num_tokens": 272530813.0, "step": 1879 }, { "entropy": 2.51318359375, "epoch": 0.032368310047089004, "grad_norm": 0.567221462726593, "learning_rate": 9.994150288600139e-06, "loss": 2.5107, "mean_token_accuracy": 0.4726970442570746, "num_tokens": 272679093.0, "step": 1880 }, { "entropy": 2.4515380859375, "epoch": 0.032385527233284264, "grad_norm": 0.6015093922615051, "learning_rate": 9.994136973708834e-06, "loss": 2.399, "mean_token_accuracy": 0.47881753370165825, "num_tokens": 272819409.0, "step": 1881 }, { "entropy": 2.424072265625, "epoch": 0.032402744419479525, "grad_norm": 0.5528237819671631, "learning_rate": 9.994123643690214e-06, "loss": 2.3889, "mean_token_accuracy": 0.4852414163760841, "num_tokens": 272964763.0, "step": 1882 }, { "entropy": 2.4576416015625, "epoch": 0.032419961605674785, "grad_norm": 0.6007446050643921, "learning_rate": 9.99411029854432e-06, "loss": 2.4264, "mean_token_accuracy": 0.4797611120156944, "num_tokens": 273090453.0, "step": 1883 }, { "entropy": 2.463623046875, "epoch": 0.032437178791870046, "grad_norm": 0.5631487369537354, "learning_rate": 9.994096938271193e-06, "loss": 2.4681, "mean_token_accuracy": 0.4756158501841128, "num_tokens": 273239046.0, "step": 1884 }, { "entropy": 2.4407958984375, "epoch": 0.032454395978065306, "grad_norm": 0.5506601929664612, "learning_rate": 9.994083562870873e-06, "loss": 2.3948, "mean_token_accuracy": 0.47984826657921076, "num_tokens": 273396483.0, "step": 1885 }, { "entropy": 2.4609375, "epoch": 0.03247161316426057, "grad_norm": 0.5861914157867432, "learning_rate": 9.9940701723434e-06, "loss": 2.4491, "mean_token_accuracy": 0.47965921740978956, "num_tokens": 273539136.0, "step": 1886 }, { "entropy": 2.4434814453125, "epoch": 0.03248883035045583, "grad_norm": 0.5634111166000366, "learning_rate": 9.994056766688815e-06, "loss": 2.4001, "mean_token_accuracy": 0.4822332635521889, "num_tokens": 273683327.0, "step": 1887 }, { "entropy": 2.4342041015625, "epoch": 0.03250604753665109, "grad_norm": 0.6063517332077026, "learning_rate": 9.994043345907158e-06, "loss": 2.3837, "mean_token_accuracy": 0.49188648257404566, "num_tokens": 273812912.0, "step": 1888 }, { "entropy": 2.454345703125, "epoch": 0.03252326472284635, "grad_norm": 0.6061086058616638, "learning_rate": 9.99402990999847e-06, "loss": 2.3902, "mean_token_accuracy": 0.4804657520726323, "num_tokens": 273953195.0, "step": 1889 }, { "entropy": 2.3865966796875, "epoch": 0.0325404819090416, "grad_norm": 0.5631823539733887, "learning_rate": 9.994016458962795e-06, "loss": 2.3116, "mean_token_accuracy": 0.4911833154037595, "num_tokens": 274105327.0, "step": 1890 }, { "entropy": 2.44384765625, "epoch": 0.03255769909523686, "grad_norm": 0.5700172185897827, "learning_rate": 9.994002992800167e-06, "loss": 2.4395, "mean_token_accuracy": 0.4804955665022135, "num_tokens": 274245805.0, "step": 1891 }, { "entropy": 2.508056640625, "epoch": 0.03257491628143212, "grad_norm": 0.5785242915153503, "learning_rate": 9.993989511510633e-06, "loss": 2.4574, "mean_token_accuracy": 0.47288175392895937, "num_tokens": 274390327.0, "step": 1892 }, { "entropy": 2.5166015625, "epoch": 0.032592133467627384, "grad_norm": 0.5731043815612793, "learning_rate": 9.99397601509423e-06, "loss": 2.4638, "mean_token_accuracy": 0.47718239948153496, "num_tokens": 274536899.0, "step": 1893 }, { "entropy": 2.3447265625, "epoch": 0.032609350653822644, "grad_norm": 0.6108200550079346, "learning_rate": 9.993962503551e-06, "loss": 2.2998, "mean_token_accuracy": 0.504625148139894, "num_tokens": 274690625.0, "step": 1894 }, { "entropy": 2.4359130859375, "epoch": 0.032626567840017905, "grad_norm": 0.5686972141265869, "learning_rate": 9.993948976880985e-06, "loss": 2.3772, "mean_token_accuracy": 0.4784249165095389, "num_tokens": 274857384.0, "step": 1895 }, { "entropy": 2.465576171875, "epoch": 0.032643785026213165, "grad_norm": 0.5697457194328308, "learning_rate": 9.993935435084225e-06, "loss": 2.3909, "mean_token_accuracy": 0.483038027305156, "num_tokens": 275007281.0, "step": 1896 }, { "entropy": 2.4967041015625, "epoch": 0.032661002212408426, "grad_norm": 0.5822637677192688, "learning_rate": 9.99392187816076e-06, "loss": 2.4637, "mean_token_accuracy": 0.46831735828891397, "num_tokens": 275145804.0, "step": 1897 }, { "entropy": 2.4696044921875, "epoch": 0.032678219398603686, "grad_norm": 0.5560740828514099, "learning_rate": 9.993908306110632e-06, "loss": 2.4337, "mean_token_accuracy": 0.4820185494609177, "num_tokens": 275303743.0, "step": 1898 }, { "entropy": 2.4893798828125, "epoch": 0.03269543658479895, "grad_norm": 0.5658345222473145, "learning_rate": 9.993894718933882e-06, "loss": 2.4622, "mean_token_accuracy": 0.4734317073598504, "num_tokens": 275455938.0, "step": 1899 }, { "entropy": 2.367431640625, "epoch": 0.03271265377099421, "grad_norm": 0.5981510281562805, "learning_rate": 9.993881116630556e-06, "loss": 2.2978, "mean_token_accuracy": 0.495200231205672, "num_tokens": 275607789.0, "step": 1900 }, { "entropy": 2.46435546875, "epoch": 0.03272987095718947, "grad_norm": 0.5610291361808777, "learning_rate": 9.993867499200684e-06, "loss": 2.4479, "mean_token_accuracy": 0.4744836580939591, "num_tokens": 275761693.0, "step": 1901 }, { "entropy": 2.5068359375, "epoch": 0.03274708814338473, "grad_norm": 0.6446607708930969, "learning_rate": 9.993853866644319e-06, "loss": 2.4878, "mean_token_accuracy": 0.46817948622629046, "num_tokens": 275887793.0, "step": 1902 }, { "entropy": 2.503662109375, "epoch": 0.03276430532957999, "grad_norm": 0.6273615956306458, "learning_rate": 9.993840218961495e-06, "loss": 2.4879, "mean_token_accuracy": 0.4687243062071502, "num_tokens": 276018500.0, "step": 1903 }, { "entropy": 2.498779296875, "epoch": 0.03278152251577525, "grad_norm": 0.5124494433403015, "learning_rate": 9.993826556152255e-06, "loss": 2.4705, "mean_token_accuracy": 0.4730762280523777, "num_tokens": 276183672.0, "step": 1904 }, { "entropy": 2.518798828125, "epoch": 0.03279873970197051, "grad_norm": 0.5735337138175964, "learning_rate": 9.993812878216642e-06, "loss": 2.4894, "mean_token_accuracy": 0.47411146527156234, "num_tokens": 276336067.0, "step": 1905 }, { "entropy": 2.498291015625, "epoch": 0.03281595688816577, "grad_norm": 0.6396939158439636, "learning_rate": 9.993799185154695e-06, "loss": 2.4308, "mean_token_accuracy": 0.4775810223072767, "num_tokens": 276454633.0, "step": 1906 }, { "entropy": 2.43798828125, "epoch": 0.032833174074361024, "grad_norm": 0.5700352191925049, "learning_rate": 9.993785476966458e-06, "loss": 2.4091, "mean_token_accuracy": 0.47954965522512794, "num_tokens": 276592071.0, "step": 1907 }, { "entropy": 2.555419921875, "epoch": 0.032850391260556285, "grad_norm": 0.5614842772483826, "learning_rate": 9.993771753651971e-06, "loss": 2.5613, "mean_token_accuracy": 0.4670336083509028, "num_tokens": 276746077.0, "step": 1908 }, { "entropy": 2.423095703125, "epoch": 0.032867608446751545, "grad_norm": 0.5809895396232605, "learning_rate": 9.993758015211276e-06, "loss": 2.3669, "mean_token_accuracy": 0.48658125940710306, "num_tokens": 276900397.0, "step": 1909 }, { "entropy": 2.384765625, "epoch": 0.032884825632946806, "grad_norm": 0.5184022784233093, "learning_rate": 9.993744261644414e-06, "loss": 2.3631, "mean_token_accuracy": 0.4841692647896707, "num_tokens": 277080897.0, "step": 1910 }, { "entropy": 2.4111328125, "epoch": 0.032902042819142066, "grad_norm": 0.575218915939331, "learning_rate": 9.993730492951428e-06, "loss": 2.3794, "mean_token_accuracy": 0.48858750332146883, "num_tokens": 277229579.0, "step": 1911 }, { "entropy": 2.4375, "epoch": 0.03291926000533733, "grad_norm": 0.8869200944900513, "learning_rate": 9.993716709132359e-06, "loss": 2.3684, "mean_token_accuracy": 0.48189750453457236, "num_tokens": 277362179.0, "step": 1912 }, { "entropy": 2.4571533203125, "epoch": 0.03293647719153259, "grad_norm": 0.5700924396514893, "learning_rate": 9.993702910187247e-06, "loss": 2.3851, "mean_token_accuracy": 0.48076847614720464, "num_tokens": 277520460.0, "step": 1913 }, { "entropy": 2.51025390625, "epoch": 0.03295369437772785, "grad_norm": 0.6675810813903809, "learning_rate": 9.993689096116138e-06, "loss": 2.4804, "mean_token_accuracy": 0.47350738383829594, "num_tokens": 277667131.0, "step": 1914 }, { "entropy": 2.4278564453125, "epoch": 0.03297091156392311, "grad_norm": 0.6161773204803467, "learning_rate": 9.993675266919068e-06, "loss": 2.3634, "mean_token_accuracy": 0.48864840995520353, "num_tokens": 277798538.0, "step": 1915 }, { "entropy": 2.42724609375, "epoch": 0.03298812875011837, "grad_norm": 0.6031524538993835, "learning_rate": 9.993661422596083e-06, "loss": 2.3735, "mean_token_accuracy": 0.4872046741656959, "num_tokens": 277938299.0, "step": 1916 }, { "entropy": 2.459716796875, "epoch": 0.03300534593631363, "grad_norm": 0.5436295866966248, "learning_rate": 9.993647563147226e-06, "loss": 2.4323, "mean_token_accuracy": 0.47822323255240917, "num_tokens": 278080906.0, "step": 1917 }, { "entropy": 2.482177734375, "epoch": 0.03302256312250889, "grad_norm": 0.5724844336509705, "learning_rate": 9.993633688572536e-06, "loss": 2.4886, "mean_token_accuracy": 0.47133151395246387, "num_tokens": 278220379.0, "step": 1918 }, { "entropy": 2.4127197265625, "epoch": 0.03303978030870415, "grad_norm": 0.583770751953125, "learning_rate": 9.993619798872056e-06, "loss": 2.3904, "mean_token_accuracy": 0.48571639275178313, "num_tokens": 278372188.0, "step": 1919 }, { "entropy": 2.45849609375, "epoch": 0.03305699749489941, "grad_norm": 0.5579352378845215, "learning_rate": 9.993605894045828e-06, "loss": 2.4178, "mean_token_accuracy": 0.4752850723452866, "num_tokens": 278515572.0, "step": 1920 }, { "entropy": 2.46728515625, "epoch": 0.03307421468109467, "grad_norm": 0.5899266600608826, "learning_rate": 9.993591974093896e-06, "loss": 2.4453, "mean_token_accuracy": 0.4791210265830159, "num_tokens": 278659949.0, "step": 1921 }, { "entropy": 2.431640625, "epoch": 0.03309143186728993, "grad_norm": 0.5982487797737122, "learning_rate": 9.993578039016298e-06, "loss": 2.4021, "mean_token_accuracy": 0.4784347270615399, "num_tokens": 278784019.0, "step": 1922 }, { "entropy": 2.448486328125, "epoch": 0.03310864905348519, "grad_norm": 0.7279030680656433, "learning_rate": 9.993564088813078e-06, "loss": 2.4092, "mean_token_accuracy": 0.47610331838950515, "num_tokens": 278941353.0, "step": 1923 }, { "entropy": 2.5162353515625, "epoch": 0.033125866239680446, "grad_norm": 0.5260778665542603, "learning_rate": 9.993550123484282e-06, "loss": 2.4839, "mean_token_accuracy": 0.4682774804532528, "num_tokens": 279101143.0, "step": 1924 }, { "entropy": 2.448974609375, "epoch": 0.033143083425875706, "grad_norm": 0.5392525792121887, "learning_rate": 9.993536143029946e-06, "loss": 2.3976, "mean_token_accuracy": 0.479318049736321, "num_tokens": 279249944.0, "step": 1925 }, { "entropy": 2.4193115234375, "epoch": 0.03316030061207097, "grad_norm": 0.5925559401512146, "learning_rate": 9.993522147450118e-06, "loss": 2.3594, "mean_token_accuracy": 0.49047204852104187, "num_tokens": 279387621.0, "step": 1926 }, { "entropy": 2.51318359375, "epoch": 0.03317751779826623, "grad_norm": 0.6154163479804993, "learning_rate": 9.993508136744838e-06, "loss": 2.4331, "mean_token_accuracy": 0.47704603895545006, "num_tokens": 279525188.0, "step": 1927 }, { "entropy": 2.469482421875, "epoch": 0.03319473498446149, "grad_norm": 0.5946447253227234, "learning_rate": 9.993494110914149e-06, "loss": 2.4433, "mean_token_accuracy": 0.4768550405278802, "num_tokens": 279675297.0, "step": 1928 }, { "entropy": 2.4324951171875, "epoch": 0.03321195217065675, "grad_norm": 0.5541896820068359, "learning_rate": 9.99348006995809e-06, "loss": 2.4008, "mean_token_accuracy": 0.4792257468216121, "num_tokens": 279826165.0, "step": 1929 }, { "entropy": 2.430908203125, "epoch": 0.03322916935685201, "grad_norm": 0.5395227670669556, "learning_rate": 9.993466013876707e-06, "loss": 2.3978, "mean_token_accuracy": 0.4806228969246149, "num_tokens": 279980558.0, "step": 1930 }, { "entropy": 2.423828125, "epoch": 0.03324638654304727, "grad_norm": 0.5909799933433533, "learning_rate": 9.993451942670044e-06, "loss": 2.393, "mean_token_accuracy": 0.48585162637755275, "num_tokens": 280119585.0, "step": 1931 }, { "entropy": 2.512939453125, "epoch": 0.03326360372924253, "grad_norm": 0.6035844683647156, "learning_rate": 9.993437856338139e-06, "loss": 2.5345, "mean_token_accuracy": 0.4715218679048121, "num_tokens": 280244876.0, "step": 1932 }, { "entropy": 2.4185791015625, "epoch": 0.03328082091543779, "grad_norm": 0.5301707983016968, "learning_rate": 9.993423754881039e-06, "loss": 2.4086, "mean_token_accuracy": 0.48241995088756084, "num_tokens": 280401852.0, "step": 1933 }, { "entropy": 2.4444580078125, "epoch": 0.03329803810163305, "grad_norm": 0.5419909358024597, "learning_rate": 9.993409638298785e-06, "loss": 2.4023, "mean_token_accuracy": 0.4783248957246542, "num_tokens": 280561735.0, "step": 1934 }, { "entropy": 2.4560546875, "epoch": 0.03331525528782831, "grad_norm": 0.5301235914230347, "learning_rate": 9.99339550659142e-06, "loss": 2.4201, "mean_token_accuracy": 0.4791613514535129, "num_tokens": 280719202.0, "step": 1935 }, { "entropy": 2.3726806640625, "epoch": 0.03333247247402357, "grad_norm": 0.5838637948036194, "learning_rate": 9.993381359758985e-06, "loss": 2.347, "mean_token_accuracy": 0.4987304233945906, "num_tokens": 280866306.0, "step": 1936 }, { "entropy": 2.45068359375, "epoch": 0.03334968966021883, "grad_norm": 0.5787121057510376, "learning_rate": 9.993367197801527e-06, "loss": 2.3823, "mean_token_accuracy": 0.47824144922196865, "num_tokens": 280996610.0, "step": 1937 }, { "entropy": 2.5333251953125, "epoch": 0.03336690684641409, "grad_norm": 0.5946321487426758, "learning_rate": 9.993353020719083e-06, "loss": 2.5036, "mean_token_accuracy": 0.4754126025363803, "num_tokens": 281133369.0, "step": 1938 }, { "entropy": 2.444580078125, "epoch": 0.033384124032609354, "grad_norm": 0.5681336522102356, "learning_rate": 9.993338828511701e-06, "loss": 2.419, "mean_token_accuracy": 0.480977026745677, "num_tokens": 281293453.0, "step": 1939 }, { "entropy": 2.469970703125, "epoch": 0.03340134121880461, "grad_norm": 0.5857387185096741, "learning_rate": 9.993324621179424e-06, "loss": 2.446, "mean_token_accuracy": 0.48483802145347, "num_tokens": 281434754.0, "step": 1940 }, { "entropy": 2.4569091796875, "epoch": 0.03341855840499987, "grad_norm": 0.6091550588607788, "learning_rate": 9.99331039872229e-06, "loss": 2.4002, "mean_token_accuracy": 0.4800125630572438, "num_tokens": 281571983.0, "step": 1941 }, { "entropy": 2.4578857421875, "epoch": 0.03343577559119513, "grad_norm": 0.6192805171012878, "learning_rate": 9.993296161140346e-06, "loss": 2.3834, "mean_token_accuracy": 0.4805147326551378, "num_tokens": 281716426.0, "step": 1942 }, { "entropy": 2.5145263671875, "epoch": 0.03345299277739039, "grad_norm": 0.5912413001060486, "learning_rate": 9.993281908433637e-06, "loss": 2.4856, "mean_token_accuracy": 0.47100443998351693, "num_tokens": 281859208.0, "step": 1943 }, { "entropy": 2.4542236328125, "epoch": 0.03347020996358565, "grad_norm": 0.5443603992462158, "learning_rate": 9.9932676406022e-06, "loss": 2.4321, "mean_token_accuracy": 0.4776157597079873, "num_tokens": 282011533.0, "step": 1944 }, { "entropy": 2.4461669921875, "epoch": 0.03348742714978091, "grad_norm": 0.5912041068077087, "learning_rate": 9.993253357646085e-06, "loss": 2.4245, "mean_token_accuracy": 0.4797707684338093, "num_tokens": 282142801.0, "step": 1945 }, { "entropy": 2.4661865234375, "epoch": 0.03350464433597617, "grad_norm": 0.5891805291175842, "learning_rate": 9.99323905956533e-06, "loss": 2.408, "mean_token_accuracy": 0.4835380604490638, "num_tokens": 282282290.0, "step": 1946 }, { "entropy": 2.505859375, "epoch": 0.03352186152217143, "grad_norm": 0.6140239238739014, "learning_rate": 9.993224746359981e-06, "loss": 2.4778, "mean_token_accuracy": 0.47382217831909657, "num_tokens": 282421711.0, "step": 1947 }, { "entropy": 2.464111328125, "epoch": 0.03353907870836669, "grad_norm": 0.5682819485664368, "learning_rate": 9.993210418030082e-06, "loss": 2.4156, "mean_token_accuracy": 0.47630415530875325, "num_tokens": 282576394.0, "step": 1948 }, { "entropy": 2.3934326171875, "epoch": 0.03355629589456195, "grad_norm": 0.5857619643211365, "learning_rate": 9.993196074575673e-06, "loss": 2.3721, "mean_token_accuracy": 0.49081736570224166, "num_tokens": 282729923.0, "step": 1949 }, { "entropy": 2.4425048828125, "epoch": 0.03357351308075721, "grad_norm": 0.6023845076560974, "learning_rate": 9.9931817159968e-06, "loss": 2.3967, "mean_token_accuracy": 0.4840724840760231, "num_tokens": 282862205.0, "step": 1950 }, { "entropy": 2.423095703125, "epoch": 0.03359073026695247, "grad_norm": 0.5758981108665466, "learning_rate": 9.993167342293508e-06, "loss": 2.3915, "mean_token_accuracy": 0.48772860085591674, "num_tokens": 282997486.0, "step": 1951 }, { "entropy": 2.5380859375, "epoch": 0.033607947453147734, "grad_norm": 0.550911545753479, "learning_rate": 9.993152953465837e-06, "loss": 2.5101, "mean_token_accuracy": 0.4724648226983845, "num_tokens": 283148960.0, "step": 1952 }, { "entropy": 2.501708984375, "epoch": 0.033625164639342994, "grad_norm": 0.5853079557418823, "learning_rate": 9.993138549513834e-06, "loss": 2.46, "mean_token_accuracy": 0.4766088160686195, "num_tokens": 283290181.0, "step": 1953 }, { "entropy": 2.412353515625, "epoch": 0.033642381825538255, "grad_norm": 0.5772018432617188, "learning_rate": 9.99312413043754e-06, "loss": 2.3818, "mean_token_accuracy": 0.4874812732450664, "num_tokens": 283446157.0, "step": 1954 }, { "entropy": 2.4298095703125, "epoch": 0.033659599011733515, "grad_norm": 0.5674806833267212, "learning_rate": 9.993109696236997e-06, "loss": 2.4043, "mean_token_accuracy": 0.48742856411263347, "num_tokens": 283583169.0, "step": 1955 }, { "entropy": 2.4627685546875, "epoch": 0.033676816197928776, "grad_norm": 0.5692137479782104, "learning_rate": 9.993095246912254e-06, "loss": 2.4209, "mean_token_accuracy": 0.4784468016587198, "num_tokens": 283726808.0, "step": 1956 }, { "entropy": 2.411865234375, "epoch": 0.03369403338412403, "grad_norm": 0.5745928287506104, "learning_rate": 9.99308078246335e-06, "loss": 2.3845, "mean_token_accuracy": 0.48880593106150627, "num_tokens": 283884790.0, "step": 1957 }, { "entropy": 2.4476318359375, "epoch": 0.03371125057031929, "grad_norm": 0.5757923722267151, "learning_rate": 9.993066302890335e-06, "loss": 2.4339, "mean_token_accuracy": 0.47860253555700183, "num_tokens": 284026807.0, "step": 1958 }, { "entropy": 2.4261474609375, "epoch": 0.03372846775651455, "grad_norm": 0.509971559047699, "learning_rate": 9.993051808193246e-06, "loss": 2.39, "mean_token_accuracy": 0.4785323585383594, "num_tokens": 284189392.0, "step": 1959 }, { "entropy": 2.4044189453125, "epoch": 0.03374568494270981, "grad_norm": 0.575952410697937, "learning_rate": 9.99303729837213e-06, "loss": 2.3911, "mean_token_accuracy": 0.48549577174708247, "num_tokens": 284324103.0, "step": 1960 }, { "entropy": 2.46044921875, "epoch": 0.03376290212890507, "grad_norm": 0.5873697996139526, "learning_rate": 9.993022773427028e-06, "loss": 2.419, "mean_token_accuracy": 0.4775766748934984, "num_tokens": 284466158.0, "step": 1961 }, { "entropy": 2.533935546875, "epoch": 0.03378011931510033, "grad_norm": 0.5654860734939575, "learning_rate": 9.99300823335799e-06, "loss": 2.4545, "mean_token_accuracy": 0.47519396571442485, "num_tokens": 284602979.0, "step": 1962 }, { "entropy": 2.472412109375, "epoch": 0.03379733650129559, "grad_norm": 0.7695214748382568, "learning_rate": 9.992993678165055e-06, "loss": 2.4068, "mean_token_accuracy": 0.4817899586632848, "num_tokens": 284740325.0, "step": 1963 }, { "entropy": 2.4033203125, "epoch": 0.03381455368749085, "grad_norm": 0.5517217516899109, "learning_rate": 9.99297910784827e-06, "loss": 2.3849, "mean_token_accuracy": 0.48449106933549047, "num_tokens": 284885765.0, "step": 1964 }, { "entropy": 2.4239501953125, "epoch": 0.033831770873686114, "grad_norm": 0.6454329490661621, "learning_rate": 9.992964522407676e-06, "loss": 2.4082, "mean_token_accuracy": 0.48670034017413855, "num_tokens": 285020125.0, "step": 1965 }, { "entropy": 2.4947509765625, "epoch": 0.033848988059881374, "grad_norm": 0.5675487518310547, "learning_rate": 9.99294992184332e-06, "loss": 2.4896, "mean_token_accuracy": 0.4730933913961053, "num_tokens": 285165664.0, "step": 1966 }, { "entropy": 2.4700927734375, "epoch": 0.033866205246076635, "grad_norm": 0.581511914730072, "learning_rate": 9.992935306155246e-06, "loss": 2.4402, "mean_token_accuracy": 0.4833594365045428, "num_tokens": 285304514.0, "step": 1967 }, { "entropy": 2.4307861328125, "epoch": 0.033883422432271895, "grad_norm": 0.5686525702476501, "learning_rate": 9.992920675343496e-06, "loss": 2.412, "mean_token_accuracy": 0.4830741249024868, "num_tokens": 285451178.0, "step": 1968 }, { "entropy": 2.44921875, "epoch": 0.033900639618467156, "grad_norm": 0.5576603412628174, "learning_rate": 9.992906029408115e-06, "loss": 2.3827, "mean_token_accuracy": 0.4801848717033863, "num_tokens": 285595105.0, "step": 1969 }, { "entropy": 2.415771484375, "epoch": 0.033917856804662416, "grad_norm": 0.5764795541763306, "learning_rate": 9.992891368349151e-06, "loss": 2.3845, "mean_token_accuracy": 0.48759686667472124, "num_tokens": 285726774.0, "step": 1970 }, { "entropy": 2.4775390625, "epoch": 0.03393507399085768, "grad_norm": 0.6182502508163452, "learning_rate": 9.992876692166644e-06, "loss": 2.491, "mean_token_accuracy": 0.4704209272749722, "num_tokens": 285846737.0, "step": 1971 }, { "entropy": 2.4888916015625, "epoch": 0.03395229117705294, "grad_norm": 0.5855472683906555, "learning_rate": 9.99286200086064e-06, "loss": 2.4541, "mean_token_accuracy": 0.4759668963961303, "num_tokens": 285986773.0, "step": 1972 }, { "entropy": 2.473388671875, "epoch": 0.03396950836324819, "grad_norm": 0.5262144804000854, "learning_rate": 9.992847294431186e-06, "loss": 2.473, "mean_token_accuracy": 0.4791461192071438, "num_tokens": 286147251.0, "step": 1973 }, { "entropy": 2.4501953125, "epoch": 0.03398672554944345, "grad_norm": 0.558925211429596, "learning_rate": 9.99283257287832e-06, "loss": 2.4154, "mean_token_accuracy": 0.47716324497014284, "num_tokens": 286310437.0, "step": 1974 }, { "entropy": 2.426025390625, "epoch": 0.03400394273563871, "grad_norm": 0.5795169472694397, "learning_rate": 9.992817836202093e-06, "loss": 2.3785, "mean_token_accuracy": 0.4854766642674804, "num_tokens": 286448769.0, "step": 1975 }, { "entropy": 2.52880859375, "epoch": 0.03402115992183397, "grad_norm": 0.5628653764724731, "learning_rate": 9.992803084402547e-06, "loss": 2.5193, "mean_token_accuracy": 0.4707297068089247, "num_tokens": 286588873.0, "step": 1976 }, { "entropy": 2.4481201171875, "epoch": 0.03403837710802923, "grad_norm": 0.5932325124740601, "learning_rate": 9.992788317479727e-06, "loss": 2.4434, "mean_token_accuracy": 0.47935432521626353, "num_tokens": 286733286.0, "step": 1977 }, { "entropy": 2.45361328125, "epoch": 0.034055594294224494, "grad_norm": 0.576995313167572, "learning_rate": 9.992773535433678e-06, "loss": 2.4079, "mean_token_accuracy": 0.47960167936980724, "num_tokens": 286875771.0, "step": 1978 }, { "entropy": 2.4423828125, "epoch": 0.034072811480419754, "grad_norm": 0.5512486100196838, "learning_rate": 9.992758738264442e-06, "loss": 2.4155, "mean_token_accuracy": 0.480304847471416, "num_tokens": 287031003.0, "step": 1979 }, { "entropy": 2.3990478515625, "epoch": 0.034090028666615015, "grad_norm": 0.5462455153465271, "learning_rate": 9.992743925972069e-06, "loss": 2.3775, "mean_token_accuracy": 0.48557318560779095, "num_tokens": 287180788.0, "step": 1980 }, { "entropy": 2.5538330078125, "epoch": 0.034107245852810275, "grad_norm": 0.5789005756378174, "learning_rate": 9.992729098556601e-06, "loss": 2.5184, "mean_token_accuracy": 0.4639392225071788, "num_tokens": 287318088.0, "step": 1981 }, { "entropy": 2.4896240234375, "epoch": 0.034124463039005536, "grad_norm": 0.66868656873703, "learning_rate": 9.992714256018082e-06, "loss": 2.4807, "mean_token_accuracy": 0.4695613798685372, "num_tokens": 287466895.0, "step": 1982 }, { "entropy": 2.4345703125, "epoch": 0.034141680225200796, "grad_norm": 0.5860902667045593, "learning_rate": 9.99269939835656e-06, "loss": 2.3898, "mean_token_accuracy": 0.4838330140337348, "num_tokens": 287612255.0, "step": 1983 }, { "entropy": 2.507080078125, "epoch": 0.03415889741139606, "grad_norm": 0.5566897988319397, "learning_rate": 9.992684525572076e-06, "loss": 2.498, "mean_token_accuracy": 0.47418344393372536, "num_tokens": 287765365.0, "step": 1984 }, { "entropy": 2.367431640625, "epoch": 0.03417611459759132, "grad_norm": 0.5585086941719055, "learning_rate": 9.992669637664679e-06, "loss": 2.3324, "mean_token_accuracy": 0.49284734204411507, "num_tokens": 287923606.0, "step": 1985 }, { "entropy": 2.4853515625, "epoch": 0.03419333178378658, "grad_norm": 0.5721833109855652, "learning_rate": 9.99265473463441e-06, "loss": 2.4174, "mean_token_accuracy": 0.47697839234024286, "num_tokens": 288065850.0, "step": 1986 }, { "entropy": 2.4268798828125, "epoch": 0.03421054896998184, "grad_norm": 0.5846219062805176, "learning_rate": 9.992639816481317e-06, "loss": 2.4077, "mean_token_accuracy": 0.4877488249912858, "num_tokens": 288219901.0, "step": 1987 }, { "entropy": 2.494140625, "epoch": 0.0342277661561771, "grad_norm": 0.7576902508735657, "learning_rate": 9.992624883205446e-06, "loss": 2.4919, "mean_token_accuracy": 0.4697610507719219, "num_tokens": 288368125.0, "step": 1988 }, { "entropy": 2.4490966796875, "epoch": 0.03424498334237236, "grad_norm": 0.5778993368148804, "learning_rate": 9.99260993480684e-06, "loss": 2.3865, "mean_token_accuracy": 0.48272112663835287, "num_tokens": 288502228.0, "step": 1989 }, { "entropy": 2.4388427734375, "epoch": 0.03426220052856761, "grad_norm": 0.5564126372337341, "learning_rate": 9.992594971285545e-06, "loss": 2.4208, "mean_token_accuracy": 0.479803703725338, "num_tokens": 288650170.0, "step": 1990 }, { "entropy": 2.4871826171875, "epoch": 0.03427941771476287, "grad_norm": 0.5940274596214294, "learning_rate": 9.992579992641606e-06, "loss": 2.4353, "mean_token_accuracy": 0.4715647897683084, "num_tokens": 288780528.0, "step": 1991 }, { "entropy": 2.4788818359375, "epoch": 0.034296634900958134, "grad_norm": 0.6397010087966919, "learning_rate": 9.99256499887507e-06, "loss": 2.4715, "mean_token_accuracy": 0.4812845438718796, "num_tokens": 288936902.0, "step": 1992 }, { "entropy": 2.417724609375, "epoch": 0.034313852087153394, "grad_norm": 0.5475616455078125, "learning_rate": 9.99254998998598e-06, "loss": 2.3702, "mean_token_accuracy": 0.4824229711666703, "num_tokens": 289091883.0, "step": 1993 }, { "entropy": 2.4339599609375, "epoch": 0.034331069273348655, "grad_norm": 0.5338613390922546, "learning_rate": 9.99253496597438e-06, "loss": 2.417, "mean_token_accuracy": 0.479421756695956, "num_tokens": 289241366.0, "step": 1994 }, { "entropy": 2.406005859375, "epoch": 0.034348286459543916, "grad_norm": 0.6116018295288086, "learning_rate": 9.992519926840322e-06, "loss": 2.3794, "mean_token_accuracy": 0.4870318342000246, "num_tokens": 289380641.0, "step": 1995 }, { "entropy": 2.4573974609375, "epoch": 0.034365503645739176, "grad_norm": 0.6071226596832275, "learning_rate": 9.992504872583847e-06, "loss": 2.4109, "mean_token_accuracy": 0.47875087382271886, "num_tokens": 289521937.0, "step": 1996 }, { "entropy": 2.4119873046875, "epoch": 0.03438272083193444, "grad_norm": 0.5995868444442749, "learning_rate": 9.992489803205e-06, "loss": 2.3732, "mean_token_accuracy": 0.4848559848032892, "num_tokens": 289670647.0, "step": 1997 }, { "entropy": 2.4595947265625, "epoch": 0.0343999380181297, "grad_norm": 0.5530126094818115, "learning_rate": 9.992474718703829e-06, "loss": 2.4787, "mean_token_accuracy": 0.47666746005415916, "num_tokens": 289833608.0, "step": 1998 }, { "entropy": 2.4443359375, "epoch": 0.03441715520432496, "grad_norm": 0.5142894387245178, "learning_rate": 9.99245961908038e-06, "loss": 2.3763, "mean_token_accuracy": 0.48295469116419554, "num_tokens": 289992658.0, "step": 1999 }, { "entropy": 2.42138671875, "epoch": 0.03443437239052022, "grad_norm": 0.5748444199562073, "learning_rate": 9.992444504334696e-06, "loss": 2.4077, "mean_token_accuracy": 0.48649854911491275, "num_tokens": 290134205.0, "step": 2000 } ], "logging_steps": 1, "max_steps": 58082, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.7734025191882752e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }