diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,20034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.03443437239052022, + "eval_steps": 500, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.413330078125, + "epoch": 1.721718619526011e-05, + "grad_norm": 0.6972781419754028, + "learning_rate": 0.0, + "loss": 2.3998, + "mean_token_accuracy": 0.4832073478028178, + "num_tokens": 152243.0, + "step": 1 + }, + { + "entropy": 2.4014892578125, + "epoch": 3.443437239052022e-05, + "grad_norm": 0.703504204750061, + "learning_rate": 1e-08, + "loss": 2.3565, + "mean_token_accuracy": 0.48620754200965166, + "num_tokens": 302755.0, + "step": 2 + }, + { + "entropy": 2.3984375, + "epoch": 5.165155858578032e-05, + "grad_norm": 0.7933295369148254, + "learning_rate": 2e-08, + "loss": 2.3735, + "mean_token_accuracy": 0.4893745076842606, + "num_tokens": 430137.0, + "step": 3 + }, + { + "entropy": 2.4783935546875, + "epoch": 6.886874478104043e-05, + "grad_norm": 0.6833076477050781, + "learning_rate": 3.0000000000000004e-08, + "loss": 2.4365, + "mean_token_accuracy": 0.4767027348279953, + "num_tokens": 575284.0, + "step": 4 + }, + { + "entropy": 2.457275390625, + "epoch": 8.608593097630054e-05, + "grad_norm": 0.7900882363319397, + "learning_rate": 4e-08, + "loss": 2.4671, + "mean_token_accuracy": 0.47980545135214925, + "num_tokens": 714418.0, + "step": 5 + }, + { + "entropy": 2.367431640625, + "epoch": 0.00010330311717156065, + "grad_norm": 0.6312107443809509, + "learning_rate": 5.0000000000000004e-08, + "loss": 2.3371, + "mean_token_accuracy": 0.49468297231942415, + "num_tokens": 883298.0, + "step": 6 + }, + { + "entropy": 2.45263671875, + "epoch": 0.00012052030336682076, + "grad_norm": 0.6607802510261536, + "learning_rate": 6.000000000000001e-08, + "loss": 2.4411, + "mean_token_accuracy": 0.4778139302507043, + "num_tokens": 1034855.0, + "step": 7 + }, + { + "entropy": 2.482421875, + "epoch": 0.00013773748956208087, + "grad_norm": 0.6800277829170227, + "learning_rate": 7e-08, + "loss": 2.439, + "mean_token_accuracy": 0.4751331675797701, + "num_tokens": 1176490.0, + "step": 8 + }, + { + "entropy": 2.391357421875, + "epoch": 0.000154954675757341, + "grad_norm": 0.7818512916564941, + "learning_rate": 8e-08, + "loss": 2.3771, + "mean_token_accuracy": 0.48402665881440043, + "num_tokens": 1311861.0, + "step": 9 + }, + { + "entropy": 2.39453125, + "epoch": 0.00017217186195260108, + "grad_norm": 0.7182415127754211, + "learning_rate": 9e-08, + "loss": 2.3644, + "mean_token_accuracy": 0.4823771519586444, + "num_tokens": 1454916.0, + "step": 10 + }, + { + "entropy": 2.477783203125, + "epoch": 0.0001893890481478612, + "grad_norm": 0.7052203416824341, + "learning_rate": 1.0000000000000001e-07, + "loss": 2.4729, + "mean_token_accuracy": 0.4706865563057363, + "num_tokens": 1587290.0, + "step": 11 + }, + { + "entropy": 2.4482421875, + "epoch": 0.0002066062343431213, + "grad_norm": 0.6999819278717041, + "learning_rate": 1.1e-07, + "loss": 2.4037, + "mean_token_accuracy": 0.47885329788550735, + "num_tokens": 1724163.0, + "step": 12 + }, + { + "entropy": 2.4818115234375, + "epoch": 0.0002238234205383814, + "grad_norm": 0.7237532734870911, + "learning_rate": 1.2000000000000002e-07, + "loss": 2.5056, + "mean_token_accuracy": 0.46921027079224586, + "num_tokens": 1865666.0, + "step": 13 + }, + { + "entropy": 2.392333984375, + "epoch": 0.00024104060673364153, + "grad_norm": 0.6929540634155273, + "learning_rate": 1.3e-07, + "loss": 2.3854, + "mean_token_accuracy": 0.4831690890714526, + "num_tokens": 2020018.0, + "step": 14 + }, + { + "entropy": 2.4205322265625, + "epoch": 0.00025825779292890165, + "grad_norm": 0.6675299406051636, + "learning_rate": 1.4e-07, + "loss": 2.3398, + "mean_token_accuracy": 0.48172238236293197, + "num_tokens": 2158990.0, + "step": 15 + }, + { + "entropy": 2.385498046875, + "epoch": 0.00027547497912416174, + "grad_norm": 0.7503660321235657, + "learning_rate": 1.5000000000000002e-07, + "loss": 2.3352, + "mean_token_accuracy": 0.4925760827027261, + "num_tokens": 2296401.0, + "step": 16 + }, + { + "entropy": 2.4197998046875, + "epoch": 0.00029269216531942183, + "grad_norm": 0.6798411011695862, + "learning_rate": 1.6e-07, + "loss": 2.4002, + "mean_token_accuracy": 0.4798423429019749, + "num_tokens": 2445270.0, + "step": 17 + }, + { + "entropy": 2.474609375, + "epoch": 0.000309909351514682, + "grad_norm": 0.6871540546417236, + "learning_rate": 1.7000000000000001e-07, + "loss": 2.4938, + "mean_token_accuracy": 0.47186582954600453, + "num_tokens": 2586867.0, + "step": 18 + }, + { + "entropy": 2.50732421875, + "epoch": 0.00032712653770994207, + "grad_norm": 0.717059314250946, + "learning_rate": 1.8e-07, + "loss": 2.525, + "mean_token_accuracy": 0.46879610791802406, + "num_tokens": 2720721.0, + "step": 19 + }, + { + "entropy": 2.424560546875, + "epoch": 0.00034434372390520216, + "grad_norm": 0.6196162700653076, + "learning_rate": 1.9e-07, + "loss": 2.4176, + "mean_token_accuracy": 0.4790610708296299, + "num_tokens": 2876790.0, + "step": 20 + }, + { + "entropy": 2.4759521484375, + "epoch": 0.0003615609101004623, + "grad_norm": 0.6419414281845093, + "learning_rate": 2.0000000000000002e-07, + "loss": 2.4874, + "mean_token_accuracy": 0.471744445618242, + "num_tokens": 3033198.0, + "step": 21 + }, + { + "entropy": 2.383056640625, + "epoch": 0.0003787780962957224, + "grad_norm": 0.6465677618980408, + "learning_rate": 2.1000000000000003e-07, + "loss": 2.3503, + "mean_token_accuracy": 0.48874560045078397, + "num_tokens": 3181718.0, + "step": 22 + }, + { + "entropy": 2.3697509765625, + "epoch": 0.0003959952824909825, + "grad_norm": 0.7614319920539856, + "learning_rate": 2.2e-07, + "loss": 2.3713, + "mean_token_accuracy": 0.48694683285430074, + "num_tokens": 3328269.0, + "step": 23 + }, + { + "entropy": 2.3336181640625, + "epoch": 0.0004132124686862426, + "grad_norm": 0.7275253534317017, + "learning_rate": 2.3000000000000002e-07, + "loss": 2.3167, + "mean_token_accuracy": 0.4986211028881371, + "num_tokens": 3482113.0, + "step": 24 + }, + { + "entropy": 2.44775390625, + "epoch": 0.0004304296548815027, + "grad_norm": 0.6546138525009155, + "learning_rate": 2.4000000000000003e-07, + "loss": 2.4172, + "mean_token_accuracy": 0.48186044162139297, + "num_tokens": 3634079.0, + "step": 25 + }, + { + "entropy": 2.495849609375, + "epoch": 0.0004476468410767628, + "grad_norm": 0.6819922924041748, + "learning_rate": 2.5000000000000004e-07, + "loss": 2.4707, + "mean_token_accuracy": 0.47281224094331264, + "num_tokens": 3784902.0, + "step": 26 + }, + { + "entropy": 2.490966796875, + "epoch": 0.0004648640272720229, + "grad_norm": 0.6698691248893738, + "learning_rate": 2.6e-07, + "loss": 2.4847, + "mean_token_accuracy": 0.46762992488220334, + "num_tokens": 3923281.0, + "step": 27 + }, + { + "entropy": 2.4964599609375, + "epoch": 0.00048208121346728306, + "grad_norm": 0.6502507925033569, + "learning_rate": 2.7e-07, + "loss": 2.4646, + "mean_token_accuracy": 0.46740976348519325, + "num_tokens": 4074956.0, + "step": 28 + }, + { + "entropy": 2.4283447265625, + "epoch": 0.0004992983996625432, + "grad_norm": 0.7344250679016113, + "learning_rate": 2.8e-07, + "loss": 2.425, + "mean_token_accuracy": 0.47232619673013687, + "num_tokens": 4218108.0, + "step": 29 + }, + { + "entropy": 2.453857421875, + "epoch": 0.0005165155858578033, + "grad_norm": 0.7299566268920898, + "learning_rate": 2.9000000000000003e-07, + "loss": 2.4514, + "mean_token_accuracy": 0.477730430662632, + "num_tokens": 4352208.0, + "step": 30 + }, + { + "entropy": 2.39404296875, + "epoch": 0.0005337327720530634, + "grad_norm": 0.6726910471916199, + "learning_rate": 3.0000000000000004e-07, + "loss": 2.3704, + "mean_token_accuracy": 0.4892354430630803, + "num_tokens": 4510862.0, + "step": 31 + }, + { + "entropy": 2.37158203125, + "epoch": 0.0005509499582483235, + "grad_norm": 0.6483345627784729, + "learning_rate": 3.1000000000000005e-07, + "loss": 2.3378, + "mean_token_accuracy": 0.4861680665053427, + "num_tokens": 4655812.0, + "step": 32 + }, + { + "entropy": 2.4970703125, + "epoch": 0.0005681671444435836, + "grad_norm": 0.7244667410850525, + "learning_rate": 3.2e-07, + "loss": 2.4361, + "mean_token_accuracy": 0.4727164036594331, + "num_tokens": 4796670.0, + "step": 33 + }, + { + "entropy": 2.4234619140625, + "epoch": 0.0005853843306388437, + "grad_norm": 0.6697008013725281, + "learning_rate": 3.3e-07, + "loss": 2.4176, + "mean_token_accuracy": 0.48080282052978873, + "num_tokens": 4946476.0, + "step": 34 + }, + { + "entropy": 2.4354248046875, + "epoch": 0.0006026015168341038, + "grad_norm": 0.6681280732154846, + "learning_rate": 3.4000000000000003e-07, + "loss": 2.3967, + "mean_token_accuracy": 0.48242951929569244, + "num_tokens": 5097062.0, + "step": 35 + }, + { + "entropy": 2.4337158203125, + "epoch": 0.000619818703029364, + "grad_norm": 0.6784984469413757, + "learning_rate": 3.5000000000000004e-07, + "loss": 2.4061, + "mean_token_accuracy": 0.4787406297400594, + "num_tokens": 5241121.0, + "step": 36 + }, + { + "entropy": 2.3486328125, + "epoch": 0.000637035889224624, + "grad_norm": 0.7190965414047241, + "learning_rate": 3.6e-07, + "loss": 2.3033, + "mean_token_accuracy": 0.4977131159976125, + "num_tokens": 5379636.0, + "step": 37 + }, + { + "entropy": 2.400390625, + "epoch": 0.0006542530754198841, + "grad_norm": 0.7516965866088867, + "learning_rate": 3.7e-07, + "loss": 2.4019, + "mean_token_accuracy": 0.48473000153899193, + "num_tokens": 5513259.0, + "step": 38 + }, + { + "entropy": 2.4234619140625, + "epoch": 0.0006714702616151442, + "grad_norm": 0.6816972494125366, + "learning_rate": 3.8e-07, + "loss": 2.4078, + "mean_token_accuracy": 0.4816413172520697, + "num_tokens": 5657625.0, + "step": 39 + }, + { + "entropy": 2.4327392578125, + "epoch": 0.0006886874478104043, + "grad_norm": 0.7355881929397583, + "learning_rate": 3.9e-07, + "loss": 2.4284, + "mean_token_accuracy": 0.4844762939028442, + "num_tokens": 5801203.0, + "step": 40 + }, + { + "entropy": 2.4520263671875, + "epoch": 0.0007059046340056644, + "grad_norm": 0.6229036450386047, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.4646, + "mean_token_accuracy": 0.47269141068682075, + "num_tokens": 5953949.0, + "step": 41 + }, + { + "entropy": 2.4000244140625, + "epoch": 0.0007231218202009246, + "grad_norm": 1.3853771686553955, + "learning_rate": 4.1000000000000004e-07, + "loss": 2.415, + "mean_token_accuracy": 0.48673709062859416, + "num_tokens": 6124847.0, + "step": 42 + }, + { + "entropy": 2.443603515625, + "epoch": 0.0007403390063961847, + "grad_norm": 0.6690321564674377, + "learning_rate": 4.2000000000000006e-07, + "loss": 2.4083, + "mean_token_accuracy": 0.4795906525105238, + "num_tokens": 6277763.0, + "step": 43 + }, + { + "entropy": 2.38916015625, + "epoch": 0.0007575561925914448, + "grad_norm": 0.6857286691665649, + "learning_rate": 4.3e-07, + "loss": 2.3703, + "mean_token_accuracy": 0.4900118997320533, + "num_tokens": 6426281.0, + "step": 44 + }, + { + "entropy": 2.4334716796875, + "epoch": 0.0007747733787867049, + "grad_norm": 0.7215378284454346, + "learning_rate": 4.4e-07, + "loss": 2.4171, + "mean_token_accuracy": 0.48149433452636003, + "num_tokens": 6569929.0, + "step": 45 + }, + { + "entropy": 2.4718017578125, + "epoch": 0.000791990564981965, + "grad_norm": 0.6353131532669067, + "learning_rate": 4.5000000000000003e-07, + "loss": 2.4404, + "mean_token_accuracy": 0.4716165652498603, + "num_tokens": 6715254.0, + "step": 46 + }, + { + "entropy": 2.4588623046875, + "epoch": 0.0008092077511772251, + "grad_norm": 0.7047929763793945, + "learning_rate": 4.6000000000000004e-07, + "loss": 2.4363, + "mean_token_accuracy": 0.47845502756536007, + "num_tokens": 6858114.0, + "step": 47 + }, + { + "entropy": 2.4326171875, + "epoch": 0.0008264249373724852, + "grad_norm": 0.6554984450340271, + "learning_rate": 4.7000000000000005e-07, + "loss": 2.4103, + "mean_token_accuracy": 0.48302431078627706, + "num_tokens": 7003888.0, + "step": 48 + }, + { + "entropy": 2.424072265625, + "epoch": 0.0008436421235677454, + "grad_norm": 0.615967333316803, + "learning_rate": 4.800000000000001e-07, + "loss": 2.3426, + "mean_token_accuracy": 0.48605213360860944, + "num_tokens": 7178232.0, + "step": 49 + }, + { + "entropy": 2.3641357421875, + "epoch": 0.0008608593097630055, + "grad_norm": 0.7089155316352844, + "learning_rate": 4.900000000000001e-07, + "loss": 2.3231, + "mean_token_accuracy": 0.49523815233260393, + "num_tokens": 7323317.0, + "step": 50 + }, + { + "entropy": 2.355224609375, + "epoch": 0.0008780764959582655, + "grad_norm": 0.6622390151023865, + "learning_rate": 5.000000000000001e-07, + "loss": 2.328, + "mean_token_accuracy": 0.49393986631184816, + "num_tokens": 7473982.0, + "step": 51 + }, + { + "entropy": 2.41357421875, + "epoch": 0.0008952936821535256, + "grad_norm": 0.7162524461746216, + "learning_rate": 5.1e-07, + "loss": 2.3915, + "mean_token_accuracy": 0.4822638025507331, + "num_tokens": 7618445.0, + "step": 52 + }, + { + "entropy": 2.4310302734375, + "epoch": 0.0009125108683487857, + "grad_norm": 0.7647550106048584, + "learning_rate": 5.2e-07, + "loss": 2.3974, + "mean_token_accuracy": 0.4842324573546648, + "num_tokens": 7759148.0, + "step": 53 + }, + { + "entropy": 2.4063720703125, + "epoch": 0.0009297280545440458, + "grad_norm": 0.6380776166915894, + "learning_rate": 5.3e-07, + "loss": 2.4044, + "mean_token_accuracy": 0.48621372459456325, + "num_tokens": 7909476.0, + "step": 54 + }, + { + "entropy": 2.40234375, + "epoch": 0.000946945240739306, + "grad_norm": 0.6949607729911804, + "learning_rate": 5.4e-07, + "loss": 2.3508, + "mean_token_accuracy": 0.48716961592435837, + "num_tokens": 8052542.0, + "step": 55 + }, + { + "entropy": 2.361328125, + "epoch": 0.0009641624269345661, + "grad_norm": 0.6398380398750305, + "learning_rate": 5.5e-07, + "loss": 2.298, + "mean_token_accuracy": 0.49046063888818026, + "num_tokens": 8198359.0, + "step": 56 + }, + { + "entropy": 2.424560546875, + "epoch": 0.000981379613129826, + "grad_norm": 0.6661210060119629, + "learning_rate": 5.6e-07, + "loss": 2.3812, + "mean_token_accuracy": 0.4842396741732955, + "num_tokens": 8350561.0, + "step": 57 + }, + { + "entropy": 2.42333984375, + "epoch": 0.0009985967993250864, + "grad_norm": 0.6568598747253418, + "learning_rate": 5.7e-07, + "loss": 2.386, + "mean_token_accuracy": 0.4811908514238894, + "num_tokens": 8498421.0, + "step": 58 + }, + { + "entropy": 2.4390869140625, + "epoch": 0.0010158139855203465, + "grad_norm": 0.6521676778793335, + "learning_rate": 5.800000000000001e-07, + "loss": 2.4007, + "mean_token_accuracy": 0.47653205832466483, + "num_tokens": 8645838.0, + "step": 59 + }, + { + "entropy": 2.4642333984375, + "epoch": 0.0010330311717156066, + "grad_norm": 0.6818094849586487, + "learning_rate": 5.900000000000001e-07, + "loss": 2.4183, + "mean_token_accuracy": 0.47624633833765984, + "num_tokens": 8781213.0, + "step": 60 + }, + { + "entropy": 2.4544677734375, + "epoch": 0.0010502483579108667, + "grad_norm": 0.7099837064743042, + "learning_rate": 6.000000000000001e-07, + "loss": 2.429, + "mean_token_accuracy": 0.4788547111675143, + "num_tokens": 8932181.0, + "step": 61 + }, + { + "entropy": 2.475830078125, + "epoch": 0.0010674655441061268, + "grad_norm": 0.6400408744812012, + "learning_rate": 6.100000000000001e-07, + "loss": 2.452, + "mean_token_accuracy": 0.4709730679169297, + "num_tokens": 9069468.0, + "step": 62 + }, + { + "entropy": 2.4625244140625, + "epoch": 0.0010846827303013869, + "grad_norm": 0.6099865436553955, + "learning_rate": 6.200000000000001e-07, + "loss": 2.4152, + "mean_token_accuracy": 0.4845339651219547, + "num_tokens": 9238211.0, + "step": 63 + }, + { + "entropy": 2.4390869140625, + "epoch": 0.001101899916496647, + "grad_norm": 0.6318409442901611, + "learning_rate": 6.3e-07, + "loss": 2.402, + "mean_token_accuracy": 0.4825730072334409, + "num_tokens": 9393830.0, + "step": 64 + }, + { + "entropy": 2.3837890625, + "epoch": 0.001119117102691907, + "grad_norm": 0.6888275742530823, + "learning_rate": 6.4e-07, + "loss": 2.3289, + "mean_token_accuracy": 0.4893337092362344, + "num_tokens": 9536691.0, + "step": 65 + }, + { + "entropy": 2.476806640625, + "epoch": 0.0011363342888871671, + "grad_norm": 0.6647155284881592, + "learning_rate": 6.5e-07, + "loss": 2.4071, + "mean_token_accuracy": 0.47626868123188615, + "num_tokens": 9672776.0, + "step": 66 + }, + { + "entropy": 2.4517822265625, + "epoch": 0.0011535514750824272, + "grad_norm": 0.6950869560241699, + "learning_rate": 6.6e-07, + "loss": 2.4153, + "mean_token_accuracy": 0.4799360786564648, + "num_tokens": 9823719.0, + "step": 67 + }, + { + "entropy": 2.4583740234375, + "epoch": 0.0011707686612776873, + "grad_norm": 0.5642852783203125, + "learning_rate": 6.7e-07, + "loss": 2.4354, + "mean_token_accuracy": 0.47825055569410324, + "num_tokens": 9979166.0, + "step": 68 + }, + { + "entropy": 2.503173828125, + "epoch": 0.0011879858474729474, + "grad_norm": 0.6830074787139893, + "learning_rate": 6.800000000000001e-07, + "loss": 2.4759, + "mean_token_accuracy": 0.46751530282199383, + "num_tokens": 10123771.0, + "step": 69 + }, + { + "entropy": 2.4609375, + "epoch": 0.0012052030336682075, + "grad_norm": 0.611301600933075, + "learning_rate": 6.900000000000001e-07, + "loss": 2.3415, + "mean_token_accuracy": 0.4828020860441029, + "num_tokens": 10266978.0, + "step": 70 + }, + { + "entropy": 2.393798828125, + "epoch": 0.0012224202198634678, + "grad_norm": 0.6329925060272217, + "learning_rate": 7.000000000000001e-07, + "loss": 2.3527, + "mean_token_accuracy": 0.4885860946960747, + "num_tokens": 10411005.0, + "step": 71 + }, + { + "entropy": 2.4456787109375, + "epoch": 0.001239637406058728, + "grad_norm": 0.6201856732368469, + "learning_rate": 7.1e-07, + "loss": 2.3725, + "mean_token_accuracy": 0.4878034754656255, + "num_tokens": 10560620.0, + "step": 72 + }, + { + "entropy": 2.50439453125, + "epoch": 0.001256854592253988, + "grad_norm": 0.6518511772155762, + "learning_rate": 7.2e-07, + "loss": 2.4772, + "mean_token_accuracy": 0.47327897092327476, + "num_tokens": 10692258.0, + "step": 73 + }, + { + "entropy": 2.5009765625, + "epoch": 0.001274071778449248, + "grad_norm": 0.6756422519683838, + "learning_rate": 7.3e-07, + "loss": 2.5074, + "mean_token_accuracy": 0.4726545801386237, + "num_tokens": 10841410.0, + "step": 74 + }, + { + "entropy": 2.47998046875, + "epoch": 0.0012912889646445082, + "grad_norm": 0.5928777456283569, + "learning_rate": 7.4e-07, + "loss": 2.4521, + "mean_token_accuracy": 0.4727823534049094, + "num_tokens": 10996060.0, + "step": 75 + }, + { + "entropy": 2.499755859375, + "epoch": 0.0013085061508397683, + "grad_norm": 0.6060748100280762, + "learning_rate": 7.5e-07, + "loss": 2.4569, + "mean_token_accuracy": 0.4745932733640075, + "num_tokens": 11146252.0, + "step": 76 + }, + { + "entropy": 2.429443359375, + "epoch": 0.0013257233370350284, + "grad_norm": 0.633307933807373, + "learning_rate": 7.6e-07, + "loss": 2.3987, + "mean_token_accuracy": 0.4819001527503133, + "num_tokens": 11287713.0, + "step": 77 + }, + { + "entropy": 2.38232421875, + "epoch": 0.0013429405232302885, + "grad_norm": 0.6448901295661926, + "learning_rate": 7.7e-07, + "loss": 2.3849, + "mean_token_accuracy": 0.4902081396430731, + "num_tokens": 11427731.0, + "step": 78 + }, + { + "entropy": 2.4320068359375, + "epoch": 0.0013601577094255485, + "grad_norm": 0.7015244364738464, + "learning_rate": 7.8e-07, + "loss": 2.3898, + "mean_token_accuracy": 0.48666849778965116, + "num_tokens": 11570107.0, + "step": 79 + }, + { + "entropy": 2.47998046875, + "epoch": 0.0013773748956208086, + "grad_norm": 0.6528029441833496, + "learning_rate": 7.900000000000001e-07, + "loss": 2.4978, + "mean_token_accuracy": 0.4753390052355826, + "num_tokens": 11704020.0, + "step": 80 + }, + { + "entropy": 2.433837890625, + "epoch": 0.0013945920818160687, + "grad_norm": 0.7266194820404053, + "learning_rate": 8.000000000000001e-07, + "loss": 2.3828, + "mean_token_accuracy": 0.479493273422122, + "num_tokens": 11849459.0, + "step": 81 + }, + { + "entropy": 2.4395751953125, + "epoch": 0.0014118092680113288, + "grad_norm": 0.6236125230789185, + "learning_rate": 8.100000000000001e-07, + "loss": 2.3505, + "mean_token_accuracy": 0.48662899900227785, + "num_tokens": 12010828.0, + "step": 82 + }, + { + "entropy": 2.475830078125, + "epoch": 0.001429026454206589, + "grad_norm": 0.7149572968482971, + "learning_rate": 8.200000000000001e-07, + "loss": 2.4566, + "mean_token_accuracy": 0.47693332051858306, + "num_tokens": 12157399.0, + "step": 83 + }, + { + "entropy": 2.438720703125, + "epoch": 0.0014462436404018492, + "grad_norm": 0.6647018790245056, + "learning_rate": 8.300000000000001e-07, + "loss": 2.3986, + "mean_token_accuracy": 0.4824375621974468, + "num_tokens": 12287842.0, + "step": 84 + }, + { + "entropy": 2.41455078125, + "epoch": 0.0014634608265971093, + "grad_norm": 0.6005454063415527, + "learning_rate": 8.400000000000001e-07, + "loss": 2.3891, + "mean_token_accuracy": 0.4884789031930268, + "num_tokens": 12436104.0, + "step": 85 + }, + { + "entropy": 2.4703369140625, + "epoch": 0.0014806780127923694, + "grad_norm": 0.6212813854217529, + "learning_rate": 8.500000000000001e-07, + "loss": 2.34, + "mean_token_accuracy": 0.4844899824820459, + "num_tokens": 12575067.0, + "step": 86 + }, + { + "entropy": 2.6005859375, + "epoch": 0.0014978951989876295, + "grad_norm": 0.8042929172515869, + "learning_rate": 8.6e-07, + "loss": 2.6196, + "mean_token_accuracy": 0.4688438312150538, + "num_tokens": 12730536.0, + "step": 87 + }, + { + "entropy": 2.4884033203125, + "epoch": 0.0015151123851828896, + "grad_norm": 0.5991901755332947, + "learning_rate": 8.7e-07, + "loss": 2.4611, + "mean_token_accuracy": 0.4766750931739807, + "num_tokens": 12882514.0, + "step": 88 + }, + { + "entropy": 2.4810791015625, + "epoch": 0.0015323295713781497, + "grad_norm": 0.6494613289833069, + "learning_rate": 8.8e-07, + "loss": 2.4435, + "mean_token_accuracy": 0.47679867735132575, + "num_tokens": 13023006.0, + "step": 89 + }, + { + "entropy": 2.4412841796875, + "epoch": 0.0015495467575734098, + "grad_norm": 0.6427425146102905, + "learning_rate": 8.900000000000001e-07, + "loss": 2.3905, + "mean_token_accuracy": 0.4838052117265761, + "num_tokens": 13156996.0, + "step": 90 + }, + { + "entropy": 2.394775390625, + "epoch": 0.0015667639437686699, + "grad_norm": 0.6279881596565247, + "learning_rate": 9.000000000000001e-07, + "loss": 2.3753, + "mean_token_accuracy": 0.48632694967091084, + "num_tokens": 13299631.0, + "step": 91 + }, + { + "entropy": 2.4176025390625, + "epoch": 0.00158398112996393, + "grad_norm": 0.6744757294654846, + "learning_rate": 9.100000000000001e-07, + "loss": 2.3854, + "mean_token_accuracy": 0.4846075074747205, + "num_tokens": 13440366.0, + "step": 92 + }, + { + "entropy": 2.4541015625, + "epoch": 0.00160119831615919, + "grad_norm": 0.6708775758743286, + "learning_rate": 9.200000000000001e-07, + "loss": 2.4069, + "mean_token_accuracy": 0.481810312718153, + "num_tokens": 13566615.0, + "step": 93 + }, + { + "entropy": 2.443115234375, + "epoch": 0.0016184155023544501, + "grad_norm": 0.5979477167129517, + "learning_rate": 9.300000000000001e-07, + "loss": 2.3846, + "mean_token_accuracy": 0.48044557217508554, + "num_tokens": 13719522.0, + "step": 94 + }, + { + "entropy": 2.45263671875, + "epoch": 0.0016356326885497102, + "grad_norm": 0.6026092171669006, + "learning_rate": 9.400000000000001e-07, + "loss": 2.4051, + "mean_token_accuracy": 0.4736237172037363, + "num_tokens": 13864674.0, + "step": 95 + }, + { + "entropy": 2.3526611328125, + "epoch": 0.0016528498747449703, + "grad_norm": 0.5873211622238159, + "learning_rate": 9.500000000000001e-07, + "loss": 2.2743, + "mean_token_accuracy": 0.49419589480385184, + "num_tokens": 14027711.0, + "step": 96 + }, + { + "entropy": 2.460693359375, + "epoch": 0.0016700670609402306, + "grad_norm": 0.5936735272407532, + "learning_rate": 9.600000000000001e-07, + "loss": 2.3992, + "mean_token_accuracy": 0.4806458824314177, + "num_tokens": 14172381.0, + "step": 97 + }, + { + "entropy": 2.45458984375, + "epoch": 0.0016872842471354907, + "grad_norm": 1.1826030015945435, + "learning_rate": 9.7e-07, + "loss": 2.4255, + "mean_token_accuracy": 0.4772787392139435, + "num_tokens": 14309500.0, + "step": 98 + }, + { + "entropy": 2.51806640625, + "epoch": 0.0017045014333307508, + "grad_norm": 0.6550461053848267, + "learning_rate": 9.800000000000001e-07, + "loss": 2.4692, + "mean_token_accuracy": 0.47163511207327247, + "num_tokens": 14449340.0, + "step": 99 + }, + { + "entropy": 2.446533203125, + "epoch": 0.001721718619526011, + "grad_norm": 0.5825348496437073, + "learning_rate": 9.9e-07, + "loss": 2.4099, + "mean_token_accuracy": 0.47821634402498603, + "num_tokens": 14600293.0, + "step": 100 + }, + { + "entropy": 2.454833984375, + "epoch": 0.001738935805721271, + "grad_norm": 0.5967729091644287, + "learning_rate": 1.0000000000000002e-06, + "loss": 2.4471, + "mean_token_accuracy": 0.4838520339690149, + "num_tokens": 14754154.0, + "step": 101 + }, + { + "entropy": 2.470703125, + "epoch": 0.001756152991916531, + "grad_norm": 0.7432575225830078, + "learning_rate": 1.01e-06, + "loss": 2.4286, + "mean_token_accuracy": 0.48514684848487377, + "num_tokens": 14912580.0, + "step": 102 + }, + { + "entropy": 2.5068359375, + "epoch": 0.0017733701781117912, + "grad_norm": 0.6083717942237854, + "learning_rate": 1.02e-06, + "loss": 2.4618, + "mean_token_accuracy": 0.47291735280305147, + "num_tokens": 15054619.0, + "step": 103 + }, + { + "entropy": 2.37841796875, + "epoch": 0.0017905873643070513, + "grad_norm": 0.664107084274292, + "learning_rate": 1.03e-06, + "loss": 2.3513, + "mean_token_accuracy": 0.49140653293579817, + "num_tokens": 15184259.0, + "step": 104 + }, + { + "entropy": 2.408935546875, + "epoch": 0.0018078045505023114, + "grad_norm": 0.6227862238883972, + "learning_rate": 1.04e-06, + "loss": 2.3885, + "mean_token_accuracy": 0.4819969036616385, + "num_tokens": 15328191.0, + "step": 105 + }, + { + "entropy": 2.4210205078125, + "epoch": 0.0018250217366975715, + "grad_norm": 0.5873227119445801, + "learning_rate": 1.0500000000000001e-06, + "loss": 2.3571, + "mean_token_accuracy": 0.4860111135058105, + "num_tokens": 15484579.0, + "step": 106 + }, + { + "entropy": 2.4600830078125, + "epoch": 0.0018422389228928315, + "grad_norm": 0.7568690776824951, + "learning_rate": 1.06e-06, + "loss": 2.4228, + "mean_token_accuracy": 0.4871084992773831, + "num_tokens": 15621836.0, + "step": 107 + }, + { + "entropy": 2.443603515625, + "epoch": 0.0018594561090880916, + "grad_norm": 0.5804514288902283, + "learning_rate": 1.0700000000000001e-06, + "loss": 2.4028, + "mean_token_accuracy": 0.4824648411013186, + "num_tokens": 15778932.0, + "step": 108 + }, + { + "entropy": 2.455810546875, + "epoch": 0.001876673295283352, + "grad_norm": 0.5958721041679382, + "learning_rate": 1.08e-06, + "loss": 2.3833, + "mean_token_accuracy": 0.48080389108508825, + "num_tokens": 15924525.0, + "step": 109 + }, + { + "entropy": 2.3779296875, + "epoch": 0.001893890481478612, + "grad_norm": 0.6376572251319885, + "learning_rate": 1.0900000000000002e-06, + "loss": 2.3314, + "mean_token_accuracy": 0.4905442167073488, + "num_tokens": 16080275.0, + "step": 110 + }, + { + "entropy": 2.4273681640625, + "epoch": 0.0019111076676738721, + "grad_norm": 0.6554363369941711, + "learning_rate": 1.1e-06, + "loss": 2.4002, + "mean_token_accuracy": 0.4839071067981422, + "num_tokens": 16212132.0, + "step": 111 + }, + { + "entropy": 2.4451904296875, + "epoch": 0.0019283248538691322, + "grad_norm": 0.6245486736297607, + "learning_rate": 1.1100000000000002e-06, + "loss": 2.4013, + "mean_token_accuracy": 0.4785764031112194, + "num_tokens": 16346241.0, + "step": 112 + }, + { + "entropy": 2.44677734375, + "epoch": 0.0019455420400643923, + "grad_norm": 0.6356763243675232, + "learning_rate": 1.12e-06, + "loss": 2.3992, + "mean_token_accuracy": 0.4817821686156094, + "num_tokens": 16477490.0, + "step": 113 + }, + { + "entropy": 2.480224609375, + "epoch": 0.001962759226259652, + "grad_norm": 0.7369113564491272, + "learning_rate": 1.1300000000000002e-06, + "loss": 2.4604, + "mean_token_accuracy": 0.4817192433401942, + "num_tokens": 16635712.0, + "step": 114 + }, + { + "entropy": 2.446533203125, + "epoch": 0.0019799764124549123, + "grad_norm": 0.5834375023841858, + "learning_rate": 1.14e-06, + "loss": 2.3876, + "mean_token_accuracy": 0.48327695531770587, + "num_tokens": 16790029.0, + "step": 115 + }, + { + "entropy": 2.43994140625, + "epoch": 0.001997193598650173, + "grad_norm": 0.589958906173706, + "learning_rate": 1.1500000000000002e-06, + "loss": 2.3869, + "mean_token_accuracy": 0.4807235752232373, + "num_tokens": 16939941.0, + "step": 116 + }, + { + "entropy": 2.4541015625, + "epoch": 0.002014410784845433, + "grad_norm": 0.5870490074157715, + "learning_rate": 1.1600000000000001e-06, + "loss": 2.3979, + "mean_token_accuracy": 0.47932128235697746, + "num_tokens": 17090653.0, + "step": 117 + }, + { + "entropy": 2.4052734375, + "epoch": 0.002031627971040693, + "grad_norm": 0.6636145710945129, + "learning_rate": 1.1700000000000002e-06, + "loss": 2.3499, + "mean_token_accuracy": 0.48607511818408966, + "num_tokens": 17225388.0, + "step": 118 + }, + { + "entropy": 2.4976806640625, + "epoch": 0.002048845157235953, + "grad_norm": 0.8917343020439148, + "learning_rate": 1.1800000000000001e-06, + "loss": 2.4271, + "mean_token_accuracy": 0.47680498752743006, + "num_tokens": 17365012.0, + "step": 119 + }, + { + "entropy": 2.4691162109375, + "epoch": 0.002066062343431213, + "grad_norm": 0.623699963092804, + "learning_rate": 1.19e-06, + "loss": 2.4145, + "mean_token_accuracy": 0.4758805222809315, + "num_tokens": 17515672.0, + "step": 120 + }, + { + "entropy": 2.508544921875, + "epoch": 0.0020832795296264733, + "grad_norm": 0.5833603143692017, + "learning_rate": 1.2000000000000002e-06, + "loss": 2.4093, + "mean_token_accuracy": 0.48172463616356254, + "num_tokens": 17664187.0, + "step": 121 + }, + { + "entropy": 2.4619140625, + "epoch": 0.0021004967158217334, + "grad_norm": 0.6505944728851318, + "learning_rate": 1.21e-06, + "loss": 2.4608, + "mean_token_accuracy": 0.47443893272429705, + "num_tokens": 17806861.0, + "step": 122 + }, + { + "entropy": 2.5152587890625, + "epoch": 0.0021177139020169935, + "grad_norm": 0.6009451150894165, + "learning_rate": 1.2200000000000002e-06, + "loss": 2.4738, + "mean_token_accuracy": 0.4736114493571222, + "num_tokens": 17949930.0, + "step": 123 + }, + { + "entropy": 2.415771484375, + "epoch": 0.0021349310882122535, + "grad_norm": 0.5856963992118835, + "learning_rate": 1.23e-06, + "loss": 2.3834, + "mean_token_accuracy": 0.4865727727301419, + "num_tokens": 18096555.0, + "step": 124 + }, + { + "entropy": 2.3616943359375, + "epoch": 0.0021521482744075136, + "grad_norm": 0.5739309787750244, + "learning_rate": 1.2400000000000002e-06, + "loss": 2.3385, + "mean_token_accuracy": 0.49280355405062437, + "num_tokens": 18249345.0, + "step": 125 + }, + { + "entropy": 2.4732666015625, + "epoch": 0.0021693654606027737, + "grad_norm": 0.6043084859848022, + "learning_rate": 1.25e-06, + "loss": 2.416, + "mean_token_accuracy": 0.47766703460365534, + "num_tokens": 18385525.0, + "step": 126 + }, + { + "entropy": 2.5040283203125, + "epoch": 0.002186582646798034, + "grad_norm": 0.6954610347747803, + "learning_rate": 1.26e-06, + "loss": 2.4997, + "mean_token_accuracy": 0.4837381485849619, + "num_tokens": 18522666.0, + "step": 127 + }, + { + "entropy": 2.45166015625, + "epoch": 0.002203799832993294, + "grad_norm": 0.5906988382339478, + "learning_rate": 1.2700000000000001e-06, + "loss": 2.4099, + "mean_token_accuracy": 0.47865421138703823, + "num_tokens": 18662132.0, + "step": 128 + }, + { + "entropy": 2.395263671875, + "epoch": 0.002221017019188554, + "grad_norm": 0.5896300077438354, + "learning_rate": 1.28e-06, + "loss": 2.3787, + "mean_token_accuracy": 0.49431150034070015, + "num_tokens": 18803105.0, + "step": 129 + }, + { + "entropy": 2.435546875, + "epoch": 0.002238234205383814, + "grad_norm": 0.618240475654602, + "learning_rate": 1.2900000000000001e-06, + "loss": 2.3837, + "mean_token_accuracy": 0.48449931014329195, + "num_tokens": 18942086.0, + "step": 130 + }, + { + "entropy": 2.387451171875, + "epoch": 0.002255451391579074, + "grad_norm": 0.5633107423782349, + "learning_rate": 1.3e-06, + "loss": 2.3315, + "mean_token_accuracy": 0.4937907229177654, + "num_tokens": 19097977.0, + "step": 131 + }, + { + "entropy": 2.431640625, + "epoch": 0.0022726685777743343, + "grad_norm": 0.5887622833251953, + "learning_rate": 1.3100000000000002e-06, + "loss": 2.381, + "mean_token_accuracy": 0.48834379855543375, + "num_tokens": 19242582.0, + "step": 132 + }, + { + "entropy": 2.476806640625, + "epoch": 0.0022898857639695944, + "grad_norm": 0.633418083190918, + "learning_rate": 1.32e-06, + "loss": 2.4313, + "mean_token_accuracy": 0.4792938116006553, + "num_tokens": 19379757.0, + "step": 133 + }, + { + "entropy": 2.42333984375, + "epoch": 0.0023071029501648545, + "grad_norm": 0.6043598651885986, + "learning_rate": 1.3300000000000002e-06, + "loss": 2.343, + "mean_token_accuracy": 0.4908184530213475, + "num_tokens": 19524622.0, + "step": 134 + }, + { + "entropy": 2.46484375, + "epoch": 0.0023243201363601146, + "grad_norm": 0.6375739574432373, + "learning_rate": 1.34e-06, + "loss": 2.4083, + "mean_token_accuracy": 0.4828670499846339, + "num_tokens": 19663378.0, + "step": 135 + }, + { + "entropy": 2.428955078125, + "epoch": 0.0023415373225553746, + "grad_norm": 0.5688341856002808, + "learning_rate": 1.3500000000000002e-06, + "loss": 2.3791, + "mean_token_accuracy": 0.48436517268419266, + "num_tokens": 19816124.0, + "step": 136 + }, + { + "entropy": 2.4317626953125, + "epoch": 0.0023587545087506347, + "grad_norm": 0.5994829535484314, + "learning_rate": 1.3600000000000001e-06, + "loss": 2.3781, + "mean_token_accuracy": 0.4828225467354059, + "num_tokens": 19958536.0, + "step": 137 + }, + { + "entropy": 2.4664306640625, + "epoch": 0.002375971694945895, + "grad_norm": 0.5764389634132385, + "learning_rate": 1.3700000000000002e-06, + "loss": 2.4281, + "mean_token_accuracy": 0.47715017944574356, + "num_tokens": 20101195.0, + "step": 138 + }, + { + "entropy": 2.515869140625, + "epoch": 0.002393188881141155, + "grad_norm": 0.5928436517715454, + "learning_rate": 1.3800000000000001e-06, + "loss": 2.4664, + "mean_token_accuracy": 0.4714601272717118, + "num_tokens": 20250847.0, + "step": 139 + }, + { + "entropy": 2.47509765625, + "epoch": 0.002410406067336415, + "grad_norm": 0.7260881662368774, + "learning_rate": 1.3900000000000002e-06, + "loss": 2.4494, + "mean_token_accuracy": 0.47532156156376004, + "num_tokens": 20391934.0, + "step": 140 + }, + { + "entropy": 2.42236328125, + "epoch": 0.002427623253531675, + "grad_norm": 0.5777806043624878, + "learning_rate": 1.4000000000000001e-06, + "loss": 2.3889, + "mean_token_accuracy": 0.4810917223803699, + "num_tokens": 20542473.0, + "step": 141 + }, + { + "entropy": 2.426025390625, + "epoch": 0.0024448404397269356, + "grad_norm": 0.5795040130615234, + "learning_rate": 1.41e-06, + "loss": 2.357, + "mean_token_accuracy": 0.4965799758210778, + "num_tokens": 20692496.0, + "step": 142 + }, + { + "entropy": 2.446044921875, + "epoch": 0.0024620576259221957, + "grad_norm": 0.584563672542572, + "learning_rate": 1.42e-06, + "loss": 2.3614, + "mean_token_accuracy": 0.4806175325065851, + "num_tokens": 20829269.0, + "step": 143 + }, + { + "entropy": 2.431640625, + "epoch": 0.002479274812117456, + "grad_norm": 0.5803564190864563, + "learning_rate": 1.43e-06, + "loss": 2.3908, + "mean_token_accuracy": 0.4852483980357647, + "num_tokens": 20976041.0, + "step": 144 + }, + { + "entropy": 2.4169921875, + "epoch": 0.002496491998312716, + "grad_norm": 0.5736103057861328, + "learning_rate": 1.44e-06, + "loss": 2.407, + "mean_token_accuracy": 0.48149813804775476, + "num_tokens": 21123880.0, + "step": 145 + }, + { + "entropy": 2.501708984375, + "epoch": 0.002513709184507976, + "grad_norm": 0.5667904615402222, + "learning_rate": 1.45e-06, + "loss": 2.4569, + "mean_token_accuracy": 0.470580879598856, + "num_tokens": 21267685.0, + "step": 146 + }, + { + "entropy": 2.431396484375, + "epoch": 0.002530926370703236, + "grad_norm": 0.6545902490615845, + "learning_rate": 1.46e-06, + "loss": 2.368, + "mean_token_accuracy": 0.4863086869008839, + "num_tokens": 21415641.0, + "step": 147 + }, + { + "entropy": 2.4876708984375, + "epoch": 0.002548143556898496, + "grad_norm": 0.6045504808425903, + "learning_rate": 1.4700000000000001e-06, + "loss": 2.4502, + "mean_token_accuracy": 0.47358084423467517, + "num_tokens": 21568579.0, + "step": 148 + }, + { + "entropy": 2.4735107421875, + "epoch": 0.0025653607430937563, + "grad_norm": 0.5392025113105774, + "learning_rate": 1.48e-06, + "loss": 2.3939, + "mean_token_accuracy": 0.47306955326348543, + "num_tokens": 21725147.0, + "step": 149 + }, + { + "entropy": 2.38037109375, + "epoch": 0.0025825779292890164, + "grad_norm": 0.6125035285949707, + "learning_rate": 1.4900000000000001e-06, + "loss": 2.3062, + "mean_token_accuracy": 0.49953836342319846, + "num_tokens": 21861243.0, + "step": 150 + }, + { + "entropy": 2.40234375, + "epoch": 0.0025997951154842765, + "grad_norm": 0.6005491614341736, + "learning_rate": 1.5e-06, + "loss": 2.3803, + "mean_token_accuracy": 0.483843975700438, + "num_tokens": 21996041.0, + "step": 151 + }, + { + "entropy": 2.51171875, + "epoch": 0.0026170123016795365, + "grad_norm": 0.6047178506851196, + "learning_rate": 1.5100000000000002e-06, + "loss": 2.4551, + "mean_token_accuracy": 0.4724902934394777, + "num_tokens": 22124115.0, + "step": 152 + }, + { + "entropy": 2.4654541015625, + "epoch": 0.0026342294878747966, + "grad_norm": 0.5984538197517395, + "learning_rate": 1.52e-06, + "loss": 2.4066, + "mean_token_accuracy": 0.47982197999954224, + "num_tokens": 22268397.0, + "step": 153 + }, + { + "entropy": 2.5594482421875, + "epoch": 0.0026514466740700567, + "grad_norm": 0.5999759435653687, + "learning_rate": 1.5300000000000002e-06, + "loss": 2.5766, + "mean_token_accuracy": 0.46560019347816706, + "num_tokens": 22417806.0, + "step": 154 + }, + { + "entropy": 2.468017578125, + "epoch": 0.002668663860265317, + "grad_norm": 0.5409292578697205, + "learning_rate": 1.54e-06, + "loss": 2.4253, + "mean_token_accuracy": 0.471966958604753, + "num_tokens": 22581097.0, + "step": 155 + }, + { + "entropy": 2.4951171875, + "epoch": 0.002685881046460577, + "grad_norm": 0.5459778904914856, + "learning_rate": 1.5500000000000002e-06, + "loss": 2.4459, + "mean_token_accuracy": 0.47298973286524415, + "num_tokens": 22735616.0, + "step": 156 + }, + { + "entropy": 2.4017333984375, + "epoch": 0.002703098232655837, + "grad_norm": 0.5974356532096863, + "learning_rate": 1.56e-06, + "loss": 2.3332, + "mean_token_accuracy": 0.49170317640528083, + "num_tokens": 22876476.0, + "step": 157 + }, + { + "entropy": 2.447021484375, + "epoch": 0.002720315418851097, + "grad_norm": 0.6058854460716248, + "learning_rate": 1.5700000000000002e-06, + "loss": 2.3759, + "mean_token_accuracy": 0.48308438109233975, + "num_tokens": 23000697.0, + "step": 158 + }, + { + "entropy": 2.4227294921875, + "epoch": 0.002737532605046357, + "grad_norm": 0.6649252772331238, + "learning_rate": 1.5800000000000001e-06, + "loss": 2.3364, + "mean_token_accuracy": 0.49115608306601644, + "num_tokens": 23128424.0, + "step": 159 + }, + { + "entropy": 2.459716796875, + "epoch": 0.0027547497912416173, + "grad_norm": 0.571075439453125, + "learning_rate": 1.5900000000000002e-06, + "loss": 2.4113, + "mean_token_accuracy": 0.4818022232502699, + "num_tokens": 23269190.0, + "step": 160 + }, + { + "entropy": 2.45654296875, + "epoch": 0.0027719669774368774, + "grad_norm": 0.5999649167060852, + "learning_rate": 1.6000000000000001e-06, + "loss": 2.4138, + "mean_token_accuracy": 0.4814398717135191, + "num_tokens": 23423206.0, + "step": 161 + }, + { + "entropy": 2.4156494140625, + "epoch": 0.0027891841636321375, + "grad_norm": 0.5621162056922913, + "learning_rate": 1.6100000000000003e-06, + "loss": 2.3751, + "mean_token_accuracy": 0.48285360960289836, + "num_tokens": 23572561.0, + "step": 162 + }, + { + "entropy": 2.3802490234375, + "epoch": 0.0028064013498273976, + "grad_norm": 0.6032527089118958, + "learning_rate": 1.6200000000000002e-06, + "loss": 2.3422, + "mean_token_accuracy": 0.4902229546569288, + "num_tokens": 23718051.0, + "step": 163 + }, + { + "entropy": 2.370849609375, + "epoch": 0.0028236185360226576, + "grad_norm": 0.55767822265625, + "learning_rate": 1.6300000000000003e-06, + "loss": 2.3473, + "mean_token_accuracy": 0.49248579889535904, + "num_tokens": 23877563.0, + "step": 164 + }, + { + "entropy": 2.5054931640625, + "epoch": 0.0028408357222179177, + "grad_norm": 0.5944464206695557, + "learning_rate": 1.6400000000000002e-06, + "loss": 2.4374, + "mean_token_accuracy": 0.47586293099448085, + "num_tokens": 24013173.0, + "step": 165 + }, + { + "entropy": 2.4613037109375, + "epoch": 0.002858052908413178, + "grad_norm": 0.6158614754676819, + "learning_rate": 1.6500000000000003e-06, + "loss": 2.4078, + "mean_token_accuracy": 0.48460292909294367, + "num_tokens": 24150366.0, + "step": 166 + }, + { + "entropy": 2.4307861328125, + "epoch": 0.0028752700946084384, + "grad_norm": 0.562500536441803, + "learning_rate": 1.6600000000000002e-06, + "loss": 2.3867, + "mean_token_accuracy": 0.4834507182240486, + "num_tokens": 24300680.0, + "step": 167 + }, + { + "entropy": 2.4783935546875, + "epoch": 0.0028924872808036984, + "grad_norm": 0.5674195289611816, + "learning_rate": 1.6700000000000003e-06, + "loss": 2.4497, + "mean_token_accuracy": 0.47510121995583177, + "num_tokens": 24445534.0, + "step": 168 + }, + { + "entropy": 2.4825439453125, + "epoch": 0.0029097044669989585, + "grad_norm": 0.6282191276550293, + "learning_rate": 1.6800000000000002e-06, + "loss": 2.4444, + "mean_token_accuracy": 0.4739877316169441, + "num_tokens": 24568649.0, + "step": 169 + }, + { + "entropy": 2.4176025390625, + "epoch": 0.0029269216531942186, + "grad_norm": 0.6034136414527893, + "learning_rate": 1.6900000000000003e-06, + "loss": 2.3804, + "mean_token_accuracy": 0.48952830489724874, + "num_tokens": 24710505.0, + "step": 170 + }, + { + "entropy": 2.441650390625, + "epoch": 0.0029441388393894787, + "grad_norm": 0.579363226890564, + "learning_rate": 1.7000000000000002e-06, + "loss": 2.3715, + "mean_token_accuracy": 0.4852260472252965, + "num_tokens": 24860551.0, + "step": 171 + }, + { + "entropy": 2.4251708984375, + "epoch": 0.002961356025584739, + "grad_norm": 0.6410456895828247, + "learning_rate": 1.7100000000000004e-06, + "loss": 2.38, + "mean_token_accuracy": 0.4843108947388828, + "num_tokens": 24997729.0, + "step": 172 + }, + { + "entropy": 2.430419921875, + "epoch": 0.002978573211779999, + "grad_norm": 0.5755802392959595, + "learning_rate": 1.72e-06, + "loss": 2.3886, + "mean_token_accuracy": 0.4809368369169533, + "num_tokens": 25140050.0, + "step": 173 + }, + { + "entropy": 2.485595703125, + "epoch": 0.002995790397975259, + "grad_norm": 0.606526792049408, + "learning_rate": 1.73e-06, + "loss": 2.4497, + "mean_token_accuracy": 0.48244015080854297, + "num_tokens": 25280791.0, + "step": 174 + }, + { + "entropy": 2.48046875, + "epoch": 0.003013007584170519, + "grad_norm": 0.6201883554458618, + "learning_rate": 1.74e-06, + "loss": 2.4686, + "mean_token_accuracy": 0.4720367449335754, + "num_tokens": 25429327.0, + "step": 175 + }, + { + "entropy": 2.40380859375, + "epoch": 0.003030224770365779, + "grad_norm": 0.6322990655899048, + "learning_rate": 1.75e-06, + "loss": 2.3775, + "mean_token_accuracy": 0.48563892720267177, + "num_tokens": 25556902.0, + "step": 176 + }, + { + "entropy": 2.4178466796875, + "epoch": 0.0030474419565610393, + "grad_norm": 0.5754444003105164, + "learning_rate": 1.76e-06, + "loss": 2.3881, + "mean_token_accuracy": 0.4864646405912936, + "num_tokens": 25706350.0, + "step": 177 + }, + { + "entropy": 2.4537353515625, + "epoch": 0.0030646591427562994, + "grad_norm": 0.5913891792297363, + "learning_rate": 1.77e-06, + "loss": 2.4168, + "mean_token_accuracy": 0.47908906172960997, + "num_tokens": 25845686.0, + "step": 178 + }, + { + "entropy": 2.442138671875, + "epoch": 0.0030818763289515595, + "grad_norm": 0.5810282826423645, + "learning_rate": 1.7800000000000001e-06, + "loss": 2.3687, + "mean_token_accuracy": 0.48843948962166905, + "num_tokens": 25991169.0, + "step": 179 + }, + { + "entropy": 2.430908203125, + "epoch": 0.0030990935151468195, + "grad_norm": 0.59124356508255, + "learning_rate": 1.79e-06, + "loss": 2.345, + "mean_token_accuracy": 0.49094062810763717, + "num_tokens": 26137613.0, + "step": 180 + }, + { + "entropy": 2.43798828125, + "epoch": 0.0031163107013420796, + "grad_norm": 0.5589233636856079, + "learning_rate": 1.8000000000000001e-06, + "loss": 2.3647, + "mean_token_accuracy": 0.47995236283168197, + "num_tokens": 26304122.0, + "step": 181 + }, + { + "entropy": 2.43017578125, + "epoch": 0.0031335278875373397, + "grad_norm": 0.6091126203536987, + "learning_rate": 1.81e-06, + "loss": 2.3233, + "mean_token_accuracy": 0.49118568608537316, + "num_tokens": 26439950.0, + "step": 182 + }, + { + "entropy": 2.39697265625, + "epoch": 0.0031507450737326, + "grad_norm": 0.6027998328208923, + "learning_rate": 1.8200000000000002e-06, + "loss": 2.332, + "mean_token_accuracy": 0.4913685843348503, + "num_tokens": 26579726.0, + "step": 183 + }, + { + "entropy": 2.404296875, + "epoch": 0.00316796225992786, + "grad_norm": 0.5764933228492737, + "learning_rate": 1.83e-06, + "loss": 2.3598, + "mean_token_accuracy": 0.48573345225304365, + "num_tokens": 26724853.0, + "step": 184 + }, + { + "entropy": 2.4344482421875, + "epoch": 0.00318517944612312, + "grad_norm": 0.5961573123931885, + "learning_rate": 1.8400000000000002e-06, + "loss": 2.3914, + "mean_token_accuracy": 0.48558472096920013, + "num_tokens": 26863674.0, + "step": 185 + }, + { + "entropy": 2.446044921875, + "epoch": 0.00320239663231838, + "grad_norm": 0.570022702217102, + "learning_rate": 1.85e-06, + "loss": 2.4299, + "mean_token_accuracy": 0.48616287065669894, + "num_tokens": 27016137.0, + "step": 186 + }, + { + "entropy": 2.4267578125, + "epoch": 0.00321961381851364, + "grad_norm": 0.5620612502098083, + "learning_rate": 1.8600000000000002e-06, + "loss": 2.3965, + "mean_token_accuracy": 0.4951572152785957, + "num_tokens": 27168668.0, + "step": 187 + }, + { + "entropy": 2.5091552734375, + "epoch": 0.0032368310047089003, + "grad_norm": 0.5910755395889282, + "learning_rate": 1.87e-06, + "loss": 2.4964, + "mean_token_accuracy": 0.47218859009444714, + "num_tokens": 27303758.0, + "step": 188 + }, + { + "entropy": 2.41259765625, + "epoch": 0.0032540481909041604, + "grad_norm": 0.5817851424217224, + "learning_rate": 1.8800000000000002e-06, + "loss": 2.3692, + "mean_token_accuracy": 0.48497994616627693, + "num_tokens": 27442544.0, + "step": 189 + }, + { + "entropy": 2.4501953125, + "epoch": 0.0032712653770994205, + "grad_norm": 0.565247654914856, + "learning_rate": 1.8900000000000001e-06, + "loss": 2.4424, + "mean_token_accuracy": 0.4795730533078313, + "num_tokens": 27596457.0, + "step": 190 + }, + { + "entropy": 2.4776611328125, + "epoch": 0.0032884825632946806, + "grad_norm": 0.5709455609321594, + "learning_rate": 1.9000000000000002e-06, + "loss": 2.4314, + "mean_token_accuracy": 0.47365658916532993, + "num_tokens": 27743357.0, + "step": 191 + }, + { + "entropy": 2.5281982421875, + "epoch": 0.0033056997494899406, + "grad_norm": 0.629690945148468, + "learning_rate": 1.9100000000000003e-06, + "loss": 2.4736, + "mean_token_accuracy": 0.47410575672984123, + "num_tokens": 27876803.0, + "step": 192 + }, + { + "entropy": 2.4635009765625, + "epoch": 0.003322916935685201, + "grad_norm": 0.637241780757904, + "learning_rate": 1.9200000000000003e-06, + "loss": 2.4301, + "mean_token_accuracy": 0.48486198624596, + "num_tokens": 28028469.0, + "step": 193 + }, + { + "entropy": 2.486083984375, + "epoch": 0.0033401341218804613, + "grad_norm": 0.5691868662834167, + "learning_rate": 1.93e-06, + "loss": 2.437, + "mean_token_accuracy": 0.4730471963994205, + "num_tokens": 28169802.0, + "step": 194 + }, + { + "entropy": 2.43017578125, + "epoch": 0.0033573513080757214, + "grad_norm": 0.6118927597999573, + "learning_rate": 1.94e-06, + "loss": 2.3835, + "mean_token_accuracy": 0.4860619972459972, + "num_tokens": 28299767.0, + "step": 195 + }, + { + "entropy": 2.4813232421875, + "epoch": 0.0033745684942709814, + "grad_norm": 0.564987063407898, + "learning_rate": 1.9500000000000004e-06, + "loss": 2.4519, + "mean_token_accuracy": 0.4765043603256345, + "num_tokens": 28442455.0, + "step": 196 + }, + { + "entropy": 2.4971923828125, + "epoch": 0.0033917856804662415, + "grad_norm": 0.5519808530807495, + "learning_rate": 1.9600000000000003e-06, + "loss": 2.4937, + "mean_token_accuracy": 0.4625990390777588, + "num_tokens": 28585079.0, + "step": 197 + }, + { + "entropy": 2.464599609375, + "epoch": 0.0034090028666615016, + "grad_norm": 0.6350486874580383, + "learning_rate": 1.97e-06, + "loss": 2.4561, + "mean_token_accuracy": 0.4764754744246602, + "num_tokens": 28722871.0, + "step": 198 + }, + { + "entropy": 2.4639892578125, + "epoch": 0.0034262200528567617, + "grad_norm": 0.5707213878631592, + "learning_rate": 1.98e-06, + "loss": 2.4378, + "mean_token_accuracy": 0.48255998734384775, + "num_tokens": 28868682.0, + "step": 199 + }, + { + "entropy": 2.415771484375, + "epoch": 0.003443437239052022, + "grad_norm": 0.593158483505249, + "learning_rate": 1.9900000000000004e-06, + "loss": 2.3004, + "mean_token_accuracy": 0.49145969236269593, + "num_tokens": 29016713.0, + "step": 200 + }, + { + "entropy": 2.462890625, + "epoch": 0.003460654425247282, + "grad_norm": 0.6505548357963562, + "learning_rate": 2.0000000000000003e-06, + "loss": 2.3531, + "mean_token_accuracy": 0.48328409856185317, + "num_tokens": 29143923.0, + "step": 201 + }, + { + "entropy": 2.3577880859375, + "epoch": 0.003477871611442542, + "grad_norm": 0.6153655052185059, + "learning_rate": 2.0100000000000002e-06, + "loss": 2.3246, + "mean_token_accuracy": 0.4954985845834017, + "num_tokens": 29285239.0, + "step": 202 + }, + { + "entropy": 2.51025390625, + "epoch": 0.003495088797637802, + "grad_norm": 0.5901440978050232, + "learning_rate": 2.02e-06, + "loss": 2.4444, + "mean_token_accuracy": 0.47322959266602993, + "num_tokens": 29423789.0, + "step": 203 + }, + { + "entropy": 2.473388671875, + "epoch": 0.003512305983833062, + "grad_norm": 0.6093935370445251, + "learning_rate": 2.0300000000000005e-06, + "loss": 2.4143, + "mean_token_accuracy": 0.4813940548337996, + "num_tokens": 29571521.0, + "step": 204 + }, + { + "entropy": 2.423828125, + "epoch": 0.0035295231700283223, + "grad_norm": 0.5486546754837036, + "learning_rate": 2.04e-06, + "loss": 2.3655, + "mean_token_accuracy": 0.4821597971022129, + "num_tokens": 29726314.0, + "step": 205 + }, + { + "entropy": 2.4498291015625, + "epoch": 0.0035467403562235824, + "grad_norm": 0.57234126329422, + "learning_rate": 2.05e-06, + "loss": 2.425, + "mean_token_accuracy": 0.47626253589987755, + "num_tokens": 29880000.0, + "step": 206 + }, + { + "entropy": 2.416015625, + "epoch": 0.0035639575424188425, + "grad_norm": 0.561396062374115, + "learning_rate": 2.06e-06, + "loss": 2.3439, + "mean_token_accuracy": 0.4864202947355807, + "num_tokens": 30027628.0, + "step": 207 + }, + { + "entropy": 2.4002685546875, + "epoch": 0.0035811747286141025, + "grad_norm": 0.569572389125824, + "learning_rate": 2.07e-06, + "loss": 2.3673, + "mean_token_accuracy": 0.4875691821798682, + "num_tokens": 30169974.0, + "step": 208 + }, + { + "entropy": 2.455078125, + "epoch": 0.0035983919148093626, + "grad_norm": 0.5601251721382141, + "learning_rate": 2.08e-06, + "loss": 2.4015, + "mean_token_accuracy": 0.4847674574702978, + "num_tokens": 30317198.0, + "step": 209 + }, + { + "entropy": 2.53271484375, + "epoch": 0.0036156091010046227, + "grad_norm": 0.5616832375526428, + "learning_rate": 2.09e-06, + "loss": 2.4877, + "mean_token_accuracy": 0.47258848743513227, + "num_tokens": 30460614.0, + "step": 210 + }, + { + "entropy": 2.454345703125, + "epoch": 0.003632826287199883, + "grad_norm": 0.5962870717048645, + "learning_rate": 2.1000000000000002e-06, + "loss": 2.4077, + "mean_token_accuracy": 0.47984306002035737, + "num_tokens": 30595270.0, + "step": 211 + }, + { + "entropy": 2.414306640625, + "epoch": 0.003650043473395143, + "grad_norm": 0.5458055734634399, + "learning_rate": 2.11e-06, + "loss": 2.349, + "mean_token_accuracy": 0.48367989249527454, + "num_tokens": 30754164.0, + "step": 212 + }, + { + "entropy": 2.3519287109375, + "epoch": 0.003667260659590403, + "grad_norm": 0.6218618154525757, + "learning_rate": 2.12e-06, + "loss": 2.2976, + "mean_token_accuracy": 0.49590098252519965, + "num_tokens": 30910624.0, + "step": 213 + }, + { + "entropy": 2.4219970703125, + "epoch": 0.003684477845785663, + "grad_norm": 0.6083913445472717, + "learning_rate": 2.13e-06, + "loss": 2.3687, + "mean_token_accuracy": 0.4871506607159972, + "num_tokens": 31045412.0, + "step": 214 + }, + { + "entropy": 2.3848876953125, + "epoch": 0.003701695031980923, + "grad_norm": 0.5669545531272888, + "learning_rate": 2.1400000000000003e-06, + "loss": 2.3481, + "mean_token_accuracy": 0.4880279768258333, + "num_tokens": 31198584.0, + "step": 215 + }, + { + "entropy": 2.497314453125, + "epoch": 0.0037189122181761833, + "grad_norm": 0.5737752318382263, + "learning_rate": 2.15e-06, + "loss": 2.4598, + "mean_token_accuracy": 0.47354970779269934, + "num_tokens": 31341345.0, + "step": 216 + }, + { + "entropy": 2.4847412109375, + "epoch": 0.0037361294043714434, + "grad_norm": 0.5978350639343262, + "learning_rate": 2.16e-06, + "loss": 2.4907, + "mean_token_accuracy": 0.4729965156875551, + "num_tokens": 31503528.0, + "step": 217 + }, + { + "entropy": 2.4783935546875, + "epoch": 0.003753346590566704, + "grad_norm": 0.5699813961982727, + "learning_rate": 2.17e-06, + "loss": 2.3885, + "mean_token_accuracy": 0.4826196124777198, + "num_tokens": 31641779.0, + "step": 218 + }, + { + "entropy": 2.436279296875, + "epoch": 0.003770563776761964, + "grad_norm": 0.5500178337097168, + "learning_rate": 2.1800000000000003e-06, + "loss": 2.4012, + "mean_token_accuracy": 0.4823254165239632, + "num_tokens": 31811688.0, + "step": 219 + }, + { + "entropy": 2.431884765625, + "epoch": 0.003787780962957224, + "grad_norm": 0.5497094392776489, + "learning_rate": 2.19e-06, + "loss": 2.4188, + "mean_token_accuracy": 0.48219793336465955, + "num_tokens": 31964463.0, + "step": 220 + }, + { + "entropy": 2.4559326171875, + "epoch": 0.003804998149152484, + "grad_norm": 0.5953599810600281, + "learning_rate": 2.2e-06, + "loss": 2.4074, + "mean_token_accuracy": 0.4810604937374592, + "num_tokens": 32106174.0, + "step": 221 + }, + { + "entropy": 2.4549560546875, + "epoch": 0.0038222153353477443, + "grad_norm": 0.5802156329154968, + "learning_rate": 2.21e-06, + "loss": 2.401, + "mean_token_accuracy": 0.4832776212133467, + "num_tokens": 32240256.0, + "step": 222 + }, + { + "entropy": 2.5172119140625, + "epoch": 0.0038394325215430044, + "grad_norm": 0.7379570007324219, + "learning_rate": 2.2200000000000003e-06, + "loss": 2.4759, + "mean_token_accuracy": 0.47404406825080514, + "num_tokens": 32384376.0, + "step": 223 + }, + { + "entropy": 2.4268798828125, + "epoch": 0.0038566497077382644, + "grad_norm": 0.535229504108429, + "learning_rate": 2.2300000000000002e-06, + "loss": 2.3825, + "mean_token_accuracy": 0.48500062711536884, + "num_tokens": 32547276.0, + "step": 224 + }, + { + "entropy": 2.4552001953125, + "epoch": 0.0038738668939335245, + "grad_norm": 0.5618354082107544, + "learning_rate": 2.24e-06, + "loss": 2.4164, + "mean_token_accuracy": 0.4798359959386289, + "num_tokens": 32695574.0, + "step": 225 + }, + { + "entropy": 2.491455078125, + "epoch": 0.0038910840801287846, + "grad_norm": 0.5642921924591064, + "learning_rate": 2.25e-06, + "loss": 2.4474, + "mean_token_accuracy": 0.4762350879609585, + "num_tokens": 32836847.0, + "step": 226 + }, + { + "entropy": 2.415283203125, + "epoch": 0.003908301266324045, + "grad_norm": 0.614703893661499, + "learning_rate": 2.2600000000000004e-06, + "loss": 2.3591, + "mean_token_accuracy": 0.48611282790079713, + "num_tokens": 32970328.0, + "step": 227 + }, + { + "entropy": 2.4599609375, + "epoch": 0.003925518452519304, + "grad_norm": 0.5725888013839722, + "learning_rate": 2.2700000000000003e-06, + "loss": 2.4113, + "mean_token_accuracy": 0.4802963142283261, + "num_tokens": 33112724.0, + "step": 228 + }, + { + "entropy": 2.4117431640625, + "epoch": 0.003942735638714565, + "grad_norm": 0.6599273681640625, + "learning_rate": 2.28e-06, + "loss": 2.3471, + "mean_token_accuracy": 0.4909554719924927, + "num_tokens": 33250365.0, + "step": 229 + }, + { + "entropy": 2.417236328125, + "epoch": 0.0039599528249098246, + "grad_norm": 0.5717114806175232, + "learning_rate": 2.29e-06, + "loss": 2.3673, + "mean_token_accuracy": 0.4863877324387431, + "num_tokens": 33402602.0, + "step": 230 + }, + { + "entropy": 2.4541015625, + "epoch": 0.003977170011105085, + "grad_norm": 0.5801478028297424, + "learning_rate": 2.3000000000000004e-06, + "loss": 2.4395, + "mean_token_accuracy": 0.47902765218168497, + "num_tokens": 33544113.0, + "step": 231 + }, + { + "entropy": 2.454345703125, + "epoch": 0.003994387197300346, + "grad_norm": 0.6313830614089966, + "learning_rate": 2.3100000000000003e-06, + "loss": 2.368, + "mean_token_accuracy": 0.48253711173310876, + "num_tokens": 33693199.0, + "step": 232 + }, + { + "entropy": 2.411376953125, + "epoch": 0.004011604383495605, + "grad_norm": 0.5773042440414429, + "learning_rate": 2.3200000000000002e-06, + "loss": 2.3673, + "mean_token_accuracy": 0.4857380697503686, + "num_tokens": 33834160.0, + "step": 233 + }, + { + "entropy": 2.4261474609375, + "epoch": 0.004028821569690866, + "grad_norm": 0.6097912788391113, + "learning_rate": 2.33e-06, + "loss": 2.4526, + "mean_token_accuracy": 0.4798373435623944, + "num_tokens": 33966216.0, + "step": 234 + }, + { + "entropy": 2.47802734375, + "epoch": 0.0040460387558861255, + "grad_norm": 0.5512007474899292, + "learning_rate": 2.3400000000000005e-06, + "loss": 2.3984, + "mean_token_accuracy": 0.47808168828487396, + "num_tokens": 34113686.0, + "step": 235 + }, + { + "entropy": 2.4539794921875, + "epoch": 0.004063255942081386, + "grad_norm": 0.5286883115768433, + "learning_rate": 2.35e-06, + "loss": 2.412, + "mean_token_accuracy": 0.48627515137195587, + "num_tokens": 34282616.0, + "step": 236 + }, + { + "entropy": 2.4539794921875, + "epoch": 0.004080473128276646, + "grad_norm": 0.564912736415863, + "learning_rate": 2.3600000000000003e-06, + "loss": 2.3459, + "mean_token_accuracy": 0.48682177206501365, + "num_tokens": 34436498.0, + "step": 237 + }, + { + "entropy": 2.4637451171875, + "epoch": 0.004097690314471906, + "grad_norm": 0.6081207990646362, + "learning_rate": 2.37e-06, + "loss": 2.4327, + "mean_token_accuracy": 0.4862173437140882, + "num_tokens": 34579113.0, + "step": 238 + }, + { + "entropy": 2.3974609375, + "epoch": 0.004114907500667166, + "grad_norm": 0.5634726285934448, + "learning_rate": 2.38e-06, + "loss": 2.3362, + "mean_token_accuracy": 0.49311843886971474, + "num_tokens": 34723287.0, + "step": 239 + }, + { + "entropy": 2.3797607421875, + "epoch": 0.004132124686862426, + "grad_norm": 0.5402041673660278, + "learning_rate": 2.39e-06, + "loss": 2.3051, + "mean_token_accuracy": 0.4971098625101149, + "num_tokens": 34879330.0, + "step": 240 + }, + { + "entropy": 2.4957275390625, + "epoch": 0.004149341873057686, + "grad_norm": 0.6134316325187683, + "learning_rate": 2.4000000000000003e-06, + "loss": 2.4642, + "mean_token_accuracy": 0.4705090904608369, + "num_tokens": 35007660.0, + "step": 241 + }, + { + "entropy": 2.5213623046875, + "epoch": 0.0041665590592529465, + "grad_norm": 0.5660735368728638, + "learning_rate": 2.4100000000000002e-06, + "loss": 2.503, + "mean_token_accuracy": 0.4669585754163563, + "num_tokens": 35155284.0, + "step": 242 + }, + { + "entropy": 2.4287109375, + "epoch": 0.004183776245448206, + "grad_norm": 0.5641495585441589, + "learning_rate": 2.42e-06, + "loss": 2.411, + "mean_token_accuracy": 0.4803205137141049, + "num_tokens": 35306435.0, + "step": 243 + }, + { + "entropy": 2.47705078125, + "epoch": 0.004200993431643467, + "grad_norm": 0.5801253914833069, + "learning_rate": 2.43e-06, + "loss": 2.4781, + "mean_token_accuracy": 0.47467044508084655, + "num_tokens": 35458606.0, + "step": 244 + }, + { + "entropy": 2.3427734375, + "epoch": 0.004218210617838726, + "grad_norm": 0.5665391683578491, + "learning_rate": 2.4400000000000004e-06, + "loss": 2.2603, + "mean_token_accuracy": 0.5001557641662657, + "num_tokens": 35599151.0, + "step": 245 + }, + { + "entropy": 2.4097900390625, + "epoch": 0.004235427804033987, + "grad_norm": 0.5967923998832703, + "learning_rate": 2.4500000000000003e-06, + "loss": 2.3661, + "mean_token_accuracy": 0.4944393504410982, + "num_tokens": 35753307.0, + "step": 246 + }, + { + "entropy": 2.3837890625, + "epoch": 0.0042526449902292466, + "grad_norm": 0.5936510562896729, + "learning_rate": 2.46e-06, + "loss": 2.3321, + "mean_token_accuracy": 0.49646031484007835, + "num_tokens": 35892908.0, + "step": 247 + }, + { + "entropy": 2.4852294921875, + "epoch": 0.004269862176424507, + "grad_norm": 0.5804598927497864, + "learning_rate": 2.47e-06, + "loss": 2.3769, + "mean_token_accuracy": 0.47953970776870847, + "num_tokens": 36042857.0, + "step": 248 + }, + { + "entropy": 2.4976806640625, + "epoch": 0.004287079362619767, + "grad_norm": 0.6185864806175232, + "learning_rate": 2.4800000000000004e-06, + "loss": 2.4387, + "mean_token_accuracy": 0.4731791294179857, + "num_tokens": 36194862.0, + "step": 249 + }, + { + "entropy": 2.4356689453125, + "epoch": 0.004304296548815027, + "grad_norm": 0.6290936470031738, + "learning_rate": 2.4900000000000003e-06, + "loss": 2.3616, + "mean_token_accuracy": 0.4869615500792861, + "num_tokens": 36347599.0, + "step": 250 + }, + { + "entropy": 2.4132080078125, + "epoch": 0.004321513735010287, + "grad_norm": 0.6308562755584717, + "learning_rate": 2.5e-06, + "loss": 2.355, + "mean_token_accuracy": 0.48826425010338426, + "num_tokens": 36506643.0, + "step": 251 + }, + { + "entropy": 2.4609375, + "epoch": 0.0043387309212055475, + "grad_norm": 0.5634401440620422, + "learning_rate": 2.51e-06, + "loss": 2.4154, + "mean_token_accuracy": 0.4824108784087002, + "num_tokens": 36652239.0, + "step": 252 + }, + { + "entropy": 2.360595703125, + "epoch": 0.004355948107400807, + "grad_norm": 0.574517548084259, + "learning_rate": 2.52e-06, + "loss": 2.3125, + "mean_token_accuracy": 0.49683515494689345, + "num_tokens": 36803342.0, + "step": 253 + }, + { + "entropy": 2.440185546875, + "epoch": 0.004373165293596068, + "grad_norm": 0.6065932512283325, + "learning_rate": 2.5300000000000003e-06, + "loss": 2.4206, + "mean_token_accuracy": 0.48550984309986234, + "num_tokens": 36942721.0, + "step": 254 + }, + { + "entropy": 2.485107421875, + "epoch": 0.004390382479791327, + "grad_norm": 0.8562254905700684, + "learning_rate": 2.5400000000000002e-06, + "loss": 2.4961, + "mean_token_accuracy": 0.47687816014513373, + "num_tokens": 37068241.0, + "step": 255 + }, + { + "entropy": 2.3944091796875, + "epoch": 0.004407599665986588, + "grad_norm": 0.547681987285614, + "learning_rate": 2.55e-06, + "loss": 2.3469, + "mean_token_accuracy": 0.49269043607637286, + "num_tokens": 37221499.0, + "step": 256 + }, + { + "entropy": 2.4814453125, + "epoch": 0.004424816852181848, + "grad_norm": 0.5839248299598694, + "learning_rate": 2.56e-06, + "loss": 2.4229, + "mean_token_accuracy": 0.47095031198114157, + "num_tokens": 37373368.0, + "step": 257 + }, + { + "entropy": 2.354736328125, + "epoch": 0.004442034038377108, + "grad_norm": 0.587020993232727, + "learning_rate": 2.5700000000000004e-06, + "loss": 2.3414, + "mean_token_accuracy": 0.4924391624517739, + "num_tokens": 37518300.0, + "step": 258 + }, + { + "entropy": 2.37255859375, + "epoch": 0.0044592512245723685, + "grad_norm": 0.5921294093132019, + "learning_rate": 2.5800000000000003e-06, + "loss": 2.3288, + "mean_token_accuracy": 0.4891428491100669, + "num_tokens": 37668963.0, + "step": 259 + }, + { + "entropy": 2.461181640625, + "epoch": 0.004476468410767628, + "grad_norm": 0.6068063378334045, + "learning_rate": 2.59e-06, + "loss": 2.4455, + "mean_token_accuracy": 0.47976093366742134, + "num_tokens": 37805995.0, + "step": 260 + }, + { + "entropy": 2.4947509765625, + "epoch": 0.004493685596962889, + "grad_norm": 0.6007203459739685, + "learning_rate": 2.6e-06, + "loss": 2.4731, + "mean_token_accuracy": 0.4695629784837365, + "num_tokens": 37953374.0, + "step": 261 + }, + { + "entropy": 2.4515380859375, + "epoch": 0.004510902783158148, + "grad_norm": 0.6087055206298828, + "learning_rate": 2.6100000000000004e-06, + "loss": 2.3859, + "mean_token_accuracy": 0.48078236635774374, + "num_tokens": 38090268.0, + "step": 262 + }, + { + "entropy": 2.405029296875, + "epoch": 0.004528119969353409, + "grad_norm": 0.6398684978485107, + "learning_rate": 2.6200000000000003e-06, + "loss": 2.3682, + "mean_token_accuracy": 0.4855854455381632, + "num_tokens": 38230536.0, + "step": 263 + }, + { + "entropy": 2.484375, + "epoch": 0.0045453371555486686, + "grad_norm": 0.5696126818656921, + "learning_rate": 2.6300000000000002e-06, + "loss": 2.4688, + "mean_token_accuracy": 0.47722396487370133, + "num_tokens": 38372741.0, + "step": 264 + }, + { + "entropy": 2.3275146484375, + "epoch": 0.004562554341743929, + "grad_norm": 0.5986506342887878, + "learning_rate": 2.64e-06, + "loss": 2.2601, + "mean_token_accuracy": 0.5040428307838738, + "num_tokens": 38510771.0, + "step": 265 + }, + { + "entropy": 2.4368896484375, + "epoch": 0.004579771527939189, + "grad_norm": 0.5909532904624939, + "learning_rate": 2.6500000000000005e-06, + "loss": 2.3905, + "mean_token_accuracy": 0.48478930070996284, + "num_tokens": 38656604.0, + "step": 266 + }, + { + "entropy": 2.4736328125, + "epoch": 0.004596988714134449, + "grad_norm": 0.604336142539978, + "learning_rate": 2.6600000000000004e-06, + "loss": 2.47, + "mean_token_accuracy": 0.47080791695043445, + "num_tokens": 38790002.0, + "step": 267 + }, + { + "entropy": 2.44287109375, + "epoch": 0.004614205900329709, + "grad_norm": 1.2783533334732056, + "learning_rate": 2.6700000000000003e-06, + "loss": 2.353, + "mean_token_accuracy": 0.4904564335010946, + "num_tokens": 38926583.0, + "step": 268 + }, + { + "entropy": 2.474365234375, + "epoch": 0.0046314230865249694, + "grad_norm": 0.6569477915763855, + "learning_rate": 2.68e-06, + "loss": 2.443, + "mean_token_accuracy": 0.47721053985878825, + "num_tokens": 39070295.0, + "step": 269 + }, + { + "entropy": 2.4146728515625, + "epoch": 0.004648640272720229, + "grad_norm": 0.6075790524482727, + "learning_rate": 2.6900000000000005e-06, + "loss": 2.3758, + "mean_token_accuracy": 0.4894418097101152, + "num_tokens": 39224913.0, + "step": 270 + }, + { + "entropy": 2.42333984375, + "epoch": 0.00466585745891549, + "grad_norm": 0.6399396657943726, + "learning_rate": 2.7000000000000004e-06, + "loss": 2.3292, + "mean_token_accuracy": 0.4931036913767457, + "num_tokens": 39357252.0, + "step": 271 + }, + { + "entropy": 2.407958984375, + "epoch": 0.004683074645110749, + "grad_norm": 0.5720593333244324, + "learning_rate": 2.7100000000000003e-06, + "loss": 2.3628, + "mean_token_accuracy": 0.4841322088614106, + "num_tokens": 39493624.0, + "step": 272 + }, + { + "entropy": 2.5147705078125, + "epoch": 0.00470029183130601, + "grad_norm": 0.6194799542427063, + "learning_rate": 2.7200000000000002e-06, + "loss": 2.4367, + "mean_token_accuracy": 0.4774662428535521, + "num_tokens": 39629017.0, + "step": 273 + }, + { + "entropy": 2.461181640625, + "epoch": 0.0047175090175012695, + "grad_norm": 0.5498484373092651, + "learning_rate": 2.7300000000000005e-06, + "loss": 2.4471, + "mean_token_accuracy": 0.4761566431261599, + "num_tokens": 39770313.0, + "step": 274 + }, + { + "entropy": 2.458984375, + "epoch": 0.00473472620369653, + "grad_norm": 0.6001895070075989, + "learning_rate": 2.7400000000000004e-06, + "loss": 2.4156, + "mean_token_accuracy": 0.47761010052636266, + "num_tokens": 39921189.0, + "step": 275 + }, + { + "entropy": 2.3966064453125, + "epoch": 0.00475194338989179, + "grad_norm": 0.555188000202179, + "learning_rate": 2.7500000000000004e-06, + "loss": 2.3301, + "mean_token_accuracy": 0.4860731796361506, + "num_tokens": 40076244.0, + "step": 276 + }, + { + "entropy": 2.404541015625, + "epoch": 0.00476916057608705, + "grad_norm": 0.5978058576583862, + "learning_rate": 2.7600000000000003e-06, + "loss": 2.3391, + "mean_token_accuracy": 0.49592722998932004, + "num_tokens": 40212416.0, + "step": 277 + }, + { + "entropy": 2.437744140625, + "epoch": 0.00478637776228231, + "grad_norm": 0.6279250383377075, + "learning_rate": 2.7700000000000006e-06, + "loss": 2.3922, + "mean_token_accuracy": 0.48303127055987716, + "num_tokens": 40349633.0, + "step": 278 + }, + { + "entropy": 2.50244140625, + "epoch": 0.00480359494847757, + "grad_norm": 0.5651630759239197, + "learning_rate": 2.7800000000000005e-06, + "loss": 2.4524, + "mean_token_accuracy": 0.47481566993519664, + "num_tokens": 40493744.0, + "step": 279 + }, + { + "entropy": 2.450439453125, + "epoch": 0.00482081213467283, + "grad_norm": 0.5598726272583008, + "learning_rate": 2.7900000000000004e-06, + "loss": 2.4624, + "mean_token_accuracy": 0.476602204144001, + "num_tokens": 40643755.0, + "step": 280 + }, + { + "entropy": 2.46826171875, + "epoch": 0.0048380293208680905, + "grad_norm": 0.5888033509254456, + "learning_rate": 2.8000000000000003e-06, + "loss": 2.4159, + "mean_token_accuracy": 0.4830831168219447, + "num_tokens": 40784807.0, + "step": 281 + }, + { + "entropy": 2.379638671875, + "epoch": 0.00485524650706335, + "grad_norm": 0.5815398693084717, + "learning_rate": 2.8100000000000006e-06, + "loss": 2.351, + "mean_token_accuracy": 0.49111180379986763, + "num_tokens": 40934466.0, + "step": 282 + }, + { + "entropy": 2.4080810546875, + "epoch": 0.004872463693258611, + "grad_norm": 0.5862876176834106, + "learning_rate": 2.82e-06, + "loss": 2.3539, + "mean_token_accuracy": 0.49174999026581645, + "num_tokens": 41069116.0, + "step": 283 + }, + { + "entropy": 2.40478515625, + "epoch": 0.004889680879453871, + "grad_norm": 0.5889655947685242, + "learning_rate": 2.83e-06, + "loss": 2.3569, + "mean_token_accuracy": 0.4852382456883788, + "num_tokens": 41201155.0, + "step": 284 + }, + { + "entropy": 2.4091796875, + "epoch": 0.004906898065649131, + "grad_norm": 0.6074087619781494, + "learning_rate": 2.84e-06, + "loss": 2.3607, + "mean_token_accuracy": 0.48959261691197753, + "num_tokens": 41332480.0, + "step": 285 + }, + { + "entropy": 2.451416015625, + "epoch": 0.0049241152518443914, + "grad_norm": 0.6012029051780701, + "learning_rate": 2.85e-06, + "loss": 2.4236, + "mean_token_accuracy": 0.486750605981797, + "num_tokens": 41480685.0, + "step": 286 + }, + { + "entropy": 2.412841796875, + "epoch": 0.004941332438039651, + "grad_norm": 0.6248612999916077, + "learning_rate": 2.86e-06, + "loss": 2.3435, + "mean_token_accuracy": 0.49190296651795506, + "num_tokens": 41623635.0, + "step": 287 + }, + { + "entropy": 2.4788818359375, + "epoch": 0.004958549624234912, + "grad_norm": 0.5725988745689392, + "learning_rate": 2.87e-06, + "loss": 2.4732, + "mean_token_accuracy": 0.47983552562072873, + "num_tokens": 41781312.0, + "step": 288 + }, + { + "entropy": 2.446533203125, + "epoch": 0.004975766810430171, + "grad_norm": 0.6067764163017273, + "learning_rate": 2.88e-06, + "loss": 2.3832, + "mean_token_accuracy": 0.48594531044363976, + "num_tokens": 41913646.0, + "step": 289 + }, + { + "entropy": 2.4150390625, + "epoch": 0.004992983996625432, + "grad_norm": 0.6115426421165466, + "learning_rate": 2.89e-06, + "loss": 2.3762, + "mean_token_accuracy": 0.4852985511533916, + "num_tokens": 42048939.0, + "step": 290 + }, + { + "entropy": 2.5048828125, + "epoch": 0.0050102011828206915, + "grad_norm": 0.5820481777191162, + "learning_rate": 2.9e-06, + "loss": 2.4655, + "mean_token_accuracy": 0.473651675041765, + "num_tokens": 42187764.0, + "step": 291 + }, + { + "entropy": 2.4854736328125, + "epoch": 0.005027418369015952, + "grad_norm": 0.5613899827003479, + "learning_rate": 2.91e-06, + "loss": 2.4199, + "mean_token_accuracy": 0.48045311588793993, + "num_tokens": 42335423.0, + "step": 292 + }, + { + "entropy": 2.472412109375, + "epoch": 0.005044635555211212, + "grad_norm": 0.5480543375015259, + "learning_rate": 2.92e-06, + "loss": 2.4305, + "mean_token_accuracy": 0.47681946912780404, + "num_tokens": 42489078.0, + "step": 293 + }, + { + "entropy": 2.3948974609375, + "epoch": 0.005061852741406472, + "grad_norm": 0.554536759853363, + "learning_rate": 2.93e-06, + "loss": 2.3142, + "mean_token_accuracy": 0.4870352731086314, + "num_tokens": 42638769.0, + "step": 294 + }, + { + "entropy": 2.4532470703125, + "epoch": 0.005079069927601732, + "grad_norm": 0.5379908084869385, + "learning_rate": 2.9400000000000002e-06, + "loss": 2.4019, + "mean_token_accuracy": 0.4769473564811051, + "num_tokens": 42784366.0, + "step": 295 + }, + { + "entropy": 2.442626953125, + "epoch": 0.005096287113796992, + "grad_norm": 0.5843129754066467, + "learning_rate": 2.95e-06, + "loss": 2.4068, + "mean_token_accuracy": 0.4839549297466874, + "num_tokens": 42931723.0, + "step": 296 + }, + { + "entropy": 2.4779052734375, + "epoch": 0.005113504299992252, + "grad_norm": 0.5816124081611633, + "learning_rate": 2.96e-06, + "loss": 2.4102, + "mean_token_accuracy": 0.47800721740350127, + "num_tokens": 43068668.0, + "step": 297 + }, + { + "entropy": 2.5050048828125, + "epoch": 0.0051307214861875125, + "grad_norm": 0.5763994455337524, + "learning_rate": 2.97e-06, + "loss": 2.4308, + "mean_token_accuracy": 0.47180112451314926, + "num_tokens": 43212264.0, + "step": 298 + }, + { + "entropy": 2.4345703125, + "epoch": 0.005147938672382772, + "grad_norm": 0.5906177759170532, + "learning_rate": 2.9800000000000003e-06, + "loss": 2.44, + "mean_token_accuracy": 0.47890339232981205, + "num_tokens": 43358872.0, + "step": 299 + }, + { + "entropy": 2.361328125, + "epoch": 0.005165155858578033, + "grad_norm": 0.6285274028778076, + "learning_rate": 2.99e-06, + "loss": 2.3159, + "mean_token_accuracy": 0.49288657307624817, + "num_tokens": 43505769.0, + "step": 300 + }, + { + "entropy": 2.418701171875, + "epoch": 0.005182373044773292, + "grad_norm": 0.5737318396568298, + "learning_rate": 3e-06, + "loss": 2.3905, + "mean_token_accuracy": 0.48132931999862194, + "num_tokens": 43655664.0, + "step": 301 + }, + { + "entropy": 2.40673828125, + "epoch": 0.005199590230968553, + "grad_norm": 0.6073116660118103, + "learning_rate": 3.01e-06, + "loss": 2.3336, + "mean_token_accuracy": 0.4943325803615153, + "num_tokens": 43792452.0, + "step": 302 + }, + { + "entropy": 2.3916015625, + "epoch": 0.0052168074171638126, + "grad_norm": 0.5638276934623718, + "learning_rate": 3.0200000000000003e-06, + "loss": 2.3604, + "mean_token_accuracy": 0.4889626274816692, + "num_tokens": 43950674.0, + "step": 303 + }, + { + "entropy": 2.45849609375, + "epoch": 0.005234024603359073, + "grad_norm": 0.5540956854820251, + "learning_rate": 3.0300000000000002e-06, + "loss": 2.4143, + "mean_token_accuracy": 0.47294150246307254, + "num_tokens": 44104574.0, + "step": 304 + }, + { + "entropy": 2.4903564453125, + "epoch": 0.005251241789554333, + "grad_norm": 0.6057919263839722, + "learning_rate": 3.04e-06, + "loss": 2.4804, + "mean_token_accuracy": 0.4774297019466758, + "num_tokens": 44268375.0, + "step": 305 + }, + { + "entropy": 2.42431640625, + "epoch": 0.005268458975749593, + "grad_norm": 0.5463052988052368, + "learning_rate": 3.05e-06, + "loss": 2.3637, + "mean_token_accuracy": 0.4869398158043623, + "num_tokens": 44421122.0, + "step": 306 + }, + { + "entropy": 2.4371337890625, + "epoch": 0.005285676161944853, + "grad_norm": 0.581701934337616, + "learning_rate": 3.0600000000000003e-06, + "loss": 2.3915, + "mean_token_accuracy": 0.4823682149872184, + "num_tokens": 44580278.0, + "step": 307 + }, + { + "entropy": 2.412841796875, + "epoch": 0.0053028933481401135, + "grad_norm": 0.6069723963737488, + "learning_rate": 3.0700000000000003e-06, + "loss": 2.3847, + "mean_token_accuracy": 0.4826384102925658, + "num_tokens": 44722347.0, + "step": 308 + }, + { + "entropy": 2.4246826171875, + "epoch": 0.005320110534335374, + "grad_norm": 0.6269805431365967, + "learning_rate": 3.08e-06, + "loss": 2.3817, + "mean_token_accuracy": 0.4870366188697517, + "num_tokens": 44859344.0, + "step": 309 + }, + { + "entropy": 2.5079345703125, + "epoch": 0.005337327720530634, + "grad_norm": 0.5604663491249084, + "learning_rate": 3.09e-06, + "loss": 2.4903, + "mean_token_accuracy": 0.4686946659348905, + "num_tokens": 45007467.0, + "step": 310 + }, + { + "entropy": 2.53173828125, + "epoch": 0.005354544906725894, + "grad_norm": 0.6524129509925842, + "learning_rate": 3.1000000000000004e-06, + "loss": 2.5078, + "mean_token_accuracy": 0.4713136567734182, + "num_tokens": 45151205.0, + "step": 311 + }, + { + "entropy": 2.4884033203125, + "epoch": 0.005371762092921154, + "grad_norm": 0.6845608353614807, + "learning_rate": 3.1100000000000003e-06, + "loss": 2.4559, + "mean_token_accuracy": 0.47201906703412533, + "num_tokens": 45283726.0, + "step": 312 + }, + { + "entropy": 2.4666748046875, + "epoch": 0.005388979279116414, + "grad_norm": 0.5663162469863892, + "learning_rate": 3.12e-06, + "loss": 2.4598, + "mean_token_accuracy": 0.4775085118599236, + "num_tokens": 45431733.0, + "step": 313 + }, + { + "entropy": 2.43603515625, + "epoch": 0.005406196465311674, + "grad_norm": 0.6073246002197266, + "learning_rate": 3.13e-06, + "loss": 2.3789, + "mean_token_accuracy": 0.48635248839855194, + "num_tokens": 45564227.0, + "step": 314 + }, + { + "entropy": 2.403564453125, + "epoch": 0.0054234136515069345, + "grad_norm": 0.6272099018096924, + "learning_rate": 3.1400000000000004e-06, + "loss": 2.3457, + "mean_token_accuracy": 0.49023490585386753, + "num_tokens": 45694763.0, + "step": 315 + }, + { + "entropy": 2.45947265625, + "epoch": 0.005440630837702194, + "grad_norm": 0.6056774258613586, + "learning_rate": 3.1500000000000003e-06, + "loss": 2.4134, + "mean_token_accuracy": 0.48205672251060605, + "num_tokens": 45822589.0, + "step": 316 + }, + { + "entropy": 2.479248046875, + "epoch": 0.005457848023897455, + "grad_norm": 0.5762467384338379, + "learning_rate": 3.1600000000000002e-06, + "loss": 2.4066, + "mean_token_accuracy": 0.474984189029783, + "num_tokens": 45963018.0, + "step": 317 + }, + { + "entropy": 2.45849609375, + "epoch": 0.005475065210092714, + "grad_norm": 0.5697450637817383, + "learning_rate": 3.17e-06, + "loss": 2.4098, + "mean_token_accuracy": 0.481595104560256, + "num_tokens": 46116597.0, + "step": 318 + }, + { + "entropy": 2.455322265625, + "epoch": 0.005492282396287975, + "grad_norm": 0.5741132497787476, + "learning_rate": 3.1800000000000005e-06, + "loss": 2.4058, + "mean_token_accuracy": 0.48317222855985165, + "num_tokens": 46269799.0, + "step": 319 + }, + { + "entropy": 2.4736328125, + "epoch": 0.0055094995824832346, + "grad_norm": 0.5584312677383423, + "learning_rate": 3.1900000000000004e-06, + "loss": 2.4214, + "mean_token_accuracy": 0.47542993212118745, + "num_tokens": 46413009.0, + "step": 320 + }, + { + "entropy": 2.345703125, + "epoch": 0.005526716768678495, + "grad_norm": 0.585615336894989, + "learning_rate": 3.2000000000000003e-06, + "loss": 2.3015, + "mean_token_accuracy": 0.5034409500658512, + "num_tokens": 46553428.0, + "step": 321 + }, + { + "entropy": 2.476318359375, + "epoch": 0.005543933954873755, + "grad_norm": 0.5882828235626221, + "learning_rate": 3.21e-06, + "loss": 2.4505, + "mean_token_accuracy": 0.4761179364286363, + "num_tokens": 46694050.0, + "step": 322 + }, + { + "entropy": 2.5, + "epoch": 0.005561151141069015, + "grad_norm": 0.5832546949386597, + "learning_rate": 3.2200000000000005e-06, + "loss": 2.4788, + "mean_token_accuracy": 0.4718233407475054, + "num_tokens": 46834894.0, + "step": 323 + }, + { + "entropy": 2.4697265625, + "epoch": 0.005578368327264275, + "grad_norm": 0.5746425986289978, + "learning_rate": 3.2300000000000004e-06, + "loss": 2.437, + "mean_token_accuracy": 0.473955764900893, + "num_tokens": 46971269.0, + "step": 324 + }, + { + "entropy": 2.493896484375, + "epoch": 0.0055955855134595354, + "grad_norm": 0.5796445608139038, + "learning_rate": 3.2400000000000003e-06, + "loss": 2.485, + "mean_token_accuracy": 0.4741158653050661, + "num_tokens": 47120409.0, + "step": 325 + }, + { + "entropy": 2.455078125, + "epoch": 0.005612802699654795, + "grad_norm": 0.5498818159103394, + "learning_rate": 3.2500000000000002e-06, + "loss": 2.4014, + "mean_token_accuracy": 0.48121104296296835, + "num_tokens": 47268071.0, + "step": 326 + }, + { + "entropy": 2.36669921875, + "epoch": 0.005630019885850056, + "grad_norm": 0.5759862065315247, + "learning_rate": 3.2600000000000006e-06, + "loss": 2.3247, + "mean_token_accuracy": 0.4973863451741636, + "num_tokens": 47417393.0, + "step": 327 + }, + { + "entropy": 2.4267578125, + "epoch": 0.005647237072045315, + "grad_norm": 0.5729436874389648, + "learning_rate": 3.2700000000000005e-06, + "loss": 2.3774, + "mean_token_accuracy": 0.48343247501179576, + "num_tokens": 47557540.0, + "step": 328 + }, + { + "entropy": 2.4736328125, + "epoch": 0.005664454258240576, + "grad_norm": 0.5836823582649231, + "learning_rate": 3.2800000000000004e-06, + "loss": 2.4552, + "mean_token_accuracy": 0.4818938923999667, + "num_tokens": 47698283.0, + "step": 329 + }, + { + "entropy": 2.47900390625, + "epoch": 0.0056816714444358355, + "grad_norm": 0.5725101828575134, + "learning_rate": 3.2900000000000003e-06, + "loss": 2.4444, + "mean_token_accuracy": 0.4746799021959305, + "num_tokens": 47841703.0, + "step": 330 + }, + { + "entropy": 2.480712890625, + "epoch": 0.005698888630631096, + "grad_norm": 0.573144257068634, + "learning_rate": 3.3000000000000006e-06, + "loss": 2.427, + "mean_token_accuracy": 0.47849088767543435, + "num_tokens": 47992998.0, + "step": 331 + }, + { + "entropy": 2.496337890625, + "epoch": 0.005716105816826356, + "grad_norm": 0.5374743938446045, + "learning_rate": 3.3100000000000005e-06, + "loss": 2.4603, + "mean_token_accuracy": 0.4686804707162082, + "num_tokens": 48155349.0, + "step": 332 + }, + { + "entropy": 2.3896484375, + "epoch": 0.005733323003021616, + "grad_norm": 0.5805611610412598, + "learning_rate": 3.3200000000000004e-06, + "loss": 2.3428, + "mean_token_accuracy": 0.49433091934770346, + "num_tokens": 48301544.0, + "step": 333 + }, + { + "entropy": 2.4912109375, + "epoch": 0.005750540189216877, + "grad_norm": 0.5827764868736267, + "learning_rate": 3.3300000000000003e-06, + "loss": 2.439, + "mean_token_accuracy": 0.4705883339047432, + "num_tokens": 48443261.0, + "step": 334 + }, + { + "entropy": 2.476318359375, + "epoch": 0.005767757375412136, + "grad_norm": 0.5864250063896179, + "learning_rate": 3.3400000000000006e-06, + "loss": 2.4157, + "mean_token_accuracy": 0.4782019346021116, + "num_tokens": 48581146.0, + "step": 335 + }, + { + "entropy": 2.538818359375, + "epoch": 0.005784974561607397, + "grad_norm": 0.55199134349823, + "learning_rate": 3.3500000000000005e-06, + "loss": 2.4757, + "mean_token_accuracy": 0.46869856445118785, + "num_tokens": 48736815.0, + "step": 336 + }, + { + "entropy": 2.4351806640625, + "epoch": 0.0058021917478026565, + "grad_norm": 0.574338436126709, + "learning_rate": 3.3600000000000004e-06, + "loss": 2.3759, + "mean_token_accuracy": 0.4860638175159693, + "num_tokens": 48897927.0, + "step": 337 + }, + { + "entropy": 2.4539794921875, + "epoch": 0.005819408933997917, + "grad_norm": 0.5789890289306641, + "learning_rate": 3.3700000000000003e-06, + "loss": 2.3891, + "mean_token_accuracy": 0.4830233994871378, + "num_tokens": 49041318.0, + "step": 338 + }, + { + "entropy": 2.3941650390625, + "epoch": 0.005836626120193177, + "grad_norm": 0.6387662887573242, + "learning_rate": 3.3800000000000007e-06, + "loss": 2.3505, + "mean_token_accuracy": 0.49095042794942856, + "num_tokens": 49176213.0, + "step": 339 + }, + { + "entropy": 2.441162109375, + "epoch": 0.005853843306388437, + "grad_norm": 0.6007640957832336, + "learning_rate": 3.3900000000000006e-06, + "loss": 2.3682, + "mean_token_accuracy": 0.4886678601615131, + "num_tokens": 49305639.0, + "step": 340 + }, + { + "entropy": 2.51904296875, + "epoch": 0.005871060492583697, + "grad_norm": 0.5937355160713196, + "learning_rate": 3.4000000000000005e-06, + "loss": 2.4649, + "mean_token_accuracy": 0.47522739693522453, + "num_tokens": 49445884.0, + "step": 341 + }, + { + "entropy": 2.5255126953125, + "epoch": 0.0058882776787789574, + "grad_norm": 0.6529424786567688, + "learning_rate": 3.4100000000000004e-06, + "loss": 2.4718, + "mean_token_accuracy": 0.4746531555429101, + "num_tokens": 49585678.0, + "step": 342 + }, + { + "entropy": 2.4620361328125, + "epoch": 0.005905494864974217, + "grad_norm": 0.5634227991104126, + "learning_rate": 3.4200000000000007e-06, + "loss": 2.4171, + "mean_token_accuracy": 0.47586130583658814, + "num_tokens": 49733283.0, + "step": 343 + }, + { + "entropy": 2.4451904296875, + "epoch": 0.005922712051169478, + "grad_norm": 0.5977884531021118, + "learning_rate": 3.4300000000000006e-06, + "loss": 2.3813, + "mean_token_accuracy": 0.47802019072696567, + "num_tokens": 49872559.0, + "step": 344 + }, + { + "entropy": 2.4005126953125, + "epoch": 0.005939929237364737, + "grad_norm": 0.5835461020469666, + "learning_rate": 3.44e-06, + "loss": 2.351, + "mean_token_accuracy": 0.4916755324229598, + "num_tokens": 50014503.0, + "step": 345 + }, + { + "entropy": 2.414306640625, + "epoch": 0.005957146423559998, + "grad_norm": 0.616673469543457, + "learning_rate": 3.45e-06, + "loss": 2.3877, + "mean_token_accuracy": 0.4860090627335012, + "num_tokens": 50167107.0, + "step": 346 + }, + { + "entropy": 2.537353515625, + "epoch": 0.0059743636097552575, + "grad_norm": 0.6396327018737793, + "learning_rate": 3.46e-06, + "loss": 2.4815, + "mean_token_accuracy": 0.4667340232990682, + "num_tokens": 50299354.0, + "step": 347 + }, + { + "entropy": 2.424560546875, + "epoch": 0.005991580795950518, + "grad_norm": 0.5635019540786743, + "learning_rate": 3.4700000000000002e-06, + "loss": 2.3938, + "mean_token_accuracy": 0.48614488868042827, + "num_tokens": 50448841.0, + "step": 348 + }, + { + "entropy": 2.446044921875, + "epoch": 0.006008797982145778, + "grad_norm": 0.5684136152267456, + "learning_rate": 3.48e-06, + "loss": 2.3895, + "mean_token_accuracy": 0.48185077449306846, + "num_tokens": 50591226.0, + "step": 349 + }, + { + "entropy": 2.4525146484375, + "epoch": 0.006026015168341038, + "grad_norm": 0.614551842212677, + "learning_rate": 3.49e-06, + "loss": 2.4292, + "mean_token_accuracy": 0.47555977012962103, + "num_tokens": 50736325.0, + "step": 350 + }, + { + "entropy": 2.432373046875, + "epoch": 0.006043232354536298, + "grad_norm": 0.5730347633361816, + "learning_rate": 3.5e-06, + "loss": 2.402, + "mean_token_accuracy": 0.4838436101563275, + "num_tokens": 50882008.0, + "step": 351 + }, + { + "entropy": 2.40478515625, + "epoch": 0.006060449540731558, + "grad_norm": 0.5943769812583923, + "learning_rate": 3.5100000000000003e-06, + "loss": 2.3651, + "mean_token_accuracy": 0.4828978287987411, + "num_tokens": 51022802.0, + "step": 352 + }, + { + "entropy": 2.4420166015625, + "epoch": 0.006077666726926818, + "grad_norm": 0.5597745180130005, + "learning_rate": 3.52e-06, + "loss": 2.3646, + "mean_token_accuracy": 0.4837341457605362, + "num_tokens": 51165806.0, + "step": 353 + }, + { + "entropy": 2.487060546875, + "epoch": 0.0060948839131220785, + "grad_norm": 0.5304551124572754, + "learning_rate": 3.53e-06, + "loss": 2.4595, + "mean_token_accuracy": 0.4816332710906863, + "num_tokens": 51325353.0, + "step": 354 + }, + { + "entropy": 2.4630126953125, + "epoch": 0.006112101099317338, + "grad_norm": 0.6318809390068054, + "learning_rate": 3.54e-06, + "loss": 2.4102, + "mean_token_accuracy": 0.4823563848622143, + "num_tokens": 51447642.0, + "step": 355 + }, + { + "entropy": 2.4342041015625, + "epoch": 0.006129318285512599, + "grad_norm": 0.571685791015625, + "learning_rate": 3.5500000000000003e-06, + "loss": 2.3455, + "mean_token_accuracy": 0.48439886793494225, + "num_tokens": 51593840.0, + "step": 356 + }, + { + "entropy": 2.42138671875, + "epoch": 0.006146535471707858, + "grad_norm": 0.6026922464370728, + "learning_rate": 3.5600000000000002e-06, + "loss": 2.3275, + "mean_token_accuracy": 0.4946357752196491, + "num_tokens": 51727430.0, + "step": 357 + }, + { + "entropy": 2.3724365234375, + "epoch": 0.006163752657903119, + "grad_norm": 0.584361732006073, + "learning_rate": 3.57e-06, + "loss": 2.3214, + "mean_token_accuracy": 0.48994710063561797, + "num_tokens": 51873095.0, + "step": 358 + }, + { + "entropy": 2.47265625, + "epoch": 0.0061809698440983794, + "grad_norm": 0.5443347692489624, + "learning_rate": 3.58e-06, + "loss": 2.4081, + "mean_token_accuracy": 0.4771101255901158, + "num_tokens": 52022705.0, + "step": 359 + }, + { + "entropy": 2.44677734375, + "epoch": 0.006198187030293639, + "grad_norm": 0.5630244016647339, + "learning_rate": 3.5900000000000004e-06, + "loss": 2.4228, + "mean_token_accuracy": 0.4784181611612439, + "num_tokens": 52168321.0, + "step": 360 + }, + { + "entropy": 2.467529296875, + "epoch": 0.0062154042164889, + "grad_norm": 0.5848613977432251, + "learning_rate": 3.6000000000000003e-06, + "loss": 2.4734, + "mean_token_accuracy": 0.47760110441595316, + "num_tokens": 52321812.0, + "step": 361 + }, + { + "entropy": 2.431396484375, + "epoch": 0.006232621402684159, + "grad_norm": 0.5731722712516785, + "learning_rate": 3.61e-06, + "loss": 2.3849, + "mean_token_accuracy": 0.48051671124994755, + "num_tokens": 52477181.0, + "step": 362 + }, + { + "entropy": 2.435546875, + "epoch": 0.00624983858887942, + "grad_norm": 0.5230873227119446, + "learning_rate": 3.62e-06, + "loss": 2.361, + "mean_token_accuracy": 0.4862850420176983, + "num_tokens": 52638308.0, + "step": 363 + }, + { + "entropy": 2.3856201171875, + "epoch": 0.0062670557750746795, + "grad_norm": 0.5921328067779541, + "learning_rate": 3.6300000000000004e-06, + "loss": 2.3105, + "mean_token_accuracy": 0.496675749309361, + "num_tokens": 52778098.0, + "step": 364 + }, + { + "entropy": 2.486083984375, + "epoch": 0.00628427296126994, + "grad_norm": 0.5676338076591492, + "learning_rate": 3.6400000000000003e-06, + "loss": 2.4375, + "mean_token_accuracy": 0.4777340483851731, + "num_tokens": 52926007.0, + "step": 365 + }, + { + "entropy": 2.5103759765625, + "epoch": 0.0063014901474652, + "grad_norm": 0.5867769718170166, + "learning_rate": 3.65e-06, + "loss": 2.486, + "mean_token_accuracy": 0.47535310545936227, + "num_tokens": 53067977.0, + "step": 366 + }, + { + "entropy": 2.443115234375, + "epoch": 0.00631870733366046, + "grad_norm": 0.5718576312065125, + "learning_rate": 3.66e-06, + "loss": 2.4122, + "mean_token_accuracy": 0.48204088117927313, + "num_tokens": 53214899.0, + "step": 367 + }, + { + "entropy": 2.4334716796875, + "epoch": 0.00633592451985572, + "grad_norm": 0.5422816872596741, + "learning_rate": 3.6700000000000004e-06, + "loss": 2.4062, + "mean_token_accuracy": 0.48176093958318233, + "num_tokens": 53376856.0, + "step": 368 + }, + { + "entropy": 2.44580078125, + "epoch": 0.00635314170605098, + "grad_norm": 0.5711188912391663, + "learning_rate": 3.6800000000000003e-06, + "loss": 2.393, + "mean_token_accuracy": 0.48084082640707493, + "num_tokens": 53512787.0, + "step": 369 + }, + { + "entropy": 2.394287109375, + "epoch": 0.00637035889224624, + "grad_norm": 0.5846443772315979, + "learning_rate": 3.6900000000000002e-06, + "loss": 2.3919, + "mean_token_accuracy": 0.49098140047863126, + "num_tokens": 53658651.0, + "step": 370 + }, + { + "entropy": 2.4273681640625, + "epoch": 0.0063875760784415005, + "grad_norm": 0.5670855641365051, + "learning_rate": 3.7e-06, + "loss": 2.3645, + "mean_token_accuracy": 0.48496190132573247, + "num_tokens": 53799071.0, + "step": 371 + }, + { + "entropy": 2.559814453125, + "epoch": 0.00640479326463676, + "grad_norm": 0.5919318199157715, + "learning_rate": 3.7100000000000005e-06, + "loss": 2.5396, + "mean_token_accuracy": 0.46628884179517627, + "num_tokens": 53932932.0, + "step": 372 + }, + { + "entropy": 2.4007568359375, + "epoch": 0.006422010450832021, + "grad_norm": 0.5672026872634888, + "learning_rate": 3.7200000000000004e-06, + "loss": 2.3322, + "mean_token_accuracy": 0.48820391669869423, + "num_tokens": 54086169.0, + "step": 373 + }, + { + "entropy": 2.3978271484375, + "epoch": 0.00643922763702728, + "grad_norm": 0.5757989883422852, + "learning_rate": 3.7300000000000003e-06, + "loss": 2.3164, + "mean_token_accuracy": 0.4853466097265482, + "num_tokens": 54241964.0, + "step": 374 + }, + { + "entropy": 2.44580078125, + "epoch": 0.006456444823222541, + "grad_norm": 0.562664806842804, + "learning_rate": 3.74e-06, + "loss": 2.4101, + "mean_token_accuracy": 0.47567897848784924, + "num_tokens": 54386084.0, + "step": 375 + }, + { + "entropy": 2.4031982421875, + "epoch": 0.0064736620094178006, + "grad_norm": 0.5849329829216003, + "learning_rate": 3.7500000000000005e-06, + "loss": 2.3482, + "mean_token_accuracy": 0.48964470298960805, + "num_tokens": 54523676.0, + "step": 376 + }, + { + "entropy": 2.4454345703125, + "epoch": 0.006490879195613061, + "grad_norm": 0.6109877824783325, + "learning_rate": 3.7600000000000004e-06, + "loss": 2.4117, + "mean_token_accuracy": 0.48503589117899537, + "num_tokens": 54662310.0, + "step": 377 + }, + { + "entropy": 2.4466552734375, + "epoch": 0.006508096381808321, + "grad_norm": 0.5700381398200989, + "learning_rate": 3.7700000000000003e-06, + "loss": 2.4555, + "mean_token_accuracy": 0.477643181104213, + "num_tokens": 54809183.0, + "step": 378 + }, + { + "entropy": 2.4498291015625, + "epoch": 0.006525313568003581, + "grad_norm": 0.5381008386611938, + "learning_rate": 3.7800000000000002e-06, + "loss": 2.4346, + "mean_token_accuracy": 0.47657692804932594, + "num_tokens": 54968533.0, + "step": 379 + }, + { + "entropy": 2.4263916015625, + "epoch": 0.006542530754198841, + "grad_norm": 0.5504759550094604, + "learning_rate": 3.79e-06, + "loss": 2.3587, + "mean_token_accuracy": 0.48776144767180085, + "num_tokens": 55117824.0, + "step": 380 + }, + { + "entropy": 2.4163818359375, + "epoch": 0.0065597479403941015, + "grad_norm": 0.5289463400840759, + "learning_rate": 3.8000000000000005e-06, + "loss": 2.3674, + "mean_token_accuracy": 0.48521781573072076, + "num_tokens": 55269929.0, + "step": 381 + }, + { + "entropy": 2.4097900390625, + "epoch": 0.006576965126589361, + "grad_norm": 0.55353182554245, + "learning_rate": 3.8100000000000004e-06, + "loss": 2.3691, + "mean_token_accuracy": 0.48239677073433995, + "num_tokens": 55423948.0, + "step": 382 + }, + { + "entropy": 2.5047607421875, + "epoch": 0.006594182312784622, + "grad_norm": 0.5586856603622437, + "learning_rate": 3.820000000000001e-06, + "loss": 2.4676, + "mean_token_accuracy": 0.47292688954621553, + "num_tokens": 55567887.0, + "step": 383 + }, + { + "entropy": 2.47802734375, + "epoch": 0.006611399498979881, + "grad_norm": 0.5976067781448364, + "learning_rate": 3.830000000000001e-06, + "loss": 2.4577, + "mean_token_accuracy": 0.4735474893823266, + "num_tokens": 55700170.0, + "step": 384 + }, + { + "entropy": 2.4603271484375, + "epoch": 0.006628616685175142, + "grad_norm": 0.5445942878723145, + "learning_rate": 3.8400000000000005e-06, + "loss": 2.4459, + "mean_token_accuracy": 0.4747904692776501, + "num_tokens": 55852413.0, + "step": 385 + }, + { + "entropy": 2.494873046875, + "epoch": 0.006645833871370402, + "grad_norm": 0.5769667625427246, + "learning_rate": 3.85e-06, + "loss": 2.4465, + "mean_token_accuracy": 0.4768332834355533, + "num_tokens": 56010312.0, + "step": 386 + }, + { + "entropy": 2.4306640625, + "epoch": 0.006663051057565662, + "grad_norm": 0.5875312685966492, + "learning_rate": 3.86e-06, + "loss": 2.3771, + "mean_token_accuracy": 0.4805051935836673, + "num_tokens": 56149079.0, + "step": 387 + }, + { + "entropy": 2.406494140625, + "epoch": 0.0066802682437609225, + "grad_norm": 0.5804882049560547, + "learning_rate": 3.87e-06, + "loss": 2.38, + "mean_token_accuracy": 0.48818252328783274, + "num_tokens": 56292210.0, + "step": 388 + }, + { + "entropy": 2.4931640625, + "epoch": 0.006697485429956182, + "grad_norm": 0.6065754890441895, + "learning_rate": 3.88e-06, + "loss": 2.5048, + "mean_token_accuracy": 0.47188919549807906, + "num_tokens": 56425512.0, + "step": 389 + }, + { + "entropy": 2.528076171875, + "epoch": 0.006714702616151443, + "grad_norm": 0.5695645213127136, + "learning_rate": 3.89e-06, + "loss": 2.4927, + "mean_token_accuracy": 0.47105303034186363, + "num_tokens": 56567538.0, + "step": 390 + }, + { + "entropy": 2.4716796875, + "epoch": 0.006731919802346702, + "grad_norm": 0.5461863279342651, + "learning_rate": 3.900000000000001e-06, + "loss": 2.4607, + "mean_token_accuracy": 0.47544049797579646, + "num_tokens": 56723718.0, + "step": 391 + }, + { + "entropy": 2.42236328125, + "epoch": 0.006749136988541963, + "grad_norm": 0.7229442000389099, + "learning_rate": 3.910000000000001e-06, + "loss": 2.322, + "mean_token_accuracy": 0.49011519411578774, + "num_tokens": 56881653.0, + "step": 392 + }, + { + "entropy": 2.4364013671875, + "epoch": 0.0067663541747372226, + "grad_norm": 0.5672553181648254, + "learning_rate": 3.920000000000001e-06, + "loss": 2.418, + "mean_token_accuracy": 0.4815367963165045, + "num_tokens": 57026712.0, + "step": 393 + }, + { + "entropy": 2.4490966796875, + "epoch": 0.006783571360932483, + "grad_norm": 0.6193339824676514, + "learning_rate": 3.9300000000000005e-06, + "loss": 2.324, + "mean_token_accuracy": 0.4879821529611945, + "num_tokens": 57154864.0, + "step": 394 + }, + { + "entropy": 2.424072265625, + "epoch": 0.006800788547127743, + "grad_norm": 0.6047675609588623, + "learning_rate": 3.94e-06, + "loss": 2.343, + "mean_token_accuracy": 0.49066451471298933, + "num_tokens": 57313059.0, + "step": 395 + }, + { + "entropy": 2.529052734375, + "epoch": 0.006818005733323003, + "grad_norm": 0.6238451600074768, + "learning_rate": 3.95e-06, + "loss": 2.5103, + "mean_token_accuracy": 0.4740439150482416, + "num_tokens": 57456573.0, + "step": 396 + }, + { + "entropy": 2.4873046875, + "epoch": 0.006835222919518263, + "grad_norm": 0.6354604363441467, + "learning_rate": 3.96e-06, + "loss": 2.4319, + "mean_token_accuracy": 0.475927259773016, + "num_tokens": 57601572.0, + "step": 397 + }, + { + "entropy": 2.527587890625, + "epoch": 0.0068524401057135234, + "grad_norm": 0.6518051624298096, + "learning_rate": 3.97e-06, + "loss": 2.4717, + "mean_token_accuracy": 0.4759511463344097, + "num_tokens": 57737985.0, + "step": 398 + }, + { + "entropy": 2.400146484375, + "epoch": 0.006869657291908783, + "grad_norm": 0.5659388899803162, + "learning_rate": 3.980000000000001e-06, + "loss": 2.3422, + "mean_token_accuracy": 0.4919633981771767, + "num_tokens": 57886268.0, + "step": 399 + }, + { + "entropy": 2.4190673828125, + "epoch": 0.006886874478104044, + "grad_norm": 0.6125981211662292, + "learning_rate": 3.990000000000001e-06, + "loss": 2.3588, + "mean_token_accuracy": 0.49498999677598476, + "num_tokens": 58020557.0, + "step": 400 + }, + { + "entropy": 2.4425048828125, + "epoch": 0.006904091664299303, + "grad_norm": 0.5755429267883301, + "learning_rate": 4.000000000000001e-06, + "loss": 2.4063, + "mean_token_accuracy": 0.4781268546357751, + "num_tokens": 58157132.0, + "step": 401 + }, + { + "entropy": 2.4521484375, + "epoch": 0.006921308850494564, + "grad_norm": 0.5530086159706116, + "learning_rate": 4.0100000000000006e-06, + "loss": 2.3836, + "mean_token_accuracy": 0.4830552595667541, + "num_tokens": 58306412.0, + "step": 402 + }, + { + "entropy": 2.4962158203125, + "epoch": 0.0069385260366898235, + "grad_norm": 0.6048458218574524, + "learning_rate": 4.0200000000000005e-06, + "loss": 2.4414, + "mean_token_accuracy": 0.4728825897909701, + "num_tokens": 58441138.0, + "step": 403 + }, + { + "entropy": 2.456298828125, + "epoch": 0.006955743222885084, + "grad_norm": 0.5381326079368591, + "learning_rate": 4.03e-06, + "loss": 2.4554, + "mean_token_accuracy": 0.474265918135643, + "num_tokens": 58601535.0, + "step": 404 + }, + { + "entropy": 2.4443359375, + "epoch": 0.006972960409080344, + "grad_norm": 0.5713069438934326, + "learning_rate": 4.04e-06, + "loss": 2.3849, + "mean_token_accuracy": 0.4821310769766569, + "num_tokens": 58732975.0, + "step": 405 + }, + { + "entropy": 2.421142578125, + "epoch": 0.006990177595275604, + "grad_norm": 0.6063797473907471, + "learning_rate": 4.05e-06, + "loss": 2.3477, + "mean_token_accuracy": 0.4892558893188834, + "num_tokens": 58863667.0, + "step": 406 + }, + { + "entropy": 2.4732666015625, + "epoch": 0.007007394781470864, + "grad_norm": 0.5941247940063477, + "learning_rate": 4.060000000000001e-06, + "loss": 2.4404, + "mean_token_accuracy": 0.4769346718676388, + "num_tokens": 59001984.0, + "step": 407 + }, + { + "entropy": 2.4593505859375, + "epoch": 0.007024611967666124, + "grad_norm": 0.5566405653953552, + "learning_rate": 4.07e-06, + "loss": 2.4064, + "mean_token_accuracy": 0.480907566845417, + "num_tokens": 59155387.0, + "step": 408 + }, + { + "entropy": 2.40966796875, + "epoch": 0.007041829153861384, + "grad_norm": 0.5935348868370056, + "learning_rate": 4.08e-06, + "loss": 2.3289, + "mean_token_accuracy": 0.49064510269090533, + "num_tokens": 59311521.0, + "step": 409 + }, + { + "entropy": 2.457275390625, + "epoch": 0.0070590463400566445, + "grad_norm": 0.581543505191803, + "learning_rate": 4.09e-06, + "loss": 2.4144, + "mean_token_accuracy": 0.4777227705344558, + "num_tokens": 59461508.0, + "step": 410 + }, + { + "entropy": 2.46240234375, + "epoch": 0.007076263526251905, + "grad_norm": 0.5600920915603638, + "learning_rate": 4.1e-06, + "loss": 2.4242, + "mean_token_accuracy": 0.47735275235027075, + "num_tokens": 59611309.0, + "step": 411 + }, + { + "entropy": 2.3861083984375, + "epoch": 0.007093480712447165, + "grad_norm": 0.645194411277771, + "learning_rate": 4.1100000000000005e-06, + "loss": 2.3315, + "mean_token_accuracy": 0.4974592626094818, + "num_tokens": 59739837.0, + "step": 412 + }, + { + "entropy": 2.42822265625, + "epoch": 0.007110697898642425, + "grad_norm": 0.5792311429977417, + "learning_rate": 4.12e-06, + "loss": 2.3767, + "mean_token_accuracy": 0.4854038432240486, + "num_tokens": 59874636.0, + "step": 413 + }, + { + "entropy": 2.527587890625, + "epoch": 0.007127915084837685, + "grad_norm": 0.5748518109321594, + "learning_rate": 4.13e-06, + "loss": 2.4767, + "mean_token_accuracy": 0.4697956978343427, + "num_tokens": 60006386.0, + "step": 414 + }, + { + "entropy": 2.4110107421875, + "epoch": 0.0071451322710329454, + "grad_norm": 0.5458788871765137, + "learning_rate": 4.14e-06, + "loss": 2.3364, + "mean_token_accuracy": 0.4888159199617803, + "num_tokens": 60161574.0, + "step": 415 + }, + { + "entropy": 2.4501953125, + "epoch": 0.007162349457228205, + "grad_norm": 0.5602844953536987, + "learning_rate": 4.15e-06, + "loss": 2.4069, + "mean_token_accuracy": 0.4802040630020201, + "num_tokens": 60315301.0, + "step": 416 + }, + { + "entropy": 2.420166015625, + "epoch": 0.007179566643423466, + "grad_norm": 0.5277621150016785, + "learning_rate": 4.16e-06, + "loss": 2.3738, + "mean_token_accuracy": 0.4825376900844276, + "num_tokens": 60475723.0, + "step": 417 + }, + { + "entropy": 2.371337890625, + "epoch": 0.007196783829618725, + "grad_norm": 0.6119918823242188, + "learning_rate": 4.17e-06, + "loss": 2.3183, + "mean_token_accuracy": 0.49627504125237465, + "num_tokens": 60624809.0, + "step": 418 + }, + { + "entropy": 2.43359375, + "epoch": 0.007214001015813986, + "grad_norm": 0.5752723217010498, + "learning_rate": 4.18e-06, + "loss": 2.4064, + "mean_token_accuracy": 0.48068576911464334, + "num_tokens": 60766487.0, + "step": 419 + }, + { + "entropy": 2.55322265625, + "epoch": 0.0072312182020092455, + "grad_norm": 0.591503918170929, + "learning_rate": 4.1900000000000005e-06, + "loss": 2.5353, + "mean_token_accuracy": 0.46583034889772534, + "num_tokens": 60899696.0, + "step": 420 + }, + { + "entropy": 2.482666015625, + "epoch": 0.007248435388204506, + "grad_norm": 0.584340512752533, + "learning_rate": 4.2000000000000004e-06, + "loss": 2.4115, + "mean_token_accuracy": 0.47717864625155926, + "num_tokens": 61036187.0, + "step": 421 + }, + { + "entropy": 2.4241943359375, + "epoch": 0.007265652574399766, + "grad_norm": 0.5605193376541138, + "learning_rate": 4.21e-06, + "loss": 2.4071, + "mean_token_accuracy": 0.4835399743169546, + "num_tokens": 61193032.0, + "step": 422 + }, + { + "entropy": 2.447021484375, + "epoch": 0.007282869760595026, + "grad_norm": 0.5565392374992371, + "learning_rate": 4.22e-06, + "loss": 2.3779, + "mean_token_accuracy": 0.479055879637599, + "num_tokens": 61336014.0, + "step": 423 + }, + { + "entropy": 2.4150390625, + "epoch": 0.007300086946790286, + "grad_norm": 0.5690925717353821, + "learning_rate": 4.23e-06, + "loss": 2.3403, + "mean_token_accuracy": 0.4891307670623064, + "num_tokens": 61482900.0, + "step": 424 + }, + { + "entropy": 2.3863525390625, + "epoch": 0.007317304132985546, + "grad_norm": 0.5693207383155823, + "learning_rate": 4.24e-06, + "loss": 2.3509, + "mean_token_accuracy": 0.4905586871318519, + "num_tokens": 61629105.0, + "step": 425 + }, + { + "entropy": 2.547119140625, + "epoch": 0.007334521319180806, + "grad_norm": 0.5351988077163696, + "learning_rate": 4.25e-06, + "loss": 2.4922, + "mean_token_accuracy": 0.4689330547116697, + "num_tokens": 61780626.0, + "step": 426 + }, + { + "entropy": 2.47998046875, + "epoch": 0.0073517385053760665, + "grad_norm": 0.5875465273857117, + "learning_rate": 4.26e-06, + "loss": 2.4184, + "mean_token_accuracy": 0.4823840647004545, + "num_tokens": 61915045.0, + "step": 427 + }, + { + "entropy": 2.479248046875, + "epoch": 0.007368955691571326, + "grad_norm": 0.5979102253913879, + "learning_rate": 4.270000000000001e-06, + "loss": 2.4622, + "mean_token_accuracy": 0.4727623569779098, + "num_tokens": 62050386.0, + "step": 428 + }, + { + "entropy": 2.4766845703125, + "epoch": 0.007386172877766587, + "grad_norm": 0.5433111190795898, + "learning_rate": 4.2800000000000005e-06, + "loss": 2.3951, + "mean_token_accuracy": 0.4800742859952152, + "num_tokens": 62212530.0, + "step": 429 + }, + { + "entropy": 2.4046630859375, + "epoch": 0.007403390063961846, + "grad_norm": 0.5350104570388794, + "learning_rate": 4.2900000000000004e-06, + "loss": 2.3617, + "mean_token_accuracy": 0.4864892913028598, + "num_tokens": 62370301.0, + "step": 430 + }, + { + "entropy": 2.4168701171875, + "epoch": 0.007420607250157107, + "grad_norm": 0.5780488848686218, + "learning_rate": 4.3e-06, + "loss": 2.3724, + "mean_token_accuracy": 0.48195584304630756, + "num_tokens": 62520205.0, + "step": 431 + }, + { + "entropy": 2.5218505859375, + "epoch": 0.0074378244363523666, + "grad_norm": 0.5742896795272827, + "learning_rate": 4.31e-06, + "loss": 2.4678, + "mean_token_accuracy": 0.47272730339318514, + "num_tokens": 62661467.0, + "step": 432 + }, + { + "entropy": 2.519775390625, + "epoch": 0.007455041622547627, + "grad_norm": 0.5965112447738647, + "learning_rate": 4.32e-06, + "loss": 2.5211, + "mean_token_accuracy": 0.46558596193790436, + "num_tokens": 62798987.0, + "step": 433 + }, + { + "entropy": 2.4462890625, + "epoch": 0.007472258808742887, + "grad_norm": 0.5964140295982361, + "learning_rate": 4.33e-06, + "loss": 2.3445, + "mean_token_accuracy": 0.4845765414647758, + "num_tokens": 62941022.0, + "step": 434 + }, + { + "entropy": 2.403076171875, + "epoch": 0.007489475994938147, + "grad_norm": 0.5478980541229248, + "learning_rate": 4.34e-06, + "loss": 2.3149, + "mean_token_accuracy": 0.4858265924267471, + "num_tokens": 63106699.0, + "step": 435 + }, + { + "entropy": 2.486328125, + "epoch": 0.007506693181133408, + "grad_norm": 0.5869223475456238, + "learning_rate": 4.350000000000001e-06, + "loss": 2.4946, + "mean_token_accuracy": 0.472647019661963, + "num_tokens": 63247500.0, + "step": 436 + }, + { + "entropy": 2.389404296875, + "epoch": 0.0075239103673286675, + "grad_norm": 0.583699107170105, + "learning_rate": 4.360000000000001e-06, + "loss": 2.38, + "mean_token_accuracy": 0.4875432150438428, + "num_tokens": 63397717.0, + "step": 437 + }, + { + "entropy": 2.4312744140625, + "epoch": 0.007541127553523928, + "grad_norm": 0.6965104341506958, + "learning_rate": 4.3700000000000005e-06, + "loss": 2.3725, + "mean_token_accuracy": 0.4779693940654397, + "num_tokens": 63532729.0, + "step": 438 + }, + { + "entropy": 2.510498046875, + "epoch": 0.007558344739719188, + "grad_norm": 0.5973443984985352, + "learning_rate": 4.38e-06, + "loss": 2.5009, + "mean_token_accuracy": 0.469982345122844, + "num_tokens": 63673621.0, + "step": 439 + }, + { + "entropy": 2.414794921875, + "epoch": 0.007575561925914448, + "grad_norm": 0.6061626672744751, + "learning_rate": 4.39e-06, + "loss": 2.3686, + "mean_token_accuracy": 0.4842392741702497, + "num_tokens": 63811912.0, + "step": 440 + }, + { + "entropy": 2.4686279296875, + "epoch": 0.007592779112109708, + "grad_norm": 0.5435473322868347, + "learning_rate": 4.4e-06, + "loss": 2.4623, + "mean_token_accuracy": 0.47148632165044546, + "num_tokens": 63969367.0, + "step": 441 + }, + { + "entropy": 2.380859375, + "epoch": 0.007609996298304968, + "grad_norm": 0.6280810832977295, + "learning_rate": 4.41e-06, + "loss": 2.3327, + "mean_token_accuracy": 0.4927673670463264, + "num_tokens": 64106475.0, + "step": 442 + }, + { + "entropy": 2.431884765625, + "epoch": 0.007627213484500228, + "grad_norm": 0.60051429271698, + "learning_rate": 4.42e-06, + "loss": 2.3611, + "mean_token_accuracy": 0.48384140338748693, + "num_tokens": 64240694.0, + "step": 443 + }, + { + "entropy": 2.482421875, + "epoch": 0.0076444306706954885, + "grad_norm": 0.5727265477180481, + "learning_rate": 4.430000000000001e-06, + "loss": 2.4215, + "mean_token_accuracy": 0.4743316164240241, + "num_tokens": 64379463.0, + "step": 444 + }, + { + "entropy": 2.4876708984375, + "epoch": 0.007661647856890748, + "grad_norm": 0.6397287845611572, + "learning_rate": 4.440000000000001e-06, + "loss": 2.4762, + "mean_token_accuracy": 0.47825985960662365, + "num_tokens": 64533020.0, + "step": 445 + }, + { + "entropy": 2.498046875, + "epoch": 0.007678865043086009, + "grad_norm": 0.5948851704597473, + "learning_rate": 4.450000000000001e-06, + "loss": 2.4374, + "mean_token_accuracy": 0.4723509643226862, + "num_tokens": 64669487.0, + "step": 446 + }, + { + "entropy": 2.45556640625, + "epoch": 0.007696082229281268, + "grad_norm": 0.5741977691650391, + "learning_rate": 4.4600000000000005e-06, + "loss": 2.4201, + "mean_token_accuracy": 0.4786538486368954, + "num_tokens": 64821796.0, + "step": 447 + }, + { + "entropy": 2.421875, + "epoch": 0.007713299415476529, + "grad_norm": 0.5873377323150635, + "learning_rate": 4.47e-06, + "loss": 2.3954, + "mean_token_accuracy": 0.48342282278463244, + "num_tokens": 64959422.0, + "step": 448 + }, + { + "entropy": 2.4324951171875, + "epoch": 0.0077305166016717886, + "grad_norm": 0.6045364737510681, + "learning_rate": 4.48e-06, + "loss": 2.378, + "mean_token_accuracy": 0.48433305928483605, + "num_tokens": 65103972.0, + "step": 449 + }, + { + "entropy": 2.5223388671875, + "epoch": 0.007747733787867049, + "grad_norm": 0.5803834795951843, + "learning_rate": 4.49e-06, + "loss": 2.4753, + "mean_token_accuracy": 0.469526968896389, + "num_tokens": 65240634.0, + "step": 450 + }, + { + "entropy": 2.517822265625, + "epoch": 0.007764950974062309, + "grad_norm": 0.5813508033752441, + "learning_rate": 4.5e-06, + "loss": 2.5203, + "mean_token_accuracy": 0.472479164134711, + "num_tokens": 65396335.0, + "step": 451 + }, + { + "entropy": 2.4752197265625, + "epoch": 0.007782168160257569, + "grad_norm": 0.8954483270645142, + "learning_rate": 4.510000000000001e-06, + "loss": 2.428, + "mean_token_accuracy": 0.4781260257586837, + "num_tokens": 65551212.0, + "step": 452 + }, + { + "entropy": 2.38916015625, + "epoch": 0.007799385346452829, + "grad_norm": 0.5381391644477844, + "learning_rate": 4.520000000000001e-06, + "loss": 2.3428, + "mean_token_accuracy": 0.4869826496578753, + "num_tokens": 65706488.0, + "step": 453 + }, + { + "entropy": 2.3837890625, + "epoch": 0.00781660253264809, + "grad_norm": 0.6232318878173828, + "learning_rate": 4.530000000000001e-06, + "loss": 2.3453, + "mean_token_accuracy": 0.4916538861580193, + "num_tokens": 65848771.0, + "step": 454 + }, + { + "entropy": 2.39892578125, + "epoch": 0.00783381971884335, + "grad_norm": 0.5655352473258972, + "learning_rate": 4.540000000000001e-06, + "loss": 2.3439, + "mean_token_accuracy": 0.48888020450249314, + "num_tokens": 66002377.0, + "step": 455 + }, + { + "entropy": 2.50732421875, + "epoch": 0.007851036905038609, + "grad_norm": 0.5934751033782959, + "learning_rate": 4.5500000000000005e-06, + "loss": 2.452, + "mean_token_accuracy": 0.4774050717242062, + "num_tokens": 66140992.0, + "step": 456 + }, + { + "entropy": 2.486572265625, + "epoch": 0.00786825409123387, + "grad_norm": 0.5821974873542786, + "learning_rate": 4.56e-06, + "loss": 2.4434, + "mean_token_accuracy": 0.47727885795757174, + "num_tokens": 66283307.0, + "step": 457 + }, + { + "entropy": 2.4403076171875, + "epoch": 0.00788547127742913, + "grad_norm": 0.6021990180015564, + "learning_rate": 4.57e-06, + "loss": 2.3891, + "mean_token_accuracy": 0.48405969655141234, + "num_tokens": 66430065.0, + "step": 458 + }, + { + "entropy": 2.4219970703125, + "epoch": 0.00790268846362439, + "grad_norm": 0.5609884858131409, + "learning_rate": 4.58e-06, + "loss": 2.3585, + "mean_token_accuracy": 0.4830049378797412, + "num_tokens": 66580625.0, + "step": 459 + }, + { + "entropy": 2.4365234375, + "epoch": 0.007919905649819649, + "grad_norm": 0.6351988911628723, + "learning_rate": 4.590000000000001e-06, + "loss": 2.3914, + "mean_token_accuracy": 0.4825670407153666, + "num_tokens": 66710819.0, + "step": 460 + }, + { + "entropy": 2.388916015625, + "epoch": 0.00793712283601491, + "grad_norm": 0.5797377824783325, + "learning_rate": 4.600000000000001e-06, + "loss": 2.3401, + "mean_token_accuracy": 0.49362843204289675, + "num_tokens": 66855860.0, + "step": 461 + }, + { + "entropy": 2.502685546875, + "epoch": 0.00795434002221017, + "grad_norm": 0.5734979510307312, + "learning_rate": 4.610000000000001e-06, + "loss": 2.475, + "mean_token_accuracy": 0.468968971632421, + "num_tokens": 66991905.0, + "step": 462 + }, + { + "entropy": 2.4248046875, + "epoch": 0.00797155720840543, + "grad_norm": 0.5948538184165955, + "learning_rate": 4.620000000000001e-06, + "loss": 2.3821, + "mean_token_accuracy": 0.49015933787450194, + "num_tokens": 67140600.0, + "step": 463 + }, + { + "entropy": 2.4041748046875, + "epoch": 0.007988774394600691, + "grad_norm": 0.5927978754043579, + "learning_rate": 4.6300000000000006e-06, + "loss": 2.3563, + "mean_token_accuracy": 0.4928512079641223, + "num_tokens": 67296282.0, + "step": 464 + }, + { + "entropy": 2.44189453125, + "epoch": 0.00800599158079595, + "grad_norm": 0.5638139247894287, + "learning_rate": 4.6400000000000005e-06, + "loss": 2.4232, + "mean_token_accuracy": 0.47663756739348173, + "num_tokens": 67445364.0, + "step": 465 + }, + { + "entropy": 2.440673828125, + "epoch": 0.00802320876699121, + "grad_norm": 0.6092103123664856, + "learning_rate": 4.65e-06, + "loss": 2.3328, + "mean_token_accuracy": 0.487199897877872, + "num_tokens": 67574552.0, + "step": 466 + }, + { + "entropy": 2.431396484375, + "epoch": 0.008040425953186471, + "grad_norm": 0.5823683738708496, + "learning_rate": 4.66e-06, + "loss": 2.3507, + "mean_token_accuracy": 0.4836606332100928, + "num_tokens": 67715593.0, + "step": 467 + }, + { + "entropy": 2.4288330078125, + "epoch": 0.008057643139381732, + "grad_norm": 0.571380615234375, + "learning_rate": 4.670000000000001e-06, + "loss": 2.376, + "mean_token_accuracy": 0.4848336656577885, + "num_tokens": 67851959.0, + "step": 468 + }, + { + "entropy": 2.4405517578125, + "epoch": 0.00807486032557699, + "grad_norm": 0.5497409105300903, + "learning_rate": 4.680000000000001e-06, + "loss": 2.3331, + "mean_token_accuracy": 0.49043108662590384, + "num_tokens": 67999315.0, + "step": 469 + }, + { + "entropy": 2.4580078125, + "epoch": 0.008092077511772251, + "grad_norm": 0.6131438612937927, + "learning_rate": 4.69e-06, + "loss": 2.4471, + "mean_token_accuracy": 0.48354413686320186, + "num_tokens": 68151679.0, + "step": 470 + }, + { + "entropy": 2.45361328125, + "epoch": 0.008109294697967511, + "grad_norm": 0.5572087168693542, + "learning_rate": 4.7e-06, + "loss": 2.408, + "mean_token_accuracy": 0.4743777387775481, + "num_tokens": 68303962.0, + "step": 471 + }, + { + "entropy": 2.4327392578125, + "epoch": 0.008126511884162772, + "grad_norm": 0.6229270696640015, + "learning_rate": 4.71e-06, + "loss": 2.4372, + "mean_token_accuracy": 0.4852800811640918, + "num_tokens": 68435424.0, + "step": 472 + }, + { + "entropy": 2.44580078125, + "epoch": 0.00814372907035803, + "grad_norm": 0.5797957181930542, + "learning_rate": 4.7200000000000005e-06, + "loss": 2.4088, + "mean_token_accuracy": 0.48404247080907226, + "num_tokens": 68579820.0, + "step": 473 + }, + { + "entropy": 2.49560546875, + "epoch": 0.008160946256553291, + "grad_norm": 0.5527267456054688, + "learning_rate": 4.7300000000000005e-06, + "loss": 2.3781, + "mean_token_accuracy": 0.47309215646237135, + "num_tokens": 68720632.0, + "step": 474 + }, + { + "entropy": 2.4736328125, + "epoch": 0.008178163442748552, + "grad_norm": 0.6314601302146912, + "learning_rate": 4.74e-06, + "loss": 2.4292, + "mean_token_accuracy": 0.47931129625067115, + "num_tokens": 68847385.0, + "step": 475 + }, + { + "entropy": 2.43603515625, + "epoch": 0.008195380628943812, + "grad_norm": 0.5509300827980042, + "learning_rate": 4.75e-06, + "loss": 2.4099, + "mean_token_accuracy": 0.4842258528806269, + "num_tokens": 68997389.0, + "step": 476 + }, + { + "entropy": 2.41748046875, + "epoch": 0.008212597815139071, + "grad_norm": 0.5300650000572205, + "learning_rate": 4.76e-06, + "loss": 2.3382, + "mean_token_accuracy": 0.48574389703571796, + "num_tokens": 69155404.0, + "step": 477 + }, + { + "entropy": 2.4625244140625, + "epoch": 0.008229815001334332, + "grad_norm": 0.6269882321357727, + "learning_rate": 4.77e-06, + "loss": 2.3776, + "mean_token_accuracy": 0.4838248719461262, + "num_tokens": 69290738.0, + "step": 478 + }, + { + "entropy": 2.437255859375, + "epoch": 0.008247032187529592, + "grad_norm": 0.6751134395599365, + "learning_rate": 4.78e-06, + "loss": 2.4007, + "mean_token_accuracy": 0.48909331811591983, + "num_tokens": 69408062.0, + "step": 479 + }, + { + "entropy": 2.4910888671875, + "epoch": 0.008264249373724853, + "grad_norm": 0.596396267414093, + "learning_rate": 4.79e-06, + "loss": 2.4679, + "mean_token_accuracy": 0.47261510556563735, + "num_tokens": 69539526.0, + "step": 480 + }, + { + "entropy": 2.4178466796875, + "epoch": 0.008281466559920111, + "grad_norm": 0.6338604688644409, + "learning_rate": 4.800000000000001e-06, + "loss": 2.3809, + "mean_token_accuracy": 0.4872106113471091, + "num_tokens": 69691415.0, + "step": 481 + }, + { + "entropy": 2.419189453125, + "epoch": 0.008298683746115372, + "grad_norm": 0.5384358167648315, + "learning_rate": 4.8100000000000005e-06, + "loss": 2.3549, + "mean_token_accuracy": 0.49019210133701563, + "num_tokens": 69848010.0, + "step": 482 + }, + { + "entropy": 2.4853515625, + "epoch": 0.008315900932310633, + "grad_norm": 0.5779227614402771, + "learning_rate": 4.8200000000000004e-06, + "loss": 2.4459, + "mean_token_accuracy": 0.4744481286033988, + "num_tokens": 69997662.0, + "step": 483 + }, + { + "entropy": 2.41064453125, + "epoch": 0.008333118118505893, + "grad_norm": 0.5691106915473938, + "learning_rate": 4.83e-06, + "loss": 2.3619, + "mean_token_accuracy": 0.4926013760268688, + "num_tokens": 70147997.0, + "step": 484 + }, + { + "entropy": 2.382080078125, + "epoch": 0.008350335304701152, + "grad_norm": 0.5725884437561035, + "learning_rate": 4.84e-06, + "loss": 2.3575, + "mean_token_accuracy": 0.489425728097558, + "num_tokens": 70294184.0, + "step": 485 + }, + { + "entropy": 2.47314453125, + "epoch": 0.008367552490896412, + "grad_norm": 0.5658232569694519, + "learning_rate": 4.85e-06, + "loss": 2.426, + "mean_token_accuracy": 0.4749414478428662, + "num_tokens": 70442248.0, + "step": 486 + }, + { + "entropy": 2.483642578125, + "epoch": 0.008384769677091673, + "grad_norm": 0.6129222512245178, + "learning_rate": 4.86e-06, + "loss": 2.4917, + "mean_token_accuracy": 0.4728327440097928, + "num_tokens": 70579740.0, + "step": 487 + }, + { + "entropy": 2.4320068359375, + "epoch": 0.008401986863286933, + "grad_norm": 0.9568486213684082, + "learning_rate": 4.87e-06, + "loss": 2.3935, + "mean_token_accuracy": 0.4829396088607609, + "num_tokens": 70721524.0, + "step": 488 + }, + { + "entropy": 2.459228515625, + "epoch": 0.008419204049482194, + "grad_norm": 0.544103741645813, + "learning_rate": 4.880000000000001e-06, + "loss": 2.4176, + "mean_token_accuracy": 0.476782136131078, + "num_tokens": 70877087.0, + "step": 489 + }, + { + "entropy": 2.405517578125, + "epoch": 0.008436421235677453, + "grad_norm": 0.5833263397216797, + "learning_rate": 4.890000000000001e-06, + "loss": 2.3254, + "mean_token_accuracy": 0.4887930774129927, + "num_tokens": 71009717.0, + "step": 490 + }, + { + "entropy": 2.45263671875, + "epoch": 0.008453638421872713, + "grad_norm": 0.5471597909927368, + "learning_rate": 4.9000000000000005e-06, + "loss": 2.4063, + "mean_token_accuracy": 0.48418434290215373, + "num_tokens": 71166661.0, + "step": 491 + }, + { + "entropy": 2.4708251953125, + "epoch": 0.008470855608067974, + "grad_norm": 0.5708027482032776, + "learning_rate": 4.9100000000000004e-06, + "loss": 2.3853, + "mean_token_accuracy": 0.4815034563653171, + "num_tokens": 71313337.0, + "step": 492 + }, + { + "entropy": 2.4544677734375, + "epoch": 0.008488072794263234, + "grad_norm": 0.5583227872848511, + "learning_rate": 4.92e-06, + "loss": 2.4402, + "mean_token_accuracy": 0.47839572792872787, + "num_tokens": 71472447.0, + "step": 493 + }, + { + "entropy": 2.5404052734375, + "epoch": 0.008505289980458493, + "grad_norm": 0.578138530254364, + "learning_rate": 4.93e-06, + "loss": 2.4715, + "mean_token_accuracy": 0.47347771003842354, + "num_tokens": 71615228.0, + "step": 494 + }, + { + "entropy": 2.3878173828125, + "epoch": 0.008522507166653754, + "grad_norm": 0.5488659143447876, + "learning_rate": 4.94e-06, + "loss": 2.3633, + "mean_token_accuracy": 0.49038203712552786, + "num_tokens": 71773695.0, + "step": 495 + }, + { + "entropy": 2.4010009765625, + "epoch": 0.008539724352849014, + "grad_norm": 0.61408531665802, + "learning_rate": 4.95e-06, + "loss": 2.3474, + "mean_token_accuracy": 0.48974387207999825, + "num_tokens": 71924349.0, + "step": 496 + }, + { + "entropy": 2.4000244140625, + "epoch": 0.008556941539044275, + "grad_norm": 0.5874980688095093, + "learning_rate": 4.960000000000001e-06, + "loss": 2.3611, + "mean_token_accuracy": 0.49134851479902864, + "num_tokens": 72064362.0, + "step": 497 + }, + { + "entropy": 2.412353515625, + "epoch": 0.008574158725239533, + "grad_norm": 0.6588267087936401, + "learning_rate": 4.970000000000001e-06, + "loss": 2.3615, + "mean_token_accuracy": 0.48920353641733527, + "num_tokens": 72195808.0, + "step": 498 + }, + { + "entropy": 2.4034423828125, + "epoch": 0.008591375911434794, + "grad_norm": 0.5799132585525513, + "learning_rate": 4.980000000000001e-06, + "loss": 2.3648, + "mean_token_accuracy": 0.4861590703949332, + "num_tokens": 72329610.0, + "step": 499 + }, + { + "entropy": 2.4595947265625, + "epoch": 0.008608593097630055, + "grad_norm": 0.5722876787185669, + "learning_rate": 4.9900000000000005e-06, + "loss": 2.4214, + "mean_token_accuracy": 0.47663868917152286, + "num_tokens": 72472581.0, + "step": 500 + }, + { + "entropy": 2.491943359375, + "epoch": 0.008625810283825315, + "grad_norm": 0.5580865740776062, + "learning_rate": 5e-06, + "loss": 2.4629, + "mean_token_accuracy": 0.4767118990421295, + "num_tokens": 72616609.0, + "step": 501 + }, + { + "entropy": 2.3966064453125, + "epoch": 0.008643027470020574, + "grad_norm": 0.5624485015869141, + "learning_rate": 5.01e-06, + "loss": 2.3162, + "mean_token_accuracy": 0.4926113812252879, + "num_tokens": 72772528.0, + "step": 502 + }, + { + "entropy": 2.401611328125, + "epoch": 0.008660244656215834, + "grad_norm": 0.5372104644775391, + "learning_rate": 5.02e-06, + "loss": 2.3705, + "mean_token_accuracy": 0.4896852090023458, + "num_tokens": 72936711.0, + "step": 503 + }, + { + "entropy": 2.5235595703125, + "epoch": 0.008677461842411095, + "grad_norm": 0.6084150075912476, + "learning_rate": 5.03e-06, + "loss": 2.4556, + "mean_token_accuracy": 0.47214356577023864, + "num_tokens": 73065595.0, + "step": 504 + }, + { + "entropy": 2.3858642578125, + "epoch": 0.008694679028606355, + "grad_norm": 0.555059015750885, + "learning_rate": 5.04e-06, + "loss": 2.3394, + "mean_token_accuracy": 0.4875369444489479, + "num_tokens": 73214645.0, + "step": 505 + }, + { + "entropy": 2.473876953125, + "epoch": 0.008711896214801614, + "grad_norm": 0.5269579887390137, + "learning_rate": 5.050000000000001e-06, + "loss": 2.3993, + "mean_token_accuracy": 0.4812088548205793, + "num_tokens": 73376592.0, + "step": 506 + }, + { + "entropy": 2.408935546875, + "epoch": 0.008729113400996875, + "grad_norm": 0.5673913955688477, + "learning_rate": 5.060000000000001e-06, + "loss": 2.3398, + "mean_token_accuracy": 0.4860687367618084, + "num_tokens": 73524487.0, + "step": 507 + }, + { + "entropy": 2.4530029296875, + "epoch": 0.008746330587192135, + "grad_norm": 0.6308383345603943, + "learning_rate": 5.070000000000001e-06, + "loss": 2.4261, + "mean_token_accuracy": 0.4800183614715934, + "num_tokens": 73662683.0, + "step": 508 + }, + { + "entropy": 2.451171875, + "epoch": 0.008763547773387396, + "grad_norm": 0.5882863998413086, + "learning_rate": 5.0800000000000005e-06, + "loss": 2.3984, + "mean_token_accuracy": 0.48627386754378676, + "num_tokens": 73798089.0, + "step": 509 + }, + { + "entropy": 2.4722900390625, + "epoch": 0.008780764959582655, + "grad_norm": 0.5577517747879028, + "learning_rate": 5.09e-06, + "loss": 2.4251, + "mean_token_accuracy": 0.472780239302665, + "num_tokens": 73951527.0, + "step": 510 + }, + { + "entropy": 2.439208984375, + "epoch": 0.008797982145777915, + "grad_norm": 0.5598065257072449, + "learning_rate": 5.1e-06, + "loss": 2.3522, + "mean_token_accuracy": 0.4847842915914953, + "num_tokens": 74106602.0, + "step": 511 + }, + { + "entropy": 2.4395751953125, + "epoch": 0.008815199331973176, + "grad_norm": 0.6455389857292175, + "learning_rate": 5.11e-06, + "loss": 2.4265, + "mean_token_accuracy": 0.47678500413894653, + "num_tokens": 74248860.0, + "step": 512 + }, + { + "entropy": 2.429443359375, + "epoch": 0.008832416518168436, + "grad_norm": 0.5287806391716003, + "learning_rate": 5.12e-06, + "loss": 2.3812, + "mean_token_accuracy": 0.4817400835454464, + "num_tokens": 74406358.0, + "step": 513 + }, + { + "entropy": 2.409423828125, + "epoch": 0.008849633704363697, + "grad_norm": 0.5710276961326599, + "learning_rate": 5.130000000000001e-06, + "loss": 2.3643, + "mean_token_accuracy": 0.4914928264915943, + "num_tokens": 74556098.0, + "step": 514 + }, + { + "entropy": 2.4224853515625, + "epoch": 0.008866850890558955, + "grad_norm": 0.5766054391860962, + "learning_rate": 5.140000000000001e-06, + "loss": 2.372, + "mean_token_accuracy": 0.48493917658925056, + "num_tokens": 74692650.0, + "step": 515 + }, + { + "entropy": 2.424072265625, + "epoch": 0.008884068076754216, + "grad_norm": 0.6063751578330994, + "learning_rate": 5.150000000000001e-06, + "loss": 2.3885, + "mean_token_accuracy": 0.48274309001863003, + "num_tokens": 74834650.0, + "step": 516 + }, + { + "entropy": 2.4222412109375, + "epoch": 0.008901285262949477, + "grad_norm": 0.5876448750495911, + "learning_rate": 5.1600000000000006e-06, + "loss": 2.3979, + "mean_token_accuracy": 0.48454813193529844, + "num_tokens": 74975623.0, + "step": 517 + }, + { + "entropy": 2.4189453125, + "epoch": 0.008918502449144737, + "grad_norm": 0.5868141055107117, + "learning_rate": 5.1700000000000005e-06, + "loss": 2.3741, + "mean_token_accuracy": 0.48601944325491786, + "num_tokens": 75124642.0, + "step": 518 + }, + { + "entropy": 2.47705078125, + "epoch": 0.008935719635339996, + "grad_norm": 0.5537886023521423, + "learning_rate": 5.18e-06, + "loss": 2.4012, + "mean_token_accuracy": 0.47799683222547174, + "num_tokens": 75268718.0, + "step": 519 + }, + { + "entropy": 2.400390625, + "epoch": 0.008952936821535256, + "grad_norm": 0.573029637336731, + "learning_rate": 5.19e-06, + "loss": 2.3484, + "mean_token_accuracy": 0.48853049660101533, + "num_tokens": 75407285.0, + "step": 520 + }, + { + "entropy": 2.458251953125, + "epoch": 0.008970154007730517, + "grad_norm": 0.6153673529624939, + "learning_rate": 5.2e-06, + "loss": 2.4012, + "mean_token_accuracy": 0.4795549106784165, + "num_tokens": 75544196.0, + "step": 521 + }, + { + "entropy": 2.474853515625, + "epoch": 0.008987371193925777, + "grad_norm": 0.6100261807441711, + "learning_rate": 5.210000000000001e-06, + "loss": 2.4373, + "mean_token_accuracy": 0.47731659887358546, + "num_tokens": 75675964.0, + "step": 522 + }, + { + "entropy": 2.486328125, + "epoch": 0.009004588380121036, + "grad_norm": 0.5929141044616699, + "learning_rate": 5.220000000000001e-06, + "loss": 2.4467, + "mean_token_accuracy": 0.4785361369140446, + "num_tokens": 75820140.0, + "step": 523 + }, + { + "entropy": 2.472900390625, + "epoch": 0.009021805566316297, + "grad_norm": 0.5526888370513916, + "learning_rate": 5.230000000000001e-06, + "loss": 2.393, + "mean_token_accuracy": 0.4825805053114891, + "num_tokens": 75975261.0, + "step": 524 + }, + { + "entropy": 2.382080078125, + "epoch": 0.009039022752511557, + "grad_norm": 0.5640392303466797, + "learning_rate": 5.240000000000001e-06, + "loss": 2.3364, + "mean_token_accuracy": 0.48861693032085896, + "num_tokens": 76121140.0, + "step": 525 + }, + { + "entropy": 2.4207763671875, + "epoch": 0.009056239938706818, + "grad_norm": 0.626315712928772, + "learning_rate": 5.2500000000000006e-06, + "loss": 2.3691, + "mean_token_accuracy": 0.4876149254851043, + "num_tokens": 76249826.0, + "step": 526 + }, + { + "entropy": 2.468505859375, + "epoch": 0.009073457124902077, + "grad_norm": 0.5884665846824646, + "learning_rate": 5.2600000000000005e-06, + "loss": 2.4473, + "mean_token_accuracy": 0.479018023237586, + "num_tokens": 76382197.0, + "step": 527 + }, + { + "entropy": 2.551513671875, + "epoch": 0.009090674311097337, + "grad_norm": 0.6032463908195496, + "learning_rate": 5.27e-06, + "loss": 2.5197, + "mean_token_accuracy": 0.4695024830289185, + "num_tokens": 76523850.0, + "step": 528 + }, + { + "entropy": 2.5267333984375, + "epoch": 0.009107891497292598, + "grad_norm": 0.5680440068244934, + "learning_rate": 5.28e-06, + "loss": 2.4922, + "mean_token_accuracy": 0.4694391922093928, + "num_tokens": 76675341.0, + "step": 529 + }, + { + "entropy": 2.45654296875, + "epoch": 0.009125108683487858, + "grad_norm": 0.5584273338317871, + "learning_rate": 5.290000000000001e-06, + "loss": 2.4558, + "mean_token_accuracy": 0.4753922396339476, + "num_tokens": 76839148.0, + "step": 530 + }, + { + "entropy": 2.447998046875, + "epoch": 0.009142325869683117, + "grad_norm": 0.5700913667678833, + "learning_rate": 5.300000000000001e-06, + "loss": 2.3795, + "mean_token_accuracy": 0.4812642466276884, + "num_tokens": 76972890.0, + "step": 531 + }, + { + "entropy": 2.39208984375, + "epoch": 0.009159543055878377, + "grad_norm": 0.5814207196235657, + "learning_rate": 5.310000000000001e-06, + "loss": 2.3494, + "mean_token_accuracy": 0.4927141284570098, + "num_tokens": 77116305.0, + "step": 532 + }, + { + "entropy": 2.5166015625, + "epoch": 0.009176760242073638, + "grad_norm": 0.6363142728805542, + "learning_rate": 5.320000000000001e-06, + "loss": 2.4734, + "mean_token_accuracy": 0.46889354614540935, + "num_tokens": 77241586.0, + "step": 533 + }, + { + "entropy": 2.561279296875, + "epoch": 0.009193977428268899, + "grad_norm": 0.5870820879936218, + "learning_rate": 5.330000000000001e-06, + "loss": 2.4981, + "mean_token_accuracy": 0.4720300040207803, + "num_tokens": 77382449.0, + "step": 534 + }, + { + "entropy": 2.4193115234375, + "epoch": 0.009211194614464157, + "grad_norm": 0.5664909482002258, + "learning_rate": 5.3400000000000005e-06, + "loss": 2.3611, + "mean_token_accuracy": 0.48326091514900327, + "num_tokens": 77519348.0, + "step": 535 + }, + { + "entropy": 2.3780517578125, + "epoch": 0.009228411800659418, + "grad_norm": 0.677797794342041, + "learning_rate": 5.3500000000000004e-06, + "loss": 2.3674, + "mean_token_accuracy": 0.49126658914610744, + "num_tokens": 77672043.0, + "step": 536 + }, + { + "entropy": 2.447021484375, + "epoch": 0.009245628986854678, + "grad_norm": 0.5524245500564575, + "learning_rate": 5.36e-06, + "loss": 2.4252, + "mean_token_accuracy": 0.47922995453700423, + "num_tokens": 77815854.0, + "step": 537 + }, + { + "entropy": 2.3614501953125, + "epoch": 0.009262846173049939, + "grad_norm": 0.5834474563598633, + "learning_rate": 5.370000000000001e-06, + "loss": 2.3147, + "mean_token_accuracy": 0.4956310335546732, + "num_tokens": 77971110.0, + "step": 538 + }, + { + "entropy": 2.4951171875, + "epoch": 0.0092800633592452, + "grad_norm": 0.5798099040985107, + "learning_rate": 5.380000000000001e-06, + "loss": 2.4241, + "mean_token_accuracy": 0.4767613257281482, + "num_tokens": 78111794.0, + "step": 539 + }, + { + "entropy": 2.502197265625, + "epoch": 0.009297280545440458, + "grad_norm": 0.6156012415885925, + "learning_rate": 5.390000000000001e-06, + "loss": 2.4368, + "mean_token_accuracy": 0.4805396613664925, + "num_tokens": 78259210.0, + "step": 540 + }, + { + "entropy": 2.4141845703125, + "epoch": 0.009314497731635719, + "grad_norm": 0.5888620615005493, + "learning_rate": 5.400000000000001e-06, + "loss": 2.3621, + "mean_token_accuracy": 0.4862367841415107, + "num_tokens": 78400181.0, + "step": 541 + }, + { + "entropy": 2.41845703125, + "epoch": 0.00933171491783098, + "grad_norm": 0.5387586355209351, + "learning_rate": 5.410000000000001e-06, + "loss": 2.3808, + "mean_token_accuracy": 0.48267924739047885, + "num_tokens": 78558105.0, + "step": 542 + }, + { + "entropy": 2.4661865234375, + "epoch": 0.00934893210402624, + "grad_norm": 0.5678828954696655, + "learning_rate": 5.420000000000001e-06, + "loss": 2.4037, + "mean_token_accuracy": 0.4831194179132581, + "num_tokens": 78706162.0, + "step": 543 + }, + { + "entropy": 2.3319091796875, + "epoch": 0.009366149290221499, + "grad_norm": 0.5745298862457275, + "learning_rate": 5.4300000000000005e-06, + "loss": 2.2771, + "mean_token_accuracy": 0.5001829275861382, + "num_tokens": 78850606.0, + "step": 544 + }, + { + "entropy": 2.492919921875, + "epoch": 0.009383366476416759, + "grad_norm": 0.5902231931686401, + "learning_rate": 5.4400000000000004e-06, + "loss": 2.4254, + "mean_token_accuracy": 0.4752844786271453, + "num_tokens": 78990742.0, + "step": 545 + }, + { + "entropy": 2.5008544921875, + "epoch": 0.00940058366261202, + "grad_norm": 0.6180260181427002, + "learning_rate": 5.450000000000001e-06, + "loss": 2.5212, + "mean_token_accuracy": 0.4779182830825448, + "num_tokens": 79147975.0, + "step": 546 + }, + { + "entropy": 2.451171875, + "epoch": 0.00941780084880728, + "grad_norm": 0.571796715259552, + "learning_rate": 5.460000000000001e-06, + "loss": 2.3884, + "mean_token_accuracy": 0.47992962412536144, + "num_tokens": 79295374.0, + "step": 547 + }, + { + "entropy": 2.451416015625, + "epoch": 0.009435018035002539, + "grad_norm": 0.6627357006072998, + "learning_rate": 5.470000000000001e-06, + "loss": 2.4386, + "mean_token_accuracy": 0.4789085192605853, + "num_tokens": 79435365.0, + "step": 548 + }, + { + "entropy": 2.44091796875, + "epoch": 0.0094522352211978, + "grad_norm": 0.5701068639755249, + "learning_rate": 5.480000000000001e-06, + "loss": 2.3711, + "mean_token_accuracy": 0.48519957158714533, + "num_tokens": 79583578.0, + "step": 549 + }, + { + "entropy": 2.4254150390625, + "epoch": 0.00946945240739306, + "grad_norm": 0.5793823599815369, + "learning_rate": 5.490000000000001e-06, + "loss": 2.3977, + "mean_token_accuracy": 0.4862471236847341, + "num_tokens": 79731666.0, + "step": 550 + }, + { + "entropy": 2.4296875, + "epoch": 0.00948666959358832, + "grad_norm": 0.538820743560791, + "learning_rate": 5.500000000000001e-06, + "loss": 2.3655, + "mean_token_accuracy": 0.487091563642025, + "num_tokens": 79879814.0, + "step": 551 + }, + { + "entropy": 2.5390625, + "epoch": 0.00950388677978358, + "grad_norm": 0.5948156714439392, + "learning_rate": 5.510000000000001e-06, + "loss": 2.4975, + "mean_token_accuracy": 0.46823509922251105, + "num_tokens": 80021770.0, + "step": 552 + }, + { + "entropy": 2.455322265625, + "epoch": 0.00952110396597884, + "grad_norm": 0.5883845686912537, + "learning_rate": 5.5200000000000005e-06, + "loss": 2.3802, + "mean_token_accuracy": 0.47493234230205417, + "num_tokens": 80165630.0, + "step": 553 + }, + { + "entropy": 2.474365234375, + "epoch": 0.0095383211521741, + "grad_norm": 0.6061490774154663, + "learning_rate": 5.530000000000001e-06, + "loss": 2.4588, + "mean_token_accuracy": 0.4762619035318494, + "num_tokens": 80320803.0, + "step": 554 + }, + { + "entropy": 2.3839111328125, + "epoch": 0.009555538338369361, + "grad_norm": 0.5892252922058105, + "learning_rate": 5.540000000000001e-06, + "loss": 2.3663, + "mean_token_accuracy": 0.48988539073616266, + "num_tokens": 80455211.0, + "step": 555 + }, + { + "entropy": 2.3966064453125, + "epoch": 0.00957275552456462, + "grad_norm": 0.5676023960113525, + "learning_rate": 5.550000000000001e-06, + "loss": 2.3757, + "mean_token_accuracy": 0.4864460411481559, + "num_tokens": 80600140.0, + "step": 556 + }, + { + "entropy": 2.411865234375, + "epoch": 0.00958997271075988, + "grad_norm": 0.6370622515678406, + "learning_rate": 5.560000000000001e-06, + "loss": 2.4048, + "mean_token_accuracy": 0.4831241965293884, + "num_tokens": 80733228.0, + "step": 557 + }, + { + "entropy": 2.453369140625, + "epoch": 0.00960718989695514, + "grad_norm": 0.6127820611000061, + "learning_rate": 5.570000000000001e-06, + "loss": 2.4203, + "mean_token_accuracy": 0.48309980193153024, + "num_tokens": 80870174.0, + "step": 558 + }, + { + "entropy": 2.4246826171875, + "epoch": 0.009624407083150401, + "grad_norm": 0.6448102593421936, + "learning_rate": 5.580000000000001e-06, + "loss": 2.424, + "mean_token_accuracy": 0.4830983644351363, + "num_tokens": 81027247.0, + "step": 559 + }, + { + "entropy": 2.4490966796875, + "epoch": 0.00964162426934566, + "grad_norm": 0.6132236123085022, + "learning_rate": 5.590000000000001e-06, + "loss": 2.3908, + "mean_token_accuracy": 0.48710555862635374, + "num_tokens": 81181289.0, + "step": 560 + }, + { + "entropy": 2.4490966796875, + "epoch": 0.00965884145554092, + "grad_norm": 0.5913870334625244, + "learning_rate": 5.600000000000001e-06, + "loss": 2.4276, + "mean_token_accuracy": 0.47844883892685175, + "num_tokens": 81317554.0, + "step": 561 + }, + { + "entropy": 2.3934326171875, + "epoch": 0.009676058641736181, + "grad_norm": 0.5637186169624329, + "learning_rate": 5.610000000000001e-06, + "loss": 2.386, + "mean_token_accuracy": 0.49396452866494656, + "num_tokens": 81464385.0, + "step": 562 + }, + { + "entropy": 2.446044921875, + "epoch": 0.009693275827931442, + "grad_norm": 0.6086047887802124, + "learning_rate": 5.620000000000001e-06, + "loss": 2.3744, + "mean_token_accuracy": 0.4851002893410623, + "num_tokens": 81595954.0, + "step": 563 + }, + { + "entropy": 2.514404296875, + "epoch": 0.0097104930141267, + "grad_norm": 0.5994987487792969, + "learning_rate": 5.63e-06, + "loss": 2.445, + "mean_token_accuracy": 0.47357298992574215, + "num_tokens": 81731590.0, + "step": 564 + }, + { + "entropy": 2.452392578125, + "epoch": 0.009727710200321961, + "grad_norm": 0.5923967361450195, + "learning_rate": 5.64e-06, + "loss": 2.4243, + "mean_token_accuracy": 0.48147726710885763, + "num_tokens": 81874631.0, + "step": 565 + }, + { + "entropy": 2.49462890625, + "epoch": 0.009744927386517221, + "grad_norm": 0.5803849697113037, + "learning_rate": 5.65e-06, + "loss": 2.4918, + "mean_token_accuracy": 0.4728530729189515, + "num_tokens": 82032208.0, + "step": 566 + }, + { + "entropy": 2.5015869140625, + "epoch": 0.009762144572712482, + "grad_norm": 0.5941510200500488, + "learning_rate": 5.66e-06, + "loss": 2.4491, + "mean_token_accuracy": 0.4712064489722252, + "num_tokens": 82175645.0, + "step": 567 + }, + { + "entropy": 2.453125, + "epoch": 0.009779361758907743, + "grad_norm": 0.5803316831588745, + "learning_rate": 5.67e-06, + "loss": 2.407, + "mean_token_accuracy": 0.4795740279369056, + "num_tokens": 82314242.0, + "step": 568 + }, + { + "entropy": 2.4854736328125, + "epoch": 0.009796578945103001, + "grad_norm": 0.6026585698127747, + "learning_rate": 5.68e-06, + "loss": 2.4803, + "mean_token_accuracy": 0.4731423007324338, + "num_tokens": 82441744.0, + "step": 569 + }, + { + "entropy": 2.469482421875, + "epoch": 0.009813796131298262, + "grad_norm": 0.5523576140403748, + "learning_rate": 5.69e-06, + "loss": 2.4117, + "mean_token_accuracy": 0.4742597243748605, + "num_tokens": 82586104.0, + "step": 570 + }, + { + "entropy": 2.420166015625, + "epoch": 0.009831013317493522, + "grad_norm": 0.5471956133842468, + "learning_rate": 5.7e-06, + "loss": 2.3751, + "mean_token_accuracy": 0.48100833920761943, + "num_tokens": 82736506.0, + "step": 571 + }, + { + "entropy": 2.5037841796875, + "epoch": 0.009848230503688783, + "grad_norm": 0.57561856508255, + "learning_rate": 5.71e-06, + "loss": 2.4685, + "mean_token_accuracy": 0.46707925060763955, + "num_tokens": 82889738.0, + "step": 572 + }, + { + "entropy": 2.450927734375, + "epoch": 0.009865447689884042, + "grad_norm": 0.5832479596138, + "learning_rate": 5.72e-06, + "loss": 2.4341, + "mean_token_accuracy": 0.4823454241268337, + "num_tokens": 83033413.0, + "step": 573 + }, + { + "entropy": 2.460205078125, + "epoch": 0.009882664876079302, + "grad_norm": 0.546006977558136, + "learning_rate": 5.73e-06, + "loss": 2.4431, + "mean_token_accuracy": 0.4789656042121351, + "num_tokens": 83185297.0, + "step": 574 + }, + { + "entropy": 2.4942626953125, + "epoch": 0.009899882062274563, + "grad_norm": 0.5814852118492126, + "learning_rate": 5.74e-06, + "loss": 2.4655, + "mean_token_accuracy": 0.469060396309942, + "num_tokens": 83323076.0, + "step": 575 + }, + { + "entropy": 2.448486328125, + "epoch": 0.009917099248469823, + "grad_norm": 0.6251148581504822, + "learning_rate": 5.75e-06, + "loss": 2.3796, + "mean_token_accuracy": 0.48163892794400454, + "num_tokens": 83476832.0, + "step": 576 + }, + { + "entropy": 2.43505859375, + "epoch": 0.009934316434665082, + "grad_norm": 0.5571739077568054, + "learning_rate": 5.76e-06, + "loss": 2.4166, + "mean_token_accuracy": 0.47773123253136873, + "num_tokens": 83636406.0, + "step": 577 + }, + { + "entropy": 2.4129638671875, + "epoch": 0.009951533620860343, + "grad_norm": 0.5774782299995422, + "learning_rate": 5.77e-06, + "loss": 2.3531, + "mean_token_accuracy": 0.4910357999615371, + "num_tokens": 83784239.0, + "step": 578 + }, + { + "entropy": 2.395263671875, + "epoch": 0.009968750807055603, + "grad_norm": 0.5799167156219482, + "learning_rate": 5.78e-06, + "loss": 2.3157, + "mean_token_accuracy": 0.49086295487359166, + "num_tokens": 83930135.0, + "step": 579 + }, + { + "entropy": 2.43603515625, + "epoch": 0.009985967993250864, + "grad_norm": 0.9486396312713623, + "learning_rate": 5.7900000000000005e-06, + "loss": 2.3956, + "mean_token_accuracy": 0.4829951082356274, + "num_tokens": 84069466.0, + "step": 580 + }, + { + "entropy": 2.456298828125, + "epoch": 0.010003185179446122, + "grad_norm": 0.5900285243988037, + "learning_rate": 5.8e-06, + "loss": 2.4223, + "mean_token_accuracy": 0.47999299177899957, + "num_tokens": 84204231.0, + "step": 581 + }, + { + "entropy": 2.4512939453125, + "epoch": 0.010020402365641383, + "grad_norm": 0.612040638923645, + "learning_rate": 5.81e-06, + "loss": 2.4226, + "mean_token_accuracy": 0.4809495103545487, + "num_tokens": 84341875.0, + "step": 582 + }, + { + "entropy": 2.44140625, + "epoch": 0.010037619551836643, + "grad_norm": 0.5587046146392822, + "learning_rate": 5.82e-06, + "loss": 2.3725, + "mean_token_accuracy": 0.4807244557887316, + "num_tokens": 84495749.0, + "step": 583 + }, + { + "entropy": 2.453125, + "epoch": 0.010054836738031904, + "grad_norm": 0.5574972629547119, + "learning_rate": 5.83e-06, + "loss": 2.4249, + "mean_token_accuracy": 0.47708367416635156, + "num_tokens": 84642208.0, + "step": 584 + }, + { + "entropy": 2.4827880859375, + "epoch": 0.010072053924227163, + "grad_norm": 0.5825822353363037, + "learning_rate": 5.84e-06, + "loss": 2.4421, + "mean_token_accuracy": 0.47600942524150014, + "num_tokens": 84798422.0, + "step": 585 + }, + { + "entropy": 2.4530029296875, + "epoch": 0.010089271110422423, + "grad_norm": 0.6338469386100769, + "learning_rate": 5.85e-06, + "loss": 2.3441, + "mean_token_accuracy": 0.48570866556838155, + "num_tokens": 84932951.0, + "step": 586 + }, + { + "entropy": 2.51806640625, + "epoch": 0.010106488296617684, + "grad_norm": 0.5964832305908203, + "learning_rate": 5.86e-06, + "loss": 2.4671, + "mean_token_accuracy": 0.47985436767339706, + "num_tokens": 85070544.0, + "step": 587 + }, + { + "entropy": 2.40576171875, + "epoch": 0.010123705482812944, + "grad_norm": 0.5808175206184387, + "learning_rate": 5.8700000000000005e-06, + "loss": 2.3715, + "mean_token_accuracy": 0.4833093252964318, + "num_tokens": 85216000.0, + "step": 588 + }, + { + "entropy": 2.3857421875, + "epoch": 0.010140922669008203, + "grad_norm": 0.5820797681808472, + "learning_rate": 5.8800000000000005e-06, + "loss": 2.3318, + "mean_token_accuracy": 0.4918548730202019, + "num_tokens": 85367592.0, + "step": 589 + }, + { + "entropy": 2.4697265625, + "epoch": 0.010158139855203464, + "grad_norm": 0.5501261353492737, + "learning_rate": 5.89e-06, + "loss": 2.4305, + "mean_token_accuracy": 0.47398777306079865, + "num_tokens": 85507669.0, + "step": 590 + }, + { + "entropy": 2.3985595703125, + "epoch": 0.010175357041398724, + "grad_norm": 0.5922814607620239, + "learning_rate": 5.9e-06, + "loss": 2.3907, + "mean_token_accuracy": 0.49256287002936006, + "num_tokens": 85657583.0, + "step": 591 + }, + { + "entropy": 2.4813232421875, + "epoch": 0.010192574227593985, + "grad_norm": 0.6180179715156555, + "learning_rate": 5.91e-06, + "loss": 2.4081, + "mean_token_accuracy": 0.47511684615164995, + "num_tokens": 85796530.0, + "step": 592 + }, + { + "entropy": 2.548583984375, + "epoch": 0.010209791413789245, + "grad_norm": 0.586715817451477, + "learning_rate": 5.92e-06, + "loss": 2.4952, + "mean_token_accuracy": 0.4641613829880953, + "num_tokens": 85935315.0, + "step": 593 + }, + { + "entropy": 2.479736328125, + "epoch": 0.010227008599984504, + "grad_norm": 0.5589325428009033, + "learning_rate": 5.93e-06, + "loss": 2.4362, + "mean_token_accuracy": 0.4785456941463053, + "num_tokens": 86083670.0, + "step": 594 + }, + { + "entropy": 2.485107421875, + "epoch": 0.010244225786179765, + "grad_norm": 0.5777843594551086, + "learning_rate": 5.94e-06, + "loss": 2.4487, + "mean_token_accuracy": 0.47643798124045134, + "num_tokens": 86226759.0, + "step": 595 + }, + { + "entropy": 2.4775390625, + "epoch": 0.010261442972375025, + "grad_norm": 0.5679555535316467, + "learning_rate": 5.950000000000001e-06, + "loss": 2.4196, + "mean_token_accuracy": 0.48259661020711064, + "num_tokens": 86374974.0, + "step": 596 + }, + { + "entropy": 2.423095703125, + "epoch": 0.010278660158570286, + "grad_norm": 0.5793547034263611, + "learning_rate": 5.9600000000000005e-06, + "loss": 2.418, + "mean_token_accuracy": 0.48791359877213836, + "num_tokens": 86517518.0, + "step": 597 + }, + { + "entropy": 2.4588623046875, + "epoch": 0.010295877344765544, + "grad_norm": 0.6030251979827881, + "learning_rate": 5.9700000000000004e-06, + "loss": 2.4074, + "mean_token_accuracy": 0.48480180045589805, + "num_tokens": 86647346.0, + "step": 598 + }, + { + "entropy": 2.570068359375, + "epoch": 0.010313094530960805, + "grad_norm": 0.5821578502655029, + "learning_rate": 5.98e-06, + "loss": 2.5287, + "mean_token_accuracy": 0.4660195717588067, + "num_tokens": 86790455.0, + "step": 599 + }, + { + "entropy": 2.4483642578125, + "epoch": 0.010330311717156065, + "grad_norm": 0.5719680786132812, + "learning_rate": 5.99e-06, + "loss": 2.357, + "mean_token_accuracy": 0.48361520702019334, + "num_tokens": 86939582.0, + "step": 600 + }, + { + "entropy": 2.5018310546875, + "epoch": 0.010347528903351326, + "grad_norm": 0.6109415888786316, + "learning_rate": 6e-06, + "loss": 2.4647, + "mean_token_accuracy": 0.47726114513352513, + "num_tokens": 87097332.0, + "step": 601 + }, + { + "entropy": 2.4161376953125, + "epoch": 0.010364746089546585, + "grad_norm": 0.565535306930542, + "learning_rate": 6.01e-06, + "loss": 2.3436, + "mean_token_accuracy": 0.4876682967878878, + "num_tokens": 87246475.0, + "step": 602 + }, + { + "entropy": 2.3807373046875, + "epoch": 0.010381963275741845, + "grad_norm": 0.594390332698822, + "learning_rate": 6.02e-06, + "loss": 2.3292, + "mean_token_accuracy": 0.49000673089176416, + "num_tokens": 87402167.0, + "step": 603 + }, + { + "entropy": 2.489501953125, + "epoch": 0.010399180461937106, + "grad_norm": 0.5601812601089478, + "learning_rate": 6.030000000000001e-06, + "loss": 2.4547, + "mean_token_accuracy": 0.470840523019433, + "num_tokens": 87546749.0, + "step": 604 + }, + { + "entropy": 2.491455078125, + "epoch": 0.010416397648132366, + "grad_norm": 0.6872953772544861, + "learning_rate": 6.040000000000001e-06, + "loss": 2.4646, + "mean_token_accuracy": 0.47835929365828633, + "num_tokens": 87680924.0, + "step": 605 + }, + { + "entropy": 2.499267578125, + "epoch": 0.010433614834327625, + "grad_norm": 0.5995067954063416, + "learning_rate": 6.0500000000000005e-06, + "loss": 2.4872, + "mean_token_accuracy": 0.4722044039517641, + "num_tokens": 87818407.0, + "step": 606 + }, + { + "entropy": 2.4085693359375, + "epoch": 0.010450832020522886, + "grad_norm": 0.7130571007728577, + "learning_rate": 6.0600000000000004e-06, + "loss": 2.3356, + "mean_token_accuracy": 0.48175712302327156, + "num_tokens": 87964354.0, + "step": 607 + }, + { + "entropy": 2.40869140625, + "epoch": 0.010468049206718146, + "grad_norm": 0.5531951189041138, + "learning_rate": 6.07e-06, + "loss": 2.396, + "mean_token_accuracy": 0.4883636045269668, + "num_tokens": 88108908.0, + "step": 608 + }, + { + "entropy": 2.4320068359375, + "epoch": 0.010485266392913407, + "grad_norm": 0.6004809737205505, + "learning_rate": 6.08e-06, + "loss": 2.3704, + "mean_token_accuracy": 0.47937911935150623, + "num_tokens": 88252541.0, + "step": 609 + }, + { + "entropy": 2.447021484375, + "epoch": 0.010502483579108665, + "grad_norm": 0.5746989846229553, + "learning_rate": 6.09e-06, + "loss": 2.3784, + "mean_token_accuracy": 0.4842778267338872, + "num_tokens": 88387113.0, + "step": 610 + }, + { + "entropy": 2.43603515625, + "epoch": 0.010519700765303926, + "grad_norm": 0.6954113245010376, + "learning_rate": 6.1e-06, + "loss": 2.4565, + "mean_token_accuracy": 0.4831416751258075, + "num_tokens": 88538988.0, + "step": 611 + }, + { + "entropy": 2.4976806640625, + "epoch": 0.010536917951499187, + "grad_norm": 0.5616413950920105, + "learning_rate": 6.110000000000001e-06, + "loss": 2.4658, + "mean_token_accuracy": 0.47561208764091134, + "num_tokens": 88697407.0, + "step": 612 + }, + { + "entropy": 2.39599609375, + "epoch": 0.010554135137694447, + "grad_norm": 0.5953220129013062, + "learning_rate": 6.120000000000001e-06, + "loss": 2.3488, + "mean_token_accuracy": 0.4882765398360789, + "num_tokens": 88843276.0, + "step": 613 + }, + { + "entropy": 2.5074462890625, + "epoch": 0.010571352323889706, + "grad_norm": 0.5962725281715393, + "learning_rate": 6.130000000000001e-06, + "loss": 2.4542, + "mean_token_accuracy": 0.4768844782374799, + "num_tokens": 88973725.0, + "step": 614 + }, + { + "entropy": 2.416748046875, + "epoch": 0.010588569510084966, + "grad_norm": 0.5971546769142151, + "learning_rate": 6.1400000000000005e-06, + "loss": 2.384, + "mean_token_accuracy": 0.4932720325887203, + "num_tokens": 89112891.0, + "step": 615 + }, + { + "entropy": 2.4598388671875, + "epoch": 0.010605786696280227, + "grad_norm": 0.5331761837005615, + "learning_rate": 6.15e-06, + "loss": 2.3966, + "mean_token_accuracy": 0.4843857139348984, + "num_tokens": 89261669.0, + "step": 616 + }, + { + "entropy": 2.5048828125, + "epoch": 0.010623003882475487, + "grad_norm": 0.5298826694488525, + "learning_rate": 6.16e-06, + "loss": 2.4497, + "mean_token_accuracy": 0.47414906043559313, + "num_tokens": 89424640.0, + "step": 617 + }, + { + "entropy": 2.53369140625, + "epoch": 0.010640221068670748, + "grad_norm": 0.5918136835098267, + "learning_rate": 6.17e-06, + "loss": 2.4991, + "mean_token_accuracy": 0.4715537903830409, + "num_tokens": 89565111.0, + "step": 618 + }, + { + "entropy": 2.502685546875, + "epoch": 0.010657438254866007, + "grad_norm": 0.5514756441116333, + "learning_rate": 6.18e-06, + "loss": 2.4796, + "mean_token_accuracy": 0.47532700607553124, + "num_tokens": 89720834.0, + "step": 619 + }, + { + "entropy": 2.4874267578125, + "epoch": 0.010674655441061267, + "grad_norm": 0.6075375080108643, + "learning_rate": 6.190000000000001e-06, + "loss": 2.434, + "mean_token_accuracy": 0.48047855822369456, + "num_tokens": 89852579.0, + "step": 620 + }, + { + "entropy": 2.4002685546875, + "epoch": 0.010691872627256528, + "grad_norm": 0.5313543081283569, + "learning_rate": 6.200000000000001e-06, + "loss": 2.3702, + "mean_token_accuracy": 0.4890581676736474, + "num_tokens": 90014576.0, + "step": 621 + }, + { + "entropy": 2.5064697265625, + "epoch": 0.010709089813451788, + "grad_norm": 0.5929763317108154, + "learning_rate": 6.210000000000001e-06, + "loss": 2.4956, + "mean_token_accuracy": 0.47517322562634945, + "num_tokens": 90141327.0, + "step": 622 + }, + { + "entropy": 2.4847412109375, + "epoch": 0.010726306999647047, + "grad_norm": 0.6306544542312622, + "learning_rate": 6.220000000000001e-06, + "loss": 2.4664, + "mean_token_accuracy": 0.4766456396318972, + "num_tokens": 90289805.0, + "step": 623 + }, + { + "entropy": 2.434326171875, + "epoch": 0.010743524185842308, + "grad_norm": 0.6184057593345642, + "learning_rate": 6.2300000000000005e-06, + "loss": 2.4083, + "mean_token_accuracy": 0.48298388347029686, + "num_tokens": 90436897.0, + "step": 624 + }, + { + "entropy": 2.4671630859375, + "epoch": 0.010760741372037568, + "grad_norm": 2.9163689613342285, + "learning_rate": 6.24e-06, + "loss": 2.4082, + "mean_token_accuracy": 0.48466121684759855, + "num_tokens": 90587723.0, + "step": 625 + }, + { + "entropy": 2.4171142578125, + "epoch": 0.010777958558232829, + "grad_norm": 0.6109045743942261, + "learning_rate": 6.25e-06, + "loss": 2.3642, + "mean_token_accuracy": 0.48781463131308556, + "num_tokens": 90722550.0, + "step": 626 + }, + { + "entropy": 2.3975830078125, + "epoch": 0.010795175744428087, + "grad_norm": 0.5503095388412476, + "learning_rate": 6.26e-06, + "loss": 2.3723, + "mean_token_accuracy": 0.484878970310092, + "num_tokens": 90883612.0, + "step": 627 + }, + { + "entropy": 2.435302734375, + "epoch": 0.010812392930623348, + "grad_norm": 0.5557761192321777, + "learning_rate": 6.27e-06, + "loss": 2.4024, + "mean_token_accuracy": 0.482346317730844, + "num_tokens": 91020885.0, + "step": 628 + }, + { + "entropy": 2.454833984375, + "epoch": 0.010829610116818609, + "grad_norm": 0.5977523326873779, + "learning_rate": 6.280000000000001e-06, + "loss": 2.3842, + "mean_token_accuracy": 0.4845750341191888, + "num_tokens": 91183050.0, + "step": 629 + }, + { + "entropy": 2.536376953125, + "epoch": 0.010846827303013869, + "grad_norm": 0.6027834415435791, + "learning_rate": 6.290000000000001e-06, + "loss": 2.5204, + "mean_token_accuracy": 0.46805495163425803, + "num_tokens": 91320603.0, + "step": 630 + }, + { + "entropy": 2.42431640625, + "epoch": 0.010864044489209128, + "grad_norm": 0.569492518901825, + "learning_rate": 6.300000000000001e-06, + "loss": 2.3634, + "mean_token_accuracy": 0.484631123021245, + "num_tokens": 91466774.0, + "step": 631 + }, + { + "entropy": 2.501953125, + "epoch": 0.010881261675404388, + "grad_norm": 0.6085266470909119, + "learning_rate": 6.3100000000000006e-06, + "loss": 2.4384, + "mean_token_accuracy": 0.47073780838400126, + "num_tokens": 91599571.0, + "step": 632 + }, + { + "entropy": 2.454833984375, + "epoch": 0.010898478861599649, + "grad_norm": 0.5897052884101868, + "learning_rate": 6.3200000000000005e-06, + "loss": 2.3718, + "mean_token_accuracy": 0.4813077808357775, + "num_tokens": 91736850.0, + "step": 633 + }, + { + "entropy": 2.5152587890625, + "epoch": 0.01091569604779491, + "grad_norm": 0.5574195384979248, + "learning_rate": 6.33e-06, + "loss": 2.5027, + "mean_token_accuracy": 0.4707440370693803, + "num_tokens": 91883052.0, + "step": 634 + }, + { + "entropy": 2.4981689453125, + "epoch": 0.010932913233990168, + "grad_norm": 0.5665111541748047, + "learning_rate": 6.34e-06, + "loss": 2.4967, + "mean_token_accuracy": 0.4747963696718216, + "num_tokens": 92038624.0, + "step": 635 + }, + { + "entropy": 2.5341796875, + "epoch": 0.010950130420185429, + "grad_norm": 0.6099963784217834, + "learning_rate": 6.35e-06, + "loss": 2.4812, + "mean_token_accuracy": 0.4711012118496001, + "num_tokens": 92172927.0, + "step": 636 + }, + { + "entropy": 2.4072265625, + "epoch": 0.01096734760638069, + "grad_norm": 0.5225919485092163, + "learning_rate": 6.360000000000001e-06, + "loss": 2.3695, + "mean_token_accuracy": 0.4833218730054796, + "num_tokens": 92340412.0, + "step": 637 + }, + { + "entropy": 2.373779296875, + "epoch": 0.01098456479257595, + "grad_norm": 0.5803242921829224, + "learning_rate": 6.370000000000001e-06, + "loss": 2.3518, + "mean_token_accuracy": 0.4897113349288702, + "num_tokens": 92491100.0, + "step": 638 + }, + { + "entropy": 2.465576171875, + "epoch": 0.011001781978771209, + "grad_norm": 0.6239297986030579, + "learning_rate": 6.380000000000001e-06, + "loss": 2.4732, + "mean_token_accuracy": 0.4763565128669143, + "num_tokens": 92624388.0, + "step": 639 + }, + { + "entropy": 2.521728515625, + "epoch": 0.011018999164966469, + "grad_norm": 0.6022891998291016, + "learning_rate": 6.390000000000001e-06, + "loss": 2.5116, + "mean_token_accuracy": 0.4692745329812169, + "num_tokens": 92749943.0, + "step": 640 + }, + { + "entropy": 2.4610595703125, + "epoch": 0.01103621635116173, + "grad_norm": 0.5882202386856079, + "learning_rate": 6.4000000000000006e-06, + "loss": 2.4055, + "mean_token_accuracy": 0.479529797565192, + "num_tokens": 92901869.0, + "step": 641 + }, + { + "entropy": 2.42626953125, + "epoch": 0.01105343353735699, + "grad_norm": 0.6111239790916443, + "learning_rate": 6.4100000000000005e-06, + "loss": 2.337, + "mean_token_accuracy": 0.4874786385335028, + "num_tokens": 93049790.0, + "step": 642 + }, + { + "entropy": 2.459716796875, + "epoch": 0.01107065072355225, + "grad_norm": 0.5824291706085205, + "learning_rate": 6.42e-06, + "loss": 2.3972, + "mean_token_accuracy": 0.4820692026987672, + "num_tokens": 93190066.0, + "step": 643 + }, + { + "entropy": 2.393798828125, + "epoch": 0.01108786790974751, + "grad_norm": 0.6439885497093201, + "learning_rate": 6.43e-06, + "loss": 2.2805, + "mean_token_accuracy": 0.500021081417799, + "num_tokens": 93323041.0, + "step": 644 + }, + { + "entropy": 2.5235595703125, + "epoch": 0.01110508509594277, + "grad_norm": 0.5925695300102234, + "learning_rate": 6.440000000000001e-06, + "loss": 2.5175, + "mean_token_accuracy": 0.46626862324774265, + "num_tokens": 93454663.0, + "step": 645 + }, + { + "entropy": 2.5662841796875, + "epoch": 0.01112230228213803, + "grad_norm": 0.5767838954925537, + "learning_rate": 6.450000000000001e-06, + "loss": 2.5828, + "mean_token_accuracy": 0.46684390073642135, + "num_tokens": 93600987.0, + "step": 646 + }, + { + "entropy": 2.445068359375, + "epoch": 0.011139519468333291, + "grad_norm": 0.5589511394500732, + "learning_rate": 6.460000000000001e-06, + "loss": 2.3973, + "mean_token_accuracy": 0.48293518042191863, + "num_tokens": 93764681.0, + "step": 647 + }, + { + "entropy": 2.487548828125, + "epoch": 0.01115673665452855, + "grad_norm": 0.6247239112854004, + "learning_rate": 6.470000000000001e-06, + "loss": 2.4299, + "mean_token_accuracy": 0.47755016293376684, + "num_tokens": 93908163.0, + "step": 648 + }, + { + "entropy": 2.44873046875, + "epoch": 0.01117395384072381, + "grad_norm": 0.5852839350700378, + "learning_rate": 6.480000000000001e-06, + "loss": 2.3616, + "mean_token_accuracy": 0.48453861800953746, + "num_tokens": 94044102.0, + "step": 649 + }, + { + "entropy": 2.401611328125, + "epoch": 0.011191171026919071, + "grad_norm": 0.5801386833190918, + "learning_rate": 6.4900000000000005e-06, + "loss": 2.3961, + "mean_token_accuracy": 0.4849433288909495, + "num_tokens": 94186933.0, + "step": 650 + }, + { + "entropy": 2.474853515625, + "epoch": 0.011208388213114331, + "grad_norm": 0.5861256718635559, + "learning_rate": 6.5000000000000004e-06, + "loss": 2.4425, + "mean_token_accuracy": 0.47974030720070004, + "num_tokens": 94318045.0, + "step": 651 + }, + { + "entropy": 2.49755859375, + "epoch": 0.01122560539930959, + "grad_norm": 0.5657753348350525, + "learning_rate": 6.51e-06, + "loss": 2.4787, + "mean_token_accuracy": 0.4704547990113497, + "num_tokens": 94470298.0, + "step": 652 + }, + { + "entropy": 2.4617919921875, + "epoch": 0.01124282258550485, + "grad_norm": 0.5523881912231445, + "learning_rate": 6.520000000000001e-06, + "loss": 2.4404, + "mean_token_accuracy": 0.47263912204653025, + "num_tokens": 94617182.0, + "step": 653 + }, + { + "entropy": 2.462646484375, + "epoch": 0.011260039771700111, + "grad_norm": 0.5891516804695129, + "learning_rate": 6.530000000000001e-06, + "loss": 2.4207, + "mean_token_accuracy": 0.4738515946082771, + "num_tokens": 94764738.0, + "step": 654 + }, + { + "entropy": 2.43310546875, + "epoch": 0.011277256957895372, + "grad_norm": 0.5879467129707336, + "learning_rate": 6.540000000000001e-06, + "loss": 2.4553, + "mean_token_accuracy": 0.4857351207174361, + "num_tokens": 94911680.0, + "step": 655 + }, + { + "entropy": 2.3919677734375, + "epoch": 0.01129447414409063, + "grad_norm": 0.5698356032371521, + "learning_rate": 6.550000000000001e-06, + "loss": 2.3241, + "mean_token_accuracy": 0.49458998907357454, + "num_tokens": 95063211.0, + "step": 656 + }, + { + "entropy": 2.4443359375, + "epoch": 0.011311691330285891, + "grad_norm": 0.5569403767585754, + "learning_rate": 6.560000000000001e-06, + "loss": 2.4115, + "mean_token_accuracy": 0.4799956767819822, + "num_tokens": 95205223.0, + "step": 657 + }, + { + "entropy": 2.4415283203125, + "epoch": 0.011328908516481152, + "grad_norm": 0.6046442985534668, + "learning_rate": 6.570000000000001e-06, + "loss": 2.3966, + "mean_token_accuracy": 0.4849221669137478, + "num_tokens": 95353465.0, + "step": 658 + }, + { + "entropy": 2.4443359375, + "epoch": 0.011346125702676412, + "grad_norm": 0.7441838979721069, + "learning_rate": 6.5800000000000005e-06, + "loss": 2.4102, + "mean_token_accuracy": 0.47888694843277335, + "num_tokens": 95519881.0, + "step": 659 + }, + { + "entropy": 2.47998046875, + "epoch": 0.011363342888871671, + "grad_norm": 0.5905322432518005, + "learning_rate": 6.5900000000000004e-06, + "loss": 2.457, + "mean_token_accuracy": 0.47512120427563787, + "num_tokens": 95664332.0, + "step": 660 + }, + { + "entropy": 2.43408203125, + "epoch": 0.011380560075066931, + "grad_norm": 0.5880439281463623, + "learning_rate": 6.600000000000001e-06, + "loss": 2.3549, + "mean_token_accuracy": 0.4882641164585948, + "num_tokens": 95811290.0, + "step": 661 + }, + { + "entropy": 2.44677734375, + "epoch": 0.011397777261262192, + "grad_norm": 0.6042869091033936, + "learning_rate": 6.610000000000001e-06, + "loss": 2.4018, + "mean_token_accuracy": 0.48763177869841456, + "num_tokens": 95950564.0, + "step": 662 + }, + { + "entropy": 2.4495849609375, + "epoch": 0.011414994447457453, + "grad_norm": 0.6228047013282776, + "learning_rate": 6.620000000000001e-06, + "loss": 2.4301, + "mean_token_accuracy": 0.4780231602489948, + "num_tokens": 96088537.0, + "step": 663 + }, + { + "entropy": 2.398681640625, + "epoch": 0.011432211633652711, + "grad_norm": 0.6014442443847656, + "learning_rate": 6.630000000000001e-06, + "loss": 2.4022, + "mean_token_accuracy": 0.4849638855084777, + "num_tokens": 96222517.0, + "step": 664 + }, + { + "entropy": 2.395751953125, + "epoch": 0.011449428819847972, + "grad_norm": 0.5773961544036865, + "learning_rate": 6.640000000000001e-06, + "loss": 2.3727, + "mean_token_accuracy": 0.48979697469621897, + "num_tokens": 96371664.0, + "step": 665 + }, + { + "entropy": 2.443603515625, + "epoch": 0.011466646006043232, + "grad_norm": 0.5553280711174011, + "learning_rate": 6.650000000000001e-06, + "loss": 2.4178, + "mean_token_accuracy": 0.4752015918493271, + "num_tokens": 96533336.0, + "step": 666 + }, + { + "entropy": 2.4317626953125, + "epoch": 0.011483863192238493, + "grad_norm": 0.5981423258781433, + "learning_rate": 6.660000000000001e-06, + "loss": 2.3897, + "mean_token_accuracy": 0.48191826045513153, + "num_tokens": 96671339.0, + "step": 667 + }, + { + "entropy": 2.4635009765625, + "epoch": 0.011501080378433753, + "grad_norm": 0.5966627597808838, + "learning_rate": 6.6700000000000005e-06, + "loss": 2.4742, + "mean_token_accuracy": 0.4762999969534576, + "num_tokens": 96814601.0, + "step": 668 + }, + { + "entropy": 2.5224609375, + "epoch": 0.011518297564629012, + "grad_norm": 0.6274897456169128, + "learning_rate": 6.680000000000001e-06, + "loss": 2.499, + "mean_token_accuracy": 0.4678979804739356, + "num_tokens": 96956119.0, + "step": 669 + }, + { + "entropy": 2.422607421875, + "epoch": 0.011535514750824273, + "grad_norm": 0.5479171276092529, + "learning_rate": 6.690000000000001e-06, + "loss": 2.4028, + "mean_token_accuracy": 0.48214570991694927, + "num_tokens": 97106907.0, + "step": 670 + }, + { + "entropy": 2.450439453125, + "epoch": 0.011552731937019533, + "grad_norm": 0.5556632876396179, + "learning_rate": 6.700000000000001e-06, + "loss": 2.4311, + "mean_token_accuracy": 0.4770152415148914, + "num_tokens": 97251230.0, + "step": 671 + }, + { + "entropy": 2.4766845703125, + "epoch": 0.011569949123214794, + "grad_norm": 0.5444607734680176, + "learning_rate": 6.710000000000001e-06, + "loss": 2.4384, + "mean_token_accuracy": 0.48311482975259423, + "num_tokens": 97404310.0, + "step": 672 + }, + { + "entropy": 2.4534912109375, + "epoch": 0.011587166309410053, + "grad_norm": 0.5732424259185791, + "learning_rate": 6.720000000000001e-06, + "loss": 2.3888, + "mean_token_accuracy": 0.48798013804480433, + "num_tokens": 97557441.0, + "step": 673 + }, + { + "entropy": 2.443603515625, + "epoch": 0.011604383495605313, + "grad_norm": 0.5952773094177246, + "learning_rate": 6.730000000000001e-06, + "loss": 2.3956, + "mean_token_accuracy": 0.47982571227476, + "num_tokens": 97696645.0, + "step": 674 + }, + { + "entropy": 2.4781494140625, + "epoch": 0.011621600681800574, + "grad_norm": 0.6089589595794678, + "learning_rate": 6.740000000000001e-06, + "loss": 2.3921, + "mean_token_accuracy": 0.48180749313905835, + "num_tokens": 97836331.0, + "step": 675 + }, + { + "entropy": 2.4674072265625, + "epoch": 0.011638817867995834, + "grad_norm": 0.5993421077728271, + "learning_rate": 6.750000000000001e-06, + "loss": 2.4598, + "mean_token_accuracy": 0.4802471627481282, + "num_tokens": 97988494.0, + "step": 676 + }, + { + "entropy": 2.42919921875, + "epoch": 0.011656035054191093, + "grad_norm": 0.5687212944030762, + "learning_rate": 6.760000000000001e-06, + "loss": 2.382, + "mean_token_accuracy": 0.48455388378351927, + "num_tokens": 98148161.0, + "step": 677 + }, + { + "entropy": 2.43896484375, + "epoch": 0.011673252240386353, + "grad_norm": 0.5404056906700134, + "learning_rate": 6.770000000000001e-06, + "loss": 2.378, + "mean_token_accuracy": 0.48297660844400525, + "num_tokens": 98303865.0, + "step": 678 + }, + { + "entropy": 2.41552734375, + "epoch": 0.011690469426581614, + "grad_norm": 0.6042897701263428, + "learning_rate": 6.780000000000001e-06, + "loss": 2.3817, + "mean_token_accuracy": 0.4834853089414537, + "num_tokens": 98442926.0, + "step": 679 + }, + { + "entropy": 2.4442138671875, + "epoch": 0.011707686612776875, + "grad_norm": 0.5709783434867859, + "learning_rate": 6.790000000000001e-06, + "loss": 2.389, + "mean_token_accuracy": 0.4857576950453222, + "num_tokens": 98585693.0, + "step": 680 + }, + { + "entropy": 2.452392578125, + "epoch": 0.011724903798972133, + "grad_norm": 0.5298141837120056, + "learning_rate": 6.800000000000001e-06, + "loss": 2.3874, + "mean_token_accuracy": 0.486653549131006, + "num_tokens": 98748357.0, + "step": 681 + }, + { + "entropy": 2.470703125, + "epoch": 0.011742120985167394, + "grad_norm": 0.5651002526283264, + "learning_rate": 6.810000000000001e-06, + "loss": 2.4578, + "mean_token_accuracy": 0.4745705393142998, + "num_tokens": 98894643.0, + "step": 682 + }, + { + "entropy": 2.4425048828125, + "epoch": 0.011759338171362654, + "grad_norm": 0.5605524182319641, + "learning_rate": 6.820000000000001e-06, + "loss": 2.4011, + "mean_token_accuracy": 0.4801498386077583, + "num_tokens": 99057594.0, + "step": 683 + }, + { + "entropy": 2.457275390625, + "epoch": 0.011776555357557915, + "grad_norm": 0.594336748123169, + "learning_rate": 6.830000000000001e-06, + "loss": 2.4241, + "mean_token_accuracy": 0.4828556412830949, + "num_tokens": 99191074.0, + "step": 684 + }, + { + "entropy": 2.4183349609375, + "epoch": 0.011793772543753174, + "grad_norm": 0.5907600522041321, + "learning_rate": 6.8400000000000014e-06, + "loss": 2.3646, + "mean_token_accuracy": 0.4930391958914697, + "num_tokens": 99328908.0, + "step": 685 + }, + { + "entropy": 2.5035400390625, + "epoch": 0.011810989729948434, + "grad_norm": 0.5865106582641602, + "learning_rate": 6.850000000000001e-06, + "loss": 2.481, + "mean_token_accuracy": 0.47163935378193855, + "num_tokens": 99456705.0, + "step": 686 + }, + { + "entropy": 2.4417724609375, + "epoch": 0.011828206916143695, + "grad_norm": 0.677649736404419, + "learning_rate": 6.860000000000001e-06, + "loss": 2.4363, + "mean_token_accuracy": 0.48826620541512966, + "num_tokens": 99591855.0, + "step": 687 + }, + { + "entropy": 2.4449462890625, + "epoch": 0.011845424102338955, + "grad_norm": 0.5924775004386902, + "learning_rate": 6.870000000000001e-06, + "loss": 2.3789, + "mean_token_accuracy": 0.48261339543387294, + "num_tokens": 99733016.0, + "step": 688 + }, + { + "entropy": 2.420654296875, + "epoch": 0.011862641288534214, + "grad_norm": 0.5597655177116394, + "learning_rate": 6.88e-06, + "loss": 2.4079, + "mean_token_accuracy": 0.4812637008726597, + "num_tokens": 99884912.0, + "step": 689 + }, + { + "entropy": 2.4306640625, + "epoch": 0.011879858474729475, + "grad_norm": 0.5902450680732727, + "learning_rate": 6.89e-06, + "loss": 2.3469, + "mean_token_accuracy": 0.4851334313862026, + "num_tokens": 100033610.0, + "step": 690 + }, + { + "entropy": 2.40673828125, + "epoch": 0.011897075660924735, + "grad_norm": 0.5616133213043213, + "learning_rate": 6.9e-06, + "loss": 2.3481, + "mean_token_accuracy": 0.4861144246533513, + "num_tokens": 100175721.0, + "step": 691 + }, + { + "entropy": 2.4476318359375, + "epoch": 0.011914292847119996, + "grad_norm": 0.5511784553527832, + "learning_rate": 6.91e-06, + "loss": 2.4008, + "mean_token_accuracy": 0.4786239666864276, + "num_tokens": 100317239.0, + "step": 692 + }, + { + "entropy": 2.4410400390625, + "epoch": 0.011931510033315256, + "grad_norm": 0.5769883394241333, + "learning_rate": 6.92e-06, + "loss": 2.3883, + "mean_token_accuracy": 0.48180802492424846, + "num_tokens": 100472196.0, + "step": 693 + }, + { + "entropy": 2.423828125, + "epoch": 0.011948727219510515, + "grad_norm": 0.5468559265136719, + "learning_rate": 6.93e-06, + "loss": 2.3532, + "mean_token_accuracy": 0.4820295791141689, + "num_tokens": 100630584.0, + "step": 694 + }, + { + "entropy": 2.484619140625, + "epoch": 0.011965944405705775, + "grad_norm": 0.5945432782173157, + "learning_rate": 6.9400000000000005e-06, + "loss": 2.4283, + "mean_token_accuracy": 0.4779118075966835, + "num_tokens": 100786824.0, + "step": 695 + }, + { + "entropy": 2.4324951171875, + "epoch": 0.011983161591901036, + "grad_norm": 0.5941588878631592, + "learning_rate": 6.95e-06, + "loss": 2.3637, + "mean_token_accuracy": 0.4826899361796677, + "num_tokens": 100927601.0, + "step": 696 + }, + { + "entropy": 2.46240234375, + "epoch": 0.012000378778096297, + "grad_norm": 0.5858972668647766, + "learning_rate": 6.96e-06, + "loss": 2.4147, + "mean_token_accuracy": 0.47940469440072775, + "num_tokens": 101064043.0, + "step": 697 + }, + { + "entropy": 2.44091796875, + "epoch": 0.012017595964291555, + "grad_norm": 0.5900692939758301, + "learning_rate": 6.97e-06, + "loss": 2.3959, + "mean_token_accuracy": 0.48975384049117565, + "num_tokens": 101209114.0, + "step": 698 + }, + { + "entropy": 2.400146484375, + "epoch": 0.012034813150486816, + "grad_norm": 0.5515392422676086, + "learning_rate": 6.98e-06, + "loss": 2.3642, + "mean_token_accuracy": 0.4900199566036463, + "num_tokens": 101358773.0, + "step": 699 + }, + { + "entropy": 2.3948974609375, + "epoch": 0.012052030336682076, + "grad_norm": 0.5985309481620789, + "learning_rate": 6.99e-06, + "loss": 2.3383, + "mean_token_accuracy": 0.49027787847444415, + "num_tokens": 101507432.0, + "step": 700 + }, + { + "entropy": 2.45751953125, + "epoch": 0.012069247522877337, + "grad_norm": 0.5618374943733215, + "learning_rate": 7e-06, + "loss": 2.434, + "mean_token_accuracy": 0.47642564633861184, + "num_tokens": 101645197.0, + "step": 701 + }, + { + "entropy": 2.405517578125, + "epoch": 0.012086464709072596, + "grad_norm": 0.5900906920433044, + "learning_rate": 7.01e-06, + "loss": 2.3832, + "mean_token_accuracy": 0.48692094907164574, + "num_tokens": 101786084.0, + "step": 702 + }, + { + "entropy": 2.4290771484375, + "epoch": 0.012103681895267856, + "grad_norm": 0.5847712755203247, + "learning_rate": 7.0200000000000006e-06, + "loss": 2.4093, + "mean_token_accuracy": 0.48114079609513283, + "num_tokens": 101929991.0, + "step": 703 + }, + { + "entropy": 2.4827880859375, + "epoch": 0.012120899081463117, + "grad_norm": 0.6014482378959656, + "learning_rate": 7.0300000000000005e-06, + "loss": 2.4566, + "mean_token_accuracy": 0.4787940843962133, + "num_tokens": 102069231.0, + "step": 704 + }, + { + "entropy": 2.5064697265625, + "epoch": 0.012138116267658377, + "grad_norm": 0.5504627823829651, + "learning_rate": 7.04e-06, + "loss": 2.4534, + "mean_token_accuracy": 0.46901731938123703, + "num_tokens": 102218938.0, + "step": 705 + }, + { + "entropy": 2.401123046875, + "epoch": 0.012155333453853636, + "grad_norm": 0.6932315826416016, + "learning_rate": 7.05e-06, + "loss": 2.3427, + "mean_token_accuracy": 0.4926096093840897, + "num_tokens": 102353532.0, + "step": 706 + }, + { + "entropy": 2.46923828125, + "epoch": 0.012172550640048897, + "grad_norm": 0.5833364725112915, + "learning_rate": 7.06e-06, + "loss": 2.4615, + "mean_token_accuracy": 0.47761310590431094, + "num_tokens": 102490402.0, + "step": 707 + }, + { + "entropy": 2.439697265625, + "epoch": 0.012189767826244157, + "grad_norm": 0.61143559217453, + "learning_rate": 7.07e-06, + "loss": 2.3579, + "mean_token_accuracy": 0.4858525595627725, + "num_tokens": 102627917.0, + "step": 708 + }, + { + "entropy": 2.49169921875, + "epoch": 0.012206985012439418, + "grad_norm": 0.601938009262085, + "learning_rate": 7.08e-06, + "loss": 2.4313, + "mean_token_accuracy": 0.4729892536997795, + "num_tokens": 102759385.0, + "step": 709 + }, + { + "entropy": 2.4722900390625, + "epoch": 0.012224202198634676, + "grad_norm": 0.5767009854316711, + "learning_rate": 7.09e-06, + "loss": 2.4482, + "mean_token_accuracy": 0.4772743955254555, + "num_tokens": 102915430.0, + "step": 710 + }, + { + "entropy": 2.46142578125, + "epoch": 0.012241419384829937, + "grad_norm": 0.546136200428009, + "learning_rate": 7.100000000000001e-06, + "loss": 2.3961, + "mean_token_accuracy": 0.4819467253983021, + "num_tokens": 103066691.0, + "step": 711 + }, + { + "entropy": 2.48583984375, + "epoch": 0.012258636571025197, + "grad_norm": 0.5756231546401978, + "learning_rate": 7.1100000000000005e-06, + "loss": 2.4676, + "mean_token_accuracy": 0.4714622185565531, + "num_tokens": 103217176.0, + "step": 712 + }, + { + "entropy": 2.4246826171875, + "epoch": 0.012275853757220458, + "grad_norm": 0.5694965720176697, + "learning_rate": 7.1200000000000004e-06, + "loss": 2.3603, + "mean_token_accuracy": 0.4863223168067634, + "num_tokens": 103368786.0, + "step": 713 + }, + { + "entropy": 2.447265625, + "epoch": 0.012293070943415717, + "grad_norm": 0.5844082236289978, + "learning_rate": 7.13e-06, + "loss": 2.4418, + "mean_token_accuracy": 0.482551914639771, + "num_tokens": 103515573.0, + "step": 714 + }, + { + "entropy": 2.509765625, + "epoch": 0.012310288129610977, + "grad_norm": 0.5859200358390808, + "learning_rate": 7.14e-06, + "loss": 2.4894, + "mean_token_accuracy": 0.4717716183513403, + "num_tokens": 103649277.0, + "step": 715 + }, + { + "entropy": 2.45263671875, + "epoch": 0.012327505315806238, + "grad_norm": 0.5378598570823669, + "learning_rate": 7.15e-06, + "loss": 2.4289, + "mean_token_accuracy": 0.47941537760198116, + "num_tokens": 103802169.0, + "step": 716 + }, + { + "entropy": 2.4493408203125, + "epoch": 0.012344722502001498, + "grad_norm": 0.6121527552604675, + "learning_rate": 7.16e-06, + "loss": 2.3747, + "mean_token_accuracy": 0.4810113995335996, + "num_tokens": 103939625.0, + "step": 717 + }, + { + "entropy": 2.4144287109375, + "epoch": 0.012361939688196759, + "grad_norm": 0.5776681900024414, + "learning_rate": 7.17e-06, + "loss": 2.3531, + "mean_token_accuracy": 0.4864069065079093, + "num_tokens": 104080617.0, + "step": 718 + }, + { + "entropy": 2.3646240234375, + "epoch": 0.012379156874392018, + "grad_norm": 0.5905261635780334, + "learning_rate": 7.180000000000001e-06, + "loss": 2.3327, + "mean_token_accuracy": 0.49934633634984493, + "num_tokens": 104220572.0, + "step": 719 + }, + { + "entropy": 2.4034423828125, + "epoch": 0.012396374060587278, + "grad_norm": 0.5707198977470398, + "learning_rate": 7.190000000000001e-06, + "loss": 2.3433, + "mean_token_accuracy": 0.49225129559636116, + "num_tokens": 104361519.0, + "step": 720 + }, + { + "entropy": 2.4696044921875, + "epoch": 0.012413591246782539, + "grad_norm": 0.5869008302688599, + "learning_rate": 7.2000000000000005e-06, + "loss": 2.4365, + "mean_token_accuracy": 0.47933553624898195, + "num_tokens": 104506297.0, + "step": 721 + }, + { + "entropy": 2.4625244140625, + "epoch": 0.0124308084329778, + "grad_norm": 0.5947871208190918, + "learning_rate": 7.2100000000000004e-06, + "loss": 2.4067, + "mean_token_accuracy": 0.48690010188147426, + "num_tokens": 104647015.0, + "step": 722 + }, + { + "entropy": 2.450439453125, + "epoch": 0.012448025619173058, + "grad_norm": 0.6154801845550537, + "learning_rate": 7.22e-06, + "loss": 2.4108, + "mean_token_accuracy": 0.47685753647238016, + "num_tokens": 104802411.0, + "step": 723 + }, + { + "entropy": 2.419921875, + "epoch": 0.012465242805368319, + "grad_norm": 0.6330761313438416, + "learning_rate": 7.23e-06, + "loss": 2.3759, + "mean_token_accuracy": 0.4840613235719502, + "num_tokens": 104951305.0, + "step": 724 + }, + { + "entropy": 2.43359375, + "epoch": 0.012482459991563579, + "grad_norm": 0.6040731072425842, + "learning_rate": 7.24e-06, + "loss": 2.3609, + "mean_token_accuracy": 0.4850413934327662, + "num_tokens": 105091990.0, + "step": 725 + }, + { + "entropy": 2.44482421875, + "epoch": 0.01249967717775884, + "grad_norm": 0.6250449419021606, + "learning_rate": 7.25e-06, + "loss": 2.4137, + "mean_token_accuracy": 0.4755057515576482, + "num_tokens": 105212920.0, + "step": 726 + }, + { + "entropy": 2.4415283203125, + "epoch": 0.012516894363954098, + "grad_norm": 0.6397738456726074, + "learning_rate": 7.260000000000001e-06, + "loss": 2.4425, + "mean_token_accuracy": 0.4787881104275584, + "num_tokens": 105329271.0, + "step": 727 + }, + { + "entropy": 2.3997802734375, + "epoch": 0.012534111550149359, + "grad_norm": 0.5615900158882141, + "learning_rate": 7.270000000000001e-06, + "loss": 2.4034, + "mean_token_accuracy": 0.4808120485395193, + "num_tokens": 105487622.0, + "step": 728 + }, + { + "entropy": 2.464111328125, + "epoch": 0.01255132873634462, + "grad_norm": 0.5766611695289612, + "learning_rate": 7.280000000000001e-06, + "loss": 2.4341, + "mean_token_accuracy": 0.47967397794127464, + "num_tokens": 105633781.0, + "step": 729 + }, + { + "entropy": 2.482421875, + "epoch": 0.01256854592253988, + "grad_norm": 0.557733416557312, + "learning_rate": 7.2900000000000005e-06, + "loss": 2.4198, + "mean_token_accuracy": 0.4778105691075325, + "num_tokens": 105783888.0, + "step": 730 + }, + { + "entropy": 2.3619384765625, + "epoch": 0.012585763108735139, + "grad_norm": 0.5741136074066162, + "learning_rate": 7.3e-06, + "loss": 2.3078, + "mean_token_accuracy": 0.49396718852221966, + "num_tokens": 105927244.0, + "step": 731 + }, + { + "entropy": 2.431396484375, + "epoch": 0.0126029802949304, + "grad_norm": 0.5635918974876404, + "learning_rate": 7.31e-06, + "loss": 2.3871, + "mean_token_accuracy": 0.48639540281146765, + "num_tokens": 106070617.0, + "step": 732 + }, + { + "entropy": 2.4696044921875, + "epoch": 0.01262019748112566, + "grad_norm": 0.578965425491333, + "learning_rate": 7.32e-06, + "loss": 2.3938, + "mean_token_accuracy": 0.4847126523964107, + "num_tokens": 106230425.0, + "step": 733 + }, + { + "entropy": 2.458251953125, + "epoch": 0.01263741466732092, + "grad_norm": 0.9026923775672913, + "learning_rate": 7.33e-06, + "loss": 2.4071, + "mean_token_accuracy": 0.4807370454072952, + "num_tokens": 106375429.0, + "step": 734 + }, + { + "entropy": 2.4134521484375, + "epoch": 0.012654631853516179, + "grad_norm": 0.6249892711639404, + "learning_rate": 7.340000000000001e-06, + "loss": 2.3442, + "mean_token_accuracy": 0.48796508787199855, + "num_tokens": 106523980.0, + "step": 735 + }, + { + "entropy": 2.403076171875, + "epoch": 0.01267184903971144, + "grad_norm": 0.5887693166732788, + "learning_rate": 7.350000000000001e-06, + "loss": 2.3872, + "mean_token_accuracy": 0.48691826686263084, + "num_tokens": 106669149.0, + "step": 736 + }, + { + "entropy": 2.4166259765625, + "epoch": 0.0126890662259067, + "grad_norm": 0.617521345615387, + "learning_rate": 7.360000000000001e-06, + "loss": 2.3293, + "mean_token_accuracy": 0.48858933337032795, + "num_tokens": 106798139.0, + "step": 737 + }, + { + "entropy": 2.44921875, + "epoch": 0.01270628341210196, + "grad_norm": 0.5710563659667969, + "learning_rate": 7.370000000000001e-06, + "loss": 2.4245, + "mean_token_accuracy": 0.47817583242431283, + "num_tokens": 106937567.0, + "step": 738 + }, + { + "entropy": 2.4063720703125, + "epoch": 0.01272350059829722, + "grad_norm": 1.2993676662445068, + "learning_rate": 7.3800000000000005e-06, + "loss": 2.3553, + "mean_token_accuracy": 0.48716708505526185, + "num_tokens": 107069215.0, + "step": 739 + }, + { + "entropy": 2.417236328125, + "epoch": 0.01274071778449248, + "grad_norm": 0.5727201104164124, + "learning_rate": 7.39e-06, + "loss": 2.3806, + "mean_token_accuracy": 0.4816734539344907, + "num_tokens": 107216177.0, + "step": 740 + }, + { + "entropy": 2.470947265625, + "epoch": 0.01275793497068774, + "grad_norm": 0.568335771560669, + "learning_rate": 7.4e-06, + "loss": 2.4227, + "mean_token_accuracy": 0.4798375847749412, + "num_tokens": 107367907.0, + "step": 741 + }, + { + "entropy": 2.4110107421875, + "epoch": 0.012775152156883001, + "grad_norm": 0.6011884808540344, + "learning_rate": 7.41e-06, + "loss": 2.3219, + "mean_token_accuracy": 0.49019240494817495, + "num_tokens": 107504131.0, + "step": 742 + }, + { + "entropy": 2.41845703125, + "epoch": 0.01279236934307826, + "grad_norm": 0.5796740055084229, + "learning_rate": 7.420000000000001e-06, + "loss": 2.3559, + "mean_token_accuracy": 0.48748622741550207, + "num_tokens": 107645959.0, + "step": 743 + }, + { + "entropy": 2.4940185546875, + "epoch": 0.01280958652927352, + "grad_norm": 0.5444216132164001, + "learning_rate": 7.430000000000001e-06, + "loss": 2.4531, + "mean_token_accuracy": 0.4708722811192274, + "num_tokens": 107795485.0, + "step": 744 + }, + { + "entropy": 2.413818359375, + "epoch": 0.012826803715468781, + "grad_norm": 0.5786703824996948, + "learning_rate": 7.440000000000001e-06, + "loss": 2.3548, + "mean_token_accuracy": 0.49165352433919907, + "num_tokens": 107937289.0, + "step": 745 + }, + { + "entropy": 2.411865234375, + "epoch": 0.012844020901664041, + "grad_norm": 0.6046903729438782, + "learning_rate": 7.450000000000001e-06, + "loss": 2.3753, + "mean_token_accuracy": 0.4823318342678249, + "num_tokens": 108093121.0, + "step": 746 + }, + { + "entropy": 2.4716796875, + "epoch": 0.012861238087859302, + "grad_norm": 0.536321222782135, + "learning_rate": 7.4600000000000006e-06, + "loss": 2.4663, + "mean_token_accuracy": 0.4771149712614715, + "num_tokens": 108257267.0, + "step": 747 + }, + { + "entropy": 2.4007568359375, + "epoch": 0.01287845527405456, + "grad_norm": 0.6158970594406128, + "learning_rate": 7.4700000000000005e-06, + "loss": 2.3579, + "mean_token_accuracy": 0.4832296408712864, + "num_tokens": 108396635.0, + "step": 748 + }, + { + "entropy": 2.4298095703125, + "epoch": 0.012895672460249821, + "grad_norm": 0.5570418834686279, + "learning_rate": 7.48e-06, + "loss": 2.4086, + "mean_token_accuracy": 0.48002893943339586, + "num_tokens": 108544662.0, + "step": 749 + }, + { + "entropy": 2.54150390625, + "epoch": 0.012912889646445082, + "grad_norm": 0.6132979393005371, + "learning_rate": 7.49e-06, + "loss": 2.4981, + "mean_token_accuracy": 0.46807813877239823, + "num_tokens": 108709034.0, + "step": 750 + }, + { + "entropy": 2.438232421875, + "epoch": 0.012930106832640342, + "grad_norm": 0.5490818023681641, + "learning_rate": 7.500000000000001e-06, + "loss": 2.3753, + "mean_token_accuracy": 0.48799073603004217, + "num_tokens": 108859610.0, + "step": 751 + }, + { + "entropy": 2.4285888671875, + "epoch": 0.012947324018835601, + "grad_norm": 0.5972912311553955, + "learning_rate": 7.510000000000001e-06, + "loss": 2.4225, + "mean_token_accuracy": 0.48296895902603865, + "num_tokens": 109009475.0, + "step": 752 + }, + { + "entropy": 2.4571533203125, + "epoch": 0.012964541205030862, + "grad_norm": 0.5518878102302551, + "learning_rate": 7.520000000000001e-06, + "loss": 2.3751, + "mean_token_accuracy": 0.4834339157678187, + "num_tokens": 109158002.0, + "step": 753 + }, + { + "entropy": 2.41162109375, + "epoch": 0.012981758391226122, + "grad_norm": 0.602249801158905, + "learning_rate": 7.530000000000001e-06, + "loss": 2.3587, + "mean_token_accuracy": 0.4893317800015211, + "num_tokens": 109293690.0, + "step": 754 + }, + { + "entropy": 2.4407958984375, + "epoch": 0.012998975577421383, + "grad_norm": 0.5610491633415222, + "learning_rate": 7.540000000000001e-06, + "loss": 2.4023, + "mean_token_accuracy": 0.4784908280707896, + "num_tokens": 109439590.0, + "step": 755 + }, + { + "entropy": 2.4281005859375, + "epoch": 0.013016192763616641, + "grad_norm": 0.5914566516876221, + "learning_rate": 7.5500000000000006e-06, + "loss": 2.4052, + "mean_token_accuracy": 0.4888386274687946, + "num_tokens": 109582432.0, + "step": 756 + }, + { + "entropy": 2.504638671875, + "epoch": 0.013033409949811902, + "grad_norm": 0.5595380067825317, + "learning_rate": 7.5600000000000005e-06, + "loss": 2.4702, + "mean_token_accuracy": 0.4686399414204061, + "num_tokens": 109735418.0, + "step": 757 + }, + { + "entropy": 2.4144287109375, + "epoch": 0.013050627136007163, + "grad_norm": 0.5793854594230652, + "learning_rate": 7.57e-06, + "loss": 2.3714, + "mean_token_accuracy": 0.4886330468580127, + "num_tokens": 109891701.0, + "step": 758 + }, + { + "entropy": 2.4320068359375, + "epoch": 0.013067844322202423, + "grad_norm": 0.6832471489906311, + "learning_rate": 7.58e-06, + "loss": 2.3694, + "mean_token_accuracy": 0.48566275043413043, + "num_tokens": 110039268.0, + "step": 759 + }, + { + "entropy": 2.4044189453125, + "epoch": 0.013085061508397682, + "grad_norm": 0.6120168566703796, + "learning_rate": 7.590000000000001e-06, + "loss": 2.3659, + "mean_token_accuracy": 0.4929888774640858, + "num_tokens": 110182050.0, + "step": 760 + }, + { + "entropy": 2.491943359375, + "epoch": 0.013102278694592942, + "grad_norm": 0.6201086044311523, + "learning_rate": 7.600000000000001e-06, + "loss": 2.4232, + "mean_token_accuracy": 0.4787399894557893, + "num_tokens": 110303657.0, + "step": 761 + }, + { + "entropy": 2.499755859375, + "epoch": 0.013119495880788203, + "grad_norm": 0.5904567837715149, + "learning_rate": 7.610000000000001e-06, + "loss": 2.4735, + "mean_token_accuracy": 0.47595866583287716, + "num_tokens": 110445376.0, + "step": 762 + }, + { + "entropy": 2.4344482421875, + "epoch": 0.013136713066983463, + "grad_norm": 0.7313216924667358, + "learning_rate": 7.620000000000001e-06, + "loss": 2.3898, + "mean_token_accuracy": 0.4856723416596651, + "num_tokens": 110599091.0, + "step": 763 + }, + { + "entropy": 2.412353515625, + "epoch": 0.013153930253178722, + "grad_norm": 0.582177460193634, + "learning_rate": 7.630000000000001e-06, + "loss": 2.3465, + "mean_token_accuracy": 0.49040036741644144, + "num_tokens": 110767359.0, + "step": 764 + }, + { + "entropy": 2.4423828125, + "epoch": 0.013171147439373983, + "grad_norm": 0.5971955060958862, + "learning_rate": 7.640000000000001e-06, + "loss": 2.4243, + "mean_token_accuracy": 0.47924549924209714, + "num_tokens": 110904749.0, + "step": 765 + }, + { + "entropy": 2.370361328125, + "epoch": 0.013188364625569243, + "grad_norm": 0.5825552940368652, + "learning_rate": 7.650000000000001e-06, + "loss": 2.337, + "mean_token_accuracy": 0.48746788455173373, + "num_tokens": 111041416.0, + "step": 766 + }, + { + "entropy": 2.485107421875, + "epoch": 0.013205581811764504, + "grad_norm": 0.5713107585906982, + "learning_rate": 7.660000000000001e-06, + "loss": 2.4491, + "mean_token_accuracy": 0.4754057708196342, + "num_tokens": 111179069.0, + "step": 767 + }, + { + "entropy": 2.412353515625, + "epoch": 0.013222798997959763, + "grad_norm": 0.5602453947067261, + "learning_rate": 7.670000000000001e-06, + "loss": 2.3753, + "mean_token_accuracy": 0.49099403340369463, + "num_tokens": 111333973.0, + "step": 768 + }, + { + "entropy": 2.40283203125, + "epoch": 0.013240016184155023, + "grad_norm": 0.5362405180931091, + "learning_rate": 7.680000000000001e-06, + "loss": 2.3705, + "mean_token_accuracy": 0.48479648493230343, + "num_tokens": 111483914.0, + "step": 769 + }, + { + "entropy": 2.4000244140625, + "epoch": 0.013257233370350284, + "grad_norm": 0.5498298406600952, + "learning_rate": 7.690000000000001e-06, + "loss": 2.332, + "mean_token_accuracy": 0.49140041740611196, + "num_tokens": 111637688.0, + "step": 770 + }, + { + "entropy": 2.3616943359375, + "epoch": 0.013274450556545544, + "grad_norm": 0.5781952142715454, + "learning_rate": 7.7e-06, + "loss": 2.3117, + "mean_token_accuracy": 0.49226083187386394, + "num_tokens": 111777176.0, + "step": 771 + }, + { + "entropy": 2.45654296875, + "epoch": 0.013291667742740805, + "grad_norm": 0.5963894724845886, + "learning_rate": 7.71e-06, + "loss": 2.4077, + "mean_token_accuracy": 0.4808903872035444, + "num_tokens": 111907633.0, + "step": 772 + }, + { + "entropy": 2.47265625, + "epoch": 0.013308884928936063, + "grad_norm": 0.5909532904624939, + "learning_rate": 7.72e-06, + "loss": 2.4344, + "mean_token_accuracy": 0.48116163862869143, + "num_tokens": 112047691.0, + "step": 773 + }, + { + "entropy": 2.4112548828125, + "epoch": 0.013326102115131324, + "grad_norm": 0.6000514626502991, + "learning_rate": 7.73e-06, + "loss": 2.3594, + "mean_token_accuracy": 0.4857400543987751, + "num_tokens": 112185378.0, + "step": 774 + }, + { + "entropy": 2.41943359375, + "epoch": 0.013343319301326585, + "grad_norm": 0.586081326007843, + "learning_rate": 7.74e-06, + "loss": 2.3978, + "mean_token_accuracy": 0.4904695344157517, + "num_tokens": 112341351.0, + "step": 775 + }, + { + "entropy": 2.4359130859375, + "epoch": 0.013360536487521845, + "grad_norm": 0.5683102607727051, + "learning_rate": 7.75e-06, + "loss": 2.3903, + "mean_token_accuracy": 0.4875342110171914, + "num_tokens": 112487164.0, + "step": 776 + }, + { + "entropy": 2.4697265625, + "epoch": 0.013377753673717104, + "grad_norm": 0.5665966272354126, + "learning_rate": 7.76e-06, + "loss": 2.4298, + "mean_token_accuracy": 0.47403282299637794, + "num_tokens": 112629741.0, + "step": 777 + }, + { + "entropy": 2.45703125, + "epoch": 0.013394970859912364, + "grad_norm": 0.5655020475387573, + "learning_rate": 7.77e-06, + "loss": 2.402, + "mean_token_accuracy": 0.48069310747087, + "num_tokens": 112767619.0, + "step": 778 + }, + { + "entropy": 2.48095703125, + "epoch": 0.013412188046107625, + "grad_norm": 0.5934699177742004, + "learning_rate": 7.78e-06, + "loss": 2.4576, + "mean_token_accuracy": 0.4850818943232298, + "num_tokens": 112913870.0, + "step": 779 + }, + { + "entropy": 2.4310302734375, + "epoch": 0.013429405232302885, + "grad_norm": 0.7897031307220459, + "learning_rate": 7.790000000000002e-06, + "loss": 2.3135, + "mean_token_accuracy": 0.4851987957954407, + "num_tokens": 113047948.0, + "step": 780 + }, + { + "entropy": 2.3919677734375, + "epoch": 0.013446622418498144, + "grad_norm": 0.5672735571861267, + "learning_rate": 7.800000000000002e-06, + "loss": 2.3579, + "mean_token_accuracy": 0.48478852584958076, + "num_tokens": 113210948.0, + "step": 781 + }, + { + "entropy": 2.4381103515625, + "epoch": 0.013463839604693405, + "grad_norm": 0.6065455079078674, + "learning_rate": 7.810000000000001e-06, + "loss": 2.3854, + "mean_token_accuracy": 0.4848202792927623, + "num_tokens": 113361592.0, + "step": 782 + }, + { + "entropy": 2.4210205078125, + "epoch": 0.013481056790888665, + "grad_norm": 0.5307328701019287, + "learning_rate": 7.820000000000001e-06, + "loss": 2.3612, + "mean_token_accuracy": 0.4887157790362835, + "num_tokens": 113524797.0, + "step": 783 + }, + { + "entropy": 2.4393310546875, + "epoch": 0.013498273977083926, + "grad_norm": 0.6069521307945251, + "learning_rate": 7.830000000000001e-06, + "loss": 2.4034, + "mean_token_accuracy": 0.4795961854979396, + "num_tokens": 113659620.0, + "step": 784 + }, + { + "entropy": 2.443603515625, + "epoch": 0.013515491163279185, + "grad_norm": 0.5770717263221741, + "learning_rate": 7.840000000000001e-06, + "loss": 2.4369, + "mean_token_accuracy": 0.48013802990317345, + "num_tokens": 113803890.0, + "step": 785 + }, + { + "entropy": 2.4228515625, + "epoch": 0.013532708349474445, + "grad_norm": 0.6027282476425171, + "learning_rate": 7.850000000000001e-06, + "loss": 2.406, + "mean_token_accuracy": 0.48077098093926907, + "num_tokens": 113948731.0, + "step": 786 + }, + { + "entropy": 2.439208984375, + "epoch": 0.013549925535669706, + "grad_norm": 0.6242732405662537, + "learning_rate": 7.860000000000001e-06, + "loss": 2.3891, + "mean_token_accuracy": 0.4767256425693631, + "num_tokens": 114095401.0, + "step": 787 + }, + { + "entropy": 2.507568359375, + "epoch": 0.013567142721864966, + "grad_norm": 0.5736910104751587, + "learning_rate": 7.870000000000001e-06, + "loss": 2.4509, + "mean_token_accuracy": 0.4732041200622916, + "num_tokens": 114245869.0, + "step": 788 + }, + { + "entropy": 2.485595703125, + "epoch": 0.013584359908060225, + "grad_norm": 0.5609222054481506, + "learning_rate": 7.88e-06, + "loss": 2.4312, + "mean_token_accuracy": 0.4742095875553787, + "num_tokens": 114388858.0, + "step": 789 + }, + { + "entropy": 2.56689453125, + "epoch": 0.013601577094255485, + "grad_norm": 0.5870925784111023, + "learning_rate": 7.89e-06, + "loss": 2.5389, + "mean_token_accuracy": 0.46493083937093616, + "num_tokens": 114523322.0, + "step": 790 + }, + { + "entropy": 2.49267578125, + "epoch": 0.013618794280450746, + "grad_norm": 0.5677596926689148, + "learning_rate": 7.9e-06, + "loss": 2.4385, + "mean_token_accuracy": 0.4769080653786659, + "num_tokens": 114663505.0, + "step": 791 + }, + { + "entropy": 2.485595703125, + "epoch": 0.013636011466646007, + "grad_norm": 0.5597982406616211, + "learning_rate": 7.91e-06, + "loss": 2.4263, + "mean_token_accuracy": 0.4761881032027304, + "num_tokens": 114813922.0, + "step": 792 + }, + { + "entropy": 2.416259765625, + "epoch": 0.013653228652841265, + "grad_norm": 0.5396745204925537, + "learning_rate": 7.92e-06, + "loss": 2.3336, + "mean_token_accuracy": 0.49072846584022045, + "num_tokens": 114962105.0, + "step": 793 + }, + { + "entropy": 2.3951416015625, + "epoch": 0.013670445839036526, + "grad_norm": 0.6175518035888672, + "learning_rate": 7.93e-06, + "loss": 2.339, + "mean_token_accuracy": 0.4950829269364476, + "num_tokens": 115103571.0, + "step": 794 + }, + { + "entropy": 2.4580078125, + "epoch": 0.013687663025231786, + "grad_norm": 0.5724807977676392, + "learning_rate": 7.94e-06, + "loss": 2.3744, + "mean_token_accuracy": 0.47975975926965475, + "num_tokens": 115243719.0, + "step": 795 + }, + { + "entropy": 2.5086669921875, + "epoch": 0.013704880211427047, + "grad_norm": 0.5798308849334717, + "learning_rate": 7.950000000000002e-06, + "loss": 2.4594, + "mean_token_accuracy": 0.4762448235414922, + "num_tokens": 115383378.0, + "step": 796 + }, + { + "entropy": 2.4212646484375, + "epoch": 0.013722097397622307, + "grad_norm": 0.5839712023735046, + "learning_rate": 7.960000000000002e-06, + "loss": 2.3884, + "mean_token_accuracy": 0.48001448903232813, + "num_tokens": 115527648.0, + "step": 797 + }, + { + "entropy": 2.388671875, + "epoch": 0.013739314583817566, + "grad_norm": 0.630893349647522, + "learning_rate": 7.970000000000002e-06, + "loss": 2.3391, + "mean_token_accuracy": 0.48810708662495017, + "num_tokens": 115673489.0, + "step": 798 + }, + { + "entropy": 2.475341796875, + "epoch": 0.013756531770012827, + "grad_norm": 0.590587854385376, + "learning_rate": 7.980000000000002e-06, + "loss": 2.4425, + "mean_token_accuracy": 0.4739443711005151, + "num_tokens": 115814268.0, + "step": 799 + }, + { + "entropy": 2.380615234375, + "epoch": 0.013773748956208087, + "grad_norm": 0.566112220287323, + "learning_rate": 7.990000000000001e-06, + "loss": 2.3345, + "mean_token_accuracy": 0.49479513335973024, + "num_tokens": 115962087.0, + "step": 800 + }, + { + "entropy": 2.53515625, + "epoch": 0.013790966142403348, + "grad_norm": 0.5729132890701294, + "learning_rate": 8.000000000000001e-06, + "loss": 2.478, + "mean_token_accuracy": 0.46622643573209643, + "num_tokens": 116096894.0, + "step": 801 + }, + { + "entropy": 2.43701171875, + "epoch": 0.013808183328598607, + "grad_norm": 0.5780159831047058, + "learning_rate": 8.010000000000001e-06, + "loss": 2.4245, + "mean_token_accuracy": 0.48138847574591637, + "num_tokens": 116248222.0, + "step": 802 + }, + { + "entropy": 2.44482421875, + "epoch": 0.013825400514793867, + "grad_norm": 0.5970706343650818, + "learning_rate": 8.020000000000001e-06, + "loss": 2.4229, + "mean_token_accuracy": 0.4804395758546889, + "num_tokens": 116405644.0, + "step": 803 + }, + { + "entropy": 2.43505859375, + "epoch": 0.013842617700989128, + "grad_norm": 0.5705693364143372, + "learning_rate": 8.030000000000001e-06, + "loss": 2.3441, + "mean_token_accuracy": 0.48369065998122096, + "num_tokens": 116548738.0, + "step": 804 + }, + { + "entropy": 2.4342041015625, + "epoch": 0.013859834887184388, + "grad_norm": 0.5916334390640259, + "learning_rate": 8.040000000000001e-06, + "loss": 2.3664, + "mean_token_accuracy": 0.4856993416324258, + "num_tokens": 116691888.0, + "step": 805 + }, + { + "entropy": 2.4691162109375, + "epoch": 0.013877052073379647, + "grad_norm": 0.579868495464325, + "learning_rate": 8.050000000000001e-06, + "loss": 2.3724, + "mean_token_accuracy": 0.4843080313876271, + "num_tokens": 116830242.0, + "step": 806 + }, + { + "entropy": 2.40966796875, + "epoch": 0.013894269259574907, + "grad_norm": 0.5866817831993103, + "learning_rate": 8.06e-06, + "loss": 2.3801, + "mean_token_accuracy": 0.48936720937490463, + "num_tokens": 116965900.0, + "step": 807 + }, + { + "entropy": 2.4056396484375, + "epoch": 0.013911486445770168, + "grad_norm": 0.5471423864364624, + "learning_rate": 8.07e-06, + "loss": 2.3545, + "mean_token_accuracy": 0.4835878200829029, + "num_tokens": 117118489.0, + "step": 808 + }, + { + "entropy": 2.42529296875, + "epoch": 0.013928703631965429, + "grad_norm": 0.577364981174469, + "learning_rate": 8.08e-06, + "loss": 2.3581, + "mean_token_accuracy": 0.48418712290003896, + "num_tokens": 117264043.0, + "step": 809 + }, + { + "entropy": 2.43310546875, + "epoch": 0.013945920818160687, + "grad_norm": 0.5444287657737732, + "learning_rate": 8.09e-06, + "loss": 2.4178, + "mean_token_accuracy": 0.476511531509459, + "num_tokens": 117419082.0, + "step": 810 + }, + { + "entropy": 2.435302734375, + "epoch": 0.013963138004355948, + "grad_norm": 0.6165118217468262, + "learning_rate": 8.1e-06, + "loss": 2.4043, + "mean_token_accuracy": 0.48205558583140373, + "num_tokens": 117552496.0, + "step": 811 + }, + { + "entropy": 2.462158203125, + "epoch": 0.013980355190551208, + "grad_norm": 0.5636385679244995, + "learning_rate": 8.110000000000002e-06, + "loss": 2.4252, + "mean_token_accuracy": 0.4745056303218007, + "num_tokens": 117702578.0, + "step": 812 + }, + { + "entropy": 2.542724609375, + "epoch": 0.013997572376746469, + "grad_norm": 0.5832167863845825, + "learning_rate": 8.120000000000002e-06, + "loss": 2.5385, + "mean_token_accuracy": 0.4675266365520656, + "num_tokens": 117850701.0, + "step": 813 + }, + { + "entropy": 2.489990234375, + "epoch": 0.014014789562941728, + "grad_norm": 0.5839970111846924, + "learning_rate": 8.13e-06, + "loss": 2.4495, + "mean_token_accuracy": 0.47388132382184267, + "num_tokens": 118001878.0, + "step": 814 + }, + { + "entropy": 2.429931640625, + "epoch": 0.014032006749136988, + "grad_norm": 0.6004050970077515, + "learning_rate": 8.14e-06, + "loss": 2.3689, + "mean_token_accuracy": 0.48473711824044585, + "num_tokens": 118146781.0, + "step": 815 + }, + { + "entropy": 2.517333984375, + "epoch": 0.014049223935332249, + "grad_norm": 0.5721548795700073, + "learning_rate": 8.15e-06, + "loss": 2.5005, + "mean_token_accuracy": 0.47145770117640495, + "num_tokens": 118281715.0, + "step": 816 + }, + { + "entropy": 2.41259765625, + "epoch": 0.01406644112152751, + "grad_norm": 0.6043224930763245, + "learning_rate": 8.16e-06, + "loss": 2.3785, + "mean_token_accuracy": 0.48464843491092324, + "num_tokens": 118420852.0, + "step": 817 + }, + { + "entropy": 2.46435546875, + "epoch": 0.014083658307722768, + "grad_norm": 0.5578818321228027, + "learning_rate": 8.17e-06, + "loss": 2.4389, + "mean_token_accuracy": 0.4739728611893952, + "num_tokens": 118561047.0, + "step": 818 + }, + { + "entropy": 2.46435546875, + "epoch": 0.014100875493918029, + "grad_norm": 0.5339409112930298, + "learning_rate": 8.18e-06, + "loss": 2.4303, + "mean_token_accuracy": 0.478867762722075, + "num_tokens": 118724226.0, + "step": 819 + }, + { + "entropy": 2.4332275390625, + "epoch": 0.014118092680113289, + "grad_norm": 0.5819604396820068, + "learning_rate": 8.19e-06, + "loss": 2.3808, + "mean_token_accuracy": 0.4880368346348405, + "num_tokens": 118869937.0, + "step": 820 + }, + { + "entropy": 2.465576171875, + "epoch": 0.01413530986630855, + "grad_norm": 0.571780800819397, + "learning_rate": 8.2e-06, + "loss": 2.4314, + "mean_token_accuracy": 0.4772975808009505, + "num_tokens": 119022262.0, + "step": 821 + }, + { + "entropy": 2.4794921875, + "epoch": 0.01415252705250381, + "grad_norm": 0.5831077694892883, + "learning_rate": 8.210000000000001e-06, + "loss": 2.4701, + "mean_token_accuracy": 0.47361723706126213, + "num_tokens": 119165669.0, + "step": 822 + }, + { + "entropy": 2.4208984375, + "epoch": 0.014169744238699069, + "grad_norm": 0.5717747211456299, + "learning_rate": 8.220000000000001e-06, + "loss": 2.3679, + "mean_token_accuracy": 0.48527460684999824, + "num_tokens": 119301111.0, + "step": 823 + }, + { + "entropy": 2.45947265625, + "epoch": 0.01418696142489433, + "grad_norm": 0.6002344489097595, + "learning_rate": 8.23e-06, + "loss": 2.3749, + "mean_token_accuracy": 0.48156877839937806, + "num_tokens": 119443072.0, + "step": 824 + }, + { + "entropy": 2.4619140625, + "epoch": 0.01420417861108959, + "grad_norm": 0.5510353446006775, + "learning_rate": 8.24e-06, + "loss": 2.4012, + "mean_token_accuracy": 0.47862795926630497, + "num_tokens": 119599588.0, + "step": 825 + }, + { + "entropy": 2.426513671875, + "epoch": 0.01422139579728485, + "grad_norm": 0.6013565063476562, + "learning_rate": 8.25e-06, + "loss": 2.345, + "mean_token_accuracy": 0.48915665224194527, + "num_tokens": 119745982.0, + "step": 826 + }, + { + "entropy": 2.506103515625, + "epoch": 0.01423861298348011, + "grad_norm": 0.5756415724754333, + "learning_rate": 8.26e-06, + "loss": 2.4707, + "mean_token_accuracy": 0.4742961646988988, + "num_tokens": 119883211.0, + "step": 827 + }, + { + "entropy": 2.427734375, + "epoch": 0.01425583016967537, + "grad_norm": 0.6262539625167847, + "learning_rate": 8.27e-06, + "loss": 2.3734, + "mean_token_accuracy": 0.4855869854800403, + "num_tokens": 120007564.0, + "step": 828 + }, + { + "entropy": 2.447998046875, + "epoch": 0.01427304735587063, + "grad_norm": 0.577958345413208, + "learning_rate": 8.28e-06, + "loss": 2.4136, + "mean_token_accuracy": 0.4838542784564197, + "num_tokens": 120147199.0, + "step": 829 + }, + { + "entropy": 2.4615478515625, + "epoch": 0.014290264542065891, + "grad_norm": 0.6253990530967712, + "learning_rate": 8.29e-06, + "loss": 2.4558, + "mean_token_accuracy": 0.4775591907091439, + "num_tokens": 120294626.0, + "step": 830 + }, + { + "entropy": 2.3575439453125, + "epoch": 0.01430748172826115, + "grad_norm": 0.5368080139160156, + "learning_rate": 8.3e-06, + "loss": 2.3092, + "mean_token_accuracy": 0.4943057978525758, + "num_tokens": 120451476.0, + "step": 831 + }, + { + "entropy": 2.447265625, + "epoch": 0.01432469891445641, + "grad_norm": 0.6116092801094055, + "learning_rate": 8.31e-06, + "loss": 2.4052, + "mean_token_accuracy": 0.47798394318670034, + "num_tokens": 120605020.0, + "step": 832 + }, + { + "entropy": 2.3782958984375, + "epoch": 0.01434191610065167, + "grad_norm": 0.5678116083145142, + "learning_rate": 8.32e-06, + "loss": 2.3034, + "mean_token_accuracy": 0.4946139776147902, + "num_tokens": 120752495.0, + "step": 833 + }, + { + "entropy": 2.363037109375, + "epoch": 0.014359133286846931, + "grad_norm": 0.5678343772888184, + "learning_rate": 8.33e-06, + "loss": 2.3141, + "mean_token_accuracy": 0.49313395330682397, + "num_tokens": 120900694.0, + "step": 834 + }, + { + "entropy": 2.4739990234375, + "epoch": 0.01437635047304219, + "grad_norm": 0.5975309014320374, + "learning_rate": 8.34e-06, + "loss": 2.4388, + "mean_token_accuracy": 0.4748355788178742, + "num_tokens": 121032159.0, + "step": 835 + }, + { + "entropy": 2.51806640625, + "epoch": 0.01439356765923745, + "grad_norm": 0.5592584013938904, + "learning_rate": 8.35e-06, + "loss": 2.4939, + "mean_token_accuracy": 0.4737435169517994, + "num_tokens": 121177988.0, + "step": 836 + }, + { + "entropy": 2.433837890625, + "epoch": 0.014410784845432711, + "grad_norm": 0.5620052218437195, + "learning_rate": 8.36e-06, + "loss": 2.3528, + "mean_token_accuracy": 0.484672705642879, + "num_tokens": 121333599.0, + "step": 837 + }, + { + "entropy": 2.4949951171875, + "epoch": 0.014428002031627972, + "grad_norm": 0.6018322706222534, + "learning_rate": 8.370000000000001e-06, + "loss": 2.4874, + "mean_token_accuracy": 0.471315645147115, + "num_tokens": 121483935.0, + "step": 838 + }, + { + "entropy": 2.4498291015625, + "epoch": 0.01444521921782323, + "grad_norm": 0.5906838774681091, + "learning_rate": 8.380000000000001e-06, + "loss": 2.4314, + "mean_token_accuracy": 0.48000403633341193, + "num_tokens": 121626405.0, + "step": 839 + }, + { + "entropy": 2.3984375, + "epoch": 0.014462436404018491, + "grad_norm": 0.6163339614868164, + "learning_rate": 8.390000000000001e-06, + "loss": 2.3408, + "mean_token_accuracy": 0.4879218554124236, + "num_tokens": 121755719.0, + "step": 840 + }, + { + "entropy": 2.465576171875, + "epoch": 0.014479653590213751, + "grad_norm": 0.6231170892715454, + "learning_rate": 8.400000000000001e-06, + "loss": 2.4589, + "mean_token_accuracy": 0.47292218124493957, + "num_tokens": 121876833.0, + "step": 841 + }, + { + "entropy": 2.4288330078125, + "epoch": 0.014496870776409012, + "grad_norm": 0.6095725893974304, + "learning_rate": 8.41e-06, + "loss": 2.3584, + "mean_token_accuracy": 0.4825339512899518, + "num_tokens": 122029110.0, + "step": 842 + }, + { + "entropy": 2.479248046875, + "epoch": 0.01451408796260427, + "grad_norm": 0.5682320594787598, + "learning_rate": 8.42e-06, + "loss": 2.4085, + "mean_token_accuracy": 0.47804021881893277, + "num_tokens": 122177885.0, + "step": 843 + }, + { + "entropy": 2.4327392578125, + "epoch": 0.014531305148799531, + "grad_norm": 0.5729749202728271, + "learning_rate": 8.43e-06, + "loss": 2.365, + "mean_token_accuracy": 0.4850345575250685, + "num_tokens": 122326681.0, + "step": 844 + }, + { + "entropy": 2.3778076171875, + "epoch": 0.014548522334994792, + "grad_norm": 0.5983383059501648, + "learning_rate": 8.44e-06, + "loss": 2.3205, + "mean_token_accuracy": 0.4961178773082793, + "num_tokens": 122468050.0, + "step": 845 + }, + { + "entropy": 2.498779296875, + "epoch": 0.014565739521190052, + "grad_norm": 0.5855973958969116, + "learning_rate": 8.45e-06, + "loss": 2.4899, + "mean_token_accuracy": 0.4722827118821442, + "num_tokens": 122605022.0, + "step": 846 + }, + { + "entropy": 2.51318359375, + "epoch": 0.014582956707385313, + "grad_norm": 0.7350115180015564, + "learning_rate": 8.46e-06, + "loss": 2.4897, + "mean_token_accuracy": 0.4719462259672582, + "num_tokens": 122756763.0, + "step": 847 + }, + { + "entropy": 2.453125, + "epoch": 0.014600173893580572, + "grad_norm": 0.5812132954597473, + "learning_rate": 8.47e-06, + "loss": 2.3954, + "mean_token_accuracy": 0.48041255166754127, + "num_tokens": 122903879.0, + "step": 848 + }, + { + "entropy": 2.479736328125, + "epoch": 0.014617391079775832, + "grad_norm": 0.5762138962745667, + "learning_rate": 8.48e-06, + "loss": 2.4433, + "mean_token_accuracy": 0.4698569756001234, + "num_tokens": 123041198.0, + "step": 849 + }, + { + "entropy": 2.420654296875, + "epoch": 0.014634608265971093, + "grad_norm": 0.5930407643318176, + "learning_rate": 8.49e-06, + "loss": 2.3483, + "mean_token_accuracy": 0.4922699723392725, + "num_tokens": 123183506.0, + "step": 850 + }, + { + "entropy": 2.3505859375, + "epoch": 0.014651825452166353, + "grad_norm": 0.5760442614555359, + "learning_rate": 8.5e-06, + "loss": 2.2918, + "mean_token_accuracy": 0.4991521080955863, + "num_tokens": 123324526.0, + "step": 851 + }, + { + "entropy": 2.53662109375, + "epoch": 0.014669042638361612, + "grad_norm": 0.6048057079315186, + "learning_rate": 8.51e-06, + "loss": 2.4955, + "mean_token_accuracy": 0.46434704307466745, + "num_tokens": 123453650.0, + "step": 852 + }, + { + "entropy": 2.4481201171875, + "epoch": 0.014686259824556873, + "grad_norm": 0.5807772874832153, + "learning_rate": 8.52e-06, + "loss": 2.4092, + "mean_token_accuracy": 0.4818004462867975, + "num_tokens": 123600203.0, + "step": 853 + }, + { + "entropy": 2.42041015625, + "epoch": 0.014703477010752133, + "grad_norm": 0.5653759837150574, + "learning_rate": 8.530000000000001e-06, + "loss": 2.3675, + "mean_token_accuracy": 0.4863137351348996, + "num_tokens": 123750776.0, + "step": 854 + }, + { + "entropy": 2.3917236328125, + "epoch": 0.014720694196947394, + "grad_norm": 0.5585445761680603, + "learning_rate": 8.540000000000001e-06, + "loss": 2.36, + "mean_token_accuracy": 0.4850917439907789, + "num_tokens": 123908995.0, + "step": 855 + }, + { + "entropy": 2.408203125, + "epoch": 0.014737911383142652, + "grad_norm": 0.6110696196556091, + "learning_rate": 8.550000000000001e-06, + "loss": 2.3376, + "mean_token_accuracy": 0.4934307490475476, + "num_tokens": 124048260.0, + "step": 856 + }, + { + "entropy": 2.4285888671875, + "epoch": 0.014755128569337913, + "grad_norm": 0.5494508147239685, + "learning_rate": 8.560000000000001e-06, + "loss": 2.3685, + "mean_token_accuracy": 0.485230874735862, + "num_tokens": 124195711.0, + "step": 857 + }, + { + "entropy": 2.49462890625, + "epoch": 0.014772345755533173, + "grad_norm": 0.5444778203964233, + "learning_rate": 8.570000000000001e-06, + "loss": 2.4763, + "mean_token_accuracy": 0.4732412826269865, + "num_tokens": 124345089.0, + "step": 858 + }, + { + "entropy": 2.4862060546875, + "epoch": 0.014789562941728434, + "grad_norm": 0.5632426738739014, + "learning_rate": 8.580000000000001e-06, + "loss": 2.447, + "mean_token_accuracy": 0.47163996985182166, + "num_tokens": 124491147.0, + "step": 859 + }, + { + "entropy": 2.492919921875, + "epoch": 0.014806780127923693, + "grad_norm": 0.5277729034423828, + "learning_rate": 8.59e-06, + "loss": 2.4818, + "mean_token_accuracy": 0.47168840002268553, + "num_tokens": 124653263.0, + "step": 860 + }, + { + "entropy": 2.5084228515625, + "epoch": 0.014823997314118953, + "grad_norm": 0.5971653461456299, + "learning_rate": 8.6e-06, + "loss": 2.4695, + "mean_token_accuracy": 0.4728867751546204, + "num_tokens": 124790799.0, + "step": 861 + }, + { + "entropy": 2.415771484375, + "epoch": 0.014841214500314214, + "grad_norm": 0.6263266801834106, + "learning_rate": 8.61e-06, + "loss": 2.3963, + "mean_token_accuracy": 0.4838090669363737, + "num_tokens": 124936295.0, + "step": 862 + }, + { + "entropy": 2.46142578125, + "epoch": 0.014858431686509474, + "grad_norm": 0.592215895652771, + "learning_rate": 8.62e-06, + "loss": 2.4003, + "mean_token_accuracy": 0.4809744218364358, + "num_tokens": 125067834.0, + "step": 863 + }, + { + "entropy": 2.4461669921875, + "epoch": 0.014875648872704733, + "grad_norm": 0.5886532664299011, + "learning_rate": 8.63e-06, + "loss": 2.4247, + "mean_token_accuracy": 0.48370860423892736, + "num_tokens": 125208751.0, + "step": 864 + }, + { + "entropy": 2.451904296875, + "epoch": 0.014892866058899994, + "grad_norm": 0.544989824295044, + "learning_rate": 8.64e-06, + "loss": 2.4477, + "mean_token_accuracy": 0.4746030508540571, + "num_tokens": 125362283.0, + "step": 865 + }, + { + "entropy": 2.47802734375, + "epoch": 0.014910083245095254, + "grad_norm": 0.5210621356964111, + "learning_rate": 8.65e-06, + "loss": 2.4684, + "mean_token_accuracy": 0.4717202326282859, + "num_tokens": 125535484.0, + "step": 866 + }, + { + "entropy": 2.4478759765625, + "epoch": 0.014927300431290515, + "grad_norm": 0.5993431806564331, + "learning_rate": 8.66e-06, + "loss": 2.3869, + "mean_token_accuracy": 0.47974208323284984, + "num_tokens": 125670144.0, + "step": 867 + }, + { + "entropy": 2.3865966796875, + "epoch": 0.014944517617485773, + "grad_norm": 0.5767236948013306, + "learning_rate": 8.67e-06, + "loss": 2.3517, + "mean_token_accuracy": 0.4869615617208183, + "num_tokens": 125807478.0, + "step": 868 + }, + { + "entropy": 2.526611328125, + "epoch": 0.014961734803681034, + "grad_norm": 0.6212364435195923, + "learning_rate": 8.68e-06, + "loss": 2.5197, + "mean_token_accuracy": 0.4605614240281284, + "num_tokens": 125927918.0, + "step": 869 + }, + { + "entropy": 2.4381103515625, + "epoch": 0.014978951989876295, + "grad_norm": 0.6073437929153442, + "learning_rate": 8.690000000000002e-06, + "loss": 2.3433, + "mean_token_accuracy": 0.4832576513290405, + "num_tokens": 126070123.0, + "step": 870 + }, + { + "entropy": 2.5057373046875, + "epoch": 0.014996169176071555, + "grad_norm": 0.5757139325141907, + "learning_rate": 8.700000000000001e-06, + "loss": 2.4574, + "mean_token_accuracy": 0.4752332870848477, + "num_tokens": 126221119.0, + "step": 871 + }, + { + "entropy": 2.47509765625, + "epoch": 0.015013386362266816, + "grad_norm": 0.5421658754348755, + "learning_rate": 8.710000000000001e-06, + "loss": 2.4344, + "mean_token_accuracy": 0.48030671663582325, + "num_tokens": 126376014.0, + "step": 872 + }, + { + "entropy": 2.404052734375, + "epoch": 0.015030603548462074, + "grad_norm": 0.6064536571502686, + "learning_rate": 8.720000000000001e-06, + "loss": 2.3335, + "mean_token_accuracy": 0.4924787334166467, + "num_tokens": 126511198.0, + "step": 873 + }, + { + "entropy": 2.507080078125, + "epoch": 0.015047820734657335, + "grad_norm": 0.5994257926940918, + "learning_rate": 8.730000000000001e-06, + "loss": 2.4807, + "mean_token_accuracy": 0.4721558247692883, + "num_tokens": 126647001.0, + "step": 874 + }, + { + "entropy": 2.5140380859375, + "epoch": 0.015065037920852595, + "grad_norm": 0.5920009016990662, + "learning_rate": 8.740000000000001e-06, + "loss": 2.479, + "mean_token_accuracy": 0.47505279863253236, + "num_tokens": 126795132.0, + "step": 875 + }, + { + "entropy": 2.4464111328125, + "epoch": 0.015082255107047856, + "grad_norm": 0.6077152490615845, + "learning_rate": 8.750000000000001e-06, + "loss": 2.4088, + "mean_token_accuracy": 0.48258367320522666, + "num_tokens": 126943201.0, + "step": 876 + }, + { + "entropy": 2.421630859375, + "epoch": 0.015099472293243115, + "grad_norm": 0.5878798365592957, + "learning_rate": 8.76e-06, + "loss": 2.3334, + "mean_token_accuracy": 0.48565139481797814, + "num_tokens": 127089107.0, + "step": 877 + }, + { + "entropy": 2.4451904296875, + "epoch": 0.015116689479438375, + "grad_norm": 0.6210505962371826, + "learning_rate": 8.77e-06, + "loss": 2.3817, + "mean_token_accuracy": 0.48540265718474984, + "num_tokens": 127229750.0, + "step": 878 + }, + { + "entropy": 2.404541015625, + "epoch": 0.015133906665633636, + "grad_norm": 0.6188786029815674, + "learning_rate": 8.78e-06, + "loss": 2.3565, + "mean_token_accuracy": 0.4887020909227431, + "num_tokens": 127362276.0, + "step": 879 + }, + { + "entropy": 2.43212890625, + "epoch": 0.015151123851828896, + "grad_norm": 0.587872326374054, + "learning_rate": 8.79e-06, + "loss": 2.4133, + "mean_token_accuracy": 0.48197965091094375, + "num_tokens": 127499990.0, + "step": 880 + }, + { + "entropy": 2.486572265625, + "epoch": 0.015168341038024155, + "grad_norm": 0.6304271817207336, + "learning_rate": 8.8e-06, + "loss": 2.4624, + "mean_token_accuracy": 0.47782522393390536, + "num_tokens": 127642780.0, + "step": 881 + }, + { + "entropy": 2.51318359375, + "epoch": 0.015185558224219416, + "grad_norm": 0.5970308780670166, + "learning_rate": 8.81e-06, + "loss": 2.4563, + "mean_token_accuracy": 0.47266413597390056, + "num_tokens": 127792589.0, + "step": 882 + }, + { + "entropy": 2.486083984375, + "epoch": 0.015202775410414676, + "grad_norm": 0.558076024055481, + "learning_rate": 8.82e-06, + "loss": 2.4215, + "mean_token_accuracy": 0.4797168541699648, + "num_tokens": 127940654.0, + "step": 883 + }, + { + "entropy": 2.39892578125, + "epoch": 0.015219992596609937, + "grad_norm": 0.569231390953064, + "learning_rate": 8.83e-06, + "loss": 2.3439, + "mean_token_accuracy": 0.48988029221072793, + "num_tokens": 128082621.0, + "step": 884 + }, + { + "entropy": 2.4755859375, + "epoch": 0.015237209782805195, + "grad_norm": 0.5827202796936035, + "learning_rate": 8.84e-06, + "loss": 2.4142, + "mean_token_accuracy": 0.4810488768853247, + "num_tokens": 128215498.0, + "step": 885 + }, + { + "entropy": 2.5076904296875, + "epoch": 0.015254426969000456, + "grad_norm": 0.5866194367408752, + "learning_rate": 8.85e-06, + "loss": 2.4625, + "mean_token_accuracy": 0.4744989378377795, + "num_tokens": 128353838.0, + "step": 886 + }, + { + "entropy": 2.479736328125, + "epoch": 0.015271644155195717, + "grad_norm": 0.5743751525878906, + "learning_rate": 8.860000000000002e-06, + "loss": 2.4037, + "mean_token_accuracy": 0.4828067999333143, + "num_tokens": 128499102.0, + "step": 887 + }, + { + "entropy": 2.485595703125, + "epoch": 0.015288861341390977, + "grad_norm": 0.5429300665855408, + "learning_rate": 8.870000000000001e-06, + "loss": 2.4312, + "mean_token_accuracy": 0.4705997440032661, + "num_tokens": 128658894.0, + "step": 888 + }, + { + "entropy": 2.4361572265625, + "epoch": 0.015306078527586236, + "grad_norm": 0.5508224368095398, + "learning_rate": 8.880000000000001e-06, + "loss": 2.4386, + "mean_token_accuracy": 0.48232552874833345, + "num_tokens": 128803595.0, + "step": 889 + }, + { + "entropy": 2.453369140625, + "epoch": 0.015323295713781496, + "grad_norm": 0.5834963917732239, + "learning_rate": 8.890000000000001e-06, + "loss": 2.4298, + "mean_token_accuracy": 0.4826282048597932, + "num_tokens": 128939226.0, + "step": 890 + }, + { + "entropy": 2.508056640625, + "epoch": 0.015340512899976757, + "grad_norm": 0.5773516297340393, + "learning_rate": 8.900000000000001e-06, + "loss": 2.4476, + "mean_token_accuracy": 0.4731542756780982, + "num_tokens": 129073231.0, + "step": 891 + }, + { + "entropy": 2.526611328125, + "epoch": 0.015357730086172017, + "grad_norm": 0.6158662438392639, + "learning_rate": 8.910000000000001e-06, + "loss": 2.5126, + "mean_token_accuracy": 0.47055464377626777, + "num_tokens": 129212971.0, + "step": 892 + }, + { + "entropy": 2.49365234375, + "epoch": 0.015374947272367276, + "grad_norm": 0.571234941482544, + "learning_rate": 8.920000000000001e-06, + "loss": 2.4735, + "mean_token_accuracy": 0.47626718133687973, + "num_tokens": 129364837.0, + "step": 893 + }, + { + "entropy": 2.50927734375, + "epoch": 0.015392164458562537, + "grad_norm": 0.5566508173942566, + "learning_rate": 8.930000000000001e-06, + "loss": 2.4622, + "mean_token_accuracy": 0.475893325638026, + "num_tokens": 129513090.0, + "step": 894 + }, + { + "entropy": 2.4493408203125, + "epoch": 0.015409381644757797, + "grad_norm": 0.6352636814117432, + "learning_rate": 8.94e-06, + "loss": 2.4011, + "mean_token_accuracy": 0.48302224883809686, + "num_tokens": 129639016.0, + "step": 895 + }, + { + "entropy": 2.4149169921875, + "epoch": 0.015426598830953058, + "grad_norm": 0.6071867942810059, + "learning_rate": 8.95e-06, + "loss": 2.3388, + "mean_token_accuracy": 0.48970347130671144, + "num_tokens": 129775367.0, + "step": 896 + }, + { + "entropy": 2.477783203125, + "epoch": 0.015443816017148317, + "grad_norm": 0.6278470754623413, + "learning_rate": 8.96e-06, + "loss": 2.475, + "mean_token_accuracy": 0.4722928828559816, + "num_tokens": 129899393.0, + "step": 897 + }, + { + "entropy": 2.461669921875, + "epoch": 0.015461033203343577, + "grad_norm": 0.5411078929901123, + "learning_rate": 8.97e-06, + "loss": 2.4289, + "mean_token_accuracy": 0.4794709859415889, + "num_tokens": 130049335.0, + "step": 898 + }, + { + "entropy": 2.46728515625, + "epoch": 0.015478250389538838, + "grad_norm": 0.6227301955223083, + "learning_rate": 8.98e-06, + "loss": 2.4675, + "mean_token_accuracy": 0.48002155777066946, + "num_tokens": 130191241.0, + "step": 899 + }, + { + "entropy": 2.513916015625, + "epoch": 0.015495467575734098, + "grad_norm": 0.553110659122467, + "learning_rate": 8.99e-06, + "loss": 2.4838, + "mean_token_accuracy": 0.47137136245146394, + "num_tokens": 130348446.0, + "step": 900 + }, + { + "entropy": 2.462890625, + "epoch": 0.015512684761929359, + "grad_norm": 0.596051037311554, + "learning_rate": 9e-06, + "loss": 2.4365, + "mean_token_accuracy": 0.48188950633630157, + "num_tokens": 130487415.0, + "step": 901 + }, + { + "entropy": 2.442138671875, + "epoch": 0.015529901948124617, + "grad_norm": 0.6585226058959961, + "learning_rate": 9.01e-06, + "loss": 2.4141, + "mean_token_accuracy": 0.4807861549779773, + "num_tokens": 130614941.0, + "step": 902 + }, + { + "entropy": 2.4937744140625, + "epoch": 0.015547119134319878, + "grad_norm": 0.6055911779403687, + "learning_rate": 9.020000000000002e-06, + "loss": 2.4229, + "mean_token_accuracy": 0.4837539931759238, + "num_tokens": 130754310.0, + "step": 903 + }, + { + "entropy": 2.43115234375, + "epoch": 0.015564336320515139, + "grad_norm": 0.5959157347679138, + "learning_rate": 9.030000000000002e-06, + "loss": 2.3802, + "mean_token_accuracy": 0.4859010446816683, + "num_tokens": 130906157.0, + "step": 904 + }, + { + "entropy": 2.461181640625, + "epoch": 0.015581553506710399, + "grad_norm": 0.5708449482917786, + "learning_rate": 9.040000000000002e-06, + "loss": 2.4052, + "mean_token_accuracy": 0.4815267431549728, + "num_tokens": 131059316.0, + "step": 905 + }, + { + "entropy": 2.448974609375, + "epoch": 0.015598770692905658, + "grad_norm": 0.577296257019043, + "learning_rate": 9.050000000000001e-06, + "loss": 2.3894, + "mean_token_accuracy": 0.4845569171011448, + "num_tokens": 131205522.0, + "step": 906 + }, + { + "entropy": 2.41845703125, + "epoch": 0.015615987879100918, + "grad_norm": 0.60146164894104, + "learning_rate": 9.060000000000001e-06, + "loss": 2.3876, + "mean_token_accuracy": 0.4812913998030126, + "num_tokens": 131352873.0, + "step": 907 + }, + { + "entropy": 2.4771728515625, + "epoch": 0.01563320506529618, + "grad_norm": 0.6240298748016357, + "learning_rate": 9.070000000000001e-06, + "loss": 2.4327, + "mean_token_accuracy": 0.4764832267537713, + "num_tokens": 131486068.0, + "step": 908 + }, + { + "entropy": 2.4599609375, + "epoch": 0.01565042225149144, + "grad_norm": 1.2479300498962402, + "learning_rate": 9.080000000000001e-06, + "loss": 2.4198, + "mean_token_accuracy": 0.47274223202839494, + "num_tokens": 131625641.0, + "step": 909 + }, + { + "entropy": 2.379638671875, + "epoch": 0.0156676394376867, + "grad_norm": 0.6223964095115662, + "learning_rate": 9.090000000000001e-06, + "loss": 2.3421, + "mean_token_accuracy": 0.4910487150773406, + "num_tokens": 131774897.0, + "step": 910 + }, + { + "entropy": 2.4305419921875, + "epoch": 0.01568485662388196, + "grad_norm": 0.6342669725418091, + "learning_rate": 9.100000000000001e-06, + "loss": 2.4116, + "mean_token_accuracy": 0.48323542159050703, + "num_tokens": 131918941.0, + "step": 911 + }, + { + "entropy": 2.3966064453125, + "epoch": 0.015702073810077218, + "grad_norm": 0.5977234244346619, + "learning_rate": 9.110000000000001e-06, + "loss": 2.3407, + "mean_token_accuracy": 0.4932427750900388, + "num_tokens": 132058255.0, + "step": 912 + }, + { + "entropy": 2.4349365234375, + "epoch": 0.015719290996272478, + "grad_norm": 0.5736342072486877, + "learning_rate": 9.12e-06, + "loss": 2.3943, + "mean_token_accuracy": 0.4837222811765969, + "num_tokens": 132197276.0, + "step": 913 + }, + { + "entropy": 2.4132080078125, + "epoch": 0.01573650818246774, + "grad_norm": 0.6241025924682617, + "learning_rate": 9.13e-06, + "loss": 2.3856, + "mean_token_accuracy": 0.48743872344493866, + "num_tokens": 132337325.0, + "step": 914 + }, + { + "entropy": 2.41748046875, + "epoch": 0.015753725368663, + "grad_norm": 0.6098312735557556, + "learning_rate": 9.14e-06, + "loss": 2.374, + "mean_token_accuracy": 0.4855101592838764, + "num_tokens": 132486321.0, + "step": 915 + }, + { + "entropy": 2.479248046875, + "epoch": 0.01577094255485826, + "grad_norm": 0.5596804022789001, + "learning_rate": 9.15e-06, + "loss": 2.3876, + "mean_token_accuracy": 0.47576976707205176, + "num_tokens": 132647149.0, + "step": 916 + }, + { + "entropy": 2.4208984375, + "epoch": 0.01578815974105352, + "grad_norm": 0.5681569576263428, + "learning_rate": 9.16e-06, + "loss": 2.3797, + "mean_token_accuracy": 0.48612832371145487, + "num_tokens": 132795123.0, + "step": 917 + }, + { + "entropy": 2.4420166015625, + "epoch": 0.01580537692724878, + "grad_norm": 0.6165553331375122, + "learning_rate": 9.17e-06, + "loss": 2.4223, + "mean_token_accuracy": 0.4839365719817579, + "num_tokens": 132930289.0, + "step": 918 + }, + { + "entropy": 2.458740234375, + "epoch": 0.01582259411344404, + "grad_norm": 0.6469439268112183, + "learning_rate": 9.180000000000002e-06, + "loss": 2.4158, + "mean_token_accuracy": 0.48115426022559404, + "num_tokens": 133057834.0, + "step": 919 + }, + { + "entropy": 2.4097900390625, + "epoch": 0.015839811299639298, + "grad_norm": 0.5866466164588928, + "learning_rate": 9.190000000000002e-06, + "loss": 2.3269, + "mean_token_accuracy": 0.48917456716299057, + "num_tokens": 133200644.0, + "step": 920 + }, + { + "entropy": 2.463623046875, + "epoch": 0.01585702848583456, + "grad_norm": 0.5503515601158142, + "learning_rate": 9.200000000000002e-06, + "loss": 2.4136, + "mean_token_accuracy": 0.478064242284745, + "num_tokens": 133352710.0, + "step": 921 + }, + { + "entropy": 2.4775390625, + "epoch": 0.01587424567202982, + "grad_norm": 0.5962862968444824, + "learning_rate": 9.210000000000002e-06, + "loss": 2.454, + "mean_token_accuracy": 0.47544111032038927, + "num_tokens": 133501140.0, + "step": 922 + }, + { + "entropy": 2.502197265625, + "epoch": 0.01589146285822508, + "grad_norm": 0.6619735360145569, + "learning_rate": 9.220000000000002e-06, + "loss": 2.4128, + "mean_token_accuracy": 0.47879257379099727, + "num_tokens": 133618064.0, + "step": 923 + }, + { + "entropy": 2.3917236328125, + "epoch": 0.01590868004442034, + "grad_norm": 0.6243993639945984, + "learning_rate": 9.230000000000001e-06, + "loss": 2.3601, + "mean_token_accuracy": 0.49083237862214446, + "num_tokens": 133746355.0, + "step": 924 + }, + { + "entropy": 2.4593505859375, + "epoch": 0.0159258972306156, + "grad_norm": 0.5959250926971436, + "learning_rate": 9.240000000000001e-06, + "loss": 2.3893, + "mean_token_accuracy": 0.487882892601192, + "num_tokens": 133887812.0, + "step": 925 + }, + { + "entropy": 2.4609375, + "epoch": 0.01594311441681086, + "grad_norm": 0.6122800707817078, + "learning_rate": 9.250000000000001e-06, + "loss": 2.432, + "mean_token_accuracy": 0.47797348722815514, + "num_tokens": 134038997.0, + "step": 926 + }, + { + "entropy": 2.46875, + "epoch": 0.015960331603006122, + "grad_norm": 0.547160267829895, + "learning_rate": 9.260000000000001e-06, + "loss": 2.4407, + "mean_token_accuracy": 0.47520409850403666, + "num_tokens": 134198302.0, + "step": 927 + }, + { + "entropy": 2.48974609375, + "epoch": 0.015977548789201382, + "grad_norm": 0.5570476651191711, + "learning_rate": 9.270000000000001e-06, + "loss": 2.4886, + "mean_token_accuracy": 0.47539124358445406, + "num_tokens": 134342027.0, + "step": 928 + }, + { + "entropy": 2.41064453125, + "epoch": 0.01599476597539664, + "grad_norm": 0.5551185011863708, + "learning_rate": 9.280000000000001e-06, + "loss": 2.3801, + "mean_token_accuracy": 0.48781104385852814, + "num_tokens": 134495434.0, + "step": 929 + }, + { + "entropy": 2.4661865234375, + "epoch": 0.0160119831615919, + "grad_norm": 0.6195770502090454, + "learning_rate": 9.29e-06, + "loss": 2.4388, + "mean_token_accuracy": 0.4786299457773566, + "num_tokens": 134626002.0, + "step": 930 + }, + { + "entropy": 2.4622802734375, + "epoch": 0.01602920034778716, + "grad_norm": 0.6120036244392395, + "learning_rate": 9.3e-06, + "loss": 2.4435, + "mean_token_accuracy": 0.47908638790249825, + "num_tokens": 134778634.0, + "step": 931 + }, + { + "entropy": 2.5621337890625, + "epoch": 0.01604641753398242, + "grad_norm": 0.5915412306785583, + "learning_rate": 9.31e-06, + "loss": 2.5866, + "mean_token_accuracy": 0.462941222358495, + "num_tokens": 134929621.0, + "step": 932 + }, + { + "entropy": 2.41796875, + "epoch": 0.01606363472017768, + "grad_norm": 0.5830768346786499, + "learning_rate": 9.32e-06, + "loss": 2.355, + "mean_token_accuracy": 0.4878848767839372, + "num_tokens": 135086585.0, + "step": 933 + }, + { + "entropy": 2.353759765625, + "epoch": 0.016080851906372942, + "grad_norm": 0.6241956949234009, + "learning_rate": 9.33e-06, + "loss": 2.3049, + "mean_token_accuracy": 0.49892251333221793, + "num_tokens": 135231782.0, + "step": 934 + }, + { + "entropy": 2.406005859375, + "epoch": 0.016098069092568203, + "grad_norm": 0.5853713750839233, + "learning_rate": 9.340000000000002e-06, + "loss": 2.4124, + "mean_token_accuracy": 0.4829076291061938, + "num_tokens": 135378790.0, + "step": 935 + }, + { + "entropy": 2.5213623046875, + "epoch": 0.016115286278763463, + "grad_norm": 0.5981717109680176, + "learning_rate": 9.350000000000002e-06, + "loss": 2.5113, + "mean_token_accuracy": 0.4728419524617493, + "num_tokens": 135520311.0, + "step": 936 + }, + { + "entropy": 2.5010986328125, + "epoch": 0.01613250346495872, + "grad_norm": 0.5917856693267822, + "learning_rate": 9.360000000000002e-06, + "loss": 2.4518, + "mean_token_accuracy": 0.46924112644046545, + "num_tokens": 135669776.0, + "step": 937 + }, + { + "entropy": 2.42724609375, + "epoch": 0.01614972065115398, + "grad_norm": 0.5660749673843384, + "learning_rate": 9.370000000000002e-06, + "loss": 2.3848, + "mean_token_accuracy": 0.4782256758771837, + "num_tokens": 135821760.0, + "step": 938 + }, + { + "entropy": 2.498779296875, + "epoch": 0.01616693783734924, + "grad_norm": 0.5981243252754211, + "learning_rate": 9.38e-06, + "loss": 2.4628, + "mean_token_accuracy": 0.4730893294326961, + "num_tokens": 135956326.0, + "step": 939 + }, + { + "entropy": 2.3973388671875, + "epoch": 0.016184155023544502, + "grad_norm": 0.580276608467102, + "learning_rate": 9.39e-06, + "loss": 2.3663, + "mean_token_accuracy": 0.4926355588249862, + "num_tokens": 136101164.0, + "step": 940 + }, + { + "entropy": 2.470947265625, + "epoch": 0.016201372209739762, + "grad_norm": 0.5627194046974182, + "learning_rate": 9.4e-06, + "loss": 2.416, + "mean_token_accuracy": 0.4773054295219481, + "num_tokens": 136251505.0, + "step": 941 + }, + { + "entropy": 2.519775390625, + "epoch": 0.016218589395935023, + "grad_norm": 0.6778742671012878, + "learning_rate": 9.41e-06, + "loss": 2.4862, + "mean_token_accuracy": 0.4745273180305958, + "num_tokens": 136391079.0, + "step": 942 + }, + { + "entropy": 2.4498291015625, + "epoch": 0.016235806582130283, + "grad_norm": 0.582788348197937, + "learning_rate": 9.42e-06, + "loss": 2.3946, + "mean_token_accuracy": 0.48375819250941277, + "num_tokens": 136540299.0, + "step": 943 + }, + { + "entropy": 2.4471435546875, + "epoch": 0.016253023768325544, + "grad_norm": 0.5528161525726318, + "learning_rate": 9.43e-06, + "loss": 2.3405, + "mean_token_accuracy": 0.47900286270305514, + "num_tokens": 136688027.0, + "step": 944 + }, + { + "entropy": 2.429931640625, + "epoch": 0.0162702409545208, + "grad_norm": 0.6193938851356506, + "learning_rate": 9.440000000000001e-06, + "loss": 2.3861, + "mean_token_accuracy": 0.4864586624316871, + "num_tokens": 136823997.0, + "step": 945 + }, + { + "entropy": 2.4169921875, + "epoch": 0.01628745814071606, + "grad_norm": 0.5639959573745728, + "learning_rate": 9.450000000000001e-06, + "loss": 2.3965, + "mean_token_accuracy": 0.4871432662475854, + "num_tokens": 136967987.0, + "step": 946 + }, + { + "entropy": 2.4097900390625, + "epoch": 0.016304675326911322, + "grad_norm": 0.593999981880188, + "learning_rate": 9.460000000000001e-06, + "loss": 2.3956, + "mean_token_accuracy": 0.48685387754812837, + "num_tokens": 137120260.0, + "step": 947 + }, + { + "entropy": 2.4072265625, + "epoch": 0.016321892513106583, + "grad_norm": 0.5643085837364197, + "learning_rate": 9.47e-06, + "loss": 2.387, + "mean_token_accuracy": 0.4881442333571613, + "num_tokens": 137280017.0, + "step": 948 + }, + { + "entropy": 2.474853515625, + "epoch": 0.016339109699301843, + "grad_norm": 0.602178156375885, + "learning_rate": 9.48e-06, + "loss": 2.4429, + "mean_token_accuracy": 0.4794820128008723, + "num_tokens": 137431493.0, + "step": 949 + }, + { + "entropy": 2.43505859375, + "epoch": 0.016356326885497104, + "grad_norm": 0.5669793486595154, + "learning_rate": 9.49e-06, + "loss": 2.4303, + "mean_token_accuracy": 0.4830532097257674, + "num_tokens": 137575150.0, + "step": 950 + }, + { + "entropy": 2.4486083984375, + "epoch": 0.016373544071692364, + "grad_norm": 0.5908523201942444, + "learning_rate": 9.5e-06, + "loss": 2.4052, + "mean_token_accuracy": 0.4807011899538338, + "num_tokens": 137718243.0, + "step": 951 + }, + { + "entropy": 2.466552734375, + "epoch": 0.016390761257887625, + "grad_norm": 0.5942707061767578, + "learning_rate": 9.51e-06, + "loss": 2.4347, + "mean_token_accuracy": 0.47939926059916615, + "num_tokens": 137855181.0, + "step": 952 + }, + { + "entropy": 2.43359375, + "epoch": 0.016407978444082885, + "grad_norm": 0.6280327439308167, + "learning_rate": 9.52e-06, + "loss": 2.4106, + "mean_token_accuracy": 0.48289193166419864, + "num_tokens": 137984833.0, + "step": 953 + }, + { + "entropy": 2.4107666015625, + "epoch": 0.016425195630278142, + "grad_norm": 0.6055025458335876, + "learning_rate": 9.53e-06, + "loss": 2.3538, + "mean_token_accuracy": 0.48937113396823406, + "num_tokens": 138132043.0, + "step": 954 + }, + { + "entropy": 2.403076171875, + "epoch": 0.016442412816473403, + "grad_norm": 0.557295024394989, + "learning_rate": 9.54e-06, + "loss": 2.3975, + "mean_token_accuracy": 0.48123540729284286, + "num_tokens": 138295818.0, + "step": 955 + }, + { + "entropy": 2.475830078125, + "epoch": 0.016459630002668663, + "grad_norm": 0.6141370534896851, + "learning_rate": 9.55e-06, + "loss": 2.4164, + "mean_token_accuracy": 0.47790815867483616, + "num_tokens": 138437598.0, + "step": 956 + }, + { + "entropy": 2.3380126953125, + "epoch": 0.016476847188863924, + "grad_norm": 0.5739216208457947, + "learning_rate": 9.56e-06, + "loss": 2.277, + "mean_token_accuracy": 0.5020720390602946, + "num_tokens": 138579286.0, + "step": 957 + }, + { + "entropy": 2.482177734375, + "epoch": 0.016494064375059184, + "grad_norm": 0.5679910778999329, + "learning_rate": 9.57e-06, + "loss": 2.4916, + "mean_token_accuracy": 0.4723518299870193, + "num_tokens": 138737250.0, + "step": 958 + }, + { + "entropy": 2.428466796875, + "epoch": 0.016511281561254445, + "grad_norm": 0.5608857274055481, + "learning_rate": 9.58e-06, + "loss": 2.3651, + "mean_token_accuracy": 0.48081721225753427, + "num_tokens": 138887535.0, + "step": 959 + }, + { + "entropy": 2.45068359375, + "epoch": 0.016528498747449705, + "grad_norm": 0.5733979344367981, + "learning_rate": 9.59e-06, + "loss": 2.3793, + "mean_token_accuracy": 0.4807218159548938, + "num_tokens": 139046347.0, + "step": 960 + }, + { + "entropy": 2.4381103515625, + "epoch": 0.016545715933644966, + "grad_norm": 0.5601542592048645, + "learning_rate": 9.600000000000001e-06, + "loss": 2.3687, + "mean_token_accuracy": 0.48160395165905356, + "num_tokens": 139201178.0, + "step": 961 + }, + { + "entropy": 2.4107666015625, + "epoch": 0.016562933119840223, + "grad_norm": 0.5981694459915161, + "learning_rate": 9.610000000000001e-06, + "loss": 2.3668, + "mean_token_accuracy": 0.48300402937456965, + "num_tokens": 139337844.0, + "step": 962 + }, + { + "entropy": 2.4951171875, + "epoch": 0.016580150306035484, + "grad_norm": 0.5836125016212463, + "learning_rate": 9.620000000000001e-06, + "loss": 2.4706, + "mean_token_accuracy": 0.467432489618659, + "num_tokens": 139469993.0, + "step": 963 + }, + { + "entropy": 2.443603515625, + "epoch": 0.016597367492230744, + "grad_norm": 0.5777595043182373, + "learning_rate": 9.630000000000001e-06, + "loss": 2.3732, + "mean_token_accuracy": 0.48406707495450974, + "num_tokens": 139628918.0, + "step": 964 + }, + { + "entropy": 2.448974609375, + "epoch": 0.016614584678426005, + "grad_norm": 0.5910548567771912, + "learning_rate": 9.640000000000001e-06, + "loss": 2.4025, + "mean_token_accuracy": 0.48476587794721127, + "num_tokens": 139779064.0, + "step": 965 + }, + { + "entropy": 2.4569091796875, + "epoch": 0.016631801864621265, + "grad_norm": 0.6000712513923645, + "learning_rate": 9.65e-06, + "loss": 2.4165, + "mean_token_accuracy": 0.4791805609129369, + "num_tokens": 139944530.0, + "step": 966 + }, + { + "entropy": 2.45849609375, + "epoch": 0.016649019050816526, + "grad_norm": 0.5988768935203552, + "learning_rate": 9.66e-06, + "loss": 2.3822, + "mean_token_accuracy": 0.4866671049967408, + "num_tokens": 140083921.0, + "step": 967 + }, + { + "entropy": 2.46728515625, + "epoch": 0.016666236237011786, + "grad_norm": 0.5854329466819763, + "learning_rate": 9.67e-06, + "loss": 2.3907, + "mean_token_accuracy": 0.47991981683298945, + "num_tokens": 140224971.0, + "step": 968 + }, + { + "entropy": 2.533447265625, + "epoch": 0.016683453423207047, + "grad_norm": 0.6449822187423706, + "learning_rate": 9.68e-06, + "loss": 2.5346, + "mean_token_accuracy": 0.4715711669996381, + "num_tokens": 140353765.0, + "step": 969 + }, + { + "entropy": 2.4517822265625, + "epoch": 0.016700670609402304, + "grad_norm": 0.562483549118042, + "learning_rate": 9.69e-06, + "loss": 2.4227, + "mean_token_accuracy": 0.47965225437656045, + "num_tokens": 140498830.0, + "step": 970 + }, + { + "entropy": 2.4345703125, + "epoch": 0.016717887795597564, + "grad_norm": 0.5308300852775574, + "learning_rate": 9.7e-06, + "loss": 2.4248, + "mean_token_accuracy": 0.47759495256468654, + "num_tokens": 140665634.0, + "step": 971 + }, + { + "entropy": 2.492919921875, + "epoch": 0.016735104981792825, + "grad_norm": 0.9326145052909851, + "learning_rate": 9.71e-06, + "loss": 2.4872, + "mean_token_accuracy": 0.468785522505641, + "num_tokens": 140802529.0, + "step": 972 + }, + { + "entropy": 2.37744140625, + "epoch": 0.016752322167988085, + "grad_norm": 0.608925998210907, + "learning_rate": 9.72e-06, + "loss": 2.3317, + "mean_token_accuracy": 0.4957911465317011, + "num_tokens": 140941381.0, + "step": 973 + }, + { + "entropy": 2.4649658203125, + "epoch": 0.016769539354183346, + "grad_norm": 0.5866920948028564, + "learning_rate": 9.73e-06, + "loss": 2.4661, + "mean_token_accuracy": 0.4737902185879648, + "num_tokens": 141074013.0, + "step": 974 + }, + { + "entropy": 2.52001953125, + "epoch": 0.016786756540378606, + "grad_norm": 0.6548328399658203, + "learning_rate": 9.74e-06, + "loss": 2.5289, + "mean_token_accuracy": 0.4689825112000108, + "num_tokens": 141211732.0, + "step": 975 + }, + { + "entropy": 2.457275390625, + "epoch": 0.016803973726573867, + "grad_norm": 0.5622747540473938, + "learning_rate": 9.75e-06, + "loss": 2.4627, + "mean_token_accuracy": 0.474064817186445, + "num_tokens": 141361499.0, + "step": 976 + }, + { + "entropy": 2.4368896484375, + "epoch": 0.016821190912769127, + "grad_norm": 0.5637742280960083, + "learning_rate": 9.760000000000001e-06, + "loss": 2.4061, + "mean_token_accuracy": 0.4773513269610703, + "num_tokens": 141516957.0, + "step": 977 + }, + { + "entropy": 2.357666015625, + "epoch": 0.016838408098964388, + "grad_norm": 0.6168224215507507, + "learning_rate": 9.770000000000001e-06, + "loss": 2.3441, + "mean_token_accuracy": 0.4971775123849511, + "num_tokens": 141659725.0, + "step": 978 + }, + { + "entropy": 2.482666015625, + "epoch": 0.016855625285159645, + "grad_norm": 0.5854248404502869, + "learning_rate": 9.780000000000001e-06, + "loss": 2.4517, + "mean_token_accuracy": 0.4778465088456869, + "num_tokens": 141812273.0, + "step": 979 + }, + { + "entropy": 2.5281982421875, + "epoch": 0.016872842471354906, + "grad_norm": 0.5742073059082031, + "learning_rate": 9.790000000000001e-06, + "loss": 2.5098, + "mean_token_accuracy": 0.47218647226691246, + "num_tokens": 141952803.0, + "step": 980 + }, + { + "entropy": 2.4508056640625, + "epoch": 0.016890059657550166, + "grad_norm": 0.7346315383911133, + "learning_rate": 9.800000000000001e-06, + "loss": 2.3992, + "mean_token_accuracy": 0.4813456032425165, + "num_tokens": 142105077.0, + "step": 981 + }, + { + "entropy": 2.3946533203125, + "epoch": 0.016907276843745427, + "grad_norm": 0.613284170627594, + "learning_rate": 9.810000000000001e-06, + "loss": 2.3475, + "mean_token_accuracy": 0.49409126583486795, + "num_tokens": 142252173.0, + "step": 982 + }, + { + "entropy": 2.473876953125, + "epoch": 0.016924494029940687, + "grad_norm": 0.5658174753189087, + "learning_rate": 9.820000000000001e-06, + "loss": 2.459, + "mean_token_accuracy": 0.47720590187236667, + "num_tokens": 142396038.0, + "step": 983 + }, + { + "entropy": 2.395751953125, + "epoch": 0.016941711216135948, + "grad_norm": 0.5382007956504822, + "learning_rate": 9.83e-06, + "loss": 2.373, + "mean_token_accuracy": 0.4882864346727729, + "num_tokens": 142564149.0, + "step": 984 + }, + { + "entropy": 2.482666015625, + "epoch": 0.016958928402331208, + "grad_norm": 0.5813350677490234, + "learning_rate": 9.84e-06, + "loss": 2.4836, + "mean_token_accuracy": 0.4725424610078335, + "num_tokens": 142701037.0, + "step": 985 + }, + { + "entropy": 2.487060546875, + "epoch": 0.01697614558852647, + "grad_norm": 0.5810776948928833, + "learning_rate": 9.85e-06, + "loss": 2.4691, + "mean_token_accuracy": 0.48016467317938805, + "num_tokens": 142855073.0, + "step": 986 + }, + { + "entropy": 2.460693359375, + "epoch": 0.016993362774721726, + "grad_norm": 0.5684233903884888, + "learning_rate": 9.86e-06, + "loss": 2.4004, + "mean_token_accuracy": 0.47402910608798265, + "num_tokens": 143006703.0, + "step": 987 + }, + { + "entropy": 2.426513671875, + "epoch": 0.017010579960916986, + "grad_norm": 0.5875155925750732, + "learning_rate": 9.87e-06, + "loss": 2.4032, + "mean_token_accuracy": 0.4833544669672847, + "num_tokens": 143158319.0, + "step": 988 + }, + { + "entropy": 2.3924560546875, + "epoch": 0.017027797147112247, + "grad_norm": 0.9405050873756409, + "learning_rate": 9.88e-06, + "loss": 2.3455, + "mean_token_accuracy": 0.4880230622366071, + "num_tokens": 143281917.0, + "step": 989 + }, + { + "entropy": 2.43017578125, + "epoch": 0.017045014333307507, + "grad_norm": 0.593075156211853, + "learning_rate": 9.89e-06, + "loss": 2.3675, + "mean_token_accuracy": 0.48371881106868386, + "num_tokens": 143424058.0, + "step": 990 + }, + { + "entropy": 2.4437255859375, + "epoch": 0.017062231519502768, + "grad_norm": 0.5774204134941101, + "learning_rate": 9.9e-06, + "loss": 2.4004, + "mean_token_accuracy": 0.47859969455748796, + "num_tokens": 143567199.0, + "step": 991 + }, + { + "entropy": 2.468505859375, + "epoch": 0.01707944870569803, + "grad_norm": 0.6035756468772888, + "learning_rate": 9.91e-06, + "loss": 2.3898, + "mean_token_accuracy": 0.48759705713018775, + "num_tokens": 143712988.0, + "step": 992 + }, + { + "entropy": 2.4527587890625, + "epoch": 0.01709666589189329, + "grad_norm": 0.5580205321311951, + "learning_rate": 9.920000000000002e-06, + "loss": 2.4491, + "mean_token_accuracy": 0.47088390588760376, + "num_tokens": 143861050.0, + "step": 993 + }, + { + "entropy": 2.4556884765625, + "epoch": 0.01711388307808855, + "grad_norm": 0.5672869086265564, + "learning_rate": 9.930000000000001e-06, + "loss": 2.4092, + "mean_token_accuracy": 0.48082907358184457, + "num_tokens": 144017646.0, + "step": 994 + }, + { + "entropy": 2.445556640625, + "epoch": 0.017131100264283806, + "grad_norm": 0.6213098764419556, + "learning_rate": 9.940000000000001e-06, + "loss": 2.4035, + "mean_token_accuracy": 0.48345539439469576, + "num_tokens": 144150604.0, + "step": 995 + }, + { + "entropy": 2.489990234375, + "epoch": 0.017148317450479067, + "grad_norm": 0.5691782832145691, + "learning_rate": 9.950000000000001e-06, + "loss": 2.4575, + "mean_token_accuracy": 0.4700373010709882, + "num_tokens": 144292661.0, + "step": 996 + }, + { + "entropy": 2.454833984375, + "epoch": 0.017165534636674327, + "grad_norm": 0.5717594027519226, + "learning_rate": 9.960000000000001e-06, + "loss": 2.418, + "mean_token_accuracy": 0.48314423533156514, + "num_tokens": 144445246.0, + "step": 997 + }, + { + "entropy": 2.4835205078125, + "epoch": 0.017182751822869588, + "grad_norm": 0.5863724946975708, + "learning_rate": 9.970000000000001e-06, + "loss": 2.4627, + "mean_token_accuracy": 0.4698511818423867, + "num_tokens": 144589498.0, + "step": 998 + }, + { + "entropy": 2.441650390625, + "epoch": 0.01719996900906485, + "grad_norm": 0.770044207572937, + "learning_rate": 9.980000000000001e-06, + "loss": 2.3511, + "mean_token_accuracy": 0.4848445672541857, + "num_tokens": 144744013.0, + "step": 999 + }, + { + "entropy": 2.443115234375, + "epoch": 0.01721718619526011, + "grad_norm": 0.6044683456420898, + "learning_rate": 9.990000000000001e-06, + "loss": 2.411, + "mean_token_accuracy": 0.4802071452140808, + "num_tokens": 144878720.0, + "step": 1000 + }, + { + "entropy": 2.4459228515625, + "epoch": 0.01723440338145537, + "grad_norm": 0.5749707221984863, + "learning_rate": 1e-05, + "loss": 2.3881, + "mean_token_accuracy": 0.480140985455364, + "num_tokens": 145028328.0, + "step": 1001 + }, + { + "entropy": 2.55908203125, + "epoch": 0.01725162056765063, + "grad_norm": 0.6056835651397705, + "learning_rate": 9.999999992427464e-06, + "loss": 2.5261, + "mean_token_accuracy": 0.46429841918870807, + "num_tokens": 145177909.0, + "step": 1002 + }, + { + "entropy": 2.475830078125, + "epoch": 0.01726883775384589, + "grad_norm": 0.6159217357635498, + "learning_rate": 9.999999969709854e-06, + "loss": 2.4367, + "mean_token_accuracy": 0.4752111411653459, + "num_tokens": 145331073.0, + "step": 1003 + }, + { + "entropy": 2.4053955078125, + "epoch": 0.017286054940041148, + "grad_norm": 0.5820682644844055, + "learning_rate": 9.999999931847169e-06, + "loss": 2.3818, + "mean_token_accuracy": 0.4844290711916983, + "num_tokens": 145480948.0, + "step": 1004 + }, + { + "entropy": 2.432861328125, + "epoch": 0.017303272126236408, + "grad_norm": 0.5615769624710083, + "learning_rate": 9.999999878839412e-06, + "loss": 2.3797, + "mean_token_accuracy": 0.4817391145043075, + "num_tokens": 145615367.0, + "step": 1005 + }, + { + "entropy": 2.3687744140625, + "epoch": 0.01732048931243167, + "grad_norm": 0.5944392085075378, + "learning_rate": 9.999999810686582e-06, + "loss": 2.3464, + "mean_token_accuracy": 0.4950461401604116, + "num_tokens": 145751257.0, + "step": 1006 + }, + { + "entropy": 2.491455078125, + "epoch": 0.01733770649862693, + "grad_norm": 0.5785252451896667, + "learning_rate": 9.99999972738868e-06, + "loss": 2.4419, + "mean_token_accuracy": 0.4759130119346082, + "num_tokens": 145881989.0, + "step": 1007 + }, + { + "entropy": 2.45263671875, + "epoch": 0.01735492368482219, + "grad_norm": 0.573020339012146, + "learning_rate": 9.999999628945702e-06, + "loss": 2.4126, + "mean_token_accuracy": 0.47999184019863605, + "num_tokens": 146033406.0, + "step": 1008 + }, + { + "entropy": 2.474365234375, + "epoch": 0.01737214087101745, + "grad_norm": 0.5616711378097534, + "learning_rate": 9.999999515357654e-06, + "loss": 2.4179, + "mean_token_accuracy": 0.47331848414614797, + "num_tokens": 146185163.0, + "step": 1009 + }, + { + "entropy": 2.3839111328125, + "epoch": 0.01738935805721271, + "grad_norm": 0.5724861025810242, + "learning_rate": 9.999999386624534e-06, + "loss": 2.338, + "mean_token_accuracy": 0.4900155235081911, + "num_tokens": 146335979.0, + "step": 1010 + }, + { + "entropy": 2.397216796875, + "epoch": 0.01740657524340797, + "grad_norm": 0.566051185131073, + "learning_rate": 9.99999924274634e-06, + "loss": 2.3623, + "mean_token_accuracy": 0.49173881486058235, + "num_tokens": 146492297.0, + "step": 1011 + }, + { + "entropy": 2.425537109375, + "epoch": 0.01742379242960323, + "grad_norm": 0.5698406100273132, + "learning_rate": 9.999999083723077e-06, + "loss": 2.3803, + "mean_token_accuracy": 0.4822213868610561, + "num_tokens": 146638961.0, + "step": 1012 + }, + { + "entropy": 2.457763671875, + "epoch": 0.01744100961579849, + "grad_norm": 0.5576664805412292, + "learning_rate": 9.999998909554743e-06, + "loss": 2.4175, + "mean_token_accuracy": 0.47678716061636806, + "num_tokens": 146785199.0, + "step": 1013 + }, + { + "entropy": 2.45947265625, + "epoch": 0.01745822680199375, + "grad_norm": 0.5309212803840637, + "learning_rate": 9.99999872024134e-06, + "loss": 2.4368, + "mean_token_accuracy": 0.4784322748892009, + "num_tokens": 146940920.0, + "step": 1014 + }, + { + "entropy": 2.443603515625, + "epoch": 0.01747544398818901, + "grad_norm": 0.6436007022857666, + "learning_rate": 9.999998515782865e-06, + "loss": 2.3972, + "mean_token_accuracy": 0.4809753801673651, + "num_tokens": 147096034.0, + "step": 1015 + }, + { + "entropy": 2.4503173828125, + "epoch": 0.01749266117438427, + "grad_norm": 0.5794939398765564, + "learning_rate": 9.99999829617932e-06, + "loss": 2.4099, + "mean_token_accuracy": 0.47518764482811093, + "num_tokens": 147241716.0, + "step": 1016 + }, + { + "entropy": 2.4049072265625, + "epoch": 0.01750987836057953, + "grad_norm": 0.6116197109222412, + "learning_rate": 9.999998061430709e-06, + "loss": 2.3468, + "mean_token_accuracy": 0.4926842013373971, + "num_tokens": 147378764.0, + "step": 1017 + }, + { + "entropy": 2.47314453125, + "epoch": 0.01752709554677479, + "grad_norm": 0.5858633518218994, + "learning_rate": 9.99999781153703e-06, + "loss": 2.3979, + "mean_token_accuracy": 0.47995937149971724, + "num_tokens": 147517319.0, + "step": 1018 + }, + { + "entropy": 2.5513916015625, + "epoch": 0.017544312732970052, + "grad_norm": 0.6136896014213562, + "learning_rate": 9.999997546498284e-06, + "loss": 2.5054, + "mean_token_accuracy": 0.46979639306664467, + "num_tokens": 147657307.0, + "step": 1019 + }, + { + "entropy": 2.4195556640625, + "epoch": 0.01756152991916531, + "grad_norm": 0.593510091304779, + "learning_rate": 9.999997266314471e-06, + "loss": 2.3881, + "mean_token_accuracy": 0.48458396503701806, + "num_tokens": 147806252.0, + "step": 1020 + }, + { + "entropy": 2.41064453125, + "epoch": 0.01757874710536057, + "grad_norm": 0.675787091255188, + "learning_rate": 9.999996970985592e-06, + "loss": 2.3654, + "mean_token_accuracy": 0.48551547806710005, + "num_tokens": 147979427.0, + "step": 1021 + }, + { + "entropy": 2.46533203125, + "epoch": 0.01759596429155583, + "grad_norm": 0.6314930319786072, + "learning_rate": 9.99999666051165e-06, + "loss": 2.4691, + "mean_token_accuracy": 0.4804490995593369, + "num_tokens": 148114782.0, + "step": 1022 + }, + { + "entropy": 2.4417724609375, + "epoch": 0.01761318147775109, + "grad_norm": 0.579401969909668, + "learning_rate": 9.999996334892646e-06, + "loss": 2.4204, + "mean_token_accuracy": 0.482807548251003, + "num_tokens": 148259581.0, + "step": 1023 + }, + { + "entropy": 2.452880859375, + "epoch": 0.01763039866394635, + "grad_norm": 0.5902701616287231, + "learning_rate": 9.999995994128578e-06, + "loss": 2.4434, + "mean_token_accuracy": 0.47913903277367353, + "num_tokens": 148400114.0, + "step": 1024 + }, + { + "entropy": 2.482666015625, + "epoch": 0.017647615850141612, + "grad_norm": 0.5950527787208557, + "learning_rate": 9.999995638219448e-06, + "loss": 2.4566, + "mean_token_accuracy": 0.4763036798685789, + "num_tokens": 148540440.0, + "step": 1025 + }, + { + "entropy": 2.446533203125, + "epoch": 0.017664833036336872, + "grad_norm": 0.6330975294113159, + "learning_rate": 9.999995267165256e-06, + "loss": 2.3904, + "mean_token_accuracy": 0.4806701922789216, + "num_tokens": 148692789.0, + "step": 1026 + }, + { + "entropy": 2.427978515625, + "epoch": 0.017682050222532133, + "grad_norm": 0.617744505405426, + "learning_rate": 9.999994880966008e-06, + "loss": 2.369, + "mean_token_accuracy": 0.4830102613195777, + "num_tokens": 148836617.0, + "step": 1027 + }, + { + "entropy": 2.42822265625, + "epoch": 0.017699267408727393, + "grad_norm": 0.6210005879402161, + "learning_rate": 9.999994479621703e-06, + "loss": 2.3569, + "mean_token_accuracy": 0.49285168340429664, + "num_tokens": 148982058.0, + "step": 1028 + }, + { + "entropy": 2.52392578125, + "epoch": 0.01771648459492265, + "grad_norm": 0.5944798588752747, + "learning_rate": 9.999994063132336e-06, + "loss": 2.4881, + "mean_token_accuracy": 0.46862922469154, + "num_tokens": 149117257.0, + "step": 1029 + }, + { + "entropy": 2.4248046875, + "epoch": 0.01773370178111791, + "grad_norm": 0.603406548500061, + "learning_rate": 9.999993631497918e-06, + "loss": 2.3968, + "mean_token_accuracy": 0.4805483613163233, + "num_tokens": 149253194.0, + "step": 1030 + }, + { + "entropy": 2.48681640625, + "epoch": 0.01775091896731317, + "grad_norm": 0.5487124919891357, + "learning_rate": 9.999993184718445e-06, + "loss": 2.4613, + "mean_token_accuracy": 0.4695818484760821, + "num_tokens": 149418815.0, + "step": 1031 + }, + { + "entropy": 2.3785400390625, + "epoch": 0.017768136153508432, + "grad_norm": 0.5934000015258789, + "learning_rate": 9.999992722793916e-06, + "loss": 2.3197, + "mean_token_accuracy": 0.4929351657629013, + "num_tokens": 149565302.0, + "step": 1032 + }, + { + "entropy": 2.41943359375, + "epoch": 0.017785353339703693, + "grad_norm": 0.5832322835922241, + "learning_rate": 9.999992245724338e-06, + "loss": 2.3851, + "mean_token_accuracy": 0.4843961731530726, + "num_tokens": 149701523.0, + "step": 1033 + }, + { + "entropy": 2.477294921875, + "epoch": 0.017802570525898953, + "grad_norm": 0.578316330909729, + "learning_rate": 9.99999175350971e-06, + "loss": 2.4705, + "mean_token_accuracy": 0.4734199936501682, + "num_tokens": 149841173.0, + "step": 1034 + }, + { + "entropy": 2.496337890625, + "epoch": 0.017819787712094214, + "grad_norm": 1.1719509363174438, + "learning_rate": 9.999991246150034e-06, + "loss": 2.469, + "mean_token_accuracy": 0.4775820942595601, + "num_tokens": 149977715.0, + "step": 1035 + }, + { + "entropy": 2.458251953125, + "epoch": 0.017837004898289474, + "grad_norm": 0.5495033860206604, + "learning_rate": 9.999990723645309e-06, + "loss": 2.4407, + "mean_token_accuracy": 0.47903554420918226, + "num_tokens": 150124141.0, + "step": 1036 + }, + { + "entropy": 2.453857421875, + "epoch": 0.01785422208448473, + "grad_norm": 0.6319074034690857, + "learning_rate": 9.99999018599554e-06, + "loss": 2.4007, + "mean_token_accuracy": 0.4762126957066357, + "num_tokens": 150278398.0, + "step": 1037 + }, + { + "entropy": 2.435791015625, + "epoch": 0.01787143927067999, + "grad_norm": 0.6012912392616272, + "learning_rate": 9.999989633200726e-06, + "loss": 2.3693, + "mean_token_accuracy": 0.490336746443063, + "num_tokens": 150415689.0, + "step": 1038 + }, + { + "entropy": 2.431640625, + "epoch": 0.017888656456875252, + "grad_norm": 0.5937034487724304, + "learning_rate": 9.99998906526087e-06, + "loss": 2.381, + "mean_token_accuracy": 0.48564466880634427, + "num_tokens": 150565533.0, + "step": 1039 + }, + { + "entropy": 2.541259765625, + "epoch": 0.017905873643070513, + "grad_norm": 0.5868061780929565, + "learning_rate": 9.999988482175975e-06, + "loss": 2.485, + "mean_token_accuracy": 0.4694917304441333, + "num_tokens": 150703215.0, + "step": 1040 + }, + { + "entropy": 2.51025390625, + "epoch": 0.017923090829265773, + "grad_norm": 0.5923919677734375, + "learning_rate": 9.99998788394604e-06, + "loss": 2.5, + "mean_token_accuracy": 0.47092792578041553, + "num_tokens": 150846937.0, + "step": 1041 + }, + { + "entropy": 2.44384765625, + "epoch": 0.017940308015461034, + "grad_norm": 0.6263927817344666, + "learning_rate": 9.999987270571067e-06, + "loss": 2.4557, + "mean_token_accuracy": 0.4760148832574487, + "num_tokens": 150988381.0, + "step": 1042 + }, + { + "entropy": 2.388427734375, + "epoch": 0.017957525201656294, + "grad_norm": 0.5355521440505981, + "learning_rate": 9.999986642051061e-06, + "loss": 2.3634, + "mean_token_accuracy": 0.4859662549570203, + "num_tokens": 151137551.0, + "step": 1043 + }, + { + "entropy": 2.41552734375, + "epoch": 0.017974742387851555, + "grad_norm": 0.615512490272522, + "learning_rate": 9.999985998386022e-06, + "loss": 2.4254, + "mean_token_accuracy": 0.4817372509278357, + "num_tokens": 151280397.0, + "step": 1044 + }, + { + "entropy": 2.496826171875, + "epoch": 0.017991959574046812, + "grad_norm": 0.543178379535675, + "learning_rate": 9.99998533957595e-06, + "loss": 2.4526, + "mean_token_accuracy": 0.4679850875400007, + "num_tokens": 151423217.0, + "step": 1045 + }, + { + "entropy": 2.487548828125, + "epoch": 0.018009176760242072, + "grad_norm": 0.6349004507064819, + "learning_rate": 9.999984665620852e-06, + "loss": 2.4205, + "mean_token_accuracy": 0.4785635150037706, + "num_tokens": 151566681.0, + "step": 1046 + }, + { + "entropy": 2.467041015625, + "epoch": 0.018026393946437333, + "grad_norm": 0.5757213234901428, + "learning_rate": 9.999983976520725e-06, + "loss": 2.3992, + "mean_token_accuracy": 0.48239869019016623, + "num_tokens": 151722733.0, + "step": 1047 + }, + { + "entropy": 2.3770751953125, + "epoch": 0.018043611132632593, + "grad_norm": 0.5888438820838928, + "learning_rate": 9.999983272275572e-06, + "loss": 2.3367, + "mean_token_accuracy": 0.488128668628633, + "num_tokens": 151878057.0, + "step": 1048 + }, + { + "entropy": 2.40380859375, + "epoch": 0.018060828318827854, + "grad_norm": 0.6084235906600952, + "learning_rate": 9.999982552885396e-06, + "loss": 2.4194, + "mean_token_accuracy": 0.48174417577683926, + "num_tokens": 152013813.0, + "step": 1049 + }, + { + "entropy": 2.525390625, + "epoch": 0.018078045505023115, + "grad_norm": 0.6073904633522034, + "learning_rate": 9.999981818350201e-06, + "loss": 2.4913, + "mean_token_accuracy": 0.470255627296865, + "num_tokens": 152145871.0, + "step": 1050 + }, + { + "entropy": 2.4578857421875, + "epoch": 0.018095262691218375, + "grad_norm": 0.5444018840789795, + "learning_rate": 9.999981068669986e-06, + "loss": 2.4527, + "mean_token_accuracy": 0.47973277885466814, + "num_tokens": 152304784.0, + "step": 1051 + }, + { + "entropy": 2.47509765625, + "epoch": 0.018112479877413636, + "grad_norm": 0.5837482213973999, + "learning_rate": 9.999980303844756e-06, + "loss": 2.4489, + "mean_token_accuracy": 0.47514383727684617, + "num_tokens": 152453018.0, + "step": 1052 + }, + { + "entropy": 2.4283447265625, + "epoch": 0.018129697063608896, + "grad_norm": 2.452045202255249, + "learning_rate": 9.999979523874513e-06, + "loss": 2.3956, + "mean_token_accuracy": 0.48897374235093594, + "num_tokens": 152612683.0, + "step": 1053 + }, + { + "entropy": 2.404541015625, + "epoch": 0.018146914249804153, + "grad_norm": 0.5417919754981995, + "learning_rate": 9.999978728759256e-06, + "loss": 2.3388, + "mean_token_accuracy": 0.4870688715018332, + "num_tokens": 152762264.0, + "step": 1054 + }, + { + "entropy": 2.4012451171875, + "epoch": 0.018164131435999414, + "grad_norm": 0.5944674611091614, + "learning_rate": 9.999977918498992e-06, + "loss": 2.3714, + "mean_token_accuracy": 0.4950359701178968, + "num_tokens": 152906647.0, + "step": 1055 + }, + { + "entropy": 2.453857421875, + "epoch": 0.018181348622194674, + "grad_norm": 0.5601978302001953, + "learning_rate": 9.99997709309372e-06, + "loss": 2.3945, + "mean_token_accuracy": 0.4819331760518253, + "num_tokens": 153047278.0, + "step": 1056 + }, + { + "entropy": 2.5194091796875, + "epoch": 0.018198565808389935, + "grad_norm": 0.6144431233406067, + "learning_rate": 9.999976252543444e-06, + "loss": 2.4835, + "mean_token_accuracy": 0.4768884154036641, + "num_tokens": 153186932.0, + "step": 1057 + }, + { + "entropy": 2.403564453125, + "epoch": 0.018215782994585195, + "grad_norm": 0.9560090899467468, + "learning_rate": 9.999975396848167e-06, + "loss": 2.3291, + "mean_token_accuracy": 0.4874680512584746, + "num_tokens": 153341046.0, + "step": 1058 + }, + { + "entropy": 2.431884765625, + "epoch": 0.018233000180780456, + "grad_norm": 0.6547354459762573, + "learning_rate": 9.99997452600789e-06, + "loss": 2.4077, + "mean_token_accuracy": 0.48102515283972025, + "num_tokens": 153486117.0, + "step": 1059 + }, + { + "entropy": 2.381103515625, + "epoch": 0.018250217366975716, + "grad_norm": 0.5701848268508911, + "learning_rate": 9.999973640022618e-06, + "loss": 2.3703, + "mean_token_accuracy": 0.4909746255725622, + "num_tokens": 153635642.0, + "step": 1060 + }, + { + "entropy": 2.427978515625, + "epoch": 0.018267434553170977, + "grad_norm": 0.5640613436698914, + "learning_rate": 9.99997273889235e-06, + "loss": 2.3833, + "mean_token_accuracy": 0.483435302041471, + "num_tokens": 153778267.0, + "step": 1061 + }, + { + "entropy": 2.54736328125, + "epoch": 0.018284651739366234, + "grad_norm": 0.5851112008094788, + "learning_rate": 9.999971822617094e-06, + "loss": 2.5209, + "mean_token_accuracy": 0.4660449046641588, + "num_tokens": 153920352.0, + "step": 1062 + }, + { + "entropy": 2.47998046875, + "epoch": 0.018301868925561494, + "grad_norm": 0.5928635001182556, + "learning_rate": 9.999970891196847e-06, + "loss": 2.4422, + "mean_token_accuracy": 0.47202530410140753, + "num_tokens": 154064892.0, + "step": 1063 + }, + { + "entropy": 2.507080078125, + "epoch": 0.018319086111756755, + "grad_norm": 0.6197124719619751, + "learning_rate": 9.999969944631615e-06, + "loss": 2.4772, + "mean_token_accuracy": 0.4731879197061062, + "num_tokens": 154215041.0, + "step": 1064 + }, + { + "entropy": 2.46923828125, + "epoch": 0.018336303297952015, + "grad_norm": 0.5558657646179199, + "learning_rate": 9.999968982921403e-06, + "loss": 2.4352, + "mean_token_accuracy": 0.47735925391316414, + "num_tokens": 154367730.0, + "step": 1065 + }, + { + "entropy": 2.430419921875, + "epoch": 0.018353520484147276, + "grad_norm": 0.6059717535972595, + "learning_rate": 9.99996800606621e-06, + "loss": 2.3824, + "mean_token_accuracy": 0.4844053997658193, + "num_tokens": 154505107.0, + "step": 1066 + }, + { + "entropy": 2.4879150390625, + "epoch": 0.018370737670342537, + "grad_norm": 0.5660464763641357, + "learning_rate": 9.999967014066038e-06, + "loss": 2.4352, + "mean_token_accuracy": 0.476345878560096, + "num_tokens": 154643818.0, + "step": 1067 + }, + { + "entropy": 2.4345703125, + "epoch": 0.018387954856537797, + "grad_norm": 0.701174795627594, + "learning_rate": 9.999966006920896e-06, + "loss": 2.416, + "mean_token_accuracy": 0.48350341338664293, + "num_tokens": 154785623.0, + "step": 1068 + }, + { + "entropy": 2.48486328125, + "epoch": 0.018405172042733058, + "grad_norm": 0.6571546196937561, + "learning_rate": 9.99996498463078e-06, + "loss": 2.4509, + "mean_token_accuracy": 0.47686226665973663, + "num_tokens": 154947202.0, + "step": 1069 + }, + { + "entropy": 2.4830322265625, + "epoch": 0.018422389228928315, + "grad_norm": 0.6001805067062378, + "learning_rate": 9.999963947195699e-06, + "loss": 2.4316, + "mean_token_accuracy": 0.47862283093854785, + "num_tokens": 155097645.0, + "step": 1070 + }, + { + "entropy": 2.4422607421875, + "epoch": 0.018439606415123575, + "grad_norm": 0.6199787259101868, + "learning_rate": 9.999962894615653e-06, + "loss": 2.4132, + "mean_token_accuracy": 0.48402009485289454, + "num_tokens": 155229538.0, + "step": 1071 + }, + { + "entropy": 2.408203125, + "epoch": 0.018456823601318836, + "grad_norm": 0.5521693825721741, + "learning_rate": 9.999961826890645e-06, + "loss": 2.3365, + "mean_token_accuracy": 0.4847987382672727, + "num_tokens": 155376732.0, + "step": 1072 + }, + { + "entropy": 2.4617919921875, + "epoch": 0.018474040787514096, + "grad_norm": 0.6697084903717041, + "learning_rate": 9.999960744020681e-06, + "loss": 2.4263, + "mean_token_accuracy": 0.48609263403341174, + "num_tokens": 155514355.0, + "step": 1073 + }, + { + "entropy": 2.443603515625, + "epoch": 0.018491257973709357, + "grad_norm": 0.6446971297264099, + "learning_rate": 9.999959646005761e-06, + "loss": 2.448, + "mean_token_accuracy": 0.4798265271820128, + "num_tokens": 155652470.0, + "step": 1074 + }, + { + "entropy": 2.47412109375, + "epoch": 0.018508475159904617, + "grad_norm": 0.597260057926178, + "learning_rate": 9.999958532845889e-06, + "loss": 2.4575, + "mean_token_accuracy": 0.4776256578043103, + "num_tokens": 155809404.0, + "step": 1075 + }, + { + "entropy": 2.388916015625, + "epoch": 0.018525692346099878, + "grad_norm": 0.5906361937522888, + "learning_rate": 9.99995740454107e-06, + "loss": 2.3603, + "mean_token_accuracy": 0.4867819882929325, + "num_tokens": 155949820.0, + "step": 1076 + }, + { + "entropy": 2.4091796875, + "epoch": 0.01854290953229514, + "grad_norm": 0.5962374806404114, + "learning_rate": 9.999956261091306e-06, + "loss": 2.3754, + "mean_token_accuracy": 0.486242544837296, + "num_tokens": 156083865.0, + "step": 1077 + }, + { + "entropy": 2.466064453125, + "epoch": 0.0185601267184904, + "grad_norm": 0.5743368864059448, + "learning_rate": 9.999955102496602e-06, + "loss": 2.4522, + "mean_token_accuracy": 0.4817331531085074, + "num_tokens": 156230750.0, + "step": 1078 + }, + { + "entropy": 2.4454345703125, + "epoch": 0.018577343904685656, + "grad_norm": 0.5727860927581787, + "learning_rate": 9.999953928756958e-06, + "loss": 2.424, + "mean_token_accuracy": 0.47826589411124587, + "num_tokens": 156377631.0, + "step": 1079 + }, + { + "entropy": 2.433837890625, + "epoch": 0.018594561090880916, + "grad_norm": 0.6403164863586426, + "learning_rate": 9.999952739872383e-06, + "loss": 2.379, + "mean_token_accuracy": 0.4838769477792084, + "num_tokens": 156535199.0, + "step": 1080 + }, + { + "entropy": 2.504150390625, + "epoch": 0.018611778277076177, + "grad_norm": 0.5877081751823425, + "learning_rate": 9.999951535842875e-06, + "loss": 2.4479, + "mean_token_accuracy": 0.4734333767555654, + "num_tokens": 156667471.0, + "step": 1081 + }, + { + "entropy": 2.421630859375, + "epoch": 0.018628995463271437, + "grad_norm": 0.5537156462669373, + "learning_rate": 9.999950316668443e-06, + "loss": 2.3707, + "mean_token_accuracy": 0.48230436397716403, + "num_tokens": 156819504.0, + "step": 1082 + }, + { + "entropy": 2.5322265625, + "epoch": 0.018646212649466698, + "grad_norm": 0.603832483291626, + "learning_rate": 9.999949082349085e-06, + "loss": 2.4499, + "mean_token_accuracy": 0.4676863308995962, + "num_tokens": 156944926.0, + "step": 1083 + }, + { + "entropy": 2.473388671875, + "epoch": 0.01866342983566196, + "grad_norm": 0.5896692872047424, + "learning_rate": 9.99994783288481e-06, + "loss": 2.4796, + "mean_token_accuracy": 0.4778019469231367, + "num_tokens": 157090650.0, + "step": 1084 + }, + { + "entropy": 2.4200439453125, + "epoch": 0.01868064702185722, + "grad_norm": 0.5518651008605957, + "learning_rate": 9.999946568275619e-06, + "loss": 2.4286, + "mean_token_accuracy": 0.47666720813140273, + "num_tokens": 157252639.0, + "step": 1085 + }, + { + "entropy": 2.4140625, + "epoch": 0.01869786420805248, + "grad_norm": 0.5450928211212158, + "learning_rate": 9.999945288521516e-06, + "loss": 2.3944, + "mean_token_accuracy": 0.48317190259695053, + "num_tokens": 157409995.0, + "step": 1086 + }, + { + "entropy": 2.4267578125, + "epoch": 0.018715081394247737, + "grad_norm": 0.5875595211982727, + "learning_rate": 9.999943993622504e-06, + "loss": 2.4209, + "mean_token_accuracy": 0.4857109053991735, + "num_tokens": 157551609.0, + "step": 1087 + }, + { + "entropy": 2.43603515625, + "epoch": 0.018732298580442997, + "grad_norm": 0.5621595978736877, + "learning_rate": 9.99994268357859e-06, + "loss": 2.4107, + "mean_token_accuracy": 0.4831027192994952, + "num_tokens": 157699503.0, + "step": 1088 + }, + { + "entropy": 2.497802734375, + "epoch": 0.018749515766638258, + "grad_norm": 0.6029961109161377, + "learning_rate": 9.999941358389775e-06, + "loss": 2.4699, + "mean_token_accuracy": 0.4773255423642695, + "num_tokens": 157839741.0, + "step": 1089 + }, + { + "entropy": 2.4298095703125, + "epoch": 0.018766732952833518, + "grad_norm": 0.5884754061698914, + "learning_rate": 9.999940018056062e-06, + "loss": 2.4153, + "mean_token_accuracy": 0.4817211125046015, + "num_tokens": 157988371.0, + "step": 1090 + }, + { + "entropy": 2.417724609375, + "epoch": 0.01878395013902878, + "grad_norm": 0.5795974135398865, + "learning_rate": 9.99993866257746e-06, + "loss": 2.3917, + "mean_token_accuracy": 0.4811216425150633, + "num_tokens": 158133019.0, + "step": 1091 + }, + { + "entropy": 2.404541015625, + "epoch": 0.01880116732522404, + "grad_norm": 0.5938680768013, + "learning_rate": 9.99993729195397e-06, + "loss": 2.3589, + "mean_token_accuracy": 0.4899957957677543, + "num_tokens": 158276428.0, + "step": 1092 + }, + { + "entropy": 2.453857421875, + "epoch": 0.0188183845114193, + "grad_norm": 0.6011793613433838, + "learning_rate": 9.999935906185596e-06, + "loss": 2.3976, + "mean_token_accuracy": 0.48242352809756994, + "num_tokens": 158418256.0, + "step": 1093 + }, + { + "entropy": 2.4844970703125, + "epoch": 0.01883560169761456, + "grad_norm": 0.6211831569671631, + "learning_rate": 9.999934505272341e-06, + "loss": 2.4357, + "mean_token_accuracy": 0.47488651471212506, + "num_tokens": 158538103.0, + "step": 1094 + }, + { + "entropy": 2.4769287109375, + "epoch": 0.018852818883809817, + "grad_norm": 0.5956041812896729, + "learning_rate": 9.999933089214214e-06, + "loss": 2.4364, + "mean_token_accuracy": 0.4817122952081263, + "num_tokens": 158684837.0, + "step": 1095 + }, + { + "entropy": 2.4451904296875, + "epoch": 0.018870036070005078, + "grad_norm": 0.5776230692863464, + "learning_rate": 9.999931658011213e-06, + "loss": 2.4465, + "mean_token_accuracy": 0.48071191366761923, + "num_tokens": 158841114.0, + "step": 1096 + }, + { + "entropy": 2.4998779296875, + "epoch": 0.01888725325620034, + "grad_norm": 0.5656998753547668, + "learning_rate": 9.999930211663346e-06, + "loss": 2.4445, + "mean_token_accuracy": 0.47156142350286245, + "num_tokens": 158982072.0, + "step": 1097 + }, + { + "entropy": 2.410400390625, + "epoch": 0.0189044704423956, + "grad_norm": 0.5946214199066162, + "learning_rate": 9.999928750170619e-06, + "loss": 2.3455, + "mean_token_accuracy": 0.4895977326668799, + "num_tokens": 159115070.0, + "step": 1098 + }, + { + "entropy": 2.4327392578125, + "epoch": 0.01892168762859086, + "grad_norm": 0.594409167766571, + "learning_rate": 9.999927273533032e-06, + "loss": 2.3842, + "mean_token_accuracy": 0.48081815196201205, + "num_tokens": 159253866.0, + "step": 1099 + }, + { + "entropy": 2.450927734375, + "epoch": 0.01893890481478612, + "grad_norm": 0.5916756987571716, + "learning_rate": 9.999925781750594e-06, + "loss": 2.3838, + "mean_token_accuracy": 0.4844925357028842, + "num_tokens": 159415802.0, + "step": 1100 + }, + { + "entropy": 2.5107421875, + "epoch": 0.01895612200098138, + "grad_norm": 0.6757313013076782, + "learning_rate": 9.999924274823305e-06, + "loss": 2.4928, + "mean_token_accuracy": 0.4722679229453206, + "num_tokens": 159567360.0, + "step": 1101 + }, + { + "entropy": 2.44482421875, + "epoch": 0.01897333918717664, + "grad_norm": 0.5762059092521667, + "learning_rate": 9.999922752751173e-06, + "loss": 2.4003, + "mean_token_accuracy": 0.4811086105182767, + "num_tokens": 159709631.0, + "step": 1102 + }, + { + "entropy": 2.425537109375, + "epoch": 0.0189905563733719, + "grad_norm": 0.5835311412811279, + "learning_rate": 9.999921215534201e-06, + "loss": 2.377, + "mean_token_accuracy": 0.4846574803814292, + "num_tokens": 159847869.0, + "step": 1103 + }, + { + "entropy": 2.3896484375, + "epoch": 0.01900777355956716, + "grad_norm": 0.610913872718811, + "learning_rate": 9.999919663172394e-06, + "loss": 2.3431, + "mean_token_accuracy": 0.4912629318423569, + "num_tokens": 159976268.0, + "step": 1104 + }, + { + "entropy": 2.4576416015625, + "epoch": 0.01902499074576242, + "grad_norm": 0.6417136192321777, + "learning_rate": 9.999918095665758e-06, + "loss": 2.4618, + "mean_token_accuracy": 0.4749236977659166, + "num_tokens": 160114362.0, + "step": 1105 + }, + { + "entropy": 2.4779052734375, + "epoch": 0.01904220793195768, + "grad_norm": 0.5750072598457336, + "learning_rate": 9.999916513014294e-06, + "loss": 2.4414, + "mean_token_accuracy": 0.4833888350985944, + "num_tokens": 160272635.0, + "step": 1106 + }, + { + "entropy": 2.4361572265625, + "epoch": 0.01905942511815294, + "grad_norm": 0.6341116428375244, + "learning_rate": 9.999914915218012e-06, + "loss": 2.3878, + "mean_token_accuracy": 0.47810999071225524, + "num_tokens": 160430906.0, + "step": 1107 + }, + { + "entropy": 2.3680419921875, + "epoch": 0.0190766423043482, + "grad_norm": 0.575920581817627, + "learning_rate": 9.999913302276912e-06, + "loss": 2.3544, + "mean_token_accuracy": 0.4951640688814223, + "num_tokens": 160569906.0, + "step": 1108 + }, + { + "entropy": 2.431884765625, + "epoch": 0.01909385949054346, + "grad_norm": 0.5743590593338013, + "learning_rate": 9.999911674191001e-06, + "loss": 2.382, + "mean_token_accuracy": 0.48963714949786663, + "num_tokens": 160710393.0, + "step": 1109 + }, + { + "entropy": 2.5234375, + "epoch": 0.019111076676738722, + "grad_norm": 0.6044170260429382, + "learning_rate": 9.999910030960285e-06, + "loss": 2.467, + "mean_token_accuracy": 0.4708155104890466, + "num_tokens": 160837634.0, + "step": 1110 + }, + { + "entropy": 2.498046875, + "epoch": 0.019128293862933982, + "grad_norm": 0.641613245010376, + "learning_rate": 9.99990837258477e-06, + "loss": 2.4, + "mean_token_accuracy": 0.474128358066082, + "num_tokens": 160981820.0, + "step": 1111 + }, + { + "entropy": 2.4156494140625, + "epoch": 0.01914551104912924, + "grad_norm": 0.7928540110588074, + "learning_rate": 9.999906699064455e-06, + "loss": 2.369, + "mean_token_accuracy": 0.4871662328951061, + "num_tokens": 161128688.0, + "step": 1112 + }, + { + "entropy": 2.45849609375, + "epoch": 0.0191627282353245, + "grad_norm": 0.6063829064369202, + "learning_rate": 9.999905010399351e-06, + "loss": 2.3988, + "mean_token_accuracy": 0.4813544964417815, + "num_tokens": 161265040.0, + "step": 1113 + }, + { + "entropy": 2.40625, + "epoch": 0.01917994542151976, + "grad_norm": 0.5939781069755554, + "learning_rate": 9.999903306589463e-06, + "loss": 2.3918, + "mean_token_accuracy": 0.48077729996293783, + "num_tokens": 161418528.0, + "step": 1114 + }, + { + "entropy": 2.444580078125, + "epoch": 0.01919716260771502, + "grad_norm": 0.6092802882194519, + "learning_rate": 9.99990158763479e-06, + "loss": 2.4441, + "mean_token_accuracy": 0.4797166772186756, + "num_tokens": 161557877.0, + "step": 1115 + }, + { + "entropy": 2.457763671875, + "epoch": 0.01921437979391028, + "grad_norm": 0.6434066295623779, + "learning_rate": 9.999899853535344e-06, + "loss": 2.4275, + "mean_token_accuracy": 0.48657548474147916, + "num_tokens": 161698007.0, + "step": 1116 + }, + { + "entropy": 2.4796142578125, + "epoch": 0.019231596980105542, + "grad_norm": 0.5764423608779907, + "learning_rate": 9.999898104291128e-06, + "loss": 2.4679, + "mean_token_accuracy": 0.4779613367281854, + "num_tokens": 161848916.0, + "step": 1117 + }, + { + "entropy": 2.4736328125, + "epoch": 0.019248814166300802, + "grad_norm": 0.8059520721435547, + "learning_rate": 9.999896339902148e-06, + "loss": 2.4523, + "mean_token_accuracy": 0.4732037871144712, + "num_tokens": 162016077.0, + "step": 1118 + }, + { + "entropy": 2.46240234375, + "epoch": 0.019266031352496063, + "grad_norm": 0.5723803639411926, + "learning_rate": 9.999894560368406e-06, + "loss": 2.4095, + "mean_token_accuracy": 0.48319359589368105, + "num_tokens": 162166218.0, + "step": 1119 + }, + { + "entropy": 2.4815673828125, + "epoch": 0.01928324853869132, + "grad_norm": 0.5882191061973572, + "learning_rate": 9.999892765689912e-06, + "loss": 2.4345, + "mean_token_accuracy": 0.47986175399273634, + "num_tokens": 162305491.0, + "step": 1120 + }, + { + "entropy": 2.4931640625, + "epoch": 0.01930046572488658, + "grad_norm": 0.6278371214866638, + "learning_rate": 9.999890955866667e-06, + "loss": 2.3912, + "mean_token_accuracy": 0.47920875577256083, + "num_tokens": 162426796.0, + "step": 1121 + }, + { + "entropy": 2.485595703125, + "epoch": 0.01931768291108184, + "grad_norm": 0.6226006746292114, + "learning_rate": 9.999889130898682e-06, + "loss": 2.4591, + "mean_token_accuracy": 0.47889810614287853, + "num_tokens": 162575000.0, + "step": 1122 + }, + { + "entropy": 2.508544921875, + "epoch": 0.0193349000972771, + "grad_norm": 0.675370454788208, + "learning_rate": 9.999887290785957e-06, + "loss": 2.4308, + "mean_token_accuracy": 0.47410700796172023, + "num_tokens": 162713512.0, + "step": 1123 + }, + { + "entropy": 2.420166015625, + "epoch": 0.019352117283472362, + "grad_norm": 0.5244258642196655, + "learning_rate": 9.9998854355285e-06, + "loss": 2.3653, + "mean_token_accuracy": 0.48809623531997204, + "num_tokens": 162866946.0, + "step": 1124 + }, + { + "entropy": 2.420654296875, + "epoch": 0.019369334469667623, + "grad_norm": 0.5886077284812927, + "learning_rate": 9.999883565126316e-06, + "loss": 2.381, + "mean_token_accuracy": 0.4897596640512347, + "num_tokens": 163012916.0, + "step": 1125 + }, + { + "entropy": 2.4671630859375, + "epoch": 0.019386551655862883, + "grad_norm": 0.6227119565010071, + "learning_rate": 9.999881679579411e-06, + "loss": 2.4095, + "mean_token_accuracy": 0.4798518419265747, + "num_tokens": 163152736.0, + "step": 1126 + }, + { + "entropy": 2.4190673828125, + "epoch": 0.019403768842058144, + "grad_norm": 0.6482838988304138, + "learning_rate": 9.999879778887793e-06, + "loss": 2.3961, + "mean_token_accuracy": 0.48625363502651453, + "num_tokens": 163284418.0, + "step": 1127 + }, + { + "entropy": 2.404296875, + "epoch": 0.0194209860282534, + "grad_norm": 0.7466304302215576, + "learning_rate": 9.999877863051464e-06, + "loss": 2.3752, + "mean_token_accuracy": 0.4825763679109514, + "num_tokens": 163435314.0, + "step": 1128 + }, + { + "entropy": 2.39111328125, + "epoch": 0.01943820321444866, + "grad_norm": 0.5774152278900146, + "learning_rate": 9.999875932070431e-06, + "loss": 2.3536, + "mean_token_accuracy": 0.4831749680452049, + "num_tokens": 163590841.0, + "step": 1129 + }, + { + "entropy": 2.47607421875, + "epoch": 0.019455420400643922, + "grad_norm": 0.6369538307189941, + "learning_rate": 9.9998739859447e-06, + "loss": 2.4949, + "mean_token_accuracy": 0.47335345949977636, + "num_tokens": 163724787.0, + "step": 1130 + }, + { + "entropy": 2.528076171875, + "epoch": 0.019472637586839182, + "grad_norm": 0.5823352932929993, + "learning_rate": 9.999872024674277e-06, + "loss": 2.5034, + "mean_token_accuracy": 0.4663214525207877, + "num_tokens": 163854370.0, + "step": 1131 + }, + { + "entropy": 2.518310546875, + "epoch": 0.019489854773034443, + "grad_norm": 0.6219056844711304, + "learning_rate": 9.99987004825917e-06, + "loss": 2.5005, + "mean_token_accuracy": 0.47132935794070363, + "num_tokens": 163999338.0, + "step": 1132 + }, + { + "entropy": 2.46630859375, + "epoch": 0.019507071959229703, + "grad_norm": 0.5910077691078186, + "learning_rate": 9.999868056699382e-06, + "loss": 2.4295, + "mean_token_accuracy": 0.4800408352166414, + "num_tokens": 164145130.0, + "step": 1133 + }, + { + "entropy": 2.49267578125, + "epoch": 0.019524289145424964, + "grad_norm": 0.6001760959625244, + "learning_rate": 9.99986604999492e-06, + "loss": 2.4397, + "mean_token_accuracy": 0.4783216747455299, + "num_tokens": 164283826.0, + "step": 1134 + }, + { + "entropy": 2.46240234375, + "epoch": 0.019541506331620224, + "grad_norm": 0.6204310655593872, + "learning_rate": 9.99986402814579e-06, + "loss": 2.4157, + "mean_token_accuracy": 0.48227256163954735, + "num_tokens": 164413337.0, + "step": 1135 + }, + { + "entropy": 2.3477783203125, + "epoch": 0.019558723517815485, + "grad_norm": 0.5665948987007141, + "learning_rate": 9.999861991151999e-06, + "loss": 2.2835, + "mean_token_accuracy": 0.5003114375285804, + "num_tokens": 164568720.0, + "step": 1136 + }, + { + "entropy": 2.46435546875, + "epoch": 0.019575940704010742, + "grad_norm": 0.5802145004272461, + "learning_rate": 9.999859939013553e-06, + "loss": 2.3899, + "mean_token_accuracy": 0.484941388014704, + "num_tokens": 164705235.0, + "step": 1137 + }, + { + "entropy": 2.4754638671875, + "epoch": 0.019593157890206003, + "grad_norm": 0.6445546746253967, + "learning_rate": 9.999857871730456e-06, + "loss": 2.4595, + "mean_token_accuracy": 0.4826706210151315, + "num_tokens": 164834818.0, + "step": 1138 + }, + { + "entropy": 2.490478515625, + "epoch": 0.019610375076401263, + "grad_norm": 0.6412522196769714, + "learning_rate": 9.999855789302716e-06, + "loss": 2.4606, + "mean_token_accuracy": 0.473591239657253, + "num_tokens": 164978552.0, + "step": 1139 + }, + { + "entropy": 2.3907470703125, + "epoch": 0.019627592262596524, + "grad_norm": 0.5629065036773682, + "learning_rate": 9.99985369173034e-06, + "loss": 2.3848, + "mean_token_accuracy": 0.4844861035235226, + "num_tokens": 165121299.0, + "step": 1140 + }, + { + "entropy": 2.4473876953125, + "epoch": 0.019644809448791784, + "grad_norm": 0.6042686104774475, + "learning_rate": 9.999851579013334e-06, + "loss": 2.4534, + "mean_token_accuracy": 0.48099326388910413, + "num_tokens": 165256223.0, + "step": 1141 + }, + { + "entropy": 2.4193115234375, + "epoch": 0.019662026634987045, + "grad_norm": 0.5502400994300842, + "learning_rate": 9.999849451151702e-06, + "loss": 2.4046, + "mean_token_accuracy": 0.4827784230001271, + "num_tokens": 165406097.0, + "step": 1142 + }, + { + "entropy": 2.4688720703125, + "epoch": 0.019679243821182305, + "grad_norm": 0.5841612815856934, + "learning_rate": 9.999847308145456e-06, + "loss": 2.4319, + "mean_token_accuracy": 0.48387880716472864, + "num_tokens": 165549274.0, + "step": 1143 + }, + { + "entropy": 2.417724609375, + "epoch": 0.019696461007377566, + "grad_norm": 0.5570199489593506, + "learning_rate": 9.999845149994595e-06, + "loss": 2.3172, + "mean_token_accuracy": 0.4893231294117868, + "num_tokens": 165709510.0, + "step": 1144 + }, + { + "entropy": 2.4190673828125, + "epoch": 0.019713678193572823, + "grad_norm": 0.6449766159057617, + "learning_rate": 9.999842976699133e-06, + "loss": 2.3729, + "mean_token_accuracy": 0.48674700502306223, + "num_tokens": 165858965.0, + "step": 1145 + }, + { + "entropy": 2.448974609375, + "epoch": 0.019730895379768083, + "grad_norm": 0.5931810736656189, + "learning_rate": 9.99984078825907e-06, + "loss": 2.4001, + "mean_token_accuracy": 0.4783625756390393, + "num_tokens": 165996304.0, + "step": 1146 + }, + { + "entropy": 2.46533203125, + "epoch": 0.019748112565963344, + "grad_norm": 0.543803870677948, + "learning_rate": 9.999838584674417e-06, + "loss": 2.4305, + "mean_token_accuracy": 0.4785708379931748, + "num_tokens": 166150884.0, + "step": 1147 + }, + { + "entropy": 2.55517578125, + "epoch": 0.019765329752158604, + "grad_norm": 0.59067302942276, + "learning_rate": 9.99983636594518e-06, + "loss": 2.5251, + "mean_token_accuracy": 0.4674039273522794, + "num_tokens": 166283288.0, + "step": 1148 + }, + { + "entropy": 2.45947265625, + "epoch": 0.019782546938353865, + "grad_norm": 0.56581711769104, + "learning_rate": 9.999834132071364e-06, + "loss": 2.4188, + "mean_token_accuracy": 0.48025199631229043, + "num_tokens": 166428168.0, + "step": 1149 + }, + { + "entropy": 2.4490966796875, + "epoch": 0.019799764124549125, + "grad_norm": 0.6398937106132507, + "learning_rate": 9.999831883052978e-06, + "loss": 2.4001, + "mean_token_accuracy": 0.48107053339481354, + "num_tokens": 166579098.0, + "step": 1150 + }, + { + "entropy": 2.396484375, + "epoch": 0.019816981310744386, + "grad_norm": 0.5707962512969971, + "learning_rate": 9.999829618890028e-06, + "loss": 2.3725, + "mean_token_accuracy": 0.4916608249768615, + "num_tokens": 166724897.0, + "step": 1151 + }, + { + "entropy": 2.3883056640625, + "epoch": 0.019834198496939646, + "grad_norm": 0.5890077352523804, + "learning_rate": 9.99982733958252e-06, + "loss": 2.3406, + "mean_token_accuracy": 0.49726316425949335, + "num_tokens": 166869333.0, + "step": 1152 + }, + { + "entropy": 2.41796875, + "epoch": 0.019851415683134904, + "grad_norm": 0.5820266604423523, + "learning_rate": 9.99982504513046e-06, + "loss": 2.3695, + "mean_token_accuracy": 0.4875667071901262, + "num_tokens": 167009538.0, + "step": 1153 + }, + { + "entropy": 2.4931640625, + "epoch": 0.019868632869330164, + "grad_norm": 0.5474282503128052, + "learning_rate": 9.999822735533857e-06, + "loss": 2.4836, + "mean_token_accuracy": 0.47462243027985096, + "num_tokens": 167165134.0, + "step": 1154 + }, + { + "entropy": 2.48193359375, + "epoch": 0.019885850055525425, + "grad_norm": 0.5690717101097107, + "learning_rate": 9.999820410792717e-06, + "loss": 2.4011, + "mean_token_accuracy": 0.47985804779455066, + "num_tokens": 167308042.0, + "step": 1155 + }, + { + "entropy": 2.46240234375, + "epoch": 0.019903067241720685, + "grad_norm": 0.6280803680419922, + "learning_rate": 9.99981807090705e-06, + "loss": 2.4093, + "mean_token_accuracy": 0.4808740792796016, + "num_tokens": 167463555.0, + "step": 1156 + }, + { + "entropy": 2.43505859375, + "epoch": 0.019920284427915946, + "grad_norm": 0.5713448524475098, + "learning_rate": 9.999815715876858e-06, + "loss": 2.4033, + "mean_token_accuracy": 0.48702497407794, + "num_tokens": 167617300.0, + "step": 1157 + }, + { + "entropy": 2.41650390625, + "epoch": 0.019937501614111206, + "grad_norm": 0.5802963376045227, + "learning_rate": 9.999813345702151e-06, + "loss": 2.3546, + "mean_token_accuracy": 0.4853116972371936, + "num_tokens": 167751762.0, + "step": 1158 + }, + { + "entropy": 2.54541015625, + "epoch": 0.019954718800306467, + "grad_norm": 0.5789119005203247, + "learning_rate": 9.999810960382936e-06, + "loss": 2.5738, + "mean_token_accuracy": 0.46945408172905445, + "num_tokens": 167894886.0, + "step": 1159 + }, + { + "entropy": 2.4666748046875, + "epoch": 0.019971935986501727, + "grad_norm": 0.5464440584182739, + "learning_rate": 9.999808559919221e-06, + "loss": 2.4415, + "mean_token_accuracy": 0.4792809300124645, + "num_tokens": 168052147.0, + "step": 1160 + }, + { + "entropy": 2.450927734375, + "epoch": 0.019989153172696988, + "grad_norm": 0.5954328179359436, + "learning_rate": 9.999806144311013e-06, + "loss": 2.3982, + "mean_token_accuracy": 0.4799165716394782, + "num_tokens": 168188013.0, + "step": 1161 + }, + { + "entropy": 2.483154296875, + "epoch": 0.020006370358892245, + "grad_norm": 0.6072090268135071, + "learning_rate": 9.999803713558316e-06, + "loss": 2.4523, + "mean_token_accuracy": 0.48105273954570293, + "num_tokens": 168330752.0, + "step": 1162 + }, + { + "entropy": 2.472900390625, + "epoch": 0.020023587545087505, + "grad_norm": 0.5980669260025024, + "learning_rate": 9.99980126766114e-06, + "loss": 2.4257, + "mean_token_accuracy": 0.47724353708326817, + "num_tokens": 168466830.0, + "step": 1163 + }, + { + "entropy": 2.526123046875, + "epoch": 0.020040804731282766, + "grad_norm": 0.6172068119049072, + "learning_rate": 9.999798806619494e-06, + "loss": 2.5131, + "mean_token_accuracy": 0.47623227164149284, + "num_tokens": 168600305.0, + "step": 1164 + }, + { + "entropy": 2.515380859375, + "epoch": 0.020058021917478026, + "grad_norm": 0.6749062538146973, + "learning_rate": 9.999796330433385e-06, + "loss": 2.4829, + "mean_token_accuracy": 0.47597271809354424, + "num_tokens": 168758522.0, + "step": 1165 + }, + { + "entropy": 2.50341796875, + "epoch": 0.020075239103673287, + "grad_norm": 0.6174373626708984, + "learning_rate": 9.999793839102816e-06, + "loss": 2.4201, + "mean_token_accuracy": 0.47675873059779406, + "num_tokens": 168908687.0, + "step": 1166 + }, + { + "entropy": 2.4525146484375, + "epoch": 0.020092456289868547, + "grad_norm": 0.5919591188430786, + "learning_rate": 9.9997913326278e-06, + "loss": 2.4458, + "mean_token_accuracy": 0.48330248473212123, + "num_tokens": 169051058.0, + "step": 1167 + }, + { + "entropy": 2.4176025390625, + "epoch": 0.020109673476063808, + "grad_norm": 0.6027262210845947, + "learning_rate": 9.999788811008342e-06, + "loss": 2.3923, + "mean_token_accuracy": 0.48441468458622694, + "num_tokens": 169197102.0, + "step": 1168 + }, + { + "entropy": 2.500244140625, + "epoch": 0.02012689066225907, + "grad_norm": 0.5818685293197632, + "learning_rate": 9.99978627424445e-06, + "loss": 2.4626, + "mean_token_accuracy": 0.47057256614789367, + "num_tokens": 169324225.0, + "step": 1169 + }, + { + "entropy": 2.364501953125, + "epoch": 0.020144107848454326, + "grad_norm": 0.6910699605941772, + "learning_rate": 9.999783722336132e-06, + "loss": 2.3512, + "mean_token_accuracy": 0.48974316706880927, + "num_tokens": 169484740.0, + "step": 1170 + }, + { + "entropy": 2.423828125, + "epoch": 0.020161325034649586, + "grad_norm": 0.583166778087616, + "learning_rate": 9.999781155283396e-06, + "loss": 2.3721, + "mean_token_accuracy": 0.4812298850156367, + "num_tokens": 169629618.0, + "step": 1171 + }, + { + "entropy": 2.403076171875, + "epoch": 0.020178542220844847, + "grad_norm": 0.5646472573280334, + "learning_rate": 9.99977857308625e-06, + "loss": 2.3559, + "mean_token_accuracy": 0.4837103271856904, + "num_tokens": 169781342.0, + "step": 1172 + }, + { + "entropy": 2.3782958984375, + "epoch": 0.020195759407040107, + "grad_norm": 0.5860679149627686, + "learning_rate": 9.9997759757447e-06, + "loss": 2.3599, + "mean_token_accuracy": 0.49117877380922437, + "num_tokens": 169922222.0, + "step": 1173 + }, + { + "entropy": 2.416748046875, + "epoch": 0.020212976593235368, + "grad_norm": 0.6000187993049622, + "learning_rate": 9.999773363258755e-06, + "loss": 2.4166, + "mean_token_accuracy": 0.4860631856136024, + "num_tokens": 170059673.0, + "step": 1174 + }, + { + "entropy": 2.488037109375, + "epoch": 0.020230193779430628, + "grad_norm": 0.5900315046310425, + "learning_rate": 9.999770735628423e-06, + "loss": 2.4818, + "mean_token_accuracy": 0.47089498909190297, + "num_tokens": 170199601.0, + "step": 1175 + }, + { + "entropy": 2.4609375, + "epoch": 0.02024741096562589, + "grad_norm": 0.7553943991661072, + "learning_rate": 9.999768092853711e-06, + "loss": 2.3982, + "mean_token_accuracy": 0.48237905837595463, + "num_tokens": 170339217.0, + "step": 1176 + }, + { + "entropy": 2.489013671875, + "epoch": 0.02026462815182115, + "grad_norm": 0.5692632794380188, + "learning_rate": 9.99976543493463e-06, + "loss": 2.4907, + "mean_token_accuracy": 0.4684693585149944, + "num_tokens": 170487890.0, + "step": 1177 + }, + { + "entropy": 2.477294921875, + "epoch": 0.020281845338016406, + "grad_norm": 0.5685938000679016, + "learning_rate": 9.999762761871184e-06, + "loss": 2.403, + "mean_token_accuracy": 0.48181371157988906, + "num_tokens": 170641505.0, + "step": 1178 + }, + { + "entropy": 2.43310546875, + "epoch": 0.020299062524211667, + "grad_norm": 0.6160794496536255, + "learning_rate": 9.999760073663383e-06, + "loss": 2.3919, + "mean_token_accuracy": 0.48729625064879656, + "num_tokens": 170769357.0, + "step": 1179 + }, + { + "entropy": 2.5234375, + "epoch": 0.020316279710406927, + "grad_norm": 0.6029751300811768, + "learning_rate": 9.999757370311237e-06, + "loss": 2.5087, + "mean_token_accuracy": 0.46955224964767694, + "num_tokens": 170893993.0, + "step": 1180 + }, + { + "entropy": 2.463134765625, + "epoch": 0.020333496896602188, + "grad_norm": 0.635032057762146, + "learning_rate": 9.999754651814751e-06, + "loss": 2.4153, + "mean_token_accuracy": 0.4809435624629259, + "num_tokens": 171043577.0, + "step": 1181 + }, + { + "entropy": 2.3951416015625, + "epoch": 0.02035071408279745, + "grad_norm": 0.5716371536254883, + "learning_rate": 9.999751918173935e-06, + "loss": 2.3335, + "mean_token_accuracy": 0.49141963897272944, + "num_tokens": 171198846.0, + "step": 1182 + }, + { + "entropy": 2.39892578125, + "epoch": 0.02036793126899271, + "grad_norm": 0.5712882876396179, + "learning_rate": 9.999749169388798e-06, + "loss": 2.3149, + "mean_token_accuracy": 0.49085350101813674, + "num_tokens": 171351472.0, + "step": 1183 + }, + { + "entropy": 2.412353515625, + "epoch": 0.02038514845518797, + "grad_norm": 0.5964887142181396, + "learning_rate": 9.999746405459345e-06, + "loss": 2.3524, + "mean_token_accuracy": 0.4884929973632097, + "num_tokens": 171488905.0, + "step": 1184 + }, + { + "entropy": 2.5108642578125, + "epoch": 0.02040236564138323, + "grad_norm": 0.5682933330535889, + "learning_rate": 9.999743626385587e-06, + "loss": 2.4582, + "mean_token_accuracy": 0.4702299786731601, + "num_tokens": 171634563.0, + "step": 1185 + }, + { + "entropy": 2.4671630859375, + "epoch": 0.02041958282757849, + "grad_norm": 0.6103932857513428, + "learning_rate": 9.999740832167532e-06, + "loss": 2.4564, + "mean_token_accuracy": 0.4751875218935311, + "num_tokens": 171777207.0, + "step": 1186 + }, + { + "entropy": 2.49267578125, + "epoch": 0.020436800013773748, + "grad_norm": 1.0117425918579102, + "learning_rate": 9.99973802280519e-06, + "loss": 2.4774, + "mean_token_accuracy": 0.4714944367296994, + "num_tokens": 171934632.0, + "step": 1187 + }, + { + "entropy": 2.45458984375, + "epoch": 0.020454017199969008, + "grad_norm": 0.5842525959014893, + "learning_rate": 9.999735198298566e-06, + "loss": 2.4146, + "mean_token_accuracy": 0.47794078243896365, + "num_tokens": 172076052.0, + "step": 1188 + }, + { + "entropy": 2.43310546875, + "epoch": 0.02047123438616427, + "grad_norm": 0.594760000705719, + "learning_rate": 9.999732358647671e-06, + "loss": 2.3923, + "mean_token_accuracy": 0.4838462113402784, + "num_tokens": 172214297.0, + "step": 1189 + }, + { + "entropy": 2.46826171875, + "epoch": 0.02048845157235953, + "grad_norm": 0.5831640958786011, + "learning_rate": 9.999729503852515e-06, + "loss": 2.3907, + "mean_token_accuracy": 0.48116701701655984, + "num_tokens": 172354024.0, + "step": 1190 + }, + { + "entropy": 2.4698486328125, + "epoch": 0.02050566875855479, + "grad_norm": 0.612289547920227, + "learning_rate": 9.999726633913103e-06, + "loss": 2.4473, + "mean_token_accuracy": 0.48014617431908846, + "num_tokens": 172496977.0, + "step": 1191 + }, + { + "entropy": 2.462890625, + "epoch": 0.02052288594475005, + "grad_norm": 0.5079575777053833, + "learning_rate": 9.999723748829446e-06, + "loss": 2.4118, + "mean_token_accuracy": 0.4765968224965036, + "num_tokens": 172664155.0, + "step": 1192 + }, + { + "entropy": 2.4237060546875, + "epoch": 0.02054010313094531, + "grad_norm": 0.5485002398490906, + "learning_rate": 9.999720848601552e-06, + "loss": 2.374, + "mean_token_accuracy": 0.4809511392377317, + "num_tokens": 172824727.0, + "step": 1193 + }, + { + "entropy": 2.4146728515625, + "epoch": 0.02055732031714057, + "grad_norm": 0.5650107264518738, + "learning_rate": 9.999717933229429e-06, + "loss": 2.3705, + "mean_token_accuracy": 0.49346699519082904, + "num_tokens": 172981026.0, + "step": 1194 + }, + { + "entropy": 2.47900390625, + "epoch": 0.020574537503335828, + "grad_norm": 0.5556767582893372, + "learning_rate": 9.999715002713088e-06, + "loss": 2.4417, + "mean_token_accuracy": 0.4759158226661384, + "num_tokens": 173123752.0, + "step": 1195 + }, + { + "entropy": 2.505859375, + "epoch": 0.02059175468953109, + "grad_norm": 0.5984048843383789, + "learning_rate": 9.999712057052537e-06, + "loss": 2.4571, + "mean_token_accuracy": 0.4764223378151655, + "num_tokens": 173259038.0, + "step": 1196 + }, + { + "entropy": 2.4449462890625, + "epoch": 0.02060897187572635, + "grad_norm": 0.6215763688087463, + "learning_rate": 9.999709096247784e-06, + "loss": 2.4247, + "mean_token_accuracy": 0.48310248367488384, + "num_tokens": 173401014.0, + "step": 1197 + }, + { + "entropy": 2.391845703125, + "epoch": 0.02062618906192161, + "grad_norm": 0.6615683436393738, + "learning_rate": 9.99970612029884e-06, + "loss": 2.368, + "mean_token_accuracy": 0.49283183040097356, + "num_tokens": 173553153.0, + "step": 1198 + }, + { + "entropy": 2.4296875, + "epoch": 0.02064340624811687, + "grad_norm": 0.5707510113716125, + "learning_rate": 9.999703129205711e-06, + "loss": 2.3997, + "mean_token_accuracy": 0.4811532562598586, + "num_tokens": 173705473.0, + "step": 1199 + }, + { + "entropy": 2.3585205078125, + "epoch": 0.02066062343431213, + "grad_norm": 0.5650596618652344, + "learning_rate": 9.999700122968408e-06, + "loss": 2.2966, + "mean_token_accuracy": 0.4981447677128017, + "num_tokens": 173849665.0, + "step": 1200 + }, + { + "entropy": 2.46142578125, + "epoch": 0.02067784062050739, + "grad_norm": 0.5815020203590393, + "learning_rate": 9.99969710158694e-06, + "loss": 2.4193, + "mean_token_accuracy": 0.4771373653784394, + "num_tokens": 173988108.0, + "step": 1201 + }, + { + "entropy": 2.4542236328125, + "epoch": 0.020695057806702652, + "grad_norm": 0.5876851677894592, + "learning_rate": 9.999694065061316e-06, + "loss": 2.4786, + "mean_token_accuracy": 0.4749302426353097, + "num_tokens": 174126294.0, + "step": 1202 + }, + { + "entropy": 2.43505859375, + "epoch": 0.02071227499289791, + "grad_norm": 0.562313437461853, + "learning_rate": 9.999691013391544e-06, + "loss": 2.4206, + "mean_token_accuracy": 0.4778692005202174, + "num_tokens": 174269124.0, + "step": 1203 + }, + { + "entropy": 2.443115234375, + "epoch": 0.02072949217909317, + "grad_norm": 0.5638776421546936, + "learning_rate": 9.999687946577636e-06, + "loss": 2.3994, + "mean_token_accuracy": 0.48797021247446537, + "num_tokens": 174427015.0, + "step": 1204 + }, + { + "entropy": 2.4337158203125, + "epoch": 0.02074670936528843, + "grad_norm": 0.5831981301307678, + "learning_rate": 9.999684864619599e-06, + "loss": 2.3989, + "mean_token_accuracy": 0.4862150545231998, + "num_tokens": 174569995.0, + "step": 1205 + }, + { + "entropy": 2.449951171875, + "epoch": 0.02076392655148369, + "grad_norm": 0.5815305709838867, + "learning_rate": 9.999681767517441e-06, + "loss": 2.3913, + "mean_token_accuracy": 0.4784506266005337, + "num_tokens": 174706593.0, + "step": 1206 + }, + { + "entropy": 2.487060546875, + "epoch": 0.02078114373767895, + "grad_norm": 0.5909605026245117, + "learning_rate": 9.999678655271176e-06, + "loss": 2.4765, + "mean_token_accuracy": 0.47097450820729136, + "num_tokens": 174846238.0, + "step": 1207 + }, + { + "entropy": 2.44287109375, + "epoch": 0.02079836092387421, + "grad_norm": 0.5870411992073059, + "learning_rate": 9.99967552788081e-06, + "loss": 2.3842, + "mean_token_accuracy": 0.48391013080254197, + "num_tokens": 174992749.0, + "step": 1208 + }, + { + "entropy": 2.4306640625, + "epoch": 0.020815578110069472, + "grad_norm": 0.5437948703765869, + "learning_rate": 9.999672385346352e-06, + "loss": 2.3978, + "mean_token_accuracy": 0.4776941388845444, + "num_tokens": 175147217.0, + "step": 1209 + }, + { + "entropy": 2.428955078125, + "epoch": 0.020832795296264733, + "grad_norm": 0.5859825611114502, + "learning_rate": 9.999669227667815e-06, + "loss": 2.3921, + "mean_token_accuracy": 0.4817708395421505, + "num_tokens": 175293722.0, + "step": 1210 + }, + { + "entropy": 2.43603515625, + "epoch": 0.020850012482459993, + "grad_norm": 0.5771918892860413, + "learning_rate": 9.999666054845206e-06, + "loss": 2.3826, + "mean_token_accuracy": 0.4837156576104462, + "num_tokens": 175434663.0, + "step": 1211 + }, + { + "entropy": 2.4981689453125, + "epoch": 0.02086722966865525, + "grad_norm": 0.6018497943878174, + "learning_rate": 9.999662866878534e-06, + "loss": 2.4908, + "mean_token_accuracy": 0.4697205852717161, + "num_tokens": 175582504.0, + "step": 1212 + }, + { + "entropy": 2.51611328125, + "epoch": 0.02088444685485051, + "grad_norm": 0.5939236283302307, + "learning_rate": 9.999659663767811e-06, + "loss": 2.4614, + "mean_token_accuracy": 0.47295166924595833, + "num_tokens": 175721585.0, + "step": 1213 + }, + { + "entropy": 2.399169921875, + "epoch": 0.02090166404104577, + "grad_norm": 0.5899982452392578, + "learning_rate": 9.999656445513043e-06, + "loss": 2.3798, + "mean_token_accuracy": 0.4900959865190089, + "num_tokens": 175867778.0, + "step": 1214 + }, + { + "entropy": 2.4320068359375, + "epoch": 0.020918881227241032, + "grad_norm": 0.5718798041343689, + "learning_rate": 9.999653212114245e-06, + "loss": 2.395, + "mean_token_accuracy": 0.48120944295078516, + "num_tokens": 176013170.0, + "step": 1215 + }, + { + "entropy": 2.39794921875, + "epoch": 0.020936098413436292, + "grad_norm": 0.603789746761322, + "learning_rate": 9.99964996357142e-06, + "loss": 2.3539, + "mean_token_accuracy": 0.4898811914026737, + "num_tokens": 176154005.0, + "step": 1216 + }, + { + "entropy": 2.54248046875, + "epoch": 0.020953315599631553, + "grad_norm": 0.5883952379226685, + "learning_rate": 9.999646699884585e-06, + "loss": 2.4972, + "mean_token_accuracy": 0.46428748965263367, + "num_tokens": 176294756.0, + "step": 1217 + }, + { + "entropy": 2.4794921875, + "epoch": 0.020970532785826813, + "grad_norm": 0.5553483963012695, + "learning_rate": 9.999643421053747e-06, + "loss": 2.4403, + "mean_token_accuracy": 0.4781416282057762, + "num_tokens": 176450378.0, + "step": 1218 + }, + { + "entropy": 2.402099609375, + "epoch": 0.020987749972022074, + "grad_norm": 0.5453605055809021, + "learning_rate": 9.999640127078914e-06, + "loss": 2.3609, + "mean_token_accuracy": 0.4892904283478856, + "num_tokens": 176605732.0, + "step": 1219 + }, + { + "entropy": 2.425048828125, + "epoch": 0.02100496715821733, + "grad_norm": 0.5747688412666321, + "learning_rate": 9.999636817960097e-06, + "loss": 2.4142, + "mean_token_accuracy": 0.4863899489864707, + "num_tokens": 176749939.0, + "step": 1220 + }, + { + "entropy": 2.408203125, + "epoch": 0.02102218434441259, + "grad_norm": 0.5680667757987976, + "learning_rate": 9.999633493697307e-06, + "loss": 2.3843, + "mean_token_accuracy": 0.4866964789107442, + "num_tokens": 176896720.0, + "step": 1221 + }, + { + "entropy": 2.4571533203125, + "epoch": 0.021039401530607852, + "grad_norm": 0.5690086483955383, + "learning_rate": 9.999630154290553e-06, + "loss": 2.3909, + "mean_token_accuracy": 0.48753285547718406, + "num_tokens": 177039495.0, + "step": 1222 + }, + { + "entropy": 2.43115234375, + "epoch": 0.021056618716803113, + "grad_norm": 0.6062940359115601, + "learning_rate": 9.999626799739846e-06, + "loss": 2.4407, + "mean_token_accuracy": 0.4770724857226014, + "num_tokens": 177188199.0, + "step": 1223 + }, + { + "entropy": 2.490478515625, + "epoch": 0.021073835902998373, + "grad_norm": 0.6283055543899536, + "learning_rate": 9.999623430045196e-06, + "loss": 2.4805, + "mean_token_accuracy": 0.4742564079351723, + "num_tokens": 177323708.0, + "step": 1224 + }, + { + "entropy": 2.48046875, + "epoch": 0.021091053089193634, + "grad_norm": 0.5583751201629639, + "learning_rate": 9.999620045206614e-06, + "loss": 2.4941, + "mean_token_accuracy": 0.47078192001208663, + "num_tokens": 177479672.0, + "step": 1225 + }, + { + "entropy": 2.4630126953125, + "epoch": 0.021108270275388894, + "grad_norm": 0.5786680579185486, + "learning_rate": 9.999616645224109e-06, + "loss": 2.4421, + "mean_token_accuracy": 0.4788076733238995, + "num_tokens": 177624243.0, + "step": 1226 + }, + { + "entropy": 2.41796875, + "epoch": 0.021125487461584155, + "grad_norm": 0.5827528834342957, + "learning_rate": 9.999613230097692e-06, + "loss": 2.3804, + "mean_token_accuracy": 0.48857234325259924, + "num_tokens": 177764756.0, + "step": 1227 + }, + { + "entropy": 2.4376220703125, + "epoch": 0.02114270464777941, + "grad_norm": 0.5747877955436707, + "learning_rate": 9.99960979982737e-06, + "loss": 2.4097, + "mean_token_accuracy": 0.4837777316570282, + "num_tokens": 177898426.0, + "step": 1228 + }, + { + "entropy": 2.466064453125, + "epoch": 0.021159921833974672, + "grad_norm": 0.5510834455490112, + "learning_rate": 9.999606354413159e-06, + "loss": 2.398, + "mean_token_accuracy": 0.4762152610346675, + "num_tokens": 178050984.0, + "step": 1229 + }, + { + "entropy": 2.40283203125, + "epoch": 0.021177139020169933, + "grad_norm": 0.5941503643989563, + "learning_rate": 9.999602893855067e-06, + "loss": 2.3841, + "mean_token_accuracy": 0.48818005435168743, + "num_tokens": 178189354.0, + "step": 1230 + }, + { + "entropy": 2.438720703125, + "epoch": 0.021194356206365193, + "grad_norm": 0.5585294961929321, + "learning_rate": 9.999599418153104e-06, + "loss": 2.3762, + "mean_token_accuracy": 0.4838878009468317, + "num_tokens": 178331962.0, + "step": 1231 + }, + { + "entropy": 2.4259033203125, + "epoch": 0.021211573392560454, + "grad_norm": 0.6388170123100281, + "learning_rate": 9.999595927307279e-06, + "loss": 2.3748, + "mean_token_accuracy": 0.48712681280449033, + "num_tokens": 178482059.0, + "step": 1232 + }, + { + "entropy": 2.4521484375, + "epoch": 0.021228790578755714, + "grad_norm": 0.5800935626029968, + "learning_rate": 9.999592421317606e-06, + "loss": 2.4034, + "mean_token_accuracy": 0.48077769484370947, + "num_tokens": 178614560.0, + "step": 1233 + }, + { + "entropy": 2.4222412109375, + "epoch": 0.021246007764950975, + "grad_norm": 0.5996948480606079, + "learning_rate": 9.999588900184094e-06, + "loss": 2.3708, + "mean_token_accuracy": 0.4873070912435651, + "num_tokens": 178766038.0, + "step": 1234 + }, + { + "entropy": 2.4306640625, + "epoch": 0.021263224951146235, + "grad_norm": 0.5760281682014465, + "learning_rate": 9.999585363906754e-06, + "loss": 2.3985, + "mean_token_accuracy": 0.4887649049051106, + "num_tokens": 178911786.0, + "step": 1235 + }, + { + "entropy": 2.4580078125, + "epoch": 0.021280442137341496, + "grad_norm": 0.5570631623268127, + "learning_rate": 9.999581812485595e-06, + "loss": 2.4024, + "mean_token_accuracy": 0.4858241407200694, + "num_tokens": 179059130.0, + "step": 1236 + }, + { + "entropy": 2.43017578125, + "epoch": 0.021297659323536753, + "grad_norm": 0.6132498979568481, + "learning_rate": 9.999578245920632e-06, + "loss": 2.413, + "mean_token_accuracy": 0.4828817122615874, + "num_tokens": 179193557.0, + "step": 1237 + }, + { + "entropy": 2.518310546875, + "epoch": 0.021314876509732014, + "grad_norm": 0.6096269488334656, + "learning_rate": 9.99957466421187e-06, + "loss": 2.4572, + "mean_token_accuracy": 0.4700988312251866, + "num_tokens": 179318893.0, + "step": 1238 + }, + { + "entropy": 2.4744873046875, + "epoch": 0.021332093695927274, + "grad_norm": 0.5955691337585449, + "learning_rate": 9.999571067359323e-06, + "loss": 2.4467, + "mean_token_accuracy": 0.475006652995944, + "num_tokens": 179459850.0, + "step": 1239 + }, + { + "entropy": 2.498046875, + "epoch": 0.021349310882122535, + "grad_norm": 0.5774043798446655, + "learning_rate": 9.999567455363003e-06, + "loss": 2.4671, + "mean_token_accuracy": 0.4733973373658955, + "num_tokens": 179610624.0, + "step": 1240 + }, + { + "entropy": 2.450439453125, + "epoch": 0.021366528068317795, + "grad_norm": 0.5889162421226501, + "learning_rate": 9.99956382822292e-06, + "loss": 2.4152, + "mean_token_accuracy": 0.4829295091331005, + "num_tokens": 179743518.0, + "step": 1241 + }, + { + "entropy": 2.4434814453125, + "epoch": 0.021383745254513056, + "grad_norm": 0.5916315913200378, + "learning_rate": 9.999560185939082e-06, + "loss": 2.3796, + "mean_token_accuracy": 0.48776215221732855, + "num_tokens": 179888613.0, + "step": 1242 + }, + { + "entropy": 2.44091796875, + "epoch": 0.021400962440708316, + "grad_norm": 0.6239886283874512, + "learning_rate": 9.999556528511504e-06, + "loss": 2.4542, + "mean_token_accuracy": 0.48557718843221664, + "num_tokens": 180029182.0, + "step": 1243 + }, + { + "entropy": 2.452880859375, + "epoch": 0.021418179626903577, + "grad_norm": 0.5450658202171326, + "learning_rate": 9.999552855940197e-06, + "loss": 2.4064, + "mean_token_accuracy": 0.48464760929346085, + "num_tokens": 180184347.0, + "step": 1244 + }, + { + "entropy": 2.46923828125, + "epoch": 0.021435396813098834, + "grad_norm": 0.5909526944160461, + "learning_rate": 9.999549168225169e-06, + "loss": 2.3883, + "mean_token_accuracy": 0.47910354332998395, + "num_tokens": 180333887.0, + "step": 1245 + }, + { + "entropy": 2.4539794921875, + "epoch": 0.021452613999294094, + "grad_norm": 0.581224799156189, + "learning_rate": 9.999545465366433e-06, + "loss": 2.4123, + "mean_token_accuracy": 0.4825686193071306, + "num_tokens": 180490238.0, + "step": 1246 + }, + { + "entropy": 2.439697265625, + "epoch": 0.021469831185489355, + "grad_norm": 0.6227947473526001, + "learning_rate": 9.999541747364002e-06, + "loss": 2.4026, + "mean_token_accuracy": 0.4787580128759146, + "num_tokens": 180625744.0, + "step": 1247 + }, + { + "entropy": 2.4227294921875, + "epoch": 0.021487048371684615, + "grad_norm": 0.5937827825546265, + "learning_rate": 9.999538014217884e-06, + "loss": 2.359, + "mean_token_accuracy": 0.4844051315449178, + "num_tokens": 180761591.0, + "step": 1248 + }, + { + "entropy": 2.46337890625, + "epoch": 0.021504265557879876, + "grad_norm": 0.5774495601654053, + "learning_rate": 9.999534265928092e-06, + "loss": 2.4378, + "mean_token_accuracy": 0.4757684310898185, + "num_tokens": 180903465.0, + "step": 1249 + }, + { + "entropy": 2.5347900390625, + "epoch": 0.021521482744075136, + "grad_norm": 0.5800737738609314, + "learning_rate": 9.999530502494639e-06, + "loss": 2.4917, + "mean_token_accuracy": 0.4692853162996471, + "num_tokens": 181053233.0, + "step": 1250 + }, + { + "entropy": 2.4166259765625, + "epoch": 0.021538699930270397, + "grad_norm": 0.5999194383621216, + "learning_rate": 9.999526723917533e-06, + "loss": 2.372, + "mean_token_accuracy": 0.4842418390326202, + "num_tokens": 181202187.0, + "step": 1251 + }, + { + "entropy": 2.3966064453125, + "epoch": 0.021555917116465657, + "grad_norm": 0.561681866645813, + "learning_rate": 9.999522930196787e-06, + "loss": 2.3211, + "mean_token_accuracy": 0.48686323687434196, + "num_tokens": 181345964.0, + "step": 1252 + }, + { + "entropy": 2.350830078125, + "epoch": 0.021573134302660914, + "grad_norm": 0.6096113920211792, + "learning_rate": 9.999519121332413e-06, + "loss": 2.3158, + "mean_token_accuracy": 0.49966981168836355, + "num_tokens": 181493986.0, + "step": 1253 + }, + { + "entropy": 2.466064453125, + "epoch": 0.021590351488856175, + "grad_norm": 0.5756585001945496, + "learning_rate": 9.99951529732442e-06, + "loss": 2.4099, + "mean_token_accuracy": 0.4760406082496047, + "num_tokens": 181643446.0, + "step": 1254 + }, + { + "entropy": 2.4566650390625, + "epoch": 0.021607568675051435, + "grad_norm": 0.6152567267417908, + "learning_rate": 9.999511458172823e-06, + "loss": 2.4127, + "mean_token_accuracy": 0.48100624280050397, + "num_tokens": 181798482.0, + "step": 1255 + }, + { + "entropy": 2.42578125, + "epoch": 0.021624785861246696, + "grad_norm": 0.5952988266944885, + "learning_rate": 9.999507603877634e-06, + "loss": 2.3729, + "mean_token_accuracy": 0.48960635205730796, + "num_tokens": 181947283.0, + "step": 1256 + }, + { + "entropy": 2.489013671875, + "epoch": 0.021642003047441957, + "grad_norm": 0.5729377865791321, + "learning_rate": 9.99950373443886e-06, + "loss": 2.4575, + "mean_token_accuracy": 0.47360460739582777, + "num_tokens": 182103948.0, + "step": 1257 + }, + { + "entropy": 2.367919921875, + "epoch": 0.021659220233637217, + "grad_norm": 0.5920243859291077, + "learning_rate": 9.999499849856518e-06, + "loss": 2.3522, + "mean_token_accuracy": 0.4938758905045688, + "num_tokens": 182244306.0, + "step": 1258 + }, + { + "entropy": 2.4716796875, + "epoch": 0.021676437419832478, + "grad_norm": 0.6242178678512573, + "learning_rate": 9.999495950130616e-06, + "loss": 2.4071, + "mean_token_accuracy": 0.4799223360605538, + "num_tokens": 182370840.0, + "step": 1259 + }, + { + "entropy": 2.4951171875, + "epoch": 0.021693654606027738, + "grad_norm": 0.7572094798088074, + "learning_rate": 9.999492035261166e-06, + "loss": 2.466, + "mean_token_accuracy": 0.48017187928780913, + "num_tokens": 182517478.0, + "step": 1260 + }, + { + "entropy": 2.4541015625, + "epoch": 0.021710871792223, + "grad_norm": 0.6314413547515869, + "learning_rate": 9.999488105248184e-06, + "loss": 2.4002, + "mean_token_accuracy": 0.4775316468439996, + "num_tokens": 182661585.0, + "step": 1261 + }, + { + "entropy": 2.4586181640625, + "epoch": 0.021728088978418256, + "grad_norm": 0.6284291744232178, + "learning_rate": 9.999484160091678e-06, + "loss": 2.4529, + "mean_token_accuracy": 0.4805305516347289, + "num_tokens": 182804645.0, + "step": 1262 + }, + { + "entropy": 2.479248046875, + "epoch": 0.021745306164613516, + "grad_norm": 0.6105698347091675, + "learning_rate": 9.99948019979166e-06, + "loss": 2.4449, + "mean_token_accuracy": 0.4798229462467134, + "num_tokens": 182947450.0, + "step": 1263 + }, + { + "entropy": 2.4501953125, + "epoch": 0.021762523350808777, + "grad_norm": 0.5790930390357971, + "learning_rate": 9.999476224348144e-06, + "loss": 2.4185, + "mean_token_accuracy": 0.4775314489379525, + "num_tokens": 183089455.0, + "step": 1264 + }, + { + "entropy": 2.5020751953125, + "epoch": 0.021779740537004037, + "grad_norm": 0.6524637341499329, + "learning_rate": 9.99947223376114e-06, + "loss": 2.4855, + "mean_token_accuracy": 0.480673139449209, + "num_tokens": 183241120.0, + "step": 1265 + }, + { + "entropy": 2.4835205078125, + "epoch": 0.021796957723199298, + "grad_norm": 0.5516358613967896, + "learning_rate": 9.99946822803066e-06, + "loss": 2.414, + "mean_token_accuracy": 0.47934063244611025, + "num_tokens": 183390072.0, + "step": 1266 + }, + { + "entropy": 2.506591796875, + "epoch": 0.02181417490939456, + "grad_norm": 0.5939943194389343, + "learning_rate": 9.99946420715672e-06, + "loss": 2.4367, + "mean_token_accuracy": 0.47747283428907394, + "num_tokens": 183539529.0, + "step": 1267 + }, + { + "entropy": 2.4515380859375, + "epoch": 0.02183139209558982, + "grad_norm": 0.5539537072181702, + "learning_rate": 9.999460171139328e-06, + "loss": 2.3763, + "mean_token_accuracy": 0.4820182635448873, + "num_tokens": 183688585.0, + "step": 1268 + }, + { + "entropy": 2.404541015625, + "epoch": 0.02184860928178508, + "grad_norm": 0.5919788479804993, + "learning_rate": 9.999456119978496e-06, + "loss": 2.4045, + "mean_token_accuracy": 0.4871440161950886, + "num_tokens": 183821506.0, + "step": 1269 + }, + { + "entropy": 2.472412109375, + "epoch": 0.021865826467980336, + "grad_norm": 0.6141915917396545, + "learning_rate": 9.999452053674242e-06, + "loss": 2.438, + "mean_token_accuracy": 0.4749964135698974, + "num_tokens": 183954051.0, + "step": 1270 + }, + { + "entropy": 2.4482421875, + "epoch": 0.021883043654175597, + "grad_norm": 0.5740827322006226, + "learning_rate": 9.99944797222657e-06, + "loss": 2.4203, + "mean_token_accuracy": 0.47882386669516563, + "num_tokens": 184097023.0, + "step": 1271 + }, + { + "entropy": 2.4649658203125, + "epoch": 0.021900260840370857, + "grad_norm": 0.6000681519508362, + "learning_rate": 9.999443875635499e-06, + "loss": 2.3876, + "mean_token_accuracy": 0.48048424860462546, + "num_tokens": 184244536.0, + "step": 1272 + }, + { + "entropy": 2.448486328125, + "epoch": 0.021917478026566118, + "grad_norm": 0.5612481236457825, + "learning_rate": 9.999439763901037e-06, + "loss": 2.375, + "mean_token_accuracy": 0.48185514099895954, + "num_tokens": 184392474.0, + "step": 1273 + }, + { + "entropy": 2.397705078125, + "epoch": 0.02193469521276138, + "grad_norm": 0.593385636806488, + "learning_rate": 9.9994356370232e-06, + "loss": 2.3472, + "mean_token_accuracy": 0.4966060775332153, + "num_tokens": 184541344.0, + "step": 1274 + }, + { + "entropy": 2.4896240234375, + "epoch": 0.02195191239895664, + "grad_norm": 0.5971876978874207, + "learning_rate": 9.999431495001998e-06, + "loss": 2.4195, + "mean_token_accuracy": 0.4794482118450105, + "num_tokens": 184672977.0, + "step": 1275 + }, + { + "entropy": 2.4857177734375, + "epoch": 0.0219691295851519, + "grad_norm": 0.6552586555480957, + "learning_rate": 9.999427337837444e-06, + "loss": 2.4895, + "mean_token_accuracy": 0.47428874857723713, + "num_tokens": 184826101.0, + "step": 1276 + }, + { + "entropy": 2.42529296875, + "epoch": 0.02198634677134716, + "grad_norm": 0.5778135061264038, + "learning_rate": 9.999423165529554e-06, + "loss": 2.4422, + "mean_token_accuracy": 0.4804799151606858, + "num_tokens": 184970321.0, + "step": 1277 + }, + { + "entropy": 2.4859619140625, + "epoch": 0.022003563957542417, + "grad_norm": 0.537326455116272, + "learning_rate": 9.999418978078335e-06, + "loss": 2.4407, + "mean_token_accuracy": 0.47214187448844314, + "num_tokens": 185128581.0, + "step": 1278 + }, + { + "entropy": 2.39697265625, + "epoch": 0.022020781143737678, + "grad_norm": 0.6056032776832581, + "learning_rate": 9.999414775483803e-06, + "loss": 2.381, + "mean_token_accuracy": 0.4875910747796297, + "num_tokens": 185266968.0, + "step": 1279 + }, + { + "entropy": 2.50537109375, + "epoch": 0.022037998329932938, + "grad_norm": 0.5535710453987122, + "learning_rate": 9.99941055774597e-06, + "loss": 2.4915, + "mean_token_accuracy": 0.4735433543100953, + "num_tokens": 185416068.0, + "step": 1280 + }, + { + "entropy": 2.4454345703125, + "epoch": 0.0220552155161282, + "grad_norm": 0.6215351223945618, + "learning_rate": 9.99940632486485e-06, + "loss": 2.3778, + "mean_token_accuracy": 0.4865972097031772, + "num_tokens": 185558293.0, + "step": 1281 + }, + { + "entropy": 2.510498046875, + "epoch": 0.02207243270232346, + "grad_norm": 0.5621281862258911, + "learning_rate": 9.999402076840452e-06, + "loss": 2.4886, + "mean_token_accuracy": 0.4751009796746075, + "num_tokens": 185707903.0, + "step": 1282 + }, + { + "entropy": 2.439208984375, + "epoch": 0.02208964988851872, + "grad_norm": 0.5782500505447388, + "learning_rate": 9.999397813672793e-06, + "loss": 2.4012, + "mean_token_accuracy": 0.47963299648836255, + "num_tokens": 185864974.0, + "step": 1283 + }, + { + "entropy": 2.452880859375, + "epoch": 0.02210686707471398, + "grad_norm": 0.584717333316803, + "learning_rate": 9.999393535361884e-06, + "loss": 2.4252, + "mean_token_accuracy": 0.4816917534917593, + "num_tokens": 186005584.0, + "step": 1284 + }, + { + "entropy": 2.43310546875, + "epoch": 0.02212408426090924, + "grad_norm": 0.5956825613975525, + "learning_rate": 9.99938924190774e-06, + "loss": 2.3782, + "mean_token_accuracy": 0.48892744118347764, + "num_tokens": 186143411.0, + "step": 1285 + }, + { + "entropy": 2.4464111328125, + "epoch": 0.0221413014471045, + "grad_norm": 0.5437039136886597, + "learning_rate": 9.99938493331037e-06, + "loss": 2.4353, + "mean_token_accuracy": 0.4839400313794613, + "num_tokens": 186299707.0, + "step": 1286 + }, + { + "entropy": 2.498046875, + "epoch": 0.02215851863329976, + "grad_norm": 0.6034572720527649, + "learning_rate": 9.999380609569791e-06, + "loss": 2.4795, + "mean_token_accuracy": 0.47284478275105357, + "num_tokens": 186447393.0, + "step": 1287 + }, + { + "entropy": 2.4296875, + "epoch": 0.02217573581949502, + "grad_norm": 0.5920705795288086, + "learning_rate": 9.999376270686015e-06, + "loss": 2.4214, + "mean_token_accuracy": 0.4788710060529411, + "num_tokens": 186590786.0, + "step": 1288 + }, + { + "entropy": 2.429443359375, + "epoch": 0.02219295300569028, + "grad_norm": 0.5723591446876526, + "learning_rate": 9.999371916659054e-06, + "loss": 2.385, + "mean_token_accuracy": 0.48299159901216626, + "num_tokens": 186739554.0, + "step": 1289 + }, + { + "entropy": 2.39111328125, + "epoch": 0.02221017019188554, + "grad_norm": 0.5848551988601685, + "learning_rate": 9.999367547488923e-06, + "loss": 2.2912, + "mean_token_accuracy": 0.49383825389668345, + "num_tokens": 186887370.0, + "step": 1290 + }, + { + "entropy": 2.467529296875, + "epoch": 0.0222273873780808, + "grad_norm": 0.5705557465553284, + "learning_rate": 9.999363163175632e-06, + "loss": 2.4211, + "mean_token_accuracy": 0.4749067425727844, + "num_tokens": 187028206.0, + "step": 1291 + }, + { + "entropy": 2.5201416015625, + "epoch": 0.02224460456427606, + "grad_norm": 0.6075615882873535, + "learning_rate": 9.999358763719198e-06, + "loss": 2.4732, + "mean_token_accuracy": 0.4724921630695462, + "num_tokens": 187178625.0, + "step": 1292 + }, + { + "entropy": 2.46142578125, + "epoch": 0.02226182175047132, + "grad_norm": 0.5848605036735535, + "learning_rate": 9.999354349119632e-06, + "loss": 2.4225, + "mean_token_accuracy": 0.47803614335134625, + "num_tokens": 187317792.0, + "step": 1293 + }, + { + "entropy": 2.443359375, + "epoch": 0.022279038936666582, + "grad_norm": 0.5687076449394226, + "learning_rate": 9.999349919376949e-06, + "loss": 2.3935, + "mean_token_accuracy": 0.48158882977440953, + "num_tokens": 187468854.0, + "step": 1294 + }, + { + "entropy": 2.4393310546875, + "epoch": 0.02229625612286184, + "grad_norm": 0.5710064768791199, + "learning_rate": 9.999345474491161e-06, + "loss": 2.3597, + "mean_token_accuracy": 0.4866549982689321, + "num_tokens": 187617503.0, + "step": 1295 + }, + { + "entropy": 2.4757080078125, + "epoch": 0.0223134733090571, + "grad_norm": 0.6155255436897278, + "learning_rate": 9.999341014462283e-06, + "loss": 2.4421, + "mean_token_accuracy": 0.47693472215905786, + "num_tokens": 187745930.0, + "step": 1296 + }, + { + "entropy": 2.4345703125, + "epoch": 0.02233069049525236, + "grad_norm": 0.5629665851593018, + "learning_rate": 9.999336539290325e-06, + "loss": 2.4156, + "mean_token_accuracy": 0.4816053519025445, + "num_tokens": 187892581.0, + "step": 1297 + }, + { + "entropy": 2.4129638671875, + "epoch": 0.02234790768144762, + "grad_norm": 0.5968115329742432, + "learning_rate": 9.999332048975305e-06, + "loss": 2.3635, + "mean_token_accuracy": 0.48733530612662435, + "num_tokens": 188044827.0, + "step": 1298 + }, + { + "entropy": 2.4227294921875, + "epoch": 0.02236512486764288, + "grad_norm": 0.5534366965293884, + "learning_rate": 9.999327543517234e-06, + "loss": 2.4033, + "mean_token_accuracy": 0.4784287307411432, + "num_tokens": 188208828.0, + "step": 1299 + }, + { + "entropy": 2.42578125, + "epoch": 0.022382342053838142, + "grad_norm": 0.5828860402107239, + "learning_rate": 9.999323022916128e-06, + "loss": 2.3879, + "mean_token_accuracy": 0.4846663847565651, + "num_tokens": 188357779.0, + "step": 1300 + }, + { + "entropy": 2.4833984375, + "epoch": 0.022399559240033402, + "grad_norm": 0.6039474606513977, + "learning_rate": 9.999318487171998e-06, + "loss": 2.4643, + "mean_token_accuracy": 0.47219090024009347, + "num_tokens": 188488042.0, + "step": 1301 + }, + { + "entropy": 2.443359375, + "epoch": 0.022416776426228663, + "grad_norm": 0.6493925452232361, + "learning_rate": 9.999313936284858e-06, + "loss": 2.4156, + "mean_token_accuracy": 0.4812625157646835, + "num_tokens": 188631539.0, + "step": 1302 + }, + { + "entropy": 2.439453125, + "epoch": 0.02243399361242392, + "grad_norm": 0.590706467628479, + "learning_rate": 9.999309370254722e-06, + "loss": 2.4059, + "mean_token_accuracy": 0.482134644407779, + "num_tokens": 188778873.0, + "step": 1303 + }, + { + "entropy": 2.376708984375, + "epoch": 0.02245121079861918, + "grad_norm": 0.7171503305435181, + "learning_rate": 9.999304789081604e-06, + "loss": 2.3167, + "mean_token_accuracy": 0.49248863477259874, + "num_tokens": 188929372.0, + "step": 1304 + }, + { + "entropy": 2.47119140625, + "epoch": 0.02246842798481444, + "grad_norm": 0.603790283203125, + "learning_rate": 9.999300192765521e-06, + "loss": 2.4484, + "mean_token_accuracy": 0.4733037226833403, + "num_tokens": 189068047.0, + "step": 1305 + }, + { + "entropy": 2.41162109375, + "epoch": 0.0224856451710097, + "grad_norm": 0.5595695376396179, + "learning_rate": 9.999295581306483e-06, + "loss": 2.3642, + "mean_token_accuracy": 0.4862880017608404, + "num_tokens": 189213830.0, + "step": 1306 + }, + { + "entropy": 2.4998779296875, + "epoch": 0.022502862357204962, + "grad_norm": 0.616247296333313, + "learning_rate": 9.999290954704505e-06, + "loss": 2.4518, + "mean_token_accuracy": 0.48123760567978024, + "num_tokens": 189351112.0, + "step": 1307 + }, + { + "entropy": 2.48583984375, + "epoch": 0.022520079543400223, + "grad_norm": 0.5927817821502686, + "learning_rate": 9.999286312959602e-06, + "loss": 2.4738, + "mean_token_accuracy": 0.476222672034055, + "num_tokens": 189488488.0, + "step": 1308 + }, + { + "entropy": 2.479736328125, + "epoch": 0.022537296729595483, + "grad_norm": 0.5709447860717773, + "learning_rate": 9.999281656071784e-06, + "loss": 2.4563, + "mean_token_accuracy": 0.47250251518562436, + "num_tokens": 189630709.0, + "step": 1309 + }, + { + "entropy": 2.4727783203125, + "epoch": 0.022554513915790744, + "grad_norm": 0.5687850713729858, + "learning_rate": 9.99927698404107e-06, + "loss": 2.4368, + "mean_token_accuracy": 0.4784710453823209, + "num_tokens": 189772213.0, + "step": 1310 + }, + { + "entropy": 2.3721923828125, + "epoch": 0.022571731101986004, + "grad_norm": 0.6119619607925415, + "learning_rate": 9.999272296867475e-06, + "loss": 2.2997, + "mean_token_accuracy": 0.49369402416050434, + "num_tokens": 189913660.0, + "step": 1311 + }, + { + "entropy": 2.4296875, + "epoch": 0.02258894828818126, + "grad_norm": 0.61922687292099, + "learning_rate": 9.999267594551007e-06, + "loss": 2.4161, + "mean_token_accuracy": 0.4793846160173416, + "num_tokens": 190047956.0, + "step": 1312 + }, + { + "entropy": 2.5008544921875, + "epoch": 0.02260616547437652, + "grad_norm": 0.8448692560195923, + "learning_rate": 9.999262877091687e-06, + "loss": 2.4671, + "mean_token_accuracy": 0.4724547420628369, + "num_tokens": 190193511.0, + "step": 1313 + }, + { + "entropy": 2.43505859375, + "epoch": 0.022623382660571782, + "grad_norm": 0.592045247554779, + "learning_rate": 9.999258144489524e-06, + "loss": 2.3928, + "mean_token_accuracy": 0.47932835714891553, + "num_tokens": 190339476.0, + "step": 1314 + }, + { + "entropy": 2.49169921875, + "epoch": 0.022640599846767043, + "grad_norm": 0.6020418405532837, + "learning_rate": 9.999253396744534e-06, + "loss": 2.4387, + "mean_token_accuracy": 0.4741725055500865, + "num_tokens": 190473944.0, + "step": 1315 + }, + { + "entropy": 2.437744140625, + "epoch": 0.022657817032962303, + "grad_norm": 0.5783674120903015, + "learning_rate": 9.999248633856735e-06, + "loss": 2.3754, + "mean_token_accuracy": 0.48019342171028256, + "num_tokens": 190621619.0, + "step": 1316 + }, + { + "entropy": 2.5028076171875, + "epoch": 0.022675034219157564, + "grad_norm": 0.6043087840080261, + "learning_rate": 9.999243855826137e-06, + "loss": 2.4531, + "mean_token_accuracy": 0.4730870760977268, + "num_tokens": 190753516.0, + "step": 1317 + }, + { + "entropy": 2.45654296875, + "epoch": 0.022692251405352824, + "grad_norm": 0.646530270576477, + "learning_rate": 9.999239062652754e-06, + "loss": 2.3895, + "mean_token_accuracy": 0.48305864399299026, + "num_tokens": 190898210.0, + "step": 1318 + }, + { + "entropy": 2.519775390625, + "epoch": 0.022709468591548085, + "grad_norm": 0.5441105961799622, + "learning_rate": 9.999234254336603e-06, + "loss": 2.5006, + "mean_token_accuracy": 0.465965838637203, + "num_tokens": 191072262.0, + "step": 1319 + }, + { + "entropy": 2.4124755859375, + "epoch": 0.022726685777743342, + "grad_norm": 0.5883780121803284, + "learning_rate": 9.9992294308777e-06, + "loss": 2.3936, + "mean_token_accuracy": 0.4880047831684351, + "num_tokens": 191216422.0, + "step": 1320 + }, + { + "entropy": 2.4327392578125, + "epoch": 0.022743902963938602, + "grad_norm": 0.5934004783630371, + "learning_rate": 9.999224592276055e-06, + "loss": 2.3924, + "mean_token_accuracy": 0.48111487692222, + "num_tokens": 191352211.0, + "step": 1321 + }, + { + "entropy": 2.497314453125, + "epoch": 0.022761120150133863, + "grad_norm": 0.5587040781974792, + "learning_rate": 9.999219738531687e-06, + "loss": 2.4294, + "mean_token_accuracy": 0.4762568627484143, + "num_tokens": 191496965.0, + "step": 1322 + }, + { + "entropy": 2.4332275390625, + "epoch": 0.022778337336329123, + "grad_norm": 0.5939592719078064, + "learning_rate": 9.999214869644608e-06, + "loss": 2.3724, + "mean_token_accuracy": 0.4876466835848987, + "num_tokens": 191642816.0, + "step": 1323 + }, + { + "entropy": 2.3785400390625, + "epoch": 0.022795554522524384, + "grad_norm": 0.5794064402580261, + "learning_rate": 9.999209985614832e-06, + "loss": 2.4016, + "mean_token_accuracy": 0.48678439343348145, + "num_tokens": 191792676.0, + "step": 1324 + }, + { + "entropy": 2.4501953125, + "epoch": 0.022812771708719645, + "grad_norm": 0.5635095238685608, + "learning_rate": 9.999205086442378e-06, + "loss": 2.4489, + "mean_token_accuracy": 0.47591904271394014, + "num_tokens": 191943634.0, + "step": 1325 + }, + { + "entropy": 2.454833984375, + "epoch": 0.022829988894914905, + "grad_norm": 0.5465742349624634, + "learning_rate": 9.999200172127257e-06, + "loss": 2.4428, + "mean_token_accuracy": 0.4766951338388026, + "num_tokens": 192097281.0, + "step": 1326 + }, + { + "entropy": 2.490478515625, + "epoch": 0.022847206081110166, + "grad_norm": 0.5686846375465393, + "learning_rate": 9.999195242669487e-06, + "loss": 2.4461, + "mean_token_accuracy": 0.4775790839921683, + "num_tokens": 192238225.0, + "step": 1327 + }, + { + "entropy": 2.546630859375, + "epoch": 0.022864423267305423, + "grad_norm": 0.5859907865524292, + "learning_rate": 9.99919029806908e-06, + "loss": 2.5229, + "mean_token_accuracy": 0.4657089659012854, + "num_tokens": 192380707.0, + "step": 1328 + }, + { + "entropy": 2.48095703125, + "epoch": 0.022881640453500683, + "grad_norm": 0.5355405807495117, + "learning_rate": 9.99918533832605e-06, + "loss": 2.4508, + "mean_token_accuracy": 0.47027599764987826, + "num_tokens": 192539763.0, + "step": 1329 + }, + { + "entropy": 2.449462890625, + "epoch": 0.022898857639695944, + "grad_norm": 0.5785843133926392, + "learning_rate": 9.999180363440416e-06, + "loss": 2.4106, + "mean_token_accuracy": 0.48218734934926033, + "num_tokens": 192687897.0, + "step": 1330 + }, + { + "entropy": 2.4329833984375, + "epoch": 0.022916074825891204, + "grad_norm": 0.5569692850112915, + "learning_rate": 9.99917537341219e-06, + "loss": 2.4042, + "mean_token_accuracy": 0.4766626372002065, + "num_tokens": 192834922.0, + "step": 1331 + }, + { + "entropy": 2.3797607421875, + "epoch": 0.022933292012086465, + "grad_norm": 0.5599532723426819, + "learning_rate": 9.999170368241389e-06, + "loss": 2.3352, + "mean_token_accuracy": 0.4913885644637048, + "num_tokens": 192977065.0, + "step": 1332 + }, + { + "entropy": 2.492431640625, + "epoch": 0.022950509198281725, + "grad_norm": 0.5502541661262512, + "learning_rate": 9.999165347928028e-06, + "loss": 2.4414, + "mean_token_accuracy": 0.4727879990823567, + "num_tokens": 193116265.0, + "step": 1333 + }, + { + "entropy": 2.4232177734375, + "epoch": 0.022967726384476986, + "grad_norm": 0.5547073483467102, + "learning_rate": 9.999160312472121e-06, + "loss": 2.3541, + "mean_token_accuracy": 0.48737787129357457, + "num_tokens": 193276114.0, + "step": 1334 + }, + { + "entropy": 2.4656982421875, + "epoch": 0.022984943570672246, + "grad_norm": 0.5852370858192444, + "learning_rate": 9.999155261873682e-06, + "loss": 2.446, + "mean_token_accuracy": 0.475621965713799, + "num_tokens": 193440455.0, + "step": 1335 + }, + { + "entropy": 2.421630859375, + "epoch": 0.023002160756867507, + "grad_norm": 0.5449553728103638, + "learning_rate": 9.999150196132731e-06, + "loss": 2.3845, + "mean_token_accuracy": 0.4841674976050854, + "num_tokens": 193607915.0, + "step": 1336 + }, + { + "entropy": 2.4388427734375, + "epoch": 0.023019377943062764, + "grad_norm": 0.5830256342887878, + "learning_rate": 9.999145115249278e-06, + "loss": 2.4171, + "mean_token_accuracy": 0.4808310712687671, + "num_tokens": 193747035.0, + "step": 1337 + }, + { + "entropy": 2.4093017578125, + "epoch": 0.023036595129258024, + "grad_norm": 0.5849303603172302, + "learning_rate": 9.999140019223343e-06, + "loss": 2.3965, + "mean_token_accuracy": 0.48324496299028397, + "num_tokens": 193882217.0, + "step": 1338 + }, + { + "entropy": 2.457275390625, + "epoch": 0.023053812315453285, + "grad_norm": 0.5599417686462402, + "learning_rate": 9.999134908054939e-06, + "loss": 2.4355, + "mean_token_accuracy": 0.47678020503371954, + "num_tokens": 194037678.0, + "step": 1339 + }, + { + "entropy": 2.40771484375, + "epoch": 0.023071029501648545, + "grad_norm": 0.635410726070404, + "learning_rate": 9.999129781744081e-06, + "loss": 2.3913, + "mean_token_accuracy": 0.48584868013858795, + "num_tokens": 194181984.0, + "step": 1340 + }, + { + "entropy": 2.4732666015625, + "epoch": 0.023088246687843806, + "grad_norm": 0.6021797060966492, + "learning_rate": 9.999124640290787e-06, + "loss": 2.4765, + "mean_token_accuracy": 0.47223041532561183, + "num_tokens": 194330357.0, + "step": 1341 + }, + { + "entropy": 2.4163818359375, + "epoch": 0.023105463874039067, + "grad_norm": 0.6050888895988464, + "learning_rate": 9.99911948369507e-06, + "loss": 2.4004, + "mean_token_accuracy": 0.4854390062391758, + "num_tokens": 194466254.0, + "step": 1342 + }, + { + "entropy": 2.4708251953125, + "epoch": 0.023122681060234327, + "grad_norm": 0.5668588876724243, + "learning_rate": 9.999114311956946e-06, + "loss": 2.4037, + "mean_token_accuracy": 0.48454291047528386, + "num_tokens": 194616186.0, + "step": 1343 + }, + { + "entropy": 2.50146484375, + "epoch": 0.023139898246429588, + "grad_norm": 0.5625669956207275, + "learning_rate": 9.99910912507643e-06, + "loss": 2.5026, + "mean_token_accuracy": 0.4760167011991143, + "num_tokens": 194772091.0, + "step": 1344 + }, + { + "entropy": 2.4669189453125, + "epoch": 0.023157115432624845, + "grad_norm": 0.6059160828590393, + "learning_rate": 9.999103923053541e-06, + "loss": 2.4043, + "mean_token_accuracy": 0.4812000300735235, + "num_tokens": 194918670.0, + "step": 1345 + }, + { + "entropy": 2.41552734375, + "epoch": 0.023174332618820105, + "grad_norm": 0.5639379620552063, + "learning_rate": 9.999098705888293e-06, + "loss": 2.3719, + "mean_token_accuracy": 0.4815536108799279, + "num_tokens": 195073659.0, + "step": 1346 + }, + { + "entropy": 2.4085693359375, + "epoch": 0.023191549805015366, + "grad_norm": 0.5673343539237976, + "learning_rate": 9.9990934735807e-06, + "loss": 2.3906, + "mean_token_accuracy": 0.4839661535806954, + "num_tokens": 195215626.0, + "step": 1347 + }, + { + "entropy": 2.357421875, + "epoch": 0.023208766991210626, + "grad_norm": 0.54830002784729, + "learning_rate": 9.99908822613078e-06, + "loss": 2.3375, + "mean_token_accuracy": 0.49586250027641654, + "num_tokens": 195371848.0, + "step": 1348 + }, + { + "entropy": 2.517578125, + "epoch": 0.023225984177405887, + "grad_norm": 0.5844534039497375, + "learning_rate": 9.999082963538548e-06, + "loss": 2.4989, + "mean_token_accuracy": 0.46372106671333313, + "num_tokens": 195524114.0, + "step": 1349 + }, + { + "entropy": 2.4599609375, + "epoch": 0.023243201363601147, + "grad_norm": 0.5597494840621948, + "learning_rate": 9.99907768580402e-06, + "loss": 2.4202, + "mean_token_accuracy": 0.4795773741789162, + "num_tokens": 195668084.0, + "step": 1350 + }, + { + "entropy": 2.4263916015625, + "epoch": 0.023260418549796408, + "grad_norm": 0.5759093165397644, + "learning_rate": 9.999072392927213e-06, + "loss": 2.3863, + "mean_token_accuracy": 0.48264182545244694, + "num_tokens": 195819421.0, + "step": 1351 + }, + { + "entropy": 2.4315185546875, + "epoch": 0.02327763573599167, + "grad_norm": 0.5471842288970947, + "learning_rate": 9.999067084908141e-06, + "loss": 2.3874, + "mean_token_accuracy": 0.48090588115155697, + "num_tokens": 195965275.0, + "step": 1352 + }, + { + "entropy": 2.430908203125, + "epoch": 0.023294852922186925, + "grad_norm": 0.5952867865562439, + "learning_rate": 9.999061761746822e-06, + "loss": 2.3695, + "mean_token_accuracy": 0.48542801523581147, + "num_tokens": 196100273.0, + "step": 1353 + }, + { + "entropy": 2.4898681640625, + "epoch": 0.023312070108382186, + "grad_norm": 0.5692234635353088, + "learning_rate": 9.999056423443272e-06, + "loss": 2.4315, + "mean_token_accuracy": 0.47685773856937885, + "num_tokens": 196241976.0, + "step": 1354 + }, + { + "entropy": 2.411376953125, + "epoch": 0.023329287294577446, + "grad_norm": 0.5885135531425476, + "learning_rate": 9.999051069997505e-06, + "loss": 2.3328, + "mean_token_accuracy": 0.48680604388937354, + "num_tokens": 196397855.0, + "step": 1355 + }, + { + "entropy": 2.439208984375, + "epoch": 0.023346504480772707, + "grad_norm": 0.5604063272476196, + "learning_rate": 9.999045701409539e-06, + "loss": 2.3956, + "mean_token_accuracy": 0.48355547012761235, + "num_tokens": 196557337.0, + "step": 1356 + }, + { + "entropy": 2.4569091796875, + "epoch": 0.023363721666967967, + "grad_norm": 0.5709223747253418, + "learning_rate": 9.99904031767939e-06, + "loss": 2.4283, + "mean_token_accuracy": 0.47822416480630636, + "num_tokens": 196698366.0, + "step": 1357 + }, + { + "entropy": 2.3804931640625, + "epoch": 0.023380938853163228, + "grad_norm": 0.5742472410202026, + "learning_rate": 9.999034918807075e-06, + "loss": 2.3166, + "mean_token_accuracy": 0.49618813674896955, + "num_tokens": 196847854.0, + "step": 1358 + }, + { + "entropy": 2.4959716796875, + "epoch": 0.02339815603935849, + "grad_norm": 0.5980573892593384, + "learning_rate": 9.99902950479261e-06, + "loss": 2.4141, + "mean_token_accuracy": 0.4758835444226861, + "num_tokens": 196971667.0, + "step": 1359 + }, + { + "entropy": 2.532470703125, + "epoch": 0.02341537322555375, + "grad_norm": 0.5753028988838196, + "learning_rate": 9.99902407563601e-06, + "loss": 2.5277, + "mean_token_accuracy": 0.473021827172488, + "num_tokens": 197117509.0, + "step": 1360 + }, + { + "entropy": 2.3953857421875, + "epoch": 0.02343259041174901, + "grad_norm": 0.6161779761314392, + "learning_rate": 9.999018631337294e-06, + "loss": 2.3512, + "mean_token_accuracy": 0.4856503955088556, + "num_tokens": 197251978.0, + "step": 1361 + }, + { + "entropy": 2.5050048828125, + "epoch": 0.023449807597944267, + "grad_norm": 0.5542529225349426, + "learning_rate": 9.999013171896476e-06, + "loss": 2.5021, + "mean_token_accuracy": 0.46956229070201516, + "num_tokens": 197391278.0, + "step": 1362 + }, + { + "entropy": 2.420654296875, + "epoch": 0.023467024784139527, + "grad_norm": 0.5800600051879883, + "learning_rate": 9.999007697313573e-06, + "loss": 2.3979, + "mean_token_accuracy": 0.48410065239295363, + "num_tokens": 197531015.0, + "step": 1363 + }, + { + "entropy": 2.4874267578125, + "epoch": 0.023484241970334788, + "grad_norm": 0.6007982492446899, + "learning_rate": 9.999002207588603e-06, + "loss": 2.423, + "mean_token_accuracy": 0.4762545581907034, + "num_tokens": 197661883.0, + "step": 1364 + }, + { + "entropy": 2.4798583984375, + "epoch": 0.023501459156530048, + "grad_norm": 0.6128295660018921, + "learning_rate": 9.998996702721582e-06, + "loss": 2.4135, + "mean_token_accuracy": 0.4799572662450373, + "num_tokens": 197799526.0, + "step": 1365 + }, + { + "entropy": 2.499267578125, + "epoch": 0.02351867634272531, + "grad_norm": 0.574564516544342, + "learning_rate": 9.998991182712526e-06, + "loss": 2.4331, + "mean_token_accuracy": 0.4757503070868552, + "num_tokens": 197966743.0, + "step": 1366 + }, + { + "entropy": 2.4637451171875, + "epoch": 0.02353589352892057, + "grad_norm": 0.5752230882644653, + "learning_rate": 9.998985647561453e-06, + "loss": 2.438, + "mean_token_accuracy": 0.4770435835234821, + "num_tokens": 198124743.0, + "step": 1367 + }, + { + "entropy": 2.42333984375, + "epoch": 0.02355311071511583, + "grad_norm": 0.5870989561080933, + "learning_rate": 9.998980097268375e-06, + "loss": 2.3783, + "mean_token_accuracy": 0.487036875449121, + "num_tokens": 198265217.0, + "step": 1368 + }, + { + "entropy": 2.399169921875, + "epoch": 0.02357032790131109, + "grad_norm": 0.5645302534103394, + "learning_rate": 9.998974531833316e-06, + "loss": 2.3897, + "mean_token_accuracy": 0.487074441742152, + "num_tokens": 198402224.0, + "step": 1369 + }, + { + "entropy": 2.40234375, + "epoch": 0.023587545087506347, + "grad_norm": 0.6568628549575806, + "learning_rate": 9.99896895125629e-06, + "loss": 2.3445, + "mean_token_accuracy": 0.4943762863986194, + "num_tokens": 198536773.0, + "step": 1370 + }, + { + "entropy": 2.462890625, + "epoch": 0.023604762273701608, + "grad_norm": 0.5660800337791443, + "learning_rate": 9.998963355537313e-06, + "loss": 2.4593, + "mean_token_accuracy": 0.4756892016157508, + "num_tokens": 198690958.0, + "step": 1371 + }, + { + "entropy": 2.48193359375, + "epoch": 0.02362197945989687, + "grad_norm": 0.6144444346427917, + "learning_rate": 9.998957744676403e-06, + "loss": 2.4622, + "mean_token_accuracy": 0.4751414889469743, + "num_tokens": 198845530.0, + "step": 1372 + }, + { + "entropy": 2.463623046875, + "epoch": 0.02363919664609213, + "grad_norm": 0.6004127264022827, + "learning_rate": 9.998952118673575e-06, + "loss": 2.4163, + "mean_token_accuracy": 0.4740082244388759, + "num_tokens": 198973843.0, + "step": 1373 + }, + { + "entropy": 2.4368896484375, + "epoch": 0.02365641383228739, + "grad_norm": 0.5763023495674133, + "learning_rate": 9.998946477528848e-06, + "loss": 2.4106, + "mean_token_accuracy": 0.4760644896887243, + "num_tokens": 199120869.0, + "step": 1374 + }, + { + "entropy": 2.455078125, + "epoch": 0.02367363101848265, + "grad_norm": 0.5937883257865906, + "learning_rate": 9.99894082124224e-06, + "loss": 2.4061, + "mean_token_accuracy": 0.47505834326148033, + "num_tokens": 199271029.0, + "step": 1375 + }, + { + "entropy": 2.4976806640625, + "epoch": 0.02369084820467791, + "grad_norm": 0.5722872614860535, + "learning_rate": 9.998935149813766e-06, + "loss": 2.4279, + "mean_token_accuracy": 0.4803337692283094, + "num_tokens": 199421781.0, + "step": 1376 + }, + { + "entropy": 2.4547119140625, + "epoch": 0.02370806539087317, + "grad_norm": 0.5956291556358337, + "learning_rate": 9.998929463243443e-06, + "loss": 2.4151, + "mean_token_accuracy": 0.478096229955554, + "num_tokens": 199575267.0, + "step": 1377 + }, + { + "entropy": 2.51513671875, + "epoch": 0.023725282577068428, + "grad_norm": 0.5729405879974365, + "learning_rate": 9.99892376153129e-06, + "loss": 2.4828, + "mean_token_accuracy": 0.4753085137344897, + "num_tokens": 199718432.0, + "step": 1378 + }, + { + "entropy": 2.4134521484375, + "epoch": 0.02374249976326369, + "grad_norm": 0.5974603295326233, + "learning_rate": 9.998918044677321e-06, + "loss": 2.3474, + "mean_token_accuracy": 0.4921872243285179, + "num_tokens": 199860520.0, + "step": 1379 + }, + { + "entropy": 2.4600830078125, + "epoch": 0.02375971694945895, + "grad_norm": 0.5969933271408081, + "learning_rate": 9.99891231268156e-06, + "loss": 2.4926, + "mean_token_accuracy": 0.4793005189858377, + "num_tokens": 200000679.0, + "step": 1380 + }, + { + "entropy": 2.45556640625, + "epoch": 0.02377693413565421, + "grad_norm": 0.5951429605484009, + "learning_rate": 9.998906565544017e-06, + "loss": 2.4085, + "mean_token_accuracy": 0.4794029123149812, + "num_tokens": 200145478.0, + "step": 1381 + }, + { + "entropy": 2.4609375, + "epoch": 0.02379415132184947, + "grad_norm": 0.6001779437065125, + "learning_rate": 9.998900803264714e-06, + "loss": 2.4392, + "mean_token_accuracy": 0.4742898163385689, + "num_tokens": 200297504.0, + "step": 1382 + }, + { + "entropy": 2.4384765625, + "epoch": 0.02381136850804473, + "grad_norm": 0.5844825506210327, + "learning_rate": 9.998895025843664e-06, + "loss": 2.3799, + "mean_token_accuracy": 0.4815810131840408, + "num_tokens": 200438128.0, + "step": 1383 + }, + { + "entropy": 2.439453125, + "epoch": 0.02382858569423999, + "grad_norm": 0.5872356295585632, + "learning_rate": 9.99888923328089e-06, + "loss": 2.375, + "mean_token_accuracy": 0.48686960944905877, + "num_tokens": 200576616.0, + "step": 1384 + }, + { + "entropy": 2.4556884765625, + "epoch": 0.023845802880435252, + "grad_norm": 0.5676760077476501, + "learning_rate": 9.998883425576407e-06, + "loss": 2.4051, + "mean_token_accuracy": 0.47929678950458765, + "num_tokens": 200715807.0, + "step": 1385 + }, + { + "entropy": 2.5262451171875, + "epoch": 0.023863020066630512, + "grad_norm": 0.6212140321731567, + "learning_rate": 9.998877602730231e-06, + "loss": 2.5013, + "mean_token_accuracy": 0.46635109139606357, + "num_tokens": 200852036.0, + "step": 1386 + }, + { + "entropy": 2.466796875, + "epoch": 0.02388023725282577, + "grad_norm": 0.5637140870094299, + "learning_rate": 9.998871764742383e-06, + "loss": 2.4031, + "mean_token_accuracy": 0.4789075516164303, + "num_tokens": 201005894.0, + "step": 1387 + }, + { + "entropy": 2.432373046875, + "epoch": 0.02389745443902103, + "grad_norm": 0.554351806640625, + "learning_rate": 9.998865911612878e-06, + "loss": 2.334, + "mean_token_accuracy": 0.48602917743846774, + "num_tokens": 201163254.0, + "step": 1388 + }, + { + "entropy": 2.4622802734375, + "epoch": 0.02391467162521629, + "grad_norm": 0.5873886346817017, + "learning_rate": 9.998860043341733e-06, + "loss": 2.4122, + "mean_token_accuracy": 0.4796907198615372, + "num_tokens": 201311034.0, + "step": 1389 + }, + { + "entropy": 2.5067138671875, + "epoch": 0.02393188881141155, + "grad_norm": 0.589946985244751, + "learning_rate": 9.99885415992897e-06, + "loss": 2.4737, + "mean_token_accuracy": 0.46914456551894546, + "num_tokens": 201439256.0, + "step": 1390 + }, + { + "entropy": 2.5040283203125, + "epoch": 0.02394910599760681, + "grad_norm": 0.5907305479049683, + "learning_rate": 9.998848261374602e-06, + "loss": 2.5056, + "mean_token_accuracy": 0.4672775506041944, + "num_tokens": 201575176.0, + "step": 1391 + }, + { + "entropy": 2.432373046875, + "epoch": 0.023966323183802072, + "grad_norm": 0.5816705822944641, + "learning_rate": 9.998842347678652e-06, + "loss": 2.4009, + "mean_token_accuracy": 0.48051235545426607, + "num_tokens": 201741972.0, + "step": 1392 + }, + { + "entropy": 2.532470703125, + "epoch": 0.023983540369997332, + "grad_norm": 0.5787369012832642, + "learning_rate": 9.998836418841133e-06, + "loss": 2.5186, + "mean_token_accuracy": 0.46860673651099205, + "num_tokens": 201887196.0, + "step": 1393 + }, + { + "entropy": 2.3990478515625, + "epoch": 0.024000757556192593, + "grad_norm": 0.6086745262145996, + "learning_rate": 9.998830474862064e-06, + "loss": 2.3506, + "mean_token_accuracy": 0.4886458469554782, + "num_tokens": 202021347.0, + "step": 1394 + }, + { + "entropy": 2.41748046875, + "epoch": 0.02401797474238785, + "grad_norm": 0.5796844363212585, + "learning_rate": 9.998824515741467e-06, + "loss": 2.3866, + "mean_token_accuracy": 0.4884433811530471, + "num_tokens": 202160896.0, + "step": 1395 + }, + { + "entropy": 2.5753173828125, + "epoch": 0.02403519192858311, + "grad_norm": 0.5776175856590271, + "learning_rate": 9.998818541479355e-06, + "loss": 2.5233, + "mean_token_accuracy": 0.4616355444304645, + "num_tokens": 202292953.0, + "step": 1396 + }, + { + "entropy": 2.39892578125, + "epoch": 0.02405240911477837, + "grad_norm": 0.5363594889640808, + "learning_rate": 9.998812552075747e-06, + "loss": 2.3689, + "mean_token_accuracy": 0.4840633156709373, + "num_tokens": 202460982.0, + "step": 1397 + }, + { + "entropy": 2.39208984375, + "epoch": 0.02406962630097363, + "grad_norm": 0.5426232218742371, + "learning_rate": 9.998806547530664e-06, + "loss": 2.3481, + "mean_token_accuracy": 0.48746701143682003, + "num_tokens": 202621458.0, + "step": 1398 + }, + { + "entropy": 2.4622802734375, + "epoch": 0.024086843487168892, + "grad_norm": 0.5721414089202881, + "learning_rate": 9.998800527844122e-06, + "loss": 2.4094, + "mean_token_accuracy": 0.4796240641735494, + "num_tokens": 202756740.0, + "step": 1399 + }, + { + "entropy": 2.363037109375, + "epoch": 0.024104060673364153, + "grad_norm": 0.5773559212684631, + "learning_rate": 9.99879449301614e-06, + "loss": 2.3559, + "mean_token_accuracy": 0.49749248987063766, + "num_tokens": 202900096.0, + "step": 1400 + }, + { + "entropy": 2.53125, + "epoch": 0.024121277859559413, + "grad_norm": 0.611416220664978, + "learning_rate": 9.998788443046735e-06, + "loss": 2.5107, + "mean_token_accuracy": 0.4666637545451522, + "num_tokens": 203038362.0, + "step": 1401 + }, + { + "entropy": 2.4573974609375, + "epoch": 0.024138495045754674, + "grad_norm": 0.6409509778022766, + "learning_rate": 9.998782377935927e-06, + "loss": 2.423, + "mean_token_accuracy": 0.4764982839114964, + "num_tokens": 203176454.0, + "step": 1402 + }, + { + "entropy": 2.419921875, + "epoch": 0.02415571223194993, + "grad_norm": 0.5751169323921204, + "learning_rate": 9.998776297683733e-06, + "loss": 2.3627, + "mean_token_accuracy": 0.48058157647028565, + "num_tokens": 203311141.0, + "step": 1403 + }, + { + "entropy": 2.4736328125, + "epoch": 0.02417292941814519, + "grad_norm": 0.580534040927887, + "learning_rate": 9.998770202290173e-06, + "loss": 2.449, + "mean_token_accuracy": 0.4753856221213937, + "num_tokens": 203452951.0, + "step": 1404 + }, + { + "entropy": 2.426513671875, + "epoch": 0.024190146604340452, + "grad_norm": 0.5762086510658264, + "learning_rate": 9.998764091755264e-06, + "loss": 2.3869, + "mean_token_accuracy": 0.4834116967394948, + "num_tokens": 203600387.0, + "step": 1405 + }, + { + "entropy": 2.509521484375, + "epoch": 0.024207363790535712, + "grad_norm": 0.5431259870529175, + "learning_rate": 9.998757966079024e-06, + "loss": 2.486, + "mean_token_accuracy": 0.4661334496922791, + "num_tokens": 203751350.0, + "step": 1406 + }, + { + "entropy": 2.499267578125, + "epoch": 0.024224580976730973, + "grad_norm": 0.6089497804641724, + "learning_rate": 9.998751825261474e-06, + "loss": 2.4389, + "mean_token_accuracy": 0.4734781333245337, + "num_tokens": 203902498.0, + "step": 1407 + }, + { + "entropy": 2.48583984375, + "epoch": 0.024241798162926233, + "grad_norm": 0.5681830048561096, + "learning_rate": 9.998745669302632e-06, + "loss": 2.4445, + "mean_token_accuracy": 0.4708978096023202, + "num_tokens": 204044975.0, + "step": 1408 + }, + { + "entropy": 2.4814453125, + "epoch": 0.024259015349121494, + "grad_norm": 0.5554722547531128, + "learning_rate": 9.998739498202514e-06, + "loss": 2.4522, + "mean_token_accuracy": 0.47262357315048575, + "num_tokens": 204205348.0, + "step": 1409 + }, + { + "entropy": 2.444580078125, + "epoch": 0.024276232535316754, + "grad_norm": 0.5820813775062561, + "learning_rate": 9.99873331196114e-06, + "loss": 2.3785, + "mean_token_accuracy": 0.4880591509863734, + "num_tokens": 204345568.0, + "step": 1410 + }, + { + "entropy": 2.45556640625, + "epoch": 0.024293449721512015, + "grad_norm": 0.5614075660705566, + "learning_rate": 9.99872711057853e-06, + "loss": 2.4034, + "mean_token_accuracy": 0.4788000574335456, + "num_tokens": 204494358.0, + "step": 1411 + }, + { + "entropy": 2.4716796875, + "epoch": 0.024310666907707272, + "grad_norm": 0.5851388573646545, + "learning_rate": 9.998720894054703e-06, + "loss": 2.4773, + "mean_token_accuracy": 0.4750785259529948, + "num_tokens": 204626789.0, + "step": 1412 + }, + { + "entropy": 2.45458984375, + "epoch": 0.024327884093902533, + "grad_norm": 0.7529696822166443, + "learning_rate": 9.998714662389676e-06, + "loss": 2.4149, + "mean_token_accuracy": 0.4763760110363364, + "num_tokens": 204774398.0, + "step": 1413 + }, + { + "entropy": 2.4195556640625, + "epoch": 0.024345101280097793, + "grad_norm": 0.6168255805969238, + "learning_rate": 9.99870841558347e-06, + "loss": 2.3811, + "mean_token_accuracy": 0.4846136327832937, + "num_tokens": 204924934.0, + "step": 1414 + }, + { + "entropy": 2.4605712890625, + "epoch": 0.024362318466293054, + "grad_norm": 0.6109985113143921, + "learning_rate": 9.998702153636102e-06, + "loss": 2.4489, + "mean_token_accuracy": 0.4767269352450967, + "num_tokens": 205059906.0, + "step": 1415 + }, + { + "entropy": 2.436767578125, + "epoch": 0.024379535652488314, + "grad_norm": 0.5300309658050537, + "learning_rate": 9.998695876547591e-06, + "loss": 2.3666, + "mean_token_accuracy": 0.4840262867510319, + "num_tokens": 205220596.0, + "step": 1416 + }, + { + "entropy": 2.429931640625, + "epoch": 0.024396752838683575, + "grad_norm": 0.5640124678611755, + "learning_rate": 9.998689584317955e-06, + "loss": 2.3631, + "mean_token_accuracy": 0.49288097163662314, + "num_tokens": 205386977.0, + "step": 1417 + }, + { + "entropy": 2.46435546875, + "epoch": 0.024413970024878835, + "grad_norm": 0.5903874635696411, + "learning_rate": 9.998683276947219e-06, + "loss": 2.4291, + "mean_token_accuracy": 0.47703268751502037, + "num_tokens": 205526617.0, + "step": 1418 + }, + { + "entropy": 2.4453125, + "epoch": 0.024431187211074096, + "grad_norm": 0.567484438419342, + "learning_rate": 9.998676954435394e-06, + "loss": 2.3968, + "mean_token_accuracy": 0.4830042072571814, + "num_tokens": 205669600.0, + "step": 1419 + }, + { + "entropy": 2.4515380859375, + "epoch": 0.024448404397269353, + "grad_norm": 0.5405850410461426, + "learning_rate": 9.998670616782505e-06, + "loss": 2.3772, + "mean_token_accuracy": 0.4838403551839292, + "num_tokens": 205821251.0, + "step": 1420 + }, + { + "entropy": 2.4334716796875, + "epoch": 0.024465621583464613, + "grad_norm": 0.5751288533210754, + "learning_rate": 9.99866426398857e-06, + "loss": 2.3644, + "mean_token_accuracy": 0.48748740553855896, + "num_tokens": 205964418.0, + "step": 1421 + }, + { + "entropy": 2.474853515625, + "epoch": 0.024482838769659874, + "grad_norm": 0.5722588896751404, + "learning_rate": 9.998657896053604e-06, + "loss": 2.4274, + "mean_token_accuracy": 0.47551520401611924, + "num_tokens": 206095207.0, + "step": 1422 + }, + { + "entropy": 2.470458984375, + "epoch": 0.024500055955855134, + "grad_norm": 0.6179440021514893, + "learning_rate": 9.998651512977631e-06, + "loss": 2.4267, + "mean_token_accuracy": 0.47643746761605144, + "num_tokens": 206240116.0, + "step": 1423 + }, + { + "entropy": 2.4114990234375, + "epoch": 0.024517273142050395, + "grad_norm": 0.608542799949646, + "learning_rate": 9.99864511476067e-06, + "loss": 2.3299, + "mean_token_accuracy": 0.4850447475910187, + "num_tokens": 206389712.0, + "step": 1424 + }, + { + "entropy": 2.45263671875, + "epoch": 0.024534490328245655, + "grad_norm": 0.5260599851608276, + "learning_rate": 9.998638701402739e-06, + "loss": 2.4158, + "mean_token_accuracy": 0.4785124696791172, + "num_tokens": 206548123.0, + "step": 1425 + }, + { + "entropy": 2.39892578125, + "epoch": 0.024551707514440916, + "grad_norm": 0.6076765060424805, + "learning_rate": 9.998632272903858e-06, + "loss": 2.3619, + "mean_token_accuracy": 0.49168099416419864, + "num_tokens": 206687271.0, + "step": 1426 + }, + { + "entropy": 2.499755859375, + "epoch": 0.024568924700636176, + "grad_norm": 0.6022489070892334, + "learning_rate": 9.998625829264045e-06, + "loss": 2.487, + "mean_token_accuracy": 0.46968710515648127, + "num_tokens": 206824438.0, + "step": 1427 + }, + { + "entropy": 2.4072265625, + "epoch": 0.024586141886831434, + "grad_norm": 0.5461668372154236, + "learning_rate": 9.998619370483324e-06, + "loss": 2.3735, + "mean_token_accuracy": 0.49147024331614375, + "num_tokens": 206981519.0, + "step": 1428 + }, + { + "entropy": 2.4703369140625, + "epoch": 0.024603359073026694, + "grad_norm": 0.5643055438995361, + "learning_rate": 9.998612896561709e-06, + "loss": 2.4402, + "mean_token_accuracy": 0.47540956176817417, + "num_tokens": 207129316.0, + "step": 1429 + }, + { + "entropy": 2.430908203125, + "epoch": 0.024620576259221955, + "grad_norm": 0.5822505950927734, + "learning_rate": 9.998606407499222e-06, + "loss": 2.3844, + "mean_token_accuracy": 0.4864038904197514, + "num_tokens": 207277904.0, + "step": 1430 + }, + { + "entropy": 2.41015625, + "epoch": 0.024637793445417215, + "grad_norm": 0.5898188352584839, + "learning_rate": 9.998599903295883e-06, + "loss": 2.3588, + "mean_token_accuracy": 0.4909462621435523, + "num_tokens": 207422167.0, + "step": 1431 + }, + { + "entropy": 2.4686279296875, + "epoch": 0.024655010631612476, + "grad_norm": 0.5768987536430359, + "learning_rate": 9.99859338395171e-06, + "loss": 2.4193, + "mean_token_accuracy": 0.47896814439445734, + "num_tokens": 207560662.0, + "step": 1432 + }, + { + "entropy": 2.5447998046875, + "epoch": 0.024672227817807736, + "grad_norm": 0.582497239112854, + "learning_rate": 9.998586849466727e-06, + "loss": 2.5383, + "mean_token_accuracy": 0.46700617065653205, + "num_tokens": 207702226.0, + "step": 1433 + }, + { + "entropy": 2.4814453125, + "epoch": 0.024689445004002997, + "grad_norm": 0.6393965482711792, + "learning_rate": 9.99858029984095e-06, + "loss": 2.438, + "mean_token_accuracy": 0.47804489312693477, + "num_tokens": 207827846.0, + "step": 1434 + }, + { + "entropy": 2.3828125, + "epoch": 0.024706662190198257, + "grad_norm": 0.618248701095581, + "learning_rate": 9.998573735074398e-06, + "loss": 2.3306, + "mean_token_accuracy": 0.48714121896773577, + "num_tokens": 207996679.0, + "step": 1435 + }, + { + "entropy": 2.452880859375, + "epoch": 0.024723879376393518, + "grad_norm": 0.6095260381698608, + "learning_rate": 9.998567155167094e-06, + "loss": 2.3982, + "mean_token_accuracy": 0.48710576351732016, + "num_tokens": 208129844.0, + "step": 1436 + }, + { + "entropy": 2.4764404296875, + "epoch": 0.024741096562588775, + "grad_norm": 0.5430065393447876, + "learning_rate": 9.998560560119058e-06, + "loss": 2.4219, + "mean_token_accuracy": 0.47808440774679184, + "num_tokens": 208273997.0, + "step": 1437 + }, + { + "entropy": 2.447021484375, + "epoch": 0.024758313748784035, + "grad_norm": 0.6290302276611328, + "learning_rate": 9.998553949930306e-06, + "loss": 2.3994, + "mean_token_accuracy": 0.484294559340924, + "num_tokens": 208429606.0, + "step": 1438 + }, + { + "entropy": 2.431640625, + "epoch": 0.024775530934979296, + "grad_norm": 0.5840831398963928, + "learning_rate": 9.99854732460086e-06, + "loss": 2.3615, + "mean_token_accuracy": 0.4878291576169431, + "num_tokens": 208577096.0, + "step": 1439 + }, + { + "entropy": 2.4404296875, + "epoch": 0.024792748121174556, + "grad_norm": 0.5737127065658569, + "learning_rate": 9.998540684130743e-06, + "loss": 2.4082, + "mean_token_accuracy": 0.48507948173210025, + "num_tokens": 208744507.0, + "step": 1440 + }, + { + "entropy": 2.380126953125, + "epoch": 0.024809965307369817, + "grad_norm": 0.5842684507369995, + "learning_rate": 9.998534028519971e-06, + "loss": 2.3265, + "mean_token_accuracy": 0.49405472399666905, + "num_tokens": 208887356.0, + "step": 1441 + }, + { + "entropy": 2.447265625, + "epoch": 0.024827182493565077, + "grad_norm": 0.5664902329444885, + "learning_rate": 9.998527357768566e-06, + "loss": 2.4049, + "mean_token_accuracy": 0.4856785982847214, + "num_tokens": 209035653.0, + "step": 1442 + }, + { + "entropy": 2.5068359375, + "epoch": 0.024844399679760338, + "grad_norm": 0.5968948602676392, + "learning_rate": 9.99852067187655e-06, + "loss": 2.4616, + "mean_token_accuracy": 0.4748954540118575, + "num_tokens": 209174761.0, + "step": 1443 + }, + { + "entropy": 2.4140625, + "epoch": 0.0248616168659556, + "grad_norm": 0.5464600920677185, + "learning_rate": 9.99851397084394e-06, + "loss": 2.391, + "mean_token_accuracy": 0.4841625513508916, + "num_tokens": 209335128.0, + "step": 1444 + }, + { + "entropy": 2.406494140625, + "epoch": 0.024878834052150856, + "grad_norm": 0.5924455523490906, + "learning_rate": 9.998507254670757e-06, + "loss": 2.3687, + "mean_token_accuracy": 0.4873992637731135, + "num_tokens": 209494921.0, + "step": 1445 + }, + { + "entropy": 2.4461669921875, + "epoch": 0.024896051238346116, + "grad_norm": 0.5587389469146729, + "learning_rate": 9.998500523357022e-06, + "loss": 2.4399, + "mean_token_accuracy": 0.4791180179454386, + "num_tokens": 209636823.0, + "step": 1446 + }, + { + "entropy": 2.4124755859375, + "epoch": 0.024913268424541377, + "grad_norm": 0.5826576948165894, + "learning_rate": 9.998493776902756e-06, + "loss": 2.3668, + "mean_token_accuracy": 0.4858208396472037, + "num_tokens": 209782197.0, + "step": 1447 + }, + { + "entropy": 2.4521484375, + "epoch": 0.024930485610736637, + "grad_norm": 0.6207579374313354, + "learning_rate": 9.998487015307978e-06, + "loss": 2.392, + "mean_token_accuracy": 0.4869683226570487, + "num_tokens": 209916075.0, + "step": 1448 + }, + { + "entropy": 2.4453125, + "epoch": 0.024947702796931898, + "grad_norm": 0.640599250793457, + "learning_rate": 9.998480238572711e-06, + "loss": 2.383, + "mean_token_accuracy": 0.480794875882566, + "num_tokens": 210062565.0, + "step": 1449 + }, + { + "entropy": 2.4508056640625, + "epoch": 0.024964919983127158, + "grad_norm": 0.5667426586151123, + "learning_rate": 9.998473446696972e-06, + "loss": 2.4082, + "mean_token_accuracy": 0.4857399258762598, + "num_tokens": 210214070.0, + "step": 1450 + }, + { + "entropy": 2.4837646484375, + "epoch": 0.02498213716932242, + "grad_norm": 0.5633426904678345, + "learning_rate": 9.998466639680786e-06, + "loss": 2.4637, + "mean_token_accuracy": 0.47439560387283564, + "num_tokens": 210356264.0, + "step": 1451 + }, + { + "entropy": 2.3912353515625, + "epoch": 0.02499935435551768, + "grad_norm": 0.621870219707489, + "learning_rate": 9.998459817524168e-06, + "loss": 2.3623, + "mean_token_accuracy": 0.4901335104368627, + "num_tokens": 210509728.0, + "step": 1452 + }, + { + "entropy": 2.4949951171875, + "epoch": 0.025016571541712936, + "grad_norm": 0.5852912664413452, + "learning_rate": 9.998452980227143e-06, + "loss": 2.4597, + "mean_token_accuracy": 0.4733400009572506, + "num_tokens": 210644493.0, + "step": 1453 + }, + { + "entropy": 2.434326171875, + "epoch": 0.025033788727908197, + "grad_norm": 0.5888333320617676, + "learning_rate": 9.998446127789731e-06, + "loss": 2.4475, + "mean_token_accuracy": 0.4807679671794176, + "num_tokens": 210791477.0, + "step": 1454 + }, + { + "entropy": 2.44189453125, + "epoch": 0.025051005914103457, + "grad_norm": 0.5960559248924255, + "learning_rate": 9.998439260211953e-06, + "loss": 2.4365, + "mean_token_accuracy": 0.4799190401099622, + "num_tokens": 210939578.0, + "step": 1455 + }, + { + "entropy": 2.432373046875, + "epoch": 0.025068223100298718, + "grad_norm": 0.5698879957199097, + "learning_rate": 9.998432377493826e-06, + "loss": 2.3899, + "mean_token_accuracy": 0.4848698596470058, + "num_tokens": 211096780.0, + "step": 1456 + }, + { + "entropy": 2.3997802734375, + "epoch": 0.02508544028649398, + "grad_norm": 0.5995359420776367, + "learning_rate": 9.998425479635373e-06, + "loss": 2.383, + "mean_token_accuracy": 0.4919665399938822, + "num_tokens": 211234392.0, + "step": 1457 + }, + { + "entropy": 2.497802734375, + "epoch": 0.02510265747268924, + "grad_norm": 0.5807369351387024, + "learning_rate": 9.99841856663662e-06, + "loss": 2.4674, + "mean_token_accuracy": 0.47135368920862675, + "num_tokens": 211366377.0, + "step": 1458 + }, + { + "entropy": 2.4554443359375, + "epoch": 0.0251198746588845, + "grad_norm": 0.5541589260101318, + "learning_rate": 9.99841163849758e-06, + "loss": 2.4065, + "mean_token_accuracy": 0.48336717346683145, + "num_tokens": 211517256.0, + "step": 1459 + }, + { + "entropy": 2.473876953125, + "epoch": 0.02513709184507976, + "grad_norm": 0.597100555896759, + "learning_rate": 9.99840469521828e-06, + "loss": 2.4374, + "mean_token_accuracy": 0.4758867691271007, + "num_tokens": 211656203.0, + "step": 1460 + }, + { + "entropy": 2.459228515625, + "epoch": 0.025154309031275017, + "grad_norm": 0.5845798850059509, + "learning_rate": 9.998397736798737e-06, + "loss": 2.4206, + "mean_token_accuracy": 0.4736228142865002, + "num_tokens": 211798384.0, + "step": 1461 + }, + { + "entropy": 2.455078125, + "epoch": 0.025171526217470278, + "grad_norm": 0.6050572395324707, + "learning_rate": 9.998390763238975e-06, + "loss": 2.4037, + "mean_token_accuracy": 0.48261626344174147, + "num_tokens": 211932728.0, + "step": 1462 + }, + { + "entropy": 2.4322509765625, + "epoch": 0.025188743403665538, + "grad_norm": 0.5519930124282837, + "learning_rate": 9.998383774539013e-06, + "loss": 2.3796, + "mean_token_accuracy": 0.48328032717108727, + "num_tokens": 212075313.0, + "step": 1463 + }, + { + "entropy": 2.4688720703125, + "epoch": 0.0252059605898608, + "grad_norm": 0.5802918672561646, + "learning_rate": 9.998376770698875e-06, + "loss": 2.4487, + "mean_token_accuracy": 0.4819605387747288, + "num_tokens": 212220989.0, + "step": 1464 + }, + { + "entropy": 2.432373046875, + "epoch": 0.02522317777605606, + "grad_norm": 0.5660965442657471, + "learning_rate": 9.998369751718577e-06, + "loss": 2.3826, + "mean_token_accuracy": 0.48062465619295835, + "num_tokens": 212377908.0, + "step": 1465 + }, + { + "entropy": 2.5208740234375, + "epoch": 0.02524039496225132, + "grad_norm": 0.6206576228141785, + "learning_rate": 9.998362717598144e-06, + "loss": 2.5238, + "mean_token_accuracy": 0.46971421781927347, + "num_tokens": 212521169.0, + "step": 1466 + }, + { + "entropy": 2.45263671875, + "epoch": 0.02525761214844658, + "grad_norm": 0.5654458403587341, + "learning_rate": 9.998355668337599e-06, + "loss": 2.4136, + "mean_token_accuracy": 0.4812098955735564, + "num_tokens": 212661516.0, + "step": 1467 + }, + { + "entropy": 2.4288330078125, + "epoch": 0.02527482933464184, + "grad_norm": 0.5742049217224121, + "learning_rate": 9.99834860393696e-06, + "loss": 2.3939, + "mean_token_accuracy": 0.4802519157528877, + "num_tokens": 212802767.0, + "step": 1468 + }, + { + "entropy": 2.47021484375, + "epoch": 0.0252920465208371, + "grad_norm": 0.5584784150123596, + "learning_rate": 9.998341524396249e-06, + "loss": 2.4361, + "mean_token_accuracy": 0.47756797447800636, + "num_tokens": 212937639.0, + "step": 1469 + }, + { + "entropy": 2.4468994140625, + "epoch": 0.025309263707032358, + "grad_norm": 0.7078640460968018, + "learning_rate": 9.998334429715488e-06, + "loss": 2.3877, + "mean_token_accuracy": 0.4793696654960513, + "num_tokens": 213079562.0, + "step": 1470 + }, + { + "entropy": 2.39892578125, + "epoch": 0.02532648089322762, + "grad_norm": 0.5251427292823792, + "learning_rate": 9.998327319894699e-06, + "loss": 2.3489, + "mean_token_accuracy": 0.4894181271083653, + "num_tokens": 213245711.0, + "step": 1471 + }, + { + "entropy": 2.3956298828125, + "epoch": 0.02534369807942288, + "grad_norm": 0.5619721412658691, + "learning_rate": 9.998320194933904e-06, + "loss": 2.3521, + "mean_token_accuracy": 0.49081948725506663, + "num_tokens": 213387544.0, + "step": 1472 + }, + { + "entropy": 2.41552734375, + "epoch": 0.02536091526561814, + "grad_norm": 0.5867946147918701, + "learning_rate": 9.998313054833123e-06, + "loss": 2.3628, + "mean_token_accuracy": 0.48890325892716646, + "num_tokens": 213525323.0, + "step": 1473 + }, + { + "entropy": 2.5234375, + "epoch": 0.0253781324518134, + "grad_norm": 0.5271015763282776, + "learning_rate": 9.998305899592378e-06, + "loss": 2.4971, + "mean_token_accuracy": 0.4719184576533735, + "num_tokens": 213676017.0, + "step": 1474 + }, + { + "entropy": 2.4503173828125, + "epoch": 0.02539534963800866, + "grad_norm": 0.5921691060066223, + "learning_rate": 9.99829872921169e-06, + "loss": 2.4328, + "mean_token_accuracy": 0.4846241660416126, + "num_tokens": 213819688.0, + "step": 1475 + }, + { + "entropy": 2.4017333984375, + "epoch": 0.02541256682420392, + "grad_norm": 0.5926265716552734, + "learning_rate": 9.998291543691081e-06, + "loss": 2.3141, + "mean_token_accuracy": 0.4890713025815785, + "num_tokens": 213966213.0, + "step": 1476 + }, + { + "entropy": 2.418212890625, + "epoch": 0.025429784010399182, + "grad_norm": 0.5591694116592407, + "learning_rate": 9.998284343030575e-06, + "loss": 2.3925, + "mean_token_accuracy": 0.4819628051482141, + "num_tokens": 214097650.0, + "step": 1477 + }, + { + "entropy": 2.39453125, + "epoch": 0.02544700119659444, + "grad_norm": 0.5935323238372803, + "learning_rate": 9.99827712723019e-06, + "loss": 2.3999, + "mean_token_accuracy": 0.48563890950754285, + "num_tokens": 214233345.0, + "step": 1478 + }, + { + "entropy": 2.5748291015625, + "epoch": 0.0254642183827897, + "grad_norm": 0.5952027440071106, + "learning_rate": 9.998269896289953e-06, + "loss": 2.531, + "mean_token_accuracy": 0.46888946322724223, + "num_tokens": 214371139.0, + "step": 1479 + }, + { + "entropy": 2.436767578125, + "epoch": 0.02548143556898496, + "grad_norm": 0.5706344842910767, + "learning_rate": 9.998262650209882e-06, + "loss": 2.4313, + "mean_token_accuracy": 0.4844216462224722, + "num_tokens": 214524344.0, + "step": 1480 + }, + { + "entropy": 2.444580078125, + "epoch": 0.02549865275518022, + "grad_norm": 0.6236752271652222, + "learning_rate": 9.99825538899e-06, + "loss": 2.4011, + "mean_token_accuracy": 0.48665000684559345, + "num_tokens": 214685683.0, + "step": 1481 + }, + { + "entropy": 2.4854736328125, + "epoch": 0.02551586994137548, + "grad_norm": 0.56282639503479, + "learning_rate": 9.998248112630328e-06, + "loss": 2.414, + "mean_token_accuracy": 0.47536311158910394, + "num_tokens": 214833325.0, + "step": 1482 + }, + { + "entropy": 2.44873046875, + "epoch": 0.02553308712757074, + "grad_norm": 0.5949015617370605, + "learning_rate": 9.99824082113089e-06, + "loss": 2.4348, + "mean_token_accuracy": 0.4832215076312423, + "num_tokens": 214974294.0, + "step": 1483 + }, + { + "entropy": 2.509765625, + "epoch": 0.025550304313766002, + "grad_norm": 0.6240081191062927, + "learning_rate": 9.998233514491706e-06, + "loss": 2.4456, + "mean_token_accuracy": 0.4775039851665497, + "num_tokens": 215107075.0, + "step": 1484 + }, + { + "entropy": 2.48486328125, + "epoch": 0.025567521499961263, + "grad_norm": 0.5603934526443481, + "learning_rate": 9.9982261927128e-06, + "loss": 2.4404, + "mean_token_accuracy": 0.4763817982748151, + "num_tokens": 215257991.0, + "step": 1485 + }, + { + "entropy": 2.472412109375, + "epoch": 0.02558473868615652, + "grad_norm": 0.5318784117698669, + "learning_rate": 9.998218855794193e-06, + "loss": 2.4512, + "mean_token_accuracy": 0.4735376350581646, + "num_tokens": 215424684.0, + "step": 1486 + }, + { + "entropy": 2.4873046875, + "epoch": 0.02560195587235178, + "grad_norm": 0.5659030079841614, + "learning_rate": 9.998211503735908e-06, + "loss": 2.4298, + "mean_token_accuracy": 0.47843836853280663, + "num_tokens": 215573454.0, + "step": 1487 + }, + { + "entropy": 2.3856201171875, + "epoch": 0.02561917305854704, + "grad_norm": 0.6163880228996277, + "learning_rate": 9.998204136537965e-06, + "loss": 2.3667, + "mean_token_accuracy": 0.49307617312297225, + "num_tokens": 215709724.0, + "step": 1488 + }, + { + "entropy": 2.4710693359375, + "epoch": 0.0256363902447423, + "grad_norm": 0.5732681751251221, + "learning_rate": 9.99819675420039e-06, + "loss": 2.4201, + "mean_token_accuracy": 0.4794590980745852, + "num_tokens": 215844176.0, + "step": 1489 + }, + { + "entropy": 2.4678955078125, + "epoch": 0.025653607430937562, + "grad_norm": 0.8923822045326233, + "learning_rate": 9.998189356723203e-06, + "loss": 2.4845, + "mean_token_accuracy": 0.47123232623562217, + "num_tokens": 215988104.0, + "step": 1490 + }, + { + "entropy": 2.5465087890625, + "epoch": 0.025670824617132822, + "grad_norm": 0.5661553740501404, + "learning_rate": 9.998181944106428e-06, + "loss": 2.5037, + "mean_token_accuracy": 0.4655986800789833, + "num_tokens": 216131658.0, + "step": 1491 + }, + { + "entropy": 2.486572265625, + "epoch": 0.025688041803328083, + "grad_norm": 0.5814360976219177, + "learning_rate": 9.998174516350086e-06, + "loss": 2.4732, + "mean_token_accuracy": 0.4705640790052712, + "num_tokens": 216274055.0, + "step": 1492 + }, + { + "entropy": 2.3916015625, + "epoch": 0.025705258989523343, + "grad_norm": 0.5932287573814392, + "learning_rate": 9.998167073454197e-06, + "loss": 2.3721, + "mean_token_accuracy": 0.49608813878148794, + "num_tokens": 216414995.0, + "step": 1493 + }, + { + "entropy": 2.4781494140625, + "epoch": 0.025722476175718604, + "grad_norm": 0.5934662222862244, + "learning_rate": 9.99815961541879e-06, + "loss": 2.4183, + "mean_token_accuracy": 0.4743193374015391, + "num_tokens": 216559579.0, + "step": 1494 + }, + { + "entropy": 2.4559326171875, + "epoch": 0.02573969336191386, + "grad_norm": 0.5569154620170593, + "learning_rate": 9.998152142243882e-06, + "loss": 2.3922, + "mean_token_accuracy": 0.48210062831640244, + "num_tokens": 216706806.0, + "step": 1495 + }, + { + "entropy": 2.4129638671875, + "epoch": 0.02575691054810912, + "grad_norm": 0.5600568056106567, + "learning_rate": 9.998144653929499e-06, + "loss": 2.371, + "mean_token_accuracy": 0.4889267375692725, + "num_tokens": 216861317.0, + "step": 1496 + }, + { + "entropy": 2.4239501953125, + "epoch": 0.025774127734304382, + "grad_norm": 0.5837308168411255, + "learning_rate": 9.99813715047566e-06, + "loss": 2.3703, + "mean_token_accuracy": 0.48130458453670144, + "num_tokens": 217013439.0, + "step": 1497 + }, + { + "entropy": 2.4178466796875, + "epoch": 0.025791344920499643, + "grad_norm": 0.5723254084587097, + "learning_rate": 9.998129631882392e-06, + "loss": 2.3257, + "mean_token_accuracy": 0.4927904959768057, + "num_tokens": 217161046.0, + "step": 1498 + }, + { + "entropy": 2.4444580078125, + "epoch": 0.025808562106694903, + "grad_norm": 0.6308748126029968, + "learning_rate": 9.998122098149714e-06, + "loss": 2.3964, + "mean_token_accuracy": 0.4866189956665039, + "num_tokens": 217301151.0, + "step": 1499 + }, + { + "entropy": 2.422607421875, + "epoch": 0.025825779292890164, + "grad_norm": 0.5735398530960083, + "learning_rate": 9.998114549277653e-06, + "loss": 2.4083, + "mean_token_accuracy": 0.4805009118281305, + "num_tokens": 217444559.0, + "step": 1500 + }, + { + "entropy": 2.4912109375, + "epoch": 0.025842996479085424, + "grad_norm": 0.5808705687522888, + "learning_rate": 9.998106985266229e-06, + "loss": 2.4692, + "mean_token_accuracy": 0.47182623157277703, + "num_tokens": 217596025.0, + "step": 1501 + }, + { + "entropy": 2.4501953125, + "epoch": 0.025860213665280685, + "grad_norm": 0.5593245625495911, + "learning_rate": 9.998099406115465e-06, + "loss": 2.4285, + "mean_token_accuracy": 0.4782219841144979, + "num_tokens": 217747354.0, + "step": 1502 + }, + { + "entropy": 2.429931640625, + "epoch": 0.02587743085147594, + "grad_norm": 0.5433899760246277, + "learning_rate": 9.998091811825383e-06, + "loss": 2.4056, + "mean_token_accuracy": 0.4807021920569241, + "num_tokens": 217895004.0, + "step": 1503 + }, + { + "entropy": 2.4007568359375, + "epoch": 0.025894648037671202, + "grad_norm": 0.5361343026161194, + "learning_rate": 9.998084202396008e-06, + "loss": 2.3451, + "mean_token_accuracy": 0.4870217302814126, + "num_tokens": 218041797.0, + "step": 1504 + }, + { + "entropy": 2.5289306640625, + "epoch": 0.025911865223866463, + "grad_norm": 0.5812287330627441, + "learning_rate": 9.998076577827363e-06, + "loss": 2.466, + "mean_token_accuracy": 0.4721836093813181, + "num_tokens": 218184823.0, + "step": 1505 + }, + { + "entropy": 2.4560546875, + "epoch": 0.025929082410061723, + "grad_norm": 0.667839527130127, + "learning_rate": 9.998068938119471e-06, + "loss": 2.4359, + "mean_token_accuracy": 0.4753602985292673, + "num_tokens": 218322885.0, + "step": 1506 + }, + { + "entropy": 2.56201171875, + "epoch": 0.025946299596256984, + "grad_norm": 0.5627098083496094, + "learning_rate": 9.998061283272353e-06, + "loss": 2.4754, + "mean_token_accuracy": 0.4772064401768148, + "num_tokens": 218485130.0, + "step": 1507 + }, + { + "entropy": 2.4844970703125, + "epoch": 0.025963516782452244, + "grad_norm": 0.5547756552696228, + "learning_rate": 9.998053613286036e-06, + "loss": 2.4126, + "mean_token_accuracy": 0.4796820618212223, + "num_tokens": 218631229.0, + "step": 1508 + }, + { + "entropy": 2.4619140625, + "epoch": 0.025980733968647505, + "grad_norm": 0.5786131024360657, + "learning_rate": 9.99804592816054e-06, + "loss": 2.435, + "mean_token_accuracy": 0.48094206769019365, + "num_tokens": 218784443.0, + "step": 1509 + }, + { + "entropy": 2.4591064453125, + "epoch": 0.025997951154842765, + "grad_norm": 0.5810285806655884, + "learning_rate": 9.99803822789589e-06, + "loss": 2.419, + "mean_token_accuracy": 0.47504253312945366, + "num_tokens": 218923714.0, + "step": 1510 + }, + { + "entropy": 2.465087890625, + "epoch": 0.026015168341038022, + "grad_norm": 0.570573091506958, + "learning_rate": 9.998030512492108e-06, + "loss": 2.4312, + "mean_token_accuracy": 0.4744260283187032, + "num_tokens": 219076633.0, + "step": 1511 + }, + { + "entropy": 2.4339599609375, + "epoch": 0.026032385527233283, + "grad_norm": 0.5941885709762573, + "learning_rate": 9.998022781949217e-06, + "loss": 2.4043, + "mean_token_accuracy": 0.48197115818038583, + "num_tokens": 219219462.0, + "step": 1512 + }, + { + "entropy": 2.4404296875, + "epoch": 0.026049602713428543, + "grad_norm": 0.6012705564498901, + "learning_rate": 9.998015036267243e-06, + "loss": 2.4462, + "mean_token_accuracy": 0.4794664392247796, + "num_tokens": 219366857.0, + "step": 1513 + }, + { + "entropy": 2.44775390625, + "epoch": 0.026066819899623804, + "grad_norm": 0.6487138271331787, + "learning_rate": 9.998007275446206e-06, + "loss": 2.4562, + "mean_token_accuracy": 0.48240925278514624, + "num_tokens": 219501713.0, + "step": 1514 + }, + { + "entropy": 2.4669189453125, + "epoch": 0.026084037085819065, + "grad_norm": 0.5475379228591919, + "learning_rate": 9.997999499486134e-06, + "loss": 2.4614, + "mean_token_accuracy": 0.4733142964541912, + "num_tokens": 219651285.0, + "step": 1515 + }, + { + "entropy": 2.4892578125, + "epoch": 0.026101254272014325, + "grad_norm": 0.5989498496055603, + "learning_rate": 9.997991708387047e-06, + "loss": 2.4537, + "mean_token_accuracy": 0.4730882477015257, + "num_tokens": 219781732.0, + "step": 1516 + }, + { + "entropy": 2.48681640625, + "epoch": 0.026118471458209586, + "grad_norm": 0.6295292973518372, + "learning_rate": 9.99798390214897e-06, + "loss": 2.5009, + "mean_token_accuracy": 0.4708400582894683, + "num_tokens": 219923239.0, + "step": 1517 + }, + { + "entropy": 2.525390625, + "epoch": 0.026135688644404846, + "grad_norm": 0.5331100225448608, + "learning_rate": 9.997976080771924e-06, + "loss": 2.5217, + "mean_token_accuracy": 0.4696351978927851, + "num_tokens": 220085911.0, + "step": 1518 + }, + { + "entropy": 2.3858642578125, + "epoch": 0.026152905830600107, + "grad_norm": 0.5755723118782043, + "learning_rate": 9.997968244255937e-06, + "loss": 2.3168, + "mean_token_accuracy": 0.4918542858213186, + "num_tokens": 220225909.0, + "step": 1519 + }, + { + "entropy": 2.41943359375, + "epoch": 0.026170123016795364, + "grad_norm": 0.583797812461853, + "learning_rate": 9.99796039260103e-06, + "loss": 2.4, + "mean_token_accuracy": 0.4865543758496642, + "num_tokens": 220360506.0, + "step": 1520 + }, + { + "entropy": 2.4930419921875, + "epoch": 0.026187340202990624, + "grad_norm": 0.5480349659919739, + "learning_rate": 9.997952525807229e-06, + "loss": 2.4718, + "mean_token_accuracy": 0.4783789166249335, + "num_tokens": 220518785.0, + "step": 1521 + }, + { + "entropy": 2.446533203125, + "epoch": 0.026204557389185885, + "grad_norm": 0.565557062625885, + "learning_rate": 9.997944643874553e-06, + "loss": 2.4001, + "mean_token_accuracy": 0.48282787948846817, + "num_tokens": 220669933.0, + "step": 1522 + }, + { + "entropy": 2.398681640625, + "epoch": 0.026221774575381145, + "grad_norm": 0.5465447902679443, + "learning_rate": 9.997936746803032e-06, + "loss": 2.3459, + "mean_token_accuracy": 0.4924741433933377, + "num_tokens": 220817234.0, + "step": 1523 + }, + { + "entropy": 2.4630126953125, + "epoch": 0.026238991761576406, + "grad_norm": 0.6321006417274475, + "learning_rate": 9.997928834592686e-06, + "loss": 2.4352, + "mean_token_accuracy": 0.47752982610836625, + "num_tokens": 220954570.0, + "step": 1524 + }, + { + "entropy": 2.4580078125, + "epoch": 0.026256208947771666, + "grad_norm": 0.56505286693573, + "learning_rate": 9.99792090724354e-06, + "loss": 2.4511, + "mean_token_accuracy": 0.48103819927200675, + "num_tokens": 221097668.0, + "step": 1525 + }, + { + "entropy": 2.50634765625, + "epoch": 0.026273426133966927, + "grad_norm": 0.5791621208190918, + "learning_rate": 9.997912964755618e-06, + "loss": 2.4866, + "mean_token_accuracy": 0.46793483989313245, + "num_tokens": 221236494.0, + "step": 1526 + }, + { + "entropy": 2.4517822265625, + "epoch": 0.026290643320162187, + "grad_norm": 0.6186292171478271, + "learning_rate": 9.997905007128946e-06, + "loss": 2.4148, + "mean_token_accuracy": 0.47660065814852715, + "num_tokens": 221371798.0, + "step": 1527 + }, + { + "entropy": 2.457763671875, + "epoch": 0.026307860506357444, + "grad_norm": 0.5822423100471497, + "learning_rate": 9.997897034363544e-06, + "loss": 2.4246, + "mean_token_accuracy": 0.4802300548180938, + "num_tokens": 221514663.0, + "step": 1528 + }, + { + "entropy": 2.4844970703125, + "epoch": 0.026325077692552705, + "grad_norm": 0.5579072833061218, + "learning_rate": 9.997889046459438e-06, + "loss": 2.4567, + "mean_token_accuracy": 0.47267806623131037, + "num_tokens": 221654782.0, + "step": 1529 + }, + { + "entropy": 2.482666015625, + "epoch": 0.026342294878747965, + "grad_norm": 0.5542824864387512, + "learning_rate": 9.997881043416653e-06, + "loss": 2.4276, + "mean_token_accuracy": 0.47997074108570814, + "num_tokens": 221806130.0, + "step": 1530 + }, + { + "entropy": 2.4971923828125, + "epoch": 0.026359512064943226, + "grad_norm": 0.5979059934616089, + "learning_rate": 9.997873025235215e-06, + "loss": 2.4411, + "mean_token_accuracy": 0.4823180711828172, + "num_tokens": 221951263.0, + "step": 1531 + }, + { + "entropy": 2.483154296875, + "epoch": 0.026376729251138487, + "grad_norm": 1.1579712629318237, + "learning_rate": 9.997864991915142e-06, + "loss": 2.4429, + "mean_token_accuracy": 0.4803687371313572, + "num_tokens": 222106221.0, + "step": 1532 + }, + { + "entropy": 2.437255859375, + "epoch": 0.026393946437333747, + "grad_norm": 0.585996687412262, + "learning_rate": 9.997856943456465e-06, + "loss": 2.4413, + "mean_token_accuracy": 0.4799509192816913, + "num_tokens": 222278140.0, + "step": 1533 + }, + { + "entropy": 2.478759765625, + "epoch": 0.026411163623529008, + "grad_norm": 0.5700007081031799, + "learning_rate": 9.997848879859205e-06, + "loss": 2.3974, + "mean_token_accuracy": 0.48377181869000196, + "num_tokens": 222420219.0, + "step": 1534 + }, + { + "entropy": 2.4041748046875, + "epoch": 0.026428380809724268, + "grad_norm": 0.5667886137962341, + "learning_rate": 9.99784080112339e-06, + "loss": 2.387, + "mean_token_accuracy": 0.48956062365323305, + "num_tokens": 222562430.0, + "step": 1535 + }, + { + "entropy": 2.5068359375, + "epoch": 0.026445597995919525, + "grad_norm": 0.5823205709457397, + "learning_rate": 9.997832707249038e-06, + "loss": 2.4595, + "mean_token_accuracy": 0.47320169396698475, + "num_tokens": 222700414.0, + "step": 1536 + }, + { + "entropy": 2.4405517578125, + "epoch": 0.026462815182114786, + "grad_norm": 0.546593427658081, + "learning_rate": 9.997824598236179e-06, + "loss": 2.3965, + "mean_token_accuracy": 0.48351220693439245, + "num_tokens": 222856133.0, + "step": 1537 + }, + { + "entropy": 2.4632568359375, + "epoch": 0.026480032368310046, + "grad_norm": 0.5839020013809204, + "learning_rate": 9.997816474084833e-06, + "loss": 2.4677, + "mean_token_accuracy": 0.4769261754117906, + "num_tokens": 223003734.0, + "step": 1538 + }, + { + "entropy": 2.3848876953125, + "epoch": 0.026497249554505307, + "grad_norm": 0.5713815093040466, + "learning_rate": 9.997808334795032e-06, + "loss": 2.3267, + "mean_token_accuracy": 0.4928825213573873, + "num_tokens": 223150812.0, + "step": 1539 + }, + { + "entropy": 2.4129638671875, + "epoch": 0.026514466740700567, + "grad_norm": 0.5621561408042908, + "learning_rate": 9.997800180366792e-06, + "loss": 2.369, + "mean_token_accuracy": 0.486449159681797, + "num_tokens": 223302261.0, + "step": 1540 + }, + { + "entropy": 2.4468994140625, + "epoch": 0.026531683926895828, + "grad_norm": 0.5728362202644348, + "learning_rate": 9.997792010800144e-06, + "loss": 2.3988, + "mean_token_accuracy": 0.4837730205617845, + "num_tokens": 223448432.0, + "step": 1541 + }, + { + "entropy": 2.4149169921875, + "epoch": 0.02654890111309109, + "grad_norm": 0.6072005033493042, + "learning_rate": 9.99778382609511e-06, + "loss": 2.3772, + "mean_token_accuracy": 0.48883628472685814, + "num_tokens": 223586866.0, + "step": 1542 + }, + { + "entropy": 2.4757080078125, + "epoch": 0.02656611829928635, + "grad_norm": 0.5731198191642761, + "learning_rate": 9.997775626251715e-06, + "loss": 2.4887, + "mean_token_accuracy": 0.4729891321621835, + "num_tokens": 223739874.0, + "step": 1543 + }, + { + "entropy": 2.4337158203125, + "epoch": 0.02658333548548161, + "grad_norm": 0.5194607377052307, + "learning_rate": 9.997767411269984e-06, + "loss": 2.4039, + "mean_token_accuracy": 0.47655164077878, + "num_tokens": 223889106.0, + "step": 1544 + }, + { + "entropy": 2.372314453125, + "epoch": 0.026600552671676866, + "grad_norm": 0.5741761922836304, + "learning_rate": 9.997759181149941e-06, + "loss": 2.3325, + "mean_token_accuracy": 0.49444987904280424, + "num_tokens": 224029055.0, + "step": 1545 + }, + { + "entropy": 2.479736328125, + "epoch": 0.026617769857872127, + "grad_norm": 0.5601452589035034, + "learning_rate": 9.997750935891615e-06, + "loss": 2.4167, + "mean_token_accuracy": 0.47284849267452955, + "num_tokens": 224166339.0, + "step": 1546 + }, + { + "entropy": 2.44091796875, + "epoch": 0.026634987044067387, + "grad_norm": 0.6984487771987915, + "learning_rate": 9.997742675495025e-06, + "loss": 2.3943, + "mean_token_accuracy": 0.4849527506157756, + "num_tokens": 224314144.0, + "step": 1547 + }, + { + "entropy": 2.42138671875, + "epoch": 0.026652204230262648, + "grad_norm": 0.6234527230262756, + "learning_rate": 9.9977343999602e-06, + "loss": 2.3438, + "mean_token_accuracy": 0.49563004495576024, + "num_tokens": 224465327.0, + "step": 1548 + }, + { + "entropy": 2.462158203125, + "epoch": 0.02666942141645791, + "grad_norm": 0.5658462047576904, + "learning_rate": 9.997726109287164e-06, + "loss": 2.4079, + "mean_token_accuracy": 0.48139845160767436, + "num_tokens": 224608026.0, + "step": 1549 + }, + { + "entropy": 2.40087890625, + "epoch": 0.02668663860265317, + "grad_norm": 0.57075035572052, + "learning_rate": 9.997717803475942e-06, + "loss": 2.3665, + "mean_token_accuracy": 0.4883747296407819, + "num_tokens": 224758446.0, + "step": 1550 + }, + { + "entropy": 2.45947265625, + "epoch": 0.02670385578884843, + "grad_norm": 0.5546119809150696, + "learning_rate": 9.99770948252656e-06, + "loss": 2.4301, + "mean_token_accuracy": 0.47884270129725337, + "num_tokens": 224900091.0, + "step": 1551 + }, + { + "entropy": 2.4818115234375, + "epoch": 0.02672107297504369, + "grad_norm": 0.584319531917572, + "learning_rate": 9.99770114643904e-06, + "loss": 2.444, + "mean_token_accuracy": 0.4730917243286967, + "num_tokens": 225040740.0, + "step": 1552 + }, + { + "entropy": 2.4044189453125, + "epoch": 0.026738290161238947, + "grad_norm": 0.6092365980148315, + "learning_rate": 9.997692795213412e-06, + "loss": 2.3605, + "mean_token_accuracy": 0.4851328986696899, + "num_tokens": 225186755.0, + "step": 1553 + }, + { + "entropy": 2.4434814453125, + "epoch": 0.026755507347434208, + "grad_norm": 0.6220741868019104, + "learning_rate": 9.997684428849698e-06, + "loss": 2.4281, + "mean_token_accuracy": 0.4839205415919423, + "num_tokens": 225334853.0, + "step": 1554 + }, + { + "entropy": 2.4833984375, + "epoch": 0.026772724533629468, + "grad_norm": 0.6230896711349487, + "learning_rate": 9.997676047347926e-06, + "loss": 2.4525, + "mean_token_accuracy": 0.4806727943941951, + "num_tokens": 225463554.0, + "step": 1555 + }, + { + "entropy": 2.4664306640625, + "epoch": 0.02678994171982473, + "grad_norm": 0.5725734233856201, + "learning_rate": 9.997667650708117e-06, + "loss": 2.4501, + "mean_token_accuracy": 0.4741324451752007, + "num_tokens": 225606325.0, + "step": 1556 + }, + { + "entropy": 2.511962890625, + "epoch": 0.02680715890601999, + "grad_norm": 0.5844495296478271, + "learning_rate": 9.9976592389303e-06, + "loss": 2.4653, + "mean_token_accuracy": 0.47366986563429236, + "num_tokens": 225755308.0, + "step": 1557 + }, + { + "entropy": 2.454345703125, + "epoch": 0.02682437609221525, + "grad_norm": 0.5523918867111206, + "learning_rate": 9.9976508120145e-06, + "loss": 2.4185, + "mean_token_accuracy": 0.4829486352391541, + "num_tokens": 225916539.0, + "step": 1558 + }, + { + "entropy": 2.47607421875, + "epoch": 0.02684159327841051, + "grad_norm": 0.5556900501251221, + "learning_rate": 9.997642369960743e-06, + "loss": 2.4678, + "mean_token_accuracy": 0.47704968182370067, + "num_tokens": 226065388.0, + "step": 1559 + }, + { + "entropy": 2.4385986328125, + "epoch": 0.02685881046460577, + "grad_norm": 0.5409045219421387, + "learning_rate": 9.997633912769054e-06, + "loss": 2.3615, + "mean_token_accuracy": 0.4902169401757419, + "num_tokens": 226221599.0, + "step": 1560 + }, + { + "entropy": 2.439208984375, + "epoch": 0.026876027650801028, + "grad_norm": 0.612868070602417, + "learning_rate": 9.997625440439457e-06, + "loss": 2.4594, + "mean_token_accuracy": 0.47914552967995405, + "num_tokens": 226362147.0, + "step": 1561 + }, + { + "entropy": 2.46142578125, + "epoch": 0.02689324483699629, + "grad_norm": 0.5608747601509094, + "learning_rate": 9.997616952971979e-06, + "loss": 2.4424, + "mean_token_accuracy": 0.4790024384856224, + "num_tokens": 226510293.0, + "step": 1562 + }, + { + "entropy": 2.47900390625, + "epoch": 0.02691046202319155, + "grad_norm": 0.5777642726898193, + "learning_rate": 9.997608450366646e-06, + "loss": 2.4461, + "mean_token_accuracy": 0.4724403969012201, + "num_tokens": 226654656.0, + "step": 1563 + }, + { + "entropy": 2.42529296875, + "epoch": 0.02692767920938681, + "grad_norm": 0.5585452914237976, + "learning_rate": 9.997599932623485e-06, + "loss": 2.3716, + "mean_token_accuracy": 0.4799959072843194, + "num_tokens": 226810305.0, + "step": 1564 + }, + { + "entropy": 2.391357421875, + "epoch": 0.02694489639558207, + "grad_norm": 0.6446172595024109, + "learning_rate": 9.997591399742518e-06, + "loss": 2.3486, + "mean_token_accuracy": 0.4916059426032007, + "num_tokens": 226969167.0, + "step": 1565 + }, + { + "entropy": 2.4498291015625, + "epoch": 0.02696211358177733, + "grad_norm": 0.5548785924911499, + "learning_rate": 9.997582851723773e-06, + "loss": 2.4116, + "mean_token_accuracy": 0.4788562678731978, + "num_tokens": 227115813.0, + "step": 1566 + }, + { + "entropy": 2.5760498046875, + "epoch": 0.02697933076797259, + "grad_norm": 0.5751841068267822, + "learning_rate": 9.997574288567277e-06, + "loss": 2.5083, + "mean_token_accuracy": 0.4620812046341598, + "num_tokens": 227265495.0, + "step": 1567 + }, + { + "entropy": 2.470947265625, + "epoch": 0.02699654795416785, + "grad_norm": 1.4221575260162354, + "learning_rate": 9.997565710273056e-06, + "loss": 2.4116, + "mean_token_accuracy": 0.4814313040114939, + "num_tokens": 227406243.0, + "step": 1568 + }, + { + "entropy": 2.461669921875, + "epoch": 0.027013765140363112, + "grad_norm": 0.5775115489959717, + "learning_rate": 9.997557116841134e-06, + "loss": 2.4235, + "mean_token_accuracy": 0.4807606884278357, + "num_tokens": 227566411.0, + "step": 1569 + }, + { + "entropy": 2.447509765625, + "epoch": 0.02703098232655837, + "grad_norm": 0.5913234353065491, + "learning_rate": 9.997548508271537e-06, + "loss": 2.3974, + "mean_token_accuracy": 0.48274747421965003, + "num_tokens": 227695596.0, + "step": 1570 + }, + { + "entropy": 2.4384765625, + "epoch": 0.02704819951275363, + "grad_norm": 0.5852504968643188, + "learning_rate": 9.997539884564293e-06, + "loss": 2.3895, + "mean_token_accuracy": 0.4838076075538993, + "num_tokens": 227834644.0, + "step": 1571 + }, + { + "entropy": 2.4619140625, + "epoch": 0.02706541669894889, + "grad_norm": 0.5741419196128845, + "learning_rate": 9.997531245719427e-06, + "loss": 2.4267, + "mean_token_accuracy": 0.4829978961497545, + "num_tokens": 227981607.0, + "step": 1572 + }, + { + "entropy": 2.4593505859375, + "epoch": 0.02708263388514415, + "grad_norm": 0.5879804491996765, + "learning_rate": 9.997522591736965e-06, + "loss": 2.4387, + "mean_token_accuracy": 0.47719143191352487, + "num_tokens": 228129360.0, + "step": 1573 + }, + { + "entropy": 2.425048828125, + "epoch": 0.02709985107133941, + "grad_norm": 0.6101236343383789, + "learning_rate": 9.997513922616935e-06, + "loss": 2.3956, + "mean_token_accuracy": 0.4810835770331323, + "num_tokens": 228271608.0, + "step": 1574 + }, + { + "entropy": 2.447021484375, + "epoch": 0.027117068257534672, + "grad_norm": 0.5525056719779968, + "learning_rate": 9.997505238359362e-06, + "loss": 2.4221, + "mean_token_accuracy": 0.47596909245476127, + "num_tokens": 228425263.0, + "step": 1575 + }, + { + "entropy": 2.4608154296875, + "epoch": 0.027134285443729932, + "grad_norm": 0.5748025178909302, + "learning_rate": 9.997496538964269e-06, + "loss": 2.4484, + "mean_token_accuracy": 0.4767248681746423, + "num_tokens": 228577817.0, + "step": 1576 + }, + { + "entropy": 2.52978515625, + "epoch": 0.027151502629925193, + "grad_norm": 0.556822657585144, + "learning_rate": 9.997487824431687e-06, + "loss": 2.4771, + "mean_token_accuracy": 0.4696510974317789, + "num_tokens": 228731761.0, + "step": 1577 + }, + { + "entropy": 2.517578125, + "epoch": 0.02716871981612045, + "grad_norm": 0.6445668339729309, + "learning_rate": 9.997479094761641e-06, + "loss": 2.4696, + "mean_token_accuracy": 0.47059842152521014, + "num_tokens": 228882447.0, + "step": 1578 + }, + { + "entropy": 2.465576171875, + "epoch": 0.02718593700231571, + "grad_norm": 0.5520902276039124, + "learning_rate": 9.997470349954158e-06, + "loss": 2.4349, + "mean_token_accuracy": 0.4763860795646906, + "num_tokens": 229032651.0, + "step": 1579 + }, + { + "entropy": 2.477294921875, + "epoch": 0.02720315418851097, + "grad_norm": 0.6009482145309448, + "learning_rate": 9.997461590009263e-06, + "loss": 2.4098, + "mean_token_accuracy": 0.47861299896612763, + "num_tokens": 229180126.0, + "step": 1580 + }, + { + "entropy": 2.380615234375, + "epoch": 0.02722037137470623, + "grad_norm": 0.5625278353691101, + "learning_rate": 9.997452814926984e-06, + "loss": 2.3496, + "mean_token_accuracy": 0.4916512775234878, + "num_tokens": 229322174.0, + "step": 1581 + }, + { + "entropy": 2.5185546875, + "epoch": 0.027237588560901492, + "grad_norm": 0.6114506721496582, + "learning_rate": 9.997444024707345e-06, + "loss": 2.4779, + "mean_token_accuracy": 0.4617612757720053, + "num_tokens": 229454480.0, + "step": 1582 + }, + { + "entropy": 2.508544921875, + "epoch": 0.027254805747096753, + "grad_norm": 0.6254023909568787, + "learning_rate": 9.997435219350377e-06, + "loss": 2.4818, + "mean_token_accuracy": 0.47184726875275373, + "num_tokens": 229588770.0, + "step": 1583 + }, + { + "entropy": 2.37744140625, + "epoch": 0.027272022933292013, + "grad_norm": 0.5333112478256226, + "learning_rate": 9.997426398856103e-06, + "loss": 2.3411, + "mean_token_accuracy": 0.4888959531672299, + "num_tokens": 229754120.0, + "step": 1584 + }, + { + "entropy": 2.42529296875, + "epoch": 0.027289240119487274, + "grad_norm": 0.5652498602867126, + "learning_rate": 9.997417563224551e-06, + "loss": 2.3861, + "mean_token_accuracy": 0.4848038759082556, + "num_tokens": 229894455.0, + "step": 1585 + }, + { + "entropy": 2.4168701171875, + "epoch": 0.02730645730568253, + "grad_norm": 0.5742363929748535, + "learning_rate": 9.997408712455748e-06, + "loss": 2.3838, + "mean_token_accuracy": 0.4861548813059926, + "num_tokens": 230044890.0, + "step": 1586 + }, + { + "entropy": 2.4842529296875, + "epoch": 0.02732367449187779, + "grad_norm": 0.5573915839195251, + "learning_rate": 9.99739984654972e-06, + "loss": 2.4365, + "mean_token_accuracy": 0.4722730196081102, + "num_tokens": 230180808.0, + "step": 1587 + }, + { + "entropy": 2.497802734375, + "epoch": 0.02734089167807305, + "grad_norm": 0.6639450192451477, + "learning_rate": 9.997390965506495e-06, + "loss": 2.4367, + "mean_token_accuracy": 0.47327509289607406, + "num_tokens": 230325006.0, + "step": 1588 + }, + { + "entropy": 2.514404296875, + "epoch": 0.027358108864268312, + "grad_norm": 0.5561137795448303, + "learning_rate": 9.997382069326099e-06, + "loss": 2.4852, + "mean_token_accuracy": 0.4662863416597247, + "num_tokens": 230483748.0, + "step": 1589 + }, + { + "entropy": 2.4310302734375, + "epoch": 0.027375326050463573, + "grad_norm": 0.5391974449157715, + "learning_rate": 9.997373158008558e-06, + "loss": 2.4237, + "mean_token_accuracy": 0.4804062177427113, + "num_tokens": 230645992.0, + "step": 1590 + }, + { + "entropy": 2.4122314453125, + "epoch": 0.027392543236658833, + "grad_norm": 0.577002763748169, + "learning_rate": 9.997364231553902e-06, + "loss": 2.3422, + "mean_token_accuracy": 0.4947265670634806, + "num_tokens": 230788584.0, + "step": 1591 + }, + { + "entropy": 2.4708251953125, + "epoch": 0.027409760422854094, + "grad_norm": 0.5783287286758423, + "learning_rate": 9.997355289962157e-06, + "loss": 2.4332, + "mean_token_accuracy": 0.47268676944077015, + "num_tokens": 230920875.0, + "step": 1592 + }, + { + "entropy": 2.495361328125, + "epoch": 0.027426977609049354, + "grad_norm": 0.5736862421035767, + "learning_rate": 9.997346333233347e-06, + "loss": 2.4839, + "mean_token_accuracy": 0.47372210351750255, + "num_tokens": 231066994.0, + "step": 1593 + }, + { + "entropy": 2.414306640625, + "epoch": 0.027444194795244615, + "grad_norm": 0.6175742149353027, + "learning_rate": 9.9973373613675e-06, + "loss": 2.3682, + "mean_token_accuracy": 0.481872939504683, + "num_tokens": 231204186.0, + "step": 1594 + }, + { + "entropy": 2.45068359375, + "epoch": 0.027461411981439872, + "grad_norm": 0.6151363253593445, + "learning_rate": 9.997328374364647e-06, + "loss": 2.4043, + "mean_token_accuracy": 0.4820513529703021, + "num_tokens": 231352849.0, + "step": 1595 + }, + { + "entropy": 2.406005859375, + "epoch": 0.027478629167635132, + "grad_norm": 0.591463565826416, + "learning_rate": 9.997319372224814e-06, + "loss": 2.3365, + "mean_token_accuracy": 0.4920554677955806, + "num_tokens": 231492437.0, + "step": 1596 + }, + { + "entropy": 2.4417724609375, + "epoch": 0.027495846353830393, + "grad_norm": 0.531035304069519, + "learning_rate": 9.997310354948026e-06, + "loss": 2.4092, + "mean_token_accuracy": 0.4780901288613677, + "num_tokens": 231644457.0, + "step": 1597 + }, + { + "entropy": 2.4251708984375, + "epoch": 0.027513063540025653, + "grad_norm": 0.5585000514984131, + "learning_rate": 9.99730132253431e-06, + "loss": 2.3906, + "mean_token_accuracy": 0.4829976833425462, + "num_tokens": 231800143.0, + "step": 1598 + }, + { + "entropy": 2.36767578125, + "epoch": 0.027530280726220914, + "grad_norm": 0.5619704127311707, + "learning_rate": 9.997292274983696e-06, + "loss": 2.3234, + "mean_token_accuracy": 0.49107312550768256, + "num_tokens": 231959373.0, + "step": 1599 + }, + { + "entropy": 2.4547119140625, + "epoch": 0.027547497912416175, + "grad_norm": 0.6032162308692932, + "learning_rate": 9.997283212296211e-06, + "loss": 2.4341, + "mean_token_accuracy": 0.4722294365055859, + "num_tokens": 232110415.0, + "step": 1600 + }, + { + "entropy": 2.40576171875, + "epoch": 0.027564715098611435, + "grad_norm": 0.6166012287139893, + "learning_rate": 9.99727413447188e-06, + "loss": 2.3535, + "mean_token_accuracy": 0.489782630931586, + "num_tokens": 232247894.0, + "step": 1601 + }, + { + "entropy": 2.4820556640625, + "epoch": 0.027581932284806696, + "grad_norm": 0.5851189494132996, + "learning_rate": 9.997265041510733e-06, + "loss": 2.4721, + "mean_token_accuracy": 0.4751852685585618, + "num_tokens": 232388779.0, + "step": 1602 + }, + { + "entropy": 2.4580078125, + "epoch": 0.027599149471001953, + "grad_norm": 0.5721387267112732, + "learning_rate": 9.997255933412797e-06, + "loss": 2.4279, + "mean_token_accuracy": 0.48017449537292123, + "num_tokens": 232534833.0, + "step": 1603 + }, + { + "entropy": 2.4854736328125, + "epoch": 0.027616366657197213, + "grad_norm": 0.5649420022964478, + "learning_rate": 9.997246810178099e-06, + "loss": 2.4592, + "mean_token_accuracy": 0.4723159852437675, + "num_tokens": 232685772.0, + "step": 1604 + }, + { + "entropy": 2.4073486328125, + "epoch": 0.027633583843392474, + "grad_norm": 1.3038853406906128, + "learning_rate": 9.997237671806665e-06, + "loss": 2.3341, + "mean_token_accuracy": 0.4938134755939245, + "num_tokens": 232844671.0, + "step": 1605 + }, + { + "entropy": 2.388671875, + "epoch": 0.027650801029587734, + "grad_norm": 0.5900284647941589, + "learning_rate": 9.997228518298527e-06, + "loss": 2.338, + "mean_token_accuracy": 0.4900298644788563, + "num_tokens": 232985544.0, + "step": 1606 + }, + { + "entropy": 2.411376953125, + "epoch": 0.027668018215782995, + "grad_norm": 0.5831287503242493, + "learning_rate": 9.997219349653708e-06, + "loss": 2.3888, + "mean_token_accuracy": 0.48680589720606804, + "num_tokens": 233120888.0, + "step": 1607 + }, + { + "entropy": 2.5101318359375, + "epoch": 0.027685235401978255, + "grad_norm": 0.5937960743904114, + "learning_rate": 9.99721016587224e-06, + "loss": 2.4551, + "mean_token_accuracy": 0.47399421967566013, + "num_tokens": 233260349.0, + "step": 1608 + }, + { + "entropy": 2.4505615234375, + "epoch": 0.027702452588173516, + "grad_norm": 0.5459933876991272, + "learning_rate": 9.997200966954149e-06, + "loss": 2.4176, + "mean_token_accuracy": 0.47501937579363585, + "num_tokens": 233405845.0, + "step": 1609 + }, + { + "entropy": 2.4483642578125, + "epoch": 0.027719669774368776, + "grad_norm": 0.5332619547843933, + "learning_rate": 9.997191752899462e-06, + "loss": 2.3866, + "mean_token_accuracy": 0.47997900983318686, + "num_tokens": 233570774.0, + "step": 1610 + }, + { + "entropy": 2.4091796875, + "epoch": 0.027736886960564033, + "grad_norm": 0.57257479429245, + "learning_rate": 9.997182523708208e-06, + "loss": 2.3707, + "mean_token_accuracy": 0.4883900200948119, + "num_tokens": 233720314.0, + "step": 1611 + }, + { + "entropy": 2.4427490234375, + "epoch": 0.027754104146759294, + "grad_norm": 0.5707916617393494, + "learning_rate": 9.997173279380415e-06, + "loss": 2.4081, + "mean_token_accuracy": 0.48100787540897727, + "num_tokens": 233865434.0, + "step": 1612 + }, + { + "entropy": 2.4808349609375, + "epoch": 0.027771321332954554, + "grad_norm": 0.6137810945510864, + "learning_rate": 9.997164019916109e-06, + "loss": 2.4036, + "mean_token_accuracy": 0.4781500454992056, + "num_tokens": 234000908.0, + "step": 1613 + }, + { + "entropy": 2.44140625, + "epoch": 0.027788538519149815, + "grad_norm": 0.569161593914032, + "learning_rate": 9.997154745315321e-06, + "loss": 2.4136, + "mean_token_accuracy": 0.48161053331568837, + "num_tokens": 234154080.0, + "step": 1614 + }, + { + "entropy": 2.43212890625, + "epoch": 0.027805755705345075, + "grad_norm": 0.5593316555023193, + "learning_rate": 9.997145455578076e-06, + "loss": 2.3662, + "mean_token_accuracy": 0.4847466191276908, + "num_tokens": 234304655.0, + "step": 1615 + }, + { + "entropy": 2.4293212890625, + "epoch": 0.027822972891540336, + "grad_norm": 0.6409513354301453, + "learning_rate": 9.997136150704407e-06, + "loss": 2.3821, + "mean_token_accuracy": 0.48784798802807927, + "num_tokens": 234428782.0, + "step": 1616 + }, + { + "entropy": 2.4359130859375, + "epoch": 0.027840190077735597, + "grad_norm": 0.5953328013420105, + "learning_rate": 9.997126830694336e-06, + "loss": 2.3689, + "mean_token_accuracy": 0.48606695234775543, + "num_tokens": 234563589.0, + "step": 1617 + }, + { + "entropy": 2.4755859375, + "epoch": 0.027857407263930857, + "grad_norm": 1.202374815940857, + "learning_rate": 9.997117495547897e-06, + "loss": 2.4366, + "mean_token_accuracy": 0.47866149339824915, + "num_tokens": 234713984.0, + "step": 1618 + }, + { + "entropy": 2.4501953125, + "epoch": 0.027874624450126118, + "grad_norm": 0.6060045957565308, + "learning_rate": 9.997108145265113e-06, + "loss": 2.4131, + "mean_token_accuracy": 0.48174894973635674, + "num_tokens": 234856641.0, + "step": 1619 + }, + { + "entropy": 2.4287109375, + "epoch": 0.027891841636321375, + "grad_norm": 0.5465239882469177, + "learning_rate": 9.997098779846017e-06, + "loss": 2.3764, + "mean_token_accuracy": 0.48386607645079494, + "num_tokens": 235015161.0, + "step": 1620 + }, + { + "entropy": 2.4818115234375, + "epoch": 0.027909058822516635, + "grad_norm": 0.5660187602043152, + "learning_rate": 9.997089399290635e-06, + "loss": 2.4657, + "mean_token_accuracy": 0.47468016063794494, + "num_tokens": 235151596.0, + "step": 1621 + }, + { + "entropy": 2.471435546875, + "epoch": 0.027926276008711896, + "grad_norm": 0.572928786277771, + "learning_rate": 9.997080003598995e-06, + "loss": 2.4231, + "mean_token_accuracy": 0.47549346927553415, + "num_tokens": 235291268.0, + "step": 1622 + }, + { + "entropy": 2.498046875, + "epoch": 0.027943493194907156, + "grad_norm": 0.566953182220459, + "learning_rate": 9.997070592771127e-06, + "loss": 2.4507, + "mean_token_accuracy": 0.47433658177033067, + "num_tokens": 235435246.0, + "step": 1623 + }, + { + "entropy": 2.448486328125, + "epoch": 0.027960710381102417, + "grad_norm": 0.6662017703056335, + "learning_rate": 9.997061166807057e-06, + "loss": 2.4008, + "mean_token_accuracy": 0.48391238413751125, + "num_tokens": 235570223.0, + "step": 1624 + }, + { + "entropy": 2.416015625, + "epoch": 0.027977927567297677, + "grad_norm": 0.5774509906768799, + "learning_rate": 9.997051725706816e-06, + "loss": 2.3538, + "mean_token_accuracy": 0.48118720296770334, + "num_tokens": 235725981.0, + "step": 1625 + }, + { + "entropy": 2.4285888671875, + "epoch": 0.027995144753492938, + "grad_norm": 0.5478702783584595, + "learning_rate": 9.997042269470431e-06, + "loss": 2.3242, + "mean_token_accuracy": 0.49387524649500847, + "num_tokens": 235876462.0, + "step": 1626 + }, + { + "entropy": 2.462158203125, + "epoch": 0.0280123619396882, + "grad_norm": 0.573614776134491, + "learning_rate": 9.997032798097935e-06, + "loss": 2.425, + "mean_token_accuracy": 0.48535694507882, + "num_tokens": 236015336.0, + "step": 1627 + }, + { + "entropy": 2.50634765625, + "epoch": 0.028029579125883455, + "grad_norm": 0.6228254437446594, + "learning_rate": 9.997023311589349e-06, + "loss": 2.4627, + "mean_token_accuracy": 0.47959386138245463, + "num_tokens": 236155028.0, + "step": 1628 + }, + { + "entropy": 2.5274658203125, + "epoch": 0.028046796312078716, + "grad_norm": 0.6030795574188232, + "learning_rate": 9.997013809944708e-06, + "loss": 2.4979, + "mean_token_accuracy": 0.4653917569667101, + "num_tokens": 236309148.0, + "step": 1629 + }, + { + "entropy": 2.388916015625, + "epoch": 0.028064013498273976, + "grad_norm": 0.558562695980072, + "learning_rate": 9.997004293164038e-06, + "loss": 2.3689, + "mean_token_accuracy": 0.48823840310797095, + "num_tokens": 236475648.0, + "step": 1630 + }, + { + "entropy": 2.43798828125, + "epoch": 0.028081230684469237, + "grad_norm": 0.5895214080810547, + "learning_rate": 9.996994761247368e-06, + "loss": 2.4171, + "mean_token_accuracy": 0.4803608972579241, + "num_tokens": 236615733.0, + "step": 1631 + }, + { + "entropy": 2.421875, + "epoch": 0.028098447870664497, + "grad_norm": 0.5724160075187683, + "learning_rate": 9.996985214194727e-06, + "loss": 2.3814, + "mean_token_accuracy": 0.483953685965389, + "num_tokens": 236756289.0, + "step": 1632 + }, + { + "entropy": 2.47021484375, + "epoch": 0.028115665056859758, + "grad_norm": 0.5840807557106018, + "learning_rate": 9.996975652006146e-06, + "loss": 2.4387, + "mean_token_accuracy": 0.4743587113916874, + "num_tokens": 236887859.0, + "step": 1633 + }, + { + "entropy": 2.46826171875, + "epoch": 0.02813288224305502, + "grad_norm": 0.5316832065582275, + "learning_rate": 9.996966074681651e-06, + "loss": 2.4542, + "mean_token_accuracy": 0.4724106374196708, + "num_tokens": 237042538.0, + "step": 1634 + }, + { + "entropy": 2.4212646484375, + "epoch": 0.02815009942925028, + "grad_norm": 0.5952966809272766, + "learning_rate": 9.996956482221273e-06, + "loss": 2.4116, + "mean_token_accuracy": 0.4807650912553072, + "num_tokens": 237190659.0, + "step": 1635 + }, + { + "entropy": 2.4124755859375, + "epoch": 0.028167316615445536, + "grad_norm": 0.5247029066085815, + "learning_rate": 9.99694687462504e-06, + "loss": 2.359, + "mean_token_accuracy": 0.48549360083416104, + "num_tokens": 237358115.0, + "step": 1636 + }, + { + "entropy": 2.387939453125, + "epoch": 0.028184533801640797, + "grad_norm": 0.5859695672988892, + "learning_rate": 9.996937251892982e-06, + "loss": 2.3148, + "mean_token_accuracy": 0.49537815153598785, + "num_tokens": 237496922.0, + "step": 1637 + }, + { + "entropy": 2.4844970703125, + "epoch": 0.028201750987836057, + "grad_norm": 0.6078894734382629, + "learning_rate": 9.996927614025127e-06, + "loss": 2.443, + "mean_token_accuracy": 0.4838123098015785, + "num_tokens": 237623117.0, + "step": 1638 + }, + { + "entropy": 2.533203125, + "epoch": 0.028218968174031318, + "grad_norm": 0.5760785341262817, + "learning_rate": 9.996917961021504e-06, + "loss": 2.4848, + "mean_token_accuracy": 0.4731389367952943, + "num_tokens": 237763918.0, + "step": 1639 + }, + { + "entropy": 2.46826171875, + "epoch": 0.028236185360226578, + "grad_norm": 0.539117157459259, + "learning_rate": 9.996908292882144e-06, + "loss": 2.4377, + "mean_token_accuracy": 0.47789577953517437, + "num_tokens": 237926377.0, + "step": 1640 + }, + { + "entropy": 2.465087890625, + "epoch": 0.02825340254642184, + "grad_norm": 0.5144519209861755, + "learning_rate": 9.996898609607075e-06, + "loss": 2.4543, + "mean_token_accuracy": 0.474421392660588, + "num_tokens": 238089528.0, + "step": 1641 + }, + { + "entropy": 2.4031982421875, + "epoch": 0.0282706197326171, + "grad_norm": 0.5754570960998535, + "learning_rate": 9.996888911196326e-06, + "loss": 2.3766, + "mean_token_accuracy": 0.4901612875983119, + "num_tokens": 238231433.0, + "step": 1642 + }, + { + "entropy": 2.4429931640625, + "epoch": 0.02828783691881236, + "grad_norm": 0.6072561740875244, + "learning_rate": 9.996879197649927e-06, + "loss": 2.394, + "mean_token_accuracy": 0.4848616924136877, + "num_tokens": 238354749.0, + "step": 1643 + }, + { + "entropy": 2.43603515625, + "epoch": 0.02830505410500762, + "grad_norm": 0.70046466588974, + "learning_rate": 9.996869468967909e-06, + "loss": 2.3639, + "mean_token_accuracy": 0.48499562544748187, + "num_tokens": 238506100.0, + "step": 1644 + }, + { + "entropy": 2.43603515625, + "epoch": 0.028322271291202877, + "grad_norm": 0.6182661652565002, + "learning_rate": 9.996859725150299e-06, + "loss": 2.3903, + "mean_token_accuracy": 0.4828766668215394, + "num_tokens": 238651208.0, + "step": 1645 + }, + { + "entropy": 2.39111328125, + "epoch": 0.028339488477398138, + "grad_norm": 0.5814700126647949, + "learning_rate": 9.996849966197127e-06, + "loss": 2.3526, + "mean_token_accuracy": 0.48803521366789937, + "num_tokens": 238805878.0, + "step": 1646 + }, + { + "entropy": 2.37841796875, + "epoch": 0.0283567056635934, + "grad_norm": 0.5634573698043823, + "learning_rate": 9.996840192108421e-06, + "loss": 2.3943, + "mean_token_accuracy": 0.48795266123488545, + "num_tokens": 238958546.0, + "step": 1647 + }, + { + "entropy": 2.5108642578125, + "epoch": 0.02837392284978866, + "grad_norm": 0.6036639213562012, + "learning_rate": 9.996830402884216e-06, + "loss": 2.4941, + "mean_token_accuracy": 0.4679912384599447, + "num_tokens": 239087208.0, + "step": 1648 + }, + { + "entropy": 2.4544677734375, + "epoch": 0.02839114003598392, + "grad_norm": 0.5786367654800415, + "learning_rate": 9.996820598524536e-06, + "loss": 2.4108, + "mean_token_accuracy": 0.4794202446937561, + "num_tokens": 239230718.0, + "step": 1649 + }, + { + "entropy": 2.427490234375, + "epoch": 0.02840835722217918, + "grad_norm": 0.5587934255599976, + "learning_rate": 9.996810779029413e-06, + "loss": 2.4071, + "mean_token_accuracy": 0.48648894764482975, + "num_tokens": 239388616.0, + "step": 1650 + }, + { + "entropy": 2.488037109375, + "epoch": 0.02842557440837444, + "grad_norm": 0.5868260860443115, + "learning_rate": 9.996800944398879e-06, + "loss": 2.4666, + "mean_token_accuracy": 0.47458827355876565, + "num_tokens": 239532563.0, + "step": 1651 + }, + { + "entropy": 2.3829345703125, + "epoch": 0.0284427915945697, + "grad_norm": 0.5640127658843994, + "learning_rate": 9.996791094632958e-06, + "loss": 2.3417, + "mean_token_accuracy": 0.4937547417357564, + "num_tokens": 239675698.0, + "step": 1652 + }, + { + "entropy": 2.542724609375, + "epoch": 0.028460008780764958, + "grad_norm": 0.5434319972991943, + "learning_rate": 9.996781229731685e-06, + "loss": 2.4758, + "mean_token_accuracy": 0.4686996676027775, + "num_tokens": 239821208.0, + "step": 1653 + }, + { + "entropy": 2.4918212890625, + "epoch": 0.02847722596696022, + "grad_norm": 0.5542386174201965, + "learning_rate": 9.996771349695089e-06, + "loss": 2.4299, + "mean_token_accuracy": 0.47332891169935465, + "num_tokens": 239965661.0, + "step": 1654 + }, + { + "entropy": 2.4259033203125, + "epoch": 0.02849444315315548, + "grad_norm": 0.5533227324485779, + "learning_rate": 9.996761454523198e-06, + "loss": 2.4194, + "mean_token_accuracy": 0.48256733594462276, + "num_tokens": 240122225.0, + "step": 1655 + }, + { + "entropy": 2.398193359375, + "epoch": 0.02851166033935074, + "grad_norm": 0.5755099058151245, + "learning_rate": 9.996751544216043e-06, + "loss": 2.3681, + "mean_token_accuracy": 0.48602569615468383, + "num_tokens": 240255685.0, + "step": 1656 + }, + { + "entropy": 2.336669921875, + "epoch": 0.028528877525546, + "grad_norm": 0.573204517364502, + "learning_rate": 9.996741618773654e-06, + "loss": 2.2821, + "mean_token_accuracy": 0.5057984632439911, + "num_tokens": 240412754.0, + "step": 1657 + }, + { + "entropy": 2.4609375, + "epoch": 0.02854609471174126, + "grad_norm": 0.6060733199119568, + "learning_rate": 9.99673167819606e-06, + "loss": 2.4604, + "mean_token_accuracy": 0.47678601182997227, + "num_tokens": 240540902.0, + "step": 1658 + }, + { + "entropy": 2.4752197265625, + "epoch": 0.02856331189793652, + "grad_norm": 0.5993353724479675, + "learning_rate": 9.996721722483296e-06, + "loss": 2.4133, + "mean_token_accuracy": 0.483869846444577, + "num_tokens": 240689582.0, + "step": 1659 + }, + { + "entropy": 2.38134765625, + "epoch": 0.028580529084131782, + "grad_norm": 0.5666155815124512, + "learning_rate": 9.996711751635384e-06, + "loss": 2.3604, + "mean_token_accuracy": 0.491028075106442, + "num_tokens": 240844782.0, + "step": 1660 + }, + { + "entropy": 2.441650390625, + "epoch": 0.02859774627032704, + "grad_norm": 0.5586746335029602, + "learning_rate": 9.996701765652361e-06, + "loss": 2.413, + "mean_token_accuracy": 0.4783767703920603, + "num_tokens": 240991388.0, + "step": 1661 + }, + { + "entropy": 2.455322265625, + "epoch": 0.0286149634565223, + "grad_norm": 0.5903511047363281, + "learning_rate": 9.996691764534255e-06, + "loss": 2.4062, + "mean_token_accuracy": 0.48205051850527525, + "num_tokens": 241124952.0, + "step": 1662 + }, + { + "entropy": 2.498046875, + "epoch": 0.02863218064271756, + "grad_norm": 0.5668911933898926, + "learning_rate": 9.996681748281094e-06, + "loss": 2.4549, + "mean_token_accuracy": 0.4747361782938242, + "num_tokens": 241262036.0, + "step": 1663 + }, + { + "entropy": 2.5220947265625, + "epoch": 0.02864939782891282, + "grad_norm": 0.6033514738082886, + "learning_rate": 9.996671716892914e-06, + "loss": 2.5395, + "mean_token_accuracy": 0.46681966073811054, + "num_tokens": 241416917.0, + "step": 1664 + }, + { + "entropy": 2.4417724609375, + "epoch": 0.02866661501510808, + "grad_norm": 0.6681655645370483, + "learning_rate": 9.996661670369739e-06, + "loss": 2.3974, + "mean_token_accuracy": 0.48373745242133737, + "num_tokens": 241554068.0, + "step": 1665 + }, + { + "entropy": 2.4580078125, + "epoch": 0.02868383220130334, + "grad_norm": 0.5739055275917053, + "learning_rate": 9.996651608711603e-06, + "loss": 2.361, + "mean_token_accuracy": 0.48040412552654743, + "num_tokens": 241714285.0, + "step": 1666 + }, + { + "entropy": 2.4683837890625, + "epoch": 0.028701049387498602, + "grad_norm": 0.6406212449073792, + "learning_rate": 9.996641531918536e-06, + "loss": 2.43, + "mean_token_accuracy": 0.48026127368211746, + "num_tokens": 241841614.0, + "step": 1667 + }, + { + "entropy": 2.5107421875, + "epoch": 0.028718266573693862, + "grad_norm": 0.5852003693580627, + "learning_rate": 9.996631439990568e-06, + "loss": 2.4992, + "mean_token_accuracy": 0.4704901375807822, + "num_tokens": 241980442.0, + "step": 1668 + }, + { + "entropy": 2.4345703125, + "epoch": 0.028735483759889123, + "grad_norm": 0.6459202170372009, + "learning_rate": 9.996621332927729e-06, + "loss": 2.4125, + "mean_token_accuracy": 0.4840882923454046, + "num_tokens": 242131950.0, + "step": 1669 + }, + { + "entropy": 2.4384765625, + "epoch": 0.02875270094608438, + "grad_norm": 0.6423004269599915, + "learning_rate": 9.996611210730053e-06, + "loss": 2.4433, + "mean_token_accuracy": 0.48514924151822925, + "num_tokens": 242274852.0, + "step": 1670 + }, + { + "entropy": 2.502685546875, + "epoch": 0.02876991813227964, + "grad_norm": 0.5465385913848877, + "learning_rate": 9.996601073397568e-06, + "loss": 2.4691, + "mean_token_accuracy": 0.4714298346079886, + "num_tokens": 242426188.0, + "step": 1671 + }, + { + "entropy": 2.405517578125, + "epoch": 0.0287871353184749, + "grad_norm": 0.5957438945770264, + "learning_rate": 9.996590920930303e-06, + "loss": 2.3951, + "mean_token_accuracy": 0.48868590872734785, + "num_tokens": 242571018.0, + "step": 1672 + }, + { + "entropy": 2.550537109375, + "epoch": 0.02880435250467016, + "grad_norm": 0.5585090517997742, + "learning_rate": 9.99658075332829e-06, + "loss": 2.5181, + "mean_token_accuracy": 0.46476193610578775, + "num_tokens": 242721700.0, + "step": 1673 + }, + { + "entropy": 2.4349365234375, + "epoch": 0.028821569690865422, + "grad_norm": 0.6010538935661316, + "learning_rate": 9.996570570591561e-06, + "loss": 2.3535, + "mean_token_accuracy": 0.48848696844652295, + "num_tokens": 242858693.0, + "step": 1674 + }, + { + "entropy": 2.4791259765625, + "epoch": 0.028838786877060683, + "grad_norm": 0.5733229517936707, + "learning_rate": 9.996560372720147e-06, + "loss": 2.4825, + "mean_token_accuracy": 0.4683424327522516, + "num_tokens": 243006868.0, + "step": 1675 + }, + { + "entropy": 2.4215087890625, + "epoch": 0.028856004063255943, + "grad_norm": 0.5834994316101074, + "learning_rate": 9.996550159714078e-06, + "loss": 2.3763, + "mean_token_accuracy": 0.4828924615867436, + "num_tokens": 243154886.0, + "step": 1676 + }, + { + "entropy": 2.44775390625, + "epoch": 0.028873221249451204, + "grad_norm": 0.6166858077049255, + "learning_rate": 9.996539931573385e-06, + "loss": 2.3781, + "mean_token_accuracy": 0.4847164931707084, + "num_tokens": 243288516.0, + "step": 1677 + }, + { + "entropy": 2.49072265625, + "epoch": 0.02889043843564646, + "grad_norm": 0.5641864538192749, + "learning_rate": 9.996529688298098e-06, + "loss": 2.4465, + "mean_token_accuracy": 0.473747827578336, + "num_tokens": 243441352.0, + "step": 1678 + }, + { + "entropy": 2.451171875, + "epoch": 0.02890765562184172, + "grad_norm": 0.6030487418174744, + "learning_rate": 9.996519429888249e-06, + "loss": 2.4481, + "mean_token_accuracy": 0.4830109137110412, + "num_tokens": 243588150.0, + "step": 1679 + }, + { + "entropy": 2.4708251953125, + "epoch": 0.028924872808036982, + "grad_norm": 0.5804004073143005, + "learning_rate": 9.99650915634387e-06, + "loss": 2.4297, + "mean_token_accuracy": 0.47798701329156756, + "num_tokens": 243737262.0, + "step": 1680 + }, + { + "entropy": 2.4835205078125, + "epoch": 0.028942089994232242, + "grad_norm": 0.5946245789527893, + "learning_rate": 9.996498867664992e-06, + "loss": 2.4639, + "mean_token_accuracy": 0.47278458066284657, + "num_tokens": 243872204.0, + "step": 1681 + }, + { + "entropy": 2.450439453125, + "epoch": 0.028959307180427503, + "grad_norm": 0.6240355372428894, + "learning_rate": 9.996488563851646e-06, + "loss": 2.3891, + "mean_token_accuracy": 0.48081753915175796, + "num_tokens": 243998841.0, + "step": 1682 + }, + { + "entropy": 2.385009765625, + "epoch": 0.028976524366622763, + "grad_norm": 0.5424374341964722, + "learning_rate": 9.99647824490386e-06, + "loss": 2.3166, + "mean_token_accuracy": 0.490779772400856, + "num_tokens": 244149478.0, + "step": 1683 + }, + { + "entropy": 2.4439697265625, + "epoch": 0.028993741552818024, + "grad_norm": 0.718917727470398, + "learning_rate": 9.99646791082167e-06, + "loss": 2.4061, + "mean_token_accuracy": 0.4811778524890542, + "num_tokens": 244291993.0, + "step": 1684 + }, + { + "entropy": 2.394287109375, + "epoch": 0.029010958739013284, + "grad_norm": 0.6151554584503174, + "learning_rate": 9.996457561605105e-06, + "loss": 2.3617, + "mean_token_accuracy": 0.48714847723022103, + "num_tokens": 244435849.0, + "step": 1685 + }, + { + "entropy": 2.4591064453125, + "epoch": 0.02902817592520854, + "grad_norm": 0.5844502449035645, + "learning_rate": 9.996447197254195e-06, + "loss": 2.4036, + "mean_token_accuracy": 0.48159506963565946, + "num_tokens": 244580376.0, + "step": 1686 + }, + { + "entropy": 2.40625, + "epoch": 0.029045393111403802, + "grad_norm": 0.5591570734977722, + "learning_rate": 9.996436817768974e-06, + "loss": 2.3592, + "mean_token_accuracy": 0.49357689963653684, + "num_tokens": 244729260.0, + "step": 1687 + }, + { + "entropy": 2.4224853515625, + "epoch": 0.029062610297599063, + "grad_norm": 0.5616257190704346, + "learning_rate": 9.996426423149472e-06, + "loss": 2.3885, + "mean_token_accuracy": 0.482600009534508, + "num_tokens": 244872812.0, + "step": 1688 + }, + { + "entropy": 2.500244140625, + "epoch": 0.029079827483794323, + "grad_norm": 0.5827319622039795, + "learning_rate": 9.996416013395721e-06, + "loss": 2.4213, + "mean_token_accuracy": 0.47733526350930333, + "num_tokens": 245013545.0, + "step": 1689 + }, + { + "entropy": 2.4273681640625, + "epoch": 0.029097044669989584, + "grad_norm": 0.5836253762245178, + "learning_rate": 9.996405588507753e-06, + "loss": 2.4129, + "mean_token_accuracy": 0.48227705620229244, + "num_tokens": 245158448.0, + "step": 1690 + }, + { + "entropy": 2.41015625, + "epoch": 0.029114261856184844, + "grad_norm": 0.5588595271110535, + "learning_rate": 9.996395148485598e-06, + "loss": 2.3954, + "mean_token_accuracy": 0.48199733812361956, + "num_tokens": 245323743.0, + "step": 1691 + }, + { + "entropy": 2.431396484375, + "epoch": 0.029131479042380105, + "grad_norm": 0.585265576839447, + "learning_rate": 9.996384693329289e-06, + "loss": 2.3807, + "mean_token_accuracy": 0.4879599893465638, + "num_tokens": 245479102.0, + "step": 1692 + }, + { + "entropy": 2.47900390625, + "epoch": 0.029148696228575365, + "grad_norm": 0.5371046662330627, + "learning_rate": 9.996374223038858e-06, + "loss": 2.4331, + "mean_token_accuracy": 0.47392465313896537, + "num_tokens": 245627290.0, + "step": 1693 + }, + { + "entropy": 2.4324951171875, + "epoch": 0.029165913414770626, + "grad_norm": 0.608191967010498, + "learning_rate": 9.996363737614334e-06, + "loss": 2.3786, + "mean_token_accuracy": 0.4849713812582195, + "num_tokens": 245761478.0, + "step": 1694 + }, + { + "entropy": 2.5067138671875, + "epoch": 0.029183130600965883, + "grad_norm": 0.5639045834541321, + "learning_rate": 9.996353237055753e-06, + "loss": 2.4584, + "mean_token_accuracy": 0.47028431948274374, + "num_tokens": 245900244.0, + "step": 1695 + }, + { + "entropy": 2.4661865234375, + "epoch": 0.029200347787161143, + "grad_norm": 0.5640761852264404, + "learning_rate": 9.996342721363144e-06, + "loss": 2.4086, + "mean_token_accuracy": 0.47613369347527623, + "num_tokens": 246061900.0, + "step": 1696 + }, + { + "entropy": 2.413818359375, + "epoch": 0.029217564973356404, + "grad_norm": 0.5837756395339966, + "learning_rate": 9.996332190536538e-06, + "loss": 2.3655, + "mean_token_accuracy": 0.48909167293459177, + "num_tokens": 246203266.0, + "step": 1697 + }, + { + "entropy": 2.466064453125, + "epoch": 0.029234782159551664, + "grad_norm": 0.6410163044929504, + "learning_rate": 9.99632164457597e-06, + "loss": 2.4183, + "mean_token_accuracy": 0.47384981345385313, + "num_tokens": 246345295.0, + "step": 1698 + }, + { + "entropy": 2.474365234375, + "epoch": 0.029251999345746925, + "grad_norm": 0.5565051436424255, + "learning_rate": 9.99631108348147e-06, + "loss": 2.452, + "mean_token_accuracy": 0.4777741697616875, + "num_tokens": 246501317.0, + "step": 1699 + }, + { + "entropy": 2.4288330078125, + "epoch": 0.029269216531942185, + "grad_norm": 0.5428564548492432, + "learning_rate": 9.996300507253068e-06, + "loss": 2.3949, + "mean_token_accuracy": 0.48448564764112234, + "num_tokens": 246654551.0, + "step": 1700 + }, + { + "entropy": 2.4349365234375, + "epoch": 0.029286433718137446, + "grad_norm": 0.5566284656524658, + "learning_rate": 9.9962899158908e-06, + "loss": 2.4152, + "mean_token_accuracy": 0.4823923450894654, + "num_tokens": 246816439.0, + "step": 1701 + }, + { + "entropy": 2.48681640625, + "epoch": 0.029303650904332706, + "grad_norm": 0.5799583792686462, + "learning_rate": 9.996279309394697e-06, + "loss": 2.4778, + "mean_token_accuracy": 0.4676981116645038, + "num_tokens": 246961368.0, + "step": 1702 + }, + { + "entropy": 2.494140625, + "epoch": 0.029320868090527964, + "grad_norm": 0.5828601121902466, + "learning_rate": 9.99626868776479e-06, + "loss": 2.491, + "mean_token_accuracy": 0.4712865217588842, + "num_tokens": 247098743.0, + "step": 1703 + }, + { + "entropy": 2.4791259765625, + "epoch": 0.029338085276723224, + "grad_norm": 0.6346420645713806, + "learning_rate": 9.99625805100111e-06, + "loss": 2.462, + "mean_token_accuracy": 0.4806337282061577, + "num_tokens": 247245594.0, + "step": 1704 + }, + { + "entropy": 2.4534912109375, + "epoch": 0.029355302462918485, + "grad_norm": 0.5871755480766296, + "learning_rate": 9.996247399103693e-06, + "loss": 2.4171, + "mean_token_accuracy": 0.4820070252753794, + "num_tokens": 247385625.0, + "step": 1705 + }, + { + "entropy": 2.427001953125, + "epoch": 0.029372519649113745, + "grad_norm": 0.570570170879364, + "learning_rate": 9.996236732072568e-06, + "loss": 2.4239, + "mean_token_accuracy": 0.4824762148782611, + "num_tokens": 247535659.0, + "step": 1706 + }, + { + "entropy": 2.4500732421875, + "epoch": 0.029389736835309006, + "grad_norm": 0.5755301713943481, + "learning_rate": 9.99622604990777e-06, + "loss": 2.4254, + "mean_token_accuracy": 0.48341484228149056, + "num_tokens": 247679680.0, + "step": 1707 + }, + { + "entropy": 2.4388427734375, + "epoch": 0.029406954021504266, + "grad_norm": 0.5620051622390747, + "learning_rate": 9.996215352609327e-06, + "loss": 2.4294, + "mean_token_accuracy": 0.4801658443175256, + "num_tokens": 247823282.0, + "step": 1708 + }, + { + "entropy": 2.495361328125, + "epoch": 0.029424171207699527, + "grad_norm": 0.5927258133888245, + "learning_rate": 9.996204640177276e-06, + "loss": 2.4746, + "mean_token_accuracy": 0.47673043003305793, + "num_tokens": 247963838.0, + "step": 1709 + }, + { + "entropy": 2.48583984375, + "epoch": 0.029441388393894787, + "grad_norm": 0.620323657989502, + "learning_rate": 9.996193912611648e-06, + "loss": 2.4201, + "mean_token_accuracy": 0.4740843581967056, + "num_tokens": 248101252.0, + "step": 1710 + }, + { + "entropy": 2.49658203125, + "epoch": 0.029458605580090044, + "grad_norm": 0.5656887888908386, + "learning_rate": 9.996183169912474e-06, + "loss": 2.4453, + "mean_token_accuracy": 0.4755016630515456, + "num_tokens": 248251659.0, + "step": 1711 + }, + { + "entropy": 2.44482421875, + "epoch": 0.029475822766285305, + "grad_norm": 0.5744235515594482, + "learning_rate": 9.996172412079788e-06, + "loss": 2.4092, + "mean_token_accuracy": 0.4831536212004721, + "num_tokens": 248394514.0, + "step": 1712 + }, + { + "entropy": 2.4427490234375, + "epoch": 0.029493039952480565, + "grad_norm": 0.5855416059494019, + "learning_rate": 9.996161639113622e-06, + "loss": 2.399, + "mean_token_accuracy": 0.48080876702442765, + "num_tokens": 248535417.0, + "step": 1713 + }, + { + "entropy": 2.4171142578125, + "epoch": 0.029510257138675826, + "grad_norm": 0.6481823325157166, + "learning_rate": 9.99615085101401e-06, + "loss": 2.3865, + "mean_token_accuracy": 0.48705816362053156, + "num_tokens": 248672896.0, + "step": 1714 + }, + { + "entropy": 2.397216796875, + "epoch": 0.029527474324871086, + "grad_norm": 0.525701642036438, + "learning_rate": 9.996140047780983e-06, + "loss": 2.3877, + "mean_token_accuracy": 0.48106732219457626, + "num_tokens": 248838851.0, + "step": 1715 + }, + { + "entropy": 2.412353515625, + "epoch": 0.029544691511066347, + "grad_norm": 0.5457813143730164, + "learning_rate": 9.996129229414573e-06, + "loss": 2.362, + "mean_token_accuracy": 0.4910212284885347, + "num_tokens": 249000921.0, + "step": 1716 + }, + { + "entropy": 2.3861083984375, + "epoch": 0.029561908697261607, + "grad_norm": 0.5792080163955688, + "learning_rate": 9.996118395914816e-06, + "loss": 2.3344, + "mean_token_accuracy": 0.4931281958706677, + "num_tokens": 249142323.0, + "step": 1717 + }, + { + "entropy": 2.4461669921875, + "epoch": 0.029579125883456868, + "grad_norm": 0.5751327872276306, + "learning_rate": 9.996107547281741e-06, + "loss": 2.4195, + "mean_token_accuracy": 0.4737996533513069, + "num_tokens": 249273907.0, + "step": 1718 + }, + { + "entropy": 2.50390625, + "epoch": 0.02959634306965213, + "grad_norm": 0.5305837988853455, + "learning_rate": 9.996096683515385e-06, + "loss": 2.4582, + "mean_token_accuracy": 0.47129624895751476, + "num_tokens": 249420011.0, + "step": 1719 + }, + { + "entropy": 2.50146484375, + "epoch": 0.029613560255847386, + "grad_norm": 0.5615363121032715, + "learning_rate": 9.996085804615776e-06, + "loss": 2.4453, + "mean_token_accuracy": 0.4726092144846916, + "num_tokens": 249577874.0, + "step": 1720 + }, + { + "entropy": 2.4356689453125, + "epoch": 0.029630777442042646, + "grad_norm": 0.584879457950592, + "learning_rate": 9.99607491058295e-06, + "loss": 2.3874, + "mean_token_accuracy": 0.4838989768177271, + "num_tokens": 249733995.0, + "step": 1721 + }, + { + "entropy": 2.4276123046875, + "epoch": 0.029647994628237907, + "grad_norm": 0.6611423492431641, + "learning_rate": 9.996064001416943e-06, + "loss": 2.3991, + "mean_token_accuracy": 0.4866844713687897, + "num_tokens": 249863834.0, + "step": 1722 + }, + { + "entropy": 2.387939453125, + "epoch": 0.029665211814433167, + "grad_norm": 0.5828907489776611, + "learning_rate": 9.996053077117781e-06, + "loss": 2.35, + "mean_token_accuracy": 0.48920594388619065, + "num_tokens": 249999433.0, + "step": 1723 + }, + { + "entropy": 2.5263671875, + "epoch": 0.029682429000628428, + "grad_norm": 0.6122536063194275, + "learning_rate": 9.996042137685502e-06, + "loss": 2.5203, + "mean_token_accuracy": 0.4699005661532283, + "num_tokens": 250134401.0, + "step": 1724 + }, + { + "entropy": 2.44482421875, + "epoch": 0.029699646186823688, + "grad_norm": 0.6737586259841919, + "learning_rate": 9.996031183120138e-06, + "loss": 2.3536, + "mean_token_accuracy": 0.48844635486602783, + "num_tokens": 250292079.0, + "step": 1725 + }, + { + "entropy": 2.47119140625, + "epoch": 0.02971686337301895, + "grad_norm": 0.5378682613372803, + "learning_rate": 9.996020213421722e-06, + "loss": 2.4471, + "mean_token_accuracy": 0.48123562429100275, + "num_tokens": 250449143.0, + "step": 1726 + }, + { + "entropy": 2.448974609375, + "epoch": 0.02973408055921421, + "grad_norm": 0.5978697538375854, + "learning_rate": 9.996009228590286e-06, + "loss": 2.4195, + "mean_token_accuracy": 0.48353995755314827, + "num_tokens": 250581816.0, + "step": 1727 + }, + { + "entropy": 2.515869140625, + "epoch": 0.029751297745409466, + "grad_norm": 0.5631060004234314, + "learning_rate": 9.995998228625868e-06, + "loss": 2.4653, + "mean_token_accuracy": 0.473801807500422, + "num_tokens": 250717401.0, + "step": 1728 + }, + { + "entropy": 2.482421875, + "epoch": 0.029768514931604727, + "grad_norm": 0.5445279479026794, + "learning_rate": 9.995987213528493e-06, + "loss": 2.4938, + "mean_token_accuracy": 0.47592513309791684, + "num_tokens": 250867240.0, + "step": 1729 + }, + { + "entropy": 2.5250244140625, + "epoch": 0.029785732117799987, + "grad_norm": 0.5898825526237488, + "learning_rate": 9.995976183298204e-06, + "loss": 2.4803, + "mean_token_accuracy": 0.4719364200718701, + "num_tokens": 251016870.0, + "step": 1730 + }, + { + "entropy": 2.4178466796875, + "epoch": 0.029802949303995248, + "grad_norm": 0.5892136096954346, + "learning_rate": 9.995965137935026e-06, + "loss": 2.3497, + "mean_token_accuracy": 0.48639260046184063, + "num_tokens": 251159320.0, + "step": 1731 + }, + { + "entropy": 2.41259765625, + "epoch": 0.02982016649019051, + "grad_norm": 0.6107653379440308, + "learning_rate": 9.995954077439e-06, + "loss": 2.3769, + "mean_token_accuracy": 0.4900634833611548, + "num_tokens": 251294730.0, + "step": 1732 + }, + { + "entropy": 2.41552734375, + "epoch": 0.02983738367638577, + "grad_norm": 0.5585425496101379, + "learning_rate": 9.995943001810151e-06, + "loss": 2.3654, + "mean_token_accuracy": 0.48663356387987733, + "num_tokens": 251438812.0, + "step": 1733 + }, + { + "entropy": 2.45751953125, + "epoch": 0.02985460086258103, + "grad_norm": 0.5406437516212463, + "learning_rate": 9.99593191104852e-06, + "loss": 2.4438, + "mean_token_accuracy": 0.4822356994263828, + "num_tokens": 251597411.0, + "step": 1734 + }, + { + "entropy": 2.44091796875, + "epoch": 0.02987181804877629, + "grad_norm": 0.5275185704231262, + "learning_rate": 9.995920805154136e-06, + "loss": 2.3614, + "mean_token_accuracy": 0.4821869912557304, + "num_tokens": 251751924.0, + "step": 1735 + }, + { + "entropy": 2.424560546875, + "epoch": 0.029889035234971547, + "grad_norm": 0.5398510694503784, + "learning_rate": 9.995909684127036e-06, + "loss": 2.3199, + "mean_token_accuracy": 0.4816825478337705, + "num_tokens": 251897737.0, + "step": 1736 + }, + { + "entropy": 2.421142578125, + "epoch": 0.029906252421166808, + "grad_norm": 0.5590670704841614, + "learning_rate": 9.995898547967252e-06, + "loss": 2.3626, + "mean_token_accuracy": 0.49043199233710766, + "num_tokens": 252047294.0, + "step": 1737 + }, + { + "entropy": 2.443359375, + "epoch": 0.029923469607362068, + "grad_norm": 0.5771318078041077, + "learning_rate": 9.995887396674816e-06, + "loss": 2.4432, + "mean_token_accuracy": 0.4781417637132108, + "num_tokens": 252192301.0, + "step": 1738 + }, + { + "entropy": 2.5240478515625, + "epoch": 0.02994068679355733, + "grad_norm": 0.5523571968078613, + "learning_rate": 9.995876230249765e-06, + "loss": 2.4913, + "mean_token_accuracy": 0.4711290653795004, + "num_tokens": 252343478.0, + "step": 1739 + }, + { + "entropy": 2.466796875, + "epoch": 0.02995790397975259, + "grad_norm": 0.5873834490776062, + "learning_rate": 9.99586504869213e-06, + "loss": 2.4351, + "mean_token_accuracy": 0.4716539951041341, + "num_tokens": 252494029.0, + "step": 1740 + }, + { + "entropy": 2.42822265625, + "epoch": 0.02997512116594785, + "grad_norm": 0.6091635227203369, + "learning_rate": 9.99585385200195e-06, + "loss": 2.3937, + "mean_token_accuracy": 0.4848588118329644, + "num_tokens": 252644102.0, + "step": 1741 + }, + { + "entropy": 2.36083984375, + "epoch": 0.02999233835214311, + "grad_norm": 0.6105889678001404, + "learning_rate": 9.995842640179251e-06, + "loss": 2.3327, + "mean_token_accuracy": 0.4923085803166032, + "num_tokens": 252785353.0, + "step": 1742 + }, + { + "entropy": 2.4542236328125, + "epoch": 0.03000955553833837, + "grad_norm": 0.5387972593307495, + "learning_rate": 9.995831413224073e-06, + "loss": 2.3902, + "mean_token_accuracy": 0.48074496537446976, + "num_tokens": 252931252.0, + "step": 1743 + }, + { + "entropy": 2.4627685546875, + "epoch": 0.03002677272453363, + "grad_norm": 0.588346540927887, + "learning_rate": 9.995820171136447e-06, + "loss": 2.4444, + "mean_token_accuracy": 0.4807250229641795, + "num_tokens": 253081891.0, + "step": 1744 + }, + { + "entropy": 2.404052734375, + "epoch": 0.030043989910728888, + "grad_norm": 0.6005964279174805, + "learning_rate": 9.995808913916409e-06, + "loss": 2.3296, + "mean_token_accuracy": 0.4894480253569782, + "num_tokens": 253227000.0, + "step": 1745 + }, + { + "entropy": 2.4176025390625, + "epoch": 0.03006120709692415, + "grad_norm": 0.5213177800178528, + "learning_rate": 9.995797641563993e-06, + "loss": 2.3511, + "mean_token_accuracy": 0.4866582774557173, + "num_tokens": 253382573.0, + "step": 1746 + }, + { + "entropy": 2.4058837890625, + "epoch": 0.03007842428311941, + "grad_norm": 0.6120131015777588, + "learning_rate": 9.995786354079232e-06, + "loss": 2.3611, + "mean_token_accuracy": 0.48763601342216134, + "num_tokens": 253516221.0, + "step": 1747 + }, + { + "entropy": 2.4691162109375, + "epoch": 0.03009564146931467, + "grad_norm": 0.5826076865196228, + "learning_rate": 9.99577505146216e-06, + "loss": 2.4718, + "mean_token_accuracy": 0.4789294474758208, + "num_tokens": 253656220.0, + "step": 1748 + }, + { + "entropy": 2.5205078125, + "epoch": 0.03011285865550993, + "grad_norm": 0.6555241346359253, + "learning_rate": 9.995763733712813e-06, + "loss": 2.4926, + "mean_token_accuracy": 0.47109587537124753, + "num_tokens": 253792424.0, + "step": 1749 + }, + { + "entropy": 2.447998046875, + "epoch": 0.03013007584170519, + "grad_norm": 1.2172026634216309, + "learning_rate": 9.995752400831224e-06, + "loss": 2.3486, + "mean_token_accuracy": 0.4864461640827358, + "num_tokens": 253936774.0, + "step": 1750 + }, + { + "entropy": 2.4464111328125, + "epoch": 0.03014729302790045, + "grad_norm": 0.5619203448295593, + "learning_rate": 9.995741052817426e-06, + "loss": 2.4002, + "mean_token_accuracy": 0.4801822490990162, + "num_tokens": 254079519.0, + "step": 1751 + }, + { + "entropy": 2.40087890625, + "epoch": 0.030164510214095712, + "grad_norm": 0.6061223745346069, + "learning_rate": 9.995729689671457e-06, + "loss": 2.3576, + "mean_token_accuracy": 0.49237477174028754, + "num_tokens": 254218671.0, + "step": 1752 + }, + { + "entropy": 2.4599609375, + "epoch": 0.03018172740029097, + "grad_norm": 0.5428354740142822, + "learning_rate": 9.995718311393348e-06, + "loss": 2.4407, + "mean_token_accuracy": 0.47417072020471096, + "num_tokens": 254366027.0, + "step": 1753 + }, + { + "entropy": 2.444580078125, + "epoch": 0.03019894458648623, + "grad_norm": 0.6137836575508118, + "learning_rate": 9.995706917983135e-06, + "loss": 2.3982, + "mean_token_accuracy": 0.4815092282369733, + "num_tokens": 254500348.0, + "step": 1754 + }, + { + "entropy": 2.4688720703125, + "epoch": 0.03021616177268149, + "grad_norm": 0.5226655602455139, + "learning_rate": 9.99569550944085e-06, + "loss": 2.4489, + "mean_token_accuracy": 0.47875166358426213, + "num_tokens": 254665949.0, + "step": 1755 + }, + { + "entropy": 2.4661865234375, + "epoch": 0.03023337895887675, + "grad_norm": 0.5874465107917786, + "learning_rate": 9.995684085766532e-06, + "loss": 2.4685, + "mean_token_accuracy": 0.47104707127436996, + "num_tokens": 254821526.0, + "step": 1756 + }, + { + "entropy": 2.3651123046875, + "epoch": 0.03025059614507201, + "grad_norm": 0.5636343955993652, + "learning_rate": 9.995672646960214e-06, + "loss": 2.3245, + "mean_token_accuracy": 0.49339101929217577, + "num_tokens": 254965888.0, + "step": 1757 + }, + { + "entropy": 2.449951171875, + "epoch": 0.03026781333126727, + "grad_norm": 0.5968552231788635, + "learning_rate": 9.995661193021929e-06, + "loss": 2.4269, + "mean_token_accuracy": 0.47896120324730873, + "num_tokens": 255097221.0, + "step": 1758 + }, + { + "entropy": 2.50341796875, + "epoch": 0.030285030517462532, + "grad_norm": 0.6104995012283325, + "learning_rate": 9.995649723951713e-06, + "loss": 2.4343, + "mean_token_accuracy": 0.48089546570554376, + "num_tokens": 255240193.0, + "step": 1759 + }, + { + "entropy": 2.516357421875, + "epoch": 0.030302247703657793, + "grad_norm": 0.6157956719398499, + "learning_rate": 9.995638239749601e-06, + "loss": 2.4065, + "mean_token_accuracy": 0.47861207462847233, + "num_tokens": 255377884.0, + "step": 1760 + }, + { + "entropy": 2.451416015625, + "epoch": 0.03031946488985305, + "grad_norm": 0.5781038403511047, + "learning_rate": 9.995626740415627e-06, + "loss": 2.3953, + "mean_token_accuracy": 0.4827661495655775, + "num_tokens": 255545815.0, + "step": 1761 + }, + { + "entropy": 2.41162109375, + "epoch": 0.03033668207604831, + "grad_norm": 0.5962122678756714, + "learning_rate": 9.995615225949826e-06, + "loss": 2.3384, + "mean_token_accuracy": 0.48970350436866283, + "num_tokens": 255687102.0, + "step": 1762 + }, + { + "entropy": 2.56103515625, + "epoch": 0.03035389926224357, + "grad_norm": 0.556058943271637, + "learning_rate": 9.995603696352231e-06, + "loss": 2.5805, + "mean_token_accuracy": 0.46257754508405924, + "num_tokens": 255832845.0, + "step": 1763 + }, + { + "entropy": 2.5029296875, + "epoch": 0.03037111644843883, + "grad_norm": 0.5889034867286682, + "learning_rate": 9.995592151622881e-06, + "loss": 2.4863, + "mean_token_accuracy": 0.4708904637955129, + "num_tokens": 255966812.0, + "step": 1764 + }, + { + "entropy": 2.4873046875, + "epoch": 0.030388333634634092, + "grad_norm": 0.564799964427948, + "learning_rate": 9.99558059176181e-06, + "loss": 2.4345, + "mean_token_accuracy": 0.47310470743104815, + "num_tokens": 256115280.0, + "step": 1765 + }, + { + "entropy": 2.46533203125, + "epoch": 0.030405550820829352, + "grad_norm": 0.5584237575531006, + "learning_rate": 9.995569016769052e-06, + "loss": 2.4093, + "mean_token_accuracy": 0.48134846799075603, + "num_tokens": 256262835.0, + "step": 1766 + }, + { + "entropy": 2.4100341796875, + "epoch": 0.030422768007024613, + "grad_norm": 0.6054108142852783, + "learning_rate": 9.99555742664464e-06, + "loss": 2.3842, + "mean_token_accuracy": 0.4840676193125546, + "num_tokens": 256412903.0, + "step": 1767 + }, + { + "entropy": 2.4122314453125, + "epoch": 0.030439985193219873, + "grad_norm": 0.5872666835784912, + "learning_rate": 9.99554582138861e-06, + "loss": 2.4073, + "mean_token_accuracy": 0.4819220928475261, + "num_tokens": 256556383.0, + "step": 1768 + }, + { + "entropy": 2.4541015625, + "epoch": 0.030457202379415134, + "grad_norm": 0.6415526270866394, + "learning_rate": 9.995534201001e-06, + "loss": 2.4326, + "mean_token_accuracy": 0.48358411993831396, + "num_tokens": 256701332.0, + "step": 1769 + }, + { + "entropy": 2.5523681640625, + "epoch": 0.03047441956561039, + "grad_norm": 0.5595957040786743, + "learning_rate": 9.995522565481843e-06, + "loss": 2.5369, + "mean_token_accuracy": 0.46628848975524306, + "num_tokens": 256846217.0, + "step": 1770 + }, + { + "entropy": 2.4697265625, + "epoch": 0.03049163675180565, + "grad_norm": 0.5512428879737854, + "learning_rate": 9.995510914831175e-06, + "loss": 2.4261, + "mean_token_accuracy": 0.4768143934197724, + "num_tokens": 256989853.0, + "step": 1771 + }, + { + "entropy": 2.4036865234375, + "epoch": 0.030508853938000912, + "grad_norm": 0.5823377370834351, + "learning_rate": 9.995499249049032e-06, + "loss": 2.3555, + "mean_token_accuracy": 0.4956060196273029, + "num_tokens": 257135041.0, + "step": 1772 + }, + { + "entropy": 2.4488525390625, + "epoch": 0.030526071124196173, + "grad_norm": 0.555253803730011, + "learning_rate": 9.995487568135447e-06, + "loss": 2.4463, + "mean_token_accuracy": 0.4722318626008928, + "num_tokens": 257277086.0, + "step": 1773 + }, + { + "entropy": 2.419921875, + "epoch": 0.030543288310391433, + "grad_norm": 0.6303548216819763, + "learning_rate": 9.995475872090456e-06, + "loss": 2.3485, + "mean_token_accuracy": 0.48873788258060813, + "num_tokens": 257419673.0, + "step": 1774 + }, + { + "entropy": 2.552490234375, + "epoch": 0.030560505496586694, + "grad_norm": 0.5881249904632568, + "learning_rate": 9.995464160914096e-06, + "loss": 2.5034, + "mean_token_accuracy": 0.4676412958651781, + "num_tokens": 257552483.0, + "step": 1775 + }, + { + "entropy": 2.4627685546875, + "epoch": 0.030577722682781954, + "grad_norm": 0.5897058248519897, + "learning_rate": 9.995452434606401e-06, + "loss": 2.3792, + "mean_token_accuracy": 0.4818587265908718, + "num_tokens": 257696825.0, + "step": 1776 + }, + { + "entropy": 2.431396484375, + "epoch": 0.030594939868977215, + "grad_norm": 0.5501959919929504, + "learning_rate": 9.995440693167407e-06, + "loss": 2.3885, + "mean_token_accuracy": 0.4862223519012332, + "num_tokens": 257852558.0, + "step": 1777 + }, + { + "entropy": 2.48681640625, + "epoch": 0.03061215705517247, + "grad_norm": 0.5968089699745178, + "learning_rate": 9.99542893659715e-06, + "loss": 2.4541, + "mean_token_accuracy": 0.4725658977404237, + "num_tokens": 257972949.0, + "step": 1778 + }, + { + "entropy": 2.490966796875, + "epoch": 0.030629374241367732, + "grad_norm": 0.5689330101013184, + "learning_rate": 9.995417164895665e-06, + "loss": 2.4556, + "mean_token_accuracy": 0.4732924662530422, + "num_tokens": 258118477.0, + "step": 1779 + }, + { + "entropy": 2.4151611328125, + "epoch": 0.030646591427562993, + "grad_norm": 0.6031442880630493, + "learning_rate": 9.99540537806299e-06, + "loss": 2.4075, + "mean_token_accuracy": 0.47733619809150696, + "num_tokens": 258262583.0, + "step": 1780 + }, + { + "entropy": 2.418701171875, + "epoch": 0.030663808613758253, + "grad_norm": 0.5307676792144775, + "learning_rate": 9.995393576099156e-06, + "loss": 2.4052, + "mean_token_accuracy": 0.48428564984351397, + "num_tokens": 258415703.0, + "step": 1781 + }, + { + "entropy": 2.466552734375, + "epoch": 0.030681025799953514, + "grad_norm": 0.613979697227478, + "learning_rate": 9.995381759004202e-06, + "loss": 2.4284, + "mean_token_accuracy": 0.48056693421676755, + "num_tokens": 258546742.0, + "step": 1782 + }, + { + "entropy": 2.5028076171875, + "epoch": 0.030698242986148774, + "grad_norm": 0.6007286906242371, + "learning_rate": 9.995369926778163e-06, + "loss": 2.4515, + "mean_token_accuracy": 0.4713759426958859, + "num_tokens": 258667720.0, + "step": 1783 + }, + { + "entropy": 2.459228515625, + "epoch": 0.030715460172344035, + "grad_norm": 0.5262390375137329, + "learning_rate": 9.995358079421076e-06, + "loss": 2.4149, + "mean_token_accuracy": 0.476927958894521, + "num_tokens": 258818608.0, + "step": 1784 + }, + { + "entropy": 2.476318359375, + "epoch": 0.030732677358539295, + "grad_norm": 0.6184121370315552, + "learning_rate": 9.995346216932974e-06, + "loss": 2.4888, + "mean_token_accuracy": 0.47189527517184615, + "num_tokens": 258947687.0, + "step": 1785 + }, + { + "entropy": 2.421630859375, + "epoch": 0.030749894544734552, + "grad_norm": 0.641552209854126, + "learning_rate": 9.995334339313898e-06, + "loss": 2.3801, + "mean_token_accuracy": 0.48330466262996197, + "num_tokens": 259089154.0, + "step": 1786 + }, + { + "entropy": 2.3883056640625, + "epoch": 0.030767111730929813, + "grad_norm": 0.573010265827179, + "learning_rate": 9.995322446563878e-06, + "loss": 2.3846, + "mean_token_accuracy": 0.48760603182017803, + "num_tokens": 259231925.0, + "step": 1787 + }, + { + "entropy": 2.4847412109375, + "epoch": 0.030784328917125073, + "grad_norm": 0.5725507736206055, + "learning_rate": 9.995310538682954e-06, + "loss": 2.4799, + "mean_token_accuracy": 0.4728474370203912, + "num_tokens": 259374698.0, + "step": 1788 + }, + { + "entropy": 2.4398193359375, + "epoch": 0.030801546103320334, + "grad_norm": 0.681101381778717, + "learning_rate": 9.995298615671161e-06, + "loss": 2.4197, + "mean_token_accuracy": 0.4821738447062671, + "num_tokens": 259524367.0, + "step": 1789 + }, + { + "entropy": 2.400146484375, + "epoch": 0.030818763289515595, + "grad_norm": 0.5721313953399658, + "learning_rate": 9.995286677528533e-06, + "loss": 2.363, + "mean_token_accuracy": 0.48788571590557694, + "num_tokens": 259678466.0, + "step": 1790 + }, + { + "entropy": 2.4361572265625, + "epoch": 0.030835980475710855, + "grad_norm": 0.6100559234619141, + "learning_rate": 9.995274724255111e-06, + "loss": 2.4359, + "mean_token_accuracy": 0.48224303452298045, + "num_tokens": 259822604.0, + "step": 1791 + }, + { + "entropy": 2.4278564453125, + "epoch": 0.030853197661906116, + "grad_norm": 0.5751356482505798, + "learning_rate": 9.995262755850926e-06, + "loss": 2.4072, + "mean_token_accuracy": 0.48218475840985775, + "num_tokens": 259965240.0, + "step": 1792 + }, + { + "entropy": 2.448974609375, + "epoch": 0.030870414848101376, + "grad_norm": 0.6143491864204407, + "learning_rate": 9.995250772316019e-06, + "loss": 2.4289, + "mean_token_accuracy": 0.4808902507647872, + "num_tokens": 260105881.0, + "step": 1793 + }, + { + "entropy": 2.53466796875, + "epoch": 0.030887632034296633, + "grad_norm": 0.6435692310333252, + "learning_rate": 9.995238773650422e-06, + "loss": 2.5116, + "mean_token_accuracy": 0.46682596672326326, + "num_tokens": 260232986.0, + "step": 1794 + }, + { + "entropy": 2.459716796875, + "epoch": 0.030904849220491894, + "grad_norm": 0.5735113620758057, + "learning_rate": 9.995226759854173e-06, + "loss": 2.4054, + "mean_token_accuracy": 0.4811546648852527, + "num_tokens": 260366767.0, + "step": 1795 + }, + { + "entropy": 2.465576171875, + "epoch": 0.030922066406687154, + "grad_norm": 1.1088193655014038, + "learning_rate": 9.995214730927309e-06, + "loss": 2.4319, + "mean_token_accuracy": 0.47814410272985697, + "num_tokens": 260518664.0, + "step": 1796 + }, + { + "entropy": 2.4351806640625, + "epoch": 0.030939283592882415, + "grad_norm": 0.6060841083526611, + "learning_rate": 9.995202686869867e-06, + "loss": 2.4261, + "mean_token_accuracy": 0.48954142862930894, + "num_tokens": 260665493.0, + "step": 1797 + }, + { + "entropy": 2.4580078125, + "epoch": 0.030956500779077675, + "grad_norm": 0.5547589659690857, + "learning_rate": 9.995190627681883e-06, + "loss": 2.4379, + "mean_token_accuracy": 0.476859278511256, + "num_tokens": 260813391.0, + "step": 1798 + }, + { + "entropy": 2.43603515625, + "epoch": 0.030973717965272936, + "grad_norm": 0.6191805601119995, + "learning_rate": 9.995178553363392e-06, + "loss": 2.4344, + "mean_token_accuracy": 0.47961972700431943, + "num_tokens": 260955219.0, + "step": 1799 + }, + { + "entropy": 2.4512939453125, + "epoch": 0.030990935151468196, + "grad_norm": 0.5580746531486511, + "learning_rate": 9.995166463914432e-06, + "loss": 2.3881, + "mean_token_accuracy": 0.4819506905041635, + "num_tokens": 261107398.0, + "step": 1800 + }, + { + "entropy": 2.4354248046875, + "epoch": 0.031008152337663457, + "grad_norm": 0.5572928190231323, + "learning_rate": 9.99515435933504e-06, + "loss": 2.4291, + "mean_token_accuracy": 0.47748745791614056, + "num_tokens": 261258885.0, + "step": 1801 + }, + { + "entropy": 2.4228515625, + "epoch": 0.031025369523858717, + "grad_norm": 0.5744470953941345, + "learning_rate": 9.99514223962525e-06, + "loss": 2.4143, + "mean_token_accuracy": 0.4853782169520855, + "num_tokens": 261408577.0, + "step": 1802 + }, + { + "entropy": 2.5040283203125, + "epoch": 0.031042586710053974, + "grad_norm": 0.5782204866409302, + "learning_rate": 9.995130104785103e-06, + "loss": 2.4765, + "mean_token_accuracy": 0.47027752734720707, + "num_tokens": 261547340.0, + "step": 1803 + }, + { + "entropy": 2.4500732421875, + "epoch": 0.031059803896249235, + "grad_norm": 0.6056774258613586, + "learning_rate": 9.995117954814632e-06, + "loss": 2.4059, + "mean_token_accuracy": 0.47932664630934596, + "num_tokens": 261680759.0, + "step": 1804 + }, + { + "entropy": 2.4530029296875, + "epoch": 0.031077021082444495, + "grad_norm": 0.5747044682502747, + "learning_rate": 9.995105789713874e-06, + "loss": 2.407, + "mean_token_accuracy": 0.4791347337886691, + "num_tokens": 261839318.0, + "step": 1805 + }, + { + "entropy": 2.420654296875, + "epoch": 0.031094238268639756, + "grad_norm": 0.5714244246482849, + "learning_rate": 9.995093609482868e-06, + "loss": 2.3137, + "mean_token_accuracy": 0.4866795940324664, + "num_tokens": 261983899.0, + "step": 1806 + }, + { + "entropy": 2.45361328125, + "epoch": 0.031111455454835017, + "grad_norm": 0.6132124066352844, + "learning_rate": 9.995081414121652e-06, + "loss": 2.3875, + "mean_token_accuracy": 0.4830195610411465, + "num_tokens": 262123814.0, + "step": 1807 + }, + { + "entropy": 2.4935302734375, + "epoch": 0.031128672641030277, + "grad_norm": 0.5878351330757141, + "learning_rate": 9.995069203630258e-06, + "loss": 2.4513, + "mean_token_accuracy": 0.4727471759542823, + "num_tokens": 262264324.0, + "step": 1808 + }, + { + "entropy": 2.50146484375, + "epoch": 0.031145889827225538, + "grad_norm": 0.8043968081474304, + "learning_rate": 9.995056978008728e-06, + "loss": 2.4545, + "mean_token_accuracy": 0.47300353879109025, + "num_tokens": 262404423.0, + "step": 1809 + }, + { + "entropy": 2.419677734375, + "epoch": 0.031163107013420798, + "grad_norm": 0.5692616105079651, + "learning_rate": 9.995044737257097e-06, + "loss": 2.3788, + "mean_token_accuracy": 0.4819628787226975, + "num_tokens": 262554058.0, + "step": 1810 + }, + { + "entropy": 2.4549560546875, + "epoch": 0.031180324199616055, + "grad_norm": 0.6038571000099182, + "learning_rate": 9.995032481375401e-06, + "loss": 2.4808, + "mean_token_accuracy": 0.4799831509590149, + "num_tokens": 262687962.0, + "step": 1811 + }, + { + "entropy": 2.389404296875, + "epoch": 0.031197541385811316, + "grad_norm": 0.6061134338378906, + "learning_rate": 9.995020210363678e-06, + "loss": 2.3422, + "mean_token_accuracy": 0.49405871424824, + "num_tokens": 262821281.0, + "step": 1812 + }, + { + "entropy": 2.488037109375, + "epoch": 0.031214758572006576, + "grad_norm": 0.5849748849868774, + "learning_rate": 9.995007924221966e-06, + "loss": 2.4825, + "mean_token_accuracy": 0.4694899981841445, + "num_tokens": 262963950.0, + "step": 1813 + }, + { + "entropy": 2.513671875, + "epoch": 0.031231975758201837, + "grad_norm": 0.5811004638671875, + "learning_rate": 9.994995622950301e-06, + "loss": 2.4641, + "mean_token_accuracy": 0.4680064986459911, + "num_tokens": 263089602.0, + "step": 1814 + }, + { + "entropy": 2.4034423828125, + "epoch": 0.031249192944397097, + "grad_norm": 0.5289223790168762, + "learning_rate": 9.994983306548722e-06, + "loss": 2.3548, + "mean_token_accuracy": 0.48990146769210696, + "num_tokens": 263262419.0, + "step": 1815 + }, + { + "entropy": 2.536865234375, + "epoch": 0.03126641013059236, + "grad_norm": 0.5391739010810852, + "learning_rate": 9.994970975017264e-06, + "loss": 2.5102, + "mean_token_accuracy": 0.46705460641533136, + "num_tokens": 263415410.0, + "step": 1816 + }, + { + "entropy": 2.4627685546875, + "epoch": 0.03128362731678762, + "grad_norm": 0.5562330484390259, + "learning_rate": 9.994958628355964e-06, + "loss": 2.4007, + "mean_token_accuracy": 0.4822265561670065, + "num_tokens": 263560708.0, + "step": 1817 + }, + { + "entropy": 2.5174560546875, + "epoch": 0.03130084450298288, + "grad_norm": 0.589061975479126, + "learning_rate": 9.994946266564862e-06, + "loss": 2.4517, + "mean_token_accuracy": 0.4671993814408779, + "num_tokens": 263706338.0, + "step": 1818 + }, + { + "entropy": 2.431396484375, + "epoch": 0.03131806168917814, + "grad_norm": 0.5534610152244568, + "learning_rate": 9.994933889643995e-06, + "loss": 2.4091, + "mean_token_accuracy": 0.4794508134946227, + "num_tokens": 263846694.0, + "step": 1819 + }, + { + "entropy": 2.5390625, + "epoch": 0.0313352788753734, + "grad_norm": 0.691927969455719, + "learning_rate": 9.9949214975934e-06, + "loss": 2.5067, + "mean_token_accuracy": 0.46535237692296505, + "num_tokens": 263993842.0, + "step": 1820 + }, + { + "entropy": 2.4317626953125, + "epoch": 0.03135249606156866, + "grad_norm": 0.5267771482467651, + "learning_rate": 9.994909090413113e-06, + "loss": 2.3885, + "mean_token_accuracy": 0.48383715003728867, + "num_tokens": 264149235.0, + "step": 1821 + }, + { + "entropy": 2.4241943359375, + "epoch": 0.03136971324776392, + "grad_norm": 0.5582144856452942, + "learning_rate": 9.994896668103174e-06, + "loss": 2.4032, + "mean_token_accuracy": 0.48607371421530843, + "num_tokens": 264305264.0, + "step": 1822 + }, + { + "entropy": 2.4718017578125, + "epoch": 0.03138693043395918, + "grad_norm": 0.5681629180908203, + "learning_rate": 9.994884230663619e-06, + "loss": 2.4183, + "mean_token_accuracy": 0.47495387122035027, + "num_tokens": 264445168.0, + "step": 1823 + }, + { + "entropy": 2.498779296875, + "epoch": 0.031404147620154435, + "grad_norm": 0.5814370512962341, + "learning_rate": 9.994871778094486e-06, + "loss": 2.4998, + "mean_token_accuracy": 0.475694193970412, + "num_tokens": 264584998.0, + "step": 1824 + }, + { + "entropy": 2.4818115234375, + "epoch": 0.031421364806349696, + "grad_norm": 0.5599236488342285, + "learning_rate": 9.994859310395812e-06, + "loss": 2.4159, + "mean_token_accuracy": 0.47769313864409924, + "num_tokens": 264735369.0, + "step": 1825 + }, + { + "entropy": 2.4952392578125, + "epoch": 0.031438581992544956, + "grad_norm": 0.5620182156562805, + "learning_rate": 9.994846827567638e-06, + "loss": 2.481, + "mean_token_accuracy": 0.4710625964216888, + "num_tokens": 264876244.0, + "step": 1826 + }, + { + "entropy": 2.4227294921875, + "epoch": 0.03145579917874022, + "grad_norm": 0.5782105326652527, + "learning_rate": 9.994834329609997e-06, + "loss": 2.3754, + "mean_token_accuracy": 0.4855298362672329, + "num_tokens": 265015068.0, + "step": 1827 + }, + { + "entropy": 2.49755859375, + "epoch": 0.03147301636493548, + "grad_norm": 0.578761637210846, + "learning_rate": 9.994821816522931e-06, + "loss": 2.4221, + "mean_token_accuracy": 0.47399685718119144, + "num_tokens": 265142077.0, + "step": 1828 + }, + { + "entropy": 2.4862060546875, + "epoch": 0.03149023355113074, + "grad_norm": 0.5534428954124451, + "learning_rate": 9.994809288306475e-06, + "loss": 2.4853, + "mean_token_accuracy": 0.47513916390016675, + "num_tokens": 265286452.0, + "step": 1829 + }, + { + "entropy": 2.52734375, + "epoch": 0.031507450737326, + "grad_norm": 0.5408220887184143, + "learning_rate": 9.99479674496067e-06, + "loss": 2.4523, + "mean_token_accuracy": 0.47395798983052373, + "num_tokens": 265441215.0, + "step": 1830 + }, + { + "entropy": 2.4600830078125, + "epoch": 0.03152466792352126, + "grad_norm": 0.5792752504348755, + "learning_rate": 9.994784186485551e-06, + "loss": 2.4759, + "mean_token_accuracy": 0.47542623011395335, + "num_tokens": 265586593.0, + "step": 1831 + }, + { + "entropy": 2.4188232421875, + "epoch": 0.03154188510971652, + "grad_norm": 0.5839810371398926, + "learning_rate": 9.994771612881157e-06, + "loss": 2.3782, + "mean_token_accuracy": 0.48431128449738026, + "num_tokens": 265725310.0, + "step": 1832 + }, + { + "entropy": 2.454345703125, + "epoch": 0.03155910229591178, + "grad_norm": 0.5647464394569397, + "learning_rate": 9.994759024147526e-06, + "loss": 2.4133, + "mean_token_accuracy": 0.47814659122377634, + "num_tokens": 265858415.0, + "step": 1833 + }, + { + "entropy": 2.45654296875, + "epoch": 0.03157631948210704, + "grad_norm": 0.5568596124649048, + "learning_rate": 9.994746420284698e-06, + "loss": 2.4259, + "mean_token_accuracy": 0.4771571112796664, + "num_tokens": 266011950.0, + "step": 1834 + }, + { + "entropy": 2.413330078125, + "epoch": 0.0315935366683023, + "grad_norm": 0.5861586332321167, + "learning_rate": 9.994733801292709e-06, + "loss": 2.3792, + "mean_token_accuracy": 0.4801113181747496, + "num_tokens": 266157685.0, + "step": 1835 + }, + { + "entropy": 2.4085693359375, + "epoch": 0.03161075385449756, + "grad_norm": 0.5990592837333679, + "learning_rate": 9.994721167171597e-06, + "loss": 2.3922, + "mean_token_accuracy": 0.48092158418148756, + "num_tokens": 266293266.0, + "step": 1836 + }, + { + "entropy": 2.432373046875, + "epoch": 0.03162797104069282, + "grad_norm": 0.5652367472648621, + "learning_rate": 9.9947085179214e-06, + "loss": 2.3796, + "mean_token_accuracy": 0.48524676635861397, + "num_tokens": 266448458.0, + "step": 1837 + }, + { + "entropy": 2.5150146484375, + "epoch": 0.03164518822688808, + "grad_norm": 0.581610918045044, + "learning_rate": 9.99469585354216e-06, + "loss": 2.4421, + "mean_token_accuracy": 0.4794493457302451, + "num_tokens": 266588403.0, + "step": 1838 + }, + { + "entropy": 2.479248046875, + "epoch": 0.03166240541308334, + "grad_norm": 0.5524604320526123, + "learning_rate": 9.994683174033913e-06, + "loss": 2.409, + "mean_token_accuracy": 0.4818028402514756, + "num_tokens": 266748122.0, + "step": 1839 + }, + { + "entropy": 2.4130859375, + "epoch": 0.031679622599278597, + "grad_norm": 0.5744349360466003, + "learning_rate": 9.994670479396695e-06, + "loss": 2.3541, + "mean_token_accuracy": 0.4899553945288062, + "num_tokens": 266889574.0, + "step": 1840 + }, + { + "entropy": 2.50341796875, + "epoch": 0.03169683978547386, + "grad_norm": 0.5297801494598389, + "learning_rate": 9.994657769630546e-06, + "loss": 2.4784, + "mean_token_accuracy": 0.4729268723167479, + "num_tokens": 267038916.0, + "step": 1841 + }, + { + "entropy": 2.42041015625, + "epoch": 0.03171405697166912, + "grad_norm": 0.570783257484436, + "learning_rate": 9.994645044735507e-06, + "loss": 2.377, + "mean_token_accuracy": 0.4859691094607115, + "num_tokens": 267183760.0, + "step": 1842 + }, + { + "entropy": 2.4842529296875, + "epoch": 0.03173127415786438, + "grad_norm": 0.5840896368026733, + "learning_rate": 9.994632304711616e-06, + "loss": 2.4273, + "mean_token_accuracy": 0.4811637466773391, + "num_tokens": 267324455.0, + "step": 1843 + }, + { + "entropy": 2.4864501953125, + "epoch": 0.03174849134405964, + "grad_norm": 0.5744589567184448, + "learning_rate": 9.994619549558908e-06, + "loss": 2.4713, + "mean_token_accuracy": 0.4727841066196561, + "num_tokens": 267472269.0, + "step": 1844 + }, + { + "entropy": 2.46923828125, + "epoch": 0.0317657085302549, + "grad_norm": 0.5857048630714417, + "learning_rate": 9.994606779277425e-06, + "loss": 2.4301, + "mean_token_accuracy": 0.48071943665854633, + "num_tokens": 267624717.0, + "step": 1845 + }, + { + "entropy": 2.420654296875, + "epoch": 0.03178292571645016, + "grad_norm": 0.5835974812507629, + "learning_rate": 9.994593993867203e-06, + "loss": 2.3782, + "mean_token_accuracy": 0.4824722218327224, + "num_tokens": 267762034.0, + "step": 1846 + }, + { + "entropy": 2.503662109375, + "epoch": 0.03180014290264542, + "grad_norm": 0.5819142460823059, + "learning_rate": 9.994581193328283e-06, + "loss": 2.5042, + "mean_token_accuracy": 0.47309408662840724, + "num_tokens": 267897855.0, + "step": 1847 + }, + { + "entropy": 2.4564208984375, + "epoch": 0.03181736008884068, + "grad_norm": 0.6381955742835999, + "learning_rate": 9.994568377660703e-06, + "loss": 2.4308, + "mean_token_accuracy": 0.48583648912608624, + "num_tokens": 268025165.0, + "step": 1848 + }, + { + "entropy": 2.439453125, + "epoch": 0.03183457727503594, + "grad_norm": 0.5733355283737183, + "learning_rate": 9.994555546864504e-06, + "loss": 2.418, + "mean_token_accuracy": 0.48078050184994936, + "num_tokens": 268172403.0, + "step": 1849 + }, + { + "entropy": 2.3912353515625, + "epoch": 0.0318517944612312, + "grad_norm": 0.5635650157928467, + "learning_rate": 9.99454270093972e-06, + "loss": 2.3733, + "mean_token_accuracy": 0.4842658434063196, + "num_tokens": 268337839.0, + "step": 1850 + }, + { + "entropy": 2.4051513671875, + "epoch": 0.03186901164742646, + "grad_norm": 0.5738821625709534, + "learning_rate": 9.994529839886395e-06, + "loss": 2.332, + "mean_token_accuracy": 0.49162986455485225, + "num_tokens": 268490293.0, + "step": 1851 + }, + { + "entropy": 2.4752197265625, + "epoch": 0.03188622883362172, + "grad_norm": 0.5878849029541016, + "learning_rate": 9.994516963704564e-06, + "loss": 2.4238, + "mean_token_accuracy": 0.47905471734702587, + "num_tokens": 268627952.0, + "step": 1852 + }, + { + "entropy": 2.494384765625, + "epoch": 0.03190344601981698, + "grad_norm": 0.5611072182655334, + "learning_rate": 9.994504072394268e-06, + "loss": 2.4512, + "mean_token_accuracy": 0.4697739346884191, + "num_tokens": 268771756.0, + "step": 1853 + }, + { + "entropy": 2.4912109375, + "epoch": 0.031920663206012244, + "grad_norm": 0.5786696672439575, + "learning_rate": 9.994491165955546e-06, + "loss": 2.4426, + "mean_token_accuracy": 0.47656662855297327, + "num_tokens": 268906129.0, + "step": 1854 + }, + { + "entropy": 2.48193359375, + "epoch": 0.031937880392207504, + "grad_norm": 0.5850675106048584, + "learning_rate": 9.994478244388437e-06, + "loss": 2.4469, + "mean_token_accuracy": 0.47550681745633483, + "num_tokens": 269056386.0, + "step": 1855 + }, + { + "entropy": 2.499755859375, + "epoch": 0.031955097578402765, + "grad_norm": 0.5520582795143127, + "learning_rate": 9.994465307692978e-06, + "loss": 2.4417, + "mean_token_accuracy": 0.47765777353197336, + "num_tokens": 269195599.0, + "step": 1856 + }, + { + "entropy": 2.4632568359375, + "epoch": 0.03197231476459802, + "grad_norm": 0.5770227313041687, + "learning_rate": 9.994452355869211e-06, + "loss": 2.4842, + "mean_token_accuracy": 0.4763926393352449, + "num_tokens": 269336161.0, + "step": 1857 + }, + { + "entropy": 2.4757080078125, + "epoch": 0.03198953195079328, + "grad_norm": 0.6584348678588867, + "learning_rate": 9.994439388917177e-06, + "loss": 2.3984, + "mean_token_accuracy": 0.4819053406827152, + "num_tokens": 269461648.0, + "step": 1858 + }, + { + "entropy": 2.427001953125, + "epoch": 0.03200674913698854, + "grad_norm": 0.5825558304786682, + "learning_rate": 9.99442640683691e-06, + "loss": 2.3939, + "mean_token_accuracy": 0.4860147815197706, + "num_tokens": 269614043.0, + "step": 1859 + }, + { + "entropy": 2.5079345703125, + "epoch": 0.0320239663231838, + "grad_norm": 0.5787304043769836, + "learning_rate": 9.994413409628451e-06, + "loss": 2.526, + "mean_token_accuracy": 0.4718983927741647, + "num_tokens": 269763008.0, + "step": 1860 + }, + { + "entropy": 2.4483642578125, + "epoch": 0.03204118350937906, + "grad_norm": 0.5473356246948242, + "learning_rate": 9.994400397291843e-06, + "loss": 2.4192, + "mean_token_accuracy": 0.4761442271992564, + "num_tokens": 269916961.0, + "step": 1861 + }, + { + "entropy": 2.347412109375, + "epoch": 0.03205840069557432, + "grad_norm": 0.5818413496017456, + "learning_rate": 9.99438736982712e-06, + "loss": 2.3176, + "mean_token_accuracy": 0.49792582634836435, + "num_tokens": 270053323.0, + "step": 1862 + }, + { + "entropy": 2.522216796875, + "epoch": 0.03207561788176958, + "grad_norm": 0.5837598443031311, + "learning_rate": 9.994374327234326e-06, + "loss": 2.5254, + "mean_token_accuracy": 0.4689773670397699, + "num_tokens": 270183004.0, + "step": 1863 + }, + { + "entropy": 2.424560546875, + "epoch": 0.03209283506796484, + "grad_norm": 0.5816727876663208, + "learning_rate": 9.994361269513499e-06, + "loss": 2.3983, + "mean_token_accuracy": 0.4842468095012009, + "num_tokens": 270337766.0, + "step": 1864 + }, + { + "entropy": 2.4515380859375, + "epoch": 0.0321100522541601, + "grad_norm": 0.5595372319221497, + "learning_rate": 9.994348196664679e-06, + "loss": 2.4535, + "mean_token_accuracy": 0.4758255537599325, + "num_tokens": 270501411.0, + "step": 1865 + }, + { + "entropy": 2.4014892578125, + "epoch": 0.03212726944035536, + "grad_norm": 0.5381937623023987, + "learning_rate": 9.994335108687903e-06, + "loss": 2.3243, + "mean_token_accuracy": 0.4866001163609326, + "num_tokens": 270652865.0, + "step": 1866 + }, + { + "entropy": 2.427734375, + "epoch": 0.032144486626550624, + "grad_norm": 0.5453839302062988, + "learning_rate": 9.994322005583213e-06, + "loss": 2.3798, + "mean_token_accuracy": 0.4782120902091265, + "num_tokens": 270798390.0, + "step": 1867 + }, + { + "entropy": 2.5419921875, + "epoch": 0.032161703812745884, + "grad_norm": 0.6079609394073486, + "learning_rate": 9.994308887350647e-06, + "loss": 2.5727, + "mean_token_accuracy": 0.46313366387039423, + "num_tokens": 270943300.0, + "step": 1868 + }, + { + "entropy": 2.486083984375, + "epoch": 0.032178920998941145, + "grad_norm": 0.597481906414032, + "learning_rate": 9.994295753990247e-06, + "loss": 2.4698, + "mean_token_accuracy": 0.46938649471849203, + "num_tokens": 271077719.0, + "step": 1869 + }, + { + "entropy": 2.40771484375, + "epoch": 0.032196138185136405, + "grad_norm": 1.105528473854065, + "learning_rate": 9.99428260550205e-06, + "loss": 2.3737, + "mean_token_accuracy": 0.4825026970356703, + "num_tokens": 271237964.0, + "step": 1870 + }, + { + "entropy": 2.4666748046875, + "epoch": 0.032213355371331666, + "grad_norm": 0.5856791138648987, + "learning_rate": 9.9942694418861e-06, + "loss": 2.4256, + "mean_token_accuracy": 0.47253927774727345, + "num_tokens": 271380338.0, + "step": 1871 + }, + { + "entropy": 2.3770751953125, + "epoch": 0.032230572557526926, + "grad_norm": 0.5685958862304688, + "learning_rate": 9.994256263142433e-06, + "loss": 2.3337, + "mean_token_accuracy": 0.4951976570300758, + "num_tokens": 271518289.0, + "step": 1872 + }, + { + "entropy": 2.5079345703125, + "epoch": 0.03224778974372219, + "grad_norm": 0.5702466368675232, + "learning_rate": 9.99424306927109e-06, + "loss": 2.4513, + "mean_token_accuracy": 0.4732716833241284, + "num_tokens": 271660591.0, + "step": 1873 + }, + { + "entropy": 2.501953125, + "epoch": 0.03226500692991744, + "grad_norm": 0.5536970496177673, + "learning_rate": 9.994229860272114e-06, + "loss": 2.4598, + "mean_token_accuracy": 0.47078709537163377, + "num_tokens": 271815738.0, + "step": 1874 + }, + { + "entropy": 2.433349609375, + "epoch": 0.0322822241161127, + "grad_norm": 0.5575986504554749, + "learning_rate": 9.99421663614554e-06, + "loss": 2.3716, + "mean_token_accuracy": 0.4846403314732015, + "num_tokens": 271964724.0, + "step": 1875 + }, + { + "entropy": 2.439208984375, + "epoch": 0.03229944130230796, + "grad_norm": 0.594307541847229, + "learning_rate": 9.99420339689141e-06, + "loss": 2.3674, + "mean_token_accuracy": 0.48272377625107765, + "num_tokens": 272103280.0, + "step": 1876 + }, + { + "entropy": 2.382080078125, + "epoch": 0.03231665848850322, + "grad_norm": 0.5934799909591675, + "learning_rate": 9.994190142509766e-06, + "loss": 2.3689, + "mean_token_accuracy": 0.4859785996377468, + "num_tokens": 272237744.0, + "step": 1877 + }, + { + "entropy": 2.453369140625, + "epoch": 0.03233387567469848, + "grad_norm": 0.5809702277183533, + "learning_rate": 9.994176873000646e-06, + "loss": 2.4275, + "mean_token_accuracy": 0.4772698413580656, + "num_tokens": 272384323.0, + "step": 1878 + }, + { + "entropy": 2.457275390625, + "epoch": 0.03235109286089374, + "grad_norm": 0.5598036646842957, + "learning_rate": 9.99416358836409e-06, + "loss": 2.4166, + "mean_token_accuracy": 0.47385495621711016, + "num_tokens": 272530813.0, + "step": 1879 + }, + { + "entropy": 2.51318359375, + "epoch": 0.032368310047089004, + "grad_norm": 0.567221462726593, + "learning_rate": 9.994150288600139e-06, + "loss": 2.5107, + "mean_token_accuracy": 0.4726970442570746, + "num_tokens": 272679093.0, + "step": 1880 + }, + { + "entropy": 2.4515380859375, + "epoch": 0.032385527233284264, + "grad_norm": 0.6015093922615051, + "learning_rate": 9.994136973708834e-06, + "loss": 2.399, + "mean_token_accuracy": 0.47881753370165825, + "num_tokens": 272819409.0, + "step": 1881 + }, + { + "entropy": 2.424072265625, + "epoch": 0.032402744419479525, + "grad_norm": 0.5528237819671631, + "learning_rate": 9.994123643690214e-06, + "loss": 2.3889, + "mean_token_accuracy": 0.4852414163760841, + "num_tokens": 272964763.0, + "step": 1882 + }, + { + "entropy": 2.4576416015625, + "epoch": 0.032419961605674785, + "grad_norm": 0.6007446050643921, + "learning_rate": 9.99411029854432e-06, + "loss": 2.4264, + "mean_token_accuracy": 0.4797611120156944, + "num_tokens": 273090453.0, + "step": 1883 + }, + { + "entropy": 2.463623046875, + "epoch": 0.032437178791870046, + "grad_norm": 0.5631487369537354, + "learning_rate": 9.994096938271193e-06, + "loss": 2.4681, + "mean_token_accuracy": 0.4756158501841128, + "num_tokens": 273239046.0, + "step": 1884 + }, + { + "entropy": 2.4407958984375, + "epoch": 0.032454395978065306, + "grad_norm": 0.5506601929664612, + "learning_rate": 9.994083562870873e-06, + "loss": 2.3948, + "mean_token_accuracy": 0.47984826657921076, + "num_tokens": 273396483.0, + "step": 1885 + }, + { + "entropy": 2.4609375, + "epoch": 0.03247161316426057, + "grad_norm": 0.5861914157867432, + "learning_rate": 9.9940701723434e-06, + "loss": 2.4491, + "mean_token_accuracy": 0.47965921740978956, + "num_tokens": 273539136.0, + "step": 1886 + }, + { + "entropy": 2.4434814453125, + "epoch": 0.03248883035045583, + "grad_norm": 0.5634111166000366, + "learning_rate": 9.994056766688815e-06, + "loss": 2.4001, + "mean_token_accuracy": 0.4822332635521889, + "num_tokens": 273683327.0, + "step": 1887 + }, + { + "entropy": 2.4342041015625, + "epoch": 0.03250604753665109, + "grad_norm": 0.6063517332077026, + "learning_rate": 9.994043345907158e-06, + "loss": 2.3837, + "mean_token_accuracy": 0.49188648257404566, + "num_tokens": 273812912.0, + "step": 1888 + }, + { + "entropy": 2.454345703125, + "epoch": 0.03252326472284635, + "grad_norm": 0.6061086058616638, + "learning_rate": 9.99402990999847e-06, + "loss": 2.3902, + "mean_token_accuracy": 0.4804657520726323, + "num_tokens": 273953195.0, + "step": 1889 + }, + { + "entropy": 2.3865966796875, + "epoch": 0.0325404819090416, + "grad_norm": 0.5631823539733887, + "learning_rate": 9.994016458962795e-06, + "loss": 2.3116, + "mean_token_accuracy": 0.4911833154037595, + "num_tokens": 274105327.0, + "step": 1890 + }, + { + "entropy": 2.44384765625, + "epoch": 0.03255769909523686, + "grad_norm": 0.5700172185897827, + "learning_rate": 9.994002992800167e-06, + "loss": 2.4395, + "mean_token_accuracy": 0.4804955665022135, + "num_tokens": 274245805.0, + "step": 1891 + }, + { + "entropy": 2.508056640625, + "epoch": 0.03257491628143212, + "grad_norm": 0.5785242915153503, + "learning_rate": 9.993989511510633e-06, + "loss": 2.4574, + "mean_token_accuracy": 0.47288175392895937, + "num_tokens": 274390327.0, + "step": 1892 + }, + { + "entropy": 2.5166015625, + "epoch": 0.032592133467627384, + "grad_norm": 0.5731043815612793, + "learning_rate": 9.99397601509423e-06, + "loss": 2.4638, + "mean_token_accuracy": 0.47718239948153496, + "num_tokens": 274536899.0, + "step": 1893 + }, + { + "entropy": 2.3447265625, + "epoch": 0.032609350653822644, + "grad_norm": 0.6108200550079346, + "learning_rate": 9.993962503551e-06, + "loss": 2.2998, + "mean_token_accuracy": 0.504625148139894, + "num_tokens": 274690625.0, + "step": 1894 + }, + { + "entropy": 2.4359130859375, + "epoch": 0.032626567840017905, + "grad_norm": 0.5686972141265869, + "learning_rate": 9.993948976880985e-06, + "loss": 2.3772, + "mean_token_accuracy": 0.4784249165095389, + "num_tokens": 274857384.0, + "step": 1895 + }, + { + "entropy": 2.465576171875, + "epoch": 0.032643785026213165, + "grad_norm": 0.5697457194328308, + "learning_rate": 9.993935435084225e-06, + "loss": 2.3909, + "mean_token_accuracy": 0.483038027305156, + "num_tokens": 275007281.0, + "step": 1896 + }, + { + "entropy": 2.4967041015625, + "epoch": 0.032661002212408426, + "grad_norm": 0.5822637677192688, + "learning_rate": 9.99392187816076e-06, + "loss": 2.4637, + "mean_token_accuracy": 0.46831735828891397, + "num_tokens": 275145804.0, + "step": 1897 + }, + { + "entropy": 2.4696044921875, + "epoch": 0.032678219398603686, + "grad_norm": 0.5560740828514099, + "learning_rate": 9.993908306110632e-06, + "loss": 2.4337, + "mean_token_accuracy": 0.4820185494609177, + "num_tokens": 275303743.0, + "step": 1898 + }, + { + "entropy": 2.4893798828125, + "epoch": 0.03269543658479895, + "grad_norm": 0.5658345222473145, + "learning_rate": 9.993894718933882e-06, + "loss": 2.4622, + "mean_token_accuracy": 0.4734317073598504, + "num_tokens": 275455938.0, + "step": 1899 + }, + { + "entropy": 2.367431640625, + "epoch": 0.03271265377099421, + "grad_norm": 0.5981510281562805, + "learning_rate": 9.993881116630556e-06, + "loss": 2.2978, + "mean_token_accuracy": 0.495200231205672, + "num_tokens": 275607789.0, + "step": 1900 + }, + { + "entropy": 2.46435546875, + "epoch": 0.03272987095718947, + "grad_norm": 0.5610291361808777, + "learning_rate": 9.993867499200684e-06, + "loss": 2.4479, + "mean_token_accuracy": 0.4744836580939591, + "num_tokens": 275761693.0, + "step": 1901 + }, + { + "entropy": 2.5068359375, + "epoch": 0.03274708814338473, + "grad_norm": 0.6446607708930969, + "learning_rate": 9.993853866644319e-06, + "loss": 2.4878, + "mean_token_accuracy": 0.46817948622629046, + "num_tokens": 275887793.0, + "step": 1902 + }, + { + "entropy": 2.503662109375, + "epoch": 0.03276430532957999, + "grad_norm": 0.6273615956306458, + "learning_rate": 9.993840218961495e-06, + "loss": 2.4879, + "mean_token_accuracy": 0.4687243062071502, + "num_tokens": 276018500.0, + "step": 1903 + }, + { + "entropy": 2.498779296875, + "epoch": 0.03278152251577525, + "grad_norm": 0.5124494433403015, + "learning_rate": 9.993826556152255e-06, + "loss": 2.4705, + "mean_token_accuracy": 0.4730762280523777, + "num_tokens": 276183672.0, + "step": 1904 + }, + { + "entropy": 2.518798828125, + "epoch": 0.03279873970197051, + "grad_norm": 0.5735337138175964, + "learning_rate": 9.993812878216642e-06, + "loss": 2.4894, + "mean_token_accuracy": 0.47411146527156234, + "num_tokens": 276336067.0, + "step": 1905 + }, + { + "entropy": 2.498291015625, + "epoch": 0.03281595688816577, + "grad_norm": 0.6396939158439636, + "learning_rate": 9.993799185154695e-06, + "loss": 2.4308, + "mean_token_accuracy": 0.4775810223072767, + "num_tokens": 276454633.0, + "step": 1906 + }, + { + "entropy": 2.43798828125, + "epoch": 0.032833174074361024, + "grad_norm": 0.5700352191925049, + "learning_rate": 9.993785476966458e-06, + "loss": 2.4091, + "mean_token_accuracy": 0.47954965522512794, + "num_tokens": 276592071.0, + "step": 1907 + }, + { + "entropy": 2.555419921875, + "epoch": 0.032850391260556285, + "grad_norm": 0.5614842772483826, + "learning_rate": 9.993771753651971e-06, + "loss": 2.5613, + "mean_token_accuracy": 0.4670336083509028, + "num_tokens": 276746077.0, + "step": 1908 + }, + { + "entropy": 2.423095703125, + "epoch": 0.032867608446751545, + "grad_norm": 0.5809895396232605, + "learning_rate": 9.993758015211276e-06, + "loss": 2.3669, + "mean_token_accuracy": 0.48658125940710306, + "num_tokens": 276900397.0, + "step": 1909 + }, + { + "entropy": 2.384765625, + "epoch": 0.032884825632946806, + "grad_norm": 0.5184022784233093, + "learning_rate": 9.993744261644414e-06, + "loss": 2.3631, + "mean_token_accuracy": 0.4841692647896707, + "num_tokens": 277080897.0, + "step": 1910 + }, + { + "entropy": 2.4111328125, + "epoch": 0.032902042819142066, + "grad_norm": 0.575218915939331, + "learning_rate": 9.993730492951428e-06, + "loss": 2.3794, + "mean_token_accuracy": 0.48858750332146883, + "num_tokens": 277229579.0, + "step": 1911 + }, + { + "entropy": 2.4375, + "epoch": 0.03291926000533733, + "grad_norm": 0.8869200944900513, + "learning_rate": 9.993716709132359e-06, + "loss": 2.3684, + "mean_token_accuracy": 0.48189750453457236, + "num_tokens": 277362179.0, + "step": 1912 + }, + { + "entropy": 2.4571533203125, + "epoch": 0.03293647719153259, + "grad_norm": 0.5700924396514893, + "learning_rate": 9.993702910187247e-06, + "loss": 2.3851, + "mean_token_accuracy": 0.48076847614720464, + "num_tokens": 277520460.0, + "step": 1913 + }, + { + "entropy": 2.51025390625, + "epoch": 0.03295369437772785, + "grad_norm": 0.6675810813903809, + "learning_rate": 9.993689096116138e-06, + "loss": 2.4804, + "mean_token_accuracy": 0.47350738383829594, + "num_tokens": 277667131.0, + "step": 1914 + }, + { + "entropy": 2.4278564453125, + "epoch": 0.03297091156392311, + "grad_norm": 0.6161773204803467, + "learning_rate": 9.993675266919068e-06, + "loss": 2.3634, + "mean_token_accuracy": 0.48864840995520353, + "num_tokens": 277798538.0, + "step": 1915 + }, + { + "entropy": 2.42724609375, + "epoch": 0.03298812875011837, + "grad_norm": 0.6031524538993835, + "learning_rate": 9.993661422596083e-06, + "loss": 2.3735, + "mean_token_accuracy": 0.4872046741656959, + "num_tokens": 277938299.0, + "step": 1916 + }, + { + "entropy": 2.459716796875, + "epoch": 0.03300534593631363, + "grad_norm": 0.5436295866966248, + "learning_rate": 9.993647563147226e-06, + "loss": 2.4323, + "mean_token_accuracy": 0.47822323255240917, + "num_tokens": 278080906.0, + "step": 1917 + }, + { + "entropy": 2.482177734375, + "epoch": 0.03302256312250889, + "grad_norm": 0.5724844336509705, + "learning_rate": 9.993633688572536e-06, + "loss": 2.4886, + "mean_token_accuracy": 0.47133151395246387, + "num_tokens": 278220379.0, + "step": 1918 + }, + { + "entropy": 2.4127197265625, + "epoch": 0.03303978030870415, + "grad_norm": 0.583770751953125, + "learning_rate": 9.993619798872056e-06, + "loss": 2.3904, + "mean_token_accuracy": 0.48571639275178313, + "num_tokens": 278372188.0, + "step": 1919 + }, + { + "entropy": 2.45849609375, + "epoch": 0.03305699749489941, + "grad_norm": 0.5579352378845215, + "learning_rate": 9.993605894045828e-06, + "loss": 2.4178, + "mean_token_accuracy": 0.4752850723452866, + "num_tokens": 278515572.0, + "step": 1920 + }, + { + "entropy": 2.46728515625, + "epoch": 0.03307421468109467, + "grad_norm": 0.5899266600608826, + "learning_rate": 9.993591974093896e-06, + "loss": 2.4453, + "mean_token_accuracy": 0.4791210265830159, + "num_tokens": 278659949.0, + "step": 1921 + }, + { + "entropy": 2.431640625, + "epoch": 0.03309143186728993, + "grad_norm": 0.5982487797737122, + "learning_rate": 9.993578039016298e-06, + "loss": 2.4021, + "mean_token_accuracy": 0.4784347270615399, + "num_tokens": 278784019.0, + "step": 1922 + }, + { + "entropy": 2.448486328125, + "epoch": 0.03310864905348519, + "grad_norm": 0.7279030680656433, + "learning_rate": 9.993564088813078e-06, + "loss": 2.4092, + "mean_token_accuracy": 0.47610331838950515, + "num_tokens": 278941353.0, + "step": 1923 + }, + { + "entropy": 2.5162353515625, + "epoch": 0.033125866239680446, + "grad_norm": 0.5260778665542603, + "learning_rate": 9.993550123484282e-06, + "loss": 2.4839, + "mean_token_accuracy": 0.4682774804532528, + "num_tokens": 279101143.0, + "step": 1924 + }, + { + "entropy": 2.448974609375, + "epoch": 0.033143083425875706, + "grad_norm": 0.5392525792121887, + "learning_rate": 9.993536143029946e-06, + "loss": 2.3976, + "mean_token_accuracy": 0.479318049736321, + "num_tokens": 279249944.0, + "step": 1925 + }, + { + "entropy": 2.4193115234375, + "epoch": 0.03316030061207097, + "grad_norm": 0.5925559401512146, + "learning_rate": 9.993522147450118e-06, + "loss": 2.3594, + "mean_token_accuracy": 0.49047204852104187, + "num_tokens": 279387621.0, + "step": 1926 + }, + { + "entropy": 2.51318359375, + "epoch": 0.03317751779826623, + "grad_norm": 0.6154163479804993, + "learning_rate": 9.993508136744838e-06, + "loss": 2.4331, + "mean_token_accuracy": 0.47704603895545006, + "num_tokens": 279525188.0, + "step": 1927 + }, + { + "entropy": 2.469482421875, + "epoch": 0.03319473498446149, + "grad_norm": 0.5946447253227234, + "learning_rate": 9.993494110914149e-06, + "loss": 2.4433, + "mean_token_accuracy": 0.4768550405278802, + "num_tokens": 279675297.0, + "step": 1928 + }, + { + "entropy": 2.4324951171875, + "epoch": 0.03321195217065675, + "grad_norm": 0.5541896820068359, + "learning_rate": 9.99348006995809e-06, + "loss": 2.4008, + "mean_token_accuracy": 0.4792257468216121, + "num_tokens": 279826165.0, + "step": 1929 + }, + { + "entropy": 2.430908203125, + "epoch": 0.03322916935685201, + "grad_norm": 0.5395227670669556, + "learning_rate": 9.993466013876707e-06, + "loss": 2.3978, + "mean_token_accuracy": 0.4806228969246149, + "num_tokens": 279980558.0, + "step": 1930 + }, + { + "entropy": 2.423828125, + "epoch": 0.03324638654304727, + "grad_norm": 0.5909799933433533, + "learning_rate": 9.993451942670044e-06, + "loss": 2.393, + "mean_token_accuracy": 0.48585162637755275, + "num_tokens": 280119585.0, + "step": 1931 + }, + { + "entropy": 2.512939453125, + "epoch": 0.03326360372924253, + "grad_norm": 0.6035844683647156, + "learning_rate": 9.993437856338139e-06, + "loss": 2.5345, + "mean_token_accuracy": 0.4715218679048121, + "num_tokens": 280244876.0, + "step": 1932 + }, + { + "entropy": 2.4185791015625, + "epoch": 0.03328082091543779, + "grad_norm": 0.5301707983016968, + "learning_rate": 9.993423754881039e-06, + "loss": 2.4086, + "mean_token_accuracy": 0.48241995088756084, + "num_tokens": 280401852.0, + "step": 1933 + }, + { + "entropy": 2.4444580078125, + "epoch": 0.03329803810163305, + "grad_norm": 0.5419909358024597, + "learning_rate": 9.993409638298785e-06, + "loss": 2.4023, + "mean_token_accuracy": 0.4783248957246542, + "num_tokens": 280561735.0, + "step": 1934 + }, + { + "entropy": 2.4560546875, + "epoch": 0.03331525528782831, + "grad_norm": 0.5301235914230347, + "learning_rate": 9.99339550659142e-06, + "loss": 2.4201, + "mean_token_accuracy": 0.4791613514535129, + "num_tokens": 280719202.0, + "step": 1935 + }, + { + "entropy": 2.3726806640625, + "epoch": 0.03333247247402357, + "grad_norm": 0.5838637948036194, + "learning_rate": 9.993381359758985e-06, + "loss": 2.347, + "mean_token_accuracy": 0.4987304233945906, + "num_tokens": 280866306.0, + "step": 1936 + }, + { + "entropy": 2.45068359375, + "epoch": 0.03334968966021883, + "grad_norm": 0.5787121057510376, + "learning_rate": 9.993367197801527e-06, + "loss": 2.3823, + "mean_token_accuracy": 0.47824144922196865, + "num_tokens": 280996610.0, + "step": 1937 + }, + { + "entropy": 2.5333251953125, + "epoch": 0.03336690684641409, + "grad_norm": 0.5946321487426758, + "learning_rate": 9.993353020719083e-06, + "loss": 2.5036, + "mean_token_accuracy": 0.4754126025363803, + "num_tokens": 281133369.0, + "step": 1938 + }, + { + "entropy": 2.444580078125, + "epoch": 0.033384124032609354, + "grad_norm": 0.5681336522102356, + "learning_rate": 9.993338828511701e-06, + "loss": 2.419, + "mean_token_accuracy": 0.480977026745677, + "num_tokens": 281293453.0, + "step": 1939 + }, + { + "entropy": 2.469970703125, + "epoch": 0.03340134121880461, + "grad_norm": 0.5857387185096741, + "learning_rate": 9.993324621179424e-06, + "loss": 2.446, + "mean_token_accuracy": 0.48483802145347, + "num_tokens": 281434754.0, + "step": 1940 + }, + { + "entropy": 2.4569091796875, + "epoch": 0.03341855840499987, + "grad_norm": 0.6091550588607788, + "learning_rate": 9.99331039872229e-06, + "loss": 2.4002, + "mean_token_accuracy": 0.4800125630572438, + "num_tokens": 281571983.0, + "step": 1941 + }, + { + "entropy": 2.4578857421875, + "epoch": 0.03343577559119513, + "grad_norm": 0.6192805171012878, + "learning_rate": 9.993296161140346e-06, + "loss": 2.3834, + "mean_token_accuracy": 0.4805147326551378, + "num_tokens": 281716426.0, + "step": 1942 + }, + { + "entropy": 2.5145263671875, + "epoch": 0.03345299277739039, + "grad_norm": 0.5912413001060486, + "learning_rate": 9.993281908433637e-06, + "loss": 2.4856, + "mean_token_accuracy": 0.47100443998351693, + "num_tokens": 281859208.0, + "step": 1943 + }, + { + "entropy": 2.4542236328125, + "epoch": 0.03347020996358565, + "grad_norm": 0.5443603992462158, + "learning_rate": 9.9932676406022e-06, + "loss": 2.4321, + "mean_token_accuracy": 0.4776157597079873, + "num_tokens": 282011533.0, + "step": 1944 + }, + { + "entropy": 2.4461669921875, + "epoch": 0.03348742714978091, + "grad_norm": 0.5912041068077087, + "learning_rate": 9.993253357646085e-06, + "loss": 2.4245, + "mean_token_accuracy": 0.4797707684338093, + "num_tokens": 282142801.0, + "step": 1945 + }, + { + "entropy": 2.4661865234375, + "epoch": 0.03350464433597617, + "grad_norm": 0.5891805291175842, + "learning_rate": 9.99323905956533e-06, + "loss": 2.408, + "mean_token_accuracy": 0.4835380604490638, + "num_tokens": 282282290.0, + "step": 1946 + }, + { + "entropy": 2.505859375, + "epoch": 0.03352186152217143, + "grad_norm": 0.6140239238739014, + "learning_rate": 9.993224746359981e-06, + "loss": 2.4778, + "mean_token_accuracy": 0.47382217831909657, + "num_tokens": 282421711.0, + "step": 1947 + }, + { + "entropy": 2.464111328125, + "epoch": 0.03353907870836669, + "grad_norm": 0.5682819485664368, + "learning_rate": 9.993210418030082e-06, + "loss": 2.4156, + "mean_token_accuracy": 0.47630415530875325, + "num_tokens": 282576394.0, + "step": 1948 + }, + { + "entropy": 2.3934326171875, + "epoch": 0.03355629589456195, + "grad_norm": 0.5857619643211365, + "learning_rate": 9.993196074575673e-06, + "loss": 2.3721, + "mean_token_accuracy": 0.49081736570224166, + "num_tokens": 282729923.0, + "step": 1949 + }, + { + "entropy": 2.4425048828125, + "epoch": 0.03357351308075721, + "grad_norm": 0.6023845076560974, + "learning_rate": 9.9931817159968e-06, + "loss": 2.3967, + "mean_token_accuracy": 0.4840724840760231, + "num_tokens": 282862205.0, + "step": 1950 + }, + { + "entropy": 2.423095703125, + "epoch": 0.03359073026695247, + "grad_norm": 0.5758981108665466, + "learning_rate": 9.993167342293508e-06, + "loss": 2.3915, + "mean_token_accuracy": 0.48772860085591674, + "num_tokens": 282997486.0, + "step": 1951 + }, + { + "entropy": 2.5380859375, + "epoch": 0.033607947453147734, + "grad_norm": 0.550911545753479, + "learning_rate": 9.993152953465837e-06, + "loss": 2.5101, + "mean_token_accuracy": 0.4724648226983845, + "num_tokens": 283148960.0, + "step": 1952 + }, + { + "entropy": 2.501708984375, + "epoch": 0.033625164639342994, + "grad_norm": 0.5853079557418823, + "learning_rate": 9.993138549513834e-06, + "loss": 2.46, + "mean_token_accuracy": 0.4766088160686195, + "num_tokens": 283290181.0, + "step": 1953 + }, + { + "entropy": 2.412353515625, + "epoch": 0.033642381825538255, + "grad_norm": 0.5772018432617188, + "learning_rate": 9.99312413043754e-06, + "loss": 2.3818, + "mean_token_accuracy": 0.4874812732450664, + "num_tokens": 283446157.0, + "step": 1954 + }, + { + "entropy": 2.4298095703125, + "epoch": 0.033659599011733515, + "grad_norm": 0.5674806833267212, + "learning_rate": 9.993109696236997e-06, + "loss": 2.4043, + "mean_token_accuracy": 0.48742856411263347, + "num_tokens": 283583169.0, + "step": 1955 + }, + { + "entropy": 2.4627685546875, + "epoch": 0.033676816197928776, + "grad_norm": 0.5692137479782104, + "learning_rate": 9.993095246912254e-06, + "loss": 2.4209, + "mean_token_accuracy": 0.4784468016587198, + "num_tokens": 283726808.0, + "step": 1956 + }, + { + "entropy": 2.411865234375, + "epoch": 0.03369403338412403, + "grad_norm": 0.5745928287506104, + "learning_rate": 9.99308078246335e-06, + "loss": 2.3845, + "mean_token_accuracy": 0.48880593106150627, + "num_tokens": 283884790.0, + "step": 1957 + }, + { + "entropy": 2.4476318359375, + "epoch": 0.03371125057031929, + "grad_norm": 0.5757923722267151, + "learning_rate": 9.993066302890335e-06, + "loss": 2.4339, + "mean_token_accuracy": 0.47860253555700183, + "num_tokens": 284026807.0, + "step": 1958 + }, + { + "entropy": 2.4261474609375, + "epoch": 0.03372846775651455, + "grad_norm": 0.509971559047699, + "learning_rate": 9.993051808193246e-06, + "loss": 2.39, + "mean_token_accuracy": 0.4785323585383594, + "num_tokens": 284189392.0, + "step": 1959 + }, + { + "entropy": 2.4044189453125, + "epoch": 0.03374568494270981, + "grad_norm": 0.575952410697937, + "learning_rate": 9.99303729837213e-06, + "loss": 2.3911, + "mean_token_accuracy": 0.48549577174708247, + "num_tokens": 284324103.0, + "step": 1960 + }, + { + "entropy": 2.46044921875, + "epoch": 0.03376290212890507, + "grad_norm": 0.5873697996139526, + "learning_rate": 9.993022773427028e-06, + "loss": 2.419, + "mean_token_accuracy": 0.4775766748934984, + "num_tokens": 284466158.0, + "step": 1961 + }, + { + "entropy": 2.533935546875, + "epoch": 0.03378011931510033, + "grad_norm": 0.5654860734939575, + "learning_rate": 9.99300823335799e-06, + "loss": 2.4545, + "mean_token_accuracy": 0.47519396571442485, + "num_tokens": 284602979.0, + "step": 1962 + }, + { + "entropy": 2.472412109375, + "epoch": 0.03379733650129559, + "grad_norm": 0.7695214748382568, + "learning_rate": 9.992993678165055e-06, + "loss": 2.4068, + "mean_token_accuracy": 0.4817899586632848, + "num_tokens": 284740325.0, + "step": 1963 + }, + { + "entropy": 2.4033203125, + "epoch": 0.03381455368749085, + "grad_norm": 0.5517217516899109, + "learning_rate": 9.99297910784827e-06, + "loss": 2.3849, + "mean_token_accuracy": 0.48449106933549047, + "num_tokens": 284885765.0, + "step": 1964 + }, + { + "entropy": 2.4239501953125, + "epoch": 0.033831770873686114, + "grad_norm": 0.6454329490661621, + "learning_rate": 9.992964522407676e-06, + "loss": 2.4082, + "mean_token_accuracy": 0.48670034017413855, + "num_tokens": 285020125.0, + "step": 1965 + }, + { + "entropy": 2.4947509765625, + "epoch": 0.033848988059881374, + "grad_norm": 0.5675487518310547, + "learning_rate": 9.99294992184332e-06, + "loss": 2.4896, + "mean_token_accuracy": 0.4730933913961053, + "num_tokens": 285165664.0, + "step": 1966 + }, + { + "entropy": 2.4700927734375, + "epoch": 0.033866205246076635, + "grad_norm": 0.581511914730072, + "learning_rate": 9.992935306155246e-06, + "loss": 2.4402, + "mean_token_accuracy": 0.4833594365045428, + "num_tokens": 285304514.0, + "step": 1967 + }, + { + "entropy": 2.4307861328125, + "epoch": 0.033883422432271895, + "grad_norm": 0.5686525702476501, + "learning_rate": 9.992920675343496e-06, + "loss": 2.412, + "mean_token_accuracy": 0.4830741249024868, + "num_tokens": 285451178.0, + "step": 1968 + }, + { + "entropy": 2.44921875, + "epoch": 0.033900639618467156, + "grad_norm": 0.5576603412628174, + "learning_rate": 9.992906029408115e-06, + "loss": 2.3827, + "mean_token_accuracy": 0.4801848717033863, + "num_tokens": 285595105.0, + "step": 1969 + }, + { + "entropy": 2.415771484375, + "epoch": 0.033917856804662416, + "grad_norm": 0.5764795541763306, + "learning_rate": 9.992891368349151e-06, + "loss": 2.3845, + "mean_token_accuracy": 0.48759686667472124, + "num_tokens": 285726774.0, + "step": 1970 + }, + { + "entropy": 2.4775390625, + "epoch": 0.03393507399085768, + "grad_norm": 0.6182502508163452, + "learning_rate": 9.992876692166644e-06, + "loss": 2.491, + "mean_token_accuracy": 0.4704209272749722, + "num_tokens": 285846737.0, + "step": 1971 + }, + { + "entropy": 2.4888916015625, + "epoch": 0.03395229117705294, + "grad_norm": 0.5855472683906555, + "learning_rate": 9.99286200086064e-06, + "loss": 2.4541, + "mean_token_accuracy": 0.4759668963961303, + "num_tokens": 285986773.0, + "step": 1972 + }, + { + "entropy": 2.473388671875, + "epoch": 0.03396950836324819, + "grad_norm": 0.5262144804000854, + "learning_rate": 9.992847294431186e-06, + "loss": 2.473, + "mean_token_accuracy": 0.4791461192071438, + "num_tokens": 286147251.0, + "step": 1973 + }, + { + "entropy": 2.4501953125, + "epoch": 0.03398672554944345, + "grad_norm": 0.558925211429596, + "learning_rate": 9.99283257287832e-06, + "loss": 2.4154, + "mean_token_accuracy": 0.47716324497014284, + "num_tokens": 286310437.0, + "step": 1974 + }, + { + "entropy": 2.426025390625, + "epoch": 0.03400394273563871, + "grad_norm": 0.5795169472694397, + "learning_rate": 9.992817836202093e-06, + "loss": 2.3785, + "mean_token_accuracy": 0.4854766642674804, + "num_tokens": 286448769.0, + "step": 1975 + }, + { + "entropy": 2.52880859375, + "epoch": 0.03402115992183397, + "grad_norm": 0.5628653764724731, + "learning_rate": 9.992803084402547e-06, + "loss": 2.5193, + "mean_token_accuracy": 0.4707297068089247, + "num_tokens": 286588873.0, + "step": 1976 + }, + { + "entropy": 2.4481201171875, + "epoch": 0.03403837710802923, + "grad_norm": 0.5932325124740601, + "learning_rate": 9.992788317479727e-06, + "loss": 2.4434, + "mean_token_accuracy": 0.47935432521626353, + "num_tokens": 286733286.0, + "step": 1977 + }, + { + "entropy": 2.45361328125, + "epoch": 0.034055594294224494, + "grad_norm": 0.576995313167572, + "learning_rate": 9.992773535433678e-06, + "loss": 2.4079, + "mean_token_accuracy": 0.47960167936980724, + "num_tokens": 286875771.0, + "step": 1978 + }, + { + "entropy": 2.4423828125, + "epoch": 0.034072811480419754, + "grad_norm": 0.5512486100196838, + "learning_rate": 9.992758738264442e-06, + "loss": 2.4155, + "mean_token_accuracy": 0.480304847471416, + "num_tokens": 287031003.0, + "step": 1979 + }, + { + "entropy": 2.3990478515625, + "epoch": 0.034090028666615015, + "grad_norm": 0.5462455153465271, + "learning_rate": 9.992743925972069e-06, + "loss": 2.3775, + "mean_token_accuracy": 0.48557318560779095, + "num_tokens": 287180788.0, + "step": 1980 + }, + { + "entropy": 2.5538330078125, + "epoch": 0.034107245852810275, + "grad_norm": 0.5789005756378174, + "learning_rate": 9.992729098556601e-06, + "loss": 2.5184, + "mean_token_accuracy": 0.4639392225071788, + "num_tokens": 287318088.0, + "step": 1981 + }, + { + "entropy": 2.4896240234375, + "epoch": 0.034124463039005536, + "grad_norm": 0.66868656873703, + "learning_rate": 9.992714256018082e-06, + "loss": 2.4807, + "mean_token_accuracy": 0.4695613798685372, + "num_tokens": 287466895.0, + "step": 1982 + }, + { + "entropy": 2.4345703125, + "epoch": 0.034141680225200796, + "grad_norm": 0.5860902667045593, + "learning_rate": 9.99269939835656e-06, + "loss": 2.3898, + "mean_token_accuracy": 0.4838330140337348, + "num_tokens": 287612255.0, + "step": 1983 + }, + { + "entropy": 2.507080078125, + "epoch": 0.03415889741139606, + "grad_norm": 0.5566897988319397, + "learning_rate": 9.992684525572076e-06, + "loss": 2.498, + "mean_token_accuracy": 0.47418344393372536, + "num_tokens": 287765365.0, + "step": 1984 + }, + { + "entropy": 2.367431640625, + "epoch": 0.03417611459759132, + "grad_norm": 0.5585086941719055, + "learning_rate": 9.992669637664679e-06, + "loss": 2.3324, + "mean_token_accuracy": 0.49284734204411507, + "num_tokens": 287923606.0, + "step": 1985 + }, + { + "entropy": 2.4853515625, + "epoch": 0.03419333178378658, + "grad_norm": 0.5721833109855652, + "learning_rate": 9.99265473463441e-06, + "loss": 2.4174, + "mean_token_accuracy": 0.47697839234024286, + "num_tokens": 288065850.0, + "step": 1986 + }, + { + "entropy": 2.4268798828125, + "epoch": 0.03421054896998184, + "grad_norm": 0.5846219062805176, + "learning_rate": 9.992639816481317e-06, + "loss": 2.4077, + "mean_token_accuracy": 0.4877488249912858, + "num_tokens": 288219901.0, + "step": 1987 + }, + { + "entropy": 2.494140625, + "epoch": 0.0342277661561771, + "grad_norm": 0.7576902508735657, + "learning_rate": 9.992624883205446e-06, + "loss": 2.4919, + "mean_token_accuracy": 0.4697610507719219, + "num_tokens": 288368125.0, + "step": 1988 + }, + { + "entropy": 2.4490966796875, + "epoch": 0.03424498334237236, + "grad_norm": 0.5778993368148804, + "learning_rate": 9.99260993480684e-06, + "loss": 2.3865, + "mean_token_accuracy": 0.48272112663835287, + "num_tokens": 288502228.0, + "step": 1989 + }, + { + "entropy": 2.4388427734375, + "epoch": 0.03426220052856761, + "grad_norm": 0.5564126372337341, + "learning_rate": 9.992594971285545e-06, + "loss": 2.4208, + "mean_token_accuracy": 0.479803703725338, + "num_tokens": 288650170.0, + "step": 1990 + }, + { + "entropy": 2.4871826171875, + "epoch": 0.03427941771476287, + "grad_norm": 0.5940274596214294, + "learning_rate": 9.992579992641606e-06, + "loss": 2.4353, + "mean_token_accuracy": 0.4715647897683084, + "num_tokens": 288780528.0, + "step": 1991 + }, + { + "entropy": 2.4788818359375, + "epoch": 0.034296634900958134, + "grad_norm": 0.6397010087966919, + "learning_rate": 9.99256499887507e-06, + "loss": 2.4715, + "mean_token_accuracy": 0.4812845438718796, + "num_tokens": 288936902.0, + "step": 1992 + }, + { + "entropy": 2.417724609375, + "epoch": 0.034313852087153394, + "grad_norm": 0.5475616455078125, + "learning_rate": 9.99254998998598e-06, + "loss": 2.3702, + "mean_token_accuracy": 0.4824229711666703, + "num_tokens": 289091883.0, + "step": 1993 + }, + { + "entropy": 2.4339599609375, + "epoch": 0.034331069273348655, + "grad_norm": 0.5338613390922546, + "learning_rate": 9.99253496597438e-06, + "loss": 2.417, + "mean_token_accuracy": 0.479421756695956, + "num_tokens": 289241366.0, + "step": 1994 + }, + { + "entropy": 2.406005859375, + "epoch": 0.034348286459543916, + "grad_norm": 0.6116018295288086, + "learning_rate": 9.992519926840322e-06, + "loss": 2.3794, + "mean_token_accuracy": 0.4870318342000246, + "num_tokens": 289380641.0, + "step": 1995 + }, + { + "entropy": 2.4573974609375, + "epoch": 0.034365503645739176, + "grad_norm": 0.6071226596832275, + "learning_rate": 9.992504872583847e-06, + "loss": 2.4109, + "mean_token_accuracy": 0.47875087382271886, + "num_tokens": 289521937.0, + "step": 1996 + }, + { + "entropy": 2.4119873046875, + "epoch": 0.03438272083193444, + "grad_norm": 0.5995868444442749, + "learning_rate": 9.992489803205e-06, + "loss": 2.3732, + "mean_token_accuracy": 0.4848559848032892, + "num_tokens": 289670647.0, + "step": 1997 + }, + { + "entropy": 2.4595947265625, + "epoch": 0.0343999380181297, + "grad_norm": 0.5530126094818115, + "learning_rate": 9.992474718703829e-06, + "loss": 2.4787, + "mean_token_accuracy": 0.47666746005415916, + "num_tokens": 289833608.0, + "step": 1998 + }, + { + "entropy": 2.4443359375, + "epoch": 0.03441715520432496, + "grad_norm": 0.5142894387245178, + "learning_rate": 9.99245961908038e-06, + "loss": 2.3763, + "mean_token_accuracy": 0.48295469116419554, + "num_tokens": 289992658.0, + "step": 1999 + }, + { + "entropy": 2.42138671875, + "epoch": 0.03443437239052022, + "grad_norm": 0.5748444199562073, + "learning_rate": 9.992444504334696e-06, + "loss": 2.4077, + "mean_token_accuracy": 0.48649854911491275, + "num_tokens": 290134205.0, + "step": 2000 + } + ], + "logging_steps": 1, + "max_steps": 58082, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.7734025191882752e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}