{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 10000, "global_step": 81684, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.13171875, "epoch": 0.0012242299593555653, "grad_norm": 83.0, "learning_rate": 7.197062423500612e-08, "loss": 1.0766, "mean_token_accuracy": 0.8785432744026184, "num_tokens": 5226465.0, "step": 50 }, { "entropy": 1.1265625, "epoch": 0.0024484599187111307, "grad_norm": 78.0, "learning_rate": 1.4541003671970627e-07, "loss": 1.0658, "mean_token_accuracy": 0.8774244856834411, "num_tokens": 10264339.0, "step": 100 }, { "entropy": 1.12125, "epoch": 0.003672689878066696, "grad_norm": 65.5, "learning_rate": 2.1884944920440638e-07, "loss": 1.0161, "mean_token_accuracy": 0.8803484535217285, "num_tokens": 15456221.0, "step": 150 }, { "entropy": 1.12859375, "epoch": 0.004896919837422261, "grad_norm": 73.0, "learning_rate": 2.922888616891065e-07, "loss": 0.9905, "mean_token_accuracy": 0.8795530760288238, "num_tokens": 20433188.0, "step": 200 }, { "entropy": 1.14265625, "epoch": 0.006121149796777827, "grad_norm": 69.0, "learning_rate": 3.6572827417380663e-07, "loss": 0.8835, "mean_token_accuracy": 0.884142210483551, "num_tokens": 25654586.0, "step": 250 }, { "entropy": 1.1475, "epoch": 0.007345379756133392, "grad_norm": 60.0, "learning_rate": 4.391676866585067e-07, "loss": 0.7555, "mean_token_accuracy": 0.8876442670822143, "num_tokens": 30682210.0, "step": 300 }, { "entropy": 1.14234375, "epoch": 0.008569609715488957, "grad_norm": 30.0, "learning_rate": 5.126070991432069e-07, "loss": 0.691, "mean_token_accuracy": 0.8891012752056122, "num_tokens": 36107614.0, "step": 350 }, { "entropy": 1.1603125, "epoch": 0.009793839674844523, "grad_norm": 15.6875, "learning_rate": 5.860465116279069e-07, "loss": 0.5872, "mean_token_accuracy": 0.9278727066516876, "num_tokens": 41528585.0, "step": 400 }, { "entropy": 1.20984375, "epoch": 0.011018069634200088, "grad_norm": 10.375, "learning_rate": 6.594859241126071e-07, "loss": 0.5128, "mean_token_accuracy": 0.9328850126266479, "num_tokens": 47205376.0, "step": 450 }, { "entropy": 1.23328125, "epoch": 0.012242299593555654, "grad_norm": 8.875, "learning_rate": 7.329253365973072e-07, "loss": 0.464, "mean_token_accuracy": 0.9372936522960663, "num_tokens": 52484312.0, "step": 500 }, { "entropy": 1.2515625, "epoch": 0.013466529552911218, "grad_norm": 10.375, "learning_rate": 8.063647490820073e-07, "loss": 0.4469, "mean_token_accuracy": 0.9350792992115021, "num_tokens": 57610761.0, "step": 550 }, { "entropy": 1.28046875, "epoch": 0.014690759512266784, "grad_norm": 7.8125, "learning_rate": 8.798041615667075e-07, "loss": 0.4233, "mean_token_accuracy": 0.9361693727970123, "num_tokens": 62744047.0, "step": 600 }, { "entropy": 1.30703125, "epoch": 0.01591498947162235, "grad_norm": 11.375, "learning_rate": 9.532435740514075e-07, "loss": 0.4228, "mean_token_accuracy": 0.9353253149986267, "num_tokens": 68113654.0, "step": 650 }, { "entropy": 1.3890625, "epoch": 0.017139219430977914, "grad_norm": 4.96875, "learning_rate": 1.0266829865361079e-06, "loss": 0.3897, "mean_token_accuracy": 0.934454687833786, "num_tokens": 73614017.0, "step": 700 }, { "entropy": 1.4703125, "epoch": 0.01836344939033348, "grad_norm": 4.25, "learning_rate": 1.100122399020808e-06, "loss": 0.3618, "mean_token_accuracy": 0.9344593751430511, "num_tokens": 79174232.0, "step": 750 }, { "entropy": 1.52625, "epoch": 0.019587679349689045, "grad_norm": 3.671875, "learning_rate": 1.173561811505508e-06, "loss": 0.3804, "mean_token_accuracy": 0.9336516118049621, "num_tokens": 84608483.0, "step": 800 }, { "entropy": 1.5390625, "epoch": 0.02081190930904461, "grad_norm": 3.8125, "learning_rate": 1.2470012239902082e-06, "loss": 0.353, "mean_token_accuracy": 0.9379545438289643, "num_tokens": 89999996.0, "step": 850 }, { "entropy": 1.54515625, "epoch": 0.022036139268400177, "grad_norm": 3.296875, "learning_rate": 1.3204406364749082e-06, "loss": 0.3294, "mean_token_accuracy": 0.9422214996814727, "num_tokens": 95124008.0, "step": 900 }, { "entropy": 1.5690625, "epoch": 0.02326036922775574, "grad_norm": 4.09375, "learning_rate": 1.3938800489596082e-06, "loss": 0.3514, "mean_token_accuracy": 0.9378081679344177, "num_tokens": 100136013.0, "step": 950 }, { "entropy": 1.55765625, "epoch": 0.02448459918711131, "grad_norm": 3.171875, "learning_rate": 1.4673194614443085e-06, "loss": 0.3343, "mean_token_accuracy": 0.9409069657325745, "num_tokens": 105114554.0, "step": 1000 }, { "entropy": 1.53890625, "epoch": 0.025708829146466872, "grad_norm": 3.296875, "learning_rate": 1.5407588739290085e-06, "loss": 0.3284, "mean_token_accuracy": 0.9414800906181335, "num_tokens": 110370176.0, "step": 1050 }, { "entropy": 1.5546875, "epoch": 0.026933059105822436, "grad_norm": 3.625, "learning_rate": 1.6141982864137085e-06, "loss": 0.3183, "mean_token_accuracy": 0.9426046288013459, "num_tokens": 115321229.0, "step": 1100 }, { "entropy": 1.55046875, "epoch": 0.028157289065178004, "grad_norm": 2.515625, "learning_rate": 1.687637698898409e-06, "loss": 0.332, "mean_token_accuracy": 0.9409434747695923, "num_tokens": 120648053.0, "step": 1150 }, { "entropy": 1.54046875, "epoch": 0.029381519024533568, "grad_norm": 2.578125, "learning_rate": 1.761077111383109e-06, "loss": 0.3266, "mean_token_accuracy": 0.941448210477829, "num_tokens": 126142957.0, "step": 1200 }, { "entropy": 1.54453125, "epoch": 0.030605748983889135, "grad_norm": 3.484375, "learning_rate": 1.8345165238678093e-06, "loss": 0.3357, "mean_token_accuracy": 0.9392308318614959, "num_tokens": 131721983.0, "step": 1250 }, { "entropy": 1.54265625, "epoch": 0.0318299789432447, "grad_norm": 3.890625, "learning_rate": 1.9079559363525093e-06, "loss": 0.323, "mean_token_accuracy": 0.9425621521472931, "num_tokens": 136834110.0, "step": 1300 }, { "entropy": 1.55375, "epoch": 0.03305420890260027, "grad_norm": 3.046875, "learning_rate": 1.9813953488372093e-06, "loss": 0.3103, "mean_token_accuracy": 0.9435288536548615, "num_tokens": 142077567.0, "step": 1350 }, { "entropy": 1.5815625, "epoch": 0.03427843886195583, "grad_norm": 1.703125, "learning_rate": 2.0548347613219094e-06, "loss": 0.325, "mean_token_accuracy": 0.9404278743267059, "num_tokens": 147938512.0, "step": 1400 }, { "entropy": 1.603125, "epoch": 0.035502668821311395, "grad_norm": 2.46875, "learning_rate": 2.1282741738066094e-06, "loss": 0.292, "mean_token_accuracy": 0.94657958984375, "num_tokens": 152967734.0, "step": 1450 }, { "entropy": 1.5790625, "epoch": 0.03672689878066696, "grad_norm": 4.84375, "learning_rate": 2.20171358629131e-06, "loss": 0.3027, "mean_token_accuracy": 0.9442594313621521, "num_tokens": 158252140.0, "step": 1500 }, { "entropy": 1.5709375, "epoch": 0.03795112874002252, "grad_norm": 2.796875, "learning_rate": 2.27515299877601e-06, "loss": 0.2935, "mean_token_accuracy": 0.9451294171810151, "num_tokens": 163906622.0, "step": 1550 }, { "entropy": 1.58890625, "epoch": 0.03917535869937809, "grad_norm": 2.703125, "learning_rate": 2.34859241126071e-06, "loss": 0.2963, "mean_token_accuracy": 0.9451341640949249, "num_tokens": 169532126.0, "step": 1600 }, { "entropy": 1.55125, "epoch": 0.04039958865873366, "grad_norm": 2.46875, "learning_rate": 2.42203182374541e-06, "loss": 0.2704, "mean_token_accuracy": 0.9490817248821258, "num_tokens": 174811062.0, "step": 1650 }, { "entropy": 1.54515625, "epoch": 0.04162381861808922, "grad_norm": 1.9296875, "learning_rate": 2.49547123623011e-06, "loss": 0.2704, "mean_token_accuracy": 0.9497274696826935, "num_tokens": 180019609.0, "step": 1700 }, { "entropy": 1.545625, "epoch": 0.042848048577444786, "grad_norm": 2.171875, "learning_rate": 2.56891064871481e-06, "loss": 0.2729, "mean_token_accuracy": 0.9472972440719605, "num_tokens": 185410342.0, "step": 1750 }, { "entropy": 1.54171875, "epoch": 0.044072278536800354, "grad_norm": 2.3125, "learning_rate": 2.6423500611995105e-06, "loss": 0.2723, "mean_token_accuracy": 0.9487947750091553, "num_tokens": 190878398.0, "step": 1800 }, { "entropy": 1.54234375, "epoch": 0.04529650849615592, "grad_norm": 2.8125, "learning_rate": 2.715789473684211e-06, "loss": 0.2761, "mean_token_accuracy": 0.9490153706073761, "num_tokens": 196027836.0, "step": 1850 }, { "entropy": 1.53703125, "epoch": 0.04652073845551148, "grad_norm": 2.578125, "learning_rate": 2.789228886168911e-06, "loss": 0.2882, "mean_token_accuracy": 0.9455779695510864, "num_tokens": 201616019.0, "step": 1900 }, { "entropy": 1.52796875, "epoch": 0.04774496841486705, "grad_norm": 1.671875, "learning_rate": 2.862668298653611e-06, "loss": 0.2609, "mean_token_accuracy": 0.9508947324752808, "num_tokens": 206756502.0, "step": 1950 }, { "entropy": 1.51796875, "epoch": 0.04896919837422262, "grad_norm": 2.296875, "learning_rate": 2.936107711138311e-06, "loss": 0.2627, "mean_token_accuracy": 0.9504942214488983, "num_tokens": 211966544.0, "step": 2000 }, { "entropy": 1.5309375, "epoch": 0.05019342833357818, "grad_norm": 2.84375, "learning_rate": 3.0095471236230106e-06, "loss": 0.2622, "mean_token_accuracy": 0.9505769073963165, "num_tokens": 217058889.0, "step": 2050 }, { "entropy": 1.52125, "epoch": 0.051417658292933745, "grad_norm": 2.203125, "learning_rate": 3.082986536107711e-06, "loss": 0.271, "mean_token_accuracy": 0.9492247033119202, "num_tokens": 222302364.0, "step": 2100 }, { "entropy": 1.5103125, "epoch": 0.05264188825228931, "grad_norm": 2.046875, "learning_rate": 3.1564259485924115e-06, "loss": 0.2836, "mean_token_accuracy": 0.9467552089691162, "num_tokens": 227892169.0, "step": 2150 }, { "entropy": 1.5121875, "epoch": 0.05386611821164487, "grad_norm": 1.6796875, "learning_rate": 3.2298653610771116e-06, "loss": 0.2772, "mean_token_accuracy": 0.9473760116100312, "num_tokens": 233522252.0, "step": 2200 }, { "entropy": 1.51453125, "epoch": 0.05509034817100044, "grad_norm": 2.46875, "learning_rate": 3.303304773561812e-06, "loss": 0.2814, "mean_token_accuracy": 0.9471155571937561, "num_tokens": 239241678.0, "step": 2250 }, { "entropy": 1.50359375, "epoch": 0.05631457813035601, "grad_norm": 6.65625, "learning_rate": 3.3767441860465116e-06, "loss": 0.252, "mean_token_accuracy": 0.9517535066604614, "num_tokens": 244573352.0, "step": 2300 }, { "entropy": 1.489375, "epoch": 0.05753880808971157, "grad_norm": 2.578125, "learning_rate": 3.450183598531212e-06, "loss": 0.2686, "mean_token_accuracy": 0.9490506839752197, "num_tokens": 249799704.0, "step": 2350 }, { "entropy": 1.5084375, "epoch": 0.058763038049067136, "grad_norm": 2.859375, "learning_rate": 3.5236230110159117e-06, "loss": 0.2593, "mean_token_accuracy": 0.951296364068985, "num_tokens": 255107263.0, "step": 2400 }, { "entropy": 1.49984375, "epoch": 0.0599872680084227, "grad_norm": 3.03125, "learning_rate": 3.597062423500612e-06, "loss": 0.2734, "mean_token_accuracy": 0.9485626530647278, "num_tokens": 260533835.0, "step": 2450 }, { "entropy": 1.48375, "epoch": 0.06121149796777827, "grad_norm": 1.0390625, "learning_rate": 3.670501835985312e-06, "loss": 0.2529, "mean_token_accuracy": 0.9517404818534851, "num_tokens": 265773085.0, "step": 2500 }, { "entropy": 1.4821875, "epoch": 0.06243572792713383, "grad_norm": 2.703125, "learning_rate": 3.743941248470012e-06, "loss": 0.2616, "mean_token_accuracy": 0.9500162851810455, "num_tokens": 271036305.0, "step": 2550 }, { "entropy": 1.47765625, "epoch": 0.0636599578864894, "grad_norm": 1.953125, "learning_rate": 3.817380660954712e-06, "loss": 0.2462, "mean_token_accuracy": 0.9525675988197326, "num_tokens": 275952458.0, "step": 2600 }, { "entropy": 1.48328125, "epoch": 0.06488418784584496, "grad_norm": 2.125, "learning_rate": 3.890820073439412e-06, "loss": 0.2592, "mean_token_accuracy": 0.9498057246208191, "num_tokens": 281644324.0, "step": 2650 }, { "entropy": 1.47390625, "epoch": 0.06610841780520053, "grad_norm": 1.90625, "learning_rate": 3.964259485924113e-06, "loss": 0.2416, "mean_token_accuracy": 0.9530806469917298, "num_tokens": 286839400.0, "step": 2700 }, { "entropy": 1.471875, "epoch": 0.0673326477645561, "grad_norm": 1.9296875, "learning_rate": 4.037698898408813e-06, "loss": 0.2483, "mean_token_accuracy": 0.9517722308635712, "num_tokens": 292713093.0, "step": 2750 }, { "entropy": 1.47, "epoch": 0.06855687772391166, "grad_norm": 1.8984375, "learning_rate": 4.111138310893514e-06, "loss": 0.2357, "mean_token_accuracy": 0.9542278277873993, "num_tokens": 297994633.0, "step": 2800 }, { "entropy": 1.48640625, "epoch": 0.06978110768326723, "grad_norm": 2.328125, "learning_rate": 4.184577723378213e-06, "loss": 0.2434, "mean_token_accuracy": 0.9529701387882232, "num_tokens": 303305735.0, "step": 2850 }, { "entropy": 1.46640625, "epoch": 0.07100533764262279, "grad_norm": 1.9765625, "learning_rate": 4.258017135862914e-06, "loss": 0.2228, "mean_token_accuracy": 0.9564117324352265, "num_tokens": 308665099.0, "step": 2900 }, { "entropy": 1.47671875, "epoch": 0.07222956760197835, "grad_norm": 2.546875, "learning_rate": 4.331456548347613e-06, "loss": 0.2485, "mean_token_accuracy": 0.9520271122455597, "num_tokens": 313894105.0, "step": 2950 }, { "entropy": 1.46984375, "epoch": 0.07345379756133393, "grad_norm": 2.125, "learning_rate": 4.404895960832314e-06, "loss": 0.2354, "mean_token_accuracy": 0.9531759965419769, "num_tokens": 319439357.0, "step": 3000 }, { "entropy": 1.479375, "epoch": 0.07467802752068949, "grad_norm": 1.96875, "learning_rate": 4.478335373317013e-06, "loss": 0.2506, "mean_token_accuracy": 0.9517410743236542, "num_tokens": 325090760.0, "step": 3050 }, { "entropy": 1.475, "epoch": 0.07590225748004505, "grad_norm": 1.6796875, "learning_rate": 4.551774785801714e-06, "loss": 0.2273, "mean_token_accuracy": 0.955747674703598, "num_tokens": 330405470.0, "step": 3100 }, { "entropy": 1.47546875, "epoch": 0.07712648743940062, "grad_norm": 1.8828125, "learning_rate": 4.6252141982864134e-06, "loss": 0.2391, "mean_token_accuracy": 0.9522111368179321, "num_tokens": 335678826.0, "step": 3150 }, { "entropy": 1.4603125, "epoch": 0.07835071739875618, "grad_norm": 1.53125, "learning_rate": 4.698653610771114e-06, "loss": 0.2344, "mean_token_accuracy": 0.9539849495887757, "num_tokens": 340918671.0, "step": 3200 }, { "entropy": 1.4509375, "epoch": 0.07957494735811174, "grad_norm": 2.5625, "learning_rate": 4.7720930232558135e-06, "loss": 0.2191, "mean_token_accuracy": 0.9559646666049957, "num_tokens": 346171106.0, "step": 3250 }, { "entropy": 1.454375, "epoch": 0.08079917731746732, "grad_norm": 5.6875, "learning_rate": 4.845532435740514e-06, "loss": 0.2356, "mean_token_accuracy": 0.9528819477558136, "num_tokens": 351560226.0, "step": 3300 }, { "entropy": 1.46609375, "epoch": 0.08202340727682288, "grad_norm": 1.859375, "learning_rate": 4.918971848225214e-06, "loss": 0.2387, "mean_token_accuracy": 0.9533221650123597, "num_tokens": 357311606.0, "step": 3350 }, { "entropy": 1.45046875, "epoch": 0.08324763723617844, "grad_norm": 3.0625, "learning_rate": 4.992411260709914e-06, "loss": 0.218, "mean_token_accuracy": 0.9566865241527558, "num_tokens": 362184714.0, "step": 3400 }, { "entropy": 1.44765625, "epoch": 0.08447186719553401, "grad_norm": 3.03125, "learning_rate": 5.0658506731946145e-06, "loss": 0.2163, "mean_token_accuracy": 0.9571156585216523, "num_tokens": 367118033.0, "step": 3450 }, { "entropy": 1.4721875, "epoch": 0.08569609715488957, "grad_norm": 1.90625, "learning_rate": 5.139290085679315e-06, "loss": 0.2269, "mean_token_accuracy": 0.9551365935802459, "num_tokens": 372554179.0, "step": 3500 }, { "entropy": 1.43546875, "epoch": 0.08692032711424515, "grad_norm": 2.65625, "learning_rate": 5.212729498164015e-06, "loss": 0.2235, "mean_token_accuracy": 0.9559626686573028, "num_tokens": 377909880.0, "step": 3550 }, { "entropy": 1.4384375, "epoch": 0.08814455707360071, "grad_norm": 1.7578125, "learning_rate": 5.286168910648715e-06, "loss": 0.2151, "mean_token_accuracy": 0.9575206100940704, "num_tokens": 383194488.0, "step": 3600 }, { "entropy": 1.42265625, "epoch": 0.08936878703295627, "grad_norm": 1.9609375, "learning_rate": 5.3596083231334154e-06, "loss": 0.229, "mean_token_accuracy": 0.9538651633262635, "num_tokens": 389073618.0, "step": 3650 }, { "entropy": 1.429375, "epoch": 0.09059301699231184, "grad_norm": 2.15625, "learning_rate": 5.433047735618115e-06, "loss": 0.2294, "mean_token_accuracy": 0.9545065891742707, "num_tokens": 394553347.0, "step": 3700 }, { "entropy": 1.42375, "epoch": 0.0918172469516674, "grad_norm": 2.078125, "learning_rate": 5.5064871481028155e-06, "loss": 0.2085, "mean_token_accuracy": 0.9575728678703308, "num_tokens": 399579739.0, "step": 3750 }, { "entropy": 1.411875, "epoch": 0.09304147691102296, "grad_norm": 1.6484375, "learning_rate": 5.579926560587515e-06, "loss": 0.2211, "mean_token_accuracy": 0.9557280552387237, "num_tokens": 404841496.0, "step": 3800 }, { "entropy": 1.40765625, "epoch": 0.09426570687037854, "grad_norm": 2.015625, "learning_rate": 5.6533659730722156e-06, "loss": 0.2125, "mean_token_accuracy": 0.9576599287986756, "num_tokens": 410023001.0, "step": 3850 }, { "entropy": 1.42984375, "epoch": 0.0954899368297341, "grad_norm": 2.640625, "learning_rate": 5.726805385556916e-06, "loss": 0.2279, "mean_token_accuracy": 0.9547258257865906, "num_tokens": 415549547.0, "step": 3900 }, { "entropy": 1.3978125, "epoch": 0.09671416678908966, "grad_norm": 2.59375, "learning_rate": 5.800244798041616e-06, "loss": 0.2232, "mean_token_accuracy": 0.9551710951328277, "num_tokens": 421034105.0, "step": 3950 }, { "entropy": 1.38796875, "epoch": 0.09793839674844523, "grad_norm": 1.515625, "learning_rate": 5.873684210526316e-06, "loss": 0.2162, "mean_token_accuracy": 0.9557711553573608, "num_tokens": 426688731.0, "step": 4000 }, { "entropy": 1.3903125, "epoch": 0.0991626267078008, "grad_norm": 10.25, "learning_rate": 5.947123623011016e-06, "loss": 0.2102, "mean_token_accuracy": 0.9573217809200287, "num_tokens": 431945587.0, "step": 4050 }, { "entropy": 1.37515625, "epoch": 0.10038685666715635, "grad_norm": 2.703125, "learning_rate": 5.9999995181245345e-06, "loss": 0.2068, "mean_token_accuracy": 0.9580986511707306, "num_tokens": 436945746.0, "step": 4100 }, { "entropy": 1.3790625, "epoch": 0.10161108662651193, "grad_norm": 2.171875, "learning_rate": 5.999989929791556e-06, "loss": 0.2008, "mean_token_accuracy": 0.9594962692260742, "num_tokens": 441913649.0, "step": 4150 }, { "entropy": 1.39890625, "epoch": 0.10283531658586749, "grad_norm": 2.25, "learning_rate": 5.9999680487622435e-06, "loss": 0.2158, "mean_token_accuracy": 0.9564687287807465, "num_tokens": 447263639.0, "step": 4200 }, { "entropy": 1.39796875, "epoch": 0.10405954654522305, "grad_norm": 2.359375, "learning_rate": 5.999933875126256e-06, "loss": 0.2235, "mean_token_accuracy": 0.9537206184864044, "num_tokens": 452831245.0, "step": 4250 }, { "entropy": 1.40046875, "epoch": 0.10528377650457862, "grad_norm": 2.484375, "learning_rate": 5.999887409023625e-06, "loss": 0.1983, "mean_token_accuracy": 0.9605963575839996, "num_tokens": 457920235.0, "step": 4300 }, { "entropy": 1.37109375, "epoch": 0.10650800646393419, "grad_norm": 2.46875, "learning_rate": 5.9998286506447455e-06, "loss": 0.1985, "mean_token_accuracy": 0.9589159560203552, "num_tokens": 463428491.0, "step": 4350 }, { "entropy": 1.393125, "epoch": 0.10773223642328975, "grad_norm": 2.4375, "learning_rate": 5.999757600230387e-06, "loss": 0.2181, "mean_token_accuracy": 0.9564608442783356, "num_tokens": 469183579.0, "step": 4400 }, { "entropy": 1.40828125, "epoch": 0.10895646638264532, "grad_norm": 1.953125, "learning_rate": 5.999674258071684e-06, "loss": 0.1997, "mean_token_accuracy": 0.9596063613891601, "num_tokens": 474548123.0, "step": 4450 }, { "entropy": 1.38171875, "epoch": 0.11018069634200088, "grad_norm": 2.25, "learning_rate": 5.999578624510137e-06, "loss": 0.2113, "mean_token_accuracy": 0.9565052735805512, "num_tokens": 480099691.0, "step": 4500 }, { "entropy": 1.39328125, "epoch": 0.11140492630135644, "grad_norm": 2.328125, "learning_rate": 5.9994706999376126e-06, "loss": 0.2096, "mean_token_accuracy": 0.9578315222263336, "num_tokens": 485485141.0, "step": 4550 }, { "entropy": 1.39828125, "epoch": 0.11262915626071202, "grad_norm": 2.125, "learning_rate": 5.999350484796339e-06, "loss": 0.1935, "mean_token_accuracy": 0.9609186232089997, "num_tokens": 490314941.0, "step": 4600 }, { "entropy": 1.41859375, "epoch": 0.11385338622006758, "grad_norm": 2.28125, "learning_rate": 5.999217979578909e-06, "loss": 0.2132, "mean_token_accuracy": 0.9569031345844269, "num_tokens": 495604676.0, "step": 4650 }, { "entropy": 1.41984375, "epoch": 0.11507761617942314, "grad_norm": 1.90625, "learning_rate": 5.999073184828273e-06, "loss": 0.1948, "mean_token_accuracy": 0.9596328222751618, "num_tokens": 500772718.0, "step": 4700 }, { "entropy": 1.42, "epoch": 0.11630184613877871, "grad_norm": 2.75, "learning_rate": 5.998916101137737e-06, "loss": 0.2128, "mean_token_accuracy": 0.9574012553691864, "num_tokens": 506105312.0, "step": 4750 }, { "entropy": 1.40890625, "epoch": 0.11752607609813427, "grad_norm": 2.671875, "learning_rate": 5.998746729150967e-06, "loss": 0.2019, "mean_token_accuracy": 0.958700270652771, "num_tokens": 511311990.0, "step": 4800 }, { "entropy": 1.41671875, "epoch": 0.11875030605748983, "grad_norm": 1.515625, "learning_rate": 5.998565069561976e-06, "loss": 0.2044, "mean_token_accuracy": 0.9582890093326568, "num_tokens": 516615202.0, "step": 4850 }, { "entropy": 1.4115625, "epoch": 0.1199745360168454, "grad_norm": 1.828125, "learning_rate": 5.998371123115128e-06, "loss": 0.207, "mean_token_accuracy": 0.9571990466117859, "num_tokens": 521934656.0, "step": 4900 }, { "entropy": 1.396875, "epoch": 0.12119876597620097, "grad_norm": 2.140625, "learning_rate": 5.9981648906051355e-06, "loss": 0.2069, "mean_token_accuracy": 0.9578309345245362, "num_tokens": 527328179.0, "step": 4950 }, { "entropy": 1.41046875, "epoch": 0.12242299593555654, "grad_norm": 2.484375, "learning_rate": 5.9979463728770525e-06, "loss": 0.1965, "mean_token_accuracy": 0.9601268231868744, "num_tokens": 532420262.0, "step": 5000 }, { "entropy": 1.3953125, "epoch": 0.1236472258949121, "grad_norm": 2.46875, "learning_rate": 5.997715570826272e-06, "loss": 0.1938, "mean_token_accuracy": 0.9605181181430816, "num_tokens": 537756232.0, "step": 5050 }, { "entropy": 1.390625, "epoch": 0.12487145585426766, "grad_norm": 1.5703125, "learning_rate": 5.997472485398524e-06, "loss": 0.2038, "mean_token_accuracy": 0.9585963201522827, "num_tokens": 543281806.0, "step": 5100 }, { "entropy": 1.4215625, "epoch": 0.12609568581362324, "grad_norm": 1.75, "learning_rate": 5.99721711758987e-06, "loss": 0.1969, "mean_token_accuracy": 0.9599570655822753, "num_tokens": 548233812.0, "step": 5150 }, { "entropy": 1.40515625, "epoch": 0.1273199157729788, "grad_norm": 2.375, "learning_rate": 5.9969494684466985e-06, "loss": 0.2041, "mean_token_accuracy": 0.9577370703220367, "num_tokens": 553736654.0, "step": 5200 }, { "entropy": 1.3990625, "epoch": 0.12854414573233436, "grad_norm": 2.140625, "learning_rate": 5.996669539065727e-06, "loss": 0.1945, "mean_token_accuracy": 0.9612773549556732, "num_tokens": 558856334.0, "step": 5250 }, { "entropy": 1.40203125, "epoch": 0.12976837569168992, "grad_norm": 1.7734375, "learning_rate": 5.996377330593983e-06, "loss": 0.2145, "mean_token_accuracy": 0.9565242063999176, "num_tokens": 564032272.0, "step": 5300 }, { "entropy": 1.39671875, "epoch": 0.13099260565104548, "grad_norm": 2.09375, "learning_rate": 5.9960728442288186e-06, "loss": 0.1992, "mean_token_accuracy": 0.958374012708664, "num_tokens": 569306892.0, "step": 5350 }, { "entropy": 1.38578125, "epoch": 0.13221683561040107, "grad_norm": 2.6875, "learning_rate": 5.995756081217889e-06, "loss": 0.1979, "mean_token_accuracy": 0.9593621265888214, "num_tokens": 574741752.0, "step": 5400 }, { "entropy": 1.38234375, "epoch": 0.13344106556975663, "grad_norm": 2.15625, "learning_rate": 5.9954270428591555e-06, "loss": 0.2003, "mean_token_accuracy": 0.9591895163059234, "num_tokens": 580457265.0, "step": 5450 }, { "entropy": 1.394375, "epoch": 0.1346652955291122, "grad_norm": 2.078125, "learning_rate": 5.995085730500878e-06, "loss": 0.1896, "mean_token_accuracy": 0.9607266175746918, "num_tokens": 585705175.0, "step": 5500 }, { "entropy": 1.39078125, "epoch": 0.13588952548846775, "grad_norm": 1.5234375, "learning_rate": 5.994732145541613e-06, "loss": 0.2003, "mean_token_accuracy": 0.9587921166419983, "num_tokens": 590923544.0, "step": 5550 }, { "entropy": 1.380625, "epoch": 0.1371137554478233, "grad_norm": 3.265625, "learning_rate": 5.9943662894302e-06, "loss": 0.1945, "mean_token_accuracy": 0.9587338602542878, "num_tokens": 596469221.0, "step": 5600 }, { "entropy": 1.4028125, "epoch": 0.1383379854071789, "grad_norm": 1.5859375, "learning_rate": 5.993988163665767e-06, "loss": 0.2225, "mean_token_accuracy": 0.9551014530658722, "num_tokens": 602167038.0, "step": 5650 }, { "entropy": 1.3846875, "epoch": 0.13956221536653446, "grad_norm": 2.640625, "learning_rate": 5.9935977697977114e-06, "loss": 0.201, "mean_token_accuracy": 0.958451042175293, "num_tokens": 607292638.0, "step": 5700 }, { "entropy": 1.3784375, "epoch": 0.14078644532589002, "grad_norm": 2.203125, "learning_rate": 5.993195109425705e-06, "loss": 0.2112, "mean_token_accuracy": 0.9564135050773621, "num_tokens": 613202323.0, "step": 5750 }, { "entropy": 1.38828125, "epoch": 0.14201067528524558, "grad_norm": 2.40625, "learning_rate": 5.9927801841996784e-06, "loss": 0.1937, "mean_token_accuracy": 0.9602376103401185, "num_tokens": 618640198.0, "step": 5800 }, { "entropy": 1.385, "epoch": 0.14323490524460114, "grad_norm": 2.609375, "learning_rate": 5.992352995819822e-06, "loss": 0.2075, "mean_token_accuracy": 0.9579639828205109, "num_tokens": 623893423.0, "step": 5850 }, { "entropy": 1.375625, "epoch": 0.1444591352039567, "grad_norm": 2.84375, "learning_rate": 5.991913546036574e-06, "loss": 0.2106, "mean_token_accuracy": 0.9564978110790253, "num_tokens": 629592369.0, "step": 5900 }, { "entropy": 1.37296875, "epoch": 0.1456833651633123, "grad_norm": 2.078125, "learning_rate": 5.991461836650615e-06, "loss": 0.211, "mean_token_accuracy": 0.9563369131088257, "num_tokens": 635736307.0, "step": 5950 }, { "entropy": 1.38203125, "epoch": 0.14690759512266785, "grad_norm": 3.0, "learning_rate": 5.990997869512859e-06, "loss": 0.1961, "mean_token_accuracy": 0.9592690026760101, "num_tokens": 641116233.0, "step": 6000 }, { "entropy": 1.378125, "epoch": 0.1481318250820234, "grad_norm": 2.65625, "learning_rate": 5.990521646524447e-06, "loss": 0.2008, "mean_token_accuracy": 0.9585745882987976, "num_tokens": 646167116.0, "step": 6050 }, { "entropy": 1.37140625, "epoch": 0.14935605504137897, "grad_norm": 2.25, "learning_rate": 5.990033169636744e-06, "loss": 0.1783, "mean_token_accuracy": 0.962623051404953, "num_tokens": 651158602.0, "step": 6100 }, { "entropy": 1.38609375, "epoch": 0.15058028500073453, "grad_norm": 2.390625, "learning_rate": 5.989532440851319e-06, "loss": 0.1925, "mean_token_accuracy": 0.9600079596042633, "num_tokens": 656353157.0, "step": 6150 }, { "entropy": 1.375625, "epoch": 0.1518045149600901, "grad_norm": 2.09375, "learning_rate": 5.98901946221995e-06, "loss": 0.1956, "mean_token_accuracy": 0.9591733336448669, "num_tokens": 661516084.0, "step": 6200 }, { "entropy": 1.3775, "epoch": 0.15302874491944568, "grad_norm": 2.59375, "learning_rate": 5.988494235844608e-06, "loss": 0.1857, "mean_token_accuracy": 0.9618037152290344, "num_tokens": 666952800.0, "step": 6250 }, { "entropy": 1.3721875, "epoch": 0.15425297487880124, "grad_norm": 1.546875, "learning_rate": 5.987956763877448e-06, "loss": 0.1994, "mean_token_accuracy": 0.9587778007984161, "num_tokens": 672306196.0, "step": 6300 }, { "entropy": 1.390625, "epoch": 0.1554772048381568, "grad_norm": 2.1875, "learning_rate": 5.987407048520806e-06, "loss": 0.1843, "mean_token_accuracy": 0.9617053723335266, "num_tokens": 677399978.0, "step": 6350 }, { "entropy": 1.38171875, "epoch": 0.15670143479751236, "grad_norm": 1.8671875, "learning_rate": 5.986845092027181e-06, "loss": 0.1937, "mean_token_accuracy": 0.9602959334850312, "num_tokens": 682747630.0, "step": 6400 }, { "entropy": 1.38578125, "epoch": 0.15792566475686792, "grad_norm": 2.671875, "learning_rate": 5.986270896699237e-06, "loss": 0.177, "mean_token_accuracy": 0.964161764383316, "num_tokens": 687573308.0, "step": 6450 }, { "entropy": 1.394375, "epoch": 0.15914989471622348, "grad_norm": 2.15625, "learning_rate": 5.985684464889784e-06, "loss": 0.1956, "mean_token_accuracy": 0.9590267181396485, "num_tokens": 692719553.0, "step": 6500 }, { "entropy": 1.4165625, "epoch": 0.16037412467557907, "grad_norm": 2.640625, "learning_rate": 5.985085799001773e-06, "loss": 0.21, "mean_token_accuracy": 0.9567484962940216, "num_tokens": 698446523.0, "step": 6550 }, { "entropy": 1.39546875, "epoch": 0.16159835463493463, "grad_norm": 1.8984375, "learning_rate": 5.984474901488284e-06, "loss": 0.1936, "mean_token_accuracy": 0.9587848937511444, "num_tokens": 703964383.0, "step": 6600 }, { "entropy": 1.3865625, "epoch": 0.1628225845942902, "grad_norm": 2.5625, "learning_rate": 5.983851774852519e-06, "loss": 0.1814, "mean_token_accuracy": 0.9620046615600586, "num_tokens": 708987822.0, "step": 6650 }, { "entropy": 1.38390625, "epoch": 0.16404681455364575, "grad_norm": 1.6015625, "learning_rate": 5.983216421647789e-06, "loss": 0.1997, "mean_token_accuracy": 0.9585830473899841, "num_tokens": 714405287.0, "step": 6700 }, { "entropy": 1.37453125, "epoch": 0.16527104451300131, "grad_norm": 2.40625, "learning_rate": 5.982568844477502e-06, "loss": 0.1944, "mean_token_accuracy": 0.9597526073455811, "num_tokens": 719693246.0, "step": 6750 }, { "entropy": 1.34859375, "epoch": 0.16649527447235687, "grad_norm": 2.265625, "learning_rate": 5.9819090459951595e-06, "loss": 0.1792, "mean_token_accuracy": 0.9628249955177307, "num_tokens": 724856885.0, "step": 6800 }, { "entropy": 1.37203125, "epoch": 0.16771950443171246, "grad_norm": 1.921875, "learning_rate": 5.981237028904336e-06, "loss": 0.2106, "mean_token_accuracy": 0.9559559297561645, "num_tokens": 730337882.0, "step": 6850 }, { "entropy": 1.3596875, "epoch": 0.16894373439106802, "grad_norm": 2.78125, "learning_rate": 5.980552795958676e-06, "loss": 0.1715, "mean_token_accuracy": 0.964083902835846, "num_tokens": 735194384.0, "step": 6900 }, { "entropy": 1.37875, "epoch": 0.17016796435042358, "grad_norm": 2.890625, "learning_rate": 5.979856349961876e-06, "loss": 0.1884, "mean_token_accuracy": 0.961032167673111, "num_tokens": 740456561.0, "step": 6950 }, { "entropy": 1.34078125, "epoch": 0.17139219430977914, "grad_norm": 1.875, "learning_rate": 5.979147693767682e-06, "loss": 0.1824, "mean_token_accuracy": 0.9612845265865326, "num_tokens": 745438122.0, "step": 7000 }, { "entropy": 1.35234375, "epoch": 0.1726164242691347, "grad_norm": 1.8828125, "learning_rate": 5.978426830279867e-06, "loss": 0.2001, "mean_token_accuracy": 0.9585837364196778, "num_tokens": 750857417.0, "step": 7050 }, { "entropy": 1.35828125, "epoch": 0.1738406542284903, "grad_norm": 1.5703125, "learning_rate": 5.977693762452226e-06, "loss": 0.2077, "mean_token_accuracy": 0.956944135427475, "num_tokens": 756565585.0, "step": 7100 }, { "entropy": 1.37453125, "epoch": 0.17506488418784585, "grad_norm": 1.59375, "learning_rate": 5.976948493288563e-06, "loss": 0.1978, "mean_token_accuracy": 0.9594669210910797, "num_tokens": 762042483.0, "step": 7150 }, { "entropy": 1.38609375, "epoch": 0.17628911414720141, "grad_norm": 1.96875, "learning_rate": 5.976191025842678e-06, "loss": 0.1967, "mean_token_accuracy": 0.9588606441020966, "num_tokens": 767082096.0, "step": 7200 }, { "entropy": 1.3721875, "epoch": 0.17751334410655698, "grad_norm": 2.4375, "learning_rate": 5.975421363218352e-06, "loss": 0.1896, "mean_token_accuracy": 0.9610229313373566, "num_tokens": 772416657.0, "step": 7250 }, { "entropy": 1.37078125, "epoch": 0.17873757406591254, "grad_norm": 2.46875, "learning_rate": 5.97463950856934e-06, "loss": 0.187, "mean_token_accuracy": 0.9611088275909424, "num_tokens": 777391863.0, "step": 7300 }, { "entropy": 1.3696875, "epoch": 0.1799618040252681, "grad_norm": 2.9375, "learning_rate": 5.973845465099352e-06, "loss": 0.196, "mean_token_accuracy": 0.9594384169578553, "num_tokens": 782502134.0, "step": 7350 }, { "entropy": 1.3825, "epoch": 0.18118603398462368, "grad_norm": 3.296875, "learning_rate": 5.973039236062047e-06, "loss": 0.1826, "mean_token_accuracy": 0.9621104383468628, "num_tokens": 787376887.0, "step": 7400 }, { "entropy": 1.3746875, "epoch": 0.18241026394397925, "grad_norm": 2.609375, "learning_rate": 5.9722208247610095e-06, "loss": 0.1904, "mean_token_accuracy": 0.9605046558380127, "num_tokens": 792554125.0, "step": 7450 }, { "entropy": 1.39890625, "epoch": 0.1836344939033348, "grad_norm": 2.375, "learning_rate": 5.971390234549746e-06, "loss": 0.1981, "mean_token_accuracy": 0.9588062584400177, "num_tokens": 797990011.0, "step": 7500 }, { "entropy": 1.39328125, "epoch": 0.18485872386269037, "grad_norm": 2.1875, "learning_rate": 5.970547468831664e-06, "loss": 0.1827, "mean_token_accuracy": 0.9626439011096954, "num_tokens": 802985973.0, "step": 7550 }, { "entropy": 1.40375, "epoch": 0.18608295382204593, "grad_norm": 2.140625, "learning_rate": 5.969692531060065e-06, "loss": 0.1851, "mean_token_accuracy": 0.9621277391910553, "num_tokens": 808398744.0, "step": 7600 }, { "entropy": 1.391875, "epoch": 0.1873071837814015, "grad_norm": 1.421875, "learning_rate": 5.9688254247381225e-06, "loss": 0.1859, "mean_token_accuracy": 0.9607931089401245, "num_tokens": 813549741.0, "step": 7650 }, { "entropy": 1.3784375, "epoch": 0.18853141374075708, "grad_norm": 3.171875, "learning_rate": 5.967946153418875e-06, "loss": 0.1862, "mean_token_accuracy": 0.9606724309921265, "num_tokens": 818604872.0, "step": 7700 }, { "entropy": 1.3865625, "epoch": 0.18975564370011264, "grad_norm": 2.046875, "learning_rate": 5.967054720705204e-06, "loss": 0.1934, "mean_token_accuracy": 0.9598609590530396, "num_tokens": 824064581.0, "step": 7750 }, { "entropy": 1.39875, "epoch": 0.1909798736594682, "grad_norm": 2.53125, "learning_rate": 5.966151130249828e-06, "loss": 0.1926, "mean_token_accuracy": 0.9593923246860504, "num_tokens": 829369830.0, "step": 7800 }, { "entropy": 1.3865625, "epoch": 0.19220410361882376, "grad_norm": 2.28125, "learning_rate": 5.965235385755279e-06, "loss": 0.1926, "mean_token_accuracy": 0.9593356001377106, "num_tokens": 834877335.0, "step": 7850 }, { "entropy": 1.39328125, "epoch": 0.19342833357817932, "grad_norm": 9.0, "learning_rate": 5.9643074909738936e-06, "loss": 0.1847, "mean_token_accuracy": 0.9613538563251496, "num_tokens": 840076176.0, "step": 7900 }, { "entropy": 1.38703125, "epoch": 0.19465256353753488, "grad_norm": 2.3125, "learning_rate": 5.963367449707793e-06, "loss": 0.1815, "mean_token_accuracy": 0.9614927160739899, "num_tokens": 845350867.0, "step": 7950 }, { "entropy": 1.39875, "epoch": 0.19587679349689047, "grad_norm": 1.8359375, "learning_rate": 5.962415265808872e-06, "loss": 0.1921, "mean_token_accuracy": 0.9596588695049286, "num_tokens": 850547684.0, "step": 8000 }, { "entropy": 1.3890625, "epoch": 0.19710102345624603, "grad_norm": 2.6875, "learning_rate": 5.961450943178779e-06, "loss": 0.1915, "mean_token_accuracy": 0.9603916919231414, "num_tokens": 855721426.0, "step": 8050 }, { "entropy": 1.37421875, "epoch": 0.1983252534156016, "grad_norm": 2.734375, "learning_rate": 5.960474485768902e-06, "loss": 0.1722, "mean_token_accuracy": 0.963141576051712, "num_tokens": 860509090.0, "step": 8100 }, { "entropy": 1.34984375, "epoch": 0.19954948337495715, "grad_norm": 2.109375, "learning_rate": 5.959485897580353e-06, "loss": 0.1799, "mean_token_accuracy": 0.9624167239665985, "num_tokens": 865732499.0, "step": 8150 }, { "entropy": 1.37765625, "epoch": 0.2007737133343127, "grad_norm": 2.875, "learning_rate": 5.95848518266395e-06, "loss": 0.1955, "mean_token_accuracy": 0.9592999804019928, "num_tokens": 870715442.0, "step": 8200 }, { "entropy": 1.3496875, "epoch": 0.20199794329366827, "grad_norm": 1.8359375, "learning_rate": 5.957472345120202e-06, "loss": 0.1826, "mean_token_accuracy": 0.9611281609535217, "num_tokens": 875976771.0, "step": 8250 }, { "entropy": 1.331875, "epoch": 0.20322217325302386, "grad_norm": 2.34375, "learning_rate": 5.95644738909929e-06, "loss": 0.1801, "mean_token_accuracy": 0.9619064545631408, "num_tokens": 881030532.0, "step": 8300 }, { "entropy": 1.33828125, "epoch": 0.20444640321237942, "grad_norm": 2.3125, "learning_rate": 5.9554103188010544e-06, "loss": 0.1844, "mean_token_accuracy": 0.9607453966140747, "num_tokens": 886102364.0, "step": 8350 }, { "entropy": 1.33625, "epoch": 0.20567063317173498, "grad_norm": 2.59375, "learning_rate": 5.9543611384749716e-06, "loss": 0.1896, "mean_token_accuracy": 0.9599519455432892, "num_tokens": 891339628.0, "step": 8400 }, { "entropy": 1.3515625, "epoch": 0.20689486313109054, "grad_norm": 3.1875, "learning_rate": 5.953299852420142e-06, "loss": 0.1963, "mean_token_accuracy": 0.9594342112541199, "num_tokens": 896598491.0, "step": 8450 }, { "entropy": 1.3475, "epoch": 0.2081190930904461, "grad_norm": 1.6171875, "learning_rate": 5.952226464985268e-06, "loss": 0.1876, "mean_token_accuracy": 0.9601819491386414, "num_tokens": 901857034.0, "step": 8500 }, { "entropy": 1.34546875, "epoch": 0.2093433230498017, "grad_norm": 2.484375, "learning_rate": 5.951140980568639e-06, "loss": 0.2025, "mean_token_accuracy": 0.9580735051631928, "num_tokens": 907672007.0, "step": 8550 }, { "entropy": 1.3434375, "epoch": 0.21056755300915725, "grad_norm": 2.859375, "learning_rate": 5.950043403618116e-06, "loss": 0.182, "mean_token_accuracy": 0.9620107614994049, "num_tokens": 912959621.0, "step": 8600 }, { "entropy": 1.34140625, "epoch": 0.2117917829685128, "grad_norm": 2.015625, "learning_rate": 5.948933738631106e-06, "loss": 0.182, "mean_token_accuracy": 0.9617352223396302, "num_tokens": 918075673.0, "step": 8650 }, { "entropy": 1.3446875, "epoch": 0.21301601292786837, "grad_norm": 2.625, "learning_rate": 5.9478119901545485e-06, "loss": 0.1863, "mean_token_accuracy": 0.960466115474701, "num_tokens": 923511470.0, "step": 8700 }, { "entropy": 1.3490625, "epoch": 0.21424024288722393, "grad_norm": 2.4375, "learning_rate": 5.946678162784898e-06, "loss": 0.1997, "mean_token_accuracy": 0.9574442803859711, "num_tokens": 929168035.0, "step": 8750 }, { "entropy": 1.3559375, "epoch": 0.2154644728465795, "grad_norm": 2.59375, "learning_rate": 5.945532261168101e-06, "loss": 0.188, "mean_token_accuracy": 0.9608505368232727, "num_tokens": 934643696.0, "step": 8800 }, { "entropy": 1.37, "epoch": 0.21668870280593508, "grad_norm": 2.84375, "learning_rate": 5.9443742899995815e-06, "loss": 0.1987, "mean_token_accuracy": 0.9590060126781463, "num_tokens": 940012909.0, "step": 8850 }, { "entropy": 1.360625, "epoch": 0.21791293276529064, "grad_norm": 2.28125, "learning_rate": 5.943204254024216e-06, "loss": 0.1835, "mean_token_accuracy": 0.9617989957332611, "num_tokens": 945384360.0, "step": 8900 }, { "entropy": 1.3675, "epoch": 0.2191371627246462, "grad_norm": 3.03125, "learning_rate": 5.942022158036322e-06, "loss": 0.1955, "mean_token_accuracy": 0.9601530432701111, "num_tokens": 950833742.0, "step": 8950 }, { "entropy": 1.38125, "epoch": 0.22036139268400176, "grad_norm": 2.578125, "learning_rate": 5.9408280068796286e-06, "loss": 0.2066, "mean_token_accuracy": 0.9570643317699432, "num_tokens": 956401892.0, "step": 9000 }, { "entropy": 1.37234375, "epoch": 0.22158562264335732, "grad_norm": 1.71875, "learning_rate": 5.939621805447267e-06, "loss": 0.1804, "mean_token_accuracy": 0.9623953711986541, "num_tokens": 961223140.0, "step": 9050 }, { "entropy": 1.391875, "epoch": 0.22280985260271288, "grad_norm": 2.15625, "learning_rate": 5.938403558681743e-06, "loss": 0.202, "mean_token_accuracy": 0.9580870044231414, "num_tokens": 966771629.0, "step": 9100 }, { "entropy": 1.36703125, "epoch": 0.22403408256206847, "grad_norm": 2.609375, "learning_rate": 5.9371732715749175e-06, "loss": 0.1866, "mean_token_accuracy": 0.9609157121181489, "num_tokens": 972305399.0, "step": 9150 }, { "entropy": 1.35140625, "epoch": 0.22525831252142403, "grad_norm": 1.6796875, "learning_rate": 5.935930949167991e-06, "loss": 0.1815, "mean_token_accuracy": 0.9617423331737518, "num_tokens": 977370470.0, "step": 9200 }, { "entropy": 1.36953125, "epoch": 0.2264825424807796, "grad_norm": 2.140625, "learning_rate": 5.934676596551477e-06, "loss": 0.1884, "mean_token_accuracy": 0.9609754991531372, "num_tokens": 982652269.0, "step": 9250 }, { "entropy": 1.363125, "epoch": 0.22770677244013515, "grad_norm": 2.484375, "learning_rate": 5.933410218865186e-06, "loss": 0.1858, "mean_token_accuracy": 0.9611955726146698, "num_tokens": 988014138.0, "step": 9300 }, { "entropy": 1.37265625, "epoch": 0.2289310023994907, "grad_norm": 2.53125, "learning_rate": 5.932131821298198e-06, "loss": 0.1856, "mean_token_accuracy": 0.9616758930683136, "num_tokens": 993370242.0, "step": 9350 }, { "entropy": 1.38515625, "epoch": 0.23015523235884627, "grad_norm": 2.34375, "learning_rate": 5.930841409088853e-06, "loss": 0.1906, "mean_token_accuracy": 0.9603582990169525, "num_tokens": 998918502.0, "step": 9400 }, { "entropy": 1.39, "epoch": 0.23137946231820186, "grad_norm": 2.578125, "learning_rate": 5.929538987524712e-06, "loss": 0.1854, "mean_token_accuracy": 0.9604568040370941, "num_tokens": 1004326538.0, "step": 9450 }, { "entropy": 1.3890625, "epoch": 0.23260369227755742, "grad_norm": 2.75, "learning_rate": 5.928224561942554e-06, "loss": 0.1812, "mean_token_accuracy": 0.9616895508766174, "num_tokens": 1009603548.0, "step": 9500 }, { "entropy": 1.3871875, "epoch": 0.23382792223691298, "grad_norm": 2.3125, "learning_rate": 5.92689813772834e-06, "loss": 0.1963, "mean_token_accuracy": 0.9590861582756043, "num_tokens": 1015070964.0, "step": 9550 }, { "entropy": 1.36609375, "epoch": 0.23505215219626854, "grad_norm": 3.109375, "learning_rate": 5.9255597203172e-06, "loss": 0.1828, "mean_token_accuracy": 0.9619620275497437, "num_tokens": 1020492153.0, "step": 9600 }, { "entropy": 1.38609375, "epoch": 0.2362763821556241, "grad_norm": 2.421875, "learning_rate": 5.924209315193405e-06, "loss": 0.1845, "mean_token_accuracy": 0.961515667438507, "num_tokens": 1025864529.0, "step": 9650 }, { "entropy": 1.3715625, "epoch": 0.23750061211497966, "grad_norm": 2.296875, "learning_rate": 5.922846927890345e-06, "loss": 0.1797, "mean_token_accuracy": 0.9618804860115051, "num_tokens": 1031024359.0, "step": 9700 }, { "entropy": 1.36359375, "epoch": 0.23872484207433525, "grad_norm": 2.46875, "learning_rate": 5.9214725639905115e-06, "loss": 0.1863, "mean_token_accuracy": 0.9610350334644318, "num_tokens": 1036377471.0, "step": 9750 }, { "entropy": 1.3715625, "epoch": 0.2399490720336908, "grad_norm": 2.859375, "learning_rate": 5.92008622912547e-06, "loss": 0.1831, "mean_token_accuracy": 0.9612818145751953, "num_tokens": 1041703688.0, "step": 9800 }, { "entropy": 1.35671875, "epoch": 0.24117330199304637, "grad_norm": 2.6875, "learning_rate": 5.918687928975836e-06, "loss": 0.1839, "mean_token_accuracy": 0.9616091656684875, "num_tokens": 1046917985.0, "step": 9850 }, { "entropy": 1.39015625, "epoch": 0.24239753195240193, "grad_norm": 1.8046875, "learning_rate": 5.9172776692712575e-06, "loss": 0.1965, "mean_token_accuracy": 0.9584881782531738, "num_tokens": 1052482737.0, "step": 9900 }, { "entropy": 1.38703125, "epoch": 0.2436217619117575, "grad_norm": 2.6875, "learning_rate": 5.915855455790381e-06, "loss": 0.1884, "mean_token_accuracy": 0.9608153140544892, "num_tokens": 1057868410.0, "step": 9950 }, { "entropy": 1.395, "epoch": 0.24484599187111308, "grad_norm": 2.8125, "learning_rate": 5.914421294360843e-06, "loss": 0.1904, "mean_token_accuracy": 0.9597806739807129, "num_tokens": 1063175179.0, "step": 10000 }, { "epoch": 0.24484599187111308, "eval_entropy": 1.359765625, "eval_loss": 0.20250044763088226, "eval_mean_token_accuracy": 0.9580152039726575, "eval_num_tokens": 1063175179.0, "eval_runtime": 600.0597, "eval_samples_per_second": 16.092, "eval_steps_per_second": 0.202, "step": 10000 }, { "entropy": 1.3840625, "epoch": 0.24607022183046864, "grad_norm": 2.28125, "learning_rate": 5.912975190859232e-06, "loss": 0.195, "mean_token_accuracy": 0.9596641564369202, "num_tokens": 1068741854.0, "step": 10050 }, { "entropy": 1.3790625, "epoch": 0.2472944517898242, "grad_norm": 2.484375, "learning_rate": 5.9115171512110714e-06, "loss": 0.1854, "mean_token_accuracy": 0.9604480576515197, "num_tokens": 1074116479.0, "step": 10100 }, { "entropy": 1.36453125, "epoch": 0.24851868174917977, "grad_norm": 2.171875, "learning_rate": 5.910047181390794e-06, "loss": 0.1697, "mean_token_accuracy": 0.9642793035507202, "num_tokens": 1079159902.0, "step": 10150 }, { "entropy": 1.373125, "epoch": 0.24974291170853533, "grad_norm": 1.9765625, "learning_rate": 5.908565287421718e-06, "loss": 0.1861, "mean_token_accuracy": 0.9611909198760986, "num_tokens": 1084521049.0, "step": 10200 }, { "entropy": 1.3578125, "epoch": 0.2509671416678909, "grad_norm": 2.65625, "learning_rate": 5.907071475376021e-06, "loss": 0.1787, "mean_token_accuracy": 0.9620854771137237, "num_tokens": 1089493722.0, "step": 10250 }, { "entropy": 1.36484375, "epoch": 0.2521913716272465, "grad_norm": 2.640625, "learning_rate": 5.905565751374717e-06, "loss": 0.1732, "mean_token_accuracy": 0.9639436435699463, "num_tokens": 1094338571.0, "step": 10300 }, { "entropy": 1.37234375, "epoch": 0.25341560158660204, "grad_norm": 2.5625, "learning_rate": 5.904048121587628e-06, "loss": 0.1772, "mean_token_accuracy": 0.9625762343406677, "num_tokens": 1099742354.0, "step": 10350 }, { "entropy": 1.38359375, "epoch": 0.2546398315459576, "grad_norm": 1.5078125, "learning_rate": 5.902518592233363e-06, "loss": 0.1987, "mean_token_accuracy": 0.9577878427505493, "num_tokens": 1105617487.0, "step": 10400 }, { "entropy": 1.3615625, "epoch": 0.25586406150531316, "grad_norm": 3.234375, "learning_rate": 5.9009771695792905e-06, "loss": 0.1811, "mean_token_accuracy": 0.9621189975738526, "num_tokens": 1110680544.0, "step": 10450 }, { "entropy": 1.37375, "epoch": 0.2570882914646687, "grad_norm": 2.140625, "learning_rate": 5.899423859941511e-06, "loss": 0.1882, "mean_token_accuracy": 0.9606586790084839, "num_tokens": 1116178837.0, "step": 10500 }, { "entropy": 1.37484375, "epoch": 0.2583125214240243, "grad_norm": 1.7578125, "learning_rate": 5.897858669684833e-06, "loss": 0.1893, "mean_token_accuracy": 0.9598471677303314, "num_tokens": 1121511467.0, "step": 10550 }, { "entropy": 1.3609375, "epoch": 0.25953675138337984, "grad_norm": 2.078125, "learning_rate": 5.896281605222749e-06, "loss": 0.1806, "mean_token_accuracy": 0.9624120283126831, "num_tokens": 1126507233.0, "step": 10600 }, { "entropy": 1.34734375, "epoch": 0.2607609813427354, "grad_norm": 2.28125, "learning_rate": 5.8946926730174045e-06, "loss": 0.1863, "mean_token_accuracy": 0.9608824181556702, "num_tokens": 1131912464.0, "step": 10650 }, { "entropy": 1.33921875, "epoch": 0.26198521130209096, "grad_norm": 2.5625, "learning_rate": 5.893091879579575e-06, "loss": 0.1856, "mean_token_accuracy": 0.9607326745986938, "num_tokens": 1136882208.0, "step": 10700 }, { "entropy": 1.343125, "epoch": 0.2632094412614466, "grad_norm": 1.9921875, "learning_rate": 5.89147923146864e-06, "loss": 0.1813, "mean_token_accuracy": 0.9620126748085022, "num_tokens": 1142095292.0, "step": 10750 }, { "entropy": 1.34765625, "epoch": 0.26443367122080214, "grad_norm": 3.234375, "learning_rate": 5.889854735292551e-06, "loss": 0.1841, "mean_token_accuracy": 0.9618128108978271, "num_tokens": 1147363920.0, "step": 10800 }, { "entropy": 1.356875, "epoch": 0.2656579011801577, "grad_norm": 2.46875, "learning_rate": 5.888218397707811e-06, "loss": 0.1742, "mean_token_accuracy": 0.9638459277153015, "num_tokens": 1152380705.0, "step": 10850 }, { "entropy": 1.32984375, "epoch": 0.26688213113951326, "grad_norm": 2.109375, "learning_rate": 5.886570225419441e-06, "loss": 0.1865, "mean_token_accuracy": 0.9608019030094147, "num_tokens": 1157839898.0, "step": 10900 }, { "entropy": 1.34609375, "epoch": 0.2681063610988688, "grad_norm": 3.453125, "learning_rate": 5.88491022518096e-06, "loss": 0.1918, "mean_token_accuracy": 0.9609634006023406, "num_tokens": 1163068506.0, "step": 10950 }, { "entropy": 1.32734375, "epoch": 0.2693305910582244, "grad_norm": 2.125, "learning_rate": 5.883238403794349e-06, "loss": 0.1758, "mean_token_accuracy": 0.9633646559715271, "num_tokens": 1168287852.0, "step": 11000 }, { "entropy": 1.34375, "epoch": 0.27055482101757994, "grad_norm": 2.296875, "learning_rate": 5.881554768110028e-06, "loss": 0.1914, "mean_token_accuracy": 0.9605349290370941, "num_tokens": 1173597061.0, "step": 11050 }, { "entropy": 1.3434375, "epoch": 0.2717790509769355, "grad_norm": 3.5, "learning_rate": 5.879859325026828e-06, "loss": 0.1864, "mean_token_accuracy": 0.9604840254783631, "num_tokens": 1178845621.0, "step": 11100 }, { "entropy": 1.35984375, "epoch": 0.27300328093629106, "grad_norm": 2.734375, "learning_rate": 5.878152081491963e-06, "loss": 0.1925, "mean_token_accuracy": 0.9589577269554138, "num_tokens": 1184054388.0, "step": 11150 }, { "entropy": 1.34875, "epoch": 0.2742275108956466, "grad_norm": 2.625, "learning_rate": 5.876433044500996e-06, "loss": 0.1921, "mean_token_accuracy": 0.9595346593856812, "num_tokens": 1189697396.0, "step": 11200 }, { "entropy": 1.34390625, "epoch": 0.2754517408550022, "grad_norm": 2.0, "learning_rate": 5.874702221097819e-06, "loss": 0.1882, "mean_token_accuracy": 0.960370112657547, "num_tokens": 1195166226.0, "step": 11250 }, { "entropy": 1.34515625, "epoch": 0.2766759708143578, "grad_norm": 2.734375, "learning_rate": 5.8729596183746175e-06, "loss": 0.1805, "mean_token_accuracy": 0.9621370649337768, "num_tokens": 1200392905.0, "step": 11300 }, { "entropy": 1.3428125, "epoch": 0.27790020077371336, "grad_norm": 3.078125, "learning_rate": 5.871205243471844e-06, "loss": 0.1841, "mean_token_accuracy": 0.9613085889816284, "num_tokens": 1205618541.0, "step": 11350 }, { "entropy": 1.35171875, "epoch": 0.2791244307330689, "grad_norm": 3.40625, "learning_rate": 5.869439103578189e-06, "loss": 0.1852, "mean_token_accuracy": 0.9616814315319061, "num_tokens": 1210836329.0, "step": 11400 }, { "entropy": 1.3453125, "epoch": 0.2803486606924245, "grad_norm": 1.8359375, "learning_rate": 5.867661205930549e-06, "loss": 0.1821, "mean_token_accuracy": 0.9620612812042236, "num_tokens": 1215867506.0, "step": 11450 }, { "entropy": 1.35875, "epoch": 0.28157289065178004, "grad_norm": 2.953125, "learning_rate": 5.865871557814003e-06, "loss": 0.1915, "mean_token_accuracy": 0.9604600322246551, "num_tokens": 1220793244.0, "step": 11500 }, { "entropy": 1.353125, "epoch": 0.2827971206111356, "grad_norm": 2.796875, "learning_rate": 5.864070166561775e-06, "loss": 0.1937, "mean_token_accuracy": 0.9599918603897095, "num_tokens": 1226305868.0, "step": 11550 }, { "entropy": 1.394375, "epoch": 0.28402135057049116, "grad_norm": 2.046875, "learning_rate": 5.862257039555207e-06, "loss": 0.1991, "mean_token_accuracy": 0.9583842658996582, "num_tokens": 1232013095.0, "step": 11600 }, { "entropy": 1.37578125, "epoch": 0.2852455805298467, "grad_norm": 2.015625, "learning_rate": 5.860432184223731e-06, "loss": 0.1913, "mean_token_accuracy": 0.9596893274784088, "num_tokens": 1237458606.0, "step": 11650 }, { "entropy": 1.35703125, "epoch": 0.2864698104892023, "grad_norm": 2.09375, "learning_rate": 5.858595608044837e-06, "loss": 0.1835, "mean_token_accuracy": 0.9611952984333039, "num_tokens": 1242972251.0, "step": 11700 }, { "entropy": 1.37078125, "epoch": 0.28769404044855784, "grad_norm": 3.1875, "learning_rate": 5.856747318544041e-06, "loss": 0.1865, "mean_token_accuracy": 0.9609648621082306, "num_tokens": 1248318638.0, "step": 11750 }, { "entropy": 1.365, "epoch": 0.2889182704079134, "grad_norm": 2.15625, "learning_rate": 5.854887323294856e-06, "loss": 0.183, "mean_token_accuracy": 0.9627510058879852, "num_tokens": 1253680002.0, "step": 11800 }, { "entropy": 1.37578125, "epoch": 0.29014250036726896, "grad_norm": 1.8828125, "learning_rate": 5.853015629918759e-06, "loss": 0.1862, "mean_token_accuracy": 0.9614068794250489, "num_tokens": 1258924764.0, "step": 11850 }, { "entropy": 1.37796875, "epoch": 0.2913667303266246, "grad_norm": 1.90625, "learning_rate": 5.8511322460851624e-06, "loss": 0.1832, "mean_token_accuracy": 0.9620686209201813, "num_tokens": 1264051390.0, "step": 11900 }, { "entropy": 1.37328125, "epoch": 0.29259096028598014, "grad_norm": 2.3125, "learning_rate": 5.849237179511381e-06, "loss": 0.1769, "mean_token_accuracy": 0.9628199970722199, "num_tokens": 1269148836.0, "step": 11950 }, { "entropy": 1.376875, "epoch": 0.2938151902453357, "grad_norm": 3.125, "learning_rate": 5.8473304379626e-06, "loss": 0.1871, "mean_token_accuracy": 0.9601672506332397, "num_tokens": 1274348582.0, "step": 12000 }, { "entropy": 1.35203125, "epoch": 0.29503942020469126, "grad_norm": 2.46875, "learning_rate": 5.845412029251843e-06, "loss": 0.1796, "mean_token_accuracy": 0.9622039210796356, "num_tokens": 1279184908.0, "step": 12050 }, { "entropy": 1.35859375, "epoch": 0.2962636501640468, "grad_norm": 2.921875, "learning_rate": 5.843481961239942e-06, "loss": 0.1772, "mean_token_accuracy": 0.9627481973171235, "num_tokens": 1284410532.0, "step": 12100 }, { "entropy": 1.35953125, "epoch": 0.2974878801234024, "grad_norm": 7.40625, "learning_rate": 5.841540241835504e-06, "loss": 0.1768, "mean_token_accuracy": 0.9626896047592163, "num_tokens": 1289768837.0, "step": 12150 }, { "entropy": 1.378125, "epoch": 0.29871211008275794, "grad_norm": 2.3125, "learning_rate": 5.8395868789948775e-06, "loss": 0.1848, "mean_token_accuracy": 0.9612694227695465, "num_tokens": 1295005247.0, "step": 12200 }, { "entropy": 1.37359375, "epoch": 0.2999363400421135, "grad_norm": 2.34375, "learning_rate": 5.837621880722122e-06, "loss": 0.1909, "mean_token_accuracy": 0.9603909432888031, "num_tokens": 1300316507.0, "step": 12250 }, { "entropy": 1.35953125, "epoch": 0.30116057000146906, "grad_norm": 2.75, "learning_rate": 5.835645255068973e-06, "loss": 0.1838, "mean_token_accuracy": 0.9617878496646881, "num_tokens": 1305931141.0, "step": 12300 }, { "entropy": 1.34640625, "epoch": 0.3023847999608246, "grad_norm": 2.375, "learning_rate": 5.8336570101348115e-06, "loss": 0.1651, "mean_token_accuracy": 0.9648260760307312, "num_tokens": 1310803906.0, "step": 12350 }, { "entropy": 1.358125, "epoch": 0.3036090299201802, "grad_norm": 2.84375, "learning_rate": 5.831657154066629e-06, "loss": 0.1827, "mean_token_accuracy": 0.9618698525428772, "num_tokens": 1315973080.0, "step": 12400 }, { "entropy": 1.35328125, "epoch": 0.30483325987953575, "grad_norm": 3.578125, "learning_rate": 5.829645695058992e-06, "loss": 0.1747, "mean_token_accuracy": 0.9627145206928254, "num_tokens": 1321381888.0, "step": 12450 }, { "entropy": 1.37859375, "epoch": 0.30605748983889136, "grad_norm": 2.609375, "learning_rate": 5.827622641354014e-06, "loss": 0.1787, "mean_token_accuracy": 0.9626282620429992, "num_tokens": 1326557068.0, "step": 12500 }, { "entropy": 1.3759375, "epoch": 0.3072817197982469, "grad_norm": 2.328125, "learning_rate": 5.825588001241318e-06, "loss": 0.1912, "mean_token_accuracy": 0.9598784649372101, "num_tokens": 1332216024.0, "step": 12550 }, { "entropy": 1.35890625, "epoch": 0.3085059497576025, "grad_norm": 1.8359375, "learning_rate": 5.823541783058005e-06, "loss": 0.174, "mean_token_accuracy": 0.962734831571579, "num_tokens": 1337390329.0, "step": 12600 }, { "entropy": 1.37375, "epoch": 0.30973017971695804, "grad_norm": 2.140625, "learning_rate": 5.821483995188612e-06, "loss": 0.1881, "mean_token_accuracy": 0.9605675613880158, "num_tokens": 1343045143.0, "step": 12650 }, { "entropy": 1.3415625, "epoch": 0.3109544096763136, "grad_norm": 2.28125, "learning_rate": 5.81941464606509e-06, "loss": 0.1666, "mean_token_accuracy": 0.9643463969230652, "num_tokens": 1348034262.0, "step": 12700 }, { "entropy": 1.3440625, "epoch": 0.31217863963566916, "grad_norm": 2.96875, "learning_rate": 5.817333744166762e-06, "loss": 0.1921, "mean_token_accuracy": 0.9586631393432617, "num_tokens": 1353723053.0, "step": 12750 }, { "entropy": 1.3721875, "epoch": 0.3134028695950247, "grad_norm": 2.203125, "learning_rate": 5.815241298020286e-06, "loss": 0.1846, "mean_token_accuracy": 0.9600662136077881, "num_tokens": 1358674728.0, "step": 12800 }, { "entropy": 1.365625, "epoch": 0.3146270995543803, "grad_norm": 2.6875, "learning_rate": 5.813137316199628e-06, "loss": 0.1835, "mean_token_accuracy": 0.961768034696579, "num_tokens": 1363933473.0, "step": 12850 }, { "entropy": 1.38015625, "epoch": 0.31585132951373585, "grad_norm": 2.5, "learning_rate": 5.811021807326018e-06, "loss": 0.1982, "mean_token_accuracy": 0.9590709102153778, "num_tokens": 1369281803.0, "step": 12900 }, { "entropy": 1.37, "epoch": 0.3170755594730914, "grad_norm": 3.03125, "learning_rate": 5.808894780067923e-06, "loss": 0.1949, "mean_token_accuracy": 0.9586555528640747, "num_tokens": 1374853145.0, "step": 12950 }, { "entropy": 1.36421875, "epoch": 0.31829978943244697, "grad_norm": 2.015625, "learning_rate": 5.8067562431410045e-06, "loss": 0.171, "mean_token_accuracy": 0.9631958258152008, "num_tokens": 1379934830.0, "step": 13000 }, { "entropy": 1.3609375, "epoch": 0.3195240193918026, "grad_norm": 2.09375, "learning_rate": 5.804606205308088e-06, "loss": 0.1841, "mean_token_accuracy": 0.9605684506893158, "num_tokens": 1385105704.0, "step": 13050 }, { "entropy": 1.37671875, "epoch": 0.32074824935115814, "grad_norm": 2.875, "learning_rate": 5.802444675379122e-06, "loss": 0.1947, "mean_token_accuracy": 0.9595759809017181, "num_tokens": 1390581041.0, "step": 13100 }, { "entropy": 1.37828125, "epoch": 0.3219724793105137, "grad_norm": 1.9453125, "learning_rate": 5.8002716622111485e-06, "loss": 0.1858, "mean_token_accuracy": 0.9617175209522247, "num_tokens": 1395850769.0, "step": 13150 }, { "entropy": 1.365, "epoch": 0.32319670926986926, "grad_norm": 2.515625, "learning_rate": 5.79808717470826e-06, "loss": 0.1676, "mean_token_accuracy": 0.9655633735656738, "num_tokens": 1400935540.0, "step": 13200 }, { "entropy": 1.3709375, "epoch": 0.3244209392292248, "grad_norm": 2.421875, "learning_rate": 5.795891221821569e-06, "loss": 0.1807, "mean_token_accuracy": 0.9624592447280884, "num_tokens": 1406376315.0, "step": 13250 }, { "entropy": 1.34875, "epoch": 0.3256451691885804, "grad_norm": 3.09375, "learning_rate": 5.793683812549162e-06, "loss": 0.1727, "mean_token_accuracy": 0.9637568819522858, "num_tokens": 1411533562.0, "step": 13300 }, { "entropy": 1.36421875, "epoch": 0.32686939914793595, "grad_norm": 2.703125, "learning_rate": 5.791464955936077e-06, "loss": 0.1938, "mean_token_accuracy": 0.9592576730251312, "num_tokens": 1417402528.0, "step": 13350 }, { "entropy": 1.36109375, "epoch": 0.3280936291072915, "grad_norm": 1.7109375, "learning_rate": 5.789234661074254e-06, "loss": 0.1744, "mean_token_accuracy": 0.9627709448337555, "num_tokens": 1422622878.0, "step": 13400 }, { "entropy": 1.3790625, "epoch": 0.32931785906664707, "grad_norm": 2.421875, "learning_rate": 5.786992937102503e-06, "loss": 0.1959, "mean_token_accuracy": 0.9586515820026398, "num_tokens": 1427838914.0, "step": 13450 }, { "entropy": 1.36, "epoch": 0.33054208902600263, "grad_norm": 3.140625, "learning_rate": 5.784739793206464e-06, "loss": 0.1794, "mean_token_accuracy": 0.9625478911399842, "num_tokens": 1432973891.0, "step": 13500 }, { "entropy": 1.37546875, "epoch": 0.3317663189853582, "grad_norm": 2.875, "learning_rate": 5.782475238618574e-06, "loss": 0.1952, "mean_token_accuracy": 0.958906524181366, "num_tokens": 1438425313.0, "step": 13550 }, { "entropy": 1.39109375, "epoch": 0.33299054894471375, "grad_norm": 3.15625, "learning_rate": 5.780199282618026e-06, "loss": 0.1937, "mean_token_accuracy": 0.9599265992641449, "num_tokens": 1443930223.0, "step": 13600 }, { "entropy": 1.3784375, "epoch": 0.33421477890406937, "grad_norm": 2.359375, "learning_rate": 5.777911934530726e-06, "loss": 0.1896, "mean_token_accuracy": 0.9606879663467407, "num_tokens": 1449235492.0, "step": 13650 }, { "entropy": 1.3740625, "epoch": 0.3354390088634249, "grad_norm": 2.25, "learning_rate": 5.7756132037292665e-06, "loss": 0.1845, "mean_token_accuracy": 0.9607800352573395, "num_tokens": 1454874971.0, "step": 13700 }, { "entropy": 1.3565625, "epoch": 0.3366632388227805, "grad_norm": 3.0, "learning_rate": 5.77330309963288e-06, "loss": 0.1664, "mean_token_accuracy": 0.9650224351882934, "num_tokens": 1459910564.0, "step": 13750 }, { "entropy": 1.3896875, "epoch": 0.33788746878213605, "grad_norm": 2.6875, "learning_rate": 5.7709816317074e-06, "loss": 0.1852, "mean_token_accuracy": 0.9610321772098541, "num_tokens": 1465214852.0, "step": 13800 }, { "entropy": 1.3609375, "epoch": 0.3391116987414916, "grad_norm": 2.421875, "learning_rate": 5.768648809465223e-06, "loss": 0.173, "mean_token_accuracy": 0.9646092760562897, "num_tokens": 1470405224.0, "step": 13850 }, { "entropy": 1.3671875, "epoch": 0.34033592870084717, "grad_norm": 2.421875, "learning_rate": 5.766304642465277e-06, "loss": 0.1684, "mean_token_accuracy": 0.964150664806366, "num_tokens": 1475222511.0, "step": 13900 }, { "entropy": 1.3615625, "epoch": 0.34156015866020273, "grad_norm": 2.015625, "learning_rate": 5.763949140312969e-06, "loss": 0.1903, "mean_token_accuracy": 0.9601925635337829, "num_tokens": 1480884593.0, "step": 13950 }, { "entropy": 1.35734375, "epoch": 0.3427843886195583, "grad_norm": 2.859375, "learning_rate": 5.7615823126601565e-06, "loss": 0.1853, "mean_token_accuracy": 0.9617584705352783, "num_tokens": 1485873672.0, "step": 14000 }, { "entropy": 1.37375, "epoch": 0.34400861857891385, "grad_norm": 2.3125, "learning_rate": 5.759204169205102e-06, "loss": 0.1862, "mean_token_accuracy": 0.9605587136745453, "num_tokens": 1490904541.0, "step": 14050 }, { "entropy": 1.36359375, "epoch": 0.3452328485382694, "grad_norm": 2.140625, "learning_rate": 5.7568147196924395e-06, "loss": 0.1891, "mean_token_accuracy": 0.9609455835819244, "num_tokens": 1496373059.0, "step": 14100 }, { "entropy": 1.35421875, "epoch": 0.34645707849762497, "grad_norm": 0.0322265625, "learning_rate": 5.754413973913126e-06, "loss": 0.1673, "mean_token_accuracy": 0.9642012619972229, "num_tokens": 1500901681.0, "step": 14150 }, { "entropy": 1.343125, "epoch": 0.3476813084569806, "grad_norm": 1.859375, "learning_rate": 5.752001941704407e-06, "loss": 0.1759, "mean_token_accuracy": 0.9625442051887512, "num_tokens": 1506040261.0, "step": 14200 }, { "entropy": 1.36625, "epoch": 0.34890553841633615, "grad_norm": 3.0, "learning_rate": 5.749578632949776e-06, "loss": 0.1802, "mean_token_accuracy": 0.9619328999519348, "num_tokens": 1511536121.0, "step": 14250 }, { "entropy": 1.356875, "epoch": 0.3501297683756917, "grad_norm": 3.703125, "learning_rate": 5.747144057578932e-06, "loss": 0.1843, "mean_token_accuracy": 0.9613735234737396, "num_tokens": 1516899260.0, "step": 14300 }, { "entropy": 1.36203125, "epoch": 0.35135399833504727, "grad_norm": 3.671875, "learning_rate": 5.744698225567742e-06, "loss": 0.1929, "mean_token_accuracy": 0.9596503937244415, "num_tokens": 1522277914.0, "step": 14350 }, { "entropy": 1.35921875, "epoch": 0.35257822829440283, "grad_norm": 2.15625, "learning_rate": 5.742241146938195e-06, "loss": 0.18, "mean_token_accuracy": 0.9617201662063599, "num_tokens": 1527559983.0, "step": 14400 }, { "entropy": 1.3353125, "epoch": 0.3538024582537584, "grad_norm": 3.03125, "learning_rate": 5.739772831758365e-06, "loss": 0.171, "mean_token_accuracy": 0.9635174345970153, "num_tokens": 1532501983.0, "step": 14450 }, { "entropy": 1.37234375, "epoch": 0.35502668821311395, "grad_norm": 1.84375, "learning_rate": 5.737293290142369e-06, "loss": 0.1957, "mean_token_accuracy": 0.9595348858833312, "num_tokens": 1538384868.0, "step": 14500 }, { "entropy": 1.36453125, "epoch": 0.3562509181724695, "grad_norm": 2.734375, "learning_rate": 5.734802532250327e-06, "loss": 0.1721, "mean_token_accuracy": 0.9636399447917938, "num_tokens": 1543550967.0, "step": 14550 }, { "entropy": 1.36703125, "epoch": 0.35747514813182507, "grad_norm": 2.390625, "learning_rate": 5.7323005682883144e-06, "loss": 0.1817, "mean_token_accuracy": 0.9614765977859497, "num_tokens": 1548814643.0, "step": 14600 }, { "entropy": 1.37171875, "epoch": 0.35869937809118063, "grad_norm": 2.140625, "learning_rate": 5.729787408508328e-06, "loss": 0.1854, "mean_token_accuracy": 0.9606961834430695, "num_tokens": 1554002337.0, "step": 14650 }, { "entropy": 1.363125, "epoch": 0.3599236080505362, "grad_norm": 2.359375, "learning_rate": 5.7272630632082385e-06, "loss": 0.1788, "mean_token_accuracy": 0.9617051208019256, "num_tokens": 1558888261.0, "step": 14700 }, { "entropy": 1.3603125, "epoch": 0.36114783800989175, "grad_norm": 1.9609375, "learning_rate": 5.7247275427317515e-06, "loss": 0.1882, "mean_token_accuracy": 0.9613351905345917, "num_tokens": 1564034699.0, "step": 14750 }, { "entropy": 1.38765625, "epoch": 0.36237206796924737, "grad_norm": 3.90625, "learning_rate": 5.722180857468361e-06, "loss": 0.2015, "mean_token_accuracy": 0.9581510519981384, "num_tokens": 1569662314.0, "step": 14800 }, { "entropy": 1.35671875, "epoch": 0.36359629792860293, "grad_norm": 1.875, "learning_rate": 5.719623017853315e-06, "loss": 0.1858, "mean_token_accuracy": 0.9616824269294739, "num_tokens": 1575167487.0, "step": 14850 }, { "entropy": 1.36796875, "epoch": 0.3648205278879585, "grad_norm": 2.921875, "learning_rate": 5.7170540343675596e-06, "loss": 0.1858, "mean_token_accuracy": 0.9607573926448822, "num_tokens": 1580657915.0, "step": 14900 }, { "entropy": 1.3684375, "epoch": 0.36604475784731405, "grad_norm": 2.578125, "learning_rate": 5.714473917537712e-06, "loss": 0.1771, "mean_token_accuracy": 0.9625304937362671, "num_tokens": 1585664001.0, "step": 14950 }, { "entropy": 1.36109375, "epoch": 0.3672689878066696, "grad_norm": 2.546875, "learning_rate": 5.711882677936003e-06, "loss": 0.1781, "mean_token_accuracy": 0.961945322751999, "num_tokens": 1590920113.0, "step": 15000 }, { "entropy": 1.3575, "epoch": 0.36849321776602517, "grad_norm": 2.3125, "learning_rate": 5.709280326180242e-06, "loss": 0.1737, "mean_token_accuracy": 0.9629940688610077, "num_tokens": 1596062396.0, "step": 15050 }, { "entropy": 1.37359375, "epoch": 0.36971744772538073, "grad_norm": 2.140625, "learning_rate": 5.7066668729337725e-06, "loss": 0.1782, "mean_token_accuracy": 0.9626081240177154, "num_tokens": 1601254217.0, "step": 15100 }, { "entropy": 1.36609375, "epoch": 0.3709416776847363, "grad_norm": 2.109375, "learning_rate": 5.704042328905426e-06, "loss": 0.1851, "mean_token_accuracy": 0.9608933937549591, "num_tokens": 1606561855.0, "step": 15150 }, { "entropy": 1.34859375, "epoch": 0.37216590764409185, "grad_norm": 1.8515625, "learning_rate": 5.701406704849479e-06, "loss": 0.1893, "mean_token_accuracy": 0.9602335524559021, "num_tokens": 1612223884.0, "step": 15200 }, { "entropy": 1.36765625, "epoch": 0.3733901376034474, "grad_norm": 2.703125, "learning_rate": 5.69876001156561e-06, "loss": 0.1837, "mean_token_accuracy": 0.9612676846981049, "num_tokens": 1617459423.0, "step": 15250 }, { "entropy": 1.366875, "epoch": 0.374614367562803, "grad_norm": 2.0625, "learning_rate": 5.696102259898855e-06, "loss": 0.1895, "mean_token_accuracy": 0.9605361771583557, "num_tokens": 1622772691.0, "step": 15300 }, { "entropy": 1.3678125, "epoch": 0.37583859752215854, "grad_norm": 2.21875, "learning_rate": 5.693433460739561e-06, "loss": 0.1794, "mean_token_accuracy": 0.9623438572883606, "num_tokens": 1627992421.0, "step": 15350 }, { "entropy": 1.385, "epoch": 0.37706282748151415, "grad_norm": 2.15625, "learning_rate": 5.690753625023344e-06, "loss": 0.1903, "mean_token_accuracy": 0.9602718544006348, "num_tokens": 1633295976.0, "step": 15400 }, { "entropy": 1.36546875, "epoch": 0.3782870574408697, "grad_norm": 2.078125, "learning_rate": 5.688062763731044e-06, "loss": 0.2002, "mean_token_accuracy": 0.9582274675369262, "num_tokens": 1638988248.0, "step": 15450 }, { "entropy": 1.35359375, "epoch": 0.3795112874002253, "grad_norm": 1.9921875, "learning_rate": 5.685360887888677e-06, "loss": 0.1789, "mean_token_accuracy": 0.9629680168628693, "num_tokens": 1644498341.0, "step": 15500 }, { "entropy": 1.369375, "epoch": 0.38073551735958083, "grad_norm": 2.65625, "learning_rate": 5.682648008567394e-06, "loss": 0.1758, "mean_token_accuracy": 0.9636906123161316, "num_tokens": 1649900901.0, "step": 15550 }, { "entropy": 1.36546875, "epoch": 0.3819597473189364, "grad_norm": 2.40625, "learning_rate": 5.679924136883432e-06, "loss": 0.1916, "mean_token_accuracy": 0.9601245021820068, "num_tokens": 1655743468.0, "step": 15600 }, { "entropy": 1.37828125, "epoch": 0.38318397727829195, "grad_norm": 2.578125, "learning_rate": 5.677189283998073e-06, "loss": 0.1755, "mean_token_accuracy": 0.963598461151123, "num_tokens": 1660916320.0, "step": 15650 }, { "entropy": 1.35796875, "epoch": 0.3844082072376475, "grad_norm": 2.265625, "learning_rate": 5.674443461117591e-06, "loss": 0.1778, "mean_token_accuracy": 0.9613646280765533, "num_tokens": 1666271922.0, "step": 15700 }, { "entropy": 1.3571875, "epoch": 0.3856324371970031, "grad_norm": 2.328125, "learning_rate": 5.671686679493215e-06, "loss": 0.187, "mean_token_accuracy": 0.9609103786945343, "num_tokens": 1671766527.0, "step": 15750 }, { "entropy": 1.36625, "epoch": 0.38685666715635864, "grad_norm": 1.6328125, "learning_rate": 5.668918950421074e-06, "loss": 0.1886, "mean_token_accuracy": 0.9606494891643524, "num_tokens": 1677165332.0, "step": 15800 }, { "entropy": 1.3475, "epoch": 0.3880808971157142, "grad_norm": 3.046875, "learning_rate": 5.666140285242158e-06, "loss": 0.1801, "mean_token_accuracy": 0.9625120401382447, "num_tokens": 1682494165.0, "step": 15850 }, { "entropy": 1.36125, "epoch": 0.38930512707506976, "grad_norm": 2.0625, "learning_rate": 5.663350695342268e-06, "loss": 0.1892, "mean_token_accuracy": 0.9604367816448212, "num_tokens": 1688253134.0, "step": 15900 }, { "entropy": 1.35328125, "epoch": 0.3905293570344254, "grad_norm": 1.6640625, "learning_rate": 5.660550192151967e-06, "loss": 0.1845, "mean_token_accuracy": 0.9621007204055786, "num_tokens": 1693632232.0, "step": 15950 }, { "entropy": 1.3690625, "epoch": 0.39175358699378093, "grad_norm": 1.8359375, "learning_rate": 5.657738787146543e-06, "loss": 0.1885, "mean_token_accuracy": 0.9610405099391938, "num_tokens": 1698678337.0, "step": 16000 }, { "entropy": 1.346875, "epoch": 0.3929778169531365, "grad_norm": 2.765625, "learning_rate": 5.654916491845947e-06, "loss": 0.1733, "mean_token_accuracy": 0.9640054357051849, "num_tokens": 1704187251.0, "step": 16050 }, { "entropy": 1.35375, "epoch": 0.39420204691249205, "grad_norm": 2.46875, "learning_rate": 5.652083317814759e-06, "loss": 0.1745, "mean_token_accuracy": 0.9634167146682739, "num_tokens": 1709408694.0, "step": 16100 }, { "entropy": 1.34265625, "epoch": 0.3954262768718476, "grad_norm": 2.8125, "learning_rate": 5.649239276662133e-06, "loss": 0.1724, "mean_token_accuracy": 0.963241057395935, "num_tokens": 1714585157.0, "step": 16150 }, { "entropy": 1.3303125, "epoch": 0.3966505068312032, "grad_norm": 2.578125, "learning_rate": 5.646384380041755e-06, "loss": 0.1759, "mean_token_accuracy": 0.9634040462970733, "num_tokens": 1719749974.0, "step": 16200 }, { "entropy": 1.33890625, "epoch": 0.39787473679055874, "grad_norm": 2.296875, "learning_rate": 5.643518639651789e-06, "loss": 0.1754, "mean_token_accuracy": 0.963290364742279, "num_tokens": 1724935979.0, "step": 16250 }, { "entropy": 1.341875, "epoch": 0.3990989667499143, "grad_norm": 3.828125, "learning_rate": 5.640642067234832e-06, "loss": 0.1869, "mean_token_accuracy": 0.9608835780620575, "num_tokens": 1729904911.0, "step": 16300 }, { "entropy": 1.3525, "epoch": 0.40032319670926986, "grad_norm": 3.015625, "learning_rate": 5.637754674577869e-06, "loss": 0.193, "mean_token_accuracy": 0.9592759358882904, "num_tokens": 1735603402.0, "step": 16350 }, { "entropy": 1.33984375, "epoch": 0.4015474266686254, "grad_norm": 2.671875, "learning_rate": 5.634856473512218e-06, "loss": 0.1787, "mean_token_accuracy": 0.9626182532310485, "num_tokens": 1740876722.0, "step": 16400 }, { "entropy": 1.3328125, "epoch": 0.402771656627981, "grad_norm": 2.421875, "learning_rate": 5.631947475913489e-06, "loss": 0.1951, "mean_token_accuracy": 0.9596171510219574, "num_tokens": 1746470991.0, "step": 16450 }, { "entropy": 1.31375, "epoch": 0.40399588658733654, "grad_norm": 2.734375, "learning_rate": 5.629027693701531e-06, "loss": 0.1646, "mean_token_accuracy": 0.9641488230228424, "num_tokens": 1751600795.0, "step": 16500 }, { "entropy": 1.3459375, "epoch": 0.40522011654669216, "grad_norm": 0.01904296875, "learning_rate": 5.626097138840379e-06, "loss": 0.1931, "mean_token_accuracy": 0.9586203134059906, "num_tokens": 1757280148.0, "step": 16550 }, { "entropy": 1.32203125, "epoch": 0.4064443465060477, "grad_norm": 3.125, "learning_rate": 5.623155823338219e-06, "loss": 0.1845, "mean_token_accuracy": 0.961804312467575, "num_tokens": 1762386072.0, "step": 16600 }, { "entropy": 1.309375, "epoch": 0.4076685764654033, "grad_norm": 1.9609375, "learning_rate": 5.62020375924732e-06, "loss": 0.1679, "mean_token_accuracy": 0.9640087175369263, "num_tokens": 1767593608.0, "step": 16650 }, { "entropy": 1.33890625, "epoch": 0.40889280642475884, "grad_norm": 2.296875, "learning_rate": 5.617240958664e-06, "loss": 0.1778, "mean_token_accuracy": 0.9619925379753113, "num_tokens": 1772859293.0, "step": 16700 }, { "entropy": 1.3303125, "epoch": 0.4101170363841144, "grad_norm": 2.453125, "learning_rate": 5.614267433728569e-06, "loss": 0.1784, "mean_token_accuracy": 0.9621168851852417, "num_tokens": 1778176957.0, "step": 16750 }, { "entropy": 1.33359375, "epoch": 0.41134126634346996, "grad_norm": 2.28125, "learning_rate": 5.611283196625281e-06, "loss": 0.1876, "mean_token_accuracy": 0.9608843457698822, "num_tokens": 1783513531.0, "step": 16800 }, { "entropy": 1.31875, "epoch": 0.4125654963028255, "grad_norm": 2.375, "learning_rate": 5.6082882595822835e-06, "loss": 0.1743, "mean_token_accuracy": 0.9634191727638245, "num_tokens": 1788649179.0, "step": 16850 }, { "entropy": 1.34703125, "epoch": 0.4137897262621811, "grad_norm": 3.0, "learning_rate": 5.605282634871569e-06, "loss": 0.1846, "mean_token_accuracy": 0.9604820072650909, "num_tokens": 1794020681.0, "step": 16900 }, { "entropy": 1.341875, "epoch": 0.41501395622153664, "grad_norm": 2.265625, "learning_rate": 5.602266334808922e-06, "loss": 0.1917, "mean_token_accuracy": 0.9598517632484436, "num_tokens": 1799786050.0, "step": 16950 }, { "entropy": 1.32484375, "epoch": 0.4162381861808922, "grad_norm": 2.421875, "learning_rate": 5.599239371753871e-06, "loss": 0.1843, "mean_token_accuracy": 0.9613809895515442, "num_tokens": 1805308121.0, "step": 17000 }, { "entropy": 1.3296875, "epoch": 0.41746241614024776, "grad_norm": 2.265625, "learning_rate": 5.596201758109636e-06, "loss": 0.1971, "mean_token_accuracy": 0.9585018038749695, "num_tokens": 1811016191.0, "step": 17050 }, { "entropy": 1.34390625, "epoch": 0.4186866460996034, "grad_norm": 2.65625, "learning_rate": 5.593153506323082e-06, "loss": 0.1912, "mean_token_accuracy": 0.9609514188766479, "num_tokens": 1816538866.0, "step": 17100 }, { "entropy": 1.319375, "epoch": 0.41991087605895894, "grad_norm": 2.578125, "learning_rate": 5.59009462888466e-06, "loss": 0.1692, "mean_token_accuracy": 0.9638219344615936, "num_tokens": 1821484676.0, "step": 17150 }, { "entropy": 1.3296875, "epoch": 0.4211351060183145, "grad_norm": 3.078125, "learning_rate": 5.587025138328363e-06, "loss": 0.1855, "mean_token_accuracy": 0.9604250502586364, "num_tokens": 1826760752.0, "step": 17200 }, { "entropy": 1.32703125, "epoch": 0.42235933597767006, "grad_norm": 4.375, "learning_rate": 5.583945047231672e-06, "loss": 0.1756, "mean_token_accuracy": 0.9626831936836243, "num_tokens": 1831709955.0, "step": 17250 }, { "entropy": 1.3278125, "epoch": 0.4235835659370256, "grad_norm": 3.578125, "learning_rate": 5.580854368215504e-06, "loss": 0.1688, "mean_token_accuracy": 0.9641677963733674, "num_tokens": 1836539757.0, "step": 17300 }, { "entropy": 1.35453125, "epoch": 0.4248077958963812, "grad_norm": 3.203125, "learning_rate": 5.577753113944161e-06, "loss": 0.1795, "mean_token_accuracy": 0.9620350849628448, "num_tokens": 1841748836.0, "step": 17350 }, { "entropy": 1.35484375, "epoch": 0.42603202585573674, "grad_norm": 3.046875, "learning_rate": 5.574641297125277e-06, "loss": 0.1903, "mean_token_accuracy": 0.9602237248420715, "num_tokens": 1846964872.0, "step": 17400 }, { "entropy": 1.3465625, "epoch": 0.4272562558150923, "grad_norm": 2.375, "learning_rate": 5.5715189305097705e-06, "loss": 0.18, "mean_token_accuracy": 0.9612255036830902, "num_tokens": 1852195890.0, "step": 17450 }, { "entropy": 1.34734375, "epoch": 0.42848048577444786, "grad_norm": 1.921875, "learning_rate": 5.568386026891784e-06, "loss": 0.1852, "mean_token_accuracy": 0.9614002680778504, "num_tokens": 1857781986.0, "step": 17500 }, { "entropy": 1.383125, "epoch": 0.4297047157338034, "grad_norm": 3.59375, "learning_rate": 5.565242599108638e-06, "loss": 0.1733, "mean_token_accuracy": 0.9632753264904023, "num_tokens": 1862697378.0, "step": 17550 }, { "entropy": 1.37734375, "epoch": 0.430928945693159, "grad_norm": 2.578125, "learning_rate": 5.5620886600407775e-06, "loss": 0.1793, "mean_token_accuracy": 0.9618914890289306, "num_tokens": 1867900164.0, "step": 17600 }, { "entropy": 1.37453125, "epoch": 0.43215317565251454, "grad_norm": 3.359375, "learning_rate": 5.558924222611718e-06, "loss": 0.189, "mean_token_accuracy": 0.9601231980323791, "num_tokens": 1873349723.0, "step": 17650 }, { "entropy": 1.3796875, "epoch": 0.43337740561187016, "grad_norm": 2.125, "learning_rate": 5.555749299787992e-06, "loss": 0.183, "mean_token_accuracy": 0.9612041318416595, "num_tokens": 1878516011.0, "step": 17700 }, { "entropy": 1.36796875, "epoch": 0.4346016355712257, "grad_norm": 1.703125, "learning_rate": 5.552563904579097e-06, "loss": 0.1666, "mean_token_accuracy": 0.965571962594986, "num_tokens": 1883672436.0, "step": 17750 }, { "entropy": 1.37421875, "epoch": 0.4358258655305813, "grad_norm": 2.140625, "learning_rate": 5.549368050037442e-06, "loss": 0.1822, "mean_token_accuracy": 0.9618594205379486, "num_tokens": 1889075709.0, "step": 17800 }, { "entropy": 1.3753125, "epoch": 0.43705009548993684, "grad_norm": 1.703125, "learning_rate": 5.5461617492582955e-06, "loss": 0.1847, "mean_token_accuracy": 0.9609970545768738, "num_tokens": 1894320611.0, "step": 17850 }, { "entropy": 1.35203125, "epoch": 0.4382743254492924, "grad_norm": 3.265625, "learning_rate": 5.542945015379727e-06, "loss": 0.1819, "mean_token_accuracy": 0.9610999655723572, "num_tokens": 1899502888.0, "step": 17900 }, { "entropy": 1.3653125, "epoch": 0.43949855540864796, "grad_norm": 3.125, "learning_rate": 5.53971786158256e-06, "loss": 0.1783, "mean_token_accuracy": 0.9628078281879425, "num_tokens": 1904727333.0, "step": 17950 }, { "entropy": 1.37265625, "epoch": 0.4407227853680035, "grad_norm": 2.15625, "learning_rate": 5.536480301090311e-06, "loss": 0.1825, "mean_token_accuracy": 0.9612684857845306, "num_tokens": 1910269964.0, "step": 18000 }, { "entropy": 1.36875, "epoch": 0.4419470153273591, "grad_norm": 2.421875, "learning_rate": 5.533232347169142e-06, "loss": 0.1769, "mean_token_accuracy": 0.9630991363525391, "num_tokens": 1915481678.0, "step": 18050 }, { "entropy": 1.37703125, "epoch": 0.44317124528671464, "grad_norm": 0.007720947265625, "learning_rate": 5.5299740131278e-06, "loss": 0.1776, "mean_token_accuracy": 0.9631426560878754, "num_tokens": 1920892313.0, "step": 18100 }, { "entropy": 1.3784375, "epoch": 0.4443954752460702, "grad_norm": 2.25, "learning_rate": 5.5267053123175685e-06, "loss": 0.1793, "mean_token_accuracy": 0.9618562459945679, "num_tokens": 1925855441.0, "step": 18150 }, { "entropy": 1.40484375, "epoch": 0.44561970520542576, "grad_norm": 2.390625, "learning_rate": 5.523426258132208e-06, "loss": 0.1895, "mean_token_accuracy": 0.9602830135822296, "num_tokens": 1931433927.0, "step": 18200 }, { "entropy": 1.381875, "epoch": 0.4468439351647813, "grad_norm": 2.140625, "learning_rate": 5.520136864007901e-06, "loss": 0.179, "mean_token_accuracy": 0.9617183935642243, "num_tokens": 1937093589.0, "step": 18250 }, { "entropy": 1.3784375, "epoch": 0.44806816512413694, "grad_norm": 2.890625, "learning_rate": 5.516837143423201e-06, "loss": 0.1807, "mean_token_accuracy": 0.9620720791816711, "num_tokens": 1942266157.0, "step": 18300 }, { "entropy": 1.3815625, "epoch": 0.4492923950834925, "grad_norm": 2.734375, "learning_rate": 5.5135271098989745e-06, "loss": 0.1739, "mean_token_accuracy": 0.9636857545375824, "num_tokens": 1947254229.0, "step": 18350 }, { "entropy": 1.39609375, "epoch": 0.45051662504284806, "grad_norm": 2.28125, "learning_rate": 5.510206776998347e-06, "loss": 0.2004, "mean_token_accuracy": 0.9576922535896302, "num_tokens": 1953541405.0, "step": 18400 }, { "entropy": 1.38515625, "epoch": 0.4517408550022036, "grad_norm": 1.8671875, "learning_rate": 5.5068761583266446e-06, "loss": 0.1815, "mean_token_accuracy": 0.9612382733821869, "num_tokens": 1958947967.0, "step": 18450 }, { "entropy": 1.38546875, "epoch": 0.4529650849615592, "grad_norm": 2.609375, "learning_rate": 5.503535267531341e-06, "loss": 0.1756, "mean_token_accuracy": 0.9630067098140717, "num_tokens": 1964172588.0, "step": 18500 }, { "entropy": 1.37171875, "epoch": 0.45418931492091474, "grad_norm": 2.453125, "learning_rate": 5.500184118302001e-06, "loss": 0.1737, "mean_token_accuracy": 0.9629046404361725, "num_tokens": 1969146021.0, "step": 18550 }, { "entropy": 1.35796875, "epoch": 0.4554135448802703, "grad_norm": 2.390625, "learning_rate": 5.496822724370225e-06, "loss": 0.1726, "mean_token_accuracy": 0.9641622114181518, "num_tokens": 1974171622.0, "step": 18600 }, { "entropy": 1.35109375, "epoch": 0.45663777483962587, "grad_norm": 1.9375, "learning_rate": 5.493451099509589e-06, "loss": 0.1797, "mean_token_accuracy": 0.9615970349311829, "num_tokens": 1979453512.0, "step": 18650 }, { "entropy": 1.3515625, "epoch": 0.4578620047989814, "grad_norm": 2.421875, "learning_rate": 5.490069257535595e-06, "loss": 0.1786, "mean_token_accuracy": 0.9625794899463653, "num_tokens": 1984570640.0, "step": 18700 }, { "entropy": 1.37140625, "epoch": 0.459086234758337, "grad_norm": 2.296875, "learning_rate": 5.4866772123056055e-06, "loss": 0.1928, "mean_token_accuracy": 0.9605653440952301, "num_tokens": 1990199710.0, "step": 18750 }, { "entropy": 1.375625, "epoch": 0.46031046471769255, "grad_norm": 2.09375, "learning_rate": 5.483274977718797e-06, "loss": 0.1885, "mean_token_accuracy": 0.9597025084495544, "num_tokens": 1995518980.0, "step": 18800 }, { "entropy": 1.37984375, "epoch": 0.46153469467704816, "grad_norm": 2.6875, "learning_rate": 5.479862567716095e-06, "loss": 0.1703, "mean_token_accuracy": 0.9633987152576446, "num_tokens": 2000479352.0, "step": 18850 }, { "entropy": 1.38640625, "epoch": 0.4627589246364037, "grad_norm": 3.671875, "learning_rate": 5.476439996280118e-06, "loss": 0.1941, "mean_token_accuracy": 0.959332902431488, "num_tokens": 2005933401.0, "step": 18900 }, { "entropy": 1.3975, "epoch": 0.4639831545957593, "grad_norm": 2.5625, "learning_rate": 5.473007277435125e-06, "loss": 0.1731, "mean_token_accuracy": 0.9638979506492614, "num_tokens": 2010666027.0, "step": 18950 }, { "entropy": 1.38140625, "epoch": 0.46520738455511484, "grad_norm": 3.640625, "learning_rate": 5.469564425246953e-06, "loss": 0.1852, "mean_token_accuracy": 0.9617711079120635, "num_tokens": 2016049085.0, "step": 19000 }, { "entropy": 1.37015625, "epoch": 0.4664316145144704, "grad_norm": 1.71875, "learning_rate": 5.46611145382296e-06, "loss": 0.1678, "mean_token_accuracy": 0.9642109513282776, "num_tokens": 2021148599.0, "step": 19050 }, { "entropy": 1.35875, "epoch": 0.46765584447382597, "grad_norm": 1.6875, "learning_rate": 5.462648377311973e-06, "loss": 0.1785, "mean_token_accuracy": 0.9610287690162659, "num_tokens": 2026306056.0, "step": 19100 }, { "entropy": 1.34953125, "epoch": 0.4688800744331815, "grad_norm": 2.78125, "learning_rate": 5.459175209904221e-06, "loss": 0.1769, "mean_token_accuracy": 0.9627043080329895, "num_tokens": 2031493225.0, "step": 19150 }, { "entropy": 1.34484375, "epoch": 0.4701043043925371, "grad_norm": 1.8671875, "learning_rate": 5.455691965831281e-06, "loss": 0.1758, "mean_token_accuracy": 0.9625547790527343, "num_tokens": 2036730518.0, "step": 19200 }, { "entropy": 1.3490625, "epoch": 0.47132853435189265, "grad_norm": 2.546875, "learning_rate": 5.452198659366023e-06, "loss": 0.167, "mean_token_accuracy": 0.9653509867191314, "num_tokens": 2041648821.0, "step": 19250 }, { "entropy": 1.33796875, "epoch": 0.4725527643112482, "grad_norm": 1.921875, "learning_rate": 5.448695304822545e-06, "loss": 0.1733, "mean_token_accuracy": 0.9637433886528015, "num_tokens": 2046695948.0, "step": 19300 }, { "entropy": 1.35109375, "epoch": 0.47377699427060377, "grad_norm": 3.15625, "learning_rate": 5.445181916556123e-06, "loss": 0.1712, "mean_token_accuracy": 0.96383709192276, "num_tokens": 2051915262.0, "step": 19350 }, { "entropy": 1.3453125, "epoch": 0.47500122422995933, "grad_norm": 2.578125, "learning_rate": 5.4416585089631414e-06, "loss": 0.163, "mean_token_accuracy": 0.9646891450881958, "num_tokens": 2056999566.0, "step": 19400 }, { "entropy": 1.36125, "epoch": 0.47622545418931495, "grad_norm": 2.875, "learning_rate": 5.438125096481043e-06, "loss": 0.1833, "mean_token_accuracy": 0.96080885887146, "num_tokens": 2062335975.0, "step": 19450 }, { "entropy": 1.368125, "epoch": 0.4774496841486705, "grad_norm": 3.140625, "learning_rate": 5.434581693588263e-06, "loss": 0.175, "mean_token_accuracy": 0.9632956290245056, "num_tokens": 2067247038.0, "step": 19500 }, { "entropy": 1.36484375, "epoch": 0.47867391410802607, "grad_norm": 2.59375, "learning_rate": 5.4310283148041775e-06, "loss": 0.185, "mean_token_accuracy": 0.9606440508365631, "num_tokens": 2072775995.0, "step": 19550 }, { "entropy": 1.36171875, "epoch": 0.4798981440673816, "grad_norm": 2.265625, "learning_rate": 5.427464974689038e-06, "loss": 0.1772, "mean_token_accuracy": 0.963237328529358, "num_tokens": 2078139054.0, "step": 19600 }, { "entropy": 1.35703125, "epoch": 0.4811223740267372, "grad_norm": 2.90625, "learning_rate": 5.42389168784391e-06, "loss": 0.1726, "mean_token_accuracy": 0.9635715174674988, "num_tokens": 2083527202.0, "step": 19650 }, { "entropy": 1.37875, "epoch": 0.48234660398609275, "grad_norm": 3.3125, "learning_rate": 5.4203084689106225e-06, "loss": 0.1927, "mean_token_accuracy": 0.9599621570110322, "num_tokens": 2089385771.0, "step": 19700 }, { "entropy": 1.34265625, "epoch": 0.4835708339454483, "grad_norm": 2.296875, "learning_rate": 5.4167153325716976e-06, "loss": 0.1663, "mean_token_accuracy": 0.9641843712329865, "num_tokens": 2094456460.0, "step": 19750 }, { "entropy": 1.3609375, "epoch": 0.48479506390480387, "grad_norm": 3.734375, "learning_rate": 5.413112293550296e-06, "loss": 0.181, "mean_token_accuracy": 0.9612398469448089, "num_tokens": 2099504284.0, "step": 19800 }, { "entropy": 1.3709375, "epoch": 0.48601929386415943, "grad_norm": 2.53125, "learning_rate": 5.409499366610154e-06, "loss": 0.1699, "mean_token_accuracy": 0.9642571318149566, "num_tokens": 2104524371.0, "step": 19850 }, { "entropy": 1.378125, "epoch": 0.487243523823515, "grad_norm": 5.53125, "learning_rate": 5.405876566555529e-06, "loss": 0.181, "mean_token_accuracy": 0.9618199968338013, "num_tokens": 2109740174.0, "step": 19900 }, { "entropy": 1.40078125, "epoch": 0.48846775378287055, "grad_norm": 2.0, "learning_rate": 5.402243908231129e-06, "loss": 0.1804, "mean_token_accuracy": 0.962717422246933, "num_tokens": 2115362415.0, "step": 19950 }, { "entropy": 1.37703125, "epoch": 0.48969198374222617, "grad_norm": 3.40625, "learning_rate": 5.398601406522059e-06, "loss": 0.19, "mean_token_accuracy": 0.9599020183086395, "num_tokens": 2121188022.0, "step": 20000 }, { "epoch": 0.48969198374222617, "eval_entropy": 1.366015625, "eval_loss": 0.1947789192199707, "eval_mean_token_accuracy": 0.9590674425164859, "eval_num_tokens": 2121188022.0, "eval_runtime": 605.3557, "eval_samples_per_second": 15.951, "eval_steps_per_second": 0.2, "step": 20000 }, { "entropy": 1.36578125, "epoch": 0.4909162137015817, "grad_norm": 2.71875, "learning_rate": 5.3949490763537594e-06, "loss": 0.1838, "mean_token_accuracy": 0.9606946921348571, "num_tokens": 2126472622.0, "step": 20050 }, { "entropy": 1.36359375, "epoch": 0.4921404436609373, "grad_norm": 2.21875, "learning_rate": 5.391286932691941e-06, "loss": 0.1717, "mean_token_accuracy": 0.963376579284668, "num_tokens": 2131377659.0, "step": 20100 }, { "entropy": 1.37875, "epoch": 0.49336467362029285, "grad_norm": 2.46875, "learning_rate": 5.38761499054253e-06, "loss": 0.1855, "mean_token_accuracy": 0.9612623798847199, "num_tokens": 2136546167.0, "step": 20150 }, { "entropy": 1.37296875, "epoch": 0.4945889035796484, "grad_norm": 4.40625, "learning_rate": 5.383933264951596e-06, "loss": 0.1826, "mean_token_accuracy": 0.9621403360366821, "num_tokens": 2141814792.0, "step": 20200 }, { "entropy": 1.37328125, "epoch": 0.49581313353900397, "grad_norm": 2.40625, "learning_rate": 5.3802417710053056e-06, "loss": 0.1804, "mean_token_accuracy": 0.9616746437549591, "num_tokens": 2147071830.0, "step": 20250 }, { "entropy": 1.38625, "epoch": 0.49703736349835953, "grad_norm": 3.375, "learning_rate": 5.376540523829846e-06, "loss": 0.1782, "mean_token_accuracy": 0.9625440466403962, "num_tokens": 2152428456.0, "step": 20300 }, { "entropy": 1.3896875, "epoch": 0.4982615934577151, "grad_norm": 2.203125, "learning_rate": 5.372829538591368e-06, "loss": 0.1876, "mean_token_accuracy": 0.9597011947631836, "num_tokens": 2157932348.0, "step": 20350 }, { "entropy": 1.38671875, "epoch": 0.49948582341707065, "grad_norm": 2.78125, "learning_rate": 5.369108830495932e-06, "loss": 0.1791, "mean_token_accuracy": 0.9618503451347351, "num_tokens": 2163273400.0, "step": 20400 }, { "entropy": 1.39640625, "epoch": 0.5007100533764263, "grad_norm": 2.1875, "learning_rate": 5.365378414789431e-06, "loss": 0.1744, "mean_token_accuracy": 0.9630714511871338, "num_tokens": 2168498693.0, "step": 20450 }, { "entropy": 1.38453125, "epoch": 0.5019342833357818, "grad_norm": 5.0625, "learning_rate": 5.361638306757539e-06, "loss": 0.1757, "mean_token_accuracy": 0.963210039138794, "num_tokens": 2173679268.0, "step": 20500 }, { "entropy": 1.40171875, "epoch": 0.5031585132951374, "grad_norm": 2.46875, "learning_rate": 5.357888521725646e-06, "loss": 0.1827, "mean_token_accuracy": 0.9613598906993865, "num_tokens": 2178826743.0, "step": 20550 }, { "entropy": 1.3775, "epoch": 0.504382743254493, "grad_norm": 2.546875, "learning_rate": 5.354129075058793e-06, "loss": 0.1786, "mean_token_accuracy": 0.9626466917991638, "num_tokens": 2184130873.0, "step": 20600 }, { "entropy": 1.35796875, "epoch": 0.5056069732138485, "grad_norm": 1.546875, "learning_rate": 5.35035998216161e-06, "loss": 0.1699, "mean_token_accuracy": 0.9637439405918121, "num_tokens": 2189388837.0, "step": 20650 }, { "entropy": 1.38328125, "epoch": 0.5068312031732041, "grad_norm": 1.703125, "learning_rate": 5.3465812584782545e-06, "loss": 0.1964, "mean_token_accuracy": 0.9594271278381348, "num_tokens": 2195050047.0, "step": 20700 }, { "entropy": 1.34203125, "epoch": 0.5080554331325596, "grad_norm": 2.3125, "learning_rate": 5.342792919492344e-06, "loss": 0.1749, "mean_token_accuracy": 0.9626959478855133, "num_tokens": 2200302347.0, "step": 20750 }, { "entropy": 1.356875, "epoch": 0.5092796630919152, "grad_norm": 2.09375, "learning_rate": 5.338994980726901e-06, "loss": 0.1794, "mean_token_accuracy": 0.9620554232597351, "num_tokens": 2205512738.0, "step": 20800 }, { "entropy": 1.3575, "epoch": 0.5105038930512708, "grad_norm": 2.78125, "learning_rate": 5.335187457744277e-06, "loss": 0.1823, "mean_token_accuracy": 0.9618464136123657, "num_tokens": 2210651777.0, "step": 20850 }, { "entropy": 1.33390625, "epoch": 0.5117281230106263, "grad_norm": 1.6875, "learning_rate": 5.3313703661461e-06, "loss": 0.1819, "mean_token_accuracy": 0.9613965570926666, "num_tokens": 2215880518.0, "step": 20900 }, { "entropy": 1.3253125, "epoch": 0.5129523529699819, "grad_norm": 2.984375, "learning_rate": 5.327543721573206e-06, "loss": 0.1752, "mean_token_accuracy": 0.9638756012916565, "num_tokens": 2221245311.0, "step": 20950 }, { "entropy": 1.32234375, "epoch": 0.5141765829293374, "grad_norm": 3.28125, "learning_rate": 5.323707539705574e-06, "loss": 0.1748, "mean_token_accuracy": 0.963612312078476, "num_tokens": 2226359631.0, "step": 21000 }, { "entropy": 1.30609375, "epoch": 0.515400812888693, "grad_norm": 2.15625, "learning_rate": 5.3198618362622614e-06, "loss": 0.1702, "mean_token_accuracy": 0.9639462912082672, "num_tokens": 2231563334.0, "step": 21050 }, { "entropy": 1.31953125, "epoch": 0.5166250428480486, "grad_norm": 3.265625, "learning_rate": 5.316006627001344e-06, "loss": 0.1805, "mean_token_accuracy": 0.961728732585907, "num_tokens": 2236847732.0, "step": 21100 }, { "entropy": 1.32125, "epoch": 0.5178492728074041, "grad_norm": 2.375, "learning_rate": 5.312141927719849e-06, "loss": 0.172, "mean_token_accuracy": 0.9636801743507385, "num_tokens": 2242148614.0, "step": 21150 }, { "entropy": 1.3134375, "epoch": 0.5190735027667597, "grad_norm": 2.546875, "learning_rate": 5.308267754253684e-06, "loss": 0.1755, "mean_token_accuracy": 0.9632048571109771, "num_tokens": 2247694541.0, "step": 21200 }, { "entropy": 1.36203125, "epoch": 0.5202977327261152, "grad_norm": 1.8359375, "learning_rate": 5.304384122477584e-06, "loss": 0.1983, "mean_token_accuracy": 0.9583926129341126, "num_tokens": 2253386473.0, "step": 21250 }, { "entropy": 1.34703125, "epoch": 0.5215219626854708, "grad_norm": 2.140625, "learning_rate": 5.300491048305037e-06, "loss": 0.1753, "mean_token_accuracy": 0.9633457577228546, "num_tokens": 2258591416.0, "step": 21300 }, { "entropy": 1.3553125, "epoch": 0.5227461926448264, "grad_norm": 3.140625, "learning_rate": 5.296588547688221e-06, "loss": 0.1809, "mean_token_accuracy": 0.9621423208713531, "num_tokens": 2263908714.0, "step": 21350 }, { "entropy": 1.35140625, "epoch": 0.5239704226041819, "grad_norm": 2.5, "learning_rate": 5.292676636617946e-06, "loss": 0.1746, "mean_token_accuracy": 0.9637291979789734, "num_tokens": 2269014561.0, "step": 21400 }, { "entropy": 1.3440625, "epoch": 0.5251946525635376, "grad_norm": 2.5625, "learning_rate": 5.2887553311235736e-06, "loss": 0.1753, "mean_token_accuracy": 0.963253127336502, "num_tokens": 2274143387.0, "step": 21450 }, { "entropy": 1.34984375, "epoch": 0.5264188825228932, "grad_norm": 1.8203125, "learning_rate": 5.284824647272965e-06, "loss": 0.1751, "mean_token_accuracy": 0.9633476626873017, "num_tokens": 2279551937.0, "step": 21500 }, { "entropy": 1.3815625, "epoch": 0.5276431124822487, "grad_norm": 1.765625, "learning_rate": 5.280884601172408e-06, "loss": 0.1901, "mean_token_accuracy": 0.9609255039691925, "num_tokens": 2284998091.0, "step": 21550 }, { "entropy": 1.37375, "epoch": 0.5288673424416043, "grad_norm": 2.078125, "learning_rate": 5.276935208966554e-06, "loss": 0.1805, "mean_token_accuracy": 0.9621355581283569, "num_tokens": 2290404419.0, "step": 21600 }, { "entropy": 1.35875, "epoch": 0.5300915724009598, "grad_norm": 2.546875, "learning_rate": 5.272976486838349e-06, "loss": 0.1839, "mean_token_accuracy": 0.9618707728385926, "num_tokens": 2295855308.0, "step": 21650 }, { "entropy": 1.34296875, "epoch": 0.5313158023603154, "grad_norm": 3.84375, "learning_rate": 5.269008451008974e-06, "loss": 0.1683, "mean_token_accuracy": 0.9649140095710754, "num_tokens": 2300888682.0, "step": 21700 }, { "entropy": 1.3709375, "epoch": 0.532540032319671, "grad_norm": 2.046875, "learning_rate": 5.265031117737765e-06, "loss": 0.1856, "mean_token_accuracy": 0.9606757354736328, "num_tokens": 2306530067.0, "step": 21750 }, { "entropy": 1.3528125, "epoch": 0.5337642622790265, "grad_norm": 2.984375, "learning_rate": 5.261044503322165e-06, "loss": 0.1826, "mean_token_accuracy": 0.9615514528751373, "num_tokens": 2312022301.0, "step": 21800 }, { "entropy": 1.35828125, "epoch": 0.5349884922383821, "grad_norm": 2.5, "learning_rate": 5.257048624097639e-06, "loss": 0.1826, "mean_token_accuracy": 0.9617948019504547, "num_tokens": 2317336429.0, "step": 21850 }, { "entropy": 1.365625, "epoch": 0.5362127221977376, "grad_norm": 3.25, "learning_rate": 5.253043496437619e-06, "loss": 0.1875, "mean_token_accuracy": 0.9604008531570435, "num_tokens": 2322605855.0, "step": 21900 }, { "entropy": 1.3403125, "epoch": 0.5374369521570932, "grad_norm": 1.1171875, "learning_rate": 5.249029136753436e-06, "loss": 0.1757, "mean_token_accuracy": 0.9632094752788544, "num_tokens": 2328163176.0, "step": 21950 }, { "entropy": 1.3684375, "epoch": 0.5386611821164488, "grad_norm": 2.484375, "learning_rate": 5.245005561494242e-06, "loss": 0.1804, "mean_token_accuracy": 0.9627390444278717, "num_tokens": 2333245056.0, "step": 22000 }, { "entropy": 1.384375, "epoch": 0.5398854120758043, "grad_norm": 2.859375, "learning_rate": 5.2409727871469585e-06, "loss": 0.1926, "mean_token_accuracy": 0.9592073571681976, "num_tokens": 2338758359.0, "step": 22050 }, { "entropy": 1.35546875, "epoch": 0.5411096420351599, "grad_norm": 2.90625, "learning_rate": 5.236930830236195e-06, "loss": 0.179, "mean_token_accuracy": 0.9627534210681915, "num_tokens": 2344276248.0, "step": 22100 }, { "entropy": 1.34953125, "epoch": 0.5423338719945154, "grad_norm": 2.078125, "learning_rate": 5.232879707324194e-06, "loss": 0.1634, "mean_token_accuracy": 0.965645101070404, "num_tokens": 2349615408.0, "step": 22150 }, { "entropy": 1.37578125, "epoch": 0.543558101953871, "grad_norm": 2.34375, "learning_rate": 5.228819435010749e-06, "loss": 0.1678, "mean_token_accuracy": 0.9645935368537902, "num_tokens": 2354669027.0, "step": 22200 }, { "entropy": 1.3884375, "epoch": 0.5447823319132266, "grad_norm": 3.109375, "learning_rate": 5.224750029933149e-06, "loss": 0.1811, "mean_token_accuracy": 0.9621996486186981, "num_tokens": 2359585884.0, "step": 22250 }, { "entropy": 1.38390625, "epoch": 0.5460065618725821, "grad_norm": 2.375, "learning_rate": 5.220671508766104e-06, "loss": 0.1716, "mean_token_accuracy": 0.9631420743465423, "num_tokens": 2364818902.0, "step": 22300 }, { "entropy": 1.40234375, "epoch": 0.5472307918319377, "grad_norm": 2.03125, "learning_rate": 5.216583888221676e-06, "loss": 0.1888, "mean_token_accuracy": 0.9602623808383942, "num_tokens": 2370249320.0, "step": 22350 }, { "entropy": 1.3871875, "epoch": 0.5484550217912932, "grad_norm": 2.078125, "learning_rate": 5.212487185049215e-06, "loss": 0.1656, "mean_token_accuracy": 0.9649445843696595, "num_tokens": 2375353386.0, "step": 22400 }, { "entropy": 1.415625, "epoch": 0.5496792517506488, "grad_norm": 2.09375, "learning_rate": 5.208381416035286e-06, "loss": 0.1863, "mean_token_accuracy": 0.9609400224685669, "num_tokens": 2380836963.0, "step": 22450 }, { "entropy": 1.395, "epoch": 0.5509034817100044, "grad_norm": 0.00396728515625, "learning_rate": 5.204266598003604e-06, "loss": 0.1759, "mean_token_accuracy": 0.9629833257198334, "num_tokens": 2385836401.0, "step": 22500 }, { "entropy": 1.39046875, "epoch": 0.5521277116693599, "grad_norm": 3.671875, "learning_rate": 5.20014274781496e-06, "loss": 0.176, "mean_token_accuracy": 0.9624341118335724, "num_tokens": 2391023729.0, "step": 22550 }, { "entropy": 1.410625, "epoch": 0.5533519416287156, "grad_norm": 2.59375, "learning_rate": 5.196009882367158e-06, "loss": 0.175, "mean_token_accuracy": 0.9633600628376007, "num_tokens": 2396091073.0, "step": 22600 }, { "entropy": 1.40546875, "epoch": 0.5545761715880712, "grad_norm": 1.640625, "learning_rate": 5.191868018594941e-06, "loss": 0.1828, "mean_token_accuracy": 0.9620015740394592, "num_tokens": 2401188218.0, "step": 22650 }, { "entropy": 1.4009375, "epoch": 0.5558004015474267, "grad_norm": 3.328125, "learning_rate": 5.187717173469924e-06, "loss": 0.1711, "mean_token_accuracy": 0.9637360453605652, "num_tokens": 2406245988.0, "step": 22700 }, { "entropy": 1.39234375, "epoch": 0.5570246315067823, "grad_norm": 2.0625, "learning_rate": 5.183557364000523e-06, "loss": 0.1737, "mean_token_accuracy": 0.9634659576416016, "num_tokens": 2411368109.0, "step": 22750 }, { "entropy": 1.40296875, "epoch": 0.5582488614661378, "grad_norm": 2.265625, "learning_rate": 5.179388607231889e-06, "loss": 0.1728, "mean_token_accuracy": 0.9633192873001098, "num_tokens": 2416689928.0, "step": 22800 }, { "entropy": 1.410625, "epoch": 0.5594730914254934, "grad_norm": 2.4375, "learning_rate": 5.17521092024583e-06, "loss": 0.1867, "mean_token_accuracy": 0.9608077311515808, "num_tokens": 2422352742.0, "step": 22850 }, { "entropy": 1.39109375, "epoch": 0.560697321384849, "grad_norm": 0.08642578125, "learning_rate": 5.171024320160752e-06, "loss": 0.1667, "mean_token_accuracy": 0.9654168891906738, "num_tokens": 2427576584.0, "step": 22900 }, { "entropy": 1.38734375, "epoch": 0.5619215513442045, "grad_norm": 2.75, "learning_rate": 5.166828824131578e-06, "loss": 0.1696, "mean_token_accuracy": 0.9640141320228577, "num_tokens": 2432765937.0, "step": 22950 }, { "entropy": 1.3884375, "epoch": 0.5631457813035601, "grad_norm": 2.75, "learning_rate": 5.162624449349686e-06, "loss": 0.1801, "mean_token_accuracy": 0.9613782787322998, "num_tokens": 2437980184.0, "step": 23000 }, { "entropy": 1.3728125, "epoch": 0.5643700112629156, "grad_norm": 2.953125, "learning_rate": 5.158411213042835e-06, "loss": 0.1675, "mean_token_accuracy": 0.9656554198265076, "num_tokens": 2443001633.0, "step": 23050 }, { "entropy": 1.39265625, "epoch": 0.5655942412222712, "grad_norm": 2.140625, "learning_rate": 5.154189132475095e-06, "loss": 0.1826, "mean_token_accuracy": 0.9614216196537018, "num_tokens": 2448599009.0, "step": 23100 }, { "entropy": 1.3725, "epoch": 0.5668184711816268, "grad_norm": 3.34375, "learning_rate": 5.149958224946776e-06, "loss": 0.1871, "mean_token_accuracy": 0.9604478991031646, "num_tokens": 2454134698.0, "step": 23150 }, { "entropy": 1.3503125, "epoch": 0.5680427011409823, "grad_norm": 3.140625, "learning_rate": 5.145718507794354e-06, "loss": 0.1725, "mean_token_accuracy": 0.9635867273807526, "num_tokens": 2459430485.0, "step": 23200 }, { "entropy": 1.3696875, "epoch": 0.5692669311003379, "grad_norm": 2.0, "learning_rate": 5.141469998390408e-06, "loss": 0.1778, "mean_token_accuracy": 0.9624897265434265, "num_tokens": 2464814573.0, "step": 23250 }, { "entropy": 1.34359375, "epoch": 0.5704911610596934, "grad_norm": 3.109375, "learning_rate": 5.1372127141435415e-06, "loss": 0.1866, "mean_token_accuracy": 0.961111787557602, "num_tokens": 2470288053.0, "step": 23300 }, { "entropy": 1.36140625, "epoch": 0.571715391019049, "grad_norm": 2.609375, "learning_rate": 5.132946672498313e-06, "loss": 0.1847, "mean_token_accuracy": 0.9609505522251129, "num_tokens": 2475912972.0, "step": 23350 }, { "entropy": 1.3640625, "epoch": 0.5729396209784046, "grad_norm": 2.015625, "learning_rate": 5.128671890935168e-06, "loss": 0.1868, "mean_token_accuracy": 0.9606727063655853, "num_tokens": 2481260397.0, "step": 23400 }, { "entropy": 1.36171875, "epoch": 0.5741638509377601, "grad_norm": 3.0625, "learning_rate": 5.12438838697036e-06, "loss": 0.1667, "mean_token_accuracy": 0.9649614369869233, "num_tokens": 2486480334.0, "step": 23450 }, { "entropy": 1.34078125, "epoch": 0.5753880808971157, "grad_norm": 2.453125, "learning_rate": 5.120096178155887e-06, "loss": 0.1739, "mean_token_accuracy": 0.9637984907627106, "num_tokens": 2491784273.0, "step": 23500 }, { "entropy": 1.37375, "epoch": 0.5766123108564712, "grad_norm": 2.796875, "learning_rate": 5.115795282079414e-06, "loss": 0.1825, "mean_token_accuracy": 0.9622078704833984, "num_tokens": 2496936761.0, "step": 23550 }, { "entropy": 1.37890625, "epoch": 0.5778365408158268, "grad_norm": 2.578125, "learning_rate": 5.111485716364204e-06, "loss": 0.1713, "mean_token_accuracy": 0.9633621573448181, "num_tokens": 2502372671.0, "step": 23600 }, { "entropy": 1.37671875, "epoch": 0.5790607707751824, "grad_norm": 2.34375, "learning_rate": 5.107167498669044e-06, "loss": 0.1888, "mean_token_accuracy": 0.9600040495395661, "num_tokens": 2508248084.0, "step": 23650 }, { "entropy": 1.3646875, "epoch": 0.5802850007345379, "grad_norm": 3.296875, "learning_rate": 5.102840646688173e-06, "loss": 0.1778, "mean_token_accuracy": 0.9631288397312164, "num_tokens": 2513722383.0, "step": 23700 }, { "entropy": 1.3534375, "epoch": 0.5815092306938935, "grad_norm": 1.7890625, "learning_rate": 5.0985051781512076e-06, "loss": 0.1853, "mean_token_accuracy": 0.9618443667888641, "num_tokens": 2518947610.0, "step": 23750 }, { "entropy": 1.34390625, "epoch": 0.5827334606532492, "grad_norm": 2.65625, "learning_rate": 5.094161110823076e-06, "loss": 0.178, "mean_token_accuracy": 0.963310706615448, "num_tokens": 2524269424.0, "step": 23800 }, { "entropy": 1.35328125, "epoch": 0.5839576906126047, "grad_norm": 2.59375, "learning_rate": 5.089808462503938e-06, "loss": 0.1839, "mean_token_accuracy": 0.9614792597293854, "num_tokens": 2529803600.0, "step": 23850 }, { "entropy": 1.3525, "epoch": 0.5851819205719603, "grad_norm": 3.046875, "learning_rate": 5.085447251029113e-06, "loss": 0.1721, "mean_token_accuracy": 0.963988184928894, "num_tokens": 2534916174.0, "step": 23900 }, { "entropy": 1.35859375, "epoch": 0.5864061505313158, "grad_norm": 2.140625, "learning_rate": 5.081077494269013e-06, "loss": 0.1857, "mean_token_accuracy": 0.9612233006954193, "num_tokens": 2540205630.0, "step": 23950 }, { "entropy": 1.35015625, "epoch": 0.5876303804906714, "grad_norm": 2.125, "learning_rate": 5.076699210129059e-06, "loss": 0.1741, "mean_token_accuracy": 0.9633960282802582, "num_tokens": 2545114709.0, "step": 24000 }, { "entropy": 1.346875, "epoch": 0.588854610450027, "grad_norm": 2.265625, "learning_rate": 5.072312416549619e-06, "loss": 0.171, "mean_token_accuracy": 0.9637422835826874, "num_tokens": 2550645548.0, "step": 24050 }, { "entropy": 1.35140625, "epoch": 0.5900788404093825, "grad_norm": 1.8046875, "learning_rate": 5.067917131505928e-06, "loss": 0.186, "mean_token_accuracy": 0.9609566831588745, "num_tokens": 2556096356.0, "step": 24100 }, { "entropy": 1.34828125, "epoch": 0.5913030703687381, "grad_norm": 2.375, "learning_rate": 5.063513373008014e-06, "loss": 0.1874, "mean_token_accuracy": 0.9602975726127625, "num_tokens": 2561716691.0, "step": 24150 }, { "entropy": 1.36828125, "epoch": 0.5925273003280936, "grad_norm": 1.7578125, "learning_rate": 5.059101159100625e-06, "loss": 0.1911, "mean_token_accuracy": 0.9601788830757141, "num_tokens": 2566995725.0, "step": 24200 }, { "entropy": 1.36234375, "epoch": 0.5937515302874492, "grad_norm": 2.671875, "learning_rate": 5.054680507863158e-06, "loss": 0.196, "mean_token_accuracy": 0.9593268644809723, "num_tokens": 2572823278.0, "step": 24250 }, { "entropy": 1.36125, "epoch": 0.5949757602468048, "grad_norm": 2.375, "learning_rate": 5.050251437409581e-06, "loss": 0.1746, "mean_token_accuracy": 0.9630362141132355, "num_tokens": 2577835467.0, "step": 24300 }, { "entropy": 1.365625, "epoch": 0.5961999902061603, "grad_norm": 3.140625, "learning_rate": 5.045813965888362e-06, "loss": 0.184, "mean_token_accuracy": 0.9621260786056518, "num_tokens": 2582930120.0, "step": 24350 }, { "entropy": 1.355625, "epoch": 0.5974242201655159, "grad_norm": 3.40625, "learning_rate": 5.04136811148239e-06, "loss": 0.1697, "mean_token_accuracy": 0.963900375366211, "num_tokens": 2587853502.0, "step": 24400 }, { "entropy": 1.36140625, "epoch": 0.5986484501248714, "grad_norm": 2.4375, "learning_rate": 5.036913892408908e-06, "loss": 0.1837, "mean_token_accuracy": 0.9621051216125488, "num_tokens": 2593227737.0, "step": 24450 }, { "entropy": 1.3525, "epoch": 0.599872680084227, "grad_norm": 2.203125, "learning_rate": 5.032451326919429e-06, "loss": 0.1799, "mean_token_accuracy": 0.962098822593689, "num_tokens": 2598591436.0, "step": 24500 }, { "entropy": 1.34015625, "epoch": 0.6010969100435826, "grad_norm": 2.53125, "learning_rate": 5.027980433299671e-06, "loss": 0.1758, "mean_token_accuracy": 0.9619297671318054, "num_tokens": 2604000565.0, "step": 24550 }, { "entropy": 1.3484375, "epoch": 0.6023211400029381, "grad_norm": 2.71875, "learning_rate": 5.023501229869474e-06, "loss": 0.1737, "mean_token_accuracy": 0.9643021488189697, "num_tokens": 2608991683.0, "step": 24600 }, { "entropy": 1.33015625, "epoch": 0.6035453699622937, "grad_norm": 1.9765625, "learning_rate": 5.0190137349827266e-06, "loss": 0.1665, "mean_token_accuracy": 0.9643359172344208, "num_tokens": 2614123184.0, "step": 24650 }, { "entropy": 1.344375, "epoch": 0.6047695999216492, "grad_norm": 2.96875, "learning_rate": 5.014517967027297e-06, "loss": 0.1805, "mean_token_accuracy": 0.962350081205368, "num_tokens": 2619309044.0, "step": 24700 }, { "entropy": 1.3540625, "epoch": 0.6059938298810048, "grad_norm": 2.734375, "learning_rate": 5.01001394442495e-06, "loss": 0.1776, "mean_token_accuracy": 0.9621638679504394, "num_tokens": 2624919047.0, "step": 24750 }, { "entropy": 1.34859375, "epoch": 0.6072180598403604, "grad_norm": 3.03125, "learning_rate": 5.005501685631273e-06, "loss": 0.1733, "mean_token_accuracy": 0.9635497546195984, "num_tokens": 2630407723.0, "step": 24800 }, { "entropy": 1.3534375, "epoch": 0.6084422897997159, "grad_norm": 1.5390625, "learning_rate": 5.000981209135607e-06, "loss": 0.1781, "mean_token_accuracy": 0.9629986727237702, "num_tokens": 2635671685.0, "step": 24850 }, { "entropy": 1.3459375, "epoch": 0.6096665197590715, "grad_norm": 3.71875, "learning_rate": 4.9964525334609604e-06, "loss": 0.174, "mean_token_accuracy": 0.9627162063121796, "num_tokens": 2641068693.0, "step": 24900 }, { "entropy": 1.35453125, "epoch": 0.6108907497184272, "grad_norm": 2.75, "learning_rate": 4.99191567716394e-06, "loss": 0.1796, "mean_token_accuracy": 0.9617865860462189, "num_tokens": 2646610014.0, "step": 24950 }, { "entropy": 1.37453125, "epoch": 0.6121149796777827, "grad_norm": 3.109375, "learning_rate": 4.987370658834675e-06, "loss": 0.1833, "mean_token_accuracy": 0.9610668885707855, "num_tokens": 2651951764.0, "step": 25000 }, { "entropy": 1.40046875, "epoch": 0.6133392096371383, "grad_norm": 3.828125, "learning_rate": 4.982817497096737e-06, "loss": 0.1758, "mean_token_accuracy": 0.9631572890281678, "num_tokens": 2657065776.0, "step": 25050 }, { "entropy": 1.38859375, "epoch": 0.6145634395964938, "grad_norm": 3.0625, "learning_rate": 4.978256210607068e-06, "loss": 0.1738, "mean_token_accuracy": 0.9639844071865081, "num_tokens": 2662222291.0, "step": 25100 }, { "entropy": 1.3496875, "epoch": 0.6157876695558494, "grad_norm": 3.21875, "learning_rate": 4.973686818055901e-06, "loss": 0.1684, "mean_token_accuracy": 0.9642084753513336, "num_tokens": 2667209443.0, "step": 25150 }, { "entropy": 1.36375, "epoch": 0.617011899515205, "grad_norm": 1.859375, "learning_rate": 4.969109338166683e-06, "loss": 0.1719, "mean_token_accuracy": 0.9646093189716339, "num_tokens": 2672346139.0, "step": 25200 }, { "entropy": 1.38625, "epoch": 0.6182361294745605, "grad_norm": 2.40625, "learning_rate": 4.964523789695999e-06, "loss": 0.1855, "mean_token_accuracy": 0.9612112033367157, "num_tokens": 2677709139.0, "step": 25250 }, { "entropy": 1.38171875, "epoch": 0.6194603594339161, "grad_norm": 2.90625, "learning_rate": 4.959930191433498e-06, "loss": 0.1832, "mean_token_accuracy": 0.9613463747501373, "num_tokens": 2682889432.0, "step": 25300 }, { "entropy": 1.39375, "epoch": 0.6206845893932716, "grad_norm": 2.8125, "learning_rate": 4.955328562201814e-06, "loss": 0.1953, "mean_token_accuracy": 0.959397931098938, "num_tokens": 2688531671.0, "step": 25350 }, { "entropy": 1.396875, "epoch": 0.6219088193526272, "grad_norm": 1.8984375, "learning_rate": 4.950718920856486e-06, "loss": 0.1882, "mean_token_accuracy": 0.9605313742160797, "num_tokens": 2693586026.0, "step": 25400 }, { "entropy": 1.38203125, "epoch": 0.6231330493119828, "grad_norm": 2.328125, "learning_rate": 4.946101286285884e-06, "loss": 0.1708, "mean_token_accuracy": 0.9638578796386719, "num_tokens": 2698728829.0, "step": 25450 }, { "entropy": 1.3803125, "epoch": 0.6243572792713383, "grad_norm": 3.53125, "learning_rate": 4.9414756774111335e-06, "loss": 0.167, "mean_token_accuracy": 0.9648666107654571, "num_tokens": 2703894118.0, "step": 25500 }, { "entropy": 1.4071875, "epoch": 0.6255815092306939, "grad_norm": 3.46875, "learning_rate": 4.93684211318603e-06, "loss": 0.1782, "mean_token_accuracy": 0.962544618844986, "num_tokens": 2709087928.0, "step": 25550 }, { "entropy": 1.40078125, "epoch": 0.6268057391900494, "grad_norm": 3.28125, "learning_rate": 4.932200612596974e-06, "loss": 0.1757, "mean_token_accuracy": 0.963033629655838, "num_tokens": 2714244664.0, "step": 25600 }, { "entropy": 1.401875, "epoch": 0.628029969149405, "grad_norm": 3.859375, "learning_rate": 4.927551194662878e-06, "loss": 0.1701, "mean_token_accuracy": 0.9642516016960144, "num_tokens": 2719276387.0, "step": 25650 }, { "entropy": 1.4296875, "epoch": 0.6292541991087606, "grad_norm": 2.625, "learning_rate": 4.922893878435101e-06, "loss": 0.1877, "mean_token_accuracy": 0.9612637603282929, "num_tokens": 2724924886.0, "step": 25700 }, { "entropy": 1.40390625, "epoch": 0.6304784290681161, "grad_norm": 2.546875, "learning_rate": 4.918228682997367e-06, "loss": 0.1751, "mean_token_accuracy": 0.9626137948036194, "num_tokens": 2730190384.0, "step": 25750 }, { "entropy": 1.4384375, "epoch": 0.6317026590274717, "grad_norm": 1.7421875, "learning_rate": 4.9135556274656825e-06, "loss": 0.1921, "mean_token_accuracy": 0.9599238002300262, "num_tokens": 2735642568.0, "step": 25800 }, { "entropy": 1.43296875, "epoch": 0.6329268889868273, "grad_norm": 2.609375, "learning_rate": 4.908874730988262e-06, "loss": 0.1859, "mean_token_accuracy": 0.9601176917552948, "num_tokens": 2741009627.0, "step": 25850 }, { "entropy": 1.42296875, "epoch": 0.6341511189461828, "grad_norm": 2.171875, "learning_rate": 4.904186012745451e-06, "loss": 0.1836, "mean_token_accuracy": 0.9604202997684479, "num_tokens": 2746576865.0, "step": 25900 }, { "entropy": 1.42078125, "epoch": 0.6353753489055384, "grad_norm": 3.109375, "learning_rate": 4.899489491949643e-06, "loss": 0.1678, "mean_token_accuracy": 0.9639356219768525, "num_tokens": 2751636571.0, "step": 25950 }, { "entropy": 1.43125, "epoch": 0.6365995788648939, "grad_norm": 3.328125, "learning_rate": 4.894785187845203e-06, "loss": 0.1763, "mean_token_accuracy": 0.9626227140426635, "num_tokens": 2756749043.0, "step": 26000 }, { "entropy": 1.41953125, "epoch": 0.6378238088242495, "grad_norm": 1.921875, "learning_rate": 4.890073119708392e-06, "loss": 0.1716, "mean_token_accuracy": 0.9636380136013031, "num_tokens": 2761887971.0, "step": 26050 }, { "entropy": 1.42109375, "epoch": 0.6390480387836052, "grad_norm": 2.0625, "learning_rate": 4.88535330684728e-06, "loss": 0.1754, "mean_token_accuracy": 0.9623912250995637, "num_tokens": 2767051370.0, "step": 26100 }, { "entropy": 1.4259375, "epoch": 0.6402722687429607, "grad_norm": 2.546875, "learning_rate": 4.880625768601674e-06, "loss": 0.1781, "mean_token_accuracy": 0.9622378349304199, "num_tokens": 2772481902.0, "step": 26150 }, { "entropy": 1.4315625, "epoch": 0.6414964987023163, "grad_norm": 2.484375, "learning_rate": 4.87589052434304e-06, "loss": 0.1874, "mean_token_accuracy": 0.9602720224857331, "num_tokens": 2777927527.0, "step": 26200 }, { "entropy": 1.4140625, "epoch": 0.6427207286616718, "grad_norm": 2.421875, "learning_rate": 4.871147593474412e-06, "loss": 0.184, "mean_token_accuracy": 0.9599432504177093, "num_tokens": 2783446389.0, "step": 26250 }, { "entropy": 1.4053125, "epoch": 0.6439449586210274, "grad_norm": 2.40625, "learning_rate": 4.866396995430328e-06, "loss": 0.1786, "mean_token_accuracy": 0.9628067684173583, "num_tokens": 2788980882.0, "step": 26300 }, { "entropy": 1.38875, "epoch": 0.645169188580383, "grad_norm": 2.71875, "learning_rate": 4.861638749676737e-06, "loss": 0.1677, "mean_token_accuracy": 0.9639978551864624, "num_tokens": 2793955184.0, "step": 26350 }, { "entropy": 1.4034375, "epoch": 0.6463934185397385, "grad_norm": 1.6953125, "learning_rate": 4.85687287571093e-06, "loss": 0.1721, "mean_token_accuracy": 0.9636970722675323, "num_tokens": 2799185455.0, "step": 26400 }, { "entropy": 1.40828125, "epoch": 0.6476176484990941, "grad_norm": 3.640625, "learning_rate": 4.852099393061452e-06, "loss": 0.1818, "mean_token_accuracy": 0.962208844423294, "num_tokens": 2804463803.0, "step": 26450 }, { "entropy": 1.38484375, "epoch": 0.6488418784584497, "grad_norm": 1.75, "learning_rate": 4.847318321288027e-06, "loss": 0.165, "mean_token_accuracy": 0.9649109244346619, "num_tokens": 2809874779.0, "step": 26500 }, { "entropy": 1.37953125, "epoch": 0.6500661084178052, "grad_norm": 2.984375, "learning_rate": 4.842529679981474e-06, "loss": 0.1694, "mean_token_accuracy": 0.9632159042358398, "num_tokens": 2814714128.0, "step": 26550 }, { "entropy": 1.39625, "epoch": 0.6512903383771608, "grad_norm": 2.765625, "learning_rate": 4.8377334887636305e-06, "loss": 0.1697, "mean_token_accuracy": 0.9637495183944702, "num_tokens": 2819740494.0, "step": 26600 }, { "entropy": 1.39109375, "epoch": 0.6525145683365163, "grad_norm": 3.03125, "learning_rate": 4.8329297672872695e-06, "loss": 0.1816, "mean_token_accuracy": 0.9610202670097351, "num_tokens": 2824966205.0, "step": 26650 }, { "entropy": 1.37796875, "epoch": 0.6537387982958719, "grad_norm": 2.53125, "learning_rate": 4.828118535236023e-06, "loss": 0.1742, "mean_token_accuracy": 0.9625972366333008, "num_tokens": 2830034251.0, "step": 26700 }, { "entropy": 1.3953125, "epoch": 0.6549630282552275, "grad_norm": 2.28125, "learning_rate": 4.823299812324291e-06, "loss": 0.1847, "mean_token_accuracy": 0.9611959600448609, "num_tokens": 2835494370.0, "step": 26750 }, { "entropy": 1.38203125, "epoch": 0.656187258214583, "grad_norm": 2.15625, "learning_rate": 4.818473618297175e-06, "loss": 0.1728, "mean_token_accuracy": 0.9636625552177429, "num_tokens": 2840744565.0, "step": 26800 }, { "entropy": 1.3696875, "epoch": 0.6574114881739386, "grad_norm": 3.671875, "learning_rate": 4.8136399729303875e-06, "loss": 0.1599, "mean_token_accuracy": 0.9664247930049896, "num_tokens": 2845515500.0, "step": 26850 }, { "entropy": 1.39671875, "epoch": 0.6586357181332941, "grad_norm": 2.140625, "learning_rate": 4.808798896030171e-06, "loss": 0.182, "mean_token_accuracy": 0.9610953998565673, "num_tokens": 2850746030.0, "step": 26900 }, { "entropy": 1.38609375, "epoch": 0.6598599480926497, "grad_norm": 1.578125, "learning_rate": 4.803950407433224e-06, "loss": 0.1774, "mean_token_accuracy": 0.9627044332027436, "num_tokens": 2856071580.0, "step": 26950 }, { "entropy": 1.38640625, "epoch": 0.6610841780520053, "grad_norm": 2.359375, "learning_rate": 4.799094527006611e-06, "loss": 0.1747, "mean_token_accuracy": 0.9633591079711914, "num_tokens": 2861236205.0, "step": 27000 }, { "entropy": 1.38140625, "epoch": 0.6623084080113608, "grad_norm": 2.046875, "learning_rate": 4.794231274647687e-06, "loss": 0.175, "mean_token_accuracy": 0.9629326021671295, "num_tokens": 2866317531.0, "step": 27050 }, { "entropy": 1.37421875, "epoch": 0.6635326379707164, "grad_norm": 2.765625, "learning_rate": 4.789360670284014e-06, "loss": 0.178, "mean_token_accuracy": 0.962060467004776, "num_tokens": 2871541131.0, "step": 27100 }, { "entropy": 1.4078125, "epoch": 0.6647568679300719, "grad_norm": 1.921875, "learning_rate": 4.784482733873279e-06, "loss": 0.1962, "mean_token_accuracy": 0.959048901796341, "num_tokens": 2877146197.0, "step": 27150 }, { "entropy": 1.3890625, "epoch": 0.6659810978894275, "grad_norm": 2.125, "learning_rate": 4.7795974854032114e-06, "loss": 0.1823, "mean_token_accuracy": 0.9619522738456726, "num_tokens": 2882596630.0, "step": 27200 }, { "entropy": 1.3603125, "epoch": 0.6672053278487832, "grad_norm": 2.421875, "learning_rate": 4.774704944891505e-06, "loss": 0.175, "mean_token_accuracy": 0.9625801253318786, "num_tokens": 2887948438.0, "step": 27250 }, { "entropy": 1.39546875, "epoch": 0.6684295578081387, "grad_norm": 2.265625, "learning_rate": 4.769805132385734e-06, "loss": 0.1879, "mean_token_accuracy": 0.9613603317737579, "num_tokens": 2893501173.0, "step": 27300 }, { "entropy": 1.40875, "epoch": 0.6696537877674943, "grad_norm": 2.3125, "learning_rate": 4.764898067963265e-06, "loss": 0.1873, "mean_token_accuracy": 0.9604850566387176, "num_tokens": 2898869944.0, "step": 27350 }, { "entropy": 1.37859375, "epoch": 0.6708780177268499, "grad_norm": 2.40625, "learning_rate": 4.759983771731184e-06, "loss": 0.1679, "mean_token_accuracy": 0.965053141117096, "num_tokens": 2903596870.0, "step": 27400 }, { "entropy": 1.37453125, "epoch": 0.6721022476862054, "grad_norm": 2.03125, "learning_rate": 4.75506226382621e-06, "loss": 0.1862, "mean_token_accuracy": 0.9613700366020203, "num_tokens": 2909474929.0, "step": 27450 }, { "entropy": 1.36875, "epoch": 0.673326477645561, "grad_norm": 2.453125, "learning_rate": 4.750133564414611e-06, "loss": 0.1667, "mean_token_accuracy": 0.9644119250774383, "num_tokens": 2914673564.0, "step": 27500 }, { "entropy": 1.396875, "epoch": 0.6745507076049165, "grad_norm": 2.796875, "learning_rate": 4.745197693692121e-06, "loss": 0.1852, "mean_token_accuracy": 0.9608116745948792, "num_tokens": 2920176865.0, "step": 27550 }, { "entropy": 1.41515625, "epoch": 0.6757749375642721, "grad_norm": 1.8359375, "learning_rate": 4.740254671883864e-06, "loss": 0.1912, "mean_token_accuracy": 0.9596376729011535, "num_tokens": 2925586459.0, "step": 27600 }, { "entropy": 1.3996875, "epoch": 0.6769991675236277, "grad_norm": 3.65625, "learning_rate": 4.735304519244263e-06, "loss": 0.1745, "mean_token_accuracy": 0.9637066113948822, "num_tokens": 2930825954.0, "step": 27650 }, { "entropy": 1.3809375, "epoch": 0.6782233974829832, "grad_norm": 1.921875, "learning_rate": 4.73034725605696e-06, "loss": 0.1658, "mean_token_accuracy": 0.9653242897987365, "num_tokens": 2935862959.0, "step": 27700 }, { "entropy": 1.38953125, "epoch": 0.6794476274423388, "grad_norm": 3.0625, "learning_rate": 4.725382902634733e-06, "loss": 0.1681, "mean_token_accuracy": 0.9643997454643249, "num_tokens": 2940725166.0, "step": 27750 }, { "entropy": 1.40421875, "epoch": 0.6806718574016943, "grad_norm": 2.859375, "learning_rate": 4.720411479319414e-06, "loss": 0.1725, "mean_token_accuracy": 0.9641519057750702, "num_tokens": 2946188027.0, "step": 27800 }, { "entropy": 1.40796875, "epoch": 0.6818960873610499, "grad_norm": 2.828125, "learning_rate": 4.7154330064818045e-06, "loss": 0.1841, "mean_token_accuracy": 0.9606011056900025, "num_tokens": 2951612651.0, "step": 27850 }, { "entropy": 1.395625, "epoch": 0.6831203173204055, "grad_norm": 2.96875, "learning_rate": 4.710447504521588e-06, "loss": 0.1647, "mean_token_accuracy": 0.9641698563098907, "num_tokens": 2956787623.0, "step": 27900 }, { "entropy": 1.40359375, "epoch": 0.684344547279761, "grad_norm": 3.5625, "learning_rate": 4.705454993867257e-06, "loss": 0.1751, "mean_token_accuracy": 0.9634602963924408, "num_tokens": 2961925459.0, "step": 27950 }, { "entropy": 1.3925, "epoch": 0.6855687772391166, "grad_norm": 1.921875, "learning_rate": 4.700455494976019e-06, "loss": 0.1751, "mean_token_accuracy": 0.9632600677013398, "num_tokens": 2967274024.0, "step": 28000 }, { "entropy": 1.3640625, "epoch": 0.6867930071984721, "grad_norm": 2.140625, "learning_rate": 4.695449028333715e-06, "loss": 0.1581, "mean_token_accuracy": 0.965574380159378, "num_tokens": 2972439136.0, "step": 28050 }, { "entropy": 1.37203125, "epoch": 0.6880172371578277, "grad_norm": 2.640625, "learning_rate": 4.6904356144547405e-06, "loss": 0.1833, "mean_token_accuracy": 0.9605630087852478, "num_tokens": 2977717715.0, "step": 28100 }, { "entropy": 1.38703125, "epoch": 0.6892414671171833, "grad_norm": 2.65625, "learning_rate": 4.685415273881955e-06, "loss": 0.1849, "mean_token_accuracy": 0.9602934348583222, "num_tokens": 2983019999.0, "step": 28150 }, { "entropy": 1.36609375, "epoch": 0.6904656970765388, "grad_norm": 1.65625, "learning_rate": 4.6803880271866e-06, "loss": 0.1635, "mean_token_accuracy": 0.9659206521511078, "num_tokens": 2987974089.0, "step": 28200 }, { "entropy": 1.38875, "epoch": 0.6916899270358944, "grad_norm": 2.171875, "learning_rate": 4.675353894968219e-06, "loss": 0.1956, "mean_token_accuracy": 0.958441025018692, "num_tokens": 2993587967.0, "step": 28250 }, { "entropy": 1.3828125, "epoch": 0.6929141569952499, "grad_norm": 1.796875, "learning_rate": 4.670312897854568e-06, "loss": 0.1822, "mean_token_accuracy": 0.9611673438549042, "num_tokens": 2999047067.0, "step": 28300 }, { "entropy": 1.36875, "epoch": 0.6941383869546055, "grad_norm": 2.375, "learning_rate": 4.665265056501529e-06, "loss": 0.1743, "mean_token_accuracy": 0.9631416380405426, "num_tokens": 3004064576.0, "step": 28350 }, { "entropy": 1.34109375, "epoch": 0.6953626169139612, "grad_norm": 3.0625, "learning_rate": 4.660210391593035e-06, "loss": 0.1593, "mean_token_accuracy": 0.9659523034095764, "num_tokens": 3009178123.0, "step": 28400 }, { "entropy": 1.36859375, "epoch": 0.6965868468733167, "grad_norm": 2.96875, "learning_rate": 4.655148923840974e-06, "loss": 0.1848, "mean_token_accuracy": 0.9613404250144959, "num_tokens": 3014406061.0, "step": 28450 }, { "entropy": 1.36828125, "epoch": 0.6978110768326723, "grad_norm": 2.234375, "learning_rate": 4.6500806739851114e-06, "loss": 0.1754, "mean_token_accuracy": 0.9632516479492188, "num_tokens": 3019405252.0, "step": 28500 }, { "entropy": 1.36640625, "epoch": 0.6990353067920279, "grad_norm": 3.265625, "learning_rate": 4.645005662793002e-06, "loss": 0.1765, "mean_token_accuracy": 0.9634008550643921, "num_tokens": 3024715395.0, "step": 28550 }, { "entropy": 1.386875, "epoch": 0.7002595367513834, "grad_norm": 1.7265625, "learning_rate": 4.639923911059907e-06, "loss": 0.1792, "mean_token_accuracy": 0.9633400416374207, "num_tokens": 3030214594.0, "step": 28600 }, { "entropy": 1.36390625, "epoch": 0.701483766710739, "grad_norm": 2.828125, "learning_rate": 4.634835439608706e-06, "loss": 0.1712, "mean_token_accuracy": 0.9632709419727326, "num_tokens": 3035472593.0, "step": 28650 }, { "entropy": 1.34984375, "epoch": 0.7027079966700945, "grad_norm": 2.640625, "learning_rate": 4.629740269289813e-06, "loss": 0.1634, "mean_token_accuracy": 0.9657196223735809, "num_tokens": 3040576077.0, "step": 28700 }, { "entropy": 1.37296875, "epoch": 0.7039322266294501, "grad_norm": 1.8125, "learning_rate": 4.6246384209810935e-06, "loss": 0.1857, "mean_token_accuracy": 0.9612914025783539, "num_tokens": 3046057341.0, "step": 28750 }, { "entropy": 1.35765625, "epoch": 0.7051564565888057, "grad_norm": 3.5, "learning_rate": 4.6195299155877746e-06, "loss": 0.1752, "mean_token_accuracy": 0.9628597724437714, "num_tokens": 3051406159.0, "step": 28800 }, { "entropy": 1.34625, "epoch": 0.7063806865481612, "grad_norm": 2.046875, "learning_rate": 4.61441477404236e-06, "loss": 0.1736, "mean_token_accuracy": 0.963384006023407, "num_tokens": 3056663844.0, "step": 28850 }, { "entropy": 1.35421875, "epoch": 0.7076049165075168, "grad_norm": 2.546875, "learning_rate": 4.60929301730455e-06, "loss": 0.1857, "mean_token_accuracy": 0.9611174511909485, "num_tokens": 3062180594.0, "step": 28900 }, { "entropy": 1.3396875, "epoch": 0.7088291464668723, "grad_norm": 2.171875, "learning_rate": 4.604164666361146e-06, "loss": 0.1771, "mean_token_accuracy": 0.9630412280559539, "num_tokens": 3067629529.0, "step": 28950 }, { "entropy": 1.3521875, "epoch": 0.7100533764262279, "grad_norm": 2.53125, "learning_rate": 4.599029742225975e-06, "loss": 0.1854, "mean_token_accuracy": 0.9603700506687164, "num_tokens": 3072962675.0, "step": 29000 }, { "entropy": 1.34265625, "epoch": 0.7112776063855835, "grad_norm": 2.578125, "learning_rate": 4.593888265939793e-06, "loss": 0.1668, "mean_token_accuracy": 0.9641862511634827, "num_tokens": 3078457917.0, "step": 29050 }, { "entropy": 1.3565625, "epoch": 0.712501836344939, "grad_norm": 2.484375, "learning_rate": 4.5887402585702056e-06, "loss": 0.1741, "mean_token_accuracy": 0.9627685403823852, "num_tokens": 3083722495.0, "step": 29100 }, { "entropy": 1.3690625, "epoch": 0.7137260663042946, "grad_norm": 2.0, "learning_rate": 4.583585741211583e-06, "loss": 0.1782, "mean_token_accuracy": 0.9620171189308167, "num_tokens": 3089097439.0, "step": 29150 }, { "entropy": 1.3615625, "epoch": 0.7149502962636501, "grad_norm": 2.90625, "learning_rate": 4.5784247349849666e-06, "loss": 0.183, "mean_token_accuracy": 0.9622057628631592, "num_tokens": 3094373355.0, "step": 29200 }, { "entropy": 1.3421875, "epoch": 0.7161745262230057, "grad_norm": 1.953125, "learning_rate": 4.57325726103799e-06, "loss": 0.1771, "mean_token_accuracy": 0.9627100145816803, "num_tokens": 3099619006.0, "step": 29250 }, { "entropy": 1.33015625, "epoch": 0.7173987561823613, "grad_norm": 3.296875, "learning_rate": 4.568083340544785e-06, "loss": 0.1738, "mean_token_accuracy": 0.9631901240348816, "num_tokens": 3104769496.0, "step": 29300 }, { "entropy": 1.32921875, "epoch": 0.7186229861417168, "grad_norm": 2.359375, "learning_rate": 4.562902994705902e-06, "loss": 0.1689, "mean_token_accuracy": 0.9646138906478882, "num_tokens": 3110079410.0, "step": 29350 }, { "entropy": 1.3515625, "epoch": 0.7198472161010724, "grad_norm": 2.640625, "learning_rate": 4.557716244748217e-06, "loss": 0.186, "mean_token_accuracy": 0.9605904114246369, "num_tokens": 3115590754.0, "step": 29400 }, { "entropy": 1.33421875, "epoch": 0.721071446060428, "grad_norm": 1.859375, "learning_rate": 4.55252311192485e-06, "loss": 0.1727, "mean_token_accuracy": 0.9634395956993103, "num_tokens": 3120943769.0, "step": 29450 }, { "entropy": 1.3384375, "epoch": 0.7222956760197835, "grad_norm": 1.8515625, "learning_rate": 4.547323617515073e-06, "loss": 0.1754, "mean_token_accuracy": 0.9623040866851806, "num_tokens": 3126534469.0, "step": 29500 }, { "entropy": 1.306875, "epoch": 0.7235199059791391, "grad_norm": 3.5, "learning_rate": 4.542117782824228e-06, "loss": 0.1649, "mean_token_accuracy": 0.9650185751914978, "num_tokens": 3131829007.0, "step": 29550 }, { "entropy": 1.31984375, "epoch": 0.7247441359384947, "grad_norm": 1.7109375, "learning_rate": 4.536905629183632e-06, "loss": 0.1844, "mean_token_accuracy": 0.9605432045459747, "num_tokens": 3137395527.0, "step": 29600 }, { "entropy": 1.3121875, "epoch": 0.7259683658978503, "grad_norm": 2.3125, "learning_rate": 4.5316871779505e-06, "loss": 0.1663, "mean_token_accuracy": 0.9653282749652863, "num_tokens": 3142501686.0, "step": 29650 }, { "entropy": 1.33921875, "epoch": 0.7271925958572059, "grad_norm": 1.9765625, "learning_rate": 4.5264624505078485e-06, "loss": 0.1796, "mean_token_accuracy": 0.9623512411117554, "num_tokens": 3147984109.0, "step": 29700 }, { "entropy": 1.3259375, "epoch": 0.7284168258165614, "grad_norm": 3.671875, "learning_rate": 4.521231468264411e-06, "loss": 0.173, "mean_token_accuracy": 0.9634522151947021, "num_tokens": 3153428961.0, "step": 29750 }, { "entropy": 1.339375, "epoch": 0.729641055775917, "grad_norm": 1.8046875, "learning_rate": 4.515994252654552e-06, "loss": 0.1846, "mean_token_accuracy": 0.9607186770439148, "num_tokens": 3158828246.0, "step": 29800 }, { "entropy": 1.29671875, "epoch": 0.7308652857352725, "grad_norm": 3.140625, "learning_rate": 4.510750825138178e-06, "loss": 0.1608, "mean_token_accuracy": 0.9657926094532013, "num_tokens": 3163804439.0, "step": 29850 }, { "entropy": 1.3315625, "epoch": 0.7320895156946281, "grad_norm": 2.9375, "learning_rate": 4.505501207200649e-06, "loss": 0.1818, "mean_token_accuracy": 0.9619475591182709, "num_tokens": 3169333412.0, "step": 29900 }, { "entropy": 1.324375, "epoch": 0.7333137456539837, "grad_norm": 2.15625, "learning_rate": 4.500245420352687e-06, "loss": 0.1733, "mean_token_accuracy": 0.963250036239624, "num_tokens": 3174683947.0, "step": 29950 }, { "entropy": 1.32015625, "epoch": 0.7345379756133392, "grad_norm": 3.171875, "learning_rate": 4.494983486130298e-06, "loss": 0.1755, "mean_token_accuracy": 0.9633795261383057, "num_tokens": 3179817804.0, "step": 30000 }, { "epoch": 0.7345379756133392, "eval_entropy": 1.3244140625, "eval_loss": 0.1920091211795807, "eval_mean_token_accuracy": 0.9597868000467619, "eval_num_tokens": 3179817804.0, "eval_runtime": 606.2695, "eval_samples_per_second": 15.927, "eval_steps_per_second": 0.2, "step": 30000 }, { "entropy": 1.34265625, "epoch": 0.7357622055726948, "grad_norm": 2.828125, "learning_rate": 4.489715426094674e-06, "loss": 0.1971, "mean_token_accuracy": 0.9590841460227967, "num_tokens": 3185695558.0, "step": 30050 }, { "entropy": 1.33234375, "epoch": 0.7369864355320503, "grad_norm": 2.28125, "learning_rate": 4.484441261832107e-06, "loss": 0.1767, "mean_token_accuracy": 0.9629596638679504, "num_tokens": 3191177099.0, "step": 30100 }, { "entropy": 1.3253125, "epoch": 0.7382106654914059, "grad_norm": 2.75, "learning_rate": 4.479161014953903e-06, "loss": 0.1795, "mean_token_accuracy": 0.9617591965198516, "num_tokens": 3196688072.0, "step": 30150 }, { "entropy": 1.3171875, "epoch": 0.7394348954507615, "grad_norm": 2.578125, "learning_rate": 4.473874707096293e-06, "loss": 0.185, "mean_token_accuracy": 0.9615085804462433, "num_tokens": 3202252950.0, "step": 30200 }, { "entropy": 1.3203125, "epoch": 0.740659125410117, "grad_norm": 3.078125, "learning_rate": 4.46858235992034e-06, "loss": 0.1716, "mean_token_accuracy": 0.9639656889438629, "num_tokens": 3207720004.0, "step": 30250 }, { "entropy": 1.33046875, "epoch": 0.7418833553694726, "grad_norm": 3.4375, "learning_rate": 4.463283995111858e-06, "loss": 0.1909, "mean_token_accuracy": 0.9597360849380493, "num_tokens": 3213270190.0, "step": 30300 }, { "entropy": 1.32171875, "epoch": 0.7431075853288281, "grad_norm": 3.671875, "learning_rate": 4.4579796343813155e-06, "loss": 0.1746, "mean_token_accuracy": 0.9631195080280304, "num_tokens": 3218354333.0, "step": 30350 }, { "entropy": 1.3359375, "epoch": 0.7443318152881837, "grad_norm": 3.15625, "learning_rate": 4.452669299463749e-06, "loss": 0.172, "mean_token_accuracy": 0.963985036611557, "num_tokens": 3223570126.0, "step": 30400 }, { "entropy": 1.32640625, "epoch": 0.7455560452475393, "grad_norm": 1.8125, "learning_rate": 4.44735301211868e-06, "loss": 0.1807, "mean_token_accuracy": 0.9622200524806976, "num_tokens": 3228934737.0, "step": 30450 }, { "entropy": 1.34375, "epoch": 0.7467802752068948, "grad_norm": 3.109375, "learning_rate": 4.442030794130013e-06, "loss": 0.1719, "mean_token_accuracy": 0.9641703021526337, "num_tokens": 3234092609.0, "step": 30500 }, { "entropy": 1.3525, "epoch": 0.7480045051662504, "grad_norm": 2.328125, "learning_rate": 4.43670266730596e-06, "loss": 0.1871, "mean_token_accuracy": 0.9610934937000275, "num_tokens": 3239470570.0, "step": 30550 }, { "entropy": 1.35859375, "epoch": 0.749228735125606, "grad_norm": 3.234375, "learning_rate": 4.431368653478943e-06, "loss": 0.1799, "mean_token_accuracy": 0.9625358593463897, "num_tokens": 3245129970.0, "step": 30600 }, { "entropy": 1.36859375, "epoch": 0.7504529650849615, "grad_norm": 1.921875, "learning_rate": 4.426028774505504e-06, "loss": 0.1895, "mean_token_accuracy": 0.9608589220046997, "num_tokens": 3250417534.0, "step": 30650 }, { "entropy": 1.37203125, "epoch": 0.7516771950443171, "grad_norm": 3.125, "learning_rate": 4.420683052266223e-06, "loss": 0.1962, "mean_token_accuracy": 0.9591640889644623, "num_tokens": 3256202020.0, "step": 30700 }, { "entropy": 1.35421875, "epoch": 0.7529014250036727, "grad_norm": 2.84375, "learning_rate": 4.415331508665619e-06, "loss": 0.1723, "mean_token_accuracy": 0.9638619077205658, "num_tokens": 3261559010.0, "step": 30750 }, { "entropy": 1.36328125, "epoch": 0.7541256549630283, "grad_norm": 3.5625, "learning_rate": 4.409974165632064e-06, "loss": 0.1819, "mean_token_accuracy": 0.9618020045757294, "num_tokens": 3267151095.0, "step": 30800 }, { "entropy": 1.3546875, "epoch": 0.7553498849223839, "grad_norm": 3.484375, "learning_rate": 4.404611045117696e-06, "loss": 0.1792, "mean_token_accuracy": 0.9617926621437073, "num_tokens": 3272412916.0, "step": 30850 }, { "entropy": 1.3534375, "epoch": 0.7565741148817394, "grad_norm": 2.578125, "learning_rate": 4.399242169098329e-06, "loss": 0.1745, "mean_token_accuracy": 0.9625967741012573, "num_tokens": 3277577448.0, "step": 30900 }, { "entropy": 1.35625, "epoch": 0.757798344841095, "grad_norm": 2.65625, "learning_rate": 4.393867559573354e-06, "loss": 0.1744, "mean_token_accuracy": 0.9626732635498046, "num_tokens": 3282706579.0, "step": 30950 }, { "entropy": 1.36421875, "epoch": 0.7590225748004505, "grad_norm": 3.0, "learning_rate": 4.388487238565661e-06, "loss": 0.1784, "mean_token_accuracy": 0.9623777115345001, "num_tokens": 3287949862.0, "step": 31000 }, { "entropy": 1.36, "epoch": 0.7602468047598061, "grad_norm": 0.0230712890625, "learning_rate": 4.383101228121541e-06, "loss": 0.1788, "mean_token_accuracy": 0.9617887794971466, "num_tokens": 3293406088.0, "step": 31050 }, { "entropy": 1.35609375, "epoch": 0.7614710347191617, "grad_norm": 2.984375, "learning_rate": 4.377709550310598e-06, "loss": 0.1699, "mean_token_accuracy": 0.9636480760574341, "num_tokens": 3298608896.0, "step": 31100 }, { "entropy": 1.35375, "epoch": 0.7626952646785172, "grad_norm": 3.65625, "learning_rate": 4.37231222722566e-06, "loss": 0.1643, "mean_token_accuracy": 0.9644955229759217, "num_tokens": 3303290550.0, "step": 31150 }, { "entropy": 1.37390625, "epoch": 0.7639194946378728, "grad_norm": 2.46875, "learning_rate": 4.366909280982685e-06, "loss": 0.1766, "mean_token_accuracy": 0.9628056597709655, "num_tokens": 3308295645.0, "step": 31200 }, { "entropy": 1.36515625, "epoch": 0.7651437245972283, "grad_norm": 2.453125, "learning_rate": 4.361500733720674e-06, "loss": 0.1662, "mean_token_accuracy": 0.9649233341217041, "num_tokens": 3313438478.0, "step": 31250 }, { "entropy": 1.3575, "epoch": 0.7663679545565839, "grad_norm": 4.46875, "learning_rate": 4.356086607601575e-06, "loss": 0.1749, "mean_token_accuracy": 0.9627750849723816, "num_tokens": 3319025887.0, "step": 31300 }, { "entropy": 1.34359375, "epoch": 0.7675921845159395, "grad_norm": 2.6875, "learning_rate": 4.350666924810203e-06, "loss": 0.1647, "mean_token_accuracy": 0.9644002187252044, "num_tokens": 3323975976.0, "step": 31350 }, { "entropy": 1.35203125, "epoch": 0.768816414475295, "grad_norm": 3.765625, "learning_rate": 4.345241707554134e-06, "loss": 0.1674, "mean_token_accuracy": 0.9647248589992523, "num_tokens": 3329356054.0, "step": 31400 }, { "entropy": 1.36625, "epoch": 0.7700406444346506, "grad_norm": 2.6875, "learning_rate": 4.339810978063626e-06, "loss": 0.1776, "mean_token_accuracy": 0.9627327370643616, "num_tokens": 3334739313.0, "step": 31450 }, { "entropy": 1.35125, "epoch": 0.7712648743940062, "grad_norm": 1.875, "learning_rate": 4.334374758591524e-06, "loss": 0.1896, "mean_token_accuracy": 0.9596246099472046, "num_tokens": 3340200973.0, "step": 31500 }, { "entropy": 1.36171875, "epoch": 0.7724891043533617, "grad_norm": 2.328125, "learning_rate": 4.328933071413168e-06, "loss": 0.1731, "mean_token_accuracy": 0.9636253571510315, "num_tokens": 3345689303.0, "step": 31550 }, { "entropy": 1.36078125, "epoch": 0.7737133343127173, "grad_norm": 3.4375, "learning_rate": 4.323485938826302e-06, "loss": 0.1896, "mean_token_accuracy": 0.9603872370719909, "num_tokens": 3350984033.0, "step": 31600 }, { "entropy": 1.3403125, "epoch": 0.7749375642720728, "grad_norm": 2.5, "learning_rate": 4.318033383150981e-06, "loss": 0.1735, "mean_token_accuracy": 0.9628359317779541, "num_tokens": 3356162417.0, "step": 31650 }, { "entropy": 1.34640625, "epoch": 0.7761617942314284, "grad_norm": 2.0625, "learning_rate": 4.312575426729486e-06, "loss": 0.1848, "mean_token_accuracy": 0.9605207931995392, "num_tokens": 3361647453.0, "step": 31700 }, { "entropy": 1.33171875, "epoch": 0.777386024190784, "grad_norm": 1.9453125, "learning_rate": 4.307112091926226e-06, "loss": 0.1637, "mean_token_accuracy": 0.965142446756363, "num_tokens": 3366481444.0, "step": 31750 }, { "entropy": 1.37390625, "epoch": 0.7786102541501395, "grad_norm": 3.1875, "learning_rate": 4.301643401127647e-06, "loss": 0.1778, "mean_token_accuracy": 0.9628903007507325, "num_tokens": 3371649682.0, "step": 31800 }, { "entropy": 1.3721875, "epoch": 0.7798344841094951, "grad_norm": 2.625, "learning_rate": 4.2961693767421435e-06, "loss": 0.1645, "mean_token_accuracy": 0.9658307003974914, "num_tokens": 3376382887.0, "step": 31850 }, { "entropy": 1.358125, "epoch": 0.7810587140688507, "grad_norm": 2.921875, "learning_rate": 4.290690041199963e-06, "loss": 0.179, "mean_token_accuracy": 0.9622143077850341, "num_tokens": 3381791030.0, "step": 31900 }, { "entropy": 1.37015625, "epoch": 0.7822829440282063, "grad_norm": 2.125, "learning_rate": 4.285205416953118e-06, "loss": 0.1876, "mean_token_accuracy": 0.9609373700618744, "num_tokens": 3387334981.0, "step": 31950 }, { "entropy": 1.34765625, "epoch": 0.7835071739875619, "grad_norm": 2.515625, "learning_rate": 4.279715526475289e-06, "loss": 0.1762, "mean_token_accuracy": 0.962603681087494, "num_tokens": 3392713314.0, "step": 32000 }, { "entropy": 1.3678125, "epoch": 0.7847314039469174, "grad_norm": 2.609375, "learning_rate": 4.274220392261738e-06, "loss": 0.1887, "mean_token_accuracy": 0.9606349515914917, "num_tokens": 3398537796.0, "step": 32050 }, { "entropy": 1.33734375, "epoch": 0.785955633906273, "grad_norm": 2.921875, "learning_rate": 4.268720036829214e-06, "loss": 0.1748, "mean_token_accuracy": 0.964071912765503, "num_tokens": 3403920236.0, "step": 32100 }, { "entropy": 1.37, "epoch": 0.7871798638656285, "grad_norm": 2.328125, "learning_rate": 4.263214482715857e-06, "loss": 0.1654, "mean_token_accuracy": 0.9644496822357178, "num_tokens": 3409108918.0, "step": 32150 }, { "entropy": 1.35046875, "epoch": 0.7884040938249841, "grad_norm": 3.125, "learning_rate": 4.2577037524811104e-06, "loss": 0.1714, "mean_token_accuracy": 0.9636311101913452, "num_tokens": 3414387238.0, "step": 32200 }, { "entropy": 1.34359375, "epoch": 0.7896283237843397, "grad_norm": 2.328125, "learning_rate": 4.25218786870563e-06, "loss": 0.1552, "mean_token_accuracy": 0.965884006023407, "num_tokens": 3419148471.0, "step": 32250 }, { "entropy": 1.34875, "epoch": 0.7908525537436952, "grad_norm": 0.004241943359375, "learning_rate": 4.246666853991186e-06, "loss": 0.1676, "mean_token_accuracy": 0.9639466750621796, "num_tokens": 3424295496.0, "step": 32300 }, { "entropy": 1.364375, "epoch": 0.7920767837030508, "grad_norm": 1.6953125, "learning_rate": 4.241140730960573e-06, "loss": 0.1829, "mean_token_accuracy": 0.9615444934368134, "num_tokens": 3429846223.0, "step": 32350 }, { "entropy": 1.33828125, "epoch": 0.7933010136624064, "grad_norm": 3.53125, "learning_rate": 4.235609522257517e-06, "loss": 0.178, "mean_token_accuracy": 0.9621382772922515, "num_tokens": 3434814232.0, "step": 32400 }, { "entropy": 1.37265625, "epoch": 0.7945252436217619, "grad_norm": 2.28125, "learning_rate": 4.230073250546585e-06, "loss": 0.1854, "mean_token_accuracy": 0.9616455745697021, "num_tokens": 3440013747.0, "step": 32450 }, { "entropy": 1.33484375, "epoch": 0.7957494735811175, "grad_norm": 2.828125, "learning_rate": 4.224531938513088e-06, "loss": 0.175, "mean_token_accuracy": 0.9632323062419892, "num_tokens": 3445299571.0, "step": 32500 }, { "entropy": 1.34203125, "epoch": 0.796973703540473, "grad_norm": 2.421875, "learning_rate": 4.218985608862992e-06, "loss": 0.1814, "mean_token_accuracy": 0.9623367011547088, "num_tokens": 3450664579.0, "step": 32550 }, { "entropy": 1.3540625, "epoch": 0.7981979334998286, "grad_norm": 2.0625, "learning_rate": 4.213434284322819e-06, "loss": 0.1729, "mean_token_accuracy": 0.9627703261375428, "num_tokens": 3455979121.0, "step": 32600 }, { "entropy": 1.33734375, "epoch": 0.7994221634591842, "grad_norm": 2.796875, "learning_rate": 4.207877987639566e-06, "loss": 0.1764, "mean_token_accuracy": 0.9627932643890381, "num_tokens": 3461283678.0, "step": 32650 }, { "entropy": 1.3596875, "epoch": 0.8006463934185397, "grad_norm": 1.8984375, "learning_rate": 4.202316741580594e-06, "loss": 0.1854, "mean_token_accuracy": 0.9612032771110535, "num_tokens": 3467126201.0, "step": 32700 }, { "entropy": 1.344375, "epoch": 0.8018706233778953, "grad_norm": 2.921875, "learning_rate": 4.196750568933551e-06, "loss": 0.1721, "mean_token_accuracy": 0.9638476753234864, "num_tokens": 3472599559.0, "step": 32750 }, { "entropy": 1.3415625, "epoch": 0.8030948533372508, "grad_norm": 2.34375, "learning_rate": 4.191179492506271e-06, "loss": 0.1754, "mean_token_accuracy": 0.9628195893764496, "num_tokens": 3477994415.0, "step": 32800 }, { "entropy": 1.34953125, "epoch": 0.8043190832966064, "grad_norm": 2.15625, "learning_rate": 4.18560353512668e-06, "loss": 0.1778, "mean_token_accuracy": 0.9618653762340545, "num_tokens": 3483437386.0, "step": 32850 }, { "entropy": 1.34390625, "epoch": 0.805543313255962, "grad_norm": 2.875, "learning_rate": 4.1800227196427055e-06, "loss": 0.1751, "mean_token_accuracy": 0.9623115694522858, "num_tokens": 3488795577.0, "step": 32900 }, { "entropy": 1.32609375, "epoch": 0.8067675432153175, "grad_norm": 1.9921875, "learning_rate": 4.17443706892218e-06, "loss": 0.1766, "mean_token_accuracy": 0.9626455020904541, "num_tokens": 3494139245.0, "step": 32950 }, { "entropy": 1.34953125, "epoch": 0.8079917731746731, "grad_norm": 3.640625, "learning_rate": 4.168846605852751e-06, "loss": 0.1811, "mean_token_accuracy": 0.9624789762496948, "num_tokens": 3499294686.0, "step": 33000 }, { "entropy": 1.34546875, "epoch": 0.8092160031340287, "grad_norm": 3.234375, "learning_rate": 4.1632513533417825e-06, "loss": 0.1629, "mean_token_accuracy": 0.9650925529003144, "num_tokens": 3504042622.0, "step": 33050 }, { "entropy": 1.3675, "epoch": 0.8104402330933843, "grad_norm": 1.8984375, "learning_rate": 4.157651334316264e-06, "loss": 0.159, "mean_token_accuracy": 0.9659399092197418, "num_tokens": 3509103882.0, "step": 33100 }, { "entropy": 1.35625, "epoch": 0.8116644630527399, "grad_norm": 1.9765625, "learning_rate": 4.1520465717227206e-06, "loss": 0.1782, "mean_token_accuracy": 0.9628897225856781, "num_tokens": 3514150747.0, "step": 33150 }, { "entropy": 1.3603125, "epoch": 0.8128886930120954, "grad_norm": 2.859375, "learning_rate": 4.146437088527108e-06, "loss": 0.1811, "mean_token_accuracy": 0.9617001414299011, "num_tokens": 3519220750.0, "step": 33200 }, { "entropy": 1.36859375, "epoch": 0.814112922971451, "grad_norm": 2.921875, "learning_rate": 4.140822907714728e-06, "loss": 0.1885, "mean_token_accuracy": 0.9607588303089142, "num_tokens": 3524668178.0, "step": 33250 }, { "entropy": 1.35484375, "epoch": 0.8153371529308066, "grad_norm": 1.6015625, "learning_rate": 4.135204052290131e-06, "loss": 0.1645, "mean_token_accuracy": 0.9654926788806916, "num_tokens": 3529737924.0, "step": 33300 }, { "entropy": 1.33109375, "epoch": 0.8165613828901621, "grad_norm": 3.0, "learning_rate": 4.129580545277023e-06, "loss": 0.1637, "mean_token_accuracy": 0.9648844826221467, "num_tokens": 3534673592.0, "step": 33350 }, { "entropy": 1.33046875, "epoch": 0.8177856128495177, "grad_norm": 2.1875, "learning_rate": 4.123952409718169e-06, "loss": 0.1705, "mean_token_accuracy": 0.963813624382019, "num_tokens": 3539705624.0, "step": 33400 }, { "entropy": 1.3225, "epoch": 0.8190098428088732, "grad_norm": 2.65625, "learning_rate": 4.118319668675301e-06, "loss": 0.1607, "mean_token_accuracy": 0.9656564962863922, "num_tokens": 3544723634.0, "step": 33450 }, { "entropy": 1.34328125, "epoch": 0.8202340727682288, "grad_norm": 3.625, "learning_rate": 4.112682345229019e-06, "loss": 0.1858, "mean_token_accuracy": 0.9613649821281434, "num_tokens": 3550196451.0, "step": 33500 }, { "entropy": 1.34546875, "epoch": 0.8214583027275844, "grad_norm": 2.375, "learning_rate": 4.107040462478706e-06, "loss": 0.1698, "mean_token_accuracy": 0.9640332353115082, "num_tokens": 3555769583.0, "step": 33550 }, { "entropy": 1.35515625, "epoch": 0.8226825326869399, "grad_norm": 4.6875, "learning_rate": 4.101394043542421e-06, "loss": 0.1781, "mean_token_accuracy": 0.9626898431777954, "num_tokens": 3560775725.0, "step": 33600 }, { "entropy": 1.37046875, "epoch": 0.8239067626462955, "grad_norm": 2.828125, "learning_rate": 4.095743111556813e-06, "loss": 0.1822, "mean_token_accuracy": 0.9615408968925476, "num_tokens": 3566233997.0, "step": 33650 }, { "entropy": 1.3565625, "epoch": 0.825130992605651, "grad_norm": 2.6875, "learning_rate": 4.090087689677025e-06, "loss": 0.1798, "mean_token_accuracy": 0.9622524130344391, "num_tokens": 3571629994.0, "step": 33700 }, { "entropy": 1.35453125, "epoch": 0.8263552225650066, "grad_norm": 2.34375, "learning_rate": 4.084427801076592e-06, "loss": 0.1631, "mean_token_accuracy": 0.965935331583023, "num_tokens": 3576662114.0, "step": 33750 }, { "entropy": 1.36453125, "epoch": 0.8275794525243622, "grad_norm": 2.609375, "learning_rate": 4.0787634689473605e-06, "loss": 0.1704, "mean_token_accuracy": 0.9641584491729737, "num_tokens": 3581699530.0, "step": 33800 }, { "entropy": 1.33421875, "epoch": 0.8288036824837177, "grad_norm": 3.71875, "learning_rate": 4.0730947164993775e-06, "loss": 0.1746, "mean_token_accuracy": 0.9626482093334198, "num_tokens": 3586891414.0, "step": 33850 }, { "entropy": 1.34828125, "epoch": 0.8300279124430733, "grad_norm": 2.9375, "learning_rate": 4.067421566960805e-06, "loss": 0.173, "mean_token_accuracy": 0.9637481319904327, "num_tokens": 3591845863.0, "step": 33900 }, { "entropy": 1.32796875, "epoch": 0.8312521424024288, "grad_norm": 4.3125, "learning_rate": 4.061744043577822e-06, "loss": 0.1826, "mean_token_accuracy": 0.960258857011795, "num_tokens": 3597325814.0, "step": 33950 }, { "entropy": 1.343125, "epoch": 0.8324763723617844, "grad_norm": 3.65625, "learning_rate": 4.056062169614533e-06, "loss": 0.1788, "mean_token_accuracy": 0.9624998271465302, "num_tokens": 3602589177.0, "step": 34000 }, { "entropy": 1.33171875, "epoch": 0.83370060232114, "grad_norm": 5.5, "learning_rate": 4.050375968352865e-06, "loss": 0.1749, "mean_token_accuracy": 0.9635978293418884, "num_tokens": 3607686315.0, "step": 34050 }, { "entropy": 1.35046875, "epoch": 0.8349248322804955, "grad_norm": 2.921875, "learning_rate": 4.044685463092477e-06, "loss": 0.1823, "mean_token_accuracy": 0.9619014573097229, "num_tokens": 3613032357.0, "step": 34100 }, { "entropy": 1.3278125, "epoch": 0.8361490622398511, "grad_norm": 3.796875, "learning_rate": 4.0389906771506666e-06, "loss": 0.1567, "mean_token_accuracy": 0.9672730362415314, "num_tokens": 3617947758.0, "step": 34150 }, { "entropy": 1.3509375, "epoch": 0.8373732921992068, "grad_norm": 3.0, "learning_rate": 4.03329163386227e-06, "loss": 0.1821, "mean_token_accuracy": 0.9615289163589478, "num_tokens": 3623324648.0, "step": 34200 }, { "entropy": 1.36625, "epoch": 0.8385975221585623, "grad_norm": 2.21875, "learning_rate": 4.027588356579567e-06, "loss": 0.1807, "mean_token_accuracy": 0.962299063205719, "num_tokens": 3628936189.0, "step": 34250 }, { "entropy": 1.34484375, "epoch": 0.8398217521179179, "grad_norm": 1.9375, "learning_rate": 4.0218808686721884e-06, "loss": 0.1766, "mean_token_accuracy": 0.9632388269901275, "num_tokens": 3634256824.0, "step": 34300 }, { "entropy": 1.3365625, "epoch": 0.8410459820772734, "grad_norm": 3.265625, "learning_rate": 4.01616919352702e-06, "loss": 0.1653, "mean_token_accuracy": 0.9652460610866547, "num_tokens": 3639058717.0, "step": 34350 }, { "entropy": 1.3490625, "epoch": 0.842270212036629, "grad_norm": 3.53125, "learning_rate": 4.010453354548101e-06, "loss": 0.1587, "mean_token_accuracy": 0.9665447866916657, "num_tokens": 3644031006.0, "step": 34400 }, { "entropy": 1.36546875, "epoch": 0.8434944419959846, "grad_norm": 2.0625, "learning_rate": 4.004733375156534e-06, "loss": 0.1862, "mean_token_accuracy": 0.9608346676826477, "num_tokens": 3649652142.0, "step": 34450 }, { "entropy": 1.36640625, "epoch": 0.8447186719553401, "grad_norm": 2.34375, "learning_rate": 3.999009278790389e-06, "loss": 0.1692, "mean_token_accuracy": 0.9642466914653778, "num_tokens": 3654831381.0, "step": 34500 }, { "entropy": 1.35890625, "epoch": 0.8459429019146957, "grad_norm": 1.96875, "learning_rate": 3.993281088904603e-06, "loss": 0.1659, "mean_token_accuracy": 0.9651599872112274, "num_tokens": 3659811312.0, "step": 34550 }, { "entropy": 1.36734375, "epoch": 0.8471671318740512, "grad_norm": 3.578125, "learning_rate": 3.9875488289708895e-06, "loss": 0.1693, "mean_token_accuracy": 0.9640548026561737, "num_tokens": 3665088140.0, "step": 34600 }, { "entropy": 1.35578125, "epoch": 0.8483913618334068, "grad_norm": 2.671875, "learning_rate": 3.981812522477634e-06, "loss": 0.1683, "mean_token_accuracy": 0.9642880761623382, "num_tokens": 3670199765.0, "step": 34650 }, { "entropy": 1.371875, "epoch": 0.8496155917927624, "grad_norm": 2.125, "learning_rate": 3.976072192929812e-06, "loss": 0.1859, "mean_token_accuracy": 0.961214131116867, "num_tokens": 3675973370.0, "step": 34700 }, { "entropy": 1.335, "epoch": 0.8508398217521179, "grad_norm": 2.234375, "learning_rate": 3.970327863848874e-06, "loss": 0.163, "mean_token_accuracy": 0.9652379751205444, "num_tokens": 3680935151.0, "step": 34750 }, { "entropy": 1.35953125, "epoch": 0.8520640517114735, "grad_norm": 3.984375, "learning_rate": 3.964579558772665e-06, "loss": 0.1686, "mean_token_accuracy": 0.9643210101127625, "num_tokens": 3686151191.0, "step": 34800 }, { "entropy": 1.35, "epoch": 0.853288281670829, "grad_norm": 2.46875, "learning_rate": 3.95882730125532e-06, "loss": 0.1755, "mean_token_accuracy": 0.9624910676479339, "num_tokens": 3691478654.0, "step": 34850 }, { "entropy": 1.338125, "epoch": 0.8545125116301846, "grad_norm": 2.109375, "learning_rate": 3.953071114867171e-06, "loss": 0.1711, "mean_token_accuracy": 0.9633730280399323, "num_tokens": 3696633906.0, "step": 34900 }, { "entropy": 1.34890625, "epoch": 0.8557367415895402, "grad_norm": 1.890625, "learning_rate": 3.947311023194645e-06, "loss": 0.1804, "mean_token_accuracy": 0.9618865346908569, "num_tokens": 3701978753.0, "step": 34950 }, { "entropy": 1.3384375, "epoch": 0.8569609715488957, "grad_norm": 2.203125, "learning_rate": 3.941547049840176e-06, "loss": 0.1645, "mean_token_accuracy": 0.9649497640132904, "num_tokens": 3706915348.0, "step": 35000 }, { "entropy": 1.32359375, "epoch": 0.8581852015082513, "grad_norm": 2.171875, "learning_rate": 3.9357792184221005e-06, "loss": 0.1739, "mean_token_accuracy": 0.9632923007011414, "num_tokens": 3712046907.0, "step": 35050 }, { "entropy": 1.3240625, "epoch": 0.8594094314676068, "grad_norm": 3.078125, "learning_rate": 3.930007552574564e-06, "loss": 0.1763, "mean_token_accuracy": 0.9626149117946625, "num_tokens": 3717274859.0, "step": 35100 }, { "entropy": 1.33484375, "epoch": 0.8606336614269624, "grad_norm": 3.03125, "learning_rate": 3.924232075947427e-06, "loss": 0.186, "mean_token_accuracy": 0.9613423335552216, "num_tokens": 3722674538.0, "step": 35150 }, { "entropy": 1.33484375, "epoch": 0.861857891386318, "grad_norm": 3.40625, "learning_rate": 3.918452812206159e-06, "loss": 0.1777, "mean_token_accuracy": 0.9628440749645233, "num_tokens": 3727975730.0, "step": 35200 }, { "entropy": 1.34125, "epoch": 0.8630821213456735, "grad_norm": 1.8359375, "learning_rate": 3.9126697850317525e-06, "loss": 0.1761, "mean_token_accuracy": 0.963371901512146, "num_tokens": 3733241093.0, "step": 35250 }, { "entropy": 1.34328125, "epoch": 0.8643063513050291, "grad_norm": 2.640625, "learning_rate": 3.906883018120619e-06, "loss": 0.1707, "mean_token_accuracy": 0.9642481172084808, "num_tokens": 3738164559.0, "step": 35300 }, { "entropy": 1.3203125, "epoch": 0.8655305812643848, "grad_norm": 3.546875, "learning_rate": 3.901092535184496e-06, "loss": 0.1713, "mean_token_accuracy": 0.9637637650966644, "num_tokens": 3743459921.0, "step": 35350 }, { "entropy": 1.35578125, "epoch": 0.8667548112237403, "grad_norm": 3.40625, "learning_rate": 3.895298359950343e-06, "loss": 0.1829, "mean_token_accuracy": 0.9605180990695953, "num_tokens": 3748868327.0, "step": 35400 }, { "entropy": 1.34265625, "epoch": 0.8679790411830959, "grad_norm": 2.125, "learning_rate": 3.889500516160254e-06, "loss": 0.1715, "mean_token_accuracy": 0.9643005490303039, "num_tokens": 3753748677.0, "step": 35450 }, { "entropy": 1.3384375, "epoch": 0.8692032711424514, "grad_norm": 2.375, "learning_rate": 3.883699027571352e-06, "loss": 0.1668, "mean_token_accuracy": 0.965086680650711, "num_tokens": 3759201853.0, "step": 35500 }, { "entropy": 1.34390625, "epoch": 0.870427501101807, "grad_norm": 3.25, "learning_rate": 3.8778939179556976e-06, "loss": 0.1694, "mean_token_accuracy": 0.9643353164196015, "num_tokens": 3764158638.0, "step": 35550 }, { "entropy": 1.33015625, "epoch": 0.8716517310611626, "grad_norm": 2.015625, "learning_rate": 3.872085211100185e-06, "loss": 0.1621, "mean_token_accuracy": 0.9657464909553528, "num_tokens": 3769226815.0, "step": 35600 }, { "entropy": 1.35078125, "epoch": 0.8728759610205181, "grad_norm": 1.890625, "learning_rate": 3.86627293080645e-06, "loss": 0.1836, "mean_token_accuracy": 0.9611875438690185, "num_tokens": 3774861819.0, "step": 35650 }, { "entropy": 1.34953125, "epoch": 0.8741001909798737, "grad_norm": 3.203125, "learning_rate": 3.860457100890776e-06, "loss": 0.1795, "mean_token_accuracy": 0.9616686987876892, "num_tokens": 3780181646.0, "step": 35700 }, { "entropy": 1.34359375, "epoch": 0.8753244209392292, "grad_norm": 3.046875, "learning_rate": 3.854637745183983e-06, "loss": 0.1762, "mean_token_accuracy": 0.9630369508266449, "num_tokens": 3785489246.0, "step": 35750 }, { "entropy": 1.3425, "epoch": 0.8765486508985848, "grad_norm": 1.953125, "learning_rate": 3.848814887531342e-06, "loss": 0.1865, "mean_token_accuracy": 0.9609660315513611, "num_tokens": 3790970702.0, "step": 35800 }, { "entropy": 1.3375, "epoch": 0.8777728808579404, "grad_norm": 2.890625, "learning_rate": 3.842988551792473e-06, "loss": 0.1666, "mean_token_accuracy": 0.9646478390693665, "num_tokens": 3796002667.0, "step": 35850 }, { "entropy": 1.33828125, "epoch": 0.8789971108172959, "grad_norm": 3.234375, "learning_rate": 3.83715876184125e-06, "loss": 0.1727, "mean_token_accuracy": 0.9642738771438598, "num_tokens": 3801134844.0, "step": 35900 }, { "entropy": 1.33859375, "epoch": 0.8802213407766515, "grad_norm": 2.8125, "learning_rate": 3.831325541565699e-06, "loss": 0.1714, "mean_token_accuracy": 0.9640265047550202, "num_tokens": 3806453829.0, "step": 35950 }, { "entropy": 1.34015625, "epoch": 0.881445570736007, "grad_norm": 3.046875, "learning_rate": 3.825488914867901e-06, "loss": 0.1762, "mean_token_accuracy": 0.9627239561080932, "num_tokens": 3811628461.0, "step": 36000 }, { "entropy": 1.35203125, "epoch": 0.8826698006953626, "grad_norm": 2.171875, "learning_rate": 3.8196489056638965e-06, "loss": 0.1849, "mean_token_accuracy": 0.9613272595405579, "num_tokens": 3816892701.0, "step": 36050 }, { "entropy": 1.34703125, "epoch": 0.8838940306547182, "grad_norm": 2.015625, "learning_rate": 3.813805537883585e-06, "loss": 0.1711, "mean_token_accuracy": 0.9637981843948364, "num_tokens": 3822028448.0, "step": 36100 }, { "entropy": 1.34875, "epoch": 0.8851182606140737, "grad_norm": 1.8671875, "learning_rate": 3.80795883547063e-06, "loss": 0.1672, "mean_token_accuracy": 0.9647044801712036, "num_tokens": 3827213092.0, "step": 36150 }, { "entropy": 1.34578125, "epoch": 0.8863424905734293, "grad_norm": 2.484375, "learning_rate": 3.8021088223823558e-06, "loss": 0.1927, "mean_token_accuracy": 0.9597675764560699, "num_tokens": 3832709039.0, "step": 36200 }, { "entropy": 1.33359375, "epoch": 0.8875667205327848, "grad_norm": 2.046875, "learning_rate": 3.7962555225896563e-06, "loss": 0.177, "mean_token_accuracy": 0.9623324680328369, "num_tokens": 3837879687.0, "step": 36250 }, { "entropy": 1.33890625, "epoch": 0.8887909504921404, "grad_norm": 2.328125, "learning_rate": 3.790398960076891e-06, "loss": 0.1769, "mean_token_accuracy": 0.9629685461521149, "num_tokens": 3843045671.0, "step": 36300 }, { "entropy": 1.32703125, "epoch": 0.890015180451496, "grad_norm": 3.09375, "learning_rate": 3.7845391588417876e-06, "loss": 0.173, "mean_token_accuracy": 0.9636087584495544, "num_tokens": 3848206427.0, "step": 36350 }, { "entropy": 1.32984375, "epoch": 0.8912394104108515, "grad_norm": 2.171875, "learning_rate": 3.778676142895346e-06, "loss": 0.1734, "mean_token_accuracy": 0.9632059478759766, "num_tokens": 3853828427.0, "step": 36400 }, { "entropy": 1.32390625, "epoch": 0.8924636403702071, "grad_norm": 2.5625, "learning_rate": 3.772809936261739e-06, "loss": 0.1894, "mean_token_accuracy": 0.9601573574543, "num_tokens": 3859273920.0, "step": 36450 }, { "entropy": 1.3265625, "epoch": 0.8936878703295627, "grad_norm": 2.875, "learning_rate": 3.766940562978211e-06, "loss": 0.1763, "mean_token_accuracy": 0.9631186270713806, "num_tokens": 3864494355.0, "step": 36500 }, { "entropy": 1.33109375, "epoch": 0.8949121002889183, "grad_norm": 2.71875, "learning_rate": 3.761068047094987e-06, "loss": 0.1736, "mean_token_accuracy": 0.963892787694931, "num_tokens": 3869689661.0, "step": 36550 }, { "entropy": 1.3115625, "epoch": 0.8961363302482739, "grad_norm": 2.625, "learning_rate": 3.7551924126751624e-06, "loss": 0.1832, "mean_token_accuracy": 0.9618776285648346, "num_tokens": 3875053980.0, "step": 36600 }, { "entropy": 1.3021875, "epoch": 0.8973605602076294, "grad_norm": 2.734375, "learning_rate": 3.7493136837946177e-06, "loss": 0.1749, "mean_token_accuracy": 0.962455780506134, "num_tokens": 3880568995.0, "step": 36650 }, { "entropy": 1.3209375, "epoch": 0.898584790166985, "grad_norm": 2.90625, "learning_rate": 3.743431884541909e-06, "loss": 0.1835, "mean_token_accuracy": 0.9612640655040741, "num_tokens": 3885898540.0, "step": 36700 }, { "entropy": 1.31390625, "epoch": 0.8998090201263406, "grad_norm": 2.75, "learning_rate": 3.737547039018173e-06, "loss": 0.1664, "mean_token_accuracy": 0.9649625384807586, "num_tokens": 3891014489.0, "step": 36750 }, { "entropy": 1.323125, "epoch": 0.9010332500856961, "grad_norm": 2.1875, "learning_rate": 3.7316591713370315e-06, "loss": 0.1774, "mean_token_accuracy": 0.9622565031051635, "num_tokens": 3896408077.0, "step": 36800 }, { "entropy": 1.34515625, "epoch": 0.9022574800450517, "grad_norm": 1.8203125, "learning_rate": 3.7257683056244895e-06, "loss": 0.178, "mean_token_accuracy": 0.9631640148162842, "num_tokens": 3901699376.0, "step": 36850 }, { "entropy": 1.32171875, "epoch": 0.9034817100044072, "grad_norm": 2.84375, "learning_rate": 3.7198744660188347e-06, "loss": 0.1578, "mean_token_accuracy": 0.966994469165802, "num_tokens": 3906644235.0, "step": 36900 }, { "entropy": 1.3284375, "epoch": 0.9047059399637628, "grad_norm": 1.8828125, "learning_rate": 3.7139776766705433e-06, "loss": 0.161, "mean_token_accuracy": 0.9657053291797638, "num_tokens": 3911529877.0, "step": 36950 }, { "entropy": 1.320625, "epoch": 0.9059301699231184, "grad_norm": 2.640625, "learning_rate": 3.7080779617421733e-06, "loss": 0.1663, "mean_token_accuracy": 0.9647897446155548, "num_tokens": 3917023608.0, "step": 37000 }, { "entropy": 1.3315625, "epoch": 0.9071543998824739, "grad_norm": 3.078125, "learning_rate": 3.7021753454082772e-06, "loss": 0.1851, "mean_token_accuracy": 0.9609014749526977, "num_tokens": 3922789580.0, "step": 37050 }, { "entropy": 1.31453125, "epoch": 0.9083786298418295, "grad_norm": 2.484375, "learning_rate": 3.696269851855292e-06, "loss": 0.1738, "mean_token_accuracy": 0.9629218196868896, "num_tokens": 3927904246.0, "step": 37100 }, { "entropy": 1.29828125, "epoch": 0.909602859801185, "grad_norm": 2.875, "learning_rate": 3.6903615052814444e-06, "loss": 0.1723, "mean_token_accuracy": 0.96382728099823, "num_tokens": 3933096610.0, "step": 37150 }, { "entropy": 1.275, "epoch": 0.9108270897605406, "grad_norm": 1.640625, "learning_rate": 3.684450329896653e-06, "loss": 0.1538, "mean_token_accuracy": 0.9664675867557526, "num_tokens": 3938208531.0, "step": 37200 }, { "entropy": 1.2990625, "epoch": 0.9120513197198962, "grad_norm": 1.6640625, "learning_rate": 3.6785363499224266e-06, "loss": 0.1676, "mean_token_accuracy": 0.9638699948787689, "num_tokens": 3943507764.0, "step": 37250 }, { "entropy": 1.29953125, "epoch": 0.9132755496792517, "grad_norm": 2.921875, "learning_rate": 3.672619589591768e-06, "loss": 0.1737, "mean_token_accuracy": 0.9631060230731964, "num_tokens": 3948883174.0, "step": 37300 }, { "entropy": 1.3128125, "epoch": 0.9144997796386073, "grad_norm": 3.34375, "learning_rate": 3.6667000731490695e-06, "loss": 0.1769, "mean_token_accuracy": 0.9630844449996948, "num_tokens": 3954228445.0, "step": 37350 }, { "entropy": 1.31328125, "epoch": 0.9157240095979629, "grad_norm": 2.71875, "learning_rate": 3.660777824850019e-06, "loss": 0.178, "mean_token_accuracy": 0.9625172114372254, "num_tokens": 3959522338.0, "step": 37400 }, { "entropy": 1.3109375, "epoch": 0.9169482395573184, "grad_norm": 3.875, "learning_rate": 3.6548528689614985e-06, "loss": 0.1615, "mean_token_accuracy": 0.9651338791847229, "num_tokens": 3964674293.0, "step": 37450 }, { "entropy": 1.3209375, "epoch": 0.918172469516674, "grad_norm": 2.765625, "learning_rate": 3.6489252297614833e-06, "loss": 0.1743, "mean_token_accuracy": 0.9630649185180664, "num_tokens": 3970201603.0, "step": 37500 }, { "entropy": 1.33578125, "epoch": 0.9193966994760295, "grad_norm": 1.8984375, "learning_rate": 3.6429949315389455e-06, "loss": 0.1792, "mean_token_accuracy": 0.9619642412662506, "num_tokens": 3975729221.0, "step": 37550 }, { "entropy": 1.3121875, "epoch": 0.9206209294353851, "grad_norm": 2.484375, "learning_rate": 3.6370619985937513e-06, "loss": 0.1658, "mean_token_accuracy": 0.9639672470092774, "num_tokens": 3980440332.0, "step": 37600 }, { "entropy": 1.314375, "epoch": 0.9218451593947407, "grad_norm": 2.0, "learning_rate": 3.6311264552365634e-06, "loss": 0.1748, "mean_token_accuracy": 0.9630878198146821, "num_tokens": 3985861602.0, "step": 37650 }, { "entropy": 1.31109375, "epoch": 0.9230693893540963, "grad_norm": 1.671875, "learning_rate": 3.62518832578874e-06, "loss": 0.1647, "mean_token_accuracy": 0.9646557712554932, "num_tokens": 3991141130.0, "step": 37700 }, { "entropy": 1.31140625, "epoch": 0.9242936193134519, "grad_norm": 1.6328125, "learning_rate": 3.619247634582238e-06, "loss": 0.1798, "mean_token_accuracy": 0.961934734582901, "num_tokens": 3996774043.0, "step": 37750 }, { "entropy": 1.326875, "epoch": 0.9255178492728074, "grad_norm": 2.078125, "learning_rate": 3.6133044059595083e-06, "loss": 0.1817, "mean_token_accuracy": 0.9612915456295014, "num_tokens": 4002462308.0, "step": 37800 }, { "entropy": 1.31359375, "epoch": 0.926742079232163, "grad_norm": 3.296875, "learning_rate": 3.6073586642734027e-06, "loss": 0.1779, "mean_token_accuracy": 0.9622733199596405, "num_tokens": 4007870657.0, "step": 37850 }, { "entropy": 1.3059375, "epoch": 0.9279663091915186, "grad_norm": 1.734375, "learning_rate": 3.601410433887068e-06, "loss": 0.1696, "mean_token_accuracy": 0.9639555370807648, "num_tokens": 4012925044.0, "step": 37900 }, { "entropy": 1.30625, "epoch": 0.9291905391508741, "grad_norm": 3.203125, "learning_rate": 3.5954597391738487e-06, "loss": 0.1749, "mean_token_accuracy": 0.9627858221530914, "num_tokens": 4018089645.0, "step": 37950 }, { "entropy": 1.3059375, "epoch": 0.9304147691102297, "grad_norm": 3.0, "learning_rate": 3.589506604517189e-06, "loss": 0.1668, "mean_token_accuracy": 0.9654299330711364, "num_tokens": 4023139809.0, "step": 38000 }, { "entropy": 1.32140625, "epoch": 0.9316389990695852, "grad_norm": 1.765625, "learning_rate": 3.583551054310529e-06, "loss": 0.1743, "mean_token_accuracy": 0.9638527107238769, "num_tokens": 4028437262.0, "step": 38050 }, { "entropy": 1.31890625, "epoch": 0.9328632290289408, "grad_norm": 3.03125, "learning_rate": 3.5775931129572072e-06, "loss": 0.1658, "mean_token_accuracy": 0.9640737462043762, "num_tokens": 4033659635.0, "step": 38100 }, { "entropy": 1.31625, "epoch": 0.9340874589882964, "grad_norm": 4.28125, "learning_rate": 3.57163280487036e-06, "loss": 0.1742, "mean_token_accuracy": 0.9627125465869903, "num_tokens": 4039135210.0, "step": 38150 }, { "entropy": 1.31125, "epoch": 0.9353116889476519, "grad_norm": 2.546875, "learning_rate": 3.5656701544728222e-06, "loss": 0.1739, "mean_token_accuracy": 0.9629321038722992, "num_tokens": 4044192912.0, "step": 38200 }, { "entropy": 1.315625, "epoch": 0.9365359189070075, "grad_norm": 2.109375, "learning_rate": 3.559705186197026e-06, "loss": 0.1641, "mean_token_accuracy": 0.9655595874786377, "num_tokens": 4049649393.0, "step": 38250 }, { "entropy": 1.31484375, "epoch": 0.937760148866363, "grad_norm": 3.0625, "learning_rate": 3.5537379244849017e-06, "loss": 0.1739, "mean_token_accuracy": 0.9634083175659179, "num_tokens": 4054901732.0, "step": 38300 }, { "entropy": 1.31140625, "epoch": 0.9389843788257186, "grad_norm": 2.5625, "learning_rate": 3.5477683937877755e-06, "loss": 0.1694, "mean_token_accuracy": 0.9634031581878663, "num_tokens": 4060033796.0, "step": 38350 }, { "entropy": 1.30640625, "epoch": 0.9402086087850742, "grad_norm": 2.71875, "learning_rate": 3.541796618566273e-06, "loss": 0.1634, "mean_token_accuracy": 0.9645454668998719, "num_tokens": 4065362004.0, "step": 38400 }, { "entropy": 1.281875, "epoch": 0.9414328387444297, "grad_norm": 3.0625, "learning_rate": 3.535822623290217e-06, "loss": 0.1456, "mean_token_accuracy": 0.9695195186138154, "num_tokens": 4070167345.0, "step": 38450 }, { "entropy": 1.3134375, "epoch": 0.9426570687037853, "grad_norm": 0.004974365234375, "learning_rate": 3.5298464324385246e-06, "loss": 0.1636, "mean_token_accuracy": 0.9643441307544708, "num_tokens": 4075331852.0, "step": 38500 }, { "entropy": 1.29109375, "epoch": 0.9438812986631409, "grad_norm": 2.609375, "learning_rate": 3.523868070499112e-06, "loss": 0.1522, "mean_token_accuracy": 0.9671092510223389, "num_tokens": 4080329045.0, "step": 38550 }, { "entropy": 1.2840625, "epoch": 0.9451055286224964, "grad_norm": 3.75, "learning_rate": 3.517887561968791e-06, "loss": 0.1616, "mean_token_accuracy": 0.9650249874591827, "num_tokens": 4085382254.0, "step": 38600 }, { "entropy": 1.29203125, "epoch": 0.946329758581852, "grad_norm": 3.5, "learning_rate": 3.5119049313531687e-06, "loss": 0.1698, "mean_token_accuracy": 0.9630708813667297, "num_tokens": 4090736615.0, "step": 38650 }, { "entropy": 1.29640625, "epoch": 0.9475539885412075, "grad_norm": 2.28125, "learning_rate": 3.5059202031665473e-06, "loss": 0.1761, "mean_token_accuracy": 0.962629064321518, "num_tokens": 4096335023.0, "step": 38700 }, { "entropy": 1.28390625, "epoch": 0.9487782185005631, "grad_norm": 2.53125, "learning_rate": 3.499933401931826e-06, "loss": 0.1711, "mean_token_accuracy": 0.9639296698570251, "num_tokens": 4101408840.0, "step": 38750 }, { "entropy": 1.26234375, "epoch": 0.9500024484599187, "grad_norm": 3.515625, "learning_rate": 3.493944552180395e-06, "loss": 0.1548, "mean_token_accuracy": 0.9661567640304566, "num_tokens": 4106422813.0, "step": 38800 }, { "entropy": 1.2709375, "epoch": 0.9512266784192743, "grad_norm": 3.109375, "learning_rate": 3.487953678452042e-06, "loss": 0.1544, "mean_token_accuracy": 0.9669099247455597, "num_tokens": 4111614226.0, "step": 38850 }, { "entropy": 1.2715625, "epoch": 0.9524509083786299, "grad_norm": 3.1875, "learning_rate": 3.481960805294847e-06, "loss": 0.1652, "mean_token_accuracy": 0.9649276435375214, "num_tokens": 4116902981.0, "step": 38900 }, { "entropy": 1.26828125, "epoch": 0.9536751383379855, "grad_norm": 3.390625, "learning_rate": 3.47596595726508e-06, "loss": 0.1682, "mean_token_accuracy": 0.9636393487453461, "num_tokens": 4122056561.0, "step": 38950 }, { "entropy": 1.270625, "epoch": 0.954899368297341, "grad_norm": 2.125, "learning_rate": 3.4699691589271076e-06, "loss": 0.1685, "mean_token_accuracy": 0.9632602989673614, "num_tokens": 4127685041.0, "step": 39000 }, { "entropy": 1.2453125, "epoch": 0.9561235982566966, "grad_norm": 2.4375, "learning_rate": 3.463970434853285e-06, "loss": 0.142, "mean_token_accuracy": 0.9697425818443298, "num_tokens": 4132578966.0, "step": 39050 }, { "entropy": 1.258125, "epoch": 0.9573478282160521, "grad_norm": 2.375, "learning_rate": 3.45796980962386e-06, "loss": 0.1678, "mean_token_accuracy": 0.9646705484390259, "num_tokens": 4138012784.0, "step": 39100 }, { "entropy": 1.26859375, "epoch": 0.9585720581754077, "grad_norm": 1.7890625, "learning_rate": 3.451967307826869e-06, "loss": 0.1757, "mean_token_accuracy": 0.9628133857250214, "num_tokens": 4143616072.0, "step": 39150 }, { "entropy": 1.259375, "epoch": 0.9597962881347633, "grad_norm": 4.1875, "learning_rate": 3.445962954058039e-06, "loss": 0.1752, "mean_token_accuracy": 0.962674834728241, "num_tokens": 4148944121.0, "step": 39200 }, { "entropy": 1.25921875, "epoch": 0.9610205180941188, "grad_norm": 3.375, "learning_rate": 3.439956772920685e-06, "loss": 0.1648, "mean_token_accuracy": 0.9645766019821167, "num_tokens": 4153880493.0, "step": 39250 }, { "entropy": 1.2525, "epoch": 0.9622447480534744, "grad_norm": 3.015625, "learning_rate": 3.4339487890256097e-06, "loss": 0.161, "mean_token_accuracy": 0.965018298625946, "num_tokens": 4158921325.0, "step": 39300 }, { "entropy": 1.2459375, "epoch": 0.9634689780128299, "grad_norm": 3.5, "learning_rate": 3.4279390269910033e-06, "loss": 0.1658, "mean_token_accuracy": 0.9649594247341156, "num_tokens": 4163950443.0, "step": 39350 }, { "entropy": 1.2590625, "epoch": 0.9646932079721855, "grad_norm": 2.875, "learning_rate": 3.421927511442341e-06, "loss": 0.172, "mean_token_accuracy": 0.9640387868881226, "num_tokens": 4169489034.0, "step": 39400 }, { "entropy": 1.261875, "epoch": 0.9659174379315411, "grad_norm": 2.28125, "learning_rate": 3.4159142670122845e-06, "loss": 0.1719, "mean_token_accuracy": 0.9637044394016265, "num_tokens": 4174842337.0, "step": 39450 }, { "entropy": 1.26265625, "epoch": 0.9671416678908966, "grad_norm": 2.390625, "learning_rate": 3.4098993183405793e-06, "loss": 0.1725, "mean_token_accuracy": 0.9634046721458435, "num_tokens": 4180354181.0, "step": 39500 }, { "entropy": 1.26046875, "epoch": 0.9683658978502522, "grad_norm": 1.6484375, "learning_rate": 3.403882690073954e-06, "loss": 0.1653, "mean_token_accuracy": 0.9639586913585663, "num_tokens": 4185417059.0, "step": 39550 }, { "entropy": 1.27140625, "epoch": 0.9695901278096077, "grad_norm": 1.2421875, "learning_rate": 3.3978644068660175e-06, "loss": 0.1583, "mean_token_accuracy": 0.96663733959198, "num_tokens": 4190550088.0, "step": 39600 }, { "entropy": 1.28140625, "epoch": 0.9708143577689633, "grad_norm": 2.859375, "learning_rate": 3.3918444933771637e-06, "loss": 0.1755, "mean_token_accuracy": 0.9624445605278015, "num_tokens": 4196306371.0, "step": 39650 }, { "entropy": 1.27078125, "epoch": 0.9720385877283189, "grad_norm": 1.875, "learning_rate": 3.385822974274465e-06, "loss": 0.1673, "mean_token_accuracy": 0.9644521117210388, "num_tokens": 4201403065.0, "step": 39700 }, { "entropy": 1.2859375, "epoch": 0.9732628176876744, "grad_norm": 2.78125, "learning_rate": 3.3797998742315724e-06, "loss": 0.1646, "mean_token_accuracy": 0.9653528666496277, "num_tokens": 4206711792.0, "step": 39750 }, { "entropy": 1.2709375, "epoch": 0.97448704764703, "grad_norm": 3.625, "learning_rate": 3.3737752179286158e-06, "loss": 0.1694, "mean_token_accuracy": 0.964444397687912, "num_tokens": 4212047599.0, "step": 39800 }, { "entropy": 1.289375, "epoch": 0.9757112776063855, "grad_norm": 2.5, "learning_rate": 3.3677490300521e-06, "loss": 0.1697, "mean_token_accuracy": 0.963803733587265, "num_tokens": 4217477603.0, "step": 39850 }, { "entropy": 1.27140625, "epoch": 0.9769355075657411, "grad_norm": 2.25, "learning_rate": 3.361721335294809e-06, "loss": 0.1579, "mean_token_accuracy": 0.9657166159152984, "num_tokens": 4222290662.0, "step": 39900 }, { "entropy": 1.3034375, "epoch": 0.9781597375250967, "grad_norm": 2.03125, "learning_rate": 3.355692158355699e-06, "loss": 0.1816, "mean_token_accuracy": 0.9609908378124237, "num_tokens": 4228024616.0, "step": 39950 }, { "entropy": 1.2765625, "epoch": 0.9793839674844523, "grad_norm": 1.8515625, "learning_rate": 3.349661523939799e-06, "loss": 0.1549, "mean_token_accuracy": 0.9669453859329223, "num_tokens": 4233080108.0, "step": 40000 }, { "epoch": 0.9793839674844523, "eval_entropy": 1.2830078125, "eval_loss": 0.18154892325401306, "eval_mean_token_accuracy": 0.9611844887336095, "eval_num_tokens": 4233080108.0, "eval_runtime": 601.7254, "eval_samples_per_second": 16.047, "eval_steps_per_second": 0.201, "step": 40000 }, { "entropy": 1.28, "epoch": 0.9806081974438079, "grad_norm": 3.40625, "learning_rate": 3.3436294567581125e-06, "loss": 0.1685, "mean_token_accuracy": 0.9643000710010529, "num_tokens": 4238491459.0, "step": 40050 }, { "entropy": 1.29109375, "epoch": 0.9818324274031635, "grad_norm": 1.9453125, "learning_rate": 3.3375959815275103e-06, "loss": 0.1714, "mean_token_accuracy": 0.9640710878372193, "num_tokens": 4244109418.0, "step": 40100 }, { "entropy": 1.27515625, "epoch": 0.983056657362519, "grad_norm": 0.029052734375, "learning_rate": 3.3315611229706377e-06, "loss": 0.1519, "mean_token_accuracy": 0.9681409633159638, "num_tokens": 4249250373.0, "step": 40150 }, { "entropy": 1.2784375, "epoch": 0.9842808873218746, "grad_norm": 2.09375, "learning_rate": 3.325524905815804e-06, "loss": 0.1654, "mean_token_accuracy": 0.9648780179023743, "num_tokens": 4254623197.0, "step": 40200 }, { "entropy": 1.2959375, "epoch": 0.9855051172812301, "grad_norm": 2.0, "learning_rate": 3.3194873547968867e-06, "loss": 0.1667, "mean_token_accuracy": 0.963757860660553, "num_tokens": 4260002335.0, "step": 40250 }, { "entropy": 1.28453125, "epoch": 0.9867293472405857, "grad_norm": 0.0078125, "learning_rate": 3.313448494653232e-06, "loss": 0.1738, "mean_token_accuracy": 0.9633991587162017, "num_tokens": 4265450665.0, "step": 40300 }, { "entropy": 1.26625, "epoch": 0.9879535771999413, "grad_norm": 2.90625, "learning_rate": 3.3074083501295447e-06, "loss": 0.1441, "mean_token_accuracy": 0.9687767088413238, "num_tokens": 4270155512.0, "step": 40350 }, { "entropy": 1.2728125, "epoch": 0.9891778071592968, "grad_norm": 2.15625, "learning_rate": 3.3013669459757956e-06, "loss": 0.1546, "mean_token_accuracy": 0.9668355488777161, "num_tokens": 4275174062.0, "step": 40400 }, { "entropy": 1.28171875, "epoch": 0.9904020371186524, "grad_norm": 2.421875, "learning_rate": 3.2953243069471187e-06, "loss": 0.1692, "mean_token_accuracy": 0.9641734325885772, "num_tokens": 4280291982.0, "step": 40450 }, { "entropy": 1.29375, "epoch": 0.9916262670780079, "grad_norm": 1.8828125, "learning_rate": 3.2892804578037036e-06, "loss": 0.1754, "mean_token_accuracy": 0.9624480056762695, "num_tokens": 4285827143.0, "step": 40500 }, { "entropy": 1.29921875, "epoch": 0.9928504970373635, "grad_norm": 3.328125, "learning_rate": 3.2832354233107023e-06, "loss": 0.1717, "mean_token_accuracy": 0.9635557103157043, "num_tokens": 4291196556.0, "step": 40550 }, { "entropy": 1.29515625, "epoch": 0.9940747269967191, "grad_norm": 2.046875, "learning_rate": 3.2771892282381226e-06, "loss": 0.1535, "mean_token_accuracy": 0.9667463576793671, "num_tokens": 4296297335.0, "step": 40600 }, { "entropy": 1.2765625, "epoch": 0.9952989569560746, "grad_norm": 2.15625, "learning_rate": 3.2711418973607257e-06, "loss": 0.1584, "mean_token_accuracy": 0.9667293214797974, "num_tokens": 4301506384.0, "step": 40650 }, { "entropy": 1.27078125, "epoch": 0.9965231869154302, "grad_norm": 2.234375, "learning_rate": 3.2650934554579314e-06, "loss": 0.1551, "mean_token_accuracy": 0.9660876715183258, "num_tokens": 4306603792.0, "step": 40700 }, { "entropy": 1.27515625, "epoch": 0.9977474168747857, "grad_norm": 4.15625, "learning_rate": 3.2590439273137074e-06, "loss": 0.1702, "mean_token_accuracy": 0.9637362861633301, "num_tokens": 4312148607.0, "step": 40750 }, { "entropy": 1.266875, "epoch": 0.9989716468341413, "grad_norm": 3.546875, "learning_rate": 3.2529933377164754e-06, "loss": 0.1498, "mean_token_accuracy": 0.9686801016330719, "num_tokens": 4317085828.0, "step": 40800 }, { "entropy": 1.2784375, "epoch": 1.0001958767934969, "grad_norm": 3.578125, "learning_rate": 3.2469417114590055e-06, "loss": 0.1627, "mean_token_accuracy": 0.9648519742488861, "num_tokens": 4322221289.0, "step": 40850 }, { "entropy": 1.28578125, "epoch": 1.0014201067528525, "grad_norm": 4.375, "learning_rate": 3.240889073338315e-06, "loss": 0.1602, "mean_token_accuracy": 0.9657353925704956, "num_tokens": 4327372960.0, "step": 40900 }, { "entropy": 1.268125, "epoch": 1.002644336712208, "grad_norm": 2.078125, "learning_rate": 3.2348354481555692e-06, "loss": 0.1607, "mean_token_accuracy": 0.9653881311416626, "num_tokens": 4332436449.0, "step": 40950 }, { "entropy": 1.27359375, "epoch": 1.0038685666715637, "grad_norm": 0.0013580322265625, "learning_rate": 3.2287808607159753e-06, "loss": 0.153, "mean_token_accuracy": 0.9669638919830322, "num_tokens": 4337572886.0, "step": 41000 }, { "entropy": 1.26375, "epoch": 1.005092796630919, "grad_norm": 2.609375, "learning_rate": 3.222725335828685e-06, "loss": 0.1474, "mean_token_accuracy": 0.9681554007530212, "num_tokens": 4342524064.0, "step": 41050 }, { "entropy": 1.290625, "epoch": 1.0063170265902748, "grad_norm": 2.796875, "learning_rate": 3.216668898306692e-06, "loss": 0.1723, "mean_token_accuracy": 0.9632875370979309, "num_tokens": 4347805365.0, "step": 41100 }, { "entropy": 1.28421875, "epoch": 1.0075412565496302, "grad_norm": 3.546875, "learning_rate": 3.210611572966728e-06, "loss": 0.1571, "mean_token_accuracy": 0.9664819014072418, "num_tokens": 4352875723.0, "step": 41150 }, { "entropy": 1.29171875, "epoch": 1.008765486508986, "grad_norm": 1.7421875, "learning_rate": 3.2045533846291643e-06, "loss": 0.1755, "mean_token_accuracy": 0.9631037187576293, "num_tokens": 4358561815.0, "step": 41200 }, { "entropy": 1.276875, "epoch": 1.0099897164683413, "grad_norm": 2.15625, "learning_rate": 3.1984943581179053e-06, "loss": 0.1553, "mean_token_accuracy": 0.9667964303493499, "num_tokens": 4363644242.0, "step": 41250 }, { "entropy": 1.30296875, "epoch": 1.011213946427697, "grad_norm": 3.96875, "learning_rate": 3.1924345182602943e-06, "loss": 0.1749, "mean_token_accuracy": 0.9630448269844055, "num_tokens": 4369318393.0, "step": 41300 }, { "entropy": 1.28875, "epoch": 1.0124381763870525, "grad_norm": 3.296875, "learning_rate": 3.1863738898870033e-06, "loss": 0.1669, "mean_token_accuracy": 0.9647123277187347, "num_tokens": 4374659681.0, "step": 41350 }, { "entropy": 1.27265625, "epoch": 1.0136624063464081, "grad_norm": 3.125, "learning_rate": 3.180312497831938e-06, "loss": 0.1567, "mean_token_accuracy": 0.9661735820770264, "num_tokens": 4379733438.0, "step": 41400 }, { "entropy": 1.28484375, "epoch": 1.0148866363057636, "grad_norm": 3.046875, "learning_rate": 3.174250366932133e-06, "loss": 0.1612, "mean_token_accuracy": 0.9659793210029602, "num_tokens": 4384885742.0, "step": 41450 }, { "entropy": 1.2934375, "epoch": 1.0161108662651193, "grad_norm": 3.65625, "learning_rate": 3.1681875220276487e-06, "loss": 0.1702, "mean_token_accuracy": 0.9628891766071319, "num_tokens": 4390251007.0, "step": 41500 }, { "entropy": 1.29703125, "epoch": 1.0173350962244747, "grad_norm": 2.546875, "learning_rate": 3.1621239879614722e-06, "loss": 0.1752, "mean_token_accuracy": 0.9631851124763489, "num_tokens": 4395820970.0, "step": 41550 }, { "entropy": 1.289375, "epoch": 1.0185593261838304, "grad_norm": 3.15625, "learning_rate": 3.1560597895794157e-06, "loss": 0.1651, "mean_token_accuracy": 0.9643260395526886, "num_tokens": 4401284321.0, "step": 41600 }, { "entropy": 1.3046875, "epoch": 1.0197835561431858, "grad_norm": 2.859375, "learning_rate": 3.149994951730011e-06, "loss": 0.1879, "mean_token_accuracy": 0.9601117408275605, "num_tokens": 4406768060.0, "step": 41650 }, { "entropy": 1.29484375, "epoch": 1.0210077861025415, "grad_norm": 3.0, "learning_rate": 3.143929499264413e-06, "loss": 0.1665, "mean_token_accuracy": 0.9648369300365448, "num_tokens": 4412201333.0, "step": 41700 }, { "entropy": 1.28328125, "epoch": 1.0222320160618972, "grad_norm": 3.40625, "learning_rate": 3.137863457036292e-06, "loss": 0.1533, "mean_token_accuracy": 0.9676184570789337, "num_tokens": 4417135073.0, "step": 41750 }, { "entropy": 1.3009375, "epoch": 1.0234562460212526, "grad_norm": 2.828125, "learning_rate": 3.1317968499017366e-06, "loss": 0.1742, "mean_token_accuracy": 0.9627422571182251, "num_tokens": 4422234270.0, "step": 41800 }, { "entropy": 1.29265625, "epoch": 1.0246804759806083, "grad_norm": 1.6875, "learning_rate": 3.1257297027191517e-06, "loss": 0.1579, "mean_token_accuracy": 0.9664195513725281, "num_tokens": 4427309878.0, "step": 41850 }, { "entropy": 1.275, "epoch": 1.0259047059399637, "grad_norm": 2.015625, "learning_rate": 3.1196620403491515e-06, "loss": 0.1651, "mean_token_accuracy": 0.9644128286838531, "num_tokens": 4432672891.0, "step": 41900 }, { "entropy": 1.2815625, "epoch": 1.0271289358993194, "grad_norm": 2.28125, "learning_rate": 3.113593887654463e-06, "loss": 0.1513, "mean_token_accuracy": 0.9673609352111816, "num_tokens": 4437526358.0, "step": 41950 }, { "entropy": 1.290625, "epoch": 1.0283531658586749, "grad_norm": 3.125, "learning_rate": 3.107525269499825e-06, "loss": 0.1706, "mean_token_accuracy": 0.9627550756931305, "num_tokens": 4442820350.0, "step": 42000 }, { "entropy": 1.29484375, "epoch": 1.0295773958180305, "grad_norm": 3.4375, "learning_rate": 3.1014562107518786e-06, "loss": 0.1684, "mean_token_accuracy": 0.9646277678012848, "num_tokens": 4448357734.0, "step": 42050 }, { "entropy": 1.300625, "epoch": 1.030801625777386, "grad_norm": 3.5625, "learning_rate": 3.0953867362790734e-06, "loss": 0.1802, "mean_token_accuracy": 0.9611736404895782, "num_tokens": 4453928087.0, "step": 42100 }, { "entropy": 1.29171875, "epoch": 1.0320258557367417, "grad_norm": 3.375, "learning_rate": 3.089316870951562e-06, "loss": 0.162, "mean_token_accuracy": 0.9649739050865174, "num_tokens": 4458946227.0, "step": 42150 }, { "entropy": 1.289375, "epoch": 1.033250085696097, "grad_norm": 3.21875, "learning_rate": 3.083246639641098e-06, "loss": 0.1723, "mean_token_accuracy": 0.9634380388259888, "num_tokens": 4464192504.0, "step": 42200 }, { "entropy": 1.3146875, "epoch": 1.0344743156554528, "grad_norm": 2.453125, "learning_rate": 3.077176067220935e-06, "loss": 0.1793, "mean_token_accuracy": 0.9617934930324554, "num_tokens": 4469999689.0, "step": 42250 }, { "entropy": 1.3025, "epoch": 1.0356985456148082, "grad_norm": 2.125, "learning_rate": 3.0711051785657236e-06, "loss": 0.1649, "mean_token_accuracy": 0.964527097940445, "num_tokens": 4475221088.0, "step": 42300 }, { "entropy": 1.29015625, "epoch": 1.036922775574164, "grad_norm": 2.84375, "learning_rate": 3.065033998551413e-06, "loss": 0.1741, "mean_token_accuracy": 0.9632121896743775, "num_tokens": 4480484467.0, "step": 42350 }, { "entropy": 1.29890625, "epoch": 1.0381470055335194, "grad_norm": 3.234375, "learning_rate": 3.0589625520551414e-06, "loss": 0.168, "mean_token_accuracy": 0.9637061321735382, "num_tokens": 4486042679.0, "step": 42400 }, { "entropy": 1.31703125, "epoch": 1.039371235492875, "grad_norm": 2.75, "learning_rate": 3.0528908639551436e-06, "loss": 0.1726, "mean_token_accuracy": 0.9634595859050751, "num_tokens": 4491749175.0, "step": 42450 }, { "entropy": 1.280625, "epoch": 1.0405954654522305, "grad_norm": 2.59375, "learning_rate": 3.0468189591306418e-06, "loss": 0.1637, "mean_token_accuracy": 0.9648339354991913, "num_tokens": 4497083391.0, "step": 42500 }, { "entropy": 1.275, "epoch": 1.0418196954115861, "grad_norm": 3.59375, "learning_rate": 3.040746862461747e-06, "loss": 0.1573, "mean_token_accuracy": 0.9660842347145081, "num_tokens": 4502213588.0, "step": 42550 }, { "entropy": 1.27265625, "epoch": 1.0430439253709416, "grad_norm": 2.078125, "learning_rate": 3.0346745988293553e-06, "loss": 0.1638, "mean_token_accuracy": 0.9644993054866791, "num_tokens": 4507601887.0, "step": 42600 }, { "entropy": 1.2703125, "epoch": 1.0442681553302973, "grad_norm": 2.234375, "learning_rate": 3.02860219311505e-06, "loss": 0.162, "mean_token_accuracy": 0.965209093093872, "num_tokens": 4512999351.0, "step": 42650 }, { "entropy": 1.2659375, "epoch": 1.0454923852896527, "grad_norm": 2.84375, "learning_rate": 3.0225296702009917e-06, "loss": 0.1708, "mean_token_accuracy": 0.9636136376857758, "num_tokens": 4518295845.0, "step": 42700 }, { "entropy": 1.27453125, "epoch": 1.0467166152490084, "grad_norm": 2.609375, "learning_rate": 3.016457054969827e-06, "loss": 0.165, "mean_token_accuracy": 0.9648648130893708, "num_tokens": 4523705084.0, "step": 42750 }, { "entropy": 1.27328125, "epoch": 1.0479408452083638, "grad_norm": 2.140625, "learning_rate": 3.0103843723045753e-06, "loss": 0.1587, "mean_token_accuracy": 0.9660780084133148, "num_tokens": 4528928559.0, "step": 42800 }, { "entropy": 1.27140625, "epoch": 1.0491650751677195, "grad_norm": 1.7265625, "learning_rate": 3.004311647088536e-06, "loss": 0.1608, "mean_token_accuracy": 0.9661289596557617, "num_tokens": 4534161929.0, "step": 42850 }, { "entropy": 1.28, "epoch": 1.0503893051270752, "grad_norm": 2.734375, "learning_rate": 2.9982389042051802e-06, "loss": 0.1596, "mean_token_accuracy": 0.9655217385292053, "num_tokens": 4539230226.0, "step": 42900 }, { "entropy": 1.27828125, "epoch": 1.0516135350864306, "grad_norm": 1.8359375, "learning_rate": 2.992166168538055e-06, "loss": 0.1654, "mean_token_accuracy": 0.9645612442493439, "num_tokens": 4544444757.0, "step": 42950 }, { "entropy": 1.28609375, "epoch": 1.0528377650457863, "grad_norm": 3.1875, "learning_rate": 2.986093464970675e-06, "loss": 0.1809, "mean_token_accuracy": 0.961436516046524, "num_tokens": 4550024290.0, "step": 43000 }, { "entropy": 1.2921875, "epoch": 1.0540619950051417, "grad_norm": 1.8046875, "learning_rate": 2.9800208183864225e-06, "loss": 0.1737, "mean_token_accuracy": 0.9631437683105468, "num_tokens": 4555846037.0, "step": 43050 }, { "entropy": 1.29046875, "epoch": 1.0552862249644974, "grad_norm": 2.859375, "learning_rate": 2.97394825366845e-06, "loss": 0.1824, "mean_token_accuracy": 0.9611044287681579, "num_tokens": 4561556919.0, "step": 43100 }, { "entropy": 1.2615625, "epoch": 1.0565104549238529, "grad_norm": 2.578125, "learning_rate": 2.9678757956995704e-06, "loss": 0.1519, "mean_token_accuracy": 0.967376263141632, "num_tokens": 4566754673.0, "step": 43150 }, { "entropy": 1.24921875, "epoch": 1.0577346848832085, "grad_norm": 3.5, "learning_rate": 2.9618034693621624e-06, "loss": 0.1651, "mean_token_accuracy": 0.9647138011455536, "num_tokens": 4571961153.0, "step": 43200 }, { "entropy": 1.27078125, "epoch": 1.058958914842564, "grad_norm": 4.0625, "learning_rate": 2.955731299538065e-06, "loss": 0.1664, "mean_token_accuracy": 0.9643959999084473, "num_tokens": 4577276643.0, "step": 43250 }, { "entropy": 1.27125, "epoch": 1.0601831448019197, "grad_norm": 3.5, "learning_rate": 2.9496593111084725e-06, "loss": 0.1764, "mean_token_accuracy": 0.9621264743804931, "num_tokens": 4582787780.0, "step": 43300 }, { "entropy": 1.2503125, "epoch": 1.0614073747612751, "grad_norm": 0.06201171875, "learning_rate": 2.9435875289538397e-06, "loss": 0.1616, "mean_token_accuracy": 0.9652257537841797, "num_tokens": 4587978646.0, "step": 43350 }, { "entropy": 1.25390625, "epoch": 1.0626316047206308, "grad_norm": 2.34375, "learning_rate": 2.937515977953776e-06, "loss": 0.1601, "mean_token_accuracy": 0.9656472432613373, "num_tokens": 4593105594.0, "step": 43400 }, { "entropy": 1.235, "epoch": 1.0638558346799862, "grad_norm": 1.796875, "learning_rate": 2.93144468298694e-06, "loss": 0.1465, "mean_token_accuracy": 0.9684570038318634, "num_tokens": 4598082227.0, "step": 43450 }, { "entropy": 1.2615625, "epoch": 1.065080064639342, "grad_norm": 1.75, "learning_rate": 2.9253736689309453e-06, "loss": 0.1739, "mean_token_accuracy": 0.9627693855762481, "num_tokens": 4603820936.0, "step": 43500 }, { "entropy": 1.2409375, "epoch": 1.0663042945986974, "grad_norm": 2.4375, "learning_rate": 2.919302960662252e-06, "loss": 0.1665, "mean_token_accuracy": 0.9645286548137665, "num_tokens": 4609111825.0, "step": 43550 }, { "entropy": 1.251875, "epoch": 1.067528524558053, "grad_norm": 2.265625, "learning_rate": 2.9132325830560694e-06, "loss": 0.1708, "mean_token_accuracy": 0.9642206788063049, "num_tokens": 4614988638.0, "step": 43600 }, { "entropy": 1.23515625, "epoch": 1.0687527545174085, "grad_norm": 3.109375, "learning_rate": 2.907162560986249e-06, "loss": 0.1665, "mean_token_accuracy": 0.9648200106620789, "num_tokens": 4620258466.0, "step": 43650 }, { "entropy": 1.23046875, "epoch": 1.0699769844767641, "grad_norm": 1.78125, "learning_rate": 2.9010929193251877e-06, "loss": 0.1587, "mean_token_accuracy": 0.9666041648387909, "num_tokens": 4625541440.0, "step": 43700 }, { "entropy": 1.23578125, "epoch": 1.0712012144361196, "grad_norm": 2.59375, "learning_rate": 2.8950236829437243e-06, "loss": 0.1595, "mean_token_accuracy": 0.9665923917293548, "num_tokens": 4630862596.0, "step": 43750 }, { "entropy": 1.24796875, "epoch": 1.0724254443954753, "grad_norm": 3.625, "learning_rate": 2.8889548767110325e-06, "loss": 0.1726, "mean_token_accuracy": 0.9622351431846619, "num_tokens": 4636080162.0, "step": 43800 }, { "entropy": 1.255, "epoch": 1.0736496743548307, "grad_norm": 2.984375, "learning_rate": 2.882886525494528e-06, "loss": 0.1677, "mean_token_accuracy": 0.9641489648818969, "num_tokens": 4641603830.0, "step": 43850 }, { "entropy": 1.26390625, "epoch": 1.0748739043141864, "grad_norm": 2.203125, "learning_rate": 2.8768186541597617e-06, "loss": 0.1803, "mean_token_accuracy": 0.9621511352062225, "num_tokens": 4647162733.0, "step": 43900 }, { "entropy": 1.26953125, "epoch": 1.0760981342735418, "grad_norm": 2.359375, "learning_rate": 2.8707512875703146e-06, "loss": 0.1724, "mean_token_accuracy": 0.963198972940445, "num_tokens": 4652659894.0, "step": 43950 }, { "entropy": 1.261875, "epoch": 1.0773223642328975, "grad_norm": 2.984375, "learning_rate": 2.8646844505877032e-06, "loss": 0.1702, "mean_token_accuracy": 0.963871557712555, "num_tokens": 4657833019.0, "step": 44000 }, { "entropy": 1.25171875, "epoch": 1.078546594192253, "grad_norm": 3.875, "learning_rate": 2.8586181680712726e-06, "loss": 0.1671, "mean_token_accuracy": 0.9647689509391785, "num_tokens": 4663099416.0, "step": 44050 }, { "entropy": 1.2353125, "epoch": 1.0797708241516086, "grad_norm": 1.921875, "learning_rate": 2.852552464878096e-06, "loss": 0.1626, "mean_token_accuracy": 0.9649975061416626, "num_tokens": 4668463403.0, "step": 44100 }, { "entropy": 1.2309375, "epoch": 1.0809950541109643, "grad_norm": 3.515625, "learning_rate": 2.846487365862872e-06, "loss": 0.1622, "mean_token_accuracy": 0.966260347366333, "num_tokens": 4673588957.0, "step": 44150 }, { "entropy": 1.2703125, "epoch": 1.0822192840703198, "grad_norm": 2.921875, "learning_rate": 2.840422895877824e-06, "loss": 0.1829, "mean_token_accuracy": 0.9611806380748749, "num_tokens": 4679435999.0, "step": 44200 }, { "entropy": 1.21671875, "epoch": 1.0834435140296754, "grad_norm": 2.515625, "learning_rate": 2.8343590797725993e-06, "loss": 0.1595, "mean_token_accuracy": 0.9657203650474548, "num_tokens": 4684283427.0, "step": 44250 }, { "entropy": 1.23546875, "epoch": 1.0846677439890309, "grad_norm": 2.296875, "learning_rate": 2.828295942394163e-06, "loss": 0.1545, "mean_token_accuracy": 0.9663613975048065, "num_tokens": 4689166634.0, "step": 44300 }, { "entropy": 1.2715625, "epoch": 1.0858919739483865, "grad_norm": 2.140625, "learning_rate": 2.822233508586702e-06, "loss": 0.1721, "mean_token_accuracy": 0.9638037300109863, "num_tokens": 4694728156.0, "step": 44350 }, { "entropy": 1.246875, "epoch": 1.087116203907742, "grad_norm": 1.1171875, "learning_rate": 2.8161718031915194e-06, "loss": 0.1629, "mean_token_accuracy": 0.9652890110015869, "num_tokens": 4700054529.0, "step": 44400 }, { "entropy": 1.26203125, "epoch": 1.0883404338670977, "grad_norm": 1.9765625, "learning_rate": 2.8101108510469308e-06, "loss": 0.1667, "mean_token_accuracy": 0.9647334861755371, "num_tokens": 4705521940.0, "step": 44450 }, { "entropy": 1.26171875, "epoch": 1.0895646638264531, "grad_norm": 2.15625, "learning_rate": 2.804050676988169e-06, "loss": 0.1764, "mean_token_accuracy": 0.9625956809520722, "num_tokens": 4711285057.0, "step": 44500 }, { "entropy": 1.26828125, "epoch": 1.0907888937858088, "grad_norm": 3.21875, "learning_rate": 2.797991305847279e-06, "loss": 0.1695, "mean_token_accuracy": 0.9635378420352936, "num_tokens": 4716659220.0, "step": 44550 }, { "entropy": 1.25296875, "epoch": 1.0920131237451642, "grad_norm": 2.84375, "learning_rate": 2.7919327624530105e-06, "loss": 0.1589, "mean_token_accuracy": 0.966244969367981, "num_tokens": 4721738500.0, "step": 44600 }, { "entropy": 1.25390625, "epoch": 1.09323735370452, "grad_norm": 1.5859375, "learning_rate": 2.7858750716307267e-06, "loss": 0.1629, "mean_token_accuracy": 0.9655514645576477, "num_tokens": 4727007974.0, "step": 44650 }, { "entropy": 1.261875, "epoch": 1.0944615836638754, "grad_norm": 3.15625, "learning_rate": 2.7798182582022956e-06, "loss": 0.1666, "mean_token_accuracy": 0.9647921168804169, "num_tokens": 4732247570.0, "step": 44700 }, { "entropy": 1.275, "epoch": 1.095685813623231, "grad_norm": 3.0, "learning_rate": 2.7737623469859904e-06, "loss": 0.1753, "mean_token_accuracy": 0.9633481323719024, "num_tokens": 4737626660.0, "step": 44750 }, { "entropy": 1.27203125, "epoch": 1.0969100435825865, "grad_norm": 3.1875, "learning_rate": 2.767707362796385e-06, "loss": 0.1707, "mean_token_accuracy": 0.9635563850402832, "num_tokens": 4743127298.0, "step": 44800 }, { "entropy": 1.2753125, "epoch": 1.0981342735419422, "grad_norm": 2.28125, "learning_rate": 2.7616533304442583e-06, "loss": 0.1725, "mean_token_accuracy": 0.9624858343601227, "num_tokens": 4748930038.0, "step": 44850 }, { "entropy": 1.251875, "epoch": 1.0993585035012976, "grad_norm": 2.828125, "learning_rate": 2.7556002747364882e-06, "loss": 0.1618, "mean_token_accuracy": 0.965050835609436, "num_tokens": 4754015548.0, "step": 44900 }, { "entropy": 1.24, "epoch": 1.1005827334606533, "grad_norm": 2.515625, "learning_rate": 2.749548220475947e-06, "loss": 0.1556, "mean_token_accuracy": 0.9672428011894226, "num_tokens": 4759064667.0, "step": 44950 }, { "entropy": 1.24671875, "epoch": 1.1018069634200087, "grad_norm": 3.796875, "learning_rate": 2.7434971924614085e-06, "loss": 0.1581, "mean_token_accuracy": 0.9658971416950226, "num_tokens": 4764080171.0, "step": 45000 }, { "entropy": 1.27921875, "epoch": 1.1030311933793644, "grad_norm": 1.6953125, "learning_rate": 2.7374472154874396e-06, "loss": 0.1743, "mean_token_accuracy": 0.9628953158855438, "num_tokens": 4769590544.0, "step": 45050 }, { "entropy": 1.27578125, "epoch": 1.1042554233387198, "grad_norm": 3.203125, "learning_rate": 2.731398314344298e-06, "loss": 0.172, "mean_token_accuracy": 0.9631561875343323, "num_tokens": 4774983478.0, "step": 45100 }, { "entropy": 1.26796875, "epoch": 1.1054796532980755, "grad_norm": 2.625, "learning_rate": 2.7253505138178363e-06, "loss": 0.1626, "mean_token_accuracy": 0.9651547718048096, "num_tokens": 4780291854.0, "step": 45150 }, { "entropy": 1.24359375, "epoch": 1.1067038832574312, "grad_norm": 1.7734375, "learning_rate": 2.719303838689397e-06, "loss": 0.1586, "mean_token_accuracy": 0.9661097753047944, "num_tokens": 4785746067.0, "step": 45200 }, { "entropy": 1.27703125, "epoch": 1.1079281132167866, "grad_norm": 3.21875, "learning_rate": 2.7132583137357085e-06, "loss": 0.173, "mean_token_accuracy": 0.9634435415267945, "num_tokens": 4791411988.0, "step": 45250 }, { "entropy": 1.26703125, "epoch": 1.1091523431761423, "grad_norm": 2.390625, "learning_rate": 2.70721396372879e-06, "loss": 0.1574, "mean_token_accuracy": 0.9663924646377563, "num_tokens": 4796839124.0, "step": 45300 }, { "entropy": 1.251875, "epoch": 1.1103765731354978, "grad_norm": 1.7265625, "learning_rate": 2.7011708134358433e-06, "loss": 0.1702, "mean_token_accuracy": 0.963711371421814, "num_tokens": 4802261281.0, "step": 45350 }, { "entropy": 1.26109375, "epoch": 1.1116008030948534, "grad_norm": 3.34375, "learning_rate": 2.6951288876191554e-06, "loss": 0.163, "mean_token_accuracy": 0.9658736658096313, "num_tokens": 4807722190.0, "step": 45400 }, { "entropy": 1.2421875, "epoch": 1.1128250330542089, "grad_norm": 2.515625, "learning_rate": 2.689088211035996e-06, "loss": 0.1582, "mean_token_accuracy": 0.9665179479122162, "num_tokens": 4812528854.0, "step": 45450 }, { "entropy": 1.26859375, "epoch": 1.1140492630135646, "grad_norm": 4.28125, "learning_rate": 2.6830488084385153e-06, "loss": 0.1633, "mean_token_accuracy": 0.9647966718673706, "num_tokens": 4817654045.0, "step": 45500 }, { "entropy": 1.276875, "epoch": 1.11527349297292, "grad_norm": 3.078125, "learning_rate": 2.6770107045736457e-06, "loss": 0.1659, "mean_token_accuracy": 0.9641125738620758, "num_tokens": 4823118089.0, "step": 45550 }, { "entropy": 1.26890625, "epoch": 1.1164977229322757, "grad_norm": 3.40625, "learning_rate": 2.670973924182993e-06, "loss": 0.1652, "mean_token_accuracy": 0.965114232301712, "num_tokens": 4828253691.0, "step": 45600 }, { "entropy": 1.25203125, "epoch": 1.1177219528916311, "grad_norm": 4.03125, "learning_rate": 2.664938492002745e-06, "loss": 0.1578, "mean_token_accuracy": 0.965950778722763, "num_tokens": 4833456111.0, "step": 45650 }, { "entropy": 1.27203125, "epoch": 1.1189461828509868, "grad_norm": 2.421875, "learning_rate": 2.658904432763564e-06, "loss": 0.172, "mean_token_accuracy": 0.962825288772583, "num_tokens": 4838982999.0, "step": 45700 }, { "entropy": 1.2459375, "epoch": 1.1201704128103422, "grad_norm": 2.53125, "learning_rate": 2.6528717711904823e-06, "loss": 0.1553, "mean_token_accuracy": 0.9660564112663269, "num_tokens": 4844057439.0, "step": 45750 }, { "entropy": 1.2546875, "epoch": 1.121394642769698, "grad_norm": 2.78125, "learning_rate": 2.6468405320028107e-06, "loss": 0.1758, "mean_token_accuracy": 0.9631454050540924, "num_tokens": 4849526204.0, "step": 45800 }, { "entropy": 1.2471875, "epoch": 1.1226188727290534, "grad_norm": 2.375, "learning_rate": 2.6408107399140297e-06, "loss": 0.1525, "mean_token_accuracy": 0.9672383844852448, "num_tokens": 4854563999.0, "step": 45850 }, { "entropy": 1.2390625, "epoch": 1.123843102688409, "grad_norm": 2.796875, "learning_rate": 2.6347824196316884e-06, "loss": 0.1571, "mean_token_accuracy": 0.9666775286197662, "num_tokens": 4859889553.0, "step": 45900 }, { "entropy": 1.251875, "epoch": 1.1250673326477645, "grad_norm": 6.1875, "learning_rate": 2.628755595857308e-06, "loss": 0.1659, "mean_token_accuracy": 0.964877005815506, "num_tokens": 4865439463.0, "step": 45950 }, { "entropy": 1.26578125, "epoch": 1.1262915626071202, "grad_norm": 2.9375, "learning_rate": 2.622730293286276e-06, "loss": 0.1663, "mean_token_accuracy": 0.9647691214084625, "num_tokens": 4870527275.0, "step": 46000 }, { "entropy": 1.2690625, "epoch": 1.1275157925664756, "grad_norm": 3.84375, "learning_rate": 2.6167065366077473e-06, "loss": 0.164, "mean_token_accuracy": 0.9649512505531311, "num_tokens": 4875809735.0, "step": 46050 }, { "entropy": 1.2575, "epoch": 1.1287400225258313, "grad_norm": 3.546875, "learning_rate": 2.6106843505045403e-06, "loss": 0.1637, "mean_token_accuracy": 0.9659580600261688, "num_tokens": 4881072058.0, "step": 46100 }, { "entropy": 1.2534375, "epoch": 1.1299642524851867, "grad_norm": 3.265625, "learning_rate": 2.6046637596530405e-06, "loss": 0.1738, "mean_token_accuracy": 0.9629634070396423, "num_tokens": 4886211504.0, "step": 46150 }, { "entropy": 1.255, "epoch": 1.1311884824445424, "grad_norm": 2.953125, "learning_rate": 2.598644788723097e-06, "loss": 0.1635, "mean_token_accuracy": 0.964535938501358, "num_tokens": 4891417957.0, "step": 46200 }, { "entropy": 1.26296875, "epoch": 1.132412712403898, "grad_norm": 2.25, "learning_rate": 2.5926274623779176e-06, "loss": 0.1648, "mean_token_accuracy": 0.9648210310935974, "num_tokens": 4897027521.0, "step": 46250 }, { "entropy": 1.2715625, "epoch": 1.1336369423632535, "grad_norm": 2.359375, "learning_rate": 2.5866118052739744e-06, "loss": 0.1701, "mean_token_accuracy": 0.9643675744533539, "num_tokens": 4902630666.0, "step": 46300 }, { "entropy": 1.2640625, "epoch": 1.134861172322609, "grad_norm": 2.921875, "learning_rate": 2.5805978420608995e-06, "loss": 0.1588, "mean_token_accuracy": 0.9654871869087219, "num_tokens": 4907957609.0, "step": 46350 }, { "entropy": 1.25765625, "epoch": 1.1360854022819646, "grad_norm": 1.96875, "learning_rate": 2.574585597381383e-06, "loss": 0.1657, "mean_token_accuracy": 0.964663782119751, "num_tokens": 4913108629.0, "step": 46400 }, { "entropy": 1.26984375, "epoch": 1.1373096322413203, "grad_norm": 2.890625, "learning_rate": 2.5685750958710737e-06, "loss": 0.1654, "mean_token_accuracy": 0.9640021121501923, "num_tokens": 4918622288.0, "step": 46450 }, { "entropy": 1.2890625, "epoch": 1.1385338622006758, "grad_norm": 2.59375, "learning_rate": 2.5625663621584777e-06, "loss": 0.1822, "mean_token_accuracy": 0.9616779792308807, "num_tokens": 4924224135.0, "step": 46500 }, { "entropy": 1.2665625, "epoch": 1.1397580921600314, "grad_norm": 3.171875, "learning_rate": 2.5565594208648566e-06, "loss": 0.1703, "mean_token_accuracy": 0.9643717563152313, "num_tokens": 4929573607.0, "step": 46550 }, { "entropy": 1.2684375, "epoch": 1.1409823221193869, "grad_norm": 3.296875, "learning_rate": 2.5505542966041285e-06, "loss": 0.1726, "mean_token_accuracy": 0.9641470229625702, "num_tokens": 4935198269.0, "step": 46600 }, { "entropy": 1.2725, "epoch": 1.1422065520787426, "grad_norm": 2.484375, "learning_rate": 2.5445510139827656e-06, "loss": 0.1731, "mean_token_accuracy": 0.9628414344787598, "num_tokens": 4940751379.0, "step": 46650 }, { "entropy": 1.2690625, "epoch": 1.143430782038098, "grad_norm": 1.78125, "learning_rate": 2.5385495975996952e-06, "loss": 0.1769, "mean_token_accuracy": 0.9626391875743866, "num_tokens": 4946216596.0, "step": 46700 }, { "entropy": 1.27125, "epoch": 1.1446550119974537, "grad_norm": 3.359375, "learning_rate": 2.532550072046194e-06, "loss": 0.179, "mean_token_accuracy": 0.9620010888576508, "num_tokens": 4951891973.0, "step": 46750 }, { "entropy": 1.28984375, "epoch": 1.1458792419568091, "grad_norm": 1.71875, "learning_rate": 2.5265524619057936e-06, "loss": 0.1822, "mean_token_accuracy": 0.9611503231525421, "num_tokens": 4957928188.0, "step": 46800 }, { "entropy": 1.27203125, "epoch": 1.1471034719161648, "grad_norm": 2.265625, "learning_rate": 2.520556791754179e-06, "loss": 0.1675, "mean_token_accuracy": 0.9632143163681031, "num_tokens": 4963189602.0, "step": 46850 }, { "entropy": 1.2546875, "epoch": 1.1483277018755202, "grad_norm": 2.296875, "learning_rate": 2.5145630861590806e-06, "loss": 0.1677, "mean_token_accuracy": 0.9636298882961273, "num_tokens": 4968384917.0, "step": 46900 }, { "entropy": 1.234375, "epoch": 1.149551931834876, "grad_norm": 2.0, "learning_rate": 2.5085713696801825e-06, "loss": 0.1456, "mean_token_accuracy": 0.9684996688365937, "num_tokens": 4973304826.0, "step": 46950 }, { "entropy": 1.2384375, "epoch": 1.1507761617942314, "grad_norm": 3.1875, "learning_rate": 2.5025816668690183e-06, "loss": 0.1615, "mean_token_accuracy": 0.9655906355381012, "num_tokens": 4978583670.0, "step": 47000 }, { "entropy": 1.23921875, "epoch": 1.152000391753587, "grad_norm": 2.796875, "learning_rate": 2.496594002268869e-06, "loss": 0.1633, "mean_token_accuracy": 0.9643825757503509, "num_tokens": 4983769645.0, "step": 47050 }, { "entropy": 1.2440625, "epoch": 1.1532246217129425, "grad_norm": 2.359375, "learning_rate": 2.490608400414664e-06, "loss": 0.1601, "mean_token_accuracy": 0.9659870672225952, "num_tokens": 4989133497.0, "step": 47100 }, { "entropy": 1.2484375, "epoch": 1.1544488516722982, "grad_norm": 3.15625, "learning_rate": 2.484624885832883e-06, "loss": 0.1618, "mean_token_accuracy": 0.9654805910587311, "num_tokens": 4994369533.0, "step": 47150 }, { "entropy": 1.2490625, "epoch": 1.1556730816316536, "grad_norm": 3.109375, "learning_rate": 2.478643483041449e-06, "loss": 0.1616, "mean_token_accuracy": 0.9649089682102203, "num_tokens": 4999527347.0, "step": 47200 }, { "entropy": 1.2553125, "epoch": 1.1568973115910093, "grad_norm": 2.4375, "learning_rate": 2.472664216549633e-06, "loss": 0.1627, "mean_token_accuracy": 0.9657234275341033, "num_tokens": 5004961075.0, "step": 47250 }, { "entropy": 1.24203125, "epoch": 1.1581215415503647, "grad_norm": 2.640625, "learning_rate": 2.466687110857955e-06, "loss": 0.1533, "mean_token_accuracy": 0.9676401782035827, "num_tokens": 5009801621.0, "step": 47300 }, { "entropy": 1.2534375, "epoch": 1.1593457715097204, "grad_norm": 1.875, "learning_rate": 2.4607121904580796e-06, "loss": 0.1689, "mean_token_accuracy": 0.96378169298172, "num_tokens": 5015019832.0, "step": 47350 }, { "entropy": 1.285625, "epoch": 1.1605700014690759, "grad_norm": 2.6875, "learning_rate": 2.4547394798327127e-06, "loss": 0.1824, "mean_token_accuracy": 0.961477290391922, "num_tokens": 5020771556.0, "step": 47400 }, { "entropy": 1.2609375, "epoch": 1.1617942314284315, "grad_norm": 0.00445556640625, "learning_rate": 2.448769003455512e-06, "loss": 0.1606, "mean_token_accuracy": 0.9650316751003265, "num_tokens": 5026174408.0, "step": 47450 }, { "entropy": 1.24875, "epoch": 1.1630184613877872, "grad_norm": 2.96875, "learning_rate": 2.442800785790977e-06, "loss": 0.1554, "mean_token_accuracy": 0.9664806413650513, "num_tokens": 5031142557.0, "step": 47500 }, { "entropy": 1.25828125, "epoch": 1.1642426913471426, "grad_norm": 2.75, "learning_rate": 2.436834851294351e-06, "loss": 0.1731, "mean_token_accuracy": 0.9635387444496155, "num_tokens": 5036598656.0, "step": 47550 }, { "entropy": 1.2440625, "epoch": 1.165466921306498, "grad_norm": 3.125, "learning_rate": 2.4308712244115256e-06, "loss": 0.1652, "mean_token_accuracy": 0.9645625805854797, "num_tokens": 5041932484.0, "step": 47600 }, { "entropy": 1.23, "epoch": 1.1666911512658538, "grad_norm": 4.53125, "learning_rate": 2.4249099295789315e-06, "loss": 0.1503, "mean_token_accuracy": 0.9676901125907897, "num_tokens": 5047049390.0, "step": 47650 }, { "entropy": 1.24640625, "epoch": 1.1679153812252094, "grad_norm": 1.6953125, "learning_rate": 2.4189509912234475e-06, "loss": 0.1754, "mean_token_accuracy": 0.9623109328746796, "num_tokens": 5052498083.0, "step": 47700 }, { "entropy": 1.2228125, "epoch": 1.1691396111845649, "grad_norm": 2.21875, "learning_rate": 2.412994433762295e-06, "loss": 0.1438, "mean_token_accuracy": 0.9679240989685058, "num_tokens": 5057358329.0, "step": 47750 }, { "entropy": 1.236875, "epoch": 1.1703638411439206, "grad_norm": 3.390625, "learning_rate": 2.407040281602942e-06, "loss": 0.1549, "mean_token_accuracy": 0.9666338443756104, "num_tokens": 5062500243.0, "step": 47800 }, { "entropy": 1.2196875, "epoch": 1.171588071103276, "grad_norm": 0.005096435546875, "learning_rate": 2.4010885591429955e-06, "loss": 0.1541, "mean_token_accuracy": 0.9668021559715271, "num_tokens": 5067435842.0, "step": 47850 }, { "entropy": 1.25109375, "epoch": 1.1728123010626317, "grad_norm": 3.40625, "learning_rate": 2.3951392907701115e-06, "loss": 0.1831, "mean_token_accuracy": 0.9610938668251038, "num_tokens": 5073063170.0, "step": 47900 }, { "entropy": 1.24375, "epoch": 1.1740365310219871, "grad_norm": 3.328125, "learning_rate": 2.389192500861888e-06, "loss": 0.1754, "mean_token_accuracy": 0.9621718871593475, "num_tokens": 5078828458.0, "step": 47950 }, { "entropy": 1.26078125, "epoch": 1.1752607609813428, "grad_norm": 2.578125, "learning_rate": 2.3832482137857685e-06, "loss": 0.175, "mean_token_accuracy": 0.9630187213420868, "num_tokens": 5084161692.0, "step": 48000 }, { "entropy": 1.2571875, "epoch": 1.1764849909406982, "grad_norm": 1.8046875, "learning_rate": 2.377306453898938e-06, "loss": 0.1689, "mean_token_accuracy": 0.9643845617771148, "num_tokens": 5089346169.0, "step": 48050 }, { "entropy": 1.25125, "epoch": 1.177709220900054, "grad_norm": 3.34375, "learning_rate": 2.3713672455482293e-06, "loss": 0.1609, "mean_token_accuracy": 0.9652318274974823, "num_tokens": 5094622581.0, "step": 48100 }, { "entropy": 1.24921875, "epoch": 1.1789334508594094, "grad_norm": 1.953125, "learning_rate": 2.36543061307002e-06, "loss": 0.1611, "mean_token_accuracy": 0.9650622093677521, "num_tokens": 5099539248.0, "step": 48150 }, { "entropy": 1.2584375, "epoch": 1.180157680818765, "grad_norm": 4.5, "learning_rate": 2.35949658079013e-06, "loss": 0.1693, "mean_token_accuracy": 0.9631922256946563, "num_tokens": 5104589567.0, "step": 48200 }, { "entropy": 1.26328125, "epoch": 1.1813819107781205, "grad_norm": 3.09375, "learning_rate": 2.3535651730237275e-06, "loss": 0.1613, "mean_token_accuracy": 0.9661449313163757, "num_tokens": 5109766096.0, "step": 48250 }, { "entropy": 1.25484375, "epoch": 1.1826061407374762, "grad_norm": 3.125, "learning_rate": 2.3476364140752266e-06, "loss": 0.1599, "mean_token_accuracy": 0.9653767657279968, "num_tokens": 5114683078.0, "step": 48300 }, { "entropy": 1.28109375, "epoch": 1.1838303706968316, "grad_norm": 3.46875, "learning_rate": 2.341710328238185e-06, "loss": 0.1725, "mean_token_accuracy": 0.9629187500476837, "num_tokens": 5120172628.0, "step": 48350 }, { "entropy": 1.2590625, "epoch": 1.1850546006561873, "grad_norm": 2.125, "learning_rate": 2.335786939795209e-06, "loss": 0.1574, "mean_token_accuracy": 0.966355732679367, "num_tokens": 5125111521.0, "step": 48400 }, { "entropy": 1.2721875, "epoch": 1.1862788306155427, "grad_norm": 2.046875, "learning_rate": 2.3298662730178536e-06, "loss": 0.1635, "mean_token_accuracy": 0.9648284649848938, "num_tokens": 5130646209.0, "step": 48450 }, { "entropy": 1.2484375, "epoch": 1.1875030605748984, "grad_norm": 2.703125, "learning_rate": 2.3239483521665165e-06, "loss": 0.1529, "mean_token_accuracy": 0.9668037176132203, "num_tokens": 5135665531.0, "step": 48500 }, { "entropy": 1.25546875, "epoch": 1.188727290534254, "grad_norm": 1.5703125, "learning_rate": 2.31803320149035e-06, "loss": 0.1674, "mean_token_accuracy": 0.9642703318595887, "num_tokens": 5140993137.0, "step": 48550 }, { "entropy": 1.2759375, "epoch": 1.1899515204936095, "grad_norm": 1.875, "learning_rate": 2.312120845227151e-06, "loss": 0.1682, "mean_token_accuracy": 0.9635923814773559, "num_tokens": 5146394110.0, "step": 48600 }, { "entropy": 1.269375, "epoch": 1.191175750452965, "grad_norm": 2.125, "learning_rate": 2.306211307603269e-06, "loss": 0.1603, "mean_token_accuracy": 0.9650293779373169, "num_tokens": 5151444447.0, "step": 48650 }, { "entropy": 1.2778125, "epoch": 1.1923999804123206, "grad_norm": 3.484375, "learning_rate": 2.3003046128335004e-06, "loss": 0.1725, "mean_token_accuracy": 0.962925443649292, "num_tokens": 5157164016.0, "step": 48700 }, { "entropy": 1.2559375, "epoch": 1.1936242103716763, "grad_norm": 0.00262451171875, "learning_rate": 2.2944007851209967e-06, "loss": 0.1555, "mean_token_accuracy": 0.9663327503204345, "num_tokens": 5162287319.0, "step": 48750 }, { "entropy": 1.25578125, "epoch": 1.1948484403310318, "grad_norm": 1.84375, "learning_rate": 2.2884998486571587e-06, "loss": 0.1623, "mean_token_accuracy": 0.9643605947494507, "num_tokens": 5167697788.0, "step": 48800 }, { "entropy": 1.2621875, "epoch": 1.1960726702903874, "grad_norm": 2.421875, "learning_rate": 2.2826018276215404e-06, "loss": 0.1641, "mean_token_accuracy": 0.9648311936855316, "num_tokens": 5172726413.0, "step": 48850 }, { "entropy": 1.256875, "epoch": 1.197296900249743, "grad_norm": 3.9375, "learning_rate": 2.276706746181751e-06, "loss": 0.1647, "mean_token_accuracy": 0.9653891062736512, "num_tokens": 5177807515.0, "step": 48900 }, { "entropy": 1.24484375, "epoch": 1.1985211302090986, "grad_norm": 3.359375, "learning_rate": 2.2708146284933544e-06, "loss": 0.1491, "mean_token_accuracy": 0.9672402215003967, "num_tokens": 5182682002.0, "step": 48950 }, { "entropy": 1.2434375, "epoch": 1.199745360168454, "grad_norm": 2.09375, "learning_rate": 2.2649254986997666e-06, "loss": 0.1625, "mean_token_accuracy": 0.9646528875827789, "num_tokens": 5187927187.0, "step": 49000 }, { "entropy": 1.26171875, "epoch": 1.2009695901278097, "grad_norm": 2.140625, "learning_rate": 2.2590393809321657e-06, "loss": 0.1601, "mean_token_accuracy": 0.9654495012760163, "num_tokens": 5192885819.0, "step": 49050 }, { "entropy": 1.26296875, "epoch": 1.2021938200871651, "grad_norm": 2.015625, "learning_rate": 2.2531562993093854e-06, "loss": 0.1631, "mean_token_accuracy": 0.9647388279438018, "num_tokens": 5198240652.0, "step": 49100 }, { "entropy": 1.23265625, "epoch": 1.2034180500465208, "grad_norm": 2.5625, "learning_rate": 2.247276277937817e-06, "loss": 0.1537, "mean_token_accuracy": 0.966611897945404, "num_tokens": 5203287957.0, "step": 49150 }, { "entropy": 1.24109375, "epoch": 1.2046422800058763, "grad_norm": 2.671875, "learning_rate": 2.241399340911315e-06, "loss": 0.1582, "mean_token_accuracy": 0.9648150885105133, "num_tokens": 5208259781.0, "step": 49200 }, { "entropy": 1.22828125, "epoch": 1.205866509965232, "grad_norm": 1.796875, "learning_rate": 2.235525512311094e-06, "loss": 0.1659, "mean_token_accuracy": 0.9645445287227631, "num_tokens": 5213559098.0, "step": 49250 }, { "entropy": 1.23921875, "epoch": 1.2070907399245874, "grad_norm": 3.1875, "learning_rate": 2.229654816205632e-06, "loss": 0.1694, "mean_token_accuracy": 0.9639151406288147, "num_tokens": 5218710994.0, "step": 49300 }, { "entropy": 1.2425, "epoch": 1.208314969883943, "grad_norm": 2.828125, "learning_rate": 2.2237872766505715e-06, "loss": 0.1676, "mean_token_accuracy": 0.9631175470352172, "num_tokens": 5224096915.0, "step": 49350 }, { "entropy": 1.25015625, "epoch": 1.2095391998432985, "grad_norm": 3.71875, "learning_rate": 2.2179229176886196e-06, "loss": 0.1731, "mean_token_accuracy": 0.9628188860416412, "num_tokens": 5229833600.0, "step": 49400 }, { "entropy": 1.24265625, "epoch": 1.2107634298026542, "grad_norm": 4.125, "learning_rate": 2.212061763349454e-06, "loss": 0.1616, "mean_token_accuracy": 0.9654302883148194, "num_tokens": 5235131114.0, "step": 49450 }, { "entropy": 1.23765625, "epoch": 1.2119876597620096, "grad_norm": 1.9375, "learning_rate": 2.206203837649615e-06, "loss": 0.1555, "mean_token_accuracy": 0.9665101909637451, "num_tokens": 5240317138.0, "step": 49500 }, { "entropy": 1.24921875, "epoch": 1.2132118897213653, "grad_norm": 1.9609375, "learning_rate": 2.2003491645924195e-06, "loss": 0.1715, "mean_token_accuracy": 0.9628171730041504, "num_tokens": 5245861371.0, "step": 49550 }, { "entropy": 1.24640625, "epoch": 1.2144361196807207, "grad_norm": 2.859375, "learning_rate": 2.194497768167855e-06, "loss": 0.1703, "mean_token_accuracy": 0.9627651238441467, "num_tokens": 5251350220.0, "step": 49600 }, { "entropy": 1.2528125, "epoch": 1.2156603496400764, "grad_norm": 2.296875, "learning_rate": 2.188649672352479e-06, "loss": 0.1707, "mean_token_accuracy": 0.964025752544403, "num_tokens": 5256995465.0, "step": 49650 }, { "entropy": 1.25234375, "epoch": 1.2168845795994319, "grad_norm": 2.703125, "learning_rate": 2.1828049011093286e-06, "loss": 0.1702, "mean_token_accuracy": 0.9648704588413238, "num_tokens": 5262286472.0, "step": 49700 }, { "entropy": 1.241875, "epoch": 1.2181088095587875, "grad_norm": 3.578125, "learning_rate": 2.1769634783878182e-06, "loss": 0.1579, "mean_token_accuracy": 0.9658465564250946, "num_tokens": 5267436922.0, "step": 49750 }, { "entropy": 1.26, "epoch": 1.2193330395181432, "grad_norm": 3.453125, "learning_rate": 2.1711254281236373e-06, "loss": 0.1804, "mean_token_accuracy": 0.9622203695774079, "num_tokens": 5273103073.0, "step": 49800 }, { "entropy": 1.258125, "epoch": 1.2205572694774987, "grad_norm": 2.78125, "learning_rate": 2.1652907742386613e-06, "loss": 0.178, "mean_token_accuracy": 0.9619389712810517, "num_tokens": 5278483949.0, "step": 49850 }, { "entropy": 1.24796875, "epoch": 1.221781499436854, "grad_norm": 1.5546875, "learning_rate": 2.159459540640847e-06, "loss": 0.161, "mean_token_accuracy": 0.9660306286811828, "num_tokens": 5283427597.0, "step": 49900 }, { "entropy": 1.27984375, "epoch": 1.2230057293962098, "grad_norm": 1.5703125, "learning_rate": 2.1536317512241348e-06, "loss": 0.1777, "mean_token_accuracy": 0.9623690032958985, "num_tokens": 5288987030.0, "step": 49950 }, { "entropy": 1.2584375, "epoch": 1.2242299593555654, "grad_norm": 2.71875, "learning_rate": 2.147807429868352e-06, "loss": 0.1658, "mean_token_accuracy": 0.9644541823863984, "num_tokens": 5294529728.0, "step": 50000 }, { "epoch": 1.2242299593555654, "eval_entropy": 1.2479817708333334, "eval_loss": 0.17940963804721832, "eval_mean_token_accuracy": 0.9616454169154167, "eval_num_tokens": 5294529728.0, "eval_runtime": 604.376, "eval_samples_per_second": 15.977, "eval_steps_per_second": 0.2, "step": 50000 }, { "entropy": 1.2259375, "epoch": 1.225454189314921, "grad_norm": 2.09375, "learning_rate": 2.141986600439119e-06, "loss": 0.153, "mean_token_accuracy": 0.9670542335510254, "num_tokens": 5299381949.0, "step": 50050 }, { "entropy": 1.26140625, "epoch": 1.2266784192742766, "grad_norm": 2.625, "learning_rate": 2.1361692867877455e-06, "loss": 0.1754, "mean_token_accuracy": 0.9621517550945282, "num_tokens": 5304936166.0, "step": 50100 }, { "entropy": 1.24140625, "epoch": 1.227902649233632, "grad_norm": 0.00927734375, "learning_rate": 2.1303555127511327e-06, "loss": 0.1545, "mean_token_accuracy": 0.96613614320755, "num_tokens": 5310169155.0, "step": 50150 }, { "entropy": 1.261875, "epoch": 1.2291268791929877, "grad_norm": 2.65625, "learning_rate": 2.124545302151681e-06, "loss": 0.1693, "mean_token_accuracy": 0.9642032277584076, "num_tokens": 5315607723.0, "step": 50200 }, { "entropy": 1.26796875, "epoch": 1.2303511091523431, "grad_norm": 2.640625, "learning_rate": 2.118738678797191e-06, "loss": 0.1677, "mean_token_accuracy": 0.9641611945629119, "num_tokens": 5321112342.0, "step": 50250 }, { "entropy": 1.25578125, "epoch": 1.2315753391116988, "grad_norm": 3.6875, "learning_rate": 2.112935666480758e-06, "loss": 0.1583, "mean_token_accuracy": 0.965636430978775, "num_tokens": 5326352547.0, "step": 50300 }, { "entropy": 1.26484375, "epoch": 1.2327995690710543, "grad_norm": 2.046875, "learning_rate": 2.1071362889806863e-06, "loss": 0.1729, "mean_token_accuracy": 0.963402829170227, "num_tokens": 5331870603.0, "step": 50350 }, { "entropy": 1.27546875, "epoch": 1.23402379903041, "grad_norm": 2.46875, "learning_rate": 2.101340570060385e-06, "loss": 0.1711, "mean_token_accuracy": 0.9636083686351776, "num_tokens": 5337306717.0, "step": 50400 }, { "entropy": 1.24609375, "epoch": 1.2352480289897654, "grad_norm": 4.125, "learning_rate": 2.09554853346827e-06, "loss": 0.1558, "mean_token_accuracy": 0.9663618934154511, "num_tokens": 5342628594.0, "step": 50450 }, { "entropy": 1.2796875, "epoch": 1.236472258949121, "grad_norm": 2.1875, "learning_rate": 2.089760202937671e-06, "loss": 0.1711, "mean_token_accuracy": 0.9637987637519836, "num_tokens": 5348316678.0, "step": 50500 }, { "entropy": 1.2409375, "epoch": 1.2376964889084765, "grad_norm": 4.09375, "learning_rate": 2.0839756021867306e-06, "loss": 0.1499, "mean_token_accuracy": 0.967620609998703, "num_tokens": 5353095952.0, "step": 50550 }, { "entropy": 1.26671875, "epoch": 1.2389207188678322, "grad_norm": 2.1875, "learning_rate": 2.07819475491831e-06, "loss": 0.1675, "mean_token_accuracy": 0.9643842697143554, "num_tokens": 5358561384.0, "step": 50600 }, { "entropy": 1.25734375, "epoch": 1.2401449488271876, "grad_norm": 3.546875, "learning_rate": 2.0724176848198856e-06, "loss": 0.1578, "mean_token_accuracy": 0.9659811770915985, "num_tokens": 5363968041.0, "step": 50650 }, { "entropy": 1.2559375, "epoch": 1.2413691787865433, "grad_norm": 3.265625, "learning_rate": 2.0666444155634613e-06, "loss": 0.1678, "mean_token_accuracy": 0.9649008166790009, "num_tokens": 5369138043.0, "step": 50700 }, { "entropy": 1.2790625, "epoch": 1.2425934087458987, "grad_norm": 1.796875, "learning_rate": 2.0608749708054666e-06, "loss": 0.1717, "mean_token_accuracy": 0.9624824106693268, "num_tokens": 5374681050.0, "step": 50750 }, { "entropy": 1.274375, "epoch": 1.2438176387052544, "grad_norm": 3.5, "learning_rate": 2.0551093741866555e-06, "loss": 0.1653, "mean_token_accuracy": 0.964318573474884, "num_tokens": 5379930328.0, "step": 50800 }, { "entropy": 1.2709375, "epoch": 1.24504186866461, "grad_norm": 2.328125, "learning_rate": 2.0493476493320182e-06, "loss": 0.1639, "mean_token_accuracy": 0.9642879796028138, "num_tokens": 5385290824.0, "step": 50850 }, { "entropy": 1.27515625, "epoch": 1.2462660986239655, "grad_norm": 3.15625, "learning_rate": 2.043589819850679e-06, "loss": 0.1784, "mean_token_accuracy": 0.9621766293048859, "num_tokens": 5390915687.0, "step": 50900 }, { "entropy": 1.26828125, "epoch": 1.247490328583321, "grad_norm": 2.671875, "learning_rate": 2.037835909335799e-06, "loss": 0.1653, "mean_token_accuracy": 0.9644598591327668, "num_tokens": 5396364664.0, "step": 50950 }, { "entropy": 1.23453125, "epoch": 1.2487145585426767, "grad_norm": 3.4375, "learning_rate": 2.032085941364483e-06, "loss": 0.1475, "mean_token_accuracy": 0.9683002579212189, "num_tokens": 5401284379.0, "step": 51000 }, { "entropy": 1.264375, "epoch": 1.2499387885020323, "grad_norm": 2.671875, "learning_rate": 2.026339939497681e-06, "loss": 0.1672, "mean_token_accuracy": 0.9641962945461273, "num_tokens": 5406818098.0, "step": 51050 }, { "entropy": 1.23828125, "epoch": 1.2511630184613878, "grad_norm": 0.2138671875, "learning_rate": 2.020597927280089e-06, "loss": 0.1498, "mean_token_accuracy": 0.9685159015655518, "num_tokens": 5411689647.0, "step": 51100 }, { "entropy": 1.24640625, "epoch": 1.2523872484207432, "grad_norm": 2.640625, "learning_rate": 2.014859928240058e-06, "loss": 0.1583, "mean_token_accuracy": 0.9665188646316528, "num_tokens": 5416677115.0, "step": 51150 }, { "entropy": 1.24796875, "epoch": 1.253611478380099, "grad_norm": 1.84375, "learning_rate": 2.0091259658894926e-06, "loss": 0.1525, "mean_token_accuracy": 0.9675477313995361, "num_tokens": 5422071895.0, "step": 51200 }, { "entropy": 1.27703125, "epoch": 1.2548357083394546, "grad_norm": 2.15625, "learning_rate": 2.00339606372376e-06, "loss": 0.1796, "mean_token_accuracy": 0.9615858125686646, "num_tokens": 5427896152.0, "step": 51250 }, { "entropy": 1.25203125, "epoch": 1.25605993829881, "grad_norm": 2.21875, "learning_rate": 1.9976702452215846e-06, "loss": 0.1615, "mean_token_accuracy": 0.9655699288845062, "num_tokens": 5432956715.0, "step": 51300 }, { "entropy": 1.25671875, "epoch": 1.2572841682581657, "grad_norm": 2.5, "learning_rate": 1.9919485338449633e-06, "loss": 0.1669, "mean_token_accuracy": 0.963955899477005, "num_tokens": 5438521726.0, "step": 51350 }, { "entropy": 1.25890625, "epoch": 1.2585083982175211, "grad_norm": 3.671875, "learning_rate": 1.9862309530390627e-06, "loss": 0.1604, "mean_token_accuracy": 0.9649885761737823, "num_tokens": 5443663826.0, "step": 51400 }, { "entropy": 1.25375, "epoch": 1.2597326281768768, "grad_norm": 1.703125, "learning_rate": 1.98051752623212e-06, "loss": 0.1607, "mean_token_accuracy": 0.9659333276748657, "num_tokens": 5448801306.0, "step": 51450 }, { "entropy": 1.26546875, "epoch": 1.2609568581362323, "grad_norm": 2.234375, "learning_rate": 1.9748082768353554e-06, "loss": 0.1624, "mean_token_accuracy": 0.9649898850917816, "num_tokens": 5454048809.0, "step": 51500 }, { "entropy": 1.2559375, "epoch": 1.262181088095588, "grad_norm": 3.40625, "learning_rate": 1.969103228242872e-06, "loss": 0.1671, "mean_token_accuracy": 0.9636943113803863, "num_tokens": 5459063221.0, "step": 51550 }, { "entropy": 1.26359375, "epoch": 1.2634053180549434, "grad_norm": 0.01025390625, "learning_rate": 1.9634024038315556e-06, "loss": 0.1555, "mean_token_accuracy": 0.9668670952320099, "num_tokens": 5464218533.0, "step": 51600 }, { "entropy": 1.26984375, "epoch": 1.264629548014299, "grad_norm": 1.796875, "learning_rate": 1.9577058269609873e-06, "loss": 0.1677, "mean_token_accuracy": 0.9646493744850159, "num_tokens": 5469633751.0, "step": 51650 }, { "entropy": 1.26015625, "epoch": 1.2658537779736545, "grad_norm": 2.5, "learning_rate": 1.9520135209733434e-06, "loss": 0.1548, "mean_token_accuracy": 0.9670298910140991, "num_tokens": 5474658175.0, "step": 51700 }, { "entropy": 1.24671875, "epoch": 1.2670780079330102, "grad_norm": 2.921875, "learning_rate": 1.9463255091932946e-06, "loss": 0.168, "mean_token_accuracy": 0.9642450773715973, "num_tokens": 5480009732.0, "step": 51750 }, { "entropy": 1.25875, "epoch": 1.2683022378923656, "grad_norm": 2.703125, "learning_rate": 1.9406418149279224e-06, "loss": 0.1667, "mean_token_accuracy": 0.9646876096725464, "num_tokens": 5485352642.0, "step": 51800 }, { "entropy": 1.25078125, "epoch": 1.2695264678517213, "grad_norm": 2.40625, "learning_rate": 1.9349624614666137e-06, "loss": 0.1599, "mean_token_accuracy": 0.9663380241394043, "num_tokens": 5490516069.0, "step": 51850 }, { "entropy": 1.2540625, "epoch": 1.270750697811077, "grad_norm": 2.125, "learning_rate": 1.9292874720809706e-06, "loss": 0.1691, "mean_token_accuracy": 0.9637067282199859, "num_tokens": 5495858878.0, "step": 51900 }, { "entropy": 1.2459375, "epoch": 1.2719749277704324, "grad_norm": 2.03125, "learning_rate": 1.9236168700247085e-06, "loss": 0.1597, "mean_token_accuracy": 0.9652304399013519, "num_tokens": 5500992334.0, "step": 51950 }, { "entropy": 1.26390625, "epoch": 1.2731991577297879, "grad_norm": 2.40625, "learning_rate": 1.9179506785335695e-06, "loss": 0.1784, "mean_token_accuracy": 0.9612833940982819, "num_tokens": 5506364973.0, "step": 52000 }, { "entropy": 1.2540625, "epoch": 1.2744233876891435, "grad_norm": 3.09375, "learning_rate": 1.912288920825224e-06, "loss": 0.1668, "mean_token_accuracy": 0.9639379584789276, "num_tokens": 5511847363.0, "step": 52050 }, { "entropy": 1.26140625, "epoch": 1.2756476176484992, "grad_norm": 3.59375, "learning_rate": 1.9066316200991702e-06, "loss": 0.1739, "mean_token_accuracy": 0.9622644722461701, "num_tokens": 5517402202.0, "step": 52100 }, { "entropy": 1.23109375, "epoch": 1.2768718476078547, "grad_norm": 2.5625, "learning_rate": 1.9009787995366464e-06, "loss": 0.1571, "mean_token_accuracy": 0.9665352630615235, "num_tokens": 5522479618.0, "step": 52150 }, { "entropy": 1.2396875, "epoch": 1.27809607756721, "grad_norm": 1.8046875, "learning_rate": 1.8953304823005346e-06, "loss": 0.159, "mean_token_accuracy": 0.965977475643158, "num_tokens": 5527761846.0, "step": 52200 }, { "entropy": 1.24140625, "epoch": 1.2793203075265658, "grad_norm": 2.765625, "learning_rate": 1.889686691535259e-06, "loss": 0.1713, "mean_token_accuracy": 0.9641374492645264, "num_tokens": 5533078395.0, "step": 52250 }, { "entropy": 1.22796875, "epoch": 1.2805445374859215, "grad_norm": 2.0625, "learning_rate": 1.8840474503667003e-06, "loss": 0.1613, "mean_token_accuracy": 0.96567800283432, "num_tokens": 5538079639.0, "step": 52300 }, { "entropy": 1.233125, "epoch": 1.281768767445277, "grad_norm": 1.828125, "learning_rate": 1.8784127819020977e-06, "loss": 0.1696, "mean_token_accuracy": 0.9639940130710601, "num_tokens": 5543060468.0, "step": 52350 }, { "entropy": 1.23828125, "epoch": 1.2829929974046324, "grad_norm": 2.40625, "learning_rate": 1.8727827092299486e-06, "loss": 0.1713, "mean_token_accuracy": 0.9634285986423492, "num_tokens": 5548455628.0, "step": 52400 }, { "entropy": 1.2671875, "epoch": 1.284217227363988, "grad_norm": 1.8515625, "learning_rate": 1.8671572554199227e-06, "loss": 0.1745, "mean_token_accuracy": 0.9630351853370667, "num_tokens": 5554243712.0, "step": 52450 }, { "entropy": 1.256875, "epoch": 1.2854414573233437, "grad_norm": 3.09375, "learning_rate": 1.8615364435227627e-06, "loss": 0.1713, "mean_token_accuracy": 0.9632880544662475, "num_tokens": 5559645728.0, "step": 52500 }, { "entropy": 1.25578125, "epoch": 1.2866656872826991, "grad_norm": 2.4375, "learning_rate": 1.8559202965701921e-06, "loss": 0.1729, "mean_token_accuracy": 0.9628579890727997, "num_tokens": 5565441017.0, "step": 52550 }, { "entropy": 1.241875, "epoch": 1.2878899172420548, "grad_norm": 3.0625, "learning_rate": 1.850308837574815e-06, "loss": 0.1567, "mean_token_accuracy": 0.9662058663368225, "num_tokens": 5570548727.0, "step": 52600 }, { "entropy": 1.2465625, "epoch": 1.2891141472014103, "grad_norm": 3.671875, "learning_rate": 1.8447020895300304e-06, "loss": 0.1627, "mean_token_accuracy": 0.9654901123046875, "num_tokens": 5575812384.0, "step": 52650 }, { "entropy": 1.26609375, "epoch": 1.290338377160766, "grad_norm": 2.609375, "learning_rate": 1.8391000754099329e-06, "loss": 0.1704, "mean_token_accuracy": 0.9641706418991088, "num_tokens": 5581119333.0, "step": 52700 }, { "entropy": 1.25296875, "epoch": 1.2915626071201214, "grad_norm": 3.484375, "learning_rate": 1.8335028181692183e-06, "loss": 0.1591, "mean_token_accuracy": 0.9657709896564484, "num_tokens": 5586146551.0, "step": 52750 }, { "entropy": 1.26609375, "epoch": 1.292786837079477, "grad_norm": 2.15625, "learning_rate": 1.8279103407430918e-06, "loss": 0.1682, "mean_token_accuracy": 0.9645370328426361, "num_tokens": 5591535827.0, "step": 52800 }, { "entropy": 1.24609375, "epoch": 1.2940110670388325, "grad_norm": 3.09375, "learning_rate": 1.822322666047173e-06, "loss": 0.156, "mean_token_accuracy": 0.966865359544754, "num_tokens": 5596513224.0, "step": 52850 }, { "entropy": 1.23125, "epoch": 1.2952352969981882, "grad_norm": 2.59375, "learning_rate": 1.8167398169774003e-06, "loss": 0.1562, "mean_token_accuracy": 0.9663991129398346, "num_tokens": 5601409756.0, "step": 52900 }, { "entropy": 1.24203125, "epoch": 1.2964595269575436, "grad_norm": 2.09375, "learning_rate": 1.8111618164099405e-06, "loss": 0.1586, "mean_token_accuracy": 0.965841782093048, "num_tokens": 5606579901.0, "step": 52950 }, { "entropy": 1.25640625, "epoch": 1.2976837569168993, "grad_norm": 4.40625, "learning_rate": 1.805588687201094e-06, "loss": 0.1551, "mean_token_accuracy": 0.9661786913871765, "num_tokens": 5611890254.0, "step": 53000 }, { "entropy": 1.27453125, "epoch": 1.2989079868762547, "grad_norm": 2.9375, "learning_rate": 1.8000204521871968e-06, "loss": 0.1736, "mean_token_accuracy": 0.9631719040870667, "num_tokens": 5617317192.0, "step": 53050 }, { "entropy": 1.254375, "epoch": 1.3001322168356104, "grad_norm": 2.0625, "learning_rate": 1.7944571341845338e-06, "loss": 0.1735, "mean_token_accuracy": 0.9628773295879364, "num_tokens": 5622759860.0, "step": 53100 }, { "entropy": 1.251875, "epoch": 1.301356446794966, "grad_norm": 3.328125, "learning_rate": 1.788898755989241e-06, "loss": 0.1544, "mean_token_accuracy": 0.966829891204834, "num_tokens": 5628009830.0, "step": 53150 }, { "entropy": 1.2346875, "epoch": 1.3025806767543215, "grad_norm": 1.9140625, "learning_rate": 1.7833453403772148e-06, "loss": 0.1496, "mean_token_accuracy": 0.9679068636894226, "num_tokens": 5633028331.0, "step": 53200 }, { "entropy": 1.23625, "epoch": 1.303804906713677, "grad_norm": 2.765625, "learning_rate": 1.7777969101040137e-06, "loss": 0.1598, "mean_token_accuracy": 0.9658224785327911, "num_tokens": 5638192081.0, "step": 53250 }, { "entropy": 1.25921875, "epoch": 1.3050291366730327, "grad_norm": 1.765625, "learning_rate": 1.7722534879047704e-06, "loss": 0.1679, "mean_token_accuracy": 0.9648814105987549, "num_tokens": 5643678649.0, "step": 53300 }, { "entropy": 1.23703125, "epoch": 1.3062533666323883, "grad_norm": 2.171875, "learning_rate": 1.7667150964940981e-06, "loss": 0.1542, "mean_token_accuracy": 0.9665197932720184, "num_tokens": 5648865610.0, "step": 53350 }, { "entropy": 1.2546875, "epoch": 1.3074775965917438, "grad_norm": 3.46875, "learning_rate": 1.7611817585659915e-06, "loss": 0.1695, "mean_token_accuracy": 0.96389883518219, "num_tokens": 5654452208.0, "step": 53400 }, { "entropy": 1.23046875, "epoch": 1.3087018265510992, "grad_norm": 2.9375, "learning_rate": 1.7556534967937428e-06, "loss": 0.1477, "mean_token_accuracy": 0.967578010559082, "num_tokens": 5659553855.0, "step": 53450 }, { "entropy": 1.2696875, "epoch": 1.309926056510455, "grad_norm": 2.71875, "learning_rate": 1.750130333829843e-06, "loss": 0.174, "mean_token_accuracy": 0.9626197755336762, "num_tokens": 5665208689.0, "step": 53500 }, { "entropy": 1.230625, "epoch": 1.3111502864698106, "grad_norm": 2.265625, "learning_rate": 1.744612292305887e-06, "loss": 0.1488, "mean_token_accuracy": 0.9678456223011017, "num_tokens": 5670219320.0, "step": 53550 }, { "entropy": 1.26109375, "epoch": 1.312374516429166, "grad_norm": 3.46875, "learning_rate": 1.73909939483249e-06, "loss": 0.176, "mean_token_accuracy": 0.9616470074653626, "num_tokens": 5676005681.0, "step": 53600 }, { "entropy": 1.23359375, "epoch": 1.3135987463885217, "grad_norm": 3.46875, "learning_rate": 1.7335916639991833e-06, "loss": 0.1579, "mean_token_accuracy": 0.9656192350387574, "num_tokens": 5680838804.0, "step": 53650 }, { "entropy": 1.24828125, "epoch": 1.3148229763478771, "grad_norm": 3.25, "learning_rate": 1.7280891223743347e-06, "loss": 0.1663, "mean_token_accuracy": 0.9647430288791656, "num_tokens": 5686118856.0, "step": 53700 }, { "entropy": 1.25203125, "epoch": 1.3160472063072328, "grad_norm": 1.7890625, "learning_rate": 1.7225917925050384e-06, "loss": 0.1808, "mean_token_accuracy": 0.9617255198955535, "num_tokens": 5691606584.0, "step": 53750 }, { "entropy": 1.23875, "epoch": 1.3172714362665883, "grad_norm": 2.796875, "learning_rate": 1.7170996969170434e-06, "loss": 0.1643, "mean_token_accuracy": 0.9644413828849793, "num_tokens": 5697025528.0, "step": 53800 }, { "entropy": 1.23609375, "epoch": 1.318495666225944, "grad_norm": 3.0, "learning_rate": 1.7116128581146443e-06, "loss": 0.1579, "mean_token_accuracy": 0.9660075342655182, "num_tokens": 5702129646.0, "step": 53850 }, { "entropy": 1.239375, "epoch": 1.3197198961852994, "grad_norm": 2.46875, "learning_rate": 1.7061312985805986e-06, "loss": 0.1659, "mean_token_accuracy": 0.9642334473133087, "num_tokens": 5707290385.0, "step": 53900 }, { "entropy": 1.23515625, "epoch": 1.320944126144655, "grad_norm": 3.421875, "learning_rate": 1.7006550407760285e-06, "loss": 0.1636, "mean_token_accuracy": 0.9647632312774658, "num_tokens": 5712555849.0, "step": 53950 }, { "entropy": 1.2396875, "epoch": 1.3221683561040105, "grad_norm": 3.71875, "learning_rate": 1.695184107140337e-06, "loss": 0.1682, "mean_token_accuracy": 0.9639084780216217, "num_tokens": 5717928890.0, "step": 54000 }, { "entropy": 1.2246875, "epoch": 1.3233925860633662, "grad_norm": 3.921875, "learning_rate": 1.6897185200911068e-06, "loss": 0.1468, "mean_token_accuracy": 0.9690938425064087, "num_tokens": 5722987021.0, "step": 54050 }, { "entropy": 1.2565625, "epoch": 1.3246168160227216, "grad_norm": 2.875, "learning_rate": 1.6842583020240137e-06, "loss": 0.166, "mean_token_accuracy": 0.9647270548343658, "num_tokens": 5728523665.0, "step": 54100 }, { "entropy": 1.2253125, "epoch": 1.3258410459820773, "grad_norm": 3.046875, "learning_rate": 1.6788034753127332e-06, "loss": 0.1509, "mean_token_accuracy": 0.9676713216304779, "num_tokens": 5733724051.0, "step": 54150 }, { "entropy": 1.2478125, "epoch": 1.327065275941433, "grad_norm": 1.7890625, "learning_rate": 1.6733540623088485e-06, "loss": 0.1703, "mean_token_accuracy": 0.9635128057003022, "num_tokens": 5739544907.0, "step": 54200 }, { "entropy": 1.245, "epoch": 1.3282895059007884, "grad_norm": 2.25, "learning_rate": 1.6679100853417647e-06, "loss": 0.1592, "mean_token_accuracy": 0.9656123912334442, "num_tokens": 5744896935.0, "step": 54250 }, { "entropy": 1.25453125, "epoch": 1.3295137358601439, "grad_norm": 2.53125, "learning_rate": 1.6624715667186047e-06, "loss": 0.1756, "mean_token_accuracy": 0.962364639043808, "num_tokens": 5750164763.0, "step": 54300 }, { "entropy": 1.23609375, "epoch": 1.3307379658194995, "grad_norm": 3.15625, "learning_rate": 1.6570385287241335e-06, "loss": 0.1577, "mean_token_accuracy": 0.9660208249092102, "num_tokens": 5755265140.0, "step": 54350 }, { "entropy": 1.25390625, "epoch": 1.3319621957788552, "grad_norm": 1.640625, "learning_rate": 1.6516109936206498e-06, "loss": 0.1756, "mean_token_accuracy": 0.9626241695880889, "num_tokens": 5760623089.0, "step": 54400 }, { "entropy": 1.246875, "epoch": 1.3331864257382107, "grad_norm": 2.125, "learning_rate": 1.646188983647912e-06, "loss": 0.1734, "mean_token_accuracy": 0.9631841456890107, "num_tokens": 5766177496.0, "step": 54450 }, { "entropy": 1.26140625, "epoch": 1.3344106556975661, "grad_norm": 2.921875, "learning_rate": 1.6407725210230344e-06, "loss": 0.1766, "mean_token_accuracy": 0.9622941052913666, "num_tokens": 5771692920.0, "step": 54500 }, { "entropy": 1.2415625, "epoch": 1.3356348856569218, "grad_norm": 4.0, "learning_rate": 1.6353616279404013e-06, "loss": 0.1569, "mean_token_accuracy": 0.9662493073940277, "num_tokens": 5777098724.0, "step": 54550 }, { "entropy": 1.23234375, "epoch": 1.3368591156162775, "grad_norm": 1.0, "learning_rate": 1.6299563265715747e-06, "loss": 0.148, "mean_token_accuracy": 0.9682403624057769, "num_tokens": 5782119917.0, "step": 54600 }, { "entropy": 1.25578125, "epoch": 1.338083345575633, "grad_norm": 3.46875, "learning_rate": 1.624556639065207e-06, "loss": 0.1594, "mean_token_accuracy": 0.9662695753574372, "num_tokens": 5787291101.0, "step": 54650 }, { "entropy": 1.24171875, "epoch": 1.3393075755349884, "grad_norm": 3.609375, "learning_rate": 1.6191625875469446e-06, "loss": 0.157, "mean_token_accuracy": 0.9663849449157715, "num_tokens": 5792520283.0, "step": 54700 }, { "entropy": 1.25046875, "epoch": 1.340531805494344, "grad_norm": 1.7734375, "learning_rate": 1.6137741941193398e-06, "loss": 0.1495, "mean_token_accuracy": 0.9671278047561646, "num_tokens": 5797431576.0, "step": 54750 }, { "entropy": 1.26546875, "epoch": 1.3417560354536997, "grad_norm": 2.734375, "learning_rate": 1.6083914808617645e-06, "loss": 0.1765, "mean_token_accuracy": 0.9622493016719819, "num_tokens": 5803286714.0, "step": 54800 }, { "entropy": 1.224375, "epoch": 1.3429802654130552, "grad_norm": 3.109375, "learning_rate": 1.6030144698303079e-06, "loss": 0.1544, "mean_token_accuracy": 0.9669049537181854, "num_tokens": 5807862828.0, "step": 54850 }, { "entropy": 1.26, "epoch": 1.3442044953724108, "grad_norm": 3.0, "learning_rate": 1.5976431830577022e-06, "loss": 0.1636, "mean_token_accuracy": 0.964913833141327, "num_tokens": 5813034358.0, "step": 54900 }, { "entropy": 1.25109375, "epoch": 1.3454287253317663, "grad_norm": 3.515625, "learning_rate": 1.5922776425532186e-06, "loss": 0.1659, "mean_token_accuracy": 0.9639725112915039, "num_tokens": 5818413943.0, "step": 54950 }, { "entropy": 1.2321875, "epoch": 1.346652955291122, "grad_norm": 2.453125, "learning_rate": 1.5869178703025869e-06, "loss": 0.1489, "mean_token_accuracy": 0.9674529373645783, "num_tokens": 5823085402.0, "step": 55000 }, { "entropy": 1.23859375, "epoch": 1.3478771852504774, "grad_norm": 1.7109375, "learning_rate": 1.5815638882678944e-06, "loss": 0.1608, "mean_token_accuracy": 0.9654952967166901, "num_tokens": 5828359072.0, "step": 55050 }, { "entropy": 1.2465625, "epoch": 1.349101415209833, "grad_norm": 2.8125, "learning_rate": 1.5762157183875092e-06, "loss": 0.1618, "mean_token_accuracy": 0.965077908039093, "num_tokens": 5833897215.0, "step": 55100 }, { "entropy": 1.24125, "epoch": 1.3503256451691885, "grad_norm": 2.25, "learning_rate": 1.5708733825759804e-06, "loss": 0.1597, "mean_token_accuracy": 0.9658141255378723, "num_tokens": 5839005187.0, "step": 55150 }, { "entropy": 1.2690625, "epoch": 1.3515498751285442, "grad_norm": 3.0625, "learning_rate": 1.5655369027239507e-06, "loss": 0.1728, "mean_token_accuracy": 0.9630602359771728, "num_tokens": 5844499544.0, "step": 55200 }, { "entropy": 1.2484375, "epoch": 1.3527741050878996, "grad_norm": 2.734375, "learning_rate": 1.5602063006980713e-06, "loss": 0.1606, "mean_token_accuracy": 0.9662463283538818, "num_tokens": 5849831304.0, "step": 55250 }, { "entropy": 1.243125, "epoch": 1.3539983350472553, "grad_norm": 4.125, "learning_rate": 1.5548815983409054e-06, "loss": 0.1584, "mean_token_accuracy": 0.9648811197280884, "num_tokens": 5854831384.0, "step": 55300 }, { "entropy": 1.2475, "epoch": 1.3552225650066108, "grad_norm": 0.0169677734375, "learning_rate": 1.5495628174708422e-06, "loss": 0.1583, "mean_token_accuracy": 0.9666490364074707, "num_tokens": 5860380821.0, "step": 55350 }, { "entropy": 1.2428125, "epoch": 1.3564467949659664, "grad_norm": 2.96875, "learning_rate": 1.5442499798820062e-06, "loss": 0.1636, "mean_token_accuracy": 0.9649770343303681, "num_tokens": 5865590076.0, "step": 55400 }, { "entropy": 1.25265625, "epoch": 1.357671024925322, "grad_norm": 5.0625, "learning_rate": 1.5389431073441742e-06, "loss": 0.1625, "mean_token_accuracy": 0.9651528835296631, "num_tokens": 5870893580.0, "step": 55450 }, { "entropy": 1.2590625, "epoch": 1.3588952548846776, "grad_norm": 3.03125, "learning_rate": 1.5336422216026717e-06, "loss": 0.1708, "mean_token_accuracy": 0.9625674414634705, "num_tokens": 5876137820.0, "step": 55500 }, { "entropy": 1.24515625, "epoch": 1.360119484844033, "grad_norm": 3.046875, "learning_rate": 1.5283473443783021e-06, "loss": 0.1575, "mean_token_accuracy": 0.9658649146556855, "num_tokens": 5881136105.0, "step": 55550 }, { "entropy": 1.2434375, "epoch": 1.3613437148033887, "grad_norm": 3.015625, "learning_rate": 1.5230584973672404e-06, "loss": 0.1716, "mean_token_accuracy": 0.9642657494544983, "num_tokens": 5886333380.0, "step": 55600 }, { "entropy": 1.2628125, "epoch": 1.3625679447627443, "grad_norm": 1.6640625, "learning_rate": 1.5177757022409606e-06, "loss": 0.1788, "mean_token_accuracy": 0.9618762648105621, "num_tokens": 5892147042.0, "step": 55650 }, { "entropy": 1.25875, "epoch": 1.3637921747220998, "grad_norm": 3.0625, "learning_rate": 1.5124989806461293e-06, "loss": 0.1678, "mean_token_accuracy": 0.9644319689273835, "num_tokens": 5897583102.0, "step": 55700 }, { "entropy": 1.2546875, "epoch": 1.3650164046814552, "grad_norm": 2.453125, "learning_rate": 1.5072283542045348e-06, "loss": 0.1558, "mean_token_accuracy": 0.9658961379528046, "num_tokens": 5902701860.0, "step": 55750 }, { "entropy": 1.2584375, "epoch": 1.366240634640811, "grad_norm": 3.03125, "learning_rate": 1.5019638445129849e-06, "loss": 0.1656, "mean_token_accuracy": 0.9642118716239929, "num_tokens": 5908066266.0, "step": 55800 }, { "entropy": 1.24375, "epoch": 1.3674648646001666, "grad_norm": 1.8203125, "learning_rate": 1.496705473143224e-06, "loss": 0.1467, "mean_token_accuracy": 0.9683407878875733, "num_tokens": 5913106858.0, "step": 55850 }, { "entropy": 1.24109375, "epoch": 1.368689094559522, "grad_norm": 2.65625, "learning_rate": 1.4914532616418477e-06, "loss": 0.1619, "mean_token_accuracy": 0.9651940071582794, "num_tokens": 5918299911.0, "step": 55900 }, { "entropy": 1.24296875, "epoch": 1.3699133245188777, "grad_norm": 3.015625, "learning_rate": 1.486207231530207e-06, "loss": 0.1533, "mean_token_accuracy": 0.966886637210846, "num_tokens": 5923373367.0, "step": 55950 }, { "entropy": 1.25984375, "epoch": 1.3711375544782332, "grad_norm": 2.21875, "learning_rate": 1.4809674043043262e-06, "loss": 0.1714, "mean_token_accuracy": 0.9631552195549011, "num_tokens": 5928830248.0, "step": 56000 }, { "entropy": 1.24640625, "epoch": 1.3723617844375888, "grad_norm": 2.84375, "learning_rate": 1.4757338014348108e-06, "loss": 0.17, "mean_token_accuracy": 0.9638724672794342, "num_tokens": 5934360325.0, "step": 56050 }, { "entropy": 1.2428125, "epoch": 1.3735860143969443, "grad_norm": 2.296875, "learning_rate": 1.4705064443667672e-06, "loss": 0.1672, "mean_token_accuracy": 0.9640205073356628, "num_tokens": 5939749032.0, "step": 56100 }, { "entropy": 1.2396875, "epoch": 1.3748102443563, "grad_norm": 3.140625, "learning_rate": 1.4652853545196994e-06, "loss": 0.1698, "mean_token_accuracy": 0.9635356509685516, "num_tokens": 5944946908.0, "step": 56150 }, { "entropy": 1.2471875, "epoch": 1.3760344743156554, "grad_norm": 1.9765625, "learning_rate": 1.4600705532874409e-06, "loss": 0.1612, "mean_token_accuracy": 0.9657069194316864, "num_tokens": 5950153678.0, "step": 56200 }, { "entropy": 1.2515625, "epoch": 1.377258704275011, "grad_norm": 1.6953125, "learning_rate": 1.45486206203805e-06, "loss": 0.1694, "mean_token_accuracy": 0.9643088591098785, "num_tokens": 5955488321.0, "step": 56250 }, { "entropy": 1.24984375, "epoch": 1.3784829342343665, "grad_norm": 2.296875, "learning_rate": 1.4496599021137346e-06, "loss": 0.1802, "mean_token_accuracy": 0.9621450281143189, "num_tokens": 5961263793.0, "step": 56300 }, { "entropy": 1.22625, "epoch": 1.3797071641937222, "grad_norm": 2.203125, "learning_rate": 1.4444640948307554e-06, "loss": 0.1567, "mean_token_accuracy": 0.9664753973484039, "num_tokens": 5966590895.0, "step": 56350 }, { "entropy": 1.2453125, "epoch": 1.3809313941530776, "grad_norm": 1.9921875, "learning_rate": 1.4392746614793446e-06, "loss": 0.162, "mean_token_accuracy": 0.9654717576503754, "num_tokens": 5972160004.0, "step": 56400 }, { "entropy": 1.24125, "epoch": 1.3821556241124333, "grad_norm": 2.90625, "learning_rate": 1.4340916233236167e-06, "loss": 0.1685, "mean_token_accuracy": 0.9643662881851196, "num_tokens": 5977855909.0, "step": 56450 }, { "entropy": 1.2490625, "epoch": 1.383379854071789, "grad_norm": 1.6171875, "learning_rate": 1.4289150016014792e-06, "loss": 0.1663, "mean_token_accuracy": 0.9650551450252532, "num_tokens": 5983284719.0, "step": 56500 }, { "entropy": 1.245, "epoch": 1.3846040840311444, "grad_norm": 1.8046875, "learning_rate": 1.4237448175245523e-06, "loss": 0.1565, "mean_token_accuracy": 0.9658044958114624, "num_tokens": 5988559128.0, "step": 56550 }, { "entropy": 1.24140625, "epoch": 1.3858283139904999, "grad_norm": 2.234375, "learning_rate": 1.4185810922780736e-06, "loss": 0.1665, "mean_token_accuracy": 0.9643181717395782, "num_tokens": 5993939256.0, "step": 56600 }, { "entropy": 1.245, "epoch": 1.3870525439498556, "grad_norm": 2.796875, "learning_rate": 1.413423847020816e-06, "loss": 0.1721, "mean_token_accuracy": 0.963967101573944, "num_tokens": 5999401709.0, "step": 56650 }, { "entropy": 1.23953125, "epoch": 1.3882767739092112, "grad_norm": 3.203125, "learning_rate": 1.4082731028849995e-06, "loss": 0.1636, "mean_token_accuracy": 0.9649562358856201, "num_tokens": 6004763257.0, "step": 56700 }, { "entropy": 1.263125, "epoch": 1.3895010038685667, "grad_norm": 2.3125, "learning_rate": 1.4031288809762096e-06, "loss": 0.1734, "mean_token_accuracy": 0.9629300630092621, "num_tokens": 6010451639.0, "step": 56750 }, { "entropy": 1.23171875, "epoch": 1.3907252338279221, "grad_norm": 2.734375, "learning_rate": 1.397991202373298e-06, "loss": 0.16, "mean_token_accuracy": 0.9664403641223908, "num_tokens": 6015769794.0, "step": 56800 }, { "entropy": 1.24171875, "epoch": 1.3919494637872778, "grad_norm": 1.71875, "learning_rate": 1.3928600881283135e-06, "loss": 0.1741, "mean_token_accuracy": 0.9627274203300477, "num_tokens": 6020957098.0, "step": 56850 }, { "entropy": 1.2315625, "epoch": 1.3931736937466335, "grad_norm": 0.00994873046875, "learning_rate": 1.3877355592664005e-06, "loss": 0.1509, "mean_token_accuracy": 0.9681152474880218, "num_tokens": 6026298682.0, "step": 56900 }, { "entropy": 1.24703125, "epoch": 1.394397923705989, "grad_norm": 2.09375, "learning_rate": 1.3826176367857244e-06, "loss": 0.1599, "mean_token_accuracy": 0.9659165751934051, "num_tokens": 6031577635.0, "step": 56950 }, { "entropy": 1.23828125, "epoch": 1.3956221536653444, "grad_norm": 3.734375, "learning_rate": 1.3775063416573772e-06, "loss": 0.1602, "mean_token_accuracy": 0.9653304886817932, "num_tokens": 6036759854.0, "step": 57000 }, { "entropy": 1.23265625, "epoch": 1.3968463836247, "grad_norm": 1.6875, "learning_rate": 1.3724016948252932e-06, "loss": 0.1561, "mean_token_accuracy": 0.9671315121650695, "num_tokens": 6042005844.0, "step": 57050 }, { "entropy": 1.245, "epoch": 1.3980706135840557, "grad_norm": 4.125, "learning_rate": 1.3673037172061715e-06, "loss": 0.1645, "mean_token_accuracy": 0.9652763676643371, "num_tokens": 6047109956.0, "step": 57100 }, { "entropy": 1.23859375, "epoch": 1.3992948435434112, "grad_norm": 3.53125, "learning_rate": 1.362212429689374e-06, "loss": 0.1638, "mean_token_accuracy": 0.9652803325653077, "num_tokens": 6052155256.0, "step": 57150 }, { "entropy": 1.270625, "epoch": 1.4005190735027668, "grad_norm": 2.140625, "learning_rate": 1.3571278531368583e-06, "loss": 0.1746, "mean_token_accuracy": 0.9618336653709412, "num_tokens": 6057754576.0, "step": 57200 }, { "entropy": 1.25484375, "epoch": 1.4017433034621223, "grad_norm": 1.7109375, "learning_rate": 1.3520500083830786e-06, "loss": 0.1611, "mean_token_accuracy": 0.9656724345684051, "num_tokens": 6063117197.0, "step": 57250 }, { "entropy": 1.25125, "epoch": 1.402967533421478, "grad_norm": 2.5625, "learning_rate": 1.346978916234905e-06, "loss": 0.1737, "mean_token_accuracy": 0.9628279542922974, "num_tokens": 6068604024.0, "step": 57300 }, { "entropy": 1.25015625, "epoch": 1.4041917633808334, "grad_norm": 3.34375, "learning_rate": 1.3419145974715394e-06, "loss": 0.1561, "mean_token_accuracy": 0.9659430325031281, "num_tokens": 6073902078.0, "step": 57350 }, { "entropy": 1.26703125, "epoch": 1.405415993340189, "grad_norm": 3.859375, "learning_rate": 1.3368570728444298e-06, "loss": 0.1718, "mean_token_accuracy": 0.9625124716758728, "num_tokens": 6079405655.0, "step": 57400 }, { "entropy": 1.2446875, "epoch": 1.4066402232995445, "grad_norm": 3.828125, "learning_rate": 1.331806363077184e-06, "loss": 0.1662, "mean_token_accuracy": 0.9648419404029847, "num_tokens": 6084626144.0, "step": 57450 }, { "entropy": 1.23234375, "epoch": 1.4078644532589002, "grad_norm": 2.5625, "learning_rate": 1.3267624888654835e-06, "loss": 0.1479, "mean_token_accuracy": 0.9676874935626983, "num_tokens": 6089664069.0, "step": 57500 }, { "entropy": 1.255625, "epoch": 1.4090886832182556, "grad_norm": 2.609375, "learning_rate": 1.3217254708770053e-06, "loss": 0.1648, "mean_token_accuracy": 0.964464715719223, "num_tokens": 6095025878.0, "step": 57550 }, { "entropy": 1.25140625, "epoch": 1.4103129131776113, "grad_norm": 2.34375, "learning_rate": 1.3166953297513275e-06, "loss": 0.1638, "mean_token_accuracy": 0.9649744808673859, "num_tokens": 6100414900.0, "step": 57600 }, { "entropy": 1.24765625, "epoch": 1.4115371431369668, "grad_norm": 0.0166015625, "learning_rate": 1.311672086099852e-06, "loss": 0.1621, "mean_token_accuracy": 0.9656559634208679, "num_tokens": 6105532948.0, "step": 57650 }, { "entropy": 1.2375, "epoch": 1.4127613730963224, "grad_norm": 2.671875, "learning_rate": 1.3066557605057167e-06, "loss": 0.1633, "mean_token_accuracy": 0.9653026688098908, "num_tokens": 6110851956.0, "step": 57700 }, { "entropy": 1.26578125, "epoch": 1.413985603055678, "grad_norm": 1.9921875, "learning_rate": 1.3016463735237164e-06, "loss": 0.1721, "mean_token_accuracy": 0.9625765991210937, "num_tokens": 6116317682.0, "step": 57750 }, { "entropy": 1.2565625, "epoch": 1.4152098330150336, "grad_norm": 2.03125, "learning_rate": 1.2966439456802059e-06, "loss": 0.1742, "mean_token_accuracy": 0.9632444334030151, "num_tokens": 6122164130.0, "step": 57800 }, { "entropy": 1.24140625, "epoch": 1.416434062974389, "grad_norm": 2.234375, "learning_rate": 1.2916484974730335e-06, "loss": 0.1672, "mean_token_accuracy": 0.9641308975219727, "num_tokens": 6127574306.0, "step": 57850 }, { "entropy": 1.246875, "epoch": 1.4176582929337447, "grad_norm": 2.6875, "learning_rate": 1.2866600493714425e-06, "loss": 0.1725, "mean_token_accuracy": 0.9628300058841706, "num_tokens": 6133295960.0, "step": 57900 }, { "entropy": 1.25625, "epoch": 1.4188825228931004, "grad_norm": 2.546875, "learning_rate": 1.281678621815994e-06, "loss": 0.1727, "mean_token_accuracy": 0.9640992879867554, "num_tokens": 6138729294.0, "step": 57950 }, { "entropy": 1.21765625, "epoch": 1.4201067528524558, "grad_norm": 3.34375, "learning_rate": 1.276704235218481e-06, "loss": 0.1483, "mean_token_accuracy": 0.9675537276268006, "num_tokens": 6143658701.0, "step": 58000 }, { "entropy": 1.24703125, "epoch": 1.4213309828118112, "grad_norm": 3.359375, "learning_rate": 1.2717369099618487e-06, "loss": 0.168, "mean_token_accuracy": 0.9638211143016815, "num_tokens": 6148836685.0, "step": 58050 }, { "entropy": 1.2190625, "epoch": 1.422555212771167, "grad_norm": 3.296875, "learning_rate": 1.2667766664001044e-06, "loss": 0.1527, "mean_token_accuracy": 0.9670968425273895, "num_tokens": 6153703845.0, "step": 58100 }, { "entropy": 1.23734375, "epoch": 1.4237794427305226, "grad_norm": 2.15625, "learning_rate": 1.2618235248582383e-06, "loss": 0.1583, "mean_token_accuracy": 0.9668286955356598, "num_tokens": 6158817391.0, "step": 58150 }, { "entropy": 1.23171875, "epoch": 1.425003672689878, "grad_norm": 3.28125, "learning_rate": 1.2568775056321422e-06, "loss": 0.1593, "mean_token_accuracy": 0.9661485147476196, "num_tokens": 6163833832.0, "step": 58200 }, { "entropy": 1.24328125, "epoch": 1.4262279026492337, "grad_norm": 1.8515625, "learning_rate": 1.25193862898852e-06, "loss": 0.1737, "mean_token_accuracy": 0.9620695877075195, "num_tokens": 6169273441.0, "step": 58250 }, { "entropy": 1.2259375, "epoch": 1.4274521326085892, "grad_norm": 4.15625, "learning_rate": 1.2470069151648105e-06, "loss": 0.1605, "mean_token_accuracy": 0.964862027168274, "num_tokens": 6174358443.0, "step": 58300 }, { "entropy": 1.23609375, "epoch": 1.4286763625679448, "grad_norm": 3.1875, "learning_rate": 1.2420823843691005e-06, "loss": 0.1665, "mean_token_accuracy": 0.9651170766353607, "num_tokens": 6179906475.0, "step": 58350 }, { "entropy": 1.2340625, "epoch": 1.4299005925273003, "grad_norm": 2.90625, "learning_rate": 1.2371650567800477e-06, "loss": 0.1489, "mean_token_accuracy": 0.967512333393097, "num_tokens": 6184768923.0, "step": 58400 }, { "entropy": 1.250625, "epoch": 1.431124822486656, "grad_norm": 3.484375, "learning_rate": 1.2322549525467878e-06, "loss": 0.1697, "mean_token_accuracy": 0.9635206353664398, "num_tokens": 6190151181.0, "step": 58450 }, { "entropy": 1.23453125, "epoch": 1.4323490524460114, "grad_norm": 3.65625, "learning_rate": 1.2273520917888645e-06, "loss": 0.1624, "mean_token_accuracy": 0.9650914788246154, "num_tokens": 6195374468.0, "step": 58500 }, { "entropy": 1.24296875, "epoch": 1.433573282405367, "grad_norm": 2.046875, "learning_rate": 1.2224564945961372e-06, "loss": 0.1738, "mean_token_accuracy": 0.9630816507339478, "num_tokens": 6200703908.0, "step": 58550 }, { "entropy": 1.21984375, "epoch": 1.4347975123647225, "grad_norm": 2.96875, "learning_rate": 1.2175681810287018e-06, "loss": 0.142, "mean_token_accuracy": 0.96914306640625, "num_tokens": 6205730956.0, "step": 58600 }, { "entropy": 1.24125, "epoch": 1.4360217423240782, "grad_norm": 3.109375, "learning_rate": 1.2126871711168126e-06, "loss": 0.1744, "mean_token_accuracy": 0.9625077545642853, "num_tokens": 6211224150.0, "step": 58650 }, { "entropy": 1.23828125, "epoch": 1.4372459722834336, "grad_norm": 2.890625, "learning_rate": 1.2078134848607935e-06, "loss": 0.1578, "mean_token_accuracy": 0.9665833008289337, "num_tokens": 6216480413.0, "step": 58700 }, { "entropy": 1.22734375, "epoch": 1.4384702022427893, "grad_norm": 2.328125, "learning_rate": 1.2029471422309593e-06, "loss": 0.1592, "mean_token_accuracy": 0.9655974650382996, "num_tokens": 6221594113.0, "step": 58750 }, { "entropy": 1.2396875, "epoch": 1.4396944322021448, "grad_norm": 3.15625, "learning_rate": 1.1980881631675338e-06, "loss": 0.1642, "mean_token_accuracy": 0.9646211445331574, "num_tokens": 6226912535.0, "step": 58800 }, { "entropy": 1.2421875, "epoch": 1.4409186621615004, "grad_norm": 2.953125, "learning_rate": 1.1932365675805704e-06, "loss": 0.1704, "mean_token_accuracy": 0.9632949602603912, "num_tokens": 6232510565.0, "step": 58850 }, { "entropy": 1.2271875, "epoch": 1.442142892120856, "grad_norm": 2.5625, "learning_rate": 1.1883923753498652e-06, "loss": 0.1629, "mean_token_accuracy": 0.9651079893112182, "num_tokens": 6237750599.0, "step": 58900 }, { "entropy": 1.235, "epoch": 1.4433671220802116, "grad_norm": 2.4375, "learning_rate": 1.1835556063248796e-06, "loss": 0.157, "mean_token_accuracy": 0.9665428209304809, "num_tokens": 6243089430.0, "step": 58950 }, { "entropy": 1.22171875, "epoch": 1.4445913520395672, "grad_norm": 1.9453125, "learning_rate": 1.1787262803246568e-06, "loss": 0.159, "mean_token_accuracy": 0.9651802563667298, "num_tokens": 6248152093.0, "step": 59000 }, { "entropy": 1.2453125, "epoch": 1.4458155819989227, "grad_norm": 2.078125, "learning_rate": 1.1739044171377455e-06, "loss": 0.1685, "mean_token_accuracy": 0.963554357290268, "num_tokens": 6253653648.0, "step": 59050 }, { "entropy": 1.24859375, "epoch": 1.4470398119582781, "grad_norm": 2.171875, "learning_rate": 1.1690900365221082e-06, "loss": 0.1675, "mean_token_accuracy": 0.9636942827701569, "num_tokens": 6259395328.0, "step": 59100 }, { "entropy": 1.23515625, "epoch": 1.4482640419176338, "grad_norm": 1.9609375, "learning_rate": 1.164283158205053e-06, "loss": 0.163, "mean_token_accuracy": 0.9648255848884583, "num_tokens": 6264597318.0, "step": 59150 }, { "entropy": 1.22296875, "epoch": 1.4494882718769895, "grad_norm": 2.890625, "learning_rate": 1.1594838018831444e-06, "loss": 0.1506, "mean_token_accuracy": 0.9675889956951141, "num_tokens": 6269482590.0, "step": 59200 }, { "entropy": 1.26875, "epoch": 1.450712501836345, "grad_norm": 3.71875, "learning_rate": 1.1546919872221238e-06, "loss": 0.1858, "mean_token_accuracy": 0.9605572533607483, "num_tokens": 6275753206.0, "step": 59250 }, { "entropy": 1.235625, "epoch": 1.4519367317957004, "grad_norm": 3.34375, "learning_rate": 1.1499077338568329e-06, "loss": 0.1589, "mean_token_accuracy": 0.9655532228946686, "num_tokens": 6281061992.0, "step": 59300 }, { "entropy": 1.2371875, "epoch": 1.453160961755056, "grad_norm": 2.09375, "learning_rate": 1.1451310613911282e-06, "loss": 0.1668, "mean_token_accuracy": 0.9643084633350373, "num_tokens": 6286356933.0, "step": 59350 }, { "entropy": 1.22546875, "epoch": 1.4543851917144117, "grad_norm": 0.0068359375, "learning_rate": 1.1403619893978035e-06, "loss": 0.1536, "mean_token_accuracy": 0.9669885611534119, "num_tokens": 6291298254.0, "step": 59400 }, { "entropy": 1.22484375, "epoch": 1.4556094216737672, "grad_norm": 2.734375, "learning_rate": 1.1356005374185075e-06, "loss": 0.1541, "mean_token_accuracy": 0.9667747104167939, "num_tokens": 6296386141.0, "step": 59450 }, { "entropy": 1.233125, "epoch": 1.4568336516331228, "grad_norm": 2.890625, "learning_rate": 1.1308467249636693e-06, "loss": 0.1546, "mean_token_accuracy": 0.9666030180454254, "num_tokens": 6301578433.0, "step": 59500 }, { "entropy": 1.228125, "epoch": 1.4580578815924783, "grad_norm": 2.171875, "learning_rate": 1.1261005715124106e-06, "loss": 0.1653, "mean_token_accuracy": 0.9642830669879914, "num_tokens": 6306834089.0, "step": 59550 }, { "entropy": 1.24140625, "epoch": 1.459282111551834, "grad_norm": 3.28125, "learning_rate": 1.1213620965124711e-06, "loss": 0.1713, "mean_token_accuracy": 0.9641312193870545, "num_tokens": 6312270957.0, "step": 59600 }, { "entropy": 1.238125, "epoch": 1.4605063415111894, "grad_norm": 2.46875, "learning_rate": 1.1166313193801264e-06, "loss": 0.1717, "mean_token_accuracy": 0.9619838237762451, "num_tokens": 6317571444.0, "step": 59650 }, { "entropy": 1.235, "epoch": 1.461730571470545, "grad_norm": 1.6328125, "learning_rate": 1.1119082595001127e-06, "loss": 0.1617, "mean_token_accuracy": 0.9648803687095642, "num_tokens": 6322810865.0, "step": 59700 }, { "entropy": 1.24390625, "epoch": 1.4629548014299005, "grad_norm": 2.421875, "learning_rate": 1.1071929362255407e-06, "loss": 0.1768, "mean_token_accuracy": 0.9624212658405304, "num_tokens": 6328065527.0, "step": 59750 }, { "entropy": 1.2346875, "epoch": 1.4641790313892562, "grad_norm": 1.90625, "learning_rate": 1.102485368877821e-06, "loss": 0.1547, "mean_token_accuracy": 0.96669025182724, "num_tokens": 6332934140.0, "step": 59800 }, { "entropy": 1.24796875, "epoch": 1.4654032613486117, "grad_norm": 2.34375, "learning_rate": 1.0977855767465834e-06, "loss": 0.1683, "mean_token_accuracy": 0.9648297607898713, "num_tokens": 6338286149.0, "step": 59850 }, { "entropy": 1.23640625, "epoch": 1.4666274913079673, "grad_norm": 2.703125, "learning_rate": 1.0930935790895982e-06, "loss": 0.1481, "mean_token_accuracy": 0.9682129454612732, "num_tokens": 6343347728.0, "step": 59900 }, { "entropy": 1.2359375, "epoch": 1.4678517212673228, "grad_norm": 1.65625, "learning_rate": 1.0884093951326982e-06, "loss": 0.1662, "mean_token_accuracy": 0.9638714647293091, "num_tokens": 6348595585.0, "step": 59950 }, { "entropy": 1.23671875, "epoch": 1.4690759512266784, "grad_norm": 2.0625, "learning_rate": 1.083733044069698e-06, "loss": 0.1533, "mean_token_accuracy": 0.9660887753963471, "num_tokens": 6353539392.0, "step": 60000 }, { "epoch": 1.4690759512266784, "eval_entropy": 1.2380208333333333, "eval_loss": 0.17763087153434753, "eval_mean_token_accuracy": 0.9620065187414487, "eval_num_tokens": 6353539392.0, "eval_runtime": 603.0528, "eval_samples_per_second": 16.012, "eval_steps_per_second": 0.201, "step": 60000 }, { "entropy": 1.229375, "epoch": 1.4703001811860341, "grad_norm": 2.96875, "learning_rate": 1.0790645450623166e-06, "loss": 0.1552, "mean_token_accuracy": 0.9666960227489472, "num_tokens": 6358769999.0, "step": 60050 }, { "entropy": 1.23546875, "epoch": 1.4715244111453896, "grad_norm": 3.0, "learning_rate": 1.0744039172400965e-06, "loss": 0.1538, "mean_token_accuracy": 0.9672531485557556, "num_tokens": 6363778830.0, "step": 60100 }, { "entropy": 1.21921875, "epoch": 1.472748641104745, "grad_norm": 2.171875, "learning_rate": 1.0697511797003325e-06, "loss": 0.1562, "mean_token_accuracy": 0.9664645326137543, "num_tokens": 6368813861.0, "step": 60150 }, { "entropy": 1.2353125, "epoch": 1.4739728710641007, "grad_norm": 2.296875, "learning_rate": 1.0651063515079833e-06, "loss": 0.1537, "mean_token_accuracy": 0.9665102756023407, "num_tokens": 6374106711.0, "step": 60200 }, { "entropy": 1.22546875, "epoch": 1.4751971010234564, "grad_norm": 1.75, "learning_rate": 1.0604694516956e-06, "loss": 0.151, "mean_token_accuracy": 0.9675907123088837, "num_tokens": 6379244247.0, "step": 60250 }, { "entropy": 1.22953125, "epoch": 1.4764213309828118, "grad_norm": 2.71875, "learning_rate": 1.055840499263247e-06, "loss": 0.1624, "mean_token_accuracy": 0.964186635017395, "num_tokens": 6384481392.0, "step": 60300 }, { "entropy": 1.23578125, "epoch": 1.4776455609421673, "grad_norm": 3.5625, "learning_rate": 1.0512195131784247e-06, "loss": 0.1575, "mean_token_accuracy": 0.965451090335846, "num_tokens": 6389460183.0, "step": 60350 }, { "entropy": 1.2571875, "epoch": 1.478869790901523, "grad_norm": 2.359375, "learning_rate": 1.0466065123759882e-06, "loss": 0.1706, "mean_token_accuracy": 0.9634547913074494, "num_tokens": 6395040346.0, "step": 60400 }, { "entropy": 1.2428125, "epoch": 1.4800940208608786, "grad_norm": 2.578125, "learning_rate": 1.0420015157580736e-06, "loss": 0.1614, "mean_token_accuracy": 0.9662553632259369, "num_tokens": 6400379406.0, "step": 60450 }, { "entropy": 1.2384375, "epoch": 1.481318250820234, "grad_norm": 2.03125, "learning_rate": 1.0374045421940215e-06, "loss": 0.1574, "mean_token_accuracy": 0.9662669360637665, "num_tokens": 6405924043.0, "step": 60500 }, { "entropy": 1.24515625, "epoch": 1.4825424807795895, "grad_norm": 2.609375, "learning_rate": 1.0328156105202916e-06, "loss": 0.1666, "mean_token_accuracy": 0.9644035375118256, "num_tokens": 6411487076.0, "step": 60550 }, { "entropy": 1.22125, "epoch": 1.4837667107389452, "grad_norm": 2.59375, "learning_rate": 1.0282347395403978e-06, "loss": 0.1501, "mean_token_accuracy": 0.9667956507205964, "num_tokens": 6416699077.0, "step": 60600 }, { "entropy": 1.22921875, "epoch": 1.4849909406983008, "grad_norm": 3.0625, "learning_rate": 1.0236619480248205e-06, "loss": 0.1649, "mean_token_accuracy": 0.9641565144062042, "num_tokens": 6421663477.0, "step": 60650 }, { "entropy": 1.223125, "epoch": 1.4862151706576563, "grad_norm": 0.012939453125, "learning_rate": 1.0190972547109352e-06, "loss": 0.1465, "mean_token_accuracy": 0.9683601307868958, "num_tokens": 6426657308.0, "step": 60700 }, { "entropy": 1.23296875, "epoch": 1.487439400617012, "grad_norm": 2.34375, "learning_rate": 1.0145406783029337e-06, "loss": 0.1654, "mean_token_accuracy": 0.9649899744987488, "num_tokens": 6432023783.0, "step": 60750 }, { "entropy": 1.2375, "epoch": 1.4886636305763674, "grad_norm": 3.046875, "learning_rate": 1.0099922374717499e-06, "loss": 0.162, "mean_token_accuracy": 0.9657110357284546, "num_tokens": 6437497556.0, "step": 60800 }, { "entropy": 1.2409375, "epoch": 1.489887860535723, "grad_norm": 3.96875, "learning_rate": 1.0054519508549797e-06, "loss": 0.177, "mean_token_accuracy": 0.9619574582576752, "num_tokens": 6443350702.0, "step": 60850 }, { "entropy": 1.23125, "epoch": 1.4911120904950785, "grad_norm": 2.78125, "learning_rate": 1.0009198370568066e-06, "loss": 0.1627, "mean_token_accuracy": 0.964564654827118, "num_tokens": 6448658491.0, "step": 60900 }, { "entropy": 1.2153125, "epoch": 1.4923363204544342, "grad_norm": 3.15625, "learning_rate": 9.96395914647927e-07, "loss": 0.1507, "mean_token_accuracy": 0.9671316814422607, "num_tokens": 6453556941.0, "step": 60950 }, { "entropy": 1.21390625, "epoch": 1.4935605504137897, "grad_norm": 2.5, "learning_rate": 9.91880202165471e-07, "loss": 0.1637, "mean_token_accuracy": 0.964778846502304, "num_tokens": 6458635664.0, "step": 61000 }, { "entropy": 1.23234375, "epoch": 1.4947847803731453, "grad_norm": 2.671875, "learning_rate": 9.873727181129275e-07, "loss": 0.17, "mean_token_accuracy": 0.9645189070701599, "num_tokens": 6464088495.0, "step": 61050 }, { "entropy": 1.23078125, "epoch": 1.4960090103325008, "grad_norm": 2.953125, "learning_rate": 9.828734809600687e-07, "loss": 0.1594, "mean_token_accuracy": 0.9656787288188934, "num_tokens": 6469234194.0, "step": 61100 }, { "entropy": 1.22921875, "epoch": 1.4972332402918564, "grad_norm": 2.25, "learning_rate": 9.783825091428782e-07, "loss": 0.1618, "mean_token_accuracy": 0.965996481180191, "num_tokens": 6474528140.0, "step": 61150 }, { "entropy": 1.2309375, "epoch": 1.498457470251212, "grad_norm": 2.234375, "learning_rate": 9.738998210634644e-07, "loss": 0.1728, "mean_token_accuracy": 0.9626871156692505, "num_tokens": 6480082901.0, "step": 61200 }, { "entropy": 1.23, "epoch": 1.4996817002105676, "grad_norm": 3.796875, "learning_rate": 9.694254350900005e-07, "loss": 0.1585, "mean_token_accuracy": 0.9654109764099121, "num_tokens": 6485373470.0, "step": 61250 }, { "entropy": 1.24640625, "epoch": 1.5009059301699232, "grad_norm": 2.15625, "learning_rate": 9.649593695566355e-07, "loss": 0.1673, "mean_token_accuracy": 0.9639886951446534, "num_tokens": 6490817618.0, "step": 61300 }, { "entropy": 1.2215625, "epoch": 1.5021301601292787, "grad_norm": 2.34375, "learning_rate": 9.605016427634272e-07, "loss": 0.1513, "mean_token_accuracy": 0.9674781799316406, "num_tokens": 6495843357.0, "step": 61350 }, { "entropy": 1.2359375, "epoch": 1.5033543900886341, "grad_norm": 1.9609375, "learning_rate": 9.560522729762628e-07, "loss": 0.1621, "mean_token_accuracy": 0.96533607006073, "num_tokens": 6500949587.0, "step": 61400 }, { "entropy": 1.2359375, "epoch": 1.5045786200479898, "grad_norm": 2.875, "learning_rate": 9.516112784267896e-07, "loss": 0.1714, "mean_token_accuracy": 0.9637338280677795, "num_tokens": 6506340396.0, "step": 61450 }, { "entropy": 1.2215625, "epoch": 1.5058028500073455, "grad_norm": 4.65625, "learning_rate": 9.471786773123337e-07, "loss": 0.1591, "mean_token_accuracy": 0.9650114715099335, "num_tokens": 6511689926.0, "step": 61500 }, { "entropy": 1.243125, "epoch": 1.507027079966701, "grad_norm": 2.859375, "learning_rate": 9.427544877958278e-07, "loss": 0.1678, "mean_token_accuracy": 0.9641639375686646, "num_tokens": 6517204008.0, "step": 61550 }, { "entropy": 1.21296875, "epoch": 1.5082513099260564, "grad_norm": 2.453125, "learning_rate": 9.383387280057409e-07, "loss": 0.1615, "mean_token_accuracy": 0.9646773946285248, "num_tokens": 6522483140.0, "step": 61600 }, { "entropy": 1.23234375, "epoch": 1.509475539885412, "grad_norm": 2.953125, "learning_rate": 9.339314160359977e-07, "loss": 0.1588, "mean_token_accuracy": 0.9658515179157257, "num_tokens": 6527644811.0, "step": 61650 }, { "entropy": 1.233125, "epoch": 1.5106997698447677, "grad_norm": 3.125, "learning_rate": 9.295325699459082e-07, "loss": 0.1629, "mean_token_accuracy": 0.9652837121486664, "num_tokens": 6532774529.0, "step": 61700 }, { "entropy": 1.22390625, "epoch": 1.5119239998041232, "grad_norm": 3.90625, "learning_rate": 9.251422077600911e-07, "loss": 0.1658, "mean_token_accuracy": 0.9642423093318939, "num_tokens": 6538188895.0, "step": 61750 }, { "entropy": 1.223125, "epoch": 1.5131482297634786, "grad_norm": 2.84375, "learning_rate": 9.207603474684063e-07, "loss": 0.1576, "mean_token_accuracy": 0.9674744582176209, "num_tokens": 6543389288.0, "step": 61800 }, { "entropy": 1.22765625, "epoch": 1.5143724597228343, "grad_norm": 2.375, "learning_rate": 9.163870070258698e-07, "loss": 0.1563, "mean_token_accuracy": 0.9665237212181091, "num_tokens": 6548548612.0, "step": 61850 }, { "entropy": 1.22171875, "epoch": 1.51559668968219, "grad_norm": 3.375, "learning_rate": 9.120222043525931e-07, "loss": 0.1515, "mean_token_accuracy": 0.9670096004009247, "num_tokens": 6553657775.0, "step": 61900 }, { "entropy": 1.23703125, "epoch": 1.5168209196415454, "grad_norm": 2.15625, "learning_rate": 9.076659573337e-07, "loss": 0.1619, "mean_token_accuracy": 0.9654546058177949, "num_tokens": 6559027325.0, "step": 61950 }, { "entropy": 1.22515625, "epoch": 1.5180451496009009, "grad_norm": 1.8359375, "learning_rate": 9.033182838192564e-07, "loss": 0.1595, "mean_token_accuracy": 0.9660532510280609, "num_tokens": 6564515287.0, "step": 62000 }, { "entropy": 1.236875, "epoch": 1.5192693795602565, "grad_norm": 1.9375, "learning_rate": 8.98979201624201e-07, "loss": 0.161, "mean_token_accuracy": 0.9655505573749542, "num_tokens": 6569987402.0, "step": 62050 }, { "entropy": 1.249375, "epoch": 1.5204936095196122, "grad_norm": 3.125, "learning_rate": 8.946487285282659e-07, "loss": 0.1724, "mean_token_accuracy": 0.9626421999931335, "num_tokens": 6575526706.0, "step": 62100 }, { "entropy": 1.225, "epoch": 1.5217178394789679, "grad_norm": 3.65625, "learning_rate": 8.903268822759075e-07, "loss": 0.1615, "mean_token_accuracy": 0.966062741279602, "num_tokens": 6580795009.0, "step": 62150 }, { "entropy": 1.23375, "epoch": 1.5229420694383233, "grad_norm": 4.5, "learning_rate": 8.860136805762319e-07, "loss": 0.1617, "mean_token_accuracy": 0.9658437705039978, "num_tokens": 6586016211.0, "step": 62200 }, { "entropy": 1.2359375, "epoch": 1.5241662993976788, "grad_norm": 2.359375, "learning_rate": 8.817091411029271e-07, "loss": 0.1593, "mean_token_accuracy": 0.966154944896698, "num_tokens": 6591160444.0, "step": 62250 }, { "entropy": 1.2134375, "epoch": 1.5253905293570345, "grad_norm": 2.390625, "learning_rate": 8.774132814941828e-07, "loss": 0.1579, "mean_token_accuracy": 0.9668165516853332, "num_tokens": 6596300228.0, "step": 62300 }, { "entropy": 1.21953125, "epoch": 1.5266147593163901, "grad_norm": 2.859375, "learning_rate": 8.731261193526248e-07, "loss": 0.1586, "mean_token_accuracy": 0.9657115602493286, "num_tokens": 6601689242.0, "step": 62350 }, { "entropy": 1.2521875, "epoch": 1.5278389892757456, "grad_norm": 3.046875, "learning_rate": 8.688476722452379e-07, "loss": 0.1732, "mean_token_accuracy": 0.9633473336696625, "num_tokens": 6607301069.0, "step": 62400 }, { "entropy": 1.23234375, "epoch": 1.529063219235101, "grad_norm": 2.0, "learning_rate": 8.645779577033011e-07, "loss": 0.1655, "mean_token_accuracy": 0.9651632213592529, "num_tokens": 6612690182.0, "step": 62450 }, { "entropy": 1.22234375, "epoch": 1.5302874491944567, "grad_norm": 2.234375, "learning_rate": 8.603169932223042e-07, "loss": 0.1644, "mean_token_accuracy": 0.9645105350017548, "num_tokens": 6618066965.0, "step": 62500 }, { "entropy": 1.2134375, "epoch": 1.5315116791538124, "grad_norm": 3.03125, "learning_rate": 8.560647962618894e-07, "loss": 0.1473, "mean_token_accuracy": 0.9680246078968048, "num_tokens": 6623009283.0, "step": 62550 }, { "entropy": 1.2359375, "epoch": 1.5327359091131678, "grad_norm": 2.9375, "learning_rate": 8.518213842457696e-07, "loss": 0.1684, "mean_token_accuracy": 0.9639063477516174, "num_tokens": 6628694150.0, "step": 62600 }, { "entropy": 1.2365625, "epoch": 1.5339601390725233, "grad_norm": 2.15625, "learning_rate": 8.475867745616605e-07, "loss": 0.1699, "mean_token_accuracy": 0.9639629209041596, "num_tokens": 6634163539.0, "step": 62650 }, { "entropy": 1.23515625, "epoch": 1.535184369031879, "grad_norm": 1.8125, "learning_rate": 8.433609845612123e-07, "loss": 0.1681, "mean_token_accuracy": 0.9637242484092713, "num_tokens": 6639673078.0, "step": 62700 }, { "entropy": 1.21796875, "epoch": 1.5364085989912346, "grad_norm": 2.1875, "learning_rate": 8.39144031559933e-07, "loss": 0.1653, "mean_token_accuracy": 0.9641383695602417, "num_tokens": 6645021375.0, "step": 62750 }, { "entropy": 1.218125, "epoch": 1.53763282895059, "grad_norm": 3.65625, "learning_rate": 8.349359328371241e-07, "loss": 0.1557, "mean_token_accuracy": 0.9672486507892608, "num_tokens": 6650282385.0, "step": 62800 }, { "entropy": 1.24453125, "epoch": 1.5388570589099455, "grad_norm": 2.265625, "learning_rate": 8.307367056357993e-07, "loss": 0.1744, "mean_token_accuracy": 0.9627921509742737, "num_tokens": 6655617849.0, "step": 62850 }, { "entropy": 1.2384375, "epoch": 1.5400812888693012, "grad_norm": 2.234375, "learning_rate": 8.265463671626277e-07, "loss": 0.1643, "mean_token_accuracy": 0.9646277320384979, "num_tokens": 6660898400.0, "step": 62900 }, { "entropy": 1.2315625, "epoch": 1.5413055188286569, "grad_norm": 2.53125, "learning_rate": 8.223649345878521e-07, "loss": 0.1595, "mean_token_accuracy": 0.9663047862052917, "num_tokens": 6666546321.0, "step": 62950 }, { "entropy": 1.21890625, "epoch": 1.5425297487880123, "grad_norm": 0.01312255859375, "learning_rate": 8.181924250452234e-07, "loss": 0.1479, "mean_token_accuracy": 0.9685621929168701, "num_tokens": 6671900409.0, "step": 63000 }, { "entropy": 1.22140625, "epoch": 1.5437539787473677, "grad_norm": 2.578125, "learning_rate": 8.140288556319295e-07, "loss": 0.1564, "mean_token_accuracy": 0.9663173937797547, "num_tokens": 6676916235.0, "step": 63050 }, { "entropy": 1.2315625, "epoch": 1.5449782087067234, "grad_norm": 2.171875, "learning_rate": 8.098742434085274e-07, "loss": 0.1619, "mean_token_accuracy": 0.9653417527675628, "num_tokens": 6681811077.0, "step": 63100 }, { "entropy": 1.2278125, "epoch": 1.546202438666079, "grad_norm": 2.609375, "learning_rate": 8.057286053988688e-07, "loss": 0.155, "mean_token_accuracy": 0.9668863129615783, "num_tokens": 6687079259.0, "step": 63150 }, { "entropy": 1.23734375, "epoch": 1.5474266686254348, "grad_norm": 2.3125, "learning_rate": 8.015919585900328e-07, "loss": 0.1698, "mean_token_accuracy": 0.9634287714958191, "num_tokens": 6692413841.0, "step": 63200 }, { "entropy": 1.20328125, "epoch": 1.5486508985847902, "grad_norm": 2.28125, "learning_rate": 7.974643199322591e-07, "loss": 0.1459, "mean_token_accuracy": 0.9686257600784302, "num_tokens": 6697530112.0, "step": 63250 }, { "entropy": 1.238125, "epoch": 1.5498751285441457, "grad_norm": 2.84375, "learning_rate": 7.933457063388733e-07, "loss": 0.171, "mean_token_accuracy": 0.9629907369613647, "num_tokens": 6702988908.0, "step": 63300 }, { "entropy": 1.215625, "epoch": 1.5510993585035013, "grad_norm": 1.6953125, "learning_rate": 7.892361346862206e-07, "loss": 0.1588, "mean_token_accuracy": 0.9652132534980774, "num_tokens": 6708127766.0, "step": 63350 }, { "entropy": 1.216875, "epoch": 1.552323588462857, "grad_norm": 3.546875, "learning_rate": 7.851356218135953e-07, "loss": 0.1565, "mean_token_accuracy": 0.9663667130470276, "num_tokens": 6713202542.0, "step": 63400 }, { "entropy": 1.2165625, "epoch": 1.5535478184222125, "grad_norm": 3.546875, "learning_rate": 7.810441845231768e-07, "loss": 0.1562, "mean_token_accuracy": 0.9665763390064239, "num_tokens": 6718170250.0, "step": 63450 }, { "entropy": 1.239375, "epoch": 1.554772048381568, "grad_norm": 2.671875, "learning_rate": 7.769618395799495e-07, "loss": 0.1701, "mean_token_accuracy": 0.9642471766471863, "num_tokens": 6723417011.0, "step": 63500 }, { "entropy": 1.20984375, "epoch": 1.5559962783409236, "grad_norm": 2.6875, "learning_rate": 7.728886037116482e-07, "loss": 0.1445, "mean_token_accuracy": 0.9684971439838409, "num_tokens": 6728453094.0, "step": 63550 }, { "entropy": 1.22625, "epoch": 1.5572205083002792, "grad_norm": 3.203125, "learning_rate": 7.688244936086779e-07, "loss": 0.1591, "mean_token_accuracy": 0.9653982555866242, "num_tokens": 6733460582.0, "step": 63600 }, { "entropy": 1.23765625, "epoch": 1.5584447382596347, "grad_norm": 1.5625, "learning_rate": 7.64769525924052e-07, "loss": 0.1631, "mean_token_accuracy": 0.9650383579730988, "num_tokens": 6739025377.0, "step": 63650 }, { "entropy": 1.241875, "epoch": 1.5596689682189901, "grad_norm": 1.921875, "learning_rate": 7.607237172733212e-07, "loss": 0.1629, "mean_token_accuracy": 0.9644639611244201, "num_tokens": 6744632607.0, "step": 63700 }, { "entropy": 1.21015625, "epoch": 1.5608931981783458, "grad_norm": 2.8125, "learning_rate": 7.566870842345078e-07, "loss": 0.1438, "mean_token_accuracy": 0.9694548106193542, "num_tokens": 6749711105.0, "step": 63750 }, { "entropy": 1.22625, "epoch": 1.5621174281377015, "grad_norm": 2.5625, "learning_rate": 7.526596433480352e-07, "loss": 0.162, "mean_token_accuracy": 0.9650256216526032, "num_tokens": 6755001114.0, "step": 63800 }, { "entropy": 1.2203125, "epoch": 1.563341658097057, "grad_norm": 2.078125, "learning_rate": 7.486414111166603e-07, "loss": 0.1585, "mean_token_accuracy": 0.9653235769271851, "num_tokens": 6760148593.0, "step": 63850 }, { "entropy": 1.22015625, "epoch": 1.5645658880564124, "grad_norm": 4.15625, "learning_rate": 7.446324040054098e-07, "loss": 0.1545, "mean_token_accuracy": 0.9676208901405334, "num_tokens": 6765196202.0, "step": 63900 }, { "entropy": 1.2396875, "epoch": 1.565790118015768, "grad_norm": 1.6640625, "learning_rate": 7.406326384415069e-07, "loss": 0.1645, "mean_token_accuracy": 0.964854439496994, "num_tokens": 6770864758.0, "step": 63950 }, { "entropy": 1.23265625, "epoch": 1.5670143479751237, "grad_norm": 4.53125, "learning_rate": 7.366421308143074e-07, "loss": 0.1678, "mean_token_accuracy": 0.9636308062076568, "num_tokens": 6776309871.0, "step": 64000 }, { "entropy": 1.22203125, "epoch": 1.5682385779344792, "grad_norm": 2.484375, "learning_rate": 7.326608974752318e-07, "loss": 0.1537, "mean_token_accuracy": 0.9670063924789428, "num_tokens": 6781591477.0, "step": 64050 }, { "entropy": 1.23421875, "epoch": 1.5694628078938346, "grad_norm": 3.671875, "learning_rate": 7.286889547377019e-07, "loss": 0.1576, "mean_token_accuracy": 0.9661747896671296, "num_tokens": 6787008758.0, "step": 64100 }, { "entropy": 1.2321875, "epoch": 1.5706870378531903, "grad_norm": 2.921875, "learning_rate": 7.247263188770635e-07, "loss": 0.1658, "mean_token_accuracy": 0.9641131579875946, "num_tokens": 6792453198.0, "step": 64150 }, { "entropy": 1.23484375, "epoch": 1.571911267812546, "grad_norm": 3.59375, "learning_rate": 7.207730061305342e-07, "loss": 0.1715, "mean_token_accuracy": 0.9631493031978607, "num_tokens": 6798199941.0, "step": 64200 }, { "entropy": 1.2353125, "epoch": 1.5731354977719014, "grad_norm": 3.3125, "learning_rate": 7.168290326971248e-07, "loss": 0.1629, "mean_token_accuracy": 0.9649174082279205, "num_tokens": 6803443062.0, "step": 64250 }, { "entropy": 1.220625, "epoch": 1.5743597277312569, "grad_norm": 2.203125, "learning_rate": 7.128944147375779e-07, "loss": 0.1518, "mean_token_accuracy": 0.967359025478363, "num_tokens": 6808707076.0, "step": 64300 }, { "entropy": 1.2209375, "epoch": 1.5755839576906125, "grad_norm": 1.9375, "learning_rate": 7.08969168374304e-07, "loss": 0.1596, "mean_token_accuracy": 0.9663796508312226, "num_tokens": 6813958298.0, "step": 64350 }, { "entropy": 1.2228125, "epoch": 1.5768081876499682, "grad_norm": 3.828125, "learning_rate": 7.050533096913104e-07, "loss": 0.162, "mean_token_accuracy": 0.9654451417922973, "num_tokens": 6819296053.0, "step": 64400 }, { "entropy": 1.228125, "epoch": 1.578032417609324, "grad_norm": 2.390625, "learning_rate": 7.011468547341376e-07, "loss": 0.1488, "mean_token_accuracy": 0.9677229869365692, "num_tokens": 6824596472.0, "step": 64450 }, { "entropy": 1.23953125, "epoch": 1.5792566475686793, "grad_norm": 2.84375, "learning_rate": 6.972498195097937e-07, "loss": 0.1723, "mean_token_accuracy": 0.962650990486145, "num_tokens": 6830407037.0, "step": 64500 }, { "entropy": 1.230625, "epoch": 1.5804808775280348, "grad_norm": 2.375, "learning_rate": 6.933622199866912e-07, "loss": 0.1624, "mean_token_accuracy": 0.9654111993312836, "num_tokens": 6835900402.0, "step": 64550 }, { "entropy": 1.234375, "epoch": 1.5817051074873905, "grad_norm": 3.0625, "learning_rate": 6.894840720945754e-07, "loss": 0.1665, "mean_token_accuracy": 0.9645081627368927, "num_tokens": 6841235827.0, "step": 64600 }, { "entropy": 1.21890625, "epoch": 1.5829293374467461, "grad_norm": 2.421875, "learning_rate": 6.856153917244647e-07, "loss": 0.1611, "mean_token_accuracy": 0.9654888653755188, "num_tokens": 6846579737.0, "step": 64650 }, { "entropy": 1.2153125, "epoch": 1.5841535674061016, "grad_norm": 3.203125, "learning_rate": 6.81756194728583e-07, "loss": 0.1546, "mean_token_accuracy": 0.9667556810379029, "num_tokens": 6851881949.0, "step": 64700 }, { "entropy": 1.22421875, "epoch": 1.585377797365457, "grad_norm": 3.046875, "learning_rate": 6.779064969202973e-07, "loss": 0.1583, "mean_token_accuracy": 0.966250067949295, "num_tokens": 6857094183.0, "step": 64750 }, { "entropy": 1.2265625, "epoch": 1.5866020273248127, "grad_norm": 2.96875, "learning_rate": 6.740663140740467e-07, "loss": 0.163, "mean_token_accuracy": 0.9652321350574493, "num_tokens": 6862381095.0, "step": 64800 }, { "entropy": 1.2184375, "epoch": 1.5878262572841684, "grad_norm": 1.9765625, "learning_rate": 6.70235661925287e-07, "loss": 0.1594, "mean_token_accuracy": 0.965182945728302, "num_tokens": 6867345829.0, "step": 64850 }, { "entropy": 1.22640625, "epoch": 1.5890504872435238, "grad_norm": 2.6875, "learning_rate": 6.664145561704173e-07, "loss": 0.1548, "mean_token_accuracy": 0.9668359410762787, "num_tokens": 6872899925.0, "step": 64900 }, { "entropy": 1.23359375, "epoch": 1.5902747172028793, "grad_norm": 2.265625, "learning_rate": 6.626030124667204e-07, "loss": 0.1695, "mean_token_accuracy": 0.9634568047523498, "num_tokens": 6878428253.0, "step": 64950 }, { "entropy": 1.22609375, "epoch": 1.591498947162235, "grad_norm": 3.40625, "learning_rate": 6.588010464323006e-07, "loss": 0.1689, "mean_token_accuracy": 0.9639648401737213, "num_tokens": 6883915733.0, "step": 65000 }, { "entropy": 1.22859375, "epoch": 1.5927231771215906, "grad_norm": 2.28125, "learning_rate": 6.550086736460136e-07, "loss": 0.1719, "mean_token_accuracy": 0.9634046721458435, "num_tokens": 6889133852.0, "step": 65050 }, { "entropy": 1.23578125, "epoch": 1.593947407080946, "grad_norm": 2.984375, "learning_rate": 6.512259096474075e-07, "loss": 0.1729, "mean_token_accuracy": 0.9630839240550995, "num_tokens": 6894861703.0, "step": 65100 }, { "entropy": 1.21921875, "epoch": 1.5951716370403015, "grad_norm": 2.484375, "learning_rate": 6.474527699366567e-07, "loss": 0.1599, "mean_token_accuracy": 0.965704824924469, "num_tokens": 6899940861.0, "step": 65150 }, { "entropy": 1.21625, "epoch": 1.5963958669996572, "grad_norm": 2.078125, "learning_rate": 6.436892699745009e-07, "loss": 0.1572, "mean_token_accuracy": 0.9666438150405884, "num_tokens": 6905083361.0, "step": 65200 }, { "entropy": 1.2153125, "epoch": 1.5976200969590129, "grad_norm": 3.5625, "learning_rate": 6.399354251821792e-07, "loss": 0.1554, "mean_token_accuracy": 0.9674275135993957, "num_tokens": 6910092703.0, "step": 65250 }, { "entropy": 1.22984375, "epoch": 1.5988443269183683, "grad_norm": 2.828125, "learning_rate": 6.361912509413676e-07, "loss": 0.1645, "mean_token_accuracy": 0.9646131348609924, "num_tokens": 6915320978.0, "step": 65300 }, { "entropy": 1.22984375, "epoch": 1.6000685568777238, "grad_norm": 2.546875, "learning_rate": 6.32456762594116e-07, "loss": 0.1594, "mean_token_accuracy": 0.9651407063007355, "num_tokens": 6920827957.0, "step": 65350 }, { "entropy": 1.21140625, "epoch": 1.6012927868370794, "grad_norm": 2.578125, "learning_rate": 6.287319754427873e-07, "loss": 0.1533, "mean_token_accuracy": 0.9665750122070312, "num_tokens": 6926133415.0, "step": 65400 }, { "entropy": 1.22109375, "epoch": 1.602517016796435, "grad_norm": 2.859375, "learning_rate": 6.250169047499916e-07, "loss": 0.1563, "mean_token_accuracy": 0.9660931730270386, "num_tokens": 6931165132.0, "step": 65450 }, { "entropy": 1.2040625, "epoch": 1.6037412467557908, "grad_norm": 3.890625, "learning_rate": 6.213115657385244e-07, "loss": 0.1473, "mean_token_accuracy": 0.9677533149719239, "num_tokens": 6936236474.0, "step": 65500 }, { "entropy": 1.22515625, "epoch": 1.6049654767151462, "grad_norm": 1.9140625, "learning_rate": 6.176159735913079e-07, "loss": 0.1698, "mean_token_accuracy": 0.9640149748325348, "num_tokens": 6941667389.0, "step": 65550 }, { "entropy": 1.210625, "epoch": 1.6061897066745017, "grad_norm": 2.828125, "learning_rate": 6.139301434513204e-07, "loss": 0.1495, "mean_token_accuracy": 0.9672247707843781, "num_tokens": 6947023413.0, "step": 65600 }, { "entropy": 1.21921875, "epoch": 1.6074139366338573, "grad_norm": 2.859375, "learning_rate": 6.102540904215455e-07, "loss": 0.1579, "mean_token_accuracy": 0.9656173276901245, "num_tokens": 6952441096.0, "step": 65650 }, { "entropy": 1.223125, "epoch": 1.608638166593213, "grad_norm": 3.71875, "learning_rate": 6.065878295649004e-07, "loss": 0.166, "mean_token_accuracy": 0.9646958529949188, "num_tokens": 6957942190.0, "step": 65700 }, { "entropy": 1.2084375, "epoch": 1.6098623965525685, "grad_norm": 2.3125, "learning_rate": 6.0293137590418e-07, "loss": 0.15, "mean_token_accuracy": 0.9669717216491699, "num_tokens": 6963300846.0, "step": 65750 }, { "entropy": 1.22921875, "epoch": 1.611086626511924, "grad_norm": 2.078125, "learning_rate": 5.992847444219915e-07, "loss": 0.1614, "mean_token_accuracy": 0.9650086772441864, "num_tokens": 6968779335.0, "step": 65800 }, { "entropy": 1.22625, "epoch": 1.6123108564712796, "grad_norm": 2.78125, "learning_rate": 5.956479500606977e-07, "loss": 0.171, "mean_token_accuracy": 0.9639155077934265, "num_tokens": 6974202109.0, "step": 65850 }, { "entropy": 1.21328125, "epoch": 1.6135350864306353, "grad_norm": 3.375, "learning_rate": 5.920210077223508e-07, "loss": 0.1488, "mean_token_accuracy": 0.9683645820617676, "num_tokens": 6979171497.0, "step": 65900 }, { "entropy": 1.21875, "epoch": 1.6147593163899907, "grad_norm": 2.734375, "learning_rate": 5.884039322686345e-07, "loss": 0.1593, "mean_token_accuracy": 0.9662387585639953, "num_tokens": 6984410380.0, "step": 65950 }, { "entropy": 1.198125, "epoch": 1.6159835463493462, "grad_norm": 2.0, "learning_rate": 5.847967385208012e-07, "loss": 0.1521, "mean_token_accuracy": 0.966891850233078, "num_tokens": 6989408812.0, "step": 66000 }, { "entropy": 1.20296875, "epoch": 1.6172077763087018, "grad_norm": 2.109375, "learning_rate": 5.81199441259614e-07, "loss": 0.1509, "mean_token_accuracy": 0.9681426680088043, "num_tokens": 6994432516.0, "step": 66050 }, { "entropy": 1.225625, "epoch": 1.6184320062680575, "grad_norm": 3.140625, "learning_rate": 5.776120552252833e-07, "loss": 0.1638, "mean_token_accuracy": 0.965145457983017, "num_tokens": 6999763932.0, "step": 66100 }, { "entropy": 1.22421875, "epoch": 1.619656236227413, "grad_norm": 3.078125, "learning_rate": 5.740345951174062e-07, "loss": 0.1654, "mean_token_accuracy": 0.9642065274715423, "num_tokens": 7005089905.0, "step": 66150 }, { "entropy": 1.238125, "epoch": 1.6208804661867684, "grad_norm": 2.78125, "learning_rate": 5.704670755949111e-07, "loss": 0.1742, "mean_token_accuracy": 0.962605128288269, "num_tokens": 7010758688.0, "step": 66200 }, { "entropy": 1.2284375, "epoch": 1.622104696146124, "grad_norm": 2.359375, "learning_rate": 5.669095112759893e-07, "loss": 0.1699, "mean_token_accuracy": 0.9639213311672211, "num_tokens": 7015757555.0, "step": 66250 }, { "entropy": 1.215, "epoch": 1.6233289261054797, "grad_norm": 3.609375, "learning_rate": 5.633619167380439e-07, "loss": 0.1542, "mean_token_accuracy": 0.9669547820091248, "num_tokens": 7020934918.0, "step": 66300 }, { "entropy": 1.20421875, "epoch": 1.6245531560648352, "grad_norm": 2.609375, "learning_rate": 5.598243065176243e-07, "loss": 0.1491, "mean_token_accuracy": 0.9682400977611542, "num_tokens": 7026062287.0, "step": 66350 }, { "entropy": 1.224375, "epoch": 1.6257773860241906, "grad_norm": 3.328125, "learning_rate": 5.56296695110368e-07, "loss": 0.1563, "mean_token_accuracy": 0.965864109992981, "num_tokens": 7031243491.0, "step": 66400 }, { "entropy": 1.21640625, "epoch": 1.6270016159835463, "grad_norm": 1.875, "learning_rate": 5.527790969709421e-07, "loss": 0.1591, "mean_token_accuracy": 0.9661051654815673, "num_tokens": 7036518719.0, "step": 66450 }, { "entropy": 1.21765625, "epoch": 1.628225845942902, "grad_norm": 2.265625, "learning_rate": 5.492715265129842e-07, "loss": 0.1526, "mean_token_accuracy": 0.967378306388855, "num_tokens": 7041605356.0, "step": 66500 }, { "entropy": 1.22578125, "epoch": 1.6294500759022574, "grad_norm": 3.25, "learning_rate": 5.457739981090422e-07, "loss": 0.1608, "mean_token_accuracy": 0.965805538892746, "num_tokens": 7047131119.0, "step": 66550 }, { "entropy": 1.22296875, "epoch": 1.6306743058616129, "grad_norm": 3.9375, "learning_rate": 5.422865260905141e-07, "loss": 0.162, "mean_token_accuracy": 0.9653668451309204, "num_tokens": 7052461810.0, "step": 66600 }, { "entropy": 1.2321875, "epoch": 1.6318985358209686, "grad_norm": 2.015625, "learning_rate": 5.388091247475948e-07, "loss": 0.1674, "mean_token_accuracy": 0.9641144728660583, "num_tokens": 7057861665.0, "step": 66650 }, { "entropy": 1.22, "epoch": 1.6331227657803242, "grad_norm": 4.875, "learning_rate": 5.35341808329211e-07, "loss": 0.1612, "mean_token_accuracy": 0.9650032806396485, "num_tokens": 7063074323.0, "step": 66700 }, { "entropy": 1.2290625, "epoch": 1.63434699573968, "grad_norm": 2.21875, "learning_rate": 5.31884591042966e-07, "loss": 0.1642, "mean_token_accuracy": 0.9645574033260346, "num_tokens": 7068850662.0, "step": 66750 }, { "entropy": 1.21140625, "epoch": 1.6355712256990353, "grad_norm": 2.21875, "learning_rate": 5.284374870550806e-07, "loss": 0.1513, "mean_token_accuracy": 0.9664854764938354, "num_tokens": 7073845156.0, "step": 66800 }, { "entropy": 1.2134375, "epoch": 1.6367954556583908, "grad_norm": 2.59375, "learning_rate": 5.250005104903391e-07, "loss": 0.1526, "mean_token_accuracy": 0.9672818171977997, "num_tokens": 7078890553.0, "step": 66850 }, { "entropy": 1.21890625, "epoch": 1.6380196856177465, "grad_norm": 3.1875, "learning_rate": 5.215736754320221e-07, "loss": 0.1559, "mean_token_accuracy": 0.9661696362495422, "num_tokens": 7084113116.0, "step": 66900 }, { "entropy": 1.2209375, "epoch": 1.6392439155771021, "grad_norm": 2.578125, "learning_rate": 5.181569959218593e-07, "loss": 0.1537, "mean_token_accuracy": 0.9654488229751587, "num_tokens": 7089341607.0, "step": 66950 }, { "entropy": 1.21953125, "epoch": 1.6404681455364576, "grad_norm": 2.9375, "learning_rate": 5.147504859599658e-07, "loss": 0.1627, "mean_token_accuracy": 0.9644181895256042, "num_tokens": 7094625061.0, "step": 67000 }, { "entropy": 1.226875, "epoch": 1.641692375495813, "grad_norm": 2.78125, "learning_rate": 5.113541595047853e-07, "loss": 0.1638, "mean_token_accuracy": 0.9646450591087341, "num_tokens": 7100017216.0, "step": 67050 }, { "entropy": 1.22828125, "epoch": 1.6429166054551687, "grad_norm": 2.515625, "learning_rate": 5.079680304730336e-07, "loss": 0.1632, "mean_token_accuracy": 0.9642895436286927, "num_tokens": 7105647390.0, "step": 67100 }, { "entropy": 1.2190625, "epoch": 1.6441408354145244, "grad_norm": 3.1875, "learning_rate": 5.045921127396446e-07, "loss": 0.1568, "mean_token_accuracy": 0.9664517366886138, "num_tokens": 7111038795.0, "step": 67150 }, { "entropy": 1.20453125, "epoch": 1.6453650653738798, "grad_norm": 3.5625, "learning_rate": 5.012264201377073e-07, "loss": 0.1546, "mean_token_accuracy": 0.9667070829868316, "num_tokens": 7116213641.0, "step": 67200 }, { "entropy": 1.22828125, "epoch": 1.6465892953332353, "grad_norm": 2.484375, "learning_rate": 4.978709664584132e-07, "loss": 0.1502, "mean_token_accuracy": 0.9669265413284301, "num_tokens": 7121369080.0, "step": 67250 }, { "entropy": 1.2240625, "epoch": 1.647813525292591, "grad_norm": 2.328125, "learning_rate": 4.945257654510013e-07, "loss": 0.1614, "mean_token_accuracy": 0.966176050901413, "num_tokens": 7126738052.0, "step": 67300 }, { "entropy": 1.21375, "epoch": 1.6490377552519466, "grad_norm": 3.21875, "learning_rate": 4.911908308226965e-07, "loss": 0.1425, "mean_token_accuracy": 0.969027806520462, "num_tokens": 7131902692.0, "step": 67350 }, { "entropy": 1.20609375, "epoch": 1.650261985211302, "grad_norm": 2.46875, "learning_rate": 4.878661762386575e-07, "loss": 0.1494, "mean_token_accuracy": 0.966635344028473, "num_tokens": 7136808281.0, "step": 67400 }, { "entropy": 1.2134375, "epoch": 1.6514862151706575, "grad_norm": 3.921875, "learning_rate": 4.845518153219194e-07, "loss": 0.1536, "mean_token_accuracy": 0.9664989912509918, "num_tokens": 7141996551.0, "step": 67450 }, { "entropy": 1.2096875, "epoch": 1.6527104451300132, "grad_norm": 2.875, "learning_rate": 4.812477616533406e-07, "loss": 0.1517, "mean_token_accuracy": 0.9665413784980774, "num_tokens": 7146993092.0, "step": 67500 }, { "entropy": 1.209375, "epoch": 1.6539346750893689, "grad_norm": 3.3125, "learning_rate": 4.779540287715394e-07, "loss": 0.1583, "mean_token_accuracy": 0.965690256357193, "num_tokens": 7152324580.0, "step": 67550 }, { "entropy": 1.2259375, "epoch": 1.6551589050487243, "grad_norm": 3.828125, "learning_rate": 4.7467063017285005e-07, "loss": 0.1632, "mean_token_accuracy": 0.9648753714561462, "num_tokens": 7157642715.0, "step": 67600 }, { "entropy": 1.21328125, "epoch": 1.6563831350080798, "grad_norm": 3.4375, "learning_rate": 4.713975793112569e-07, "loss": 0.1542, "mean_token_accuracy": 0.9669430148601532, "num_tokens": 7162998030.0, "step": 67650 }, { "entropy": 1.185625, "epoch": 1.6576073649674354, "grad_norm": 3.953125, "learning_rate": 4.681348895983448e-07, "loss": 0.1379, "mean_token_accuracy": 0.9700025701522828, "num_tokens": 7167607013.0, "step": 67700 }, { "entropy": 1.2225, "epoch": 1.658831594926791, "grad_norm": 2.359375, "learning_rate": 4.648825744032449e-07, "loss": 0.1614, "mean_token_accuracy": 0.9637822723388672, "num_tokens": 7172916071.0, "step": 67750 }, { "entropy": 1.22109375, "epoch": 1.6600558248861468, "grad_norm": 0.003997802734375, "learning_rate": 4.6164064705257424e-07, "loss": 0.1604, "mean_token_accuracy": 0.9653963768482208, "num_tokens": 7178344100.0, "step": 67800 }, { "entropy": 1.21921875, "epoch": 1.6612800548455022, "grad_norm": 2.453125, "learning_rate": 4.584091208303891e-07, "loss": 0.1583, "mean_token_accuracy": 0.9654520618915557, "num_tokens": 7183547126.0, "step": 67850 }, { "entropy": 1.2121875, "epoch": 1.6625042848048577, "grad_norm": 1.7578125, "learning_rate": 4.5518800897812174e-07, "loss": 0.1521, "mean_token_accuracy": 0.9661059749126434, "num_tokens": 7188532212.0, "step": 67900 }, { "entropy": 1.2209375, "epoch": 1.6637285147642134, "grad_norm": 2.734375, "learning_rate": 4.519773246945349e-07, "loss": 0.1576, "mean_token_accuracy": 0.9657674777507782, "num_tokens": 7193693940.0, "step": 67950 }, { "entropy": 1.23375, "epoch": 1.664952744723569, "grad_norm": 2.953125, "learning_rate": 4.487770811356612e-07, "loss": 0.1664, "mean_token_accuracy": 0.9635096192359924, "num_tokens": 7199191726.0, "step": 68000 }, { "entropy": 1.21625, "epoch": 1.6661769746829245, "grad_norm": 3.03125, "learning_rate": 4.455872914147521e-07, "loss": 0.1614, "mean_token_accuracy": 0.965412712097168, "num_tokens": 7204740271.0, "step": 68050 }, { "entropy": 1.2178125, "epoch": 1.66740120464228, "grad_norm": 1.96875, "learning_rate": 4.424079686022223e-07, "loss": 0.1647, "mean_token_accuracy": 0.9641766202449799, "num_tokens": 7210407120.0, "step": 68100 }, { "entropy": 1.22875, "epoch": 1.6686254346016356, "grad_norm": 2.953125, "learning_rate": 4.39239125725601e-07, "loss": 0.162, "mean_token_accuracy": 0.9659585297107697, "num_tokens": 7215783474.0, "step": 68150 }, { "entropy": 1.226875, "epoch": 1.6698496645609913, "grad_norm": 2.4375, "learning_rate": 4.360807757694718e-07, "loss": 0.1626, "mean_token_accuracy": 0.9646227335929871, "num_tokens": 7220993281.0, "step": 68200 }, { "entropy": 1.19703125, "epoch": 1.6710738945203467, "grad_norm": 2.0625, "learning_rate": 4.329329316754236e-07, "loss": 0.1441, "mean_token_accuracy": 0.9685395467281341, "num_tokens": 7225810836.0, "step": 68250 }, { "entropy": 1.21875, "epoch": 1.6722981244797022, "grad_norm": 2.984375, "learning_rate": 4.2979560634199754e-07, "loss": 0.1688, "mean_token_accuracy": 0.9636458623409271, "num_tokens": 7231649459.0, "step": 68300 }, { "entropy": 1.19296875, "epoch": 1.6735223544390578, "grad_norm": 2.65625, "learning_rate": 4.266688126246311e-07, "loss": 0.1424, "mean_token_accuracy": 0.9688647317886353, "num_tokens": 7236848069.0, "step": 68350 }, { "entropy": 1.2278125, "epoch": 1.6747465843984135, "grad_norm": 3.203125, "learning_rate": 4.235525633356111e-07, "loss": 0.1676, "mean_token_accuracy": 0.963608900308609, "num_tokens": 7242384952.0, "step": 68400 }, { "entropy": 1.238125, "epoch": 1.675970814357769, "grad_norm": 3.359375, "learning_rate": 4.204468712440144e-07, "loss": 0.1653, "mean_token_accuracy": 0.9638743424415588, "num_tokens": 7247699380.0, "step": 68450 }, { "entropy": 1.21671875, "epoch": 1.6771950443171244, "grad_norm": 3.015625, "learning_rate": 4.1735174907566234e-07, "loss": 0.1507, "mean_token_accuracy": 0.9674655389785767, "num_tokens": 7252973599.0, "step": 68500 }, { "entropy": 1.2109375, "epoch": 1.67841927427648, "grad_norm": 2.4375, "learning_rate": 4.142672095130603e-07, "loss": 0.1488, "mean_token_accuracy": 0.9676065123081208, "num_tokens": 7257981736.0, "step": 68550 }, { "entropy": 1.2084375, "epoch": 1.6796435042358357, "grad_norm": 2.765625, "learning_rate": 4.111932651953554e-07, "loss": 0.1537, "mean_token_accuracy": 0.9668715631961823, "num_tokens": 7263067623.0, "step": 68600 }, { "entropy": 1.2253125, "epoch": 1.6808677341951912, "grad_norm": 1.953125, "learning_rate": 4.0812992871827737e-07, "loss": 0.1514, "mean_token_accuracy": 0.967187968492508, "num_tokens": 7268515412.0, "step": 68650 }, { "entropy": 1.2240625, "epoch": 1.6820919641545466, "grad_norm": 2.1875, "learning_rate": 4.0507721263409016e-07, "loss": 0.155, "mean_token_accuracy": 0.9657605230808258, "num_tokens": 7273767424.0, "step": 68700 }, { "entropy": 1.21890625, "epoch": 1.6833161941139023, "grad_norm": 2.078125, "learning_rate": 4.0203512945153874e-07, "loss": 0.1501, "mean_token_accuracy": 0.9671496486663819, "num_tokens": 7279187672.0, "step": 68750 }, { "entropy": 1.20953125, "epoch": 1.684540424073258, "grad_norm": 3.0625, "learning_rate": 3.990036916358014e-07, "loss": 0.1466, "mean_token_accuracy": 0.9685079550743103, "num_tokens": 7284104561.0, "step": 68800 }, { "entropy": 1.21328125, "epoch": 1.6857646540326134, "grad_norm": 4.0625, "learning_rate": 3.9598291160843393e-07, "loss": 0.1557, "mean_token_accuracy": 0.9655941009521485, "num_tokens": 7289492586.0, "step": 68850 }, { "entropy": 1.18875, "epoch": 1.686988883991969, "grad_norm": 1.875, "learning_rate": 3.929728017473213e-07, "loss": 0.14, "mean_token_accuracy": 0.969061805009842, "num_tokens": 7294671673.0, "step": 68900 }, { "entropy": 1.21671875, "epoch": 1.6882131139513246, "grad_norm": 1.578125, "learning_rate": 3.8997337438662893e-07, "loss": 0.1628, "mean_token_accuracy": 0.9643185365200043, "num_tokens": 7300014488.0, "step": 68950 }, { "entropy": 1.22359375, "epoch": 1.6894373439106802, "grad_norm": 0.01251220703125, "learning_rate": 3.869846418167452e-07, "loss": 0.1521, "mean_token_accuracy": 0.9664946186542511, "num_tokens": 7305132050.0, "step": 69000 }, { "entropy": 1.21640625, "epoch": 1.690661573870036, "grad_norm": 2.6875, "learning_rate": 3.840066162842405e-07, "loss": 0.1518, "mean_token_accuracy": 0.9676698422431946, "num_tokens": 7310341663.0, "step": 69050 }, { "entropy": 1.22984375, "epoch": 1.6918858038293914, "grad_norm": 3.625, "learning_rate": 3.8103930999180936e-07, "loss": 0.1685, "mean_token_accuracy": 0.963647495508194, "num_tokens": 7315713992.0, "step": 69100 }, { "entropy": 1.2271875, "epoch": 1.6931100337887468, "grad_norm": 2.5625, "learning_rate": 3.780827350982258e-07, "loss": 0.1558, "mean_token_accuracy": 0.9662664186954498, "num_tokens": 7321152260.0, "step": 69150 }, { "entropy": 1.21296875, "epoch": 1.6943342637481025, "grad_norm": 2.390625, "learning_rate": 3.751369037182869e-07, "loss": 0.1532, "mean_token_accuracy": 0.9662709140777588, "num_tokens": 7326190569.0, "step": 69200 }, { "entropy": 1.198125, "epoch": 1.6955584937074581, "grad_norm": 2.9375, "learning_rate": 3.722018279227728e-07, "loss": 0.1412, "mean_token_accuracy": 0.9689172983169556, "num_tokens": 7331368151.0, "step": 69250 }, { "entropy": 1.21125, "epoch": 1.6967827236668136, "grad_norm": 3.25, "learning_rate": 3.6927751973838777e-07, "loss": 0.1578, "mean_token_accuracy": 0.9661315476894379, "num_tokens": 7336566118.0, "step": 69300 }, { "entropy": 1.2215625, "epoch": 1.698006953626169, "grad_norm": 1.765625, "learning_rate": 3.66363991147716e-07, "loss": 0.1577, "mean_token_accuracy": 0.9653751969337463, "num_tokens": 7341728443.0, "step": 69350 }, { "entropy": 1.20796875, "epoch": 1.6992311835855247, "grad_norm": 2.53125, "learning_rate": 3.6346125408917155e-07, "loss": 0.1497, "mean_token_accuracy": 0.9668842852115631, "num_tokens": 7346956092.0, "step": 69400 }, { "entropy": 1.216875, "epoch": 1.7004554135448804, "grad_norm": 3.15625, "learning_rate": 3.605693204569506e-07, "loss": 0.1547, "mean_token_accuracy": 0.967246618270874, "num_tokens": 7352423947.0, "step": 69450 }, { "entropy": 1.2075, "epoch": 1.7016796435042358, "grad_norm": 2.46875, "learning_rate": 3.576882021009792e-07, "loss": 0.1489, "mean_token_accuracy": 0.9667674267292022, "num_tokens": 7357669096.0, "step": 69500 }, { "entropy": 1.19796875, "epoch": 1.7029038734635913, "grad_norm": 2.15625, "learning_rate": 3.5481791082686757e-07, "loss": 0.1421, "mean_token_accuracy": 0.9695830595493317, "num_tokens": 7362784518.0, "step": 69550 }, { "entropy": 1.2278125, "epoch": 1.704128103422947, "grad_norm": 2.15625, "learning_rate": 3.519584583958636e-07, "loss": 0.162, "mean_token_accuracy": 0.9651164734363555, "num_tokens": 7368275670.0, "step": 69600 }, { "entropy": 1.21578125, "epoch": 1.7053523333823026, "grad_norm": 2.640625, "learning_rate": 3.4910985652479757e-07, "loss": 0.1506, "mean_token_accuracy": 0.9667972207069397, "num_tokens": 7373607544.0, "step": 69650 }, { "entropy": 1.20625, "epoch": 1.706576563341658, "grad_norm": 4.71875, "learning_rate": 3.462721168860428e-07, "loss": 0.1492, "mean_token_accuracy": 0.9675750434398651, "num_tokens": 7378823181.0, "step": 69700 }, { "entropy": 1.2265625, "epoch": 1.7078007933010135, "grad_norm": 2.84375, "learning_rate": 3.4344525110746127e-07, "loss": 0.1603, "mean_token_accuracy": 0.965987560749054, "num_tokens": 7384384951.0, "step": 69750 }, { "entropy": 1.21953125, "epoch": 1.7090250232603692, "grad_norm": 1.640625, "learning_rate": 3.4062927077236106e-07, "loss": 0.1574, "mean_token_accuracy": 0.9660314428806305, "num_tokens": 7389942384.0, "step": 69800 }, { "entropy": 1.21640625, "epoch": 1.7102492532197249, "grad_norm": 2.109375, "learning_rate": 3.3782418741944244e-07, "loss": 0.1629, "mean_token_accuracy": 0.9638810443878174, "num_tokens": 7395323756.0, "step": 69850 }, { "entropy": 1.20765625, "epoch": 1.7114734831790803, "grad_norm": 2.625, "learning_rate": 3.350300125427578e-07, "loss": 0.1384, "mean_token_accuracy": 0.9689883410930633, "num_tokens": 7400575411.0, "step": 69900 }, { "entropy": 1.20546875, "epoch": 1.7126977131384358, "grad_norm": 3.109375, "learning_rate": 3.3224675759166026e-07, "loss": 0.1515, "mean_token_accuracy": 0.9666663575172424, "num_tokens": 7405984120.0, "step": 69950 }, { "entropy": 1.2203125, "epoch": 1.7139219430977914, "grad_norm": 2.328125, "learning_rate": 3.294744339707564e-07, "loss": 0.1566, "mean_token_accuracy": 0.9662071549892426, "num_tokens": 7411306216.0, "step": 70000 }, { "epoch": 1.7139219430977914, "eval_entropy": 1.2108072916666666, "eval_loss": 0.17756883800029755, "eval_mean_token_accuracy": 0.9620932574073474, "eval_num_tokens": 7411306216.0, "eval_runtime": 601.9385, "eval_samples_per_second": 16.042, "eval_steps_per_second": 0.201, "step": 70000 }, { "entropy": 1.21734375, "epoch": 1.7151461730571471, "grad_norm": 0.0033111572265625, "learning_rate": 3.2671305303986264e-07, "loss": 0.1546, "mean_token_accuracy": 0.9665888488292694, "num_tokens": 7416539172.0, "step": 70050 }, { "entropy": 1.21734375, "epoch": 1.7163704030165026, "grad_norm": 2.84375, "learning_rate": 3.23962626113956e-07, "loss": 0.151, "mean_token_accuracy": 0.9668701207637787, "num_tokens": 7421707836.0, "step": 70100 }, { "entropy": 1.20390625, "epoch": 1.7175946329758582, "grad_norm": 2.875, "learning_rate": 3.212231644631286e-07, "loss": 0.1522, "mean_token_accuracy": 0.967432736158371, "num_tokens": 7427044054.0, "step": 70150 }, { "entropy": 1.1990625, "epoch": 1.7188188629352137, "grad_norm": 2.234375, "learning_rate": 3.184946793125406e-07, "loss": 0.1454, "mean_token_accuracy": 0.9683572733402253, "num_tokens": 7432165156.0, "step": 70200 }, { "entropy": 1.22375, "epoch": 1.7200430928945694, "grad_norm": 3.15625, "learning_rate": 3.157771818423778e-07, "loss": 0.1574, "mean_token_accuracy": 0.9646234130859375, "num_tokens": 7437729163.0, "step": 70250 }, { "entropy": 1.2253125, "epoch": 1.721267322853925, "grad_norm": 1.78125, "learning_rate": 3.130706831877993e-07, "loss": 0.1583, "mean_token_accuracy": 0.965836591720581, "num_tokens": 7443255376.0, "step": 70300 }, { "entropy": 1.21734375, "epoch": 1.7224915528132805, "grad_norm": 3.8125, "learning_rate": 3.1037519443889927e-07, "loss": 0.1502, "mean_token_accuracy": 0.967227201461792, "num_tokens": 7448723374.0, "step": 70350 }, { "entropy": 1.1978125, "epoch": 1.723715782772636, "grad_norm": 2.15625, "learning_rate": 3.07690726640655e-07, "loss": 0.1386, "mean_token_accuracy": 0.9692979896068573, "num_tokens": 7453945048.0, "step": 70400 }, { "entropy": 1.21671875, "epoch": 1.7249400127319916, "grad_norm": 3.359375, "learning_rate": 3.050172907928872e-07, "loss": 0.1601, "mean_token_accuracy": 0.9648488080501556, "num_tokens": 7459709955.0, "step": 70450 }, { "entropy": 1.194375, "epoch": 1.7261642426913473, "grad_norm": 1.2109375, "learning_rate": 3.0235489785021073e-07, "loss": 0.1429, "mean_token_accuracy": 0.968617148399353, "num_tokens": 7464731391.0, "step": 70500 }, { "entropy": 1.21328125, "epoch": 1.7273884726507027, "grad_norm": 4.1875, "learning_rate": 2.997035587219911e-07, "loss": 0.1509, "mean_token_accuracy": 0.9667483043670654, "num_tokens": 7470148354.0, "step": 70550 }, { "entropy": 1.21015625, "epoch": 1.7286127026100582, "grad_norm": 2.890625, "learning_rate": 2.970632842723001e-07, "loss": 0.1537, "mean_token_accuracy": 0.9668030095100403, "num_tokens": 7475597114.0, "step": 70600 }, { "entropy": 1.21203125, "epoch": 1.7298369325694138, "grad_norm": 1.78125, "learning_rate": 2.944340853198715e-07, "loss": 0.1489, "mean_token_accuracy": 0.9677174651622772, "num_tokens": 7480924480.0, "step": 70650 }, { "entropy": 1.1978125, "epoch": 1.7310611625287695, "grad_norm": 2.578125, "learning_rate": 2.9181597263805703e-07, "loss": 0.1381, "mean_token_accuracy": 0.9692902910709381, "num_tokens": 7485944672.0, "step": 70700 }, { "entropy": 1.2234375, "epoch": 1.732285392488125, "grad_norm": 3.15625, "learning_rate": 2.8920895695478036e-07, "loss": 0.1575, "mean_token_accuracy": 0.9657765531539917, "num_tokens": 7491484223.0, "step": 70750 }, { "entropy": 1.21984375, "epoch": 1.7335096224474804, "grad_norm": 1.640625, "learning_rate": 2.866130489524946e-07, "loss": 0.1497, "mean_token_accuracy": 0.9674056577682495, "num_tokens": 7496915236.0, "step": 70800 }, { "entropy": 1.2109375, "epoch": 1.734733852406836, "grad_norm": 1.9375, "learning_rate": 2.8402825926813793e-07, "loss": 0.1541, "mean_token_accuracy": 0.9666642725467682, "num_tokens": 7502068005.0, "step": 70850 }, { "entropy": 1.22796875, "epoch": 1.7359580823661918, "grad_norm": 1.171875, "learning_rate": 2.814545984930923e-07, "loss": 0.1643, "mean_token_accuracy": 0.9640646266937256, "num_tokens": 7507947357.0, "step": 70900 }, { "entropy": 1.2171875, "epoch": 1.7371823123255472, "grad_norm": 3.78125, "learning_rate": 2.788920771731344e-07, "loss": 0.1515, "mean_token_accuracy": 0.96691251039505, "num_tokens": 7513464788.0, "step": 70950 }, { "entropy": 1.21421875, "epoch": 1.7384065422849027, "grad_norm": 2.828125, "learning_rate": 2.763407058083999e-07, "loss": 0.1562, "mean_token_accuracy": 0.9653972661495209, "num_tokens": 7518965009.0, "step": 71000 }, { "entropy": 1.22109375, "epoch": 1.7396307722442583, "grad_norm": 3.09375, "learning_rate": 2.738004948533338e-07, "loss": 0.1553, "mean_token_accuracy": 0.9661720776557923, "num_tokens": 7524509007.0, "step": 71050 }, { "entropy": 1.2178125, "epoch": 1.740855002203614, "grad_norm": 2.640625, "learning_rate": 2.712714547166534e-07, "loss": 0.1494, "mean_token_accuracy": 0.9680777621269226, "num_tokens": 7529983645.0, "step": 71100 }, { "entropy": 1.22078125, "epoch": 1.7420792321629694, "grad_norm": 2.640625, "learning_rate": 2.6875359576129975e-07, "loss": 0.1604, "mean_token_accuracy": 0.9644283270835876, "num_tokens": 7535464039.0, "step": 71150 }, { "entropy": 1.206875, "epoch": 1.743303462122325, "grad_norm": 1.609375, "learning_rate": 2.662469283043991e-07, "loss": 0.1434, "mean_token_accuracy": 0.9683542418479919, "num_tokens": 7540523414.0, "step": 71200 }, { "entropy": 1.214375, "epoch": 1.7445276920816806, "grad_norm": 2.953125, "learning_rate": 2.637514626172213e-07, "loss": 0.1549, "mean_token_accuracy": 0.9665893888473511, "num_tokens": 7545849728.0, "step": 71250 }, { "entropy": 1.2040625, "epoch": 1.7457519220410362, "grad_norm": 2.765625, "learning_rate": 2.6126720892513277e-07, "loss": 0.1487, "mean_token_accuracy": 0.9680774366855621, "num_tokens": 7551159210.0, "step": 71300 }, { "entropy": 1.19421875, "epoch": 1.746976152000392, "grad_norm": 2.640625, "learning_rate": 2.5879417740756093e-07, "loss": 0.1363, "mean_token_accuracy": 0.9701401054859161, "num_tokens": 7556078762.0, "step": 71350 }, { "entropy": 1.218125, "epoch": 1.7482003819597474, "grad_norm": 1.5625, "learning_rate": 2.563323781979482e-07, "loss": 0.1656, "mean_token_accuracy": 0.9642888736724854, "num_tokens": 7561736323.0, "step": 71400 }, { "entropy": 1.21859375, "epoch": 1.7494246119191028, "grad_norm": 1.9609375, "learning_rate": 2.5388182138371173e-07, "loss": 0.1517, "mean_token_accuracy": 0.966708824634552, "num_tokens": 7567328811.0, "step": 71450 }, { "entropy": 1.22109375, "epoch": 1.7506488418784585, "grad_norm": 2.3125, "learning_rate": 2.5144251700620135e-07, "loss": 0.1629, "mean_token_accuracy": 0.9650636351108551, "num_tokens": 7572752827.0, "step": 71500 }, { "entropy": 1.21, "epoch": 1.7518730718378142, "grad_norm": 2.78125, "learning_rate": 2.4901447506066133e-07, "loss": 0.1599, "mean_token_accuracy": 0.9643032836914063, "num_tokens": 7578362509.0, "step": 71550 }, { "entropy": 1.2090625, "epoch": 1.7530973017971696, "grad_norm": 1.6484375, "learning_rate": 2.465977054961852e-07, "loss": 0.1493, "mean_token_accuracy": 0.9673759829998017, "num_tokens": 7583839931.0, "step": 71600 }, { "entropy": 1.21171875, "epoch": 1.754321531756525, "grad_norm": 2.828125, "learning_rate": 2.441922182156775e-07, "loss": 0.1518, "mean_token_accuracy": 0.9662256014347076, "num_tokens": 7589236608.0, "step": 71650 }, { "entropy": 1.209375, "epoch": 1.7555457617158807, "grad_norm": 2.890625, "learning_rate": 2.4179802307581234e-07, "loss": 0.1495, "mean_token_accuracy": 0.9674426424503326, "num_tokens": 7594652077.0, "step": 71700 }, { "entropy": 1.20265625, "epoch": 1.7567699916752364, "grad_norm": 2.96875, "learning_rate": 2.394151298869952e-07, "loss": 0.1451, "mean_token_accuracy": 0.9673744821548462, "num_tokens": 7599701409.0, "step": 71750 }, { "entropy": 1.2153125, "epoch": 1.7579942216345918, "grad_norm": 2.71875, "learning_rate": 2.3704354841331932e-07, "loss": 0.1505, "mean_token_accuracy": 0.9669674754142761, "num_tokens": 7605091932.0, "step": 71800 }, { "entropy": 1.2065625, "epoch": 1.7592184515939473, "grad_norm": 2.1875, "learning_rate": 2.3468328837252628e-07, "loss": 0.1478, "mean_token_accuracy": 0.9676505529880524, "num_tokens": 7610186489.0, "step": 71850 }, { "entropy": 1.20890625, "epoch": 1.760442681553303, "grad_norm": 1.765625, "learning_rate": 2.3233435943597114e-07, "loss": 0.1503, "mean_token_accuracy": 0.9671880280971528, "num_tokens": 7615665531.0, "step": 71900 }, { "entropy": 1.20375, "epoch": 1.7616669115126586, "grad_norm": 2.453125, "learning_rate": 2.299967712285731e-07, "loss": 0.1423, "mean_token_accuracy": 0.9683215701580048, "num_tokens": 7620773654.0, "step": 71950 }, { "entropy": 1.19234375, "epoch": 1.762891141472014, "grad_norm": 3.140625, "learning_rate": 2.276705333287875e-07, "loss": 0.1315, "mean_token_accuracy": 0.9702609395980835, "num_tokens": 7625470551.0, "step": 72000 }, { "entropy": 1.21046875, "epoch": 1.7641153714313695, "grad_norm": 2.234375, "learning_rate": 2.253556552685573e-07, "loss": 0.1433, "mean_token_accuracy": 0.9681813132762909, "num_tokens": 7630517430.0, "step": 72050 }, { "entropy": 1.21, "epoch": 1.7653396013907252, "grad_norm": 3.125, "learning_rate": 2.2305214653327855e-07, "loss": 0.1406, "mean_token_accuracy": 0.9686529791355133, "num_tokens": 7635763079.0, "step": 72100 }, { "entropy": 1.201875, "epoch": 1.7665638313500809, "grad_norm": 1.703125, "learning_rate": 2.207600165617607e-07, "loss": 0.1475, "mean_token_accuracy": 0.9678330075740814, "num_tokens": 7641423146.0, "step": 72150 }, { "entropy": 1.176875, "epoch": 1.7677880613094363, "grad_norm": 1.8125, "learning_rate": 2.1847927474618846e-07, "loss": 0.1314, "mean_token_accuracy": 0.9702327287197113, "num_tokens": 7646275038.0, "step": 72200 }, { "entropy": 1.205, "epoch": 1.7690122912687918, "grad_norm": 1.515625, "learning_rate": 2.1620993043208182e-07, "loss": 0.1371, "mean_token_accuracy": 0.9702345824241638, "num_tokens": 7651591457.0, "step": 72250 }, { "entropy": 1.2225, "epoch": 1.7702365212281475, "grad_norm": 1.6796875, "learning_rate": 2.139519929182585e-07, "loss": 0.1507, "mean_token_accuracy": 0.9666866302490235, "num_tokens": 7656975261.0, "step": 72300 }, { "entropy": 1.1996875, "epoch": 1.7714607511875031, "grad_norm": 2.46875, "learning_rate": 2.1170547145679665e-07, "loss": 0.1492, "mean_token_accuracy": 0.966531822681427, "num_tokens": 7662430438.0, "step": 72350 }, { "entropy": 1.21703125, "epoch": 1.7726849811468586, "grad_norm": 1.8203125, "learning_rate": 2.0947037525299606e-07, "loss": 0.1501, "mean_token_accuracy": 0.9673058640956879, "num_tokens": 7667987024.0, "step": 72400 }, { "entropy": 1.20890625, "epoch": 1.7739092111062142, "grad_norm": 2.640625, "learning_rate": 2.0724671346533975e-07, "loss": 0.1483, "mean_token_accuracy": 0.9672919237613677, "num_tokens": 7673092874.0, "step": 72450 }, { "entropy": 1.21171875, "epoch": 1.7751334410655697, "grad_norm": 2.421875, "learning_rate": 2.0503449520545814e-07, "loss": 0.1454, "mean_token_accuracy": 0.9677470910549164, "num_tokens": 7678350890.0, "step": 72500 }, { "entropy": 1.21125, "epoch": 1.7763576710249254, "grad_norm": 4.03125, "learning_rate": 2.0283372953809187e-07, "loss": 0.1506, "mean_token_accuracy": 0.9673129177093506, "num_tokens": 7683768054.0, "step": 72550 }, { "entropy": 1.19046875, "epoch": 1.777581900984281, "grad_norm": 0.010009765625, "learning_rate": 2.0064442548105078e-07, "loss": 0.1311, "mean_token_accuracy": 0.9706909394264222, "num_tokens": 7688732517.0, "step": 72600 }, { "entropy": 1.20234375, "epoch": 1.7788061309436365, "grad_norm": 2.625, "learning_rate": 1.9846659200518323e-07, "loss": 0.1443, "mean_token_accuracy": 0.9685131824016571, "num_tokens": 7693833105.0, "step": 72650 }, { "entropy": 1.1996875, "epoch": 1.780030360902992, "grad_norm": 2.8125, "learning_rate": 1.963002380343336e-07, "loss": 0.1372, "mean_token_accuracy": 0.9696123468875885, "num_tokens": 7698671416.0, "step": 72700 }, { "entropy": 1.2096875, "epoch": 1.7812545908623476, "grad_norm": 3.46875, "learning_rate": 1.9414537244530883e-07, "loss": 0.1447, "mean_token_accuracy": 0.9681323492527008, "num_tokens": 7704099695.0, "step": 72750 }, { "entropy": 1.209375, "epoch": 1.7824788208217033, "grad_norm": 3.8125, "learning_rate": 1.9200200406784084e-07, "loss": 0.1471, "mean_token_accuracy": 0.9671408832073212, "num_tokens": 7709413054.0, "step": 72800 }, { "entropy": 1.22046875, "epoch": 1.7837030507810587, "grad_norm": 2.375, "learning_rate": 1.8987014168455263e-07, "loss": 0.1513, "mean_token_accuracy": 0.9667081344127655, "num_tokens": 7714999778.0, "step": 72850 }, { "entropy": 1.21765625, "epoch": 1.7849272807404142, "grad_norm": 1.59375, "learning_rate": 1.8774979403091852e-07, "loss": 0.1467, "mean_token_accuracy": 0.9685576283931732, "num_tokens": 7720722054.0, "step": 72900 }, { "entropy": 1.18796875, "epoch": 1.7861515106997699, "grad_norm": 3.015625, "learning_rate": 1.8564096979523027e-07, "loss": 0.1448, "mean_token_accuracy": 0.9685378670692444, "num_tokens": 7726037284.0, "step": 72950 }, { "entropy": 1.21359375, "epoch": 1.7873757406591255, "grad_norm": 2.75, "learning_rate": 1.835436776185634e-07, "loss": 0.1305, "mean_token_accuracy": 0.9697797727584839, "num_tokens": 7731254143.0, "step": 73000 }, { "entropy": 1.189375, "epoch": 1.788599970618481, "grad_norm": 2.71875, "learning_rate": 1.814579260947379e-07, "loss": 0.1367, "mean_token_accuracy": 0.969087952375412, "num_tokens": 7736558719.0, "step": 73050 }, { "entropy": 1.20109375, "epoch": 1.7898242005778364, "grad_norm": 2.640625, "learning_rate": 1.7938372377028622e-07, "loss": 0.1265, "mean_token_accuracy": 0.9715298664569855, "num_tokens": 7741441296.0, "step": 73100 }, { "entropy": 1.1953125, "epoch": 1.791048430537192, "grad_norm": 2.078125, "learning_rate": 1.773210791444161e-07, "loss": 0.131, "mean_token_accuracy": 0.9706771004199982, "num_tokens": 7746461885.0, "step": 73150 }, { "entropy": 1.2090625, "epoch": 1.7922726604965478, "grad_norm": 3.375, "learning_rate": 1.7527000066897837e-07, "loss": 0.1469, "mean_token_accuracy": 0.9673126399517059, "num_tokens": 7752002392.0, "step": 73200 }, { "entropy": 1.1975, "epoch": 1.7934968904559032, "grad_norm": 1.5, "learning_rate": 1.7323049674842783e-07, "loss": 0.1437, "mean_token_accuracy": 0.9683597016334534, "num_tokens": 7756991548.0, "step": 73250 }, { "entropy": 1.2171875, "epoch": 1.7947211204152587, "grad_norm": 2.046875, "learning_rate": 1.7120257573979492e-07, "loss": 0.1454, "mean_token_accuracy": 0.968316274881363, "num_tokens": 7762203324.0, "step": 73300 }, { "entropy": 1.1959375, "epoch": 1.7959453503746143, "grad_norm": 2.109375, "learning_rate": 1.6918624595264597e-07, "loss": 0.1366, "mean_token_accuracy": 0.9702933692932129, "num_tokens": 7767460924.0, "step": 73350 }, { "entropy": 1.199375, "epoch": 1.79716958033397, "grad_norm": 2.265625, "learning_rate": 1.671815156490517e-07, "loss": 0.143, "mean_token_accuracy": 0.9685783159732818, "num_tokens": 7772824486.0, "step": 73400 }, { "entropy": 1.21921875, "epoch": 1.7983938102933255, "grad_norm": 2.953125, "learning_rate": 1.651883930435535e-07, "loss": 0.1362, "mean_token_accuracy": 0.9696711504459381, "num_tokens": 7778088634.0, "step": 73450 }, { "entropy": 1.2078125, "epoch": 1.799618040252681, "grad_norm": 0.004302978515625, "learning_rate": 1.6320688630312908e-07, "loss": 0.1363, "mean_token_accuracy": 0.9695776212215423, "num_tokens": 7783380087.0, "step": 73500 }, { "entropy": 1.22859375, "epoch": 1.8008422702120366, "grad_norm": 1.5625, "learning_rate": 1.6123700354716032e-07, "loss": 0.1559, "mean_token_accuracy": 0.9663217055797577, "num_tokens": 7789343726.0, "step": 73550 }, { "entropy": 1.21328125, "epoch": 1.8020665001713922, "grad_norm": 1.65625, "learning_rate": 1.5927875284739546e-07, "loss": 0.1356, "mean_token_accuracy": 0.9702400255203247, "num_tokens": 7794792440.0, "step": 73600 }, { "entropy": 1.21484375, "epoch": 1.803290730130748, "grad_norm": 1.71875, "learning_rate": 1.5733214222792392e-07, "loss": 0.1418, "mean_token_accuracy": 0.9687067580223083, "num_tokens": 7800254887.0, "step": 73650 }, { "entropy": 1.21421875, "epoch": 1.8045149600901034, "grad_norm": 3.625, "learning_rate": 1.5539717966513623e-07, "loss": 0.1361, "mean_token_accuracy": 0.969369399547577, "num_tokens": 7805607043.0, "step": 73700 }, { "entropy": 1.20984375, "epoch": 1.8057391900494588, "grad_norm": 2.609375, "learning_rate": 1.5347387308769478e-07, "loss": 0.1326, "mean_token_accuracy": 0.9703532266616821, "num_tokens": 7810964969.0, "step": 73750 }, { "entropy": 1.20515625, "epoch": 1.8069634200088145, "grad_norm": 2.234375, "learning_rate": 1.5156223037649985e-07, "loss": 0.1506, "mean_token_accuracy": 0.9663440334796906, "num_tokens": 7816484836.0, "step": 73800 }, { "entropy": 1.1890625, "epoch": 1.8081876499681702, "grad_norm": 3.03125, "learning_rate": 1.4966225936465993e-07, "loss": 0.1304, "mean_token_accuracy": 0.9708381593227386, "num_tokens": 7821459721.0, "step": 73850 }, { "entropy": 1.19953125, "epoch": 1.8094118799275256, "grad_norm": 2.1875, "learning_rate": 1.4777396783745612e-07, "loss": 0.128, "mean_token_accuracy": 0.9713588643074036, "num_tokens": 7826287539.0, "step": 73900 }, { "entropy": 1.1978125, "epoch": 1.810636109886881, "grad_norm": 2.15625, "learning_rate": 1.4589736353231308e-07, "loss": 0.1202, "mean_token_accuracy": 0.9729771482944488, "num_tokens": 7831387963.0, "step": 73950 }, { "entropy": 1.195, "epoch": 1.8118603398462367, "grad_norm": 2.296875, "learning_rate": 1.4403245413876486e-07, "loss": 0.1344, "mean_token_accuracy": 0.9699731683731079, "num_tokens": 7836315700.0, "step": 74000 }, { "entropy": 1.18796875, "epoch": 1.8130845698055924, "grad_norm": 2.296875, "learning_rate": 1.4217924729842513e-07, "loss": 0.1381, "mean_token_accuracy": 0.9699892640113831, "num_tokens": 7841453471.0, "step": 74050 }, { "entropy": 1.2075, "epoch": 1.8143087997649479, "grad_norm": 2.3125, "learning_rate": 1.403377506049569e-07, "loss": 0.1451, "mean_token_accuracy": 0.9681575572490693, "num_tokens": 7846798475.0, "step": 74100 }, { "entropy": 1.1890625, "epoch": 1.8155330297243033, "grad_norm": 3.328125, "learning_rate": 1.385079716040376e-07, "loss": 0.1253, "mean_token_accuracy": 0.9720281398296357, "num_tokens": 7851768429.0, "step": 74150 }, { "entropy": 1.19671875, "epoch": 1.816757259683659, "grad_norm": 2.40625, "learning_rate": 1.3668991779333308e-07, "loss": 0.1218, "mean_token_accuracy": 0.9725555181503296, "num_tokens": 7856881793.0, "step": 74200 }, { "entropy": 1.19890625, "epoch": 1.8179814896430146, "grad_norm": 1.8984375, "learning_rate": 1.3488359662246087e-07, "loss": 0.1272, "mean_token_accuracy": 0.9715735244750977, "num_tokens": 7861890257.0, "step": 74250 }, { "entropy": 1.20390625, "epoch": 1.81920571960237, "grad_norm": 1.90625, "learning_rate": 1.3308901549296604e-07, "loss": 0.1275, "mean_token_accuracy": 0.9717478513717651, "num_tokens": 7867074576.0, "step": 74300 }, { "entropy": 1.20203125, "epoch": 1.8204299495617255, "grad_norm": 2.46875, "learning_rate": 1.3130618175828713e-07, "loss": 0.1367, "mean_token_accuracy": 0.9701256167888641, "num_tokens": 7872381109.0, "step": 74350 }, { "entropy": 1.20828125, "epoch": 1.8216541795210812, "grad_norm": 3.359375, "learning_rate": 1.2953510272372647e-07, "loss": 0.1287, "mean_token_accuracy": 0.9719671607017517, "num_tokens": 7877881928.0, "step": 74400 }, { "entropy": 1.199375, "epoch": 1.822878409480437, "grad_norm": 2.59375, "learning_rate": 1.2777578564641969e-07, "loss": 0.1309, "mean_token_accuracy": 0.9707298684120178, "num_tokens": 7882820168.0, "step": 74450 }, { "entropy": 1.21734375, "epoch": 1.8241026394397923, "grad_norm": 2.546875, "learning_rate": 1.2602823773530915e-07, "loss": 0.1426, "mean_token_accuracy": 0.9688560748100281, "num_tokens": 7888372934.0, "step": 74500 }, { "entropy": 1.2046875, "epoch": 1.8253268693991478, "grad_norm": 2.703125, "learning_rate": 1.2429246615111024e-07, "loss": 0.1331, "mean_token_accuracy": 0.970300270318985, "num_tokens": 7893801088.0, "step": 74550 }, { "entropy": 1.21171875, "epoch": 1.8265510993585035, "grad_norm": 2.03125, "learning_rate": 1.2256847800628425e-07, "loss": 0.1223, "mean_token_accuracy": 0.973189731836319, "num_tokens": 7898852778.0, "step": 74600 }, { "entropy": 1.20671875, "epoch": 1.8277753293178591, "grad_norm": 2.078125, "learning_rate": 1.2085628036501007e-07, "loss": 0.123, "mean_token_accuracy": 0.9726410353183746, "num_tokens": 7903818883.0, "step": 74650 }, { "entropy": 1.19265625, "epoch": 1.8289995592772146, "grad_norm": 3.21875, "learning_rate": 1.1915588024315194e-07, "loss": 0.1278, "mean_token_accuracy": 0.9702788054943084, "num_tokens": 7908897679.0, "step": 74700 }, { "entropy": 1.20984375, "epoch": 1.83022378923657, "grad_norm": 3.15625, "learning_rate": 1.1746728460823508e-07, "loss": 0.1303, "mean_token_accuracy": 0.9711257565021515, "num_tokens": 7914006448.0, "step": 74750 }, { "entropy": 1.2140625, "epoch": 1.8314480191959257, "grad_norm": 1.9609375, "learning_rate": 1.1579050037941275e-07, "loss": 0.1362, "mean_token_accuracy": 0.969500253200531, "num_tokens": 7919510157.0, "step": 74800 }, { "entropy": 1.21421875, "epoch": 1.8326722491552814, "grad_norm": 2.40625, "learning_rate": 1.1412553442744255e-07, "loss": 0.132, "mean_token_accuracy": 0.970678209066391, "num_tokens": 7924726404.0, "step": 74850 }, { "entropy": 1.1996875, "epoch": 1.833896479114637, "grad_norm": 2.703125, "learning_rate": 1.1247239357465255e-07, "loss": 0.13, "mean_token_accuracy": 0.9713816094398499, "num_tokens": 7929934384.0, "step": 74900 }, { "entropy": 1.18921875, "epoch": 1.8351207090739925, "grad_norm": 1.9921875, "learning_rate": 1.1083108459491986e-07, "loss": 0.1256, "mean_token_accuracy": 0.9721748220920563, "num_tokens": 7935196457.0, "step": 74950 }, { "entropy": 1.2003125, "epoch": 1.836344939033348, "grad_norm": 2.703125, "learning_rate": 1.0920161421363773e-07, "loss": 0.119, "mean_token_accuracy": 0.9733594739437104, "num_tokens": 7940201367.0, "step": 75000 }, { "entropy": 1.22375, "epoch": 1.8375691689927036, "grad_norm": 1.7265625, "learning_rate": 1.0758398910768951e-07, "loss": 0.1373, "mean_token_accuracy": 0.9692693221569061, "num_tokens": 7945635438.0, "step": 75050 }, { "entropy": 1.20890625, "epoch": 1.8387933989520593, "grad_norm": 1.546875, "learning_rate": 1.0597821590542211e-07, "loss": 0.1282, "mean_token_accuracy": 0.9722434699535369, "num_tokens": 7951091367.0, "step": 75100 }, { "entropy": 1.18828125, "epoch": 1.8400176289114147, "grad_norm": 0.004425048828125, "learning_rate": 1.0438430118661924e-07, "loss": 0.124, "mean_token_accuracy": 0.9725795328617096, "num_tokens": 7956255217.0, "step": 75150 }, { "entropy": 1.1903125, "epoch": 1.8412418588707702, "grad_norm": 1.921875, "learning_rate": 1.0280225148247213e-07, "loss": 0.1179, "mean_token_accuracy": 0.9743827605247497, "num_tokens": 7961236486.0, "step": 75200 }, { "entropy": 1.1996875, "epoch": 1.8424660888301259, "grad_norm": 1.640625, "learning_rate": 1.0123207327555462e-07, "loss": 0.1156, "mean_token_accuracy": 0.9743783438205719, "num_tokens": 7966324215.0, "step": 75250 }, { "entropy": 1.2090625, "epoch": 1.8436903187894815, "grad_norm": 1.71875, "learning_rate": 9.967377299979708e-08, "loss": 0.134, "mean_token_accuracy": 0.9705902481079102, "num_tokens": 7971817863.0, "step": 75300 }, { "entropy": 1.19578125, "epoch": 1.844914548748837, "grad_norm": 2.15625, "learning_rate": 9.812735704045684e-08, "loss": 0.1185, "mean_token_accuracy": 0.9737985277175903, "num_tokens": 7977008142.0, "step": 75350 }, { "entropy": 1.190625, "epoch": 1.8461387787081924, "grad_norm": 1.75, "learning_rate": 9.65928317340975e-08, "loss": 0.1201, "mean_token_accuracy": 0.9731456315517426, "num_tokens": 7982011592.0, "step": 75400 }, { "entropy": 1.20875, "epoch": 1.847363008667548, "grad_norm": 1.765625, "learning_rate": 9.507020336855632e-08, "loss": 0.1221, "mean_token_accuracy": 0.9724456059932709, "num_tokens": 7987367141.0, "step": 75450 }, { "entropy": 1.20234375, "epoch": 1.8485872386269038, "grad_norm": 1.625, "learning_rate": 9.355947818292554e-08, "loss": 0.1149, "mean_token_accuracy": 0.9738513994216919, "num_tokens": 7992500198.0, "step": 75500 }, { "entropy": 1.21625, "epoch": 1.8498114685862592, "grad_norm": 1.78125, "learning_rate": 9.206066236751943e-08, "loss": 0.1328, "mean_token_accuracy": 0.9707795882225037, "num_tokens": 7998217427.0, "step": 75550 }, { "entropy": 1.1975, "epoch": 1.8510356985456147, "grad_norm": 2.125, "learning_rate": 9.057376206385559e-08, "loss": 0.1175, "mean_token_accuracy": 0.9741839158535004, "num_tokens": 8003308568.0, "step": 75600 }, { "entropy": 1.1878125, "epoch": 1.8522599285049703, "grad_norm": 3.21875, "learning_rate": 8.90987833646254e-08, "loss": 0.1077, "mean_token_accuracy": 0.9759363722801209, "num_tokens": 8008259087.0, "step": 75650 }, { "entropy": 1.20125, "epoch": 1.853484158464326, "grad_norm": 2.109375, "learning_rate": 8.763573231367062e-08, "loss": 0.1256, "mean_token_accuracy": 0.9727174258232116, "num_tokens": 8013653351.0, "step": 75700 }, { "entropy": 1.20078125, "epoch": 1.8547083884236815, "grad_norm": 2.765625, "learning_rate": 8.618461490595975e-08, "loss": 0.1214, "mean_token_accuracy": 0.9735188388824463, "num_tokens": 8018956628.0, "step": 75750 }, { "entropy": 1.209375, "epoch": 1.855932618383037, "grad_norm": 2.84375, "learning_rate": 8.474543708756044e-08, "loss": 0.1225, "mean_token_accuracy": 0.9721533727645874, "num_tokens": 8024197226.0, "step": 75800 }, { "entropy": 1.19015625, "epoch": 1.8571568483423926, "grad_norm": 0.005462646484375, "learning_rate": 8.33182047556178e-08, "loss": 0.1076, "mean_token_accuracy": 0.9760002064704895, "num_tokens": 8029024717.0, "step": 75850 }, { "entropy": 1.1953125, "epoch": 1.8583810783017483, "grad_norm": 1.640625, "learning_rate": 8.190292375832975e-08, "loss": 0.1274, "mean_token_accuracy": 0.971969587802887, "num_tokens": 8034254868.0, "step": 75900 }, { "entropy": 1.20546875, "epoch": 1.859605308261104, "grad_norm": 2.78125, "learning_rate": 8.049959989492239e-08, "loss": 0.1248, "mean_token_accuracy": 0.9728272747993469, "num_tokens": 8039555218.0, "step": 75950 }, { "entropy": 1.21359375, "epoch": 1.8608295382204594, "grad_norm": 1.640625, "learning_rate": 7.910823891562536e-08, "loss": 0.131, "mean_token_accuracy": 0.9710195803642273, "num_tokens": 8044915571.0, "step": 76000 }, { "entropy": 1.19625, "epoch": 1.8620537681798148, "grad_norm": 1.6953125, "learning_rate": 7.77288465216518e-08, "loss": 0.1189, "mean_token_accuracy": 0.9735661280155182, "num_tokens": 8050222763.0, "step": 76050 }, { "entropy": 1.1953125, "epoch": 1.8632779981391705, "grad_norm": 2.375, "learning_rate": 7.636142836517013e-08, "loss": 0.1211, "mean_token_accuracy": 0.9737051403522492, "num_tokens": 8055473678.0, "step": 76100 }, { "entropy": 1.196875, "epoch": 1.8645022280985262, "grad_norm": 1.6796875, "learning_rate": 7.500599004928565e-08, "loss": 0.1122, "mean_token_accuracy": 0.974678498506546, "num_tokens": 8060311800.0, "step": 76150 }, { "entropy": 1.18984375, "epoch": 1.8657264580578816, "grad_norm": 2.5, "learning_rate": 7.36625371280133e-08, "loss": 0.1164, "mean_token_accuracy": 0.9736955296993256, "num_tokens": 8065567322.0, "step": 76200 }, { "entropy": 1.211875, "epoch": 1.866950688017237, "grad_norm": 2.109375, "learning_rate": 7.233107510625858e-08, "loss": 0.1262, "mean_token_accuracy": 0.9716404461860657, "num_tokens": 8070882224.0, "step": 76250 }, { "entropy": 1.20234375, "epoch": 1.8681749179765927, "grad_norm": 1.65625, "learning_rate": 7.101160943979201e-08, "loss": 0.1242, "mean_token_accuracy": 0.9728803491592407, "num_tokens": 8075963376.0, "step": 76300 }, { "entropy": 1.20921875, "epoch": 1.8693991479359484, "grad_norm": 1.625, "learning_rate": 6.970414553522842e-08, "loss": 0.1223, "mean_token_accuracy": 0.9728834819793701, "num_tokens": 8081448166.0, "step": 76350 }, { "entropy": 1.1978125, "epoch": 1.8706233778953039, "grad_norm": 2.78125, "learning_rate": 6.840868875000561e-08, "loss": 0.1146, "mean_token_accuracy": 0.9747687363624573, "num_tokens": 8086285902.0, "step": 76400 }, { "entropy": 1.200625, "epoch": 1.8718476078546593, "grad_norm": 2.765625, "learning_rate": 6.712524439235978e-08, "loss": 0.1171, "mean_token_accuracy": 0.9743122577667236, "num_tokens": 8091436927.0, "step": 76450 }, { "entropy": 1.211875, "epoch": 1.873071837814015, "grad_norm": 2.078125, "learning_rate": 6.585381772130584e-08, "loss": 0.1327, "mean_token_accuracy": 0.9712537932395935, "num_tokens": 8097048708.0, "step": 76500 }, { "entropy": 1.2128125, "epoch": 1.8742960677733707, "grad_norm": 2.703125, "learning_rate": 6.459441394661536e-08, "loss": 0.1342, "mean_token_accuracy": 0.9702994549274444, "num_tokens": 8102302631.0, "step": 76550 }, { "entropy": 1.20875, "epoch": 1.875520297732726, "grad_norm": 1.7890625, "learning_rate": 6.334703822879506e-08, "loss": 0.1337, "mean_token_accuracy": 0.970585721731186, "num_tokens": 8107702374.0, "step": 76600 }, { "entropy": 1.208125, "epoch": 1.8767445276920816, "grad_norm": 2.359375, "learning_rate": 6.211169567906572e-08, "loss": 0.1419, "mean_token_accuracy": 0.9687972629070282, "num_tokens": 8113119431.0, "step": 76650 }, { "entropy": 1.20546875, "epoch": 1.8779687576514372, "grad_norm": 3.0625, "learning_rate": 6.08883913593412e-08, "loss": 0.1354, "mean_token_accuracy": 0.9701398539543152, "num_tokens": 8118309412.0, "step": 76700 }, { "entropy": 1.19796875, "epoch": 1.879192987610793, "grad_norm": 2.546875, "learning_rate": 5.967713028220756e-08, "loss": 0.1334, "mean_token_accuracy": 0.9713104116916657, "num_tokens": 8123346693.0, "step": 76750 }, { "entropy": 1.2065625, "epoch": 1.8804172175701483, "grad_norm": 2.46875, "learning_rate": 5.8477917410903914e-08, "loss": 0.1449, "mean_token_accuracy": 0.968209480047226, "num_tokens": 8128745782.0, "step": 76800 }, { "entropy": 1.19703125, "epoch": 1.8816414475295038, "grad_norm": 2.78125, "learning_rate": 5.729075765929925e-08, "loss": 0.1602, "mean_token_accuracy": 0.9653090810775757, "num_tokens": 8133734566.0, "step": 76850 }, { "entropy": 1.2078125, "epoch": 1.8828656774888595, "grad_norm": 3.046875, "learning_rate": 5.61156558918744e-08, "loss": 0.1748, "mean_token_accuracy": 0.9636254405975342, "num_tokens": 8139112182.0, "step": 76900 }, { "entropy": 1.19765625, "epoch": 1.8840899074482151, "grad_norm": 3.125, "learning_rate": 5.4952616923703014e-08, "loss": 0.1508, "mean_token_accuracy": 0.9667049193382263, "num_tokens": 8144120297.0, "step": 76950 }, { "entropy": 1.20921875, "epoch": 1.8853141374075706, "grad_norm": 2.8125, "learning_rate": 5.380164552042832e-08, "loss": 0.1581, "mean_token_accuracy": 0.9663659358024597, "num_tokens": 8149360110.0, "step": 77000 }, { "entropy": 1.2215625, "epoch": 1.886538367366926, "grad_norm": 2.046875, "learning_rate": 5.266274639824742e-08, "loss": 0.1807, "mean_token_accuracy": 0.9613511979579925, "num_tokens": 8154930968.0, "step": 77050 }, { "entropy": 1.1940625, "epoch": 1.8877625973262817, "grad_norm": 3.390625, "learning_rate": 5.1535924223889305e-08, "loss": 0.1593, "mean_token_accuracy": 0.9654444575309753, "num_tokens": 8159971112.0, "step": 77100 }, { "entropy": 1.2128125, "epoch": 1.8889868272856374, "grad_norm": 3.328125, "learning_rate": 5.042118361459724e-08, "loss": 0.1693, "mean_token_accuracy": 0.964167617559433, "num_tokens": 8165136464.0, "step": 77150 }, { "entropy": 1.20234375, "epoch": 1.890211057244993, "grad_norm": 2.84375, "learning_rate": 4.931852913810875e-08, "loss": 0.1597, "mean_token_accuracy": 0.9660988628864289, "num_tokens": 8170440548.0, "step": 77200 }, { "entropy": 1.2046875, "epoch": 1.8914352872043485, "grad_norm": 2.71875, "learning_rate": 4.822796531263862e-08, "loss": 0.163, "mean_token_accuracy": 0.9647459161281585, "num_tokens": 8175965156.0, "step": 77250 }, { "entropy": 1.21484375, "epoch": 1.892659517163704, "grad_norm": 3.09375, "learning_rate": 4.7149496606857966e-08, "loss": 0.1777, "mean_token_accuracy": 0.9630069530010223, "num_tokens": 8181436041.0, "step": 77300 }, { "entropy": 1.20734375, "epoch": 1.8938837471230596, "grad_norm": 3.359375, "learning_rate": 4.608312743987819e-08, "loss": 0.1646, "mean_token_accuracy": 0.9651682090759277, "num_tokens": 8186577107.0, "step": 77350 }, { "entropy": 1.2134375, "epoch": 1.8951079770824153, "grad_norm": 4.21875, "learning_rate": 4.50288621812307e-08, "loss": 0.1701, "mean_token_accuracy": 0.9638711404800415, "num_tokens": 8191908989.0, "step": 77400 }, { "entropy": 1.1978125, "epoch": 1.8963322070417707, "grad_norm": 2.921875, "learning_rate": 4.398670515085157e-08, "loss": 0.1672, "mean_token_accuracy": 0.964127391576767, "num_tokens": 8197252149.0, "step": 77450 }, { "entropy": 1.2015625, "epoch": 1.8975564370011262, "grad_norm": 2.75, "learning_rate": 4.295666061906156e-08, "loss": 0.1741, "mean_token_accuracy": 0.9626425766944885, "num_tokens": 8202870180.0, "step": 77500 }, { "entropy": 1.20109375, "epoch": 1.8987806669604819, "grad_norm": 4.0625, "learning_rate": 4.193873280654914e-08, "loss": 0.1645, "mean_token_accuracy": 0.964863383769989, "num_tokens": 8208065173.0, "step": 77550 }, { "entropy": 1.20234375, "epoch": 1.9000048969198375, "grad_norm": 2.28125, "learning_rate": 4.093292588435549e-08, "loss": 0.1605, "mean_token_accuracy": 0.965006741285324, "num_tokens": 8213242226.0, "step": 77600 }, { "entropy": 1.20734375, "epoch": 1.901229126879193, "grad_norm": 2.0, "learning_rate": 3.993924397385251e-08, "loss": 0.1693, "mean_token_accuracy": 0.9635647284984589, "num_tokens": 8218628064.0, "step": 77650 }, { "entropy": 1.21203125, "epoch": 1.9024533568385484, "grad_norm": 3.09375, "learning_rate": 3.895769114673187e-08, "loss": 0.1657, "mean_token_accuracy": 0.9649439096450806, "num_tokens": 8223851321.0, "step": 77700 }, { "entropy": 1.18859375, "epoch": 1.903677586797904, "grad_norm": 1.8203125, "learning_rate": 3.798827142498329e-08, "loss": 0.1508, "mean_token_accuracy": 0.9679539859294891, "num_tokens": 8228778299.0, "step": 77750 }, { "entropy": 1.20296875, "epoch": 1.9049018167572598, "grad_norm": 3.0625, "learning_rate": 3.7030988780880957e-08, "loss": 0.1541, "mean_token_accuracy": 0.966580958366394, "num_tokens": 8233727662.0, "step": 77800 }, { "entropy": 1.21453125, "epoch": 1.9061260467166152, "grad_norm": 2.046875, "learning_rate": 3.6085847136966164e-08, "loss": 0.1622, "mean_token_accuracy": 0.9650613677501678, "num_tokens": 8239365249.0, "step": 77850 }, { "entropy": 1.22, "epoch": 1.9073502766759707, "grad_norm": 2.546875, "learning_rate": 3.515285036603233e-08, "loss": 0.1736, "mean_token_accuracy": 0.9626342761516571, "num_tokens": 8244922468.0, "step": 77900 }, { "entropy": 1.21125, "epoch": 1.9085745066353264, "grad_norm": 2.65625, "learning_rate": 3.423200229110701e-08, "loss": 0.1665, "mean_token_accuracy": 0.9643622922897339, "num_tokens": 8250033392.0, "step": 77950 }, { "entropy": 1.20125, "epoch": 1.909798736594682, "grad_norm": 3.546875, "learning_rate": 3.3323306685437926e-08, "loss": 0.1587, "mean_token_accuracy": 0.9665237700939179, "num_tokens": 8255293579.0, "step": 78000 }, { "entropy": 1.189375, "epoch": 1.9110229665540375, "grad_norm": 3.296875, "learning_rate": 3.242676727247795e-08, "loss": 0.146, "mean_token_accuracy": 0.9674337708950043, "num_tokens": 8260317228.0, "step": 78050 }, { "entropy": 1.2103125, "epoch": 1.912247196513393, "grad_norm": 4.0, "learning_rate": 3.1542387725868146e-08, "loss": 0.1651, "mean_token_accuracy": 0.9643155598640442, "num_tokens": 8265716396.0, "step": 78100 }, { "entropy": 1.20078125, "epoch": 1.9134714264727486, "grad_norm": 2.453125, "learning_rate": 3.0670171669423764e-08, "loss": 0.1625, "mean_token_accuracy": 0.9650612294673919, "num_tokens": 8270999547.0, "step": 78150 }, { "entropy": 1.2115625, "epoch": 1.9146956564321043, "grad_norm": 2.421875, "learning_rate": 2.981012267711858e-08, "loss": 0.1725, "mean_token_accuracy": 0.9635538387298584, "num_tokens": 8276439622.0, "step": 78200 }, { "entropy": 1.203125, "epoch": 1.91591988639146, "grad_norm": 3.5625, "learning_rate": 2.896224427307226e-08, "loss": 0.1649, "mean_token_accuracy": 0.9643189585208893, "num_tokens": 8281629841.0, "step": 78250 }, { "entropy": 1.20921875, "epoch": 1.9171441163508154, "grad_norm": 3.5, "learning_rate": 2.8126539931533023e-08, "loss": 0.1601, "mean_token_accuracy": 0.9657320499420166, "num_tokens": 8286850296.0, "step": 78300 }, { "entropy": 1.2075, "epoch": 1.9183683463101708, "grad_norm": 3.078125, "learning_rate": 2.7303013076866335e-08, "loss": 0.1675, "mean_token_accuracy": 0.964200325012207, "num_tokens": 8292528304.0, "step": 78350 }, { "entropy": 1.21671875, "epoch": 1.9195925762695265, "grad_norm": 4.125, "learning_rate": 2.6491667083537896e-08, "loss": 0.1674, "mean_token_accuracy": 0.9635697185993195, "num_tokens": 8297851717.0, "step": 78400 }, { "entropy": 1.203125, "epoch": 1.9208168062288822, "grad_norm": 3.5625, "learning_rate": 2.5692505276102673e-08, "loss": 0.1639, "mean_token_accuracy": 0.9647056591510773, "num_tokens": 8302822545.0, "step": 78450 }, { "entropy": 1.20234375, "epoch": 1.9220410361882376, "grad_norm": 4.125, "learning_rate": 2.490553092918957e-08, "loss": 0.167, "mean_token_accuracy": 0.9645107495784759, "num_tokens": 8308044186.0, "step": 78500 }, { "entropy": 1.20390625, "epoch": 1.923265266147593, "grad_norm": 4.1875, "learning_rate": 2.4130747267488096e-08, "loss": 0.1587, "mean_token_accuracy": 0.9651757764816284, "num_tokens": 8313261711.0, "step": 78550 }, { "entropy": 1.20625, "epoch": 1.9244894961069487, "grad_norm": 2.390625, "learning_rate": 2.3368157465735727e-08, "loss": 0.1729, "mean_token_accuracy": 0.9643122732639313, "num_tokens": 8318954245.0, "step": 78600 }, { "entropy": 1.21640625, "epoch": 1.9257137260663044, "grad_norm": 5.46875, "learning_rate": 2.261776464870424e-08, "loss": 0.1712, "mean_token_accuracy": 0.9633339118957519, "num_tokens": 8324544756.0, "step": 78650 }, { "entropy": 1.21515625, "epoch": 1.9269379560256599, "grad_norm": 2.875, "learning_rate": 2.1879571891188054e-08, "loss": 0.1751, "mean_token_accuracy": 0.9626336395740509, "num_tokens": 8329948691.0, "step": 78700 }, { "entropy": 1.20515625, "epoch": 1.9281621859850153, "grad_norm": 2.71875, "learning_rate": 2.1153582217990574e-08, "loss": 0.1655, "mean_token_accuracy": 0.964772834777832, "num_tokens": 8335173517.0, "step": 78750 }, { "entropy": 1.2015625, "epoch": 1.929386415944371, "grad_norm": 2.796875, "learning_rate": 2.043979860391154e-08, "loss": 0.1711, "mean_token_accuracy": 0.9635234928131103, "num_tokens": 8340379735.0, "step": 78800 }, { "entropy": 1.1909375, "epoch": 1.9306106459037267, "grad_norm": 3.703125, "learning_rate": 1.9738223973735702e-08, "loss": 0.1559, "mean_token_accuracy": 0.9672637641429901, "num_tokens": 8345381104.0, "step": 78850 }, { "entropy": 1.21375, "epoch": 1.9318348758630821, "grad_norm": 2.484375, "learning_rate": 1.9048861202221823e-08, "loss": 0.1681, "mean_token_accuracy": 0.9651576709747315, "num_tokens": 8350559447.0, "step": 78900 }, { "entropy": 1.21234375, "epoch": 1.9330591058224376, "grad_norm": 3.9375, "learning_rate": 1.8371713114086697e-08, "loss": 0.1652, "mean_token_accuracy": 0.9637591278553009, "num_tokens": 8355928028.0, "step": 78950 }, { "entropy": 1.20640625, "epoch": 1.9342833357817932, "grad_norm": 0.4453125, "learning_rate": 1.770678248399982e-08, "loss": 0.1621, "mean_token_accuracy": 0.9652046132087707, "num_tokens": 8361366979.0, "step": 79000 }, { "entropy": 1.20453125, "epoch": 1.935507565741149, "grad_norm": 4.1875, "learning_rate": 1.7054072036566394e-08, "loss": 0.1685, "mean_token_accuracy": 0.9641025936603547, "num_tokens": 8366288409.0, "step": 79050 }, { "entropy": 1.2125, "epoch": 1.9367317957005044, "grad_norm": 2.234375, "learning_rate": 1.6413584446319018e-08, "loss": 0.1632, "mean_token_accuracy": 0.9653700625896454, "num_tokens": 8371880970.0, "step": 79100 }, { "entropy": 1.20578125, "epoch": 1.9379560256598598, "grad_norm": 2.609375, "learning_rate": 1.5785322337706688e-08, "loss": 0.164, "mean_token_accuracy": 0.9650757694244385, "num_tokens": 8377110509.0, "step": 79150 }, { "entropy": 1.20046875, "epoch": 1.9391802556192155, "grad_norm": 2.140625, "learning_rate": 1.5169288285082793e-08, "loss": 0.1631, "mean_token_accuracy": 0.9651304471492768, "num_tokens": 8382268459.0, "step": 79200 }, { "entropy": 1.2075, "epoch": 1.9404044855785711, "grad_norm": 3.40625, "learning_rate": 1.4565484812696151e-08, "loss": 0.155, "mean_token_accuracy": 0.9661095356941223, "num_tokens": 8387474552.0, "step": 79250 }, { "entropy": 1.1728125, "epoch": 1.9416287155379266, "grad_norm": 2.578125, "learning_rate": 1.3973914394678655e-08, "loss": 0.1379, "mean_token_accuracy": 0.9702671027183533, "num_tokens": 8392280218.0, "step": 79300 }, { "entropy": 1.21859375, "epoch": 1.942852945497282, "grad_norm": 3.828125, "learning_rate": 1.3394579455037637e-08, "loss": 0.1586, "mean_token_accuracy": 0.9652379488945008, "num_tokens": 8397377045.0, "step": 79350 }, { "entropy": 1.20515625, "epoch": 1.9440771754566377, "grad_norm": 2.453125, "learning_rate": 1.2827482367643862e-08, "loss": 0.1537, "mean_token_accuracy": 0.9671237909793854, "num_tokens": 8402675630.0, "step": 79400 }, { "entropy": 1.18984375, "epoch": 1.9453014054159934, "grad_norm": 2.703125, "learning_rate": 1.2272625456221875e-08, "loss": 0.1511, "mean_token_accuracy": 0.9674056422710419, "num_tokens": 8407470922.0, "step": 79450 }, { "entropy": 1.22015625, "epoch": 1.946525635375349, "grad_norm": 3.078125, "learning_rate": 1.1730010994342344e-08, "loss": 0.1683, "mean_token_accuracy": 0.963656575679779, "num_tokens": 8413030681.0, "step": 79500 }, { "entropy": 1.2075, "epoch": 1.9477498653347045, "grad_norm": 3.703125, "learning_rate": 1.1199641205410727e-08, "loss": 0.1676, "mean_token_accuracy": 0.9641730666160584, "num_tokens": 8418435608.0, "step": 79550 }, { "entropy": 1.20109375, "epoch": 1.94897409529406, "grad_norm": 2.609375, "learning_rate": 1.0681518262659618e-08, "loss": 0.1612, "mean_token_accuracy": 0.9652375304698944, "num_tokens": 8423410030.0, "step": 79600 }, { "entropy": 1.19328125, "epoch": 1.9501983252534156, "grad_norm": 4.3125, "learning_rate": 1.0175644289138419e-08, "loss": 0.1565, "mean_token_accuracy": 0.9664306437969208, "num_tokens": 8428505734.0, "step": 79650 }, { "entropy": 1.19609375, "epoch": 1.9514225552127713, "grad_norm": 3.15625, "learning_rate": 9.682021357706018e-09, "loss": 0.1491, "mean_token_accuracy": 0.968139351606369, "num_tokens": 8433821851.0, "step": 79700 }, { "entropy": 1.19390625, "epoch": 1.9526467851721268, "grad_norm": 3.53125, "learning_rate": 9.20065149102145e-09, "loss": 0.1566, "mean_token_accuracy": 0.9661858582496643, "num_tokens": 8438988974.0, "step": 79750 }, { "entropy": 1.20546875, "epoch": 1.9538710151314822, "grad_norm": 2.5625, "learning_rate": 8.731536661535588e-09, "loss": 0.1691, "mean_token_accuracy": 0.9629131543636322, "num_tokens": 8444297546.0, "step": 79800 }, { "entropy": 1.20671875, "epoch": 1.9550952450908379, "grad_norm": 2.640625, "learning_rate": 8.274678791484136e-09, "loss": 0.1603, "mean_token_accuracy": 0.9652000117301941, "num_tokens": 8449852335.0, "step": 79850 }, { "entropy": 1.18640625, "epoch": 1.9563194750501935, "grad_norm": 3.359375, "learning_rate": 7.830079752877973e-09, "loss": 0.1394, "mean_token_accuracy": 0.9697746348381042, "num_tokens": 8454775071.0, "step": 79900 }, { "entropy": 1.2021875, "epoch": 1.957543705009549, "grad_norm": 2.734375, "learning_rate": 7.397741367497157e-09, "loss": 0.1613, "mean_token_accuracy": 0.9663744091987609, "num_tokens": 8460122393.0, "step": 79950 }, { "entropy": 1.21125, "epoch": 1.9587679349689044, "grad_norm": 3.125, "learning_rate": 6.977665406882272e-09, "loss": 0.1689, "mean_token_accuracy": 0.9630868649482727, "num_tokens": 8465752957.0, "step": 80000 }, { "epoch": 1.9587679349689044, "eval_entropy": 1.2009765625, "eval_loss": 0.17771507799625397, "eval_mean_token_accuracy": 0.9620104561249415, "eval_num_tokens": 8465752957.0, "eval_runtime": 611.9165, "eval_samples_per_second": 15.78, "eval_steps_per_second": 0.198, "step": 80000 }, { "entropy": 1.21203125, "epoch": 1.9599921649282601, "grad_norm": 3.421875, "learning_rate": 6.569853592327757e-09, "loss": 0.1792, "mean_token_accuracy": 0.9620552754402161, "num_tokens": 8471172063.0, "step": 80050 }, { "entropy": 1.200625, "epoch": 1.9612163948876158, "grad_norm": 3.28125, "learning_rate": 6.174307594874917e-09, "loss": 0.1558, "mean_token_accuracy": 0.9663512742519379, "num_tokens": 8476017366.0, "step": 80100 }, { "entropy": 1.195, "epoch": 1.9624406248469712, "grad_norm": 3.65625, "learning_rate": 5.7910290353049285e-09, "loss": 0.1529, "mean_token_accuracy": 0.9670116317272186, "num_tokens": 8481042255.0, "step": 80150 }, { "entropy": 1.18890625, "epoch": 1.9636648548063267, "grad_norm": 2.0, "learning_rate": 5.420019484131844e-09, "loss": 0.1608, "mean_token_accuracy": 0.9656985890865326, "num_tokens": 8486092212.0, "step": 80200 }, { "entropy": 1.209375, "epoch": 1.9648890847656824, "grad_norm": 2.203125, "learning_rate": 5.061280461596929e-09, "loss": 0.1747, "mean_token_accuracy": 0.962527574300766, "num_tokens": 8491676835.0, "step": 80250 }, { "entropy": 1.20671875, "epoch": 1.966113314725038, "grad_norm": 3.21875, "learning_rate": 4.714813437661336e-09, "loss": 0.1636, "mean_token_accuracy": 0.9649911904335022, "num_tokens": 8497110970.0, "step": 80300 }, { "entropy": 1.2090625, "epoch": 1.9673375446843935, "grad_norm": 3.078125, "learning_rate": 4.380619832001775e-09, "loss": 0.1698, "mean_token_accuracy": 0.9634296333789826, "num_tokens": 8502537441.0, "step": 80350 }, { "entropy": 1.20875, "epoch": 1.968561774643749, "grad_norm": 4.9375, "learning_rate": 4.058701014002187e-09, "loss": 0.1637, "mean_token_accuracy": 0.9648308408260345, "num_tokens": 8507630732.0, "step": 80400 }, { "entropy": 1.20390625, "epoch": 1.9697860046031046, "grad_norm": 2.15625, "learning_rate": 3.749058302751074e-09, "loss": 0.1531, "mean_token_accuracy": 0.9671729254722595, "num_tokens": 8512848795.0, "step": 80450 }, { "entropy": 1.2090625, "epoch": 1.9710102345624603, "grad_norm": 3.984375, "learning_rate": 3.451692967033848e-09, "loss": 0.1643, "mean_token_accuracy": 0.9643464314937592, "num_tokens": 8518368412.0, "step": 80500 }, { "entropy": 1.20875, "epoch": 1.972234464521816, "grad_norm": 3.203125, "learning_rate": 3.1666062253284942e-09, "loss": 0.1677, "mean_token_accuracy": 0.9644135737419128, "num_tokens": 8523653890.0, "step": 80550 }, { "entropy": 1.2159375, "epoch": 1.9734586944811714, "grad_norm": 3.875, "learning_rate": 2.893799245800244e-09, "loss": 0.166, "mean_token_accuracy": 0.9650824117660523, "num_tokens": 8529015592.0, "step": 80600 }, { "entropy": 1.2, "epoch": 1.9746829244405268, "grad_norm": 2.46875, "learning_rate": 2.633273146297577e-09, "loss": 0.1591, "mean_token_accuracy": 0.9664743864536285, "num_tokens": 8534375896.0, "step": 80650 }, { "entropy": 1.21421875, "epoch": 1.9759071543998825, "grad_norm": 2.484375, "learning_rate": 2.385028994346894e-09, "loss": 0.1685, "mean_token_accuracy": 0.9637958765029907, "num_tokens": 8539783916.0, "step": 80700 }, { "entropy": 1.19375, "epoch": 1.9771313843592382, "grad_norm": 2.34375, "learning_rate": 2.149067807147853e-09, "loss": 0.1589, "mean_token_accuracy": 0.9659115636348724, "num_tokens": 8544696959.0, "step": 80750 }, { "entropy": 1.21125, "epoch": 1.9783556143185936, "grad_norm": 2.25, "learning_rate": 1.925390551570705e-09, "loss": 0.1649, "mean_token_accuracy": 0.9644637072086334, "num_tokens": 8550159905.0, "step": 80800 }, { "entropy": 1.19796875, "epoch": 1.979579844277949, "grad_norm": 1.8828125, "learning_rate": 1.7139981441502973e-09, "loss": 0.1526, "mean_token_accuracy": 0.9669871032238007, "num_tokens": 8555193317.0, "step": 80850 }, { "entropy": 1.1953125, "epoch": 1.9808040742373048, "grad_norm": 2.828125, "learning_rate": 1.514891451083744e-09, "loss": 0.1676, "mean_token_accuracy": 0.9646944868564605, "num_tokens": 8560635630.0, "step": 80900 }, { "entropy": 1.21921875, "epoch": 1.9820283041966604, "grad_norm": 2.90625, "learning_rate": 1.328071288226762e-09, "loss": 0.1694, "mean_token_accuracy": 0.9641423618793488, "num_tokens": 8566246965.0, "step": 80950 }, { "entropy": 1.1896875, "epoch": 1.9832525341560159, "grad_norm": 1.9453125, "learning_rate": 1.1535384210893395e-09, "loss": 0.1436, "mean_token_accuracy": 0.9696673655509949, "num_tokens": 8571430149.0, "step": 81000 }, { "entropy": 1.2009375, "epoch": 1.9844767641153713, "grad_norm": 2.015625, "learning_rate": 9.912935648344057e-10, "loss": 0.1667, "mean_token_accuracy": 0.9643544840812683, "num_tokens": 8576922262.0, "step": 81050 }, { "entropy": 1.208125, "epoch": 1.985700994074727, "grad_norm": 2.71875, "learning_rate": 8.413373842721672e-10, "loss": 0.1569, "mean_token_accuracy": 0.9652732384204864, "num_tokens": 8582076472.0, "step": 81100 }, { "entropy": 1.20875, "epoch": 1.9869252240340827, "grad_norm": 2.671875, "learning_rate": 7.036704938611083e-10, "loss": 0.1691, "mean_token_accuracy": 0.9644241857528687, "num_tokens": 8587501762.0, "step": 81150 }, { "entropy": 1.17515625, "epoch": 1.9881494539934381, "grad_norm": 3.0, "learning_rate": 5.782934577009957e-10, "loss": 0.1391, "mean_token_accuracy": 0.9696795284748078, "num_tokens": 8592212871.0, "step": 81200 }, { "entropy": 1.195, "epoch": 1.9893736839527936, "grad_norm": 2.28125, "learning_rate": 4.652067895352108e-10, "loss": 0.1522, "mean_token_accuracy": 0.9666969799995422, "num_tokens": 8597378423.0, "step": 81250 }, { "entropy": 1.198125, "epoch": 1.9905979139121492, "grad_norm": 2.828125, "learning_rate": 3.644109527447537e-10, "loss": 0.1695, "mean_token_accuracy": 0.9640089082717895, "num_tokens": 8602514679.0, "step": 81300 }, { "entropy": 1.20984375, "epoch": 1.991822143871505, "grad_norm": 4.5, "learning_rate": 2.7590636034857675e-10, "loss": 0.1634, "mean_token_accuracy": 0.964598093032837, "num_tokens": 8607950288.0, "step": 81350 }, { "entropy": 1.21296875, "epoch": 1.9930463738308604, "grad_norm": 3.625, "learning_rate": 1.9969337500125308e-10, "loss": 0.166, "mean_token_accuracy": 0.9647457122802734, "num_tokens": 8613281748.0, "step": 81400 }, { "entropy": 1.21078125, "epoch": 1.9942706037902158, "grad_norm": 1.78125, "learning_rate": 1.3577230899197712e-10, "loss": 0.1541, "mean_token_accuracy": 0.9666912174224853, "num_tokens": 8618635116.0, "step": 81450 }, { "entropy": 1.18859375, "epoch": 1.9954948337495715, "grad_norm": 3.09375, "learning_rate": 8.414342424156729e-11, "loss": 0.149, "mean_token_accuracy": 0.9679227757453919, "num_tokens": 8623674169.0, "step": 81500 }, { "entropy": 1.18640625, "epoch": 1.9967190637089272, "grad_norm": 2.75, "learning_rate": 4.48069323044642e-11, "loss": 0.1535, "mean_token_accuracy": 0.966040461063385, "num_tokens": 8628848521.0, "step": 81550 }, { "entropy": 1.1934375, "epoch": 1.9979432936682826, "grad_norm": 3.5625, "learning_rate": 1.776299436406781e-11, "loss": 0.1668, "mean_token_accuracy": 0.9646425199508667, "num_tokens": 8634212914.0, "step": 81600 }, { "entropy": 1.1825, "epoch": 1.999167523627638, "grad_norm": 2.65625, "learning_rate": 3.0117212357350098e-12, "loss": 0.147, "mean_token_accuracy": 0.9685306799411774, "num_tokens": 8639315101.0, "step": 81650 } ], "logging_steps": 50, "max_steps": 81684, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.811857454193967e+19, "train_batch_size": 12, "trial_name": null, "trial_params": null }