diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,16452 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 10000, + "global_step": 81684, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.13171875, + "epoch": 0.0012242299593555653, + "grad_norm": 83.0, + "learning_rate": 7.197062423500612e-08, + "loss": 1.0766, + "mean_token_accuracy": 0.8785432744026184, + "num_tokens": 5226465.0, + "step": 50 + }, + { + "entropy": 1.1265625, + "epoch": 0.0024484599187111307, + "grad_norm": 78.0, + "learning_rate": 1.4541003671970627e-07, + "loss": 1.0658, + "mean_token_accuracy": 0.8774244856834411, + "num_tokens": 10264339.0, + "step": 100 + }, + { + "entropy": 1.12125, + "epoch": 0.003672689878066696, + "grad_norm": 65.5, + "learning_rate": 2.1884944920440638e-07, + "loss": 1.0161, + "mean_token_accuracy": 0.8803484535217285, + "num_tokens": 15456221.0, + "step": 150 + }, + { + "entropy": 1.12859375, + "epoch": 0.004896919837422261, + "grad_norm": 73.0, + "learning_rate": 2.922888616891065e-07, + "loss": 0.9905, + "mean_token_accuracy": 0.8795530760288238, + "num_tokens": 20433188.0, + "step": 200 + }, + { + "entropy": 1.14265625, + "epoch": 0.006121149796777827, + "grad_norm": 69.0, + "learning_rate": 3.6572827417380663e-07, + "loss": 0.8835, + "mean_token_accuracy": 0.884142210483551, + "num_tokens": 25654586.0, + "step": 250 + }, + { + "entropy": 1.1475, + "epoch": 0.007345379756133392, + "grad_norm": 60.0, + "learning_rate": 4.391676866585067e-07, + "loss": 0.7555, + "mean_token_accuracy": 0.8876442670822143, + "num_tokens": 30682210.0, + "step": 300 + }, + { + "entropy": 1.14234375, + "epoch": 0.008569609715488957, + "grad_norm": 30.0, + "learning_rate": 5.126070991432069e-07, + "loss": 0.691, + "mean_token_accuracy": 0.8891012752056122, + "num_tokens": 36107614.0, + "step": 350 + }, + { + "entropy": 1.1603125, + "epoch": 0.009793839674844523, + "grad_norm": 15.6875, + "learning_rate": 5.860465116279069e-07, + "loss": 0.5872, + "mean_token_accuracy": 0.9278727066516876, + "num_tokens": 41528585.0, + "step": 400 + }, + { + "entropy": 1.20984375, + "epoch": 0.011018069634200088, + "grad_norm": 10.375, + "learning_rate": 6.594859241126071e-07, + "loss": 0.5128, + "mean_token_accuracy": 0.9328850126266479, + "num_tokens": 47205376.0, + "step": 450 + }, + { + "entropy": 1.23328125, + "epoch": 0.012242299593555654, + "grad_norm": 8.875, + "learning_rate": 7.329253365973072e-07, + "loss": 0.464, + "mean_token_accuracy": 0.9372936522960663, + "num_tokens": 52484312.0, + "step": 500 + }, + { + "entropy": 1.2515625, + "epoch": 0.013466529552911218, + "grad_norm": 10.375, + "learning_rate": 8.063647490820073e-07, + "loss": 0.4469, + "mean_token_accuracy": 0.9350792992115021, + "num_tokens": 57610761.0, + "step": 550 + }, + { + "entropy": 1.28046875, + "epoch": 0.014690759512266784, + "grad_norm": 7.8125, + "learning_rate": 8.798041615667075e-07, + "loss": 0.4233, + "mean_token_accuracy": 0.9361693727970123, + "num_tokens": 62744047.0, + "step": 600 + }, + { + "entropy": 1.30703125, + "epoch": 0.01591498947162235, + "grad_norm": 11.375, + "learning_rate": 9.532435740514075e-07, + "loss": 0.4228, + "mean_token_accuracy": 0.9353253149986267, + "num_tokens": 68113654.0, + "step": 650 + }, + { + "entropy": 1.3890625, + "epoch": 0.017139219430977914, + "grad_norm": 4.96875, + "learning_rate": 1.0266829865361079e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.934454687833786, + "num_tokens": 73614017.0, + "step": 700 + }, + { + "entropy": 1.4703125, + "epoch": 0.01836344939033348, + "grad_norm": 4.25, + "learning_rate": 1.100122399020808e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.9344593751430511, + "num_tokens": 79174232.0, + "step": 750 + }, + { + "entropy": 1.52625, + "epoch": 0.019587679349689045, + "grad_norm": 3.671875, + "learning_rate": 1.173561811505508e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.9336516118049621, + "num_tokens": 84608483.0, + "step": 800 + }, + { + "entropy": 1.5390625, + "epoch": 0.02081190930904461, + "grad_norm": 3.8125, + "learning_rate": 1.2470012239902082e-06, + "loss": 0.353, + "mean_token_accuracy": 0.9379545438289643, + "num_tokens": 89999996.0, + "step": 850 + }, + { + "entropy": 1.54515625, + "epoch": 0.022036139268400177, + "grad_norm": 3.296875, + "learning_rate": 1.3204406364749082e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.9422214996814727, + "num_tokens": 95124008.0, + "step": 900 + }, + { + "entropy": 1.5690625, + "epoch": 0.02326036922775574, + "grad_norm": 4.09375, + "learning_rate": 1.3938800489596082e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.9378081679344177, + "num_tokens": 100136013.0, + "step": 950 + }, + { + "entropy": 1.55765625, + "epoch": 0.02448459918711131, + "grad_norm": 3.171875, + "learning_rate": 1.4673194614443085e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.9409069657325745, + "num_tokens": 105114554.0, + "step": 1000 + }, + { + "entropy": 1.53890625, + "epoch": 0.025708829146466872, + "grad_norm": 3.296875, + "learning_rate": 1.5407588739290085e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.9414800906181335, + "num_tokens": 110370176.0, + "step": 1050 + }, + { + "entropy": 1.5546875, + "epoch": 0.026933059105822436, + "grad_norm": 3.625, + "learning_rate": 1.6141982864137085e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.9426046288013459, + "num_tokens": 115321229.0, + "step": 1100 + }, + { + "entropy": 1.55046875, + "epoch": 0.028157289065178004, + "grad_norm": 2.515625, + "learning_rate": 1.687637698898409e-06, + "loss": 0.332, + "mean_token_accuracy": 0.9409434747695923, + "num_tokens": 120648053.0, + "step": 1150 + }, + { + "entropy": 1.54046875, + "epoch": 0.029381519024533568, + "grad_norm": 2.578125, + "learning_rate": 1.761077111383109e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.941448210477829, + "num_tokens": 126142957.0, + "step": 1200 + }, + { + "entropy": 1.54453125, + "epoch": 0.030605748983889135, + "grad_norm": 3.484375, + "learning_rate": 1.8345165238678093e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.9392308318614959, + "num_tokens": 131721983.0, + "step": 1250 + }, + { + "entropy": 1.54265625, + "epoch": 0.0318299789432447, + "grad_norm": 3.890625, + "learning_rate": 1.9079559363525093e-06, + "loss": 0.323, + "mean_token_accuracy": 0.9425621521472931, + "num_tokens": 136834110.0, + "step": 1300 + }, + { + "entropy": 1.55375, + "epoch": 0.03305420890260027, + "grad_norm": 3.046875, + "learning_rate": 1.9813953488372093e-06, + "loss": 0.3103, + "mean_token_accuracy": 0.9435288536548615, + "num_tokens": 142077567.0, + "step": 1350 + }, + { + "entropy": 1.5815625, + "epoch": 0.03427843886195583, + "grad_norm": 1.703125, + "learning_rate": 2.0548347613219094e-06, + "loss": 0.325, + "mean_token_accuracy": 0.9404278743267059, + "num_tokens": 147938512.0, + "step": 1400 + }, + { + "entropy": 1.603125, + "epoch": 0.035502668821311395, + "grad_norm": 2.46875, + "learning_rate": 2.1282741738066094e-06, + "loss": 0.292, + "mean_token_accuracy": 0.94657958984375, + "num_tokens": 152967734.0, + "step": 1450 + }, + { + "entropy": 1.5790625, + "epoch": 0.03672689878066696, + "grad_norm": 4.84375, + "learning_rate": 2.20171358629131e-06, + "loss": 0.3027, + "mean_token_accuracy": 0.9442594313621521, + "num_tokens": 158252140.0, + "step": 1500 + }, + { + "entropy": 1.5709375, + "epoch": 0.03795112874002252, + "grad_norm": 2.796875, + "learning_rate": 2.27515299877601e-06, + "loss": 0.2935, + "mean_token_accuracy": 0.9451294171810151, + "num_tokens": 163906622.0, + "step": 1550 + }, + { + "entropy": 1.58890625, + "epoch": 0.03917535869937809, + "grad_norm": 2.703125, + "learning_rate": 2.34859241126071e-06, + "loss": 0.2963, + "mean_token_accuracy": 0.9451341640949249, + "num_tokens": 169532126.0, + "step": 1600 + }, + { + "entropy": 1.55125, + "epoch": 0.04039958865873366, + "grad_norm": 2.46875, + "learning_rate": 2.42203182374541e-06, + "loss": 0.2704, + "mean_token_accuracy": 0.9490817248821258, + "num_tokens": 174811062.0, + "step": 1650 + }, + { + "entropy": 1.54515625, + "epoch": 0.04162381861808922, + "grad_norm": 1.9296875, + "learning_rate": 2.49547123623011e-06, + "loss": 0.2704, + "mean_token_accuracy": 0.9497274696826935, + "num_tokens": 180019609.0, + "step": 1700 + }, + { + "entropy": 1.545625, + "epoch": 0.042848048577444786, + "grad_norm": 2.171875, + "learning_rate": 2.56891064871481e-06, + "loss": 0.2729, + "mean_token_accuracy": 0.9472972440719605, + "num_tokens": 185410342.0, + "step": 1750 + }, + { + "entropy": 1.54171875, + "epoch": 0.044072278536800354, + "grad_norm": 2.3125, + "learning_rate": 2.6423500611995105e-06, + "loss": 0.2723, + "mean_token_accuracy": 0.9487947750091553, + "num_tokens": 190878398.0, + "step": 1800 + }, + { + "entropy": 1.54234375, + "epoch": 0.04529650849615592, + "grad_norm": 2.8125, + "learning_rate": 2.715789473684211e-06, + "loss": 0.2761, + "mean_token_accuracy": 0.9490153706073761, + "num_tokens": 196027836.0, + "step": 1850 + }, + { + "entropy": 1.53703125, + "epoch": 0.04652073845551148, + "grad_norm": 2.578125, + "learning_rate": 2.789228886168911e-06, + "loss": 0.2882, + "mean_token_accuracy": 0.9455779695510864, + "num_tokens": 201616019.0, + "step": 1900 + }, + { + "entropy": 1.52796875, + "epoch": 0.04774496841486705, + "grad_norm": 1.671875, + "learning_rate": 2.862668298653611e-06, + "loss": 0.2609, + "mean_token_accuracy": 0.9508947324752808, + "num_tokens": 206756502.0, + "step": 1950 + }, + { + "entropy": 1.51796875, + "epoch": 0.04896919837422262, + "grad_norm": 2.296875, + "learning_rate": 2.936107711138311e-06, + "loss": 0.2627, + "mean_token_accuracy": 0.9504942214488983, + "num_tokens": 211966544.0, + "step": 2000 + }, + { + "entropy": 1.5309375, + "epoch": 0.05019342833357818, + "grad_norm": 2.84375, + "learning_rate": 3.0095471236230106e-06, + "loss": 0.2622, + "mean_token_accuracy": 0.9505769073963165, + "num_tokens": 217058889.0, + "step": 2050 + }, + { + "entropy": 1.52125, + "epoch": 0.051417658292933745, + "grad_norm": 2.203125, + "learning_rate": 3.082986536107711e-06, + "loss": 0.271, + "mean_token_accuracy": 0.9492247033119202, + "num_tokens": 222302364.0, + "step": 2100 + }, + { + "entropy": 1.5103125, + "epoch": 0.05264188825228931, + "grad_norm": 2.046875, + "learning_rate": 3.1564259485924115e-06, + "loss": 0.2836, + "mean_token_accuracy": 0.9467552089691162, + "num_tokens": 227892169.0, + "step": 2150 + }, + { + "entropy": 1.5121875, + "epoch": 0.05386611821164487, + "grad_norm": 1.6796875, + "learning_rate": 3.2298653610771116e-06, + "loss": 0.2772, + "mean_token_accuracy": 0.9473760116100312, + "num_tokens": 233522252.0, + "step": 2200 + }, + { + "entropy": 1.51453125, + "epoch": 0.05509034817100044, + "grad_norm": 2.46875, + "learning_rate": 3.303304773561812e-06, + "loss": 0.2814, + "mean_token_accuracy": 0.9471155571937561, + "num_tokens": 239241678.0, + "step": 2250 + }, + { + "entropy": 1.50359375, + "epoch": 0.05631457813035601, + "grad_norm": 6.65625, + "learning_rate": 3.3767441860465116e-06, + "loss": 0.252, + "mean_token_accuracy": 0.9517535066604614, + "num_tokens": 244573352.0, + "step": 2300 + }, + { + "entropy": 1.489375, + "epoch": 0.05753880808971157, + "grad_norm": 2.578125, + "learning_rate": 3.450183598531212e-06, + "loss": 0.2686, + "mean_token_accuracy": 0.9490506839752197, + "num_tokens": 249799704.0, + "step": 2350 + }, + { + "entropy": 1.5084375, + "epoch": 0.058763038049067136, + "grad_norm": 2.859375, + "learning_rate": 3.5236230110159117e-06, + "loss": 0.2593, + "mean_token_accuracy": 0.951296364068985, + "num_tokens": 255107263.0, + "step": 2400 + }, + { + "entropy": 1.49984375, + "epoch": 0.0599872680084227, + "grad_norm": 3.03125, + "learning_rate": 3.597062423500612e-06, + "loss": 0.2734, + "mean_token_accuracy": 0.9485626530647278, + "num_tokens": 260533835.0, + "step": 2450 + }, + { + "entropy": 1.48375, + "epoch": 0.06121149796777827, + "grad_norm": 1.0390625, + "learning_rate": 3.670501835985312e-06, + "loss": 0.2529, + "mean_token_accuracy": 0.9517404818534851, + "num_tokens": 265773085.0, + "step": 2500 + }, + { + "entropy": 1.4821875, + "epoch": 0.06243572792713383, + "grad_norm": 2.703125, + "learning_rate": 3.743941248470012e-06, + "loss": 0.2616, + "mean_token_accuracy": 0.9500162851810455, + "num_tokens": 271036305.0, + "step": 2550 + }, + { + "entropy": 1.47765625, + "epoch": 0.0636599578864894, + "grad_norm": 1.953125, + "learning_rate": 3.817380660954712e-06, + "loss": 0.2462, + "mean_token_accuracy": 0.9525675988197326, + "num_tokens": 275952458.0, + "step": 2600 + }, + { + "entropy": 1.48328125, + "epoch": 0.06488418784584496, + "grad_norm": 2.125, + "learning_rate": 3.890820073439412e-06, + "loss": 0.2592, + "mean_token_accuracy": 0.9498057246208191, + "num_tokens": 281644324.0, + "step": 2650 + }, + { + "entropy": 1.47390625, + "epoch": 0.06610841780520053, + "grad_norm": 1.90625, + "learning_rate": 3.964259485924113e-06, + "loss": 0.2416, + "mean_token_accuracy": 0.9530806469917298, + "num_tokens": 286839400.0, + "step": 2700 + }, + { + "entropy": 1.471875, + "epoch": 0.0673326477645561, + "grad_norm": 1.9296875, + "learning_rate": 4.037698898408813e-06, + "loss": 0.2483, + "mean_token_accuracy": 0.9517722308635712, + "num_tokens": 292713093.0, + "step": 2750 + }, + { + "entropy": 1.47, + "epoch": 0.06855687772391166, + "grad_norm": 1.8984375, + "learning_rate": 4.111138310893514e-06, + "loss": 0.2357, + "mean_token_accuracy": 0.9542278277873993, + "num_tokens": 297994633.0, + "step": 2800 + }, + { + "entropy": 1.48640625, + "epoch": 0.06978110768326723, + "grad_norm": 2.328125, + "learning_rate": 4.184577723378213e-06, + "loss": 0.2434, + "mean_token_accuracy": 0.9529701387882232, + "num_tokens": 303305735.0, + "step": 2850 + }, + { + "entropy": 1.46640625, + "epoch": 0.07100533764262279, + "grad_norm": 1.9765625, + "learning_rate": 4.258017135862914e-06, + "loss": 0.2228, + "mean_token_accuracy": 0.9564117324352265, + "num_tokens": 308665099.0, + "step": 2900 + }, + { + "entropy": 1.47671875, + "epoch": 0.07222956760197835, + "grad_norm": 2.546875, + "learning_rate": 4.331456548347613e-06, + "loss": 0.2485, + "mean_token_accuracy": 0.9520271122455597, + "num_tokens": 313894105.0, + "step": 2950 + }, + { + "entropy": 1.46984375, + "epoch": 0.07345379756133393, + "grad_norm": 2.125, + "learning_rate": 4.404895960832314e-06, + "loss": 0.2354, + "mean_token_accuracy": 0.9531759965419769, + "num_tokens": 319439357.0, + "step": 3000 + }, + { + "entropy": 1.479375, + "epoch": 0.07467802752068949, + "grad_norm": 1.96875, + "learning_rate": 4.478335373317013e-06, + "loss": 0.2506, + "mean_token_accuracy": 0.9517410743236542, + "num_tokens": 325090760.0, + "step": 3050 + }, + { + "entropy": 1.475, + "epoch": 0.07590225748004505, + "grad_norm": 1.6796875, + "learning_rate": 4.551774785801714e-06, + "loss": 0.2273, + "mean_token_accuracy": 0.955747674703598, + "num_tokens": 330405470.0, + "step": 3100 + }, + { + "entropy": 1.47546875, + "epoch": 0.07712648743940062, + "grad_norm": 1.8828125, + "learning_rate": 4.6252141982864134e-06, + "loss": 0.2391, + "mean_token_accuracy": 0.9522111368179321, + "num_tokens": 335678826.0, + "step": 3150 + }, + { + "entropy": 1.4603125, + "epoch": 0.07835071739875618, + "grad_norm": 1.53125, + "learning_rate": 4.698653610771114e-06, + "loss": 0.2344, + "mean_token_accuracy": 0.9539849495887757, + "num_tokens": 340918671.0, + "step": 3200 + }, + { + "entropy": 1.4509375, + "epoch": 0.07957494735811174, + "grad_norm": 2.5625, + "learning_rate": 4.7720930232558135e-06, + "loss": 0.2191, + "mean_token_accuracy": 0.9559646666049957, + "num_tokens": 346171106.0, + "step": 3250 + }, + { + "entropy": 1.454375, + "epoch": 0.08079917731746732, + "grad_norm": 5.6875, + "learning_rate": 4.845532435740514e-06, + "loss": 0.2356, + "mean_token_accuracy": 0.9528819477558136, + "num_tokens": 351560226.0, + "step": 3300 + }, + { + "entropy": 1.46609375, + "epoch": 0.08202340727682288, + "grad_norm": 1.859375, + "learning_rate": 4.918971848225214e-06, + "loss": 0.2387, + "mean_token_accuracy": 0.9533221650123597, + "num_tokens": 357311606.0, + "step": 3350 + }, + { + "entropy": 1.45046875, + "epoch": 0.08324763723617844, + "grad_norm": 3.0625, + "learning_rate": 4.992411260709914e-06, + "loss": 0.218, + "mean_token_accuracy": 0.9566865241527558, + "num_tokens": 362184714.0, + "step": 3400 + }, + { + "entropy": 1.44765625, + "epoch": 0.08447186719553401, + "grad_norm": 3.03125, + "learning_rate": 5.0658506731946145e-06, + "loss": 0.2163, + "mean_token_accuracy": 0.9571156585216523, + "num_tokens": 367118033.0, + "step": 3450 + }, + { + "entropy": 1.4721875, + "epoch": 0.08569609715488957, + "grad_norm": 1.90625, + "learning_rate": 5.139290085679315e-06, + "loss": 0.2269, + "mean_token_accuracy": 0.9551365935802459, + "num_tokens": 372554179.0, + "step": 3500 + }, + { + "entropy": 1.43546875, + "epoch": 0.08692032711424515, + "grad_norm": 2.65625, + "learning_rate": 5.212729498164015e-06, + "loss": 0.2235, + "mean_token_accuracy": 0.9559626686573028, + "num_tokens": 377909880.0, + "step": 3550 + }, + { + "entropy": 1.4384375, + "epoch": 0.08814455707360071, + "grad_norm": 1.7578125, + "learning_rate": 5.286168910648715e-06, + "loss": 0.2151, + "mean_token_accuracy": 0.9575206100940704, + "num_tokens": 383194488.0, + "step": 3600 + }, + { + "entropy": 1.42265625, + "epoch": 0.08936878703295627, + "grad_norm": 1.9609375, + "learning_rate": 5.3596083231334154e-06, + "loss": 0.229, + "mean_token_accuracy": 0.9538651633262635, + "num_tokens": 389073618.0, + "step": 3650 + }, + { + "entropy": 1.429375, + "epoch": 0.09059301699231184, + "grad_norm": 2.15625, + "learning_rate": 5.433047735618115e-06, + "loss": 0.2294, + "mean_token_accuracy": 0.9545065891742707, + "num_tokens": 394553347.0, + "step": 3700 + }, + { + "entropy": 1.42375, + "epoch": 0.0918172469516674, + "grad_norm": 2.078125, + "learning_rate": 5.5064871481028155e-06, + "loss": 0.2085, + "mean_token_accuracy": 0.9575728678703308, + "num_tokens": 399579739.0, + "step": 3750 + }, + { + "entropy": 1.411875, + "epoch": 0.09304147691102296, + "grad_norm": 1.6484375, + "learning_rate": 5.579926560587515e-06, + "loss": 0.2211, + "mean_token_accuracy": 0.9557280552387237, + "num_tokens": 404841496.0, + "step": 3800 + }, + { + "entropy": 1.40765625, + "epoch": 0.09426570687037854, + "grad_norm": 2.015625, + "learning_rate": 5.6533659730722156e-06, + "loss": 0.2125, + "mean_token_accuracy": 0.9576599287986756, + "num_tokens": 410023001.0, + "step": 3850 + }, + { + "entropy": 1.42984375, + "epoch": 0.0954899368297341, + "grad_norm": 2.640625, + "learning_rate": 5.726805385556916e-06, + "loss": 0.2279, + "mean_token_accuracy": 0.9547258257865906, + "num_tokens": 415549547.0, + "step": 3900 + }, + { + "entropy": 1.3978125, + "epoch": 0.09671416678908966, + "grad_norm": 2.59375, + "learning_rate": 5.800244798041616e-06, + "loss": 0.2232, + "mean_token_accuracy": 0.9551710951328277, + "num_tokens": 421034105.0, + "step": 3950 + }, + { + "entropy": 1.38796875, + "epoch": 0.09793839674844523, + "grad_norm": 1.515625, + "learning_rate": 5.873684210526316e-06, + "loss": 0.2162, + "mean_token_accuracy": 0.9557711553573608, + "num_tokens": 426688731.0, + "step": 4000 + }, + { + "entropy": 1.3903125, + "epoch": 0.0991626267078008, + "grad_norm": 10.25, + "learning_rate": 5.947123623011016e-06, + "loss": 0.2102, + "mean_token_accuracy": 0.9573217809200287, + "num_tokens": 431945587.0, + "step": 4050 + }, + { + "entropy": 1.37515625, + "epoch": 0.10038685666715635, + "grad_norm": 2.703125, + "learning_rate": 5.9999995181245345e-06, + "loss": 0.2068, + "mean_token_accuracy": 0.9580986511707306, + "num_tokens": 436945746.0, + "step": 4100 + }, + { + "entropy": 1.3790625, + "epoch": 0.10161108662651193, + "grad_norm": 2.171875, + "learning_rate": 5.999989929791556e-06, + "loss": 0.2008, + "mean_token_accuracy": 0.9594962692260742, + "num_tokens": 441913649.0, + "step": 4150 + }, + { + "entropy": 1.39890625, + "epoch": 0.10283531658586749, + "grad_norm": 2.25, + "learning_rate": 5.9999680487622435e-06, + "loss": 0.2158, + "mean_token_accuracy": 0.9564687287807465, + "num_tokens": 447263639.0, + "step": 4200 + }, + { + "entropy": 1.39796875, + "epoch": 0.10405954654522305, + "grad_norm": 2.359375, + "learning_rate": 5.999933875126256e-06, + "loss": 0.2235, + "mean_token_accuracy": 0.9537206184864044, + "num_tokens": 452831245.0, + "step": 4250 + }, + { + "entropy": 1.40046875, + "epoch": 0.10528377650457862, + "grad_norm": 2.484375, + "learning_rate": 5.999887409023625e-06, + "loss": 0.1983, + "mean_token_accuracy": 0.9605963575839996, + "num_tokens": 457920235.0, + "step": 4300 + }, + { + "entropy": 1.37109375, + "epoch": 0.10650800646393419, + "grad_norm": 2.46875, + "learning_rate": 5.9998286506447455e-06, + "loss": 0.1985, + "mean_token_accuracy": 0.9589159560203552, + "num_tokens": 463428491.0, + "step": 4350 + }, + { + "entropy": 1.393125, + "epoch": 0.10773223642328975, + "grad_norm": 2.4375, + "learning_rate": 5.999757600230387e-06, + "loss": 0.2181, + "mean_token_accuracy": 0.9564608442783356, + "num_tokens": 469183579.0, + "step": 4400 + }, + { + "entropy": 1.40828125, + "epoch": 0.10895646638264532, + "grad_norm": 1.953125, + "learning_rate": 5.999674258071684e-06, + "loss": 0.1997, + "mean_token_accuracy": 0.9596063613891601, + "num_tokens": 474548123.0, + "step": 4450 + }, + { + "entropy": 1.38171875, + "epoch": 0.11018069634200088, + "grad_norm": 2.25, + "learning_rate": 5.999578624510137e-06, + "loss": 0.2113, + "mean_token_accuracy": 0.9565052735805512, + "num_tokens": 480099691.0, + "step": 4500 + }, + { + "entropy": 1.39328125, + "epoch": 0.11140492630135644, + "grad_norm": 2.328125, + "learning_rate": 5.9994706999376126e-06, + "loss": 0.2096, + "mean_token_accuracy": 0.9578315222263336, + "num_tokens": 485485141.0, + "step": 4550 + }, + { + "entropy": 1.39828125, + "epoch": 0.11262915626071202, + "grad_norm": 2.125, + "learning_rate": 5.999350484796339e-06, + "loss": 0.1935, + "mean_token_accuracy": 0.9609186232089997, + "num_tokens": 490314941.0, + "step": 4600 + }, + { + "entropy": 1.41859375, + "epoch": 0.11385338622006758, + "grad_norm": 2.28125, + "learning_rate": 5.999217979578909e-06, + "loss": 0.2132, + "mean_token_accuracy": 0.9569031345844269, + "num_tokens": 495604676.0, + "step": 4650 + }, + { + "entropy": 1.41984375, + "epoch": 0.11507761617942314, + "grad_norm": 1.90625, + "learning_rate": 5.999073184828273e-06, + "loss": 0.1948, + "mean_token_accuracy": 0.9596328222751618, + "num_tokens": 500772718.0, + "step": 4700 + }, + { + "entropy": 1.42, + "epoch": 0.11630184613877871, + "grad_norm": 2.75, + "learning_rate": 5.998916101137737e-06, + "loss": 0.2128, + "mean_token_accuracy": 0.9574012553691864, + "num_tokens": 506105312.0, + "step": 4750 + }, + { + "entropy": 1.40890625, + "epoch": 0.11752607609813427, + "grad_norm": 2.671875, + "learning_rate": 5.998746729150967e-06, + "loss": 0.2019, + "mean_token_accuracy": 0.958700270652771, + "num_tokens": 511311990.0, + "step": 4800 + }, + { + "entropy": 1.41671875, + "epoch": 0.11875030605748983, + "grad_norm": 1.515625, + "learning_rate": 5.998565069561976e-06, + "loss": 0.2044, + "mean_token_accuracy": 0.9582890093326568, + "num_tokens": 516615202.0, + "step": 4850 + }, + { + "entropy": 1.4115625, + "epoch": 0.1199745360168454, + "grad_norm": 1.828125, + "learning_rate": 5.998371123115128e-06, + "loss": 0.207, + "mean_token_accuracy": 0.9571990466117859, + "num_tokens": 521934656.0, + "step": 4900 + }, + { + "entropy": 1.396875, + "epoch": 0.12119876597620097, + "grad_norm": 2.140625, + "learning_rate": 5.9981648906051355e-06, + "loss": 0.2069, + "mean_token_accuracy": 0.9578309345245362, + "num_tokens": 527328179.0, + "step": 4950 + }, + { + "entropy": 1.41046875, + "epoch": 0.12242299593555654, + "grad_norm": 2.484375, + "learning_rate": 5.9979463728770525e-06, + "loss": 0.1965, + "mean_token_accuracy": 0.9601268231868744, + "num_tokens": 532420262.0, + "step": 5000 + }, + { + "entropy": 1.3953125, + "epoch": 0.1236472258949121, + "grad_norm": 2.46875, + "learning_rate": 5.997715570826272e-06, + "loss": 0.1938, + "mean_token_accuracy": 0.9605181181430816, + "num_tokens": 537756232.0, + "step": 5050 + }, + { + "entropy": 1.390625, + "epoch": 0.12487145585426766, + "grad_norm": 1.5703125, + "learning_rate": 5.997472485398524e-06, + "loss": 0.2038, + "mean_token_accuracy": 0.9585963201522827, + "num_tokens": 543281806.0, + "step": 5100 + }, + { + "entropy": 1.4215625, + "epoch": 0.12609568581362324, + "grad_norm": 1.75, + "learning_rate": 5.99721711758987e-06, + "loss": 0.1969, + "mean_token_accuracy": 0.9599570655822753, + "num_tokens": 548233812.0, + "step": 5150 + }, + { + "entropy": 1.40515625, + "epoch": 0.1273199157729788, + "grad_norm": 2.375, + "learning_rate": 5.9969494684466985e-06, + "loss": 0.2041, + "mean_token_accuracy": 0.9577370703220367, + "num_tokens": 553736654.0, + "step": 5200 + }, + { + "entropy": 1.3990625, + "epoch": 0.12854414573233436, + "grad_norm": 2.140625, + "learning_rate": 5.996669539065727e-06, + "loss": 0.1945, + "mean_token_accuracy": 0.9612773549556732, + "num_tokens": 558856334.0, + "step": 5250 + }, + { + "entropy": 1.40203125, + "epoch": 0.12976837569168992, + "grad_norm": 1.7734375, + "learning_rate": 5.996377330593983e-06, + "loss": 0.2145, + "mean_token_accuracy": 0.9565242063999176, + "num_tokens": 564032272.0, + "step": 5300 + }, + { + "entropy": 1.39671875, + "epoch": 0.13099260565104548, + "grad_norm": 2.09375, + "learning_rate": 5.9960728442288186e-06, + "loss": 0.1992, + "mean_token_accuracy": 0.958374012708664, + "num_tokens": 569306892.0, + "step": 5350 + }, + { + "entropy": 1.38578125, + "epoch": 0.13221683561040107, + "grad_norm": 2.6875, + "learning_rate": 5.995756081217889e-06, + "loss": 0.1979, + "mean_token_accuracy": 0.9593621265888214, + "num_tokens": 574741752.0, + "step": 5400 + }, + { + "entropy": 1.38234375, + "epoch": 0.13344106556975663, + "grad_norm": 2.15625, + "learning_rate": 5.9954270428591555e-06, + "loss": 0.2003, + "mean_token_accuracy": 0.9591895163059234, + "num_tokens": 580457265.0, + "step": 5450 + }, + { + "entropy": 1.394375, + "epoch": 0.1346652955291122, + "grad_norm": 2.078125, + "learning_rate": 5.995085730500878e-06, + "loss": 0.1896, + "mean_token_accuracy": 0.9607266175746918, + "num_tokens": 585705175.0, + "step": 5500 + }, + { + "entropy": 1.39078125, + "epoch": 0.13588952548846775, + "grad_norm": 1.5234375, + "learning_rate": 5.994732145541613e-06, + "loss": 0.2003, + "mean_token_accuracy": 0.9587921166419983, + "num_tokens": 590923544.0, + "step": 5550 + }, + { + "entropy": 1.380625, + "epoch": 0.1371137554478233, + "grad_norm": 3.265625, + "learning_rate": 5.9943662894302e-06, + "loss": 0.1945, + "mean_token_accuracy": 0.9587338602542878, + "num_tokens": 596469221.0, + "step": 5600 + }, + { + "entropy": 1.4028125, + "epoch": 0.1383379854071789, + "grad_norm": 1.5859375, + "learning_rate": 5.993988163665767e-06, + "loss": 0.2225, + "mean_token_accuracy": 0.9551014530658722, + "num_tokens": 602167038.0, + "step": 5650 + }, + { + "entropy": 1.3846875, + "epoch": 0.13956221536653446, + "grad_norm": 2.640625, + "learning_rate": 5.9935977697977114e-06, + "loss": 0.201, + "mean_token_accuracy": 0.958451042175293, + "num_tokens": 607292638.0, + "step": 5700 + }, + { + "entropy": 1.3784375, + "epoch": 0.14078644532589002, + "grad_norm": 2.203125, + "learning_rate": 5.993195109425705e-06, + "loss": 0.2112, + "mean_token_accuracy": 0.9564135050773621, + "num_tokens": 613202323.0, + "step": 5750 + }, + { + "entropy": 1.38828125, + "epoch": 0.14201067528524558, + "grad_norm": 2.40625, + "learning_rate": 5.9927801841996784e-06, + "loss": 0.1937, + "mean_token_accuracy": 0.9602376103401185, + "num_tokens": 618640198.0, + "step": 5800 + }, + { + "entropy": 1.385, + "epoch": 0.14323490524460114, + "grad_norm": 2.609375, + "learning_rate": 5.992352995819822e-06, + "loss": 0.2075, + "mean_token_accuracy": 0.9579639828205109, + "num_tokens": 623893423.0, + "step": 5850 + }, + { + "entropy": 1.375625, + "epoch": 0.1444591352039567, + "grad_norm": 2.84375, + "learning_rate": 5.991913546036574e-06, + "loss": 0.2106, + "mean_token_accuracy": 0.9564978110790253, + "num_tokens": 629592369.0, + "step": 5900 + }, + { + "entropy": 1.37296875, + "epoch": 0.1456833651633123, + "grad_norm": 2.078125, + "learning_rate": 5.991461836650615e-06, + "loss": 0.211, + "mean_token_accuracy": 0.9563369131088257, + "num_tokens": 635736307.0, + "step": 5950 + }, + { + "entropy": 1.38203125, + "epoch": 0.14690759512266785, + "grad_norm": 3.0, + "learning_rate": 5.990997869512859e-06, + "loss": 0.1961, + "mean_token_accuracy": 0.9592690026760101, + "num_tokens": 641116233.0, + "step": 6000 + }, + { + "entropy": 1.378125, + "epoch": 0.1481318250820234, + "grad_norm": 2.65625, + "learning_rate": 5.990521646524447e-06, + "loss": 0.2008, + "mean_token_accuracy": 0.9585745882987976, + "num_tokens": 646167116.0, + "step": 6050 + }, + { + "entropy": 1.37140625, + "epoch": 0.14935605504137897, + "grad_norm": 2.25, + "learning_rate": 5.990033169636744e-06, + "loss": 0.1783, + "mean_token_accuracy": 0.962623051404953, + "num_tokens": 651158602.0, + "step": 6100 + }, + { + "entropy": 1.38609375, + "epoch": 0.15058028500073453, + "grad_norm": 2.390625, + "learning_rate": 5.989532440851319e-06, + "loss": 0.1925, + "mean_token_accuracy": 0.9600079596042633, + "num_tokens": 656353157.0, + "step": 6150 + }, + { + "entropy": 1.375625, + "epoch": 0.1518045149600901, + "grad_norm": 2.09375, + "learning_rate": 5.98901946221995e-06, + "loss": 0.1956, + "mean_token_accuracy": 0.9591733336448669, + "num_tokens": 661516084.0, + "step": 6200 + }, + { + "entropy": 1.3775, + "epoch": 0.15302874491944568, + "grad_norm": 2.59375, + "learning_rate": 5.988494235844608e-06, + "loss": 0.1857, + "mean_token_accuracy": 0.9618037152290344, + "num_tokens": 666952800.0, + "step": 6250 + }, + { + "entropy": 1.3721875, + "epoch": 0.15425297487880124, + "grad_norm": 1.546875, + "learning_rate": 5.987956763877448e-06, + "loss": 0.1994, + "mean_token_accuracy": 0.9587778007984161, + "num_tokens": 672306196.0, + "step": 6300 + }, + { + "entropy": 1.390625, + "epoch": 0.1554772048381568, + "grad_norm": 2.1875, + "learning_rate": 5.987407048520806e-06, + "loss": 0.1843, + "mean_token_accuracy": 0.9617053723335266, + "num_tokens": 677399978.0, + "step": 6350 + }, + { + "entropy": 1.38171875, + "epoch": 0.15670143479751236, + "grad_norm": 1.8671875, + "learning_rate": 5.986845092027181e-06, + "loss": 0.1937, + "mean_token_accuracy": 0.9602959334850312, + "num_tokens": 682747630.0, + "step": 6400 + }, + { + "entropy": 1.38578125, + "epoch": 0.15792566475686792, + "grad_norm": 2.671875, + "learning_rate": 5.986270896699237e-06, + "loss": 0.177, + "mean_token_accuracy": 0.964161764383316, + "num_tokens": 687573308.0, + "step": 6450 + }, + { + "entropy": 1.394375, + "epoch": 0.15914989471622348, + "grad_norm": 2.15625, + "learning_rate": 5.985684464889784e-06, + "loss": 0.1956, + "mean_token_accuracy": 0.9590267181396485, + "num_tokens": 692719553.0, + "step": 6500 + }, + { + "entropy": 1.4165625, + "epoch": 0.16037412467557907, + "grad_norm": 2.640625, + "learning_rate": 5.985085799001773e-06, + "loss": 0.21, + "mean_token_accuracy": 0.9567484962940216, + "num_tokens": 698446523.0, + "step": 6550 + }, + { + "entropy": 1.39546875, + "epoch": 0.16159835463493463, + "grad_norm": 1.8984375, + "learning_rate": 5.984474901488284e-06, + "loss": 0.1936, + "mean_token_accuracy": 0.9587848937511444, + "num_tokens": 703964383.0, + "step": 6600 + }, + { + "entropy": 1.3865625, + "epoch": 0.1628225845942902, + "grad_norm": 2.5625, + "learning_rate": 5.983851774852519e-06, + "loss": 0.1814, + "mean_token_accuracy": 0.9620046615600586, + "num_tokens": 708987822.0, + "step": 6650 + }, + { + "entropy": 1.38390625, + "epoch": 0.16404681455364575, + "grad_norm": 1.6015625, + "learning_rate": 5.983216421647789e-06, + "loss": 0.1997, + "mean_token_accuracy": 0.9585830473899841, + "num_tokens": 714405287.0, + "step": 6700 + }, + { + "entropy": 1.37453125, + "epoch": 0.16527104451300131, + "grad_norm": 2.40625, + "learning_rate": 5.982568844477502e-06, + "loss": 0.1944, + "mean_token_accuracy": 0.9597526073455811, + "num_tokens": 719693246.0, + "step": 6750 + }, + { + "entropy": 1.34859375, + "epoch": 0.16649527447235687, + "grad_norm": 2.265625, + "learning_rate": 5.9819090459951595e-06, + "loss": 0.1792, + "mean_token_accuracy": 0.9628249955177307, + "num_tokens": 724856885.0, + "step": 6800 + }, + { + "entropy": 1.37203125, + "epoch": 0.16771950443171246, + "grad_norm": 1.921875, + "learning_rate": 5.981237028904336e-06, + "loss": 0.2106, + "mean_token_accuracy": 0.9559559297561645, + "num_tokens": 730337882.0, + "step": 6850 + }, + { + "entropy": 1.3596875, + "epoch": 0.16894373439106802, + "grad_norm": 2.78125, + "learning_rate": 5.980552795958676e-06, + "loss": 0.1715, + "mean_token_accuracy": 0.964083902835846, + "num_tokens": 735194384.0, + "step": 6900 + }, + { + "entropy": 1.37875, + "epoch": 0.17016796435042358, + "grad_norm": 2.890625, + "learning_rate": 5.979856349961876e-06, + "loss": 0.1884, + "mean_token_accuracy": 0.961032167673111, + "num_tokens": 740456561.0, + "step": 6950 + }, + { + "entropy": 1.34078125, + "epoch": 0.17139219430977914, + "grad_norm": 1.875, + "learning_rate": 5.979147693767682e-06, + "loss": 0.1824, + "mean_token_accuracy": 0.9612845265865326, + "num_tokens": 745438122.0, + "step": 7000 + }, + { + "entropy": 1.35234375, + "epoch": 0.1726164242691347, + "grad_norm": 1.8828125, + "learning_rate": 5.978426830279867e-06, + "loss": 0.2001, + "mean_token_accuracy": 0.9585837364196778, + "num_tokens": 750857417.0, + "step": 7050 + }, + { + "entropy": 1.35828125, + "epoch": 0.1738406542284903, + "grad_norm": 1.5703125, + "learning_rate": 5.977693762452226e-06, + "loss": 0.2077, + "mean_token_accuracy": 0.956944135427475, + "num_tokens": 756565585.0, + "step": 7100 + }, + { + "entropy": 1.37453125, + "epoch": 0.17506488418784585, + "grad_norm": 1.59375, + "learning_rate": 5.976948493288563e-06, + "loss": 0.1978, + "mean_token_accuracy": 0.9594669210910797, + "num_tokens": 762042483.0, + "step": 7150 + }, + { + "entropy": 1.38609375, + "epoch": 0.17628911414720141, + "grad_norm": 1.96875, + "learning_rate": 5.976191025842678e-06, + "loss": 0.1967, + "mean_token_accuracy": 0.9588606441020966, + "num_tokens": 767082096.0, + "step": 7200 + }, + { + "entropy": 1.3721875, + "epoch": 0.17751334410655698, + "grad_norm": 2.4375, + "learning_rate": 5.975421363218352e-06, + "loss": 0.1896, + "mean_token_accuracy": 0.9610229313373566, + "num_tokens": 772416657.0, + "step": 7250 + }, + { + "entropy": 1.37078125, + "epoch": 0.17873757406591254, + "grad_norm": 2.46875, + "learning_rate": 5.97463950856934e-06, + "loss": 0.187, + "mean_token_accuracy": 0.9611088275909424, + "num_tokens": 777391863.0, + "step": 7300 + }, + { + "entropy": 1.3696875, + "epoch": 0.1799618040252681, + "grad_norm": 2.9375, + "learning_rate": 5.973845465099352e-06, + "loss": 0.196, + "mean_token_accuracy": 0.9594384169578553, + "num_tokens": 782502134.0, + "step": 7350 + }, + { + "entropy": 1.3825, + "epoch": 0.18118603398462368, + "grad_norm": 3.296875, + "learning_rate": 5.973039236062047e-06, + "loss": 0.1826, + "mean_token_accuracy": 0.9621104383468628, + "num_tokens": 787376887.0, + "step": 7400 + }, + { + "entropy": 1.3746875, + "epoch": 0.18241026394397925, + "grad_norm": 2.609375, + "learning_rate": 5.9722208247610095e-06, + "loss": 0.1904, + "mean_token_accuracy": 0.9605046558380127, + "num_tokens": 792554125.0, + "step": 7450 + }, + { + "entropy": 1.39890625, + "epoch": 0.1836344939033348, + "grad_norm": 2.375, + "learning_rate": 5.971390234549746e-06, + "loss": 0.1981, + "mean_token_accuracy": 0.9588062584400177, + "num_tokens": 797990011.0, + "step": 7500 + }, + { + "entropy": 1.39328125, + "epoch": 0.18485872386269037, + "grad_norm": 2.1875, + "learning_rate": 5.970547468831664e-06, + "loss": 0.1827, + "mean_token_accuracy": 0.9626439011096954, + "num_tokens": 802985973.0, + "step": 7550 + }, + { + "entropy": 1.40375, + "epoch": 0.18608295382204593, + "grad_norm": 2.140625, + "learning_rate": 5.969692531060065e-06, + "loss": 0.1851, + "mean_token_accuracy": 0.9621277391910553, + "num_tokens": 808398744.0, + "step": 7600 + }, + { + "entropy": 1.391875, + "epoch": 0.1873071837814015, + "grad_norm": 1.421875, + "learning_rate": 5.9688254247381225e-06, + "loss": 0.1859, + "mean_token_accuracy": 0.9607931089401245, + "num_tokens": 813549741.0, + "step": 7650 + }, + { + "entropy": 1.3784375, + "epoch": 0.18853141374075708, + "grad_norm": 3.171875, + "learning_rate": 5.967946153418875e-06, + "loss": 0.1862, + "mean_token_accuracy": 0.9606724309921265, + "num_tokens": 818604872.0, + "step": 7700 + }, + { + "entropy": 1.3865625, + "epoch": 0.18975564370011264, + "grad_norm": 2.046875, + "learning_rate": 5.967054720705204e-06, + "loss": 0.1934, + "mean_token_accuracy": 0.9598609590530396, + "num_tokens": 824064581.0, + "step": 7750 + }, + { + "entropy": 1.39875, + "epoch": 0.1909798736594682, + "grad_norm": 2.53125, + "learning_rate": 5.966151130249828e-06, + "loss": 0.1926, + "mean_token_accuracy": 0.9593923246860504, + "num_tokens": 829369830.0, + "step": 7800 + }, + { + "entropy": 1.3865625, + "epoch": 0.19220410361882376, + "grad_norm": 2.28125, + "learning_rate": 5.965235385755279e-06, + "loss": 0.1926, + "mean_token_accuracy": 0.9593356001377106, + "num_tokens": 834877335.0, + "step": 7850 + }, + { + "entropy": 1.39328125, + "epoch": 0.19342833357817932, + "grad_norm": 9.0, + "learning_rate": 5.9643074909738936e-06, + "loss": 0.1847, + "mean_token_accuracy": 0.9613538563251496, + "num_tokens": 840076176.0, + "step": 7900 + }, + { + "entropy": 1.38703125, + "epoch": 0.19465256353753488, + "grad_norm": 2.3125, + "learning_rate": 5.963367449707793e-06, + "loss": 0.1815, + "mean_token_accuracy": 0.9614927160739899, + "num_tokens": 845350867.0, + "step": 7950 + }, + { + "entropy": 1.39875, + "epoch": 0.19587679349689047, + "grad_norm": 1.8359375, + "learning_rate": 5.962415265808872e-06, + "loss": 0.1921, + "mean_token_accuracy": 0.9596588695049286, + "num_tokens": 850547684.0, + "step": 8000 + }, + { + "entropy": 1.3890625, + "epoch": 0.19710102345624603, + "grad_norm": 2.6875, + "learning_rate": 5.961450943178779e-06, + "loss": 0.1915, + "mean_token_accuracy": 0.9603916919231414, + "num_tokens": 855721426.0, + "step": 8050 + }, + { + "entropy": 1.37421875, + "epoch": 0.1983252534156016, + "grad_norm": 2.734375, + "learning_rate": 5.960474485768902e-06, + "loss": 0.1722, + "mean_token_accuracy": 0.963141576051712, + "num_tokens": 860509090.0, + "step": 8100 + }, + { + "entropy": 1.34984375, + "epoch": 0.19954948337495715, + "grad_norm": 2.109375, + "learning_rate": 5.959485897580353e-06, + "loss": 0.1799, + "mean_token_accuracy": 0.9624167239665985, + "num_tokens": 865732499.0, + "step": 8150 + }, + { + "entropy": 1.37765625, + "epoch": 0.2007737133343127, + "grad_norm": 2.875, + "learning_rate": 5.95848518266395e-06, + "loss": 0.1955, + "mean_token_accuracy": 0.9592999804019928, + "num_tokens": 870715442.0, + "step": 8200 + }, + { + "entropy": 1.3496875, + "epoch": 0.20199794329366827, + "grad_norm": 1.8359375, + "learning_rate": 5.957472345120202e-06, + "loss": 0.1826, + "mean_token_accuracy": 0.9611281609535217, + "num_tokens": 875976771.0, + "step": 8250 + }, + { + "entropy": 1.331875, + "epoch": 0.20322217325302386, + "grad_norm": 2.34375, + "learning_rate": 5.95644738909929e-06, + "loss": 0.1801, + "mean_token_accuracy": 0.9619064545631408, + "num_tokens": 881030532.0, + "step": 8300 + }, + { + "entropy": 1.33828125, + "epoch": 0.20444640321237942, + "grad_norm": 2.3125, + "learning_rate": 5.9554103188010544e-06, + "loss": 0.1844, + "mean_token_accuracy": 0.9607453966140747, + "num_tokens": 886102364.0, + "step": 8350 + }, + { + "entropy": 1.33625, + "epoch": 0.20567063317173498, + "grad_norm": 2.59375, + "learning_rate": 5.9543611384749716e-06, + "loss": 0.1896, + "mean_token_accuracy": 0.9599519455432892, + "num_tokens": 891339628.0, + "step": 8400 + }, + { + "entropy": 1.3515625, + "epoch": 0.20689486313109054, + "grad_norm": 3.1875, + "learning_rate": 5.953299852420142e-06, + "loss": 0.1963, + "mean_token_accuracy": 0.9594342112541199, + "num_tokens": 896598491.0, + "step": 8450 + }, + { + "entropy": 1.3475, + "epoch": 0.2081190930904461, + "grad_norm": 1.6171875, + "learning_rate": 5.952226464985268e-06, + "loss": 0.1876, + "mean_token_accuracy": 0.9601819491386414, + "num_tokens": 901857034.0, + "step": 8500 + }, + { + "entropy": 1.34546875, + "epoch": 0.2093433230498017, + "grad_norm": 2.484375, + "learning_rate": 5.951140980568639e-06, + "loss": 0.2025, + "mean_token_accuracy": 0.9580735051631928, + "num_tokens": 907672007.0, + "step": 8550 + }, + { + "entropy": 1.3434375, + "epoch": 0.21056755300915725, + "grad_norm": 2.859375, + "learning_rate": 5.950043403618116e-06, + "loss": 0.182, + "mean_token_accuracy": 0.9620107614994049, + "num_tokens": 912959621.0, + "step": 8600 + }, + { + "entropy": 1.34140625, + "epoch": 0.2117917829685128, + "grad_norm": 2.015625, + "learning_rate": 5.948933738631106e-06, + "loss": 0.182, + "mean_token_accuracy": 0.9617352223396302, + "num_tokens": 918075673.0, + "step": 8650 + }, + { + "entropy": 1.3446875, + "epoch": 0.21301601292786837, + "grad_norm": 2.625, + "learning_rate": 5.9478119901545485e-06, + "loss": 0.1863, + "mean_token_accuracy": 0.960466115474701, + "num_tokens": 923511470.0, + "step": 8700 + }, + { + "entropy": 1.3490625, + "epoch": 0.21424024288722393, + "grad_norm": 2.4375, + "learning_rate": 5.946678162784898e-06, + "loss": 0.1997, + "mean_token_accuracy": 0.9574442803859711, + "num_tokens": 929168035.0, + "step": 8750 + }, + { + "entropy": 1.3559375, + "epoch": 0.2154644728465795, + "grad_norm": 2.59375, + "learning_rate": 5.945532261168101e-06, + "loss": 0.188, + "mean_token_accuracy": 0.9608505368232727, + "num_tokens": 934643696.0, + "step": 8800 + }, + { + "entropy": 1.37, + "epoch": 0.21668870280593508, + "grad_norm": 2.84375, + "learning_rate": 5.9443742899995815e-06, + "loss": 0.1987, + "mean_token_accuracy": 0.9590060126781463, + "num_tokens": 940012909.0, + "step": 8850 + }, + { + "entropy": 1.360625, + "epoch": 0.21791293276529064, + "grad_norm": 2.28125, + "learning_rate": 5.943204254024216e-06, + "loss": 0.1835, + "mean_token_accuracy": 0.9617989957332611, + "num_tokens": 945384360.0, + "step": 8900 + }, + { + "entropy": 1.3675, + "epoch": 0.2191371627246462, + "grad_norm": 3.03125, + "learning_rate": 5.942022158036322e-06, + "loss": 0.1955, + "mean_token_accuracy": 0.9601530432701111, + "num_tokens": 950833742.0, + "step": 8950 + }, + { + "entropy": 1.38125, + "epoch": 0.22036139268400176, + "grad_norm": 2.578125, + "learning_rate": 5.9408280068796286e-06, + "loss": 0.2066, + "mean_token_accuracy": 0.9570643317699432, + "num_tokens": 956401892.0, + "step": 9000 + }, + { + "entropy": 1.37234375, + "epoch": 0.22158562264335732, + "grad_norm": 1.71875, + "learning_rate": 5.939621805447267e-06, + "loss": 0.1804, + "mean_token_accuracy": 0.9623953711986541, + "num_tokens": 961223140.0, + "step": 9050 + }, + { + "entropy": 1.391875, + "epoch": 0.22280985260271288, + "grad_norm": 2.15625, + "learning_rate": 5.938403558681743e-06, + "loss": 0.202, + "mean_token_accuracy": 0.9580870044231414, + "num_tokens": 966771629.0, + "step": 9100 + }, + { + "entropy": 1.36703125, + "epoch": 0.22403408256206847, + "grad_norm": 2.609375, + "learning_rate": 5.9371732715749175e-06, + "loss": 0.1866, + "mean_token_accuracy": 0.9609157121181489, + "num_tokens": 972305399.0, + "step": 9150 + }, + { + "entropy": 1.35140625, + "epoch": 0.22525831252142403, + "grad_norm": 1.6796875, + "learning_rate": 5.935930949167991e-06, + "loss": 0.1815, + "mean_token_accuracy": 0.9617423331737518, + "num_tokens": 977370470.0, + "step": 9200 + }, + { + "entropy": 1.36953125, + "epoch": 0.2264825424807796, + "grad_norm": 2.140625, + "learning_rate": 5.934676596551477e-06, + "loss": 0.1884, + "mean_token_accuracy": 0.9609754991531372, + "num_tokens": 982652269.0, + "step": 9250 + }, + { + "entropy": 1.363125, + "epoch": 0.22770677244013515, + "grad_norm": 2.484375, + "learning_rate": 5.933410218865186e-06, + "loss": 0.1858, + "mean_token_accuracy": 0.9611955726146698, + "num_tokens": 988014138.0, + "step": 9300 + }, + { + "entropy": 1.37265625, + "epoch": 0.2289310023994907, + "grad_norm": 2.53125, + "learning_rate": 5.932131821298198e-06, + "loss": 0.1856, + "mean_token_accuracy": 0.9616758930683136, + "num_tokens": 993370242.0, + "step": 9350 + }, + { + "entropy": 1.38515625, + "epoch": 0.23015523235884627, + "grad_norm": 2.34375, + "learning_rate": 5.930841409088853e-06, + "loss": 0.1906, + "mean_token_accuracy": 0.9603582990169525, + "num_tokens": 998918502.0, + "step": 9400 + }, + { + "entropy": 1.39, + "epoch": 0.23137946231820186, + "grad_norm": 2.578125, + "learning_rate": 5.929538987524712e-06, + "loss": 0.1854, + "mean_token_accuracy": 0.9604568040370941, + "num_tokens": 1004326538.0, + "step": 9450 + }, + { + "entropy": 1.3890625, + "epoch": 0.23260369227755742, + "grad_norm": 2.75, + "learning_rate": 5.928224561942554e-06, + "loss": 0.1812, + "mean_token_accuracy": 0.9616895508766174, + "num_tokens": 1009603548.0, + "step": 9500 + }, + { + "entropy": 1.3871875, + "epoch": 0.23382792223691298, + "grad_norm": 2.3125, + "learning_rate": 5.92689813772834e-06, + "loss": 0.1963, + "mean_token_accuracy": 0.9590861582756043, + "num_tokens": 1015070964.0, + "step": 9550 + }, + { + "entropy": 1.36609375, + "epoch": 0.23505215219626854, + "grad_norm": 3.109375, + "learning_rate": 5.9255597203172e-06, + "loss": 0.1828, + "mean_token_accuracy": 0.9619620275497437, + "num_tokens": 1020492153.0, + "step": 9600 + }, + { + "entropy": 1.38609375, + "epoch": 0.2362763821556241, + "grad_norm": 2.421875, + "learning_rate": 5.924209315193405e-06, + "loss": 0.1845, + "mean_token_accuracy": 0.961515667438507, + "num_tokens": 1025864529.0, + "step": 9650 + }, + { + "entropy": 1.3715625, + "epoch": 0.23750061211497966, + "grad_norm": 2.296875, + "learning_rate": 5.922846927890345e-06, + "loss": 0.1797, + "mean_token_accuracy": 0.9618804860115051, + "num_tokens": 1031024359.0, + "step": 9700 + }, + { + "entropy": 1.36359375, + "epoch": 0.23872484207433525, + "grad_norm": 2.46875, + "learning_rate": 5.9214725639905115e-06, + "loss": 0.1863, + "mean_token_accuracy": 0.9610350334644318, + "num_tokens": 1036377471.0, + "step": 9750 + }, + { + "entropy": 1.3715625, + "epoch": 0.2399490720336908, + "grad_norm": 2.859375, + "learning_rate": 5.92008622912547e-06, + "loss": 0.1831, + "mean_token_accuracy": 0.9612818145751953, + "num_tokens": 1041703688.0, + "step": 9800 + }, + { + "entropy": 1.35671875, + "epoch": 0.24117330199304637, + "grad_norm": 2.6875, + "learning_rate": 5.918687928975836e-06, + "loss": 0.1839, + "mean_token_accuracy": 0.9616091656684875, + "num_tokens": 1046917985.0, + "step": 9850 + }, + { + "entropy": 1.39015625, + "epoch": 0.24239753195240193, + "grad_norm": 1.8046875, + "learning_rate": 5.9172776692712575e-06, + "loss": 0.1965, + "mean_token_accuracy": 0.9584881782531738, + "num_tokens": 1052482737.0, + "step": 9900 + }, + { + "entropy": 1.38703125, + "epoch": 0.2436217619117575, + "grad_norm": 2.6875, + "learning_rate": 5.915855455790381e-06, + "loss": 0.1884, + "mean_token_accuracy": 0.9608153140544892, + "num_tokens": 1057868410.0, + "step": 9950 + }, + { + "entropy": 1.395, + "epoch": 0.24484599187111308, + "grad_norm": 2.8125, + "learning_rate": 5.914421294360843e-06, + "loss": 0.1904, + "mean_token_accuracy": 0.9597806739807129, + "num_tokens": 1063175179.0, + "step": 10000 + }, + { + "epoch": 0.24484599187111308, + "eval_entropy": 1.359765625, + "eval_loss": 0.20250044763088226, + "eval_mean_token_accuracy": 0.9580152039726575, + "eval_num_tokens": 1063175179.0, + "eval_runtime": 600.0597, + "eval_samples_per_second": 16.092, + "eval_steps_per_second": 0.202, + "step": 10000 + }, + { + "entropy": 1.3840625, + "epoch": 0.24607022183046864, + "grad_norm": 2.28125, + "learning_rate": 5.912975190859232e-06, + "loss": 0.195, + "mean_token_accuracy": 0.9596641564369202, + "num_tokens": 1068741854.0, + "step": 10050 + }, + { + "entropy": 1.3790625, + "epoch": 0.2472944517898242, + "grad_norm": 2.484375, + "learning_rate": 5.9115171512110714e-06, + "loss": 0.1854, + "mean_token_accuracy": 0.9604480576515197, + "num_tokens": 1074116479.0, + "step": 10100 + }, + { + "entropy": 1.36453125, + "epoch": 0.24851868174917977, + "grad_norm": 2.171875, + "learning_rate": 5.910047181390794e-06, + "loss": 0.1697, + "mean_token_accuracy": 0.9642793035507202, + "num_tokens": 1079159902.0, + "step": 10150 + }, + { + "entropy": 1.373125, + "epoch": 0.24974291170853533, + "grad_norm": 1.9765625, + "learning_rate": 5.908565287421718e-06, + "loss": 0.1861, + "mean_token_accuracy": 0.9611909198760986, + "num_tokens": 1084521049.0, + "step": 10200 + }, + { + "entropy": 1.3578125, + "epoch": 0.2509671416678909, + "grad_norm": 2.65625, + "learning_rate": 5.907071475376021e-06, + "loss": 0.1787, + "mean_token_accuracy": 0.9620854771137237, + "num_tokens": 1089493722.0, + "step": 10250 + }, + { + "entropy": 1.36484375, + "epoch": 0.2521913716272465, + "grad_norm": 2.640625, + "learning_rate": 5.905565751374717e-06, + "loss": 0.1732, + "mean_token_accuracy": 0.9639436435699463, + "num_tokens": 1094338571.0, + "step": 10300 + }, + { + "entropy": 1.37234375, + "epoch": 0.25341560158660204, + "grad_norm": 2.5625, + "learning_rate": 5.904048121587628e-06, + "loss": 0.1772, + "mean_token_accuracy": 0.9625762343406677, + "num_tokens": 1099742354.0, + "step": 10350 + }, + { + "entropy": 1.38359375, + "epoch": 0.2546398315459576, + "grad_norm": 1.5078125, + "learning_rate": 5.902518592233363e-06, + "loss": 0.1987, + "mean_token_accuracy": 0.9577878427505493, + "num_tokens": 1105617487.0, + "step": 10400 + }, + { + "entropy": 1.3615625, + "epoch": 0.25586406150531316, + "grad_norm": 3.234375, + "learning_rate": 5.9009771695792905e-06, + "loss": 0.1811, + "mean_token_accuracy": 0.9621189975738526, + "num_tokens": 1110680544.0, + "step": 10450 + }, + { + "entropy": 1.37375, + "epoch": 0.2570882914646687, + "grad_norm": 2.140625, + "learning_rate": 5.899423859941511e-06, + "loss": 0.1882, + "mean_token_accuracy": 0.9606586790084839, + "num_tokens": 1116178837.0, + "step": 10500 + }, + { + "entropy": 1.37484375, + "epoch": 0.2583125214240243, + "grad_norm": 1.7578125, + "learning_rate": 5.897858669684833e-06, + "loss": 0.1893, + "mean_token_accuracy": 0.9598471677303314, + "num_tokens": 1121511467.0, + "step": 10550 + }, + { + "entropy": 1.3609375, + "epoch": 0.25953675138337984, + "grad_norm": 2.078125, + "learning_rate": 5.896281605222749e-06, + "loss": 0.1806, + "mean_token_accuracy": 0.9624120283126831, + "num_tokens": 1126507233.0, + "step": 10600 + }, + { + "entropy": 1.34734375, + "epoch": 0.2607609813427354, + "grad_norm": 2.28125, + "learning_rate": 5.8946926730174045e-06, + "loss": 0.1863, + "mean_token_accuracy": 0.9608824181556702, + "num_tokens": 1131912464.0, + "step": 10650 + }, + { + "entropy": 1.33921875, + "epoch": 0.26198521130209096, + "grad_norm": 2.5625, + "learning_rate": 5.893091879579575e-06, + "loss": 0.1856, + "mean_token_accuracy": 0.9607326745986938, + "num_tokens": 1136882208.0, + "step": 10700 + }, + { + "entropy": 1.343125, + "epoch": 0.2632094412614466, + "grad_norm": 1.9921875, + "learning_rate": 5.89147923146864e-06, + "loss": 0.1813, + "mean_token_accuracy": 0.9620126748085022, + "num_tokens": 1142095292.0, + "step": 10750 + }, + { + "entropy": 1.34765625, + "epoch": 0.26443367122080214, + "grad_norm": 3.234375, + "learning_rate": 5.889854735292551e-06, + "loss": 0.1841, + "mean_token_accuracy": 0.9618128108978271, + "num_tokens": 1147363920.0, + "step": 10800 + }, + { + "entropy": 1.356875, + "epoch": 0.2656579011801577, + "grad_norm": 2.46875, + "learning_rate": 5.888218397707811e-06, + "loss": 0.1742, + "mean_token_accuracy": 0.9638459277153015, + "num_tokens": 1152380705.0, + "step": 10850 + }, + { + "entropy": 1.32984375, + "epoch": 0.26688213113951326, + "grad_norm": 2.109375, + "learning_rate": 5.886570225419441e-06, + "loss": 0.1865, + "mean_token_accuracy": 0.9608019030094147, + "num_tokens": 1157839898.0, + "step": 10900 + }, + { + "entropy": 1.34609375, + "epoch": 0.2681063610988688, + "grad_norm": 3.453125, + "learning_rate": 5.88491022518096e-06, + "loss": 0.1918, + "mean_token_accuracy": 0.9609634006023406, + "num_tokens": 1163068506.0, + "step": 10950 + }, + { + "entropy": 1.32734375, + "epoch": 0.2693305910582244, + "grad_norm": 2.125, + "learning_rate": 5.883238403794349e-06, + "loss": 0.1758, + "mean_token_accuracy": 0.9633646559715271, + "num_tokens": 1168287852.0, + "step": 11000 + }, + { + "entropy": 1.34375, + "epoch": 0.27055482101757994, + "grad_norm": 2.296875, + "learning_rate": 5.881554768110028e-06, + "loss": 0.1914, + "mean_token_accuracy": 0.9605349290370941, + "num_tokens": 1173597061.0, + "step": 11050 + }, + { + "entropy": 1.3434375, + "epoch": 0.2717790509769355, + "grad_norm": 3.5, + "learning_rate": 5.879859325026828e-06, + "loss": 0.1864, + "mean_token_accuracy": 0.9604840254783631, + "num_tokens": 1178845621.0, + "step": 11100 + }, + { + "entropy": 1.35984375, + "epoch": 0.27300328093629106, + "grad_norm": 2.734375, + "learning_rate": 5.878152081491963e-06, + "loss": 0.1925, + "mean_token_accuracy": 0.9589577269554138, + "num_tokens": 1184054388.0, + "step": 11150 + }, + { + "entropy": 1.34875, + "epoch": 0.2742275108956466, + "grad_norm": 2.625, + "learning_rate": 5.876433044500996e-06, + "loss": 0.1921, + "mean_token_accuracy": 0.9595346593856812, + "num_tokens": 1189697396.0, + "step": 11200 + }, + { + "entropy": 1.34390625, + "epoch": 0.2754517408550022, + "grad_norm": 2.0, + "learning_rate": 5.874702221097819e-06, + "loss": 0.1882, + "mean_token_accuracy": 0.960370112657547, + "num_tokens": 1195166226.0, + "step": 11250 + }, + { + "entropy": 1.34515625, + "epoch": 0.2766759708143578, + "grad_norm": 2.734375, + "learning_rate": 5.8729596183746175e-06, + "loss": 0.1805, + "mean_token_accuracy": 0.9621370649337768, + "num_tokens": 1200392905.0, + "step": 11300 + }, + { + "entropy": 1.3428125, + "epoch": 0.27790020077371336, + "grad_norm": 3.078125, + "learning_rate": 5.871205243471844e-06, + "loss": 0.1841, + "mean_token_accuracy": 0.9613085889816284, + "num_tokens": 1205618541.0, + "step": 11350 + }, + { + "entropy": 1.35171875, + "epoch": 0.2791244307330689, + "grad_norm": 3.40625, + "learning_rate": 5.869439103578189e-06, + "loss": 0.1852, + "mean_token_accuracy": 0.9616814315319061, + "num_tokens": 1210836329.0, + "step": 11400 + }, + { + "entropy": 1.3453125, + "epoch": 0.2803486606924245, + "grad_norm": 1.8359375, + "learning_rate": 5.867661205930549e-06, + "loss": 0.1821, + "mean_token_accuracy": 0.9620612812042236, + "num_tokens": 1215867506.0, + "step": 11450 + }, + { + "entropy": 1.35875, + "epoch": 0.28157289065178004, + "grad_norm": 2.953125, + "learning_rate": 5.865871557814003e-06, + "loss": 0.1915, + "mean_token_accuracy": 0.9604600322246551, + "num_tokens": 1220793244.0, + "step": 11500 + }, + { + "entropy": 1.353125, + "epoch": 0.2827971206111356, + "grad_norm": 2.796875, + "learning_rate": 5.864070166561775e-06, + "loss": 0.1937, + "mean_token_accuracy": 0.9599918603897095, + "num_tokens": 1226305868.0, + "step": 11550 + }, + { + "entropy": 1.394375, + "epoch": 0.28402135057049116, + "grad_norm": 2.046875, + "learning_rate": 5.862257039555207e-06, + "loss": 0.1991, + "mean_token_accuracy": 0.9583842658996582, + "num_tokens": 1232013095.0, + "step": 11600 + }, + { + "entropy": 1.37578125, + "epoch": 0.2852455805298467, + "grad_norm": 2.015625, + "learning_rate": 5.860432184223731e-06, + "loss": 0.1913, + "mean_token_accuracy": 0.9596893274784088, + "num_tokens": 1237458606.0, + "step": 11650 + }, + { + "entropy": 1.35703125, + "epoch": 0.2864698104892023, + "grad_norm": 2.09375, + "learning_rate": 5.858595608044837e-06, + "loss": 0.1835, + "mean_token_accuracy": 0.9611952984333039, + "num_tokens": 1242972251.0, + "step": 11700 + }, + { + "entropy": 1.37078125, + "epoch": 0.28769404044855784, + "grad_norm": 3.1875, + "learning_rate": 5.856747318544041e-06, + "loss": 0.1865, + "mean_token_accuracy": 0.9609648621082306, + "num_tokens": 1248318638.0, + "step": 11750 + }, + { + "entropy": 1.365, + "epoch": 0.2889182704079134, + "grad_norm": 2.15625, + "learning_rate": 5.854887323294856e-06, + "loss": 0.183, + "mean_token_accuracy": 0.9627510058879852, + "num_tokens": 1253680002.0, + "step": 11800 + }, + { + "entropy": 1.37578125, + "epoch": 0.29014250036726896, + "grad_norm": 1.8828125, + "learning_rate": 5.853015629918759e-06, + "loss": 0.1862, + "mean_token_accuracy": 0.9614068794250489, + "num_tokens": 1258924764.0, + "step": 11850 + }, + { + "entropy": 1.37796875, + "epoch": 0.2913667303266246, + "grad_norm": 1.90625, + "learning_rate": 5.8511322460851624e-06, + "loss": 0.1832, + "mean_token_accuracy": 0.9620686209201813, + "num_tokens": 1264051390.0, + "step": 11900 + }, + { + "entropy": 1.37328125, + "epoch": 0.29259096028598014, + "grad_norm": 2.3125, + "learning_rate": 5.849237179511381e-06, + "loss": 0.1769, + "mean_token_accuracy": 0.9628199970722199, + "num_tokens": 1269148836.0, + "step": 11950 + }, + { + "entropy": 1.376875, + "epoch": 0.2938151902453357, + "grad_norm": 3.125, + "learning_rate": 5.8473304379626e-06, + "loss": 0.1871, + "mean_token_accuracy": 0.9601672506332397, + "num_tokens": 1274348582.0, + "step": 12000 + }, + { + "entropy": 1.35203125, + "epoch": 0.29503942020469126, + "grad_norm": 2.46875, + "learning_rate": 5.845412029251843e-06, + "loss": 0.1796, + "mean_token_accuracy": 0.9622039210796356, + "num_tokens": 1279184908.0, + "step": 12050 + }, + { + "entropy": 1.35859375, + "epoch": 0.2962636501640468, + "grad_norm": 2.921875, + "learning_rate": 5.843481961239942e-06, + "loss": 0.1772, + "mean_token_accuracy": 0.9627481973171235, + "num_tokens": 1284410532.0, + "step": 12100 + }, + { + "entropy": 1.35953125, + "epoch": 0.2974878801234024, + "grad_norm": 7.40625, + "learning_rate": 5.841540241835504e-06, + "loss": 0.1768, + "mean_token_accuracy": 0.9626896047592163, + "num_tokens": 1289768837.0, + "step": 12150 + }, + { + "entropy": 1.378125, + "epoch": 0.29871211008275794, + "grad_norm": 2.3125, + "learning_rate": 5.8395868789948775e-06, + "loss": 0.1848, + "mean_token_accuracy": 0.9612694227695465, + "num_tokens": 1295005247.0, + "step": 12200 + }, + { + "entropy": 1.37359375, + "epoch": 0.2999363400421135, + "grad_norm": 2.34375, + "learning_rate": 5.837621880722122e-06, + "loss": 0.1909, + "mean_token_accuracy": 0.9603909432888031, + "num_tokens": 1300316507.0, + "step": 12250 + }, + { + "entropy": 1.35953125, + "epoch": 0.30116057000146906, + "grad_norm": 2.75, + "learning_rate": 5.835645255068973e-06, + "loss": 0.1838, + "mean_token_accuracy": 0.9617878496646881, + "num_tokens": 1305931141.0, + "step": 12300 + }, + { + "entropy": 1.34640625, + "epoch": 0.3023847999608246, + "grad_norm": 2.375, + "learning_rate": 5.8336570101348115e-06, + "loss": 0.1651, + "mean_token_accuracy": 0.9648260760307312, + "num_tokens": 1310803906.0, + "step": 12350 + }, + { + "entropy": 1.358125, + "epoch": 0.3036090299201802, + "grad_norm": 2.84375, + "learning_rate": 5.831657154066629e-06, + "loss": 0.1827, + "mean_token_accuracy": 0.9618698525428772, + "num_tokens": 1315973080.0, + "step": 12400 + }, + { + "entropy": 1.35328125, + "epoch": 0.30483325987953575, + "grad_norm": 3.578125, + "learning_rate": 5.829645695058992e-06, + "loss": 0.1747, + "mean_token_accuracy": 0.9627145206928254, + "num_tokens": 1321381888.0, + "step": 12450 + }, + { + "entropy": 1.37859375, + "epoch": 0.30605748983889136, + "grad_norm": 2.609375, + "learning_rate": 5.827622641354014e-06, + "loss": 0.1787, + "mean_token_accuracy": 0.9626282620429992, + "num_tokens": 1326557068.0, + "step": 12500 + }, + { + "entropy": 1.3759375, + "epoch": 0.3072817197982469, + "grad_norm": 2.328125, + "learning_rate": 5.825588001241318e-06, + "loss": 0.1912, + "mean_token_accuracy": 0.9598784649372101, + "num_tokens": 1332216024.0, + "step": 12550 + }, + { + "entropy": 1.35890625, + "epoch": 0.3085059497576025, + "grad_norm": 1.8359375, + "learning_rate": 5.823541783058005e-06, + "loss": 0.174, + "mean_token_accuracy": 0.962734831571579, + "num_tokens": 1337390329.0, + "step": 12600 + }, + { + "entropy": 1.37375, + "epoch": 0.30973017971695804, + "grad_norm": 2.140625, + "learning_rate": 5.821483995188612e-06, + "loss": 0.1881, + "mean_token_accuracy": 0.9605675613880158, + "num_tokens": 1343045143.0, + "step": 12650 + }, + { + "entropy": 1.3415625, + "epoch": 0.3109544096763136, + "grad_norm": 2.28125, + "learning_rate": 5.81941464606509e-06, + "loss": 0.1666, + "mean_token_accuracy": 0.9643463969230652, + "num_tokens": 1348034262.0, + "step": 12700 + }, + { + "entropy": 1.3440625, + "epoch": 0.31217863963566916, + "grad_norm": 2.96875, + "learning_rate": 5.817333744166762e-06, + "loss": 0.1921, + "mean_token_accuracy": 0.9586631393432617, + "num_tokens": 1353723053.0, + "step": 12750 + }, + { + "entropy": 1.3721875, + "epoch": 0.3134028695950247, + "grad_norm": 2.203125, + "learning_rate": 5.815241298020286e-06, + "loss": 0.1846, + "mean_token_accuracy": 0.9600662136077881, + "num_tokens": 1358674728.0, + "step": 12800 + }, + { + "entropy": 1.365625, + "epoch": 0.3146270995543803, + "grad_norm": 2.6875, + "learning_rate": 5.813137316199628e-06, + "loss": 0.1835, + "mean_token_accuracy": 0.961768034696579, + "num_tokens": 1363933473.0, + "step": 12850 + }, + { + "entropy": 1.38015625, + "epoch": 0.31585132951373585, + "grad_norm": 2.5, + "learning_rate": 5.811021807326018e-06, + "loss": 0.1982, + "mean_token_accuracy": 0.9590709102153778, + "num_tokens": 1369281803.0, + "step": 12900 + }, + { + "entropy": 1.37, + "epoch": 0.3170755594730914, + "grad_norm": 3.03125, + "learning_rate": 5.808894780067923e-06, + "loss": 0.1949, + "mean_token_accuracy": 0.9586555528640747, + "num_tokens": 1374853145.0, + "step": 12950 + }, + { + "entropy": 1.36421875, + "epoch": 0.31829978943244697, + "grad_norm": 2.015625, + "learning_rate": 5.8067562431410045e-06, + "loss": 0.171, + "mean_token_accuracy": 0.9631958258152008, + "num_tokens": 1379934830.0, + "step": 13000 + }, + { + "entropy": 1.3609375, + "epoch": 0.3195240193918026, + "grad_norm": 2.09375, + "learning_rate": 5.804606205308088e-06, + "loss": 0.1841, + "mean_token_accuracy": 0.9605684506893158, + "num_tokens": 1385105704.0, + "step": 13050 + }, + { + "entropy": 1.37671875, + "epoch": 0.32074824935115814, + "grad_norm": 2.875, + "learning_rate": 5.802444675379122e-06, + "loss": 0.1947, + "mean_token_accuracy": 0.9595759809017181, + "num_tokens": 1390581041.0, + "step": 13100 + }, + { + "entropy": 1.37828125, + "epoch": 0.3219724793105137, + "grad_norm": 1.9453125, + "learning_rate": 5.8002716622111485e-06, + "loss": 0.1858, + "mean_token_accuracy": 0.9617175209522247, + "num_tokens": 1395850769.0, + "step": 13150 + }, + { + "entropy": 1.365, + "epoch": 0.32319670926986926, + "grad_norm": 2.515625, + "learning_rate": 5.79808717470826e-06, + "loss": 0.1676, + "mean_token_accuracy": 0.9655633735656738, + "num_tokens": 1400935540.0, + "step": 13200 + }, + { + "entropy": 1.3709375, + "epoch": 0.3244209392292248, + "grad_norm": 2.421875, + "learning_rate": 5.795891221821569e-06, + "loss": 0.1807, + "mean_token_accuracy": 0.9624592447280884, + "num_tokens": 1406376315.0, + "step": 13250 + }, + { + "entropy": 1.34875, + "epoch": 0.3256451691885804, + "grad_norm": 3.09375, + "learning_rate": 5.793683812549162e-06, + "loss": 0.1727, + "mean_token_accuracy": 0.9637568819522858, + "num_tokens": 1411533562.0, + "step": 13300 + }, + { + "entropy": 1.36421875, + "epoch": 0.32686939914793595, + "grad_norm": 2.703125, + "learning_rate": 5.791464955936077e-06, + "loss": 0.1938, + "mean_token_accuracy": 0.9592576730251312, + "num_tokens": 1417402528.0, + "step": 13350 + }, + { + "entropy": 1.36109375, + "epoch": 0.3280936291072915, + "grad_norm": 1.7109375, + "learning_rate": 5.789234661074254e-06, + "loss": 0.1744, + "mean_token_accuracy": 0.9627709448337555, + "num_tokens": 1422622878.0, + "step": 13400 + }, + { + "entropy": 1.3790625, + "epoch": 0.32931785906664707, + "grad_norm": 2.421875, + "learning_rate": 5.786992937102503e-06, + "loss": 0.1959, + "mean_token_accuracy": 0.9586515820026398, + "num_tokens": 1427838914.0, + "step": 13450 + }, + { + "entropy": 1.36, + "epoch": 0.33054208902600263, + "grad_norm": 3.140625, + "learning_rate": 5.784739793206464e-06, + "loss": 0.1794, + "mean_token_accuracy": 0.9625478911399842, + "num_tokens": 1432973891.0, + "step": 13500 + }, + { + "entropy": 1.37546875, + "epoch": 0.3317663189853582, + "grad_norm": 2.875, + "learning_rate": 5.782475238618574e-06, + "loss": 0.1952, + "mean_token_accuracy": 0.958906524181366, + "num_tokens": 1438425313.0, + "step": 13550 + }, + { + "entropy": 1.39109375, + "epoch": 0.33299054894471375, + "grad_norm": 3.15625, + "learning_rate": 5.780199282618026e-06, + "loss": 0.1937, + "mean_token_accuracy": 0.9599265992641449, + "num_tokens": 1443930223.0, + "step": 13600 + }, + { + "entropy": 1.3784375, + "epoch": 0.33421477890406937, + "grad_norm": 2.359375, + "learning_rate": 5.777911934530726e-06, + "loss": 0.1896, + "mean_token_accuracy": 0.9606879663467407, + "num_tokens": 1449235492.0, + "step": 13650 + }, + { + "entropy": 1.3740625, + "epoch": 0.3354390088634249, + "grad_norm": 2.25, + "learning_rate": 5.7756132037292665e-06, + "loss": 0.1845, + "mean_token_accuracy": 0.9607800352573395, + "num_tokens": 1454874971.0, + "step": 13700 + }, + { + "entropy": 1.3565625, + "epoch": 0.3366632388227805, + "grad_norm": 3.0, + "learning_rate": 5.77330309963288e-06, + "loss": 0.1664, + "mean_token_accuracy": 0.9650224351882934, + "num_tokens": 1459910564.0, + "step": 13750 + }, + { + "entropy": 1.3896875, + "epoch": 0.33788746878213605, + "grad_norm": 2.6875, + "learning_rate": 5.7709816317074e-06, + "loss": 0.1852, + "mean_token_accuracy": 0.9610321772098541, + "num_tokens": 1465214852.0, + "step": 13800 + }, + { + "entropy": 1.3609375, + "epoch": 0.3391116987414916, + "grad_norm": 2.421875, + "learning_rate": 5.768648809465223e-06, + "loss": 0.173, + "mean_token_accuracy": 0.9646092760562897, + "num_tokens": 1470405224.0, + "step": 13850 + }, + { + "entropy": 1.3671875, + "epoch": 0.34033592870084717, + "grad_norm": 2.421875, + "learning_rate": 5.766304642465277e-06, + "loss": 0.1684, + "mean_token_accuracy": 0.964150664806366, + "num_tokens": 1475222511.0, + "step": 13900 + }, + { + "entropy": 1.3615625, + "epoch": 0.34156015866020273, + "grad_norm": 2.015625, + "learning_rate": 5.763949140312969e-06, + "loss": 0.1903, + "mean_token_accuracy": 0.9601925635337829, + "num_tokens": 1480884593.0, + "step": 13950 + }, + { + "entropy": 1.35734375, + "epoch": 0.3427843886195583, + "grad_norm": 2.859375, + "learning_rate": 5.7615823126601565e-06, + "loss": 0.1853, + "mean_token_accuracy": 0.9617584705352783, + "num_tokens": 1485873672.0, + "step": 14000 + }, + { + "entropy": 1.37375, + "epoch": 0.34400861857891385, + "grad_norm": 2.3125, + "learning_rate": 5.759204169205102e-06, + "loss": 0.1862, + "mean_token_accuracy": 0.9605587136745453, + "num_tokens": 1490904541.0, + "step": 14050 + }, + { + "entropy": 1.36359375, + "epoch": 0.3452328485382694, + "grad_norm": 2.140625, + "learning_rate": 5.7568147196924395e-06, + "loss": 0.1891, + "mean_token_accuracy": 0.9609455835819244, + "num_tokens": 1496373059.0, + "step": 14100 + }, + { + "entropy": 1.35421875, + "epoch": 0.34645707849762497, + "grad_norm": 0.0322265625, + "learning_rate": 5.754413973913126e-06, + "loss": 0.1673, + "mean_token_accuracy": 0.9642012619972229, + "num_tokens": 1500901681.0, + "step": 14150 + }, + { + "entropy": 1.343125, + "epoch": 0.3476813084569806, + "grad_norm": 1.859375, + "learning_rate": 5.752001941704407e-06, + "loss": 0.1759, + "mean_token_accuracy": 0.9625442051887512, + "num_tokens": 1506040261.0, + "step": 14200 + }, + { + "entropy": 1.36625, + "epoch": 0.34890553841633615, + "grad_norm": 3.0, + "learning_rate": 5.749578632949776e-06, + "loss": 0.1802, + "mean_token_accuracy": 0.9619328999519348, + "num_tokens": 1511536121.0, + "step": 14250 + }, + { + "entropy": 1.356875, + "epoch": 0.3501297683756917, + "grad_norm": 3.703125, + "learning_rate": 5.747144057578932e-06, + "loss": 0.1843, + "mean_token_accuracy": 0.9613735234737396, + "num_tokens": 1516899260.0, + "step": 14300 + }, + { + "entropy": 1.36203125, + "epoch": 0.35135399833504727, + "grad_norm": 3.671875, + "learning_rate": 5.744698225567742e-06, + "loss": 0.1929, + "mean_token_accuracy": 0.9596503937244415, + "num_tokens": 1522277914.0, + "step": 14350 + }, + { + "entropy": 1.35921875, + "epoch": 0.35257822829440283, + "grad_norm": 2.15625, + "learning_rate": 5.742241146938195e-06, + "loss": 0.18, + "mean_token_accuracy": 0.9617201662063599, + "num_tokens": 1527559983.0, + "step": 14400 + }, + { + "entropy": 1.3353125, + "epoch": 0.3538024582537584, + "grad_norm": 3.03125, + "learning_rate": 5.739772831758365e-06, + "loss": 0.171, + "mean_token_accuracy": 0.9635174345970153, + "num_tokens": 1532501983.0, + "step": 14450 + }, + { + "entropy": 1.37234375, + "epoch": 0.35502668821311395, + "grad_norm": 1.84375, + "learning_rate": 5.737293290142369e-06, + "loss": 0.1957, + "mean_token_accuracy": 0.9595348858833312, + "num_tokens": 1538384868.0, + "step": 14500 + }, + { + "entropy": 1.36453125, + "epoch": 0.3562509181724695, + "grad_norm": 2.734375, + "learning_rate": 5.734802532250327e-06, + "loss": 0.1721, + "mean_token_accuracy": 0.9636399447917938, + "num_tokens": 1543550967.0, + "step": 14550 + }, + { + "entropy": 1.36703125, + "epoch": 0.35747514813182507, + "grad_norm": 2.390625, + "learning_rate": 5.7323005682883144e-06, + "loss": 0.1817, + "mean_token_accuracy": 0.9614765977859497, + "num_tokens": 1548814643.0, + "step": 14600 + }, + { + "entropy": 1.37171875, + "epoch": 0.35869937809118063, + "grad_norm": 2.140625, + "learning_rate": 5.729787408508328e-06, + "loss": 0.1854, + "mean_token_accuracy": 0.9606961834430695, + "num_tokens": 1554002337.0, + "step": 14650 + }, + { + "entropy": 1.363125, + "epoch": 0.3599236080505362, + "grad_norm": 2.359375, + "learning_rate": 5.7272630632082385e-06, + "loss": 0.1788, + "mean_token_accuracy": 0.9617051208019256, + "num_tokens": 1558888261.0, + "step": 14700 + }, + { + "entropy": 1.3603125, + "epoch": 0.36114783800989175, + "grad_norm": 1.9609375, + "learning_rate": 5.7247275427317515e-06, + "loss": 0.1882, + "mean_token_accuracy": 0.9613351905345917, + "num_tokens": 1564034699.0, + "step": 14750 + }, + { + "entropy": 1.38765625, + "epoch": 0.36237206796924737, + "grad_norm": 3.90625, + "learning_rate": 5.722180857468361e-06, + "loss": 0.2015, + "mean_token_accuracy": 0.9581510519981384, + "num_tokens": 1569662314.0, + "step": 14800 + }, + { + "entropy": 1.35671875, + "epoch": 0.36359629792860293, + "grad_norm": 1.875, + "learning_rate": 5.719623017853315e-06, + "loss": 0.1858, + "mean_token_accuracy": 0.9616824269294739, + "num_tokens": 1575167487.0, + "step": 14850 + }, + { + "entropy": 1.36796875, + "epoch": 0.3648205278879585, + "grad_norm": 2.921875, + "learning_rate": 5.7170540343675596e-06, + "loss": 0.1858, + "mean_token_accuracy": 0.9607573926448822, + "num_tokens": 1580657915.0, + "step": 14900 + }, + { + "entropy": 1.3684375, + "epoch": 0.36604475784731405, + "grad_norm": 2.578125, + "learning_rate": 5.714473917537712e-06, + "loss": 0.1771, + "mean_token_accuracy": 0.9625304937362671, + "num_tokens": 1585664001.0, + "step": 14950 + }, + { + "entropy": 1.36109375, + "epoch": 0.3672689878066696, + "grad_norm": 2.546875, + "learning_rate": 5.711882677936003e-06, + "loss": 0.1781, + "mean_token_accuracy": 0.961945322751999, + "num_tokens": 1590920113.0, + "step": 15000 + }, + { + "entropy": 1.3575, + "epoch": 0.36849321776602517, + "grad_norm": 2.3125, + "learning_rate": 5.709280326180242e-06, + "loss": 0.1737, + "mean_token_accuracy": 0.9629940688610077, + "num_tokens": 1596062396.0, + "step": 15050 + }, + { + "entropy": 1.37359375, + "epoch": 0.36971744772538073, + "grad_norm": 2.140625, + "learning_rate": 5.7066668729337725e-06, + "loss": 0.1782, + "mean_token_accuracy": 0.9626081240177154, + "num_tokens": 1601254217.0, + "step": 15100 + }, + { + "entropy": 1.36609375, + "epoch": 0.3709416776847363, + "grad_norm": 2.109375, + "learning_rate": 5.704042328905426e-06, + "loss": 0.1851, + "mean_token_accuracy": 0.9608933937549591, + "num_tokens": 1606561855.0, + "step": 15150 + }, + { + "entropy": 1.34859375, + "epoch": 0.37216590764409185, + "grad_norm": 1.8515625, + "learning_rate": 5.701406704849479e-06, + "loss": 0.1893, + "mean_token_accuracy": 0.9602335524559021, + "num_tokens": 1612223884.0, + "step": 15200 + }, + { + "entropy": 1.36765625, + "epoch": 0.3733901376034474, + "grad_norm": 2.703125, + "learning_rate": 5.69876001156561e-06, + "loss": 0.1837, + "mean_token_accuracy": 0.9612676846981049, + "num_tokens": 1617459423.0, + "step": 15250 + }, + { + "entropy": 1.366875, + "epoch": 0.374614367562803, + "grad_norm": 2.0625, + "learning_rate": 5.696102259898855e-06, + "loss": 0.1895, + "mean_token_accuracy": 0.9605361771583557, + "num_tokens": 1622772691.0, + "step": 15300 + }, + { + "entropy": 1.3678125, + "epoch": 0.37583859752215854, + "grad_norm": 2.21875, + "learning_rate": 5.693433460739561e-06, + "loss": 0.1794, + "mean_token_accuracy": 0.9623438572883606, + "num_tokens": 1627992421.0, + "step": 15350 + }, + { + "entropy": 1.385, + "epoch": 0.37706282748151415, + "grad_norm": 2.15625, + "learning_rate": 5.690753625023344e-06, + "loss": 0.1903, + "mean_token_accuracy": 0.9602718544006348, + "num_tokens": 1633295976.0, + "step": 15400 + }, + { + "entropy": 1.36546875, + "epoch": 0.3782870574408697, + "grad_norm": 2.078125, + "learning_rate": 5.688062763731044e-06, + "loss": 0.2002, + "mean_token_accuracy": 0.9582274675369262, + "num_tokens": 1638988248.0, + "step": 15450 + }, + { + "entropy": 1.35359375, + "epoch": 0.3795112874002253, + "grad_norm": 1.9921875, + "learning_rate": 5.685360887888677e-06, + "loss": 0.1789, + "mean_token_accuracy": 0.9629680168628693, + "num_tokens": 1644498341.0, + "step": 15500 + }, + { + "entropy": 1.369375, + "epoch": 0.38073551735958083, + "grad_norm": 2.65625, + "learning_rate": 5.682648008567394e-06, + "loss": 0.1758, + "mean_token_accuracy": 0.9636906123161316, + "num_tokens": 1649900901.0, + "step": 15550 + }, + { + "entropy": 1.36546875, + "epoch": 0.3819597473189364, + "grad_norm": 2.40625, + "learning_rate": 5.679924136883432e-06, + "loss": 0.1916, + "mean_token_accuracy": 0.9601245021820068, + "num_tokens": 1655743468.0, + "step": 15600 + }, + { + "entropy": 1.37828125, + "epoch": 0.38318397727829195, + "grad_norm": 2.578125, + "learning_rate": 5.677189283998073e-06, + "loss": 0.1755, + "mean_token_accuracy": 0.963598461151123, + "num_tokens": 1660916320.0, + "step": 15650 + }, + { + "entropy": 1.35796875, + "epoch": 0.3844082072376475, + "grad_norm": 2.265625, + "learning_rate": 5.674443461117591e-06, + "loss": 0.1778, + "mean_token_accuracy": 0.9613646280765533, + "num_tokens": 1666271922.0, + "step": 15700 + }, + { + "entropy": 1.3571875, + "epoch": 0.3856324371970031, + "grad_norm": 2.328125, + "learning_rate": 5.671686679493215e-06, + "loss": 0.187, + "mean_token_accuracy": 0.9609103786945343, + "num_tokens": 1671766527.0, + "step": 15750 + }, + { + "entropy": 1.36625, + "epoch": 0.38685666715635864, + "grad_norm": 1.6328125, + "learning_rate": 5.668918950421074e-06, + "loss": 0.1886, + "mean_token_accuracy": 0.9606494891643524, + "num_tokens": 1677165332.0, + "step": 15800 + }, + { + "entropy": 1.3475, + "epoch": 0.3880808971157142, + "grad_norm": 3.046875, + "learning_rate": 5.666140285242158e-06, + "loss": 0.1801, + "mean_token_accuracy": 0.9625120401382447, + "num_tokens": 1682494165.0, + "step": 15850 + }, + { + "entropy": 1.36125, + "epoch": 0.38930512707506976, + "grad_norm": 2.0625, + "learning_rate": 5.663350695342268e-06, + "loss": 0.1892, + "mean_token_accuracy": 0.9604367816448212, + "num_tokens": 1688253134.0, + "step": 15900 + }, + { + "entropy": 1.35328125, + "epoch": 0.3905293570344254, + "grad_norm": 1.6640625, + "learning_rate": 5.660550192151967e-06, + "loss": 0.1845, + "mean_token_accuracy": 0.9621007204055786, + "num_tokens": 1693632232.0, + "step": 15950 + }, + { + "entropy": 1.3690625, + "epoch": 0.39175358699378093, + "grad_norm": 1.8359375, + "learning_rate": 5.657738787146543e-06, + "loss": 0.1885, + "mean_token_accuracy": 0.9610405099391938, + "num_tokens": 1698678337.0, + "step": 16000 + }, + { + "entropy": 1.346875, + "epoch": 0.3929778169531365, + "grad_norm": 2.765625, + "learning_rate": 5.654916491845947e-06, + "loss": 0.1733, + "mean_token_accuracy": 0.9640054357051849, + "num_tokens": 1704187251.0, + "step": 16050 + }, + { + "entropy": 1.35375, + "epoch": 0.39420204691249205, + "grad_norm": 2.46875, + "learning_rate": 5.652083317814759e-06, + "loss": 0.1745, + "mean_token_accuracy": 0.9634167146682739, + "num_tokens": 1709408694.0, + "step": 16100 + }, + { + "entropy": 1.34265625, + "epoch": 0.3954262768718476, + "grad_norm": 2.8125, + "learning_rate": 5.649239276662133e-06, + "loss": 0.1724, + "mean_token_accuracy": 0.963241057395935, + "num_tokens": 1714585157.0, + "step": 16150 + }, + { + "entropy": 1.3303125, + "epoch": 0.3966505068312032, + "grad_norm": 2.578125, + "learning_rate": 5.646384380041755e-06, + "loss": 0.1759, + "mean_token_accuracy": 0.9634040462970733, + "num_tokens": 1719749974.0, + "step": 16200 + }, + { + "entropy": 1.33890625, + "epoch": 0.39787473679055874, + "grad_norm": 2.296875, + "learning_rate": 5.643518639651789e-06, + "loss": 0.1754, + "mean_token_accuracy": 0.963290364742279, + "num_tokens": 1724935979.0, + "step": 16250 + }, + { + "entropy": 1.341875, + "epoch": 0.3990989667499143, + "grad_norm": 3.828125, + "learning_rate": 5.640642067234832e-06, + "loss": 0.1869, + "mean_token_accuracy": 0.9608835780620575, + "num_tokens": 1729904911.0, + "step": 16300 + }, + { + "entropy": 1.3525, + "epoch": 0.40032319670926986, + "grad_norm": 3.015625, + "learning_rate": 5.637754674577869e-06, + "loss": 0.193, + "mean_token_accuracy": 0.9592759358882904, + "num_tokens": 1735603402.0, + "step": 16350 + }, + { + "entropy": 1.33984375, + "epoch": 0.4015474266686254, + "grad_norm": 2.671875, + "learning_rate": 5.634856473512218e-06, + "loss": 0.1787, + "mean_token_accuracy": 0.9626182532310485, + "num_tokens": 1740876722.0, + "step": 16400 + }, + { + "entropy": 1.3328125, + "epoch": 0.402771656627981, + "grad_norm": 2.421875, + "learning_rate": 5.631947475913489e-06, + "loss": 0.1951, + "mean_token_accuracy": 0.9596171510219574, + "num_tokens": 1746470991.0, + "step": 16450 + }, + { + "entropy": 1.31375, + "epoch": 0.40399588658733654, + "grad_norm": 2.734375, + "learning_rate": 5.629027693701531e-06, + "loss": 0.1646, + "mean_token_accuracy": 0.9641488230228424, + "num_tokens": 1751600795.0, + "step": 16500 + }, + { + "entropy": 1.3459375, + "epoch": 0.40522011654669216, + "grad_norm": 0.01904296875, + "learning_rate": 5.626097138840379e-06, + "loss": 0.1931, + "mean_token_accuracy": 0.9586203134059906, + "num_tokens": 1757280148.0, + "step": 16550 + }, + { + "entropy": 1.32203125, + "epoch": 0.4064443465060477, + "grad_norm": 3.125, + "learning_rate": 5.623155823338219e-06, + "loss": 0.1845, + "mean_token_accuracy": 0.961804312467575, + "num_tokens": 1762386072.0, + "step": 16600 + }, + { + "entropy": 1.309375, + "epoch": 0.4076685764654033, + "grad_norm": 1.9609375, + "learning_rate": 5.62020375924732e-06, + "loss": 0.1679, + "mean_token_accuracy": 0.9640087175369263, + "num_tokens": 1767593608.0, + "step": 16650 + }, + { + "entropy": 1.33890625, + "epoch": 0.40889280642475884, + "grad_norm": 2.296875, + "learning_rate": 5.617240958664e-06, + "loss": 0.1778, + "mean_token_accuracy": 0.9619925379753113, + "num_tokens": 1772859293.0, + "step": 16700 + }, + { + "entropy": 1.3303125, + "epoch": 0.4101170363841144, + "grad_norm": 2.453125, + "learning_rate": 5.614267433728569e-06, + "loss": 0.1784, + "mean_token_accuracy": 0.9621168851852417, + "num_tokens": 1778176957.0, + "step": 16750 + }, + { + "entropy": 1.33359375, + "epoch": 0.41134126634346996, + "grad_norm": 2.28125, + "learning_rate": 5.611283196625281e-06, + "loss": 0.1876, + "mean_token_accuracy": 0.9608843457698822, + "num_tokens": 1783513531.0, + "step": 16800 + }, + { + "entropy": 1.31875, + "epoch": 0.4125654963028255, + "grad_norm": 2.375, + "learning_rate": 5.6082882595822835e-06, + "loss": 0.1743, + "mean_token_accuracy": 0.9634191727638245, + "num_tokens": 1788649179.0, + "step": 16850 + }, + { + "entropy": 1.34703125, + "epoch": 0.4137897262621811, + "grad_norm": 3.0, + "learning_rate": 5.605282634871569e-06, + "loss": 0.1846, + "mean_token_accuracy": 0.9604820072650909, + "num_tokens": 1794020681.0, + "step": 16900 + }, + { + "entropy": 1.341875, + "epoch": 0.41501395622153664, + "grad_norm": 2.265625, + "learning_rate": 5.602266334808922e-06, + "loss": 0.1917, + "mean_token_accuracy": 0.9598517632484436, + "num_tokens": 1799786050.0, + "step": 16950 + }, + { + "entropy": 1.32484375, + "epoch": 0.4162381861808922, + "grad_norm": 2.421875, + "learning_rate": 5.599239371753871e-06, + "loss": 0.1843, + "mean_token_accuracy": 0.9613809895515442, + "num_tokens": 1805308121.0, + "step": 17000 + }, + { + "entropy": 1.3296875, + "epoch": 0.41746241614024776, + "grad_norm": 2.265625, + "learning_rate": 5.596201758109636e-06, + "loss": 0.1971, + "mean_token_accuracy": 0.9585018038749695, + "num_tokens": 1811016191.0, + "step": 17050 + }, + { + "entropy": 1.34390625, + "epoch": 0.4186866460996034, + "grad_norm": 2.65625, + "learning_rate": 5.593153506323082e-06, + "loss": 0.1912, + "mean_token_accuracy": 0.9609514188766479, + "num_tokens": 1816538866.0, + "step": 17100 + }, + { + "entropy": 1.319375, + "epoch": 0.41991087605895894, + "grad_norm": 2.578125, + "learning_rate": 5.59009462888466e-06, + "loss": 0.1692, + "mean_token_accuracy": 0.9638219344615936, + "num_tokens": 1821484676.0, + "step": 17150 + }, + { + "entropy": 1.3296875, + "epoch": 0.4211351060183145, + "grad_norm": 3.078125, + "learning_rate": 5.587025138328363e-06, + "loss": 0.1855, + "mean_token_accuracy": 0.9604250502586364, + "num_tokens": 1826760752.0, + "step": 17200 + }, + { + "entropy": 1.32703125, + "epoch": 0.42235933597767006, + "grad_norm": 4.375, + "learning_rate": 5.583945047231672e-06, + "loss": 0.1756, + "mean_token_accuracy": 0.9626831936836243, + "num_tokens": 1831709955.0, + "step": 17250 + }, + { + "entropy": 1.3278125, + "epoch": 0.4235835659370256, + "grad_norm": 3.578125, + "learning_rate": 5.580854368215504e-06, + "loss": 0.1688, + "mean_token_accuracy": 0.9641677963733674, + "num_tokens": 1836539757.0, + "step": 17300 + }, + { + "entropy": 1.35453125, + "epoch": 0.4248077958963812, + "grad_norm": 3.203125, + "learning_rate": 5.577753113944161e-06, + "loss": 0.1795, + "mean_token_accuracy": 0.9620350849628448, + "num_tokens": 1841748836.0, + "step": 17350 + }, + { + "entropy": 1.35484375, + "epoch": 0.42603202585573674, + "grad_norm": 3.046875, + "learning_rate": 5.574641297125277e-06, + "loss": 0.1903, + "mean_token_accuracy": 0.9602237248420715, + "num_tokens": 1846964872.0, + "step": 17400 + }, + { + "entropy": 1.3465625, + "epoch": 0.4272562558150923, + "grad_norm": 2.375, + "learning_rate": 5.5715189305097705e-06, + "loss": 0.18, + "mean_token_accuracy": 0.9612255036830902, + "num_tokens": 1852195890.0, + "step": 17450 + }, + { + "entropy": 1.34734375, + "epoch": 0.42848048577444786, + "grad_norm": 1.921875, + "learning_rate": 5.568386026891784e-06, + "loss": 0.1852, + "mean_token_accuracy": 0.9614002680778504, + "num_tokens": 1857781986.0, + "step": 17500 + }, + { + "entropy": 1.383125, + "epoch": 0.4297047157338034, + "grad_norm": 3.59375, + "learning_rate": 5.565242599108638e-06, + "loss": 0.1733, + "mean_token_accuracy": 0.9632753264904023, + "num_tokens": 1862697378.0, + "step": 17550 + }, + { + "entropy": 1.37734375, + "epoch": 0.430928945693159, + "grad_norm": 2.578125, + "learning_rate": 5.5620886600407775e-06, + "loss": 0.1793, + "mean_token_accuracy": 0.9618914890289306, + "num_tokens": 1867900164.0, + "step": 17600 + }, + { + "entropy": 1.37453125, + "epoch": 0.43215317565251454, + "grad_norm": 3.359375, + "learning_rate": 5.558924222611718e-06, + "loss": 0.189, + "mean_token_accuracy": 0.9601231980323791, + "num_tokens": 1873349723.0, + "step": 17650 + }, + { + "entropy": 1.3796875, + "epoch": 0.43337740561187016, + "grad_norm": 2.125, + "learning_rate": 5.555749299787992e-06, + "loss": 0.183, + "mean_token_accuracy": 0.9612041318416595, + "num_tokens": 1878516011.0, + "step": 17700 + }, + { + "entropy": 1.36796875, + "epoch": 0.4346016355712257, + "grad_norm": 1.703125, + "learning_rate": 5.552563904579097e-06, + "loss": 0.1666, + "mean_token_accuracy": 0.965571962594986, + "num_tokens": 1883672436.0, + "step": 17750 + }, + { + "entropy": 1.37421875, + "epoch": 0.4358258655305813, + "grad_norm": 2.140625, + "learning_rate": 5.549368050037442e-06, + "loss": 0.1822, + "mean_token_accuracy": 0.9618594205379486, + "num_tokens": 1889075709.0, + "step": 17800 + }, + { + "entropy": 1.3753125, + "epoch": 0.43705009548993684, + "grad_norm": 1.703125, + "learning_rate": 5.5461617492582955e-06, + "loss": 0.1847, + "mean_token_accuracy": 0.9609970545768738, + "num_tokens": 1894320611.0, + "step": 17850 + }, + { + "entropy": 1.35203125, + "epoch": 0.4382743254492924, + "grad_norm": 3.265625, + "learning_rate": 5.542945015379727e-06, + "loss": 0.1819, + "mean_token_accuracy": 0.9610999655723572, + "num_tokens": 1899502888.0, + "step": 17900 + }, + { + "entropy": 1.3653125, + "epoch": 0.43949855540864796, + "grad_norm": 3.125, + "learning_rate": 5.53971786158256e-06, + "loss": 0.1783, + "mean_token_accuracy": 0.9628078281879425, + "num_tokens": 1904727333.0, + "step": 17950 + }, + { + "entropy": 1.37265625, + "epoch": 0.4407227853680035, + "grad_norm": 2.15625, + "learning_rate": 5.536480301090311e-06, + "loss": 0.1825, + "mean_token_accuracy": 0.9612684857845306, + "num_tokens": 1910269964.0, + "step": 18000 + }, + { + "entropy": 1.36875, + "epoch": 0.4419470153273591, + "grad_norm": 2.421875, + "learning_rate": 5.533232347169142e-06, + "loss": 0.1769, + "mean_token_accuracy": 0.9630991363525391, + "num_tokens": 1915481678.0, + "step": 18050 + }, + { + "entropy": 1.37703125, + "epoch": 0.44317124528671464, + "grad_norm": 0.007720947265625, + "learning_rate": 5.5299740131278e-06, + "loss": 0.1776, + "mean_token_accuracy": 0.9631426560878754, + "num_tokens": 1920892313.0, + "step": 18100 + }, + { + "entropy": 1.3784375, + "epoch": 0.4443954752460702, + "grad_norm": 2.25, + "learning_rate": 5.5267053123175685e-06, + "loss": 0.1793, + "mean_token_accuracy": 0.9618562459945679, + "num_tokens": 1925855441.0, + "step": 18150 + }, + { + "entropy": 1.40484375, + "epoch": 0.44561970520542576, + "grad_norm": 2.390625, + "learning_rate": 5.523426258132208e-06, + "loss": 0.1895, + "mean_token_accuracy": 0.9602830135822296, + "num_tokens": 1931433927.0, + "step": 18200 + }, + { + "entropy": 1.381875, + "epoch": 0.4468439351647813, + "grad_norm": 2.140625, + "learning_rate": 5.520136864007901e-06, + "loss": 0.179, + "mean_token_accuracy": 0.9617183935642243, + "num_tokens": 1937093589.0, + "step": 18250 + }, + { + "entropy": 1.3784375, + "epoch": 0.44806816512413694, + "grad_norm": 2.890625, + "learning_rate": 5.516837143423201e-06, + "loss": 0.1807, + "mean_token_accuracy": 0.9620720791816711, + "num_tokens": 1942266157.0, + "step": 18300 + }, + { + "entropy": 1.3815625, + "epoch": 0.4492923950834925, + "grad_norm": 2.734375, + "learning_rate": 5.5135271098989745e-06, + "loss": 0.1739, + "mean_token_accuracy": 0.9636857545375824, + "num_tokens": 1947254229.0, + "step": 18350 + }, + { + "entropy": 1.39609375, + "epoch": 0.45051662504284806, + "grad_norm": 2.28125, + "learning_rate": 5.510206776998347e-06, + "loss": 0.2004, + "mean_token_accuracy": 0.9576922535896302, + "num_tokens": 1953541405.0, + "step": 18400 + }, + { + "entropy": 1.38515625, + "epoch": 0.4517408550022036, + "grad_norm": 1.8671875, + "learning_rate": 5.5068761583266446e-06, + "loss": 0.1815, + "mean_token_accuracy": 0.9612382733821869, + "num_tokens": 1958947967.0, + "step": 18450 + }, + { + "entropy": 1.38546875, + "epoch": 0.4529650849615592, + "grad_norm": 2.609375, + "learning_rate": 5.503535267531341e-06, + "loss": 0.1756, + "mean_token_accuracy": 0.9630067098140717, + "num_tokens": 1964172588.0, + "step": 18500 + }, + { + "entropy": 1.37171875, + "epoch": 0.45418931492091474, + "grad_norm": 2.453125, + "learning_rate": 5.500184118302001e-06, + "loss": 0.1737, + "mean_token_accuracy": 0.9629046404361725, + "num_tokens": 1969146021.0, + "step": 18550 + }, + { + "entropy": 1.35796875, + "epoch": 0.4554135448802703, + "grad_norm": 2.390625, + "learning_rate": 5.496822724370225e-06, + "loss": 0.1726, + "mean_token_accuracy": 0.9641622114181518, + "num_tokens": 1974171622.0, + "step": 18600 + }, + { + "entropy": 1.35109375, + "epoch": 0.45663777483962587, + "grad_norm": 1.9375, + "learning_rate": 5.493451099509589e-06, + "loss": 0.1797, + "mean_token_accuracy": 0.9615970349311829, + "num_tokens": 1979453512.0, + "step": 18650 + }, + { + "entropy": 1.3515625, + "epoch": 0.4578620047989814, + "grad_norm": 2.421875, + "learning_rate": 5.490069257535595e-06, + "loss": 0.1786, + "mean_token_accuracy": 0.9625794899463653, + "num_tokens": 1984570640.0, + "step": 18700 + }, + { + "entropy": 1.37140625, + "epoch": 0.459086234758337, + "grad_norm": 2.296875, + "learning_rate": 5.4866772123056055e-06, + "loss": 0.1928, + "mean_token_accuracy": 0.9605653440952301, + "num_tokens": 1990199710.0, + "step": 18750 + }, + { + "entropy": 1.375625, + "epoch": 0.46031046471769255, + "grad_norm": 2.09375, + "learning_rate": 5.483274977718797e-06, + "loss": 0.1885, + "mean_token_accuracy": 0.9597025084495544, + "num_tokens": 1995518980.0, + "step": 18800 + }, + { + "entropy": 1.37984375, + "epoch": 0.46153469467704816, + "grad_norm": 2.6875, + "learning_rate": 5.479862567716095e-06, + "loss": 0.1703, + "mean_token_accuracy": 0.9633987152576446, + "num_tokens": 2000479352.0, + "step": 18850 + }, + { + "entropy": 1.38640625, + "epoch": 0.4627589246364037, + "grad_norm": 3.671875, + "learning_rate": 5.476439996280118e-06, + "loss": 0.1941, + "mean_token_accuracy": 0.959332902431488, + "num_tokens": 2005933401.0, + "step": 18900 + }, + { + "entropy": 1.3975, + "epoch": 0.4639831545957593, + "grad_norm": 2.5625, + "learning_rate": 5.473007277435125e-06, + "loss": 0.1731, + "mean_token_accuracy": 0.9638979506492614, + "num_tokens": 2010666027.0, + "step": 18950 + }, + { + "entropy": 1.38140625, + "epoch": 0.46520738455511484, + "grad_norm": 3.640625, + "learning_rate": 5.469564425246953e-06, + "loss": 0.1852, + "mean_token_accuracy": 0.9617711079120635, + "num_tokens": 2016049085.0, + "step": 19000 + }, + { + "entropy": 1.37015625, + "epoch": 0.4664316145144704, + "grad_norm": 1.71875, + "learning_rate": 5.46611145382296e-06, + "loss": 0.1678, + "mean_token_accuracy": 0.9642109513282776, + "num_tokens": 2021148599.0, + "step": 19050 + }, + { + "entropy": 1.35875, + "epoch": 0.46765584447382597, + "grad_norm": 1.6875, + "learning_rate": 5.462648377311973e-06, + "loss": 0.1785, + "mean_token_accuracy": 0.9610287690162659, + "num_tokens": 2026306056.0, + "step": 19100 + }, + { + "entropy": 1.34953125, + "epoch": 0.4688800744331815, + "grad_norm": 2.78125, + "learning_rate": 5.459175209904221e-06, + "loss": 0.1769, + "mean_token_accuracy": 0.9627043080329895, + "num_tokens": 2031493225.0, + "step": 19150 + }, + { + "entropy": 1.34484375, + "epoch": 0.4701043043925371, + "grad_norm": 1.8671875, + "learning_rate": 5.455691965831281e-06, + "loss": 0.1758, + "mean_token_accuracy": 0.9625547790527343, + "num_tokens": 2036730518.0, + "step": 19200 + }, + { + "entropy": 1.3490625, + "epoch": 0.47132853435189265, + "grad_norm": 2.546875, + "learning_rate": 5.452198659366023e-06, + "loss": 0.167, + "mean_token_accuracy": 0.9653509867191314, + "num_tokens": 2041648821.0, + "step": 19250 + }, + { + "entropy": 1.33796875, + "epoch": 0.4725527643112482, + "grad_norm": 1.921875, + "learning_rate": 5.448695304822545e-06, + "loss": 0.1733, + "mean_token_accuracy": 0.9637433886528015, + "num_tokens": 2046695948.0, + "step": 19300 + }, + { + "entropy": 1.35109375, + "epoch": 0.47377699427060377, + "grad_norm": 3.15625, + "learning_rate": 5.445181916556123e-06, + "loss": 0.1712, + "mean_token_accuracy": 0.96383709192276, + "num_tokens": 2051915262.0, + "step": 19350 + }, + { + "entropy": 1.3453125, + "epoch": 0.47500122422995933, + "grad_norm": 2.578125, + "learning_rate": 5.4416585089631414e-06, + "loss": 0.163, + "mean_token_accuracy": 0.9646891450881958, + "num_tokens": 2056999566.0, + "step": 19400 + }, + { + "entropy": 1.36125, + "epoch": 0.47622545418931495, + "grad_norm": 2.875, + "learning_rate": 5.438125096481043e-06, + "loss": 0.1833, + "mean_token_accuracy": 0.96080885887146, + "num_tokens": 2062335975.0, + "step": 19450 + }, + { + "entropy": 1.368125, + "epoch": 0.4774496841486705, + "grad_norm": 3.140625, + "learning_rate": 5.434581693588263e-06, + "loss": 0.175, + "mean_token_accuracy": 0.9632956290245056, + "num_tokens": 2067247038.0, + "step": 19500 + }, + { + "entropy": 1.36484375, + "epoch": 0.47867391410802607, + "grad_norm": 2.59375, + "learning_rate": 5.4310283148041775e-06, + "loss": 0.185, + "mean_token_accuracy": 0.9606440508365631, + "num_tokens": 2072775995.0, + "step": 19550 + }, + { + "entropy": 1.36171875, + "epoch": 0.4798981440673816, + "grad_norm": 2.265625, + "learning_rate": 5.427464974689038e-06, + "loss": 0.1772, + "mean_token_accuracy": 0.963237328529358, + "num_tokens": 2078139054.0, + "step": 19600 + }, + { + "entropy": 1.35703125, + "epoch": 0.4811223740267372, + "grad_norm": 2.90625, + "learning_rate": 5.42389168784391e-06, + "loss": 0.1726, + "mean_token_accuracy": 0.9635715174674988, + "num_tokens": 2083527202.0, + "step": 19650 + }, + { + "entropy": 1.37875, + "epoch": 0.48234660398609275, + "grad_norm": 3.3125, + "learning_rate": 5.4203084689106225e-06, + "loss": 0.1927, + "mean_token_accuracy": 0.9599621570110322, + "num_tokens": 2089385771.0, + "step": 19700 + }, + { + "entropy": 1.34265625, + "epoch": 0.4835708339454483, + "grad_norm": 2.296875, + "learning_rate": 5.4167153325716976e-06, + "loss": 0.1663, + "mean_token_accuracy": 0.9641843712329865, + "num_tokens": 2094456460.0, + "step": 19750 + }, + { + "entropy": 1.3609375, + "epoch": 0.48479506390480387, + "grad_norm": 3.734375, + "learning_rate": 5.413112293550296e-06, + "loss": 0.181, + "mean_token_accuracy": 0.9612398469448089, + "num_tokens": 2099504284.0, + "step": 19800 + }, + { + "entropy": 1.3709375, + "epoch": 0.48601929386415943, + "grad_norm": 2.53125, + "learning_rate": 5.409499366610154e-06, + "loss": 0.1699, + "mean_token_accuracy": 0.9642571318149566, + "num_tokens": 2104524371.0, + "step": 19850 + }, + { + "entropy": 1.378125, + "epoch": 0.487243523823515, + "grad_norm": 5.53125, + "learning_rate": 5.405876566555529e-06, + "loss": 0.181, + "mean_token_accuracy": 0.9618199968338013, + "num_tokens": 2109740174.0, + "step": 19900 + }, + { + "entropy": 1.40078125, + "epoch": 0.48846775378287055, + "grad_norm": 2.0, + "learning_rate": 5.402243908231129e-06, + "loss": 0.1804, + "mean_token_accuracy": 0.962717422246933, + "num_tokens": 2115362415.0, + "step": 19950 + }, + { + "entropy": 1.37703125, + "epoch": 0.48969198374222617, + "grad_norm": 3.40625, + "learning_rate": 5.398601406522059e-06, + "loss": 0.19, + "mean_token_accuracy": 0.9599020183086395, + "num_tokens": 2121188022.0, + "step": 20000 + }, + { + "epoch": 0.48969198374222617, + "eval_entropy": 1.366015625, + "eval_loss": 0.1947789192199707, + "eval_mean_token_accuracy": 0.9590674425164859, + "eval_num_tokens": 2121188022.0, + "eval_runtime": 605.3557, + "eval_samples_per_second": 15.951, + "eval_steps_per_second": 0.2, + "step": 20000 + }, + { + "entropy": 1.36578125, + "epoch": 0.4909162137015817, + "grad_norm": 2.71875, + "learning_rate": 5.3949490763537594e-06, + "loss": 0.1838, + "mean_token_accuracy": 0.9606946921348571, + "num_tokens": 2126472622.0, + "step": 20050 + }, + { + "entropy": 1.36359375, + "epoch": 0.4921404436609373, + "grad_norm": 2.21875, + "learning_rate": 5.391286932691941e-06, + "loss": 0.1717, + "mean_token_accuracy": 0.963376579284668, + "num_tokens": 2131377659.0, + "step": 20100 + }, + { + "entropy": 1.37875, + "epoch": 0.49336467362029285, + "grad_norm": 2.46875, + "learning_rate": 5.38761499054253e-06, + "loss": 0.1855, + "mean_token_accuracy": 0.9612623798847199, + "num_tokens": 2136546167.0, + "step": 20150 + }, + { + "entropy": 1.37296875, + "epoch": 0.4945889035796484, + "grad_norm": 4.40625, + "learning_rate": 5.383933264951596e-06, + "loss": 0.1826, + "mean_token_accuracy": 0.9621403360366821, + "num_tokens": 2141814792.0, + "step": 20200 + }, + { + "entropy": 1.37328125, + "epoch": 0.49581313353900397, + "grad_norm": 2.40625, + "learning_rate": 5.3802417710053056e-06, + "loss": 0.1804, + "mean_token_accuracy": 0.9616746437549591, + "num_tokens": 2147071830.0, + "step": 20250 + }, + { + "entropy": 1.38625, + "epoch": 0.49703736349835953, + "grad_norm": 3.375, + "learning_rate": 5.376540523829846e-06, + "loss": 0.1782, + "mean_token_accuracy": 0.9625440466403962, + "num_tokens": 2152428456.0, + "step": 20300 + }, + { + "entropy": 1.3896875, + "epoch": 0.4982615934577151, + "grad_norm": 2.203125, + "learning_rate": 5.372829538591368e-06, + "loss": 0.1876, + "mean_token_accuracy": 0.9597011947631836, + "num_tokens": 2157932348.0, + "step": 20350 + }, + { + "entropy": 1.38671875, + "epoch": 0.49948582341707065, + "grad_norm": 2.78125, + "learning_rate": 5.369108830495932e-06, + "loss": 0.1791, + "mean_token_accuracy": 0.9618503451347351, + "num_tokens": 2163273400.0, + "step": 20400 + }, + { + "entropy": 1.39640625, + "epoch": 0.5007100533764263, + "grad_norm": 2.1875, + "learning_rate": 5.365378414789431e-06, + "loss": 0.1744, + "mean_token_accuracy": 0.9630714511871338, + "num_tokens": 2168498693.0, + "step": 20450 + }, + { + "entropy": 1.38453125, + "epoch": 0.5019342833357818, + "grad_norm": 5.0625, + "learning_rate": 5.361638306757539e-06, + "loss": 0.1757, + "mean_token_accuracy": 0.963210039138794, + "num_tokens": 2173679268.0, + "step": 20500 + }, + { + "entropy": 1.40171875, + "epoch": 0.5031585132951374, + "grad_norm": 2.46875, + "learning_rate": 5.357888521725646e-06, + "loss": 0.1827, + "mean_token_accuracy": 0.9613598906993865, + "num_tokens": 2178826743.0, + "step": 20550 + }, + { + "entropy": 1.3775, + "epoch": 0.504382743254493, + "grad_norm": 2.546875, + "learning_rate": 5.354129075058793e-06, + "loss": 0.1786, + "mean_token_accuracy": 0.9626466917991638, + "num_tokens": 2184130873.0, + "step": 20600 + }, + { + "entropy": 1.35796875, + "epoch": 0.5056069732138485, + "grad_norm": 1.546875, + "learning_rate": 5.35035998216161e-06, + "loss": 0.1699, + "mean_token_accuracy": 0.9637439405918121, + "num_tokens": 2189388837.0, + "step": 20650 + }, + { + "entropy": 1.38328125, + "epoch": 0.5068312031732041, + "grad_norm": 1.703125, + "learning_rate": 5.3465812584782545e-06, + "loss": 0.1964, + "mean_token_accuracy": 0.9594271278381348, + "num_tokens": 2195050047.0, + "step": 20700 + }, + { + "entropy": 1.34203125, + "epoch": 0.5080554331325596, + "grad_norm": 2.3125, + "learning_rate": 5.342792919492344e-06, + "loss": 0.1749, + "mean_token_accuracy": 0.9626959478855133, + "num_tokens": 2200302347.0, + "step": 20750 + }, + { + "entropy": 1.356875, + "epoch": 0.5092796630919152, + "grad_norm": 2.09375, + "learning_rate": 5.338994980726901e-06, + "loss": 0.1794, + "mean_token_accuracy": 0.9620554232597351, + "num_tokens": 2205512738.0, + "step": 20800 + }, + { + "entropy": 1.3575, + "epoch": 0.5105038930512708, + "grad_norm": 2.78125, + "learning_rate": 5.335187457744277e-06, + "loss": 0.1823, + "mean_token_accuracy": 0.9618464136123657, + "num_tokens": 2210651777.0, + "step": 20850 + }, + { + "entropy": 1.33390625, + "epoch": 0.5117281230106263, + "grad_norm": 1.6875, + "learning_rate": 5.3313703661461e-06, + "loss": 0.1819, + "mean_token_accuracy": 0.9613965570926666, + "num_tokens": 2215880518.0, + "step": 20900 + }, + { + "entropy": 1.3253125, + "epoch": 0.5129523529699819, + "grad_norm": 2.984375, + "learning_rate": 5.327543721573206e-06, + "loss": 0.1752, + "mean_token_accuracy": 0.9638756012916565, + "num_tokens": 2221245311.0, + "step": 20950 + }, + { + "entropy": 1.32234375, + "epoch": 0.5141765829293374, + "grad_norm": 3.28125, + "learning_rate": 5.323707539705574e-06, + "loss": 0.1748, + "mean_token_accuracy": 0.963612312078476, + "num_tokens": 2226359631.0, + "step": 21000 + }, + { + "entropy": 1.30609375, + "epoch": 0.515400812888693, + "grad_norm": 2.15625, + "learning_rate": 5.3198618362622614e-06, + "loss": 0.1702, + "mean_token_accuracy": 0.9639462912082672, + "num_tokens": 2231563334.0, + "step": 21050 + }, + { + "entropy": 1.31953125, + "epoch": 0.5166250428480486, + "grad_norm": 3.265625, + "learning_rate": 5.316006627001344e-06, + "loss": 0.1805, + "mean_token_accuracy": 0.961728732585907, + "num_tokens": 2236847732.0, + "step": 21100 + }, + { + "entropy": 1.32125, + "epoch": 0.5178492728074041, + "grad_norm": 2.375, + "learning_rate": 5.312141927719849e-06, + "loss": 0.172, + "mean_token_accuracy": 0.9636801743507385, + "num_tokens": 2242148614.0, + "step": 21150 + }, + { + "entropy": 1.3134375, + "epoch": 0.5190735027667597, + "grad_norm": 2.546875, + "learning_rate": 5.308267754253684e-06, + "loss": 0.1755, + "mean_token_accuracy": 0.9632048571109771, + "num_tokens": 2247694541.0, + "step": 21200 + }, + { + "entropy": 1.36203125, + "epoch": 0.5202977327261152, + "grad_norm": 1.8359375, + "learning_rate": 5.304384122477584e-06, + "loss": 0.1983, + "mean_token_accuracy": 0.9583926129341126, + "num_tokens": 2253386473.0, + "step": 21250 + }, + { + "entropy": 1.34703125, + "epoch": 0.5215219626854708, + "grad_norm": 2.140625, + "learning_rate": 5.300491048305037e-06, + "loss": 0.1753, + "mean_token_accuracy": 0.9633457577228546, + "num_tokens": 2258591416.0, + "step": 21300 + }, + { + "entropy": 1.3553125, + "epoch": 0.5227461926448264, + "grad_norm": 3.140625, + "learning_rate": 5.296588547688221e-06, + "loss": 0.1809, + "mean_token_accuracy": 0.9621423208713531, + "num_tokens": 2263908714.0, + "step": 21350 + }, + { + "entropy": 1.35140625, + "epoch": 0.5239704226041819, + "grad_norm": 2.5, + "learning_rate": 5.292676636617946e-06, + "loss": 0.1746, + "mean_token_accuracy": 0.9637291979789734, + "num_tokens": 2269014561.0, + "step": 21400 + }, + { + "entropy": 1.3440625, + "epoch": 0.5251946525635376, + "grad_norm": 2.5625, + "learning_rate": 5.2887553311235736e-06, + "loss": 0.1753, + "mean_token_accuracy": 0.963253127336502, + "num_tokens": 2274143387.0, + "step": 21450 + }, + { + "entropy": 1.34984375, + "epoch": 0.5264188825228932, + "grad_norm": 1.8203125, + "learning_rate": 5.284824647272965e-06, + "loss": 0.1751, + "mean_token_accuracy": 0.9633476626873017, + "num_tokens": 2279551937.0, + "step": 21500 + }, + { + "entropy": 1.3815625, + "epoch": 0.5276431124822487, + "grad_norm": 1.765625, + "learning_rate": 5.280884601172408e-06, + "loss": 0.1901, + "mean_token_accuracy": 0.9609255039691925, + "num_tokens": 2284998091.0, + "step": 21550 + }, + { + "entropy": 1.37375, + "epoch": 0.5288673424416043, + "grad_norm": 2.078125, + "learning_rate": 5.276935208966554e-06, + "loss": 0.1805, + "mean_token_accuracy": 0.9621355581283569, + "num_tokens": 2290404419.0, + "step": 21600 + }, + { + "entropy": 1.35875, + "epoch": 0.5300915724009598, + "grad_norm": 2.546875, + "learning_rate": 5.272976486838349e-06, + "loss": 0.1839, + "mean_token_accuracy": 0.9618707728385926, + "num_tokens": 2295855308.0, + "step": 21650 + }, + { + "entropy": 1.34296875, + "epoch": 0.5313158023603154, + "grad_norm": 3.84375, + "learning_rate": 5.269008451008974e-06, + "loss": 0.1683, + "mean_token_accuracy": 0.9649140095710754, + "num_tokens": 2300888682.0, + "step": 21700 + }, + { + "entropy": 1.3709375, + "epoch": 0.532540032319671, + "grad_norm": 2.046875, + "learning_rate": 5.265031117737765e-06, + "loss": 0.1856, + "mean_token_accuracy": 0.9606757354736328, + "num_tokens": 2306530067.0, + "step": 21750 + }, + { + "entropy": 1.3528125, + "epoch": 0.5337642622790265, + "grad_norm": 2.984375, + "learning_rate": 5.261044503322165e-06, + "loss": 0.1826, + "mean_token_accuracy": 0.9615514528751373, + "num_tokens": 2312022301.0, + "step": 21800 + }, + { + "entropy": 1.35828125, + "epoch": 0.5349884922383821, + "grad_norm": 2.5, + "learning_rate": 5.257048624097639e-06, + "loss": 0.1826, + "mean_token_accuracy": 0.9617948019504547, + "num_tokens": 2317336429.0, + "step": 21850 + }, + { + "entropy": 1.365625, + "epoch": 0.5362127221977376, + "grad_norm": 3.25, + "learning_rate": 5.253043496437619e-06, + "loss": 0.1875, + "mean_token_accuracy": 0.9604008531570435, + "num_tokens": 2322605855.0, + "step": 21900 + }, + { + "entropy": 1.3403125, + "epoch": 0.5374369521570932, + "grad_norm": 1.1171875, + "learning_rate": 5.249029136753436e-06, + "loss": 0.1757, + "mean_token_accuracy": 0.9632094752788544, + "num_tokens": 2328163176.0, + "step": 21950 + }, + { + "entropy": 1.3684375, + "epoch": 0.5386611821164488, + "grad_norm": 2.484375, + "learning_rate": 5.245005561494242e-06, + "loss": 0.1804, + "mean_token_accuracy": 0.9627390444278717, + "num_tokens": 2333245056.0, + "step": 22000 + }, + { + "entropy": 1.384375, + "epoch": 0.5398854120758043, + "grad_norm": 2.859375, + "learning_rate": 5.2409727871469585e-06, + "loss": 0.1926, + "mean_token_accuracy": 0.9592073571681976, + "num_tokens": 2338758359.0, + "step": 22050 + }, + { + "entropy": 1.35546875, + "epoch": 0.5411096420351599, + "grad_norm": 2.90625, + "learning_rate": 5.236930830236195e-06, + "loss": 0.179, + "mean_token_accuracy": 0.9627534210681915, + "num_tokens": 2344276248.0, + "step": 22100 + }, + { + "entropy": 1.34953125, + "epoch": 0.5423338719945154, + "grad_norm": 2.078125, + "learning_rate": 5.232879707324194e-06, + "loss": 0.1634, + "mean_token_accuracy": 0.965645101070404, + "num_tokens": 2349615408.0, + "step": 22150 + }, + { + "entropy": 1.37578125, + "epoch": 0.543558101953871, + "grad_norm": 2.34375, + "learning_rate": 5.228819435010749e-06, + "loss": 0.1678, + "mean_token_accuracy": 0.9645935368537902, + "num_tokens": 2354669027.0, + "step": 22200 + }, + { + "entropy": 1.3884375, + "epoch": 0.5447823319132266, + "grad_norm": 3.109375, + "learning_rate": 5.224750029933149e-06, + "loss": 0.1811, + "mean_token_accuracy": 0.9621996486186981, + "num_tokens": 2359585884.0, + "step": 22250 + }, + { + "entropy": 1.38390625, + "epoch": 0.5460065618725821, + "grad_norm": 2.375, + "learning_rate": 5.220671508766104e-06, + "loss": 0.1716, + "mean_token_accuracy": 0.9631420743465423, + "num_tokens": 2364818902.0, + "step": 22300 + }, + { + "entropy": 1.40234375, + "epoch": 0.5472307918319377, + "grad_norm": 2.03125, + "learning_rate": 5.216583888221676e-06, + "loss": 0.1888, + "mean_token_accuracy": 0.9602623808383942, + "num_tokens": 2370249320.0, + "step": 22350 + }, + { + "entropy": 1.3871875, + "epoch": 0.5484550217912932, + "grad_norm": 2.078125, + "learning_rate": 5.212487185049215e-06, + "loss": 0.1656, + "mean_token_accuracy": 0.9649445843696595, + "num_tokens": 2375353386.0, + "step": 22400 + }, + { + "entropy": 1.415625, + "epoch": 0.5496792517506488, + "grad_norm": 2.09375, + "learning_rate": 5.208381416035286e-06, + "loss": 0.1863, + "mean_token_accuracy": 0.9609400224685669, + "num_tokens": 2380836963.0, + "step": 22450 + }, + { + "entropy": 1.395, + "epoch": 0.5509034817100044, + "grad_norm": 0.00396728515625, + "learning_rate": 5.204266598003604e-06, + "loss": 0.1759, + "mean_token_accuracy": 0.9629833257198334, + "num_tokens": 2385836401.0, + "step": 22500 + }, + { + "entropy": 1.39046875, + "epoch": 0.5521277116693599, + "grad_norm": 3.671875, + "learning_rate": 5.20014274781496e-06, + "loss": 0.176, + "mean_token_accuracy": 0.9624341118335724, + "num_tokens": 2391023729.0, + "step": 22550 + }, + { + "entropy": 1.410625, + "epoch": 0.5533519416287156, + "grad_norm": 2.59375, + "learning_rate": 5.196009882367158e-06, + "loss": 0.175, + "mean_token_accuracy": 0.9633600628376007, + "num_tokens": 2396091073.0, + "step": 22600 + }, + { + "entropy": 1.40546875, + "epoch": 0.5545761715880712, + "grad_norm": 1.640625, + "learning_rate": 5.191868018594941e-06, + "loss": 0.1828, + "mean_token_accuracy": 0.9620015740394592, + "num_tokens": 2401188218.0, + "step": 22650 + }, + { + "entropy": 1.4009375, + "epoch": 0.5558004015474267, + "grad_norm": 3.328125, + "learning_rate": 5.187717173469924e-06, + "loss": 0.1711, + "mean_token_accuracy": 0.9637360453605652, + "num_tokens": 2406245988.0, + "step": 22700 + }, + { + "entropy": 1.39234375, + "epoch": 0.5570246315067823, + "grad_norm": 2.0625, + "learning_rate": 5.183557364000523e-06, + "loss": 0.1737, + "mean_token_accuracy": 0.9634659576416016, + "num_tokens": 2411368109.0, + "step": 22750 + }, + { + "entropy": 1.40296875, + "epoch": 0.5582488614661378, + "grad_norm": 2.265625, + "learning_rate": 5.179388607231889e-06, + "loss": 0.1728, + "mean_token_accuracy": 0.9633192873001098, + "num_tokens": 2416689928.0, + "step": 22800 + }, + { + "entropy": 1.410625, + "epoch": 0.5594730914254934, + "grad_norm": 2.4375, + "learning_rate": 5.17521092024583e-06, + "loss": 0.1867, + "mean_token_accuracy": 0.9608077311515808, + "num_tokens": 2422352742.0, + "step": 22850 + }, + { + "entropy": 1.39109375, + "epoch": 0.560697321384849, + "grad_norm": 0.08642578125, + "learning_rate": 5.171024320160752e-06, + "loss": 0.1667, + "mean_token_accuracy": 0.9654168891906738, + "num_tokens": 2427576584.0, + "step": 22900 + }, + { + "entropy": 1.38734375, + "epoch": 0.5619215513442045, + "grad_norm": 2.75, + "learning_rate": 5.166828824131578e-06, + "loss": 0.1696, + "mean_token_accuracy": 0.9640141320228577, + "num_tokens": 2432765937.0, + "step": 22950 + }, + { + "entropy": 1.3884375, + "epoch": 0.5631457813035601, + "grad_norm": 2.75, + "learning_rate": 5.162624449349686e-06, + "loss": 0.1801, + "mean_token_accuracy": 0.9613782787322998, + "num_tokens": 2437980184.0, + "step": 23000 + }, + { + "entropy": 1.3728125, + "epoch": 0.5643700112629156, + "grad_norm": 2.953125, + "learning_rate": 5.158411213042835e-06, + "loss": 0.1675, + "mean_token_accuracy": 0.9656554198265076, + "num_tokens": 2443001633.0, + "step": 23050 + }, + { + "entropy": 1.39265625, + "epoch": 0.5655942412222712, + "grad_norm": 2.140625, + "learning_rate": 5.154189132475095e-06, + "loss": 0.1826, + "mean_token_accuracy": 0.9614216196537018, + "num_tokens": 2448599009.0, + "step": 23100 + }, + { + "entropy": 1.3725, + "epoch": 0.5668184711816268, + "grad_norm": 3.34375, + "learning_rate": 5.149958224946776e-06, + "loss": 0.1871, + "mean_token_accuracy": 0.9604478991031646, + "num_tokens": 2454134698.0, + "step": 23150 + }, + { + "entropy": 1.3503125, + "epoch": 0.5680427011409823, + "grad_norm": 3.140625, + "learning_rate": 5.145718507794354e-06, + "loss": 0.1725, + "mean_token_accuracy": 0.9635867273807526, + "num_tokens": 2459430485.0, + "step": 23200 + }, + { + "entropy": 1.3696875, + "epoch": 0.5692669311003379, + "grad_norm": 2.0, + "learning_rate": 5.141469998390408e-06, + "loss": 0.1778, + "mean_token_accuracy": 0.9624897265434265, + "num_tokens": 2464814573.0, + "step": 23250 + }, + { + "entropy": 1.34359375, + "epoch": 0.5704911610596934, + "grad_norm": 3.109375, + "learning_rate": 5.1372127141435415e-06, + "loss": 0.1866, + "mean_token_accuracy": 0.961111787557602, + "num_tokens": 2470288053.0, + "step": 23300 + }, + { + "entropy": 1.36140625, + "epoch": 0.571715391019049, + "grad_norm": 2.609375, + "learning_rate": 5.132946672498313e-06, + "loss": 0.1847, + "mean_token_accuracy": 0.9609505522251129, + "num_tokens": 2475912972.0, + "step": 23350 + }, + { + "entropy": 1.3640625, + "epoch": 0.5729396209784046, + "grad_norm": 2.015625, + "learning_rate": 5.128671890935168e-06, + "loss": 0.1868, + "mean_token_accuracy": 0.9606727063655853, + "num_tokens": 2481260397.0, + "step": 23400 + }, + { + "entropy": 1.36171875, + "epoch": 0.5741638509377601, + "grad_norm": 3.0625, + "learning_rate": 5.12438838697036e-06, + "loss": 0.1667, + "mean_token_accuracy": 0.9649614369869233, + "num_tokens": 2486480334.0, + "step": 23450 + }, + { + "entropy": 1.34078125, + "epoch": 0.5753880808971157, + "grad_norm": 2.453125, + "learning_rate": 5.120096178155887e-06, + "loss": 0.1739, + "mean_token_accuracy": 0.9637984907627106, + "num_tokens": 2491784273.0, + "step": 23500 + }, + { + "entropy": 1.37375, + "epoch": 0.5766123108564712, + "grad_norm": 2.796875, + "learning_rate": 5.115795282079414e-06, + "loss": 0.1825, + "mean_token_accuracy": 0.9622078704833984, + "num_tokens": 2496936761.0, + "step": 23550 + }, + { + "entropy": 1.37890625, + "epoch": 0.5778365408158268, + "grad_norm": 2.578125, + "learning_rate": 5.111485716364204e-06, + "loss": 0.1713, + "mean_token_accuracy": 0.9633621573448181, + "num_tokens": 2502372671.0, + "step": 23600 + }, + { + "entropy": 1.37671875, + "epoch": 0.5790607707751824, + "grad_norm": 2.34375, + "learning_rate": 5.107167498669044e-06, + "loss": 0.1888, + "mean_token_accuracy": 0.9600040495395661, + "num_tokens": 2508248084.0, + "step": 23650 + }, + { + "entropy": 1.3646875, + "epoch": 0.5802850007345379, + "grad_norm": 3.296875, + "learning_rate": 5.102840646688173e-06, + "loss": 0.1778, + "mean_token_accuracy": 0.9631288397312164, + "num_tokens": 2513722383.0, + "step": 23700 + }, + { + "entropy": 1.3534375, + "epoch": 0.5815092306938935, + "grad_norm": 1.7890625, + "learning_rate": 5.0985051781512076e-06, + "loss": 0.1853, + "mean_token_accuracy": 0.9618443667888641, + "num_tokens": 2518947610.0, + "step": 23750 + }, + { + "entropy": 1.34390625, + "epoch": 0.5827334606532492, + "grad_norm": 2.65625, + "learning_rate": 5.094161110823076e-06, + "loss": 0.178, + "mean_token_accuracy": 0.963310706615448, + "num_tokens": 2524269424.0, + "step": 23800 + }, + { + "entropy": 1.35328125, + "epoch": 0.5839576906126047, + "grad_norm": 2.59375, + "learning_rate": 5.089808462503938e-06, + "loss": 0.1839, + "mean_token_accuracy": 0.9614792597293854, + "num_tokens": 2529803600.0, + "step": 23850 + }, + { + "entropy": 1.3525, + "epoch": 0.5851819205719603, + "grad_norm": 3.046875, + "learning_rate": 5.085447251029113e-06, + "loss": 0.1721, + "mean_token_accuracy": 0.963988184928894, + "num_tokens": 2534916174.0, + "step": 23900 + }, + { + "entropy": 1.35859375, + "epoch": 0.5864061505313158, + "grad_norm": 2.140625, + "learning_rate": 5.081077494269013e-06, + "loss": 0.1857, + "mean_token_accuracy": 0.9612233006954193, + "num_tokens": 2540205630.0, + "step": 23950 + }, + { + "entropy": 1.35015625, + "epoch": 0.5876303804906714, + "grad_norm": 2.125, + "learning_rate": 5.076699210129059e-06, + "loss": 0.1741, + "mean_token_accuracy": 0.9633960282802582, + "num_tokens": 2545114709.0, + "step": 24000 + }, + { + "entropy": 1.346875, + "epoch": 0.588854610450027, + "grad_norm": 2.265625, + "learning_rate": 5.072312416549619e-06, + "loss": 0.171, + "mean_token_accuracy": 0.9637422835826874, + "num_tokens": 2550645548.0, + "step": 24050 + }, + { + "entropy": 1.35140625, + "epoch": 0.5900788404093825, + "grad_norm": 1.8046875, + "learning_rate": 5.067917131505928e-06, + "loss": 0.186, + "mean_token_accuracy": 0.9609566831588745, + "num_tokens": 2556096356.0, + "step": 24100 + }, + { + "entropy": 1.34828125, + "epoch": 0.5913030703687381, + "grad_norm": 2.375, + "learning_rate": 5.063513373008014e-06, + "loss": 0.1874, + "mean_token_accuracy": 0.9602975726127625, + "num_tokens": 2561716691.0, + "step": 24150 + }, + { + "entropy": 1.36828125, + "epoch": 0.5925273003280936, + "grad_norm": 1.7578125, + "learning_rate": 5.059101159100625e-06, + "loss": 0.1911, + "mean_token_accuracy": 0.9601788830757141, + "num_tokens": 2566995725.0, + "step": 24200 + }, + { + "entropy": 1.36234375, + "epoch": 0.5937515302874492, + "grad_norm": 2.671875, + "learning_rate": 5.054680507863158e-06, + "loss": 0.196, + "mean_token_accuracy": 0.9593268644809723, + "num_tokens": 2572823278.0, + "step": 24250 + }, + { + "entropy": 1.36125, + "epoch": 0.5949757602468048, + "grad_norm": 2.375, + "learning_rate": 5.050251437409581e-06, + "loss": 0.1746, + "mean_token_accuracy": 0.9630362141132355, + "num_tokens": 2577835467.0, + "step": 24300 + }, + { + "entropy": 1.365625, + "epoch": 0.5961999902061603, + "grad_norm": 3.140625, + "learning_rate": 5.045813965888362e-06, + "loss": 0.184, + "mean_token_accuracy": 0.9621260786056518, + "num_tokens": 2582930120.0, + "step": 24350 + }, + { + "entropy": 1.355625, + "epoch": 0.5974242201655159, + "grad_norm": 3.40625, + "learning_rate": 5.04136811148239e-06, + "loss": 0.1697, + "mean_token_accuracy": 0.963900375366211, + "num_tokens": 2587853502.0, + "step": 24400 + }, + { + "entropy": 1.36140625, + "epoch": 0.5986484501248714, + "grad_norm": 2.4375, + "learning_rate": 5.036913892408908e-06, + "loss": 0.1837, + "mean_token_accuracy": 0.9621051216125488, + "num_tokens": 2593227737.0, + "step": 24450 + }, + { + "entropy": 1.3525, + "epoch": 0.599872680084227, + "grad_norm": 2.203125, + "learning_rate": 5.032451326919429e-06, + "loss": 0.1799, + "mean_token_accuracy": 0.962098822593689, + "num_tokens": 2598591436.0, + "step": 24500 + }, + { + "entropy": 1.34015625, + "epoch": 0.6010969100435826, + "grad_norm": 2.53125, + "learning_rate": 5.027980433299671e-06, + "loss": 0.1758, + "mean_token_accuracy": 0.9619297671318054, + "num_tokens": 2604000565.0, + "step": 24550 + }, + { + "entropy": 1.3484375, + "epoch": 0.6023211400029381, + "grad_norm": 2.71875, + "learning_rate": 5.023501229869474e-06, + "loss": 0.1737, + "mean_token_accuracy": 0.9643021488189697, + "num_tokens": 2608991683.0, + "step": 24600 + }, + { + "entropy": 1.33015625, + "epoch": 0.6035453699622937, + "grad_norm": 1.9765625, + "learning_rate": 5.0190137349827266e-06, + "loss": 0.1665, + "mean_token_accuracy": 0.9643359172344208, + "num_tokens": 2614123184.0, + "step": 24650 + }, + { + "entropy": 1.344375, + "epoch": 0.6047695999216492, + "grad_norm": 2.96875, + "learning_rate": 5.014517967027297e-06, + "loss": 0.1805, + "mean_token_accuracy": 0.962350081205368, + "num_tokens": 2619309044.0, + "step": 24700 + }, + { + "entropy": 1.3540625, + "epoch": 0.6059938298810048, + "grad_norm": 2.734375, + "learning_rate": 5.01001394442495e-06, + "loss": 0.1776, + "mean_token_accuracy": 0.9621638679504394, + "num_tokens": 2624919047.0, + "step": 24750 + }, + { + "entropy": 1.34859375, + "epoch": 0.6072180598403604, + "grad_norm": 3.03125, + "learning_rate": 5.005501685631273e-06, + "loss": 0.1733, + "mean_token_accuracy": 0.9635497546195984, + "num_tokens": 2630407723.0, + "step": 24800 + }, + { + "entropy": 1.3534375, + "epoch": 0.6084422897997159, + "grad_norm": 1.5390625, + "learning_rate": 5.000981209135607e-06, + "loss": 0.1781, + "mean_token_accuracy": 0.9629986727237702, + "num_tokens": 2635671685.0, + "step": 24850 + }, + { + "entropy": 1.3459375, + "epoch": 0.6096665197590715, + "grad_norm": 3.71875, + "learning_rate": 4.9964525334609604e-06, + "loss": 0.174, + "mean_token_accuracy": 0.9627162063121796, + "num_tokens": 2641068693.0, + "step": 24900 + }, + { + "entropy": 1.35453125, + "epoch": 0.6108907497184272, + "grad_norm": 2.75, + "learning_rate": 4.99191567716394e-06, + "loss": 0.1796, + "mean_token_accuracy": 0.9617865860462189, + "num_tokens": 2646610014.0, + "step": 24950 + }, + { + "entropy": 1.37453125, + "epoch": 0.6121149796777827, + "grad_norm": 3.109375, + "learning_rate": 4.987370658834675e-06, + "loss": 0.1833, + "mean_token_accuracy": 0.9610668885707855, + "num_tokens": 2651951764.0, + "step": 25000 + }, + { + "entropy": 1.40046875, + "epoch": 0.6133392096371383, + "grad_norm": 3.828125, + "learning_rate": 4.982817497096737e-06, + "loss": 0.1758, + "mean_token_accuracy": 0.9631572890281678, + "num_tokens": 2657065776.0, + "step": 25050 + }, + { + "entropy": 1.38859375, + "epoch": 0.6145634395964938, + "grad_norm": 3.0625, + "learning_rate": 4.978256210607068e-06, + "loss": 0.1738, + "mean_token_accuracy": 0.9639844071865081, + "num_tokens": 2662222291.0, + "step": 25100 + }, + { + "entropy": 1.3496875, + "epoch": 0.6157876695558494, + "grad_norm": 3.21875, + "learning_rate": 4.973686818055901e-06, + "loss": 0.1684, + "mean_token_accuracy": 0.9642084753513336, + "num_tokens": 2667209443.0, + "step": 25150 + }, + { + "entropy": 1.36375, + "epoch": 0.617011899515205, + "grad_norm": 1.859375, + "learning_rate": 4.969109338166683e-06, + "loss": 0.1719, + "mean_token_accuracy": 0.9646093189716339, + "num_tokens": 2672346139.0, + "step": 25200 + }, + { + "entropy": 1.38625, + "epoch": 0.6182361294745605, + "grad_norm": 2.40625, + "learning_rate": 4.964523789695999e-06, + "loss": 0.1855, + "mean_token_accuracy": 0.9612112033367157, + "num_tokens": 2677709139.0, + "step": 25250 + }, + { + "entropy": 1.38171875, + "epoch": 0.6194603594339161, + "grad_norm": 2.90625, + "learning_rate": 4.959930191433498e-06, + "loss": 0.1832, + "mean_token_accuracy": 0.9613463747501373, + "num_tokens": 2682889432.0, + "step": 25300 + }, + { + "entropy": 1.39375, + "epoch": 0.6206845893932716, + "grad_norm": 2.8125, + "learning_rate": 4.955328562201814e-06, + "loss": 0.1953, + "mean_token_accuracy": 0.959397931098938, + "num_tokens": 2688531671.0, + "step": 25350 + }, + { + "entropy": 1.396875, + "epoch": 0.6219088193526272, + "grad_norm": 1.8984375, + "learning_rate": 4.950718920856486e-06, + "loss": 0.1882, + "mean_token_accuracy": 0.9605313742160797, + "num_tokens": 2693586026.0, + "step": 25400 + }, + { + "entropy": 1.38203125, + "epoch": 0.6231330493119828, + "grad_norm": 2.328125, + "learning_rate": 4.946101286285884e-06, + "loss": 0.1708, + "mean_token_accuracy": 0.9638578796386719, + "num_tokens": 2698728829.0, + "step": 25450 + }, + { + "entropy": 1.3803125, + "epoch": 0.6243572792713383, + "grad_norm": 3.53125, + "learning_rate": 4.9414756774111335e-06, + "loss": 0.167, + "mean_token_accuracy": 0.9648666107654571, + "num_tokens": 2703894118.0, + "step": 25500 + }, + { + "entropy": 1.4071875, + "epoch": 0.6255815092306939, + "grad_norm": 3.46875, + "learning_rate": 4.93684211318603e-06, + "loss": 0.1782, + "mean_token_accuracy": 0.962544618844986, + "num_tokens": 2709087928.0, + "step": 25550 + }, + { + "entropy": 1.40078125, + "epoch": 0.6268057391900494, + "grad_norm": 3.28125, + "learning_rate": 4.932200612596974e-06, + "loss": 0.1757, + "mean_token_accuracy": 0.963033629655838, + "num_tokens": 2714244664.0, + "step": 25600 + }, + { + "entropy": 1.401875, + "epoch": 0.628029969149405, + "grad_norm": 3.859375, + "learning_rate": 4.927551194662878e-06, + "loss": 0.1701, + "mean_token_accuracy": 0.9642516016960144, + "num_tokens": 2719276387.0, + "step": 25650 + }, + { + "entropy": 1.4296875, + "epoch": 0.6292541991087606, + "grad_norm": 2.625, + "learning_rate": 4.922893878435101e-06, + "loss": 0.1877, + "mean_token_accuracy": 0.9612637603282929, + "num_tokens": 2724924886.0, + "step": 25700 + }, + { + "entropy": 1.40390625, + "epoch": 0.6304784290681161, + "grad_norm": 2.546875, + "learning_rate": 4.918228682997367e-06, + "loss": 0.1751, + "mean_token_accuracy": 0.9626137948036194, + "num_tokens": 2730190384.0, + "step": 25750 + }, + { + "entropy": 1.4384375, + "epoch": 0.6317026590274717, + "grad_norm": 1.7421875, + "learning_rate": 4.9135556274656825e-06, + "loss": 0.1921, + "mean_token_accuracy": 0.9599238002300262, + "num_tokens": 2735642568.0, + "step": 25800 + }, + { + "entropy": 1.43296875, + "epoch": 0.6329268889868273, + "grad_norm": 2.609375, + "learning_rate": 4.908874730988262e-06, + "loss": 0.1859, + "mean_token_accuracy": 0.9601176917552948, + "num_tokens": 2741009627.0, + "step": 25850 + }, + { + "entropy": 1.42296875, + "epoch": 0.6341511189461828, + "grad_norm": 2.171875, + "learning_rate": 4.904186012745451e-06, + "loss": 0.1836, + "mean_token_accuracy": 0.9604202997684479, + "num_tokens": 2746576865.0, + "step": 25900 + }, + { + "entropy": 1.42078125, + "epoch": 0.6353753489055384, + "grad_norm": 3.109375, + "learning_rate": 4.899489491949643e-06, + "loss": 0.1678, + "mean_token_accuracy": 0.9639356219768525, + "num_tokens": 2751636571.0, + "step": 25950 + }, + { + "entropy": 1.43125, + "epoch": 0.6365995788648939, + "grad_norm": 3.328125, + "learning_rate": 4.894785187845203e-06, + "loss": 0.1763, + "mean_token_accuracy": 0.9626227140426635, + "num_tokens": 2756749043.0, + "step": 26000 + }, + { + "entropy": 1.41953125, + "epoch": 0.6378238088242495, + "grad_norm": 1.921875, + "learning_rate": 4.890073119708392e-06, + "loss": 0.1716, + "mean_token_accuracy": 0.9636380136013031, + "num_tokens": 2761887971.0, + "step": 26050 + }, + { + "entropy": 1.42109375, + "epoch": 0.6390480387836052, + "grad_norm": 2.0625, + "learning_rate": 4.88535330684728e-06, + "loss": 0.1754, + "mean_token_accuracy": 0.9623912250995637, + "num_tokens": 2767051370.0, + "step": 26100 + }, + { + "entropy": 1.4259375, + "epoch": 0.6402722687429607, + "grad_norm": 2.546875, + "learning_rate": 4.880625768601674e-06, + "loss": 0.1781, + "mean_token_accuracy": 0.9622378349304199, + "num_tokens": 2772481902.0, + "step": 26150 + }, + { + "entropy": 1.4315625, + "epoch": 0.6414964987023163, + "grad_norm": 2.484375, + "learning_rate": 4.87589052434304e-06, + "loss": 0.1874, + "mean_token_accuracy": 0.9602720224857331, + "num_tokens": 2777927527.0, + "step": 26200 + }, + { + "entropy": 1.4140625, + "epoch": 0.6427207286616718, + "grad_norm": 2.421875, + "learning_rate": 4.871147593474412e-06, + "loss": 0.184, + "mean_token_accuracy": 0.9599432504177093, + "num_tokens": 2783446389.0, + "step": 26250 + }, + { + "entropy": 1.4053125, + "epoch": 0.6439449586210274, + "grad_norm": 2.40625, + "learning_rate": 4.866396995430328e-06, + "loss": 0.1786, + "mean_token_accuracy": 0.9628067684173583, + "num_tokens": 2788980882.0, + "step": 26300 + }, + { + "entropy": 1.38875, + "epoch": 0.645169188580383, + "grad_norm": 2.71875, + "learning_rate": 4.861638749676737e-06, + "loss": 0.1677, + "mean_token_accuracy": 0.9639978551864624, + "num_tokens": 2793955184.0, + "step": 26350 + }, + { + "entropy": 1.4034375, + "epoch": 0.6463934185397385, + "grad_norm": 1.6953125, + "learning_rate": 4.85687287571093e-06, + "loss": 0.1721, + "mean_token_accuracy": 0.9636970722675323, + "num_tokens": 2799185455.0, + "step": 26400 + }, + { + "entropy": 1.40828125, + "epoch": 0.6476176484990941, + "grad_norm": 3.640625, + "learning_rate": 4.852099393061452e-06, + "loss": 0.1818, + "mean_token_accuracy": 0.962208844423294, + "num_tokens": 2804463803.0, + "step": 26450 + }, + { + "entropy": 1.38484375, + "epoch": 0.6488418784584497, + "grad_norm": 1.75, + "learning_rate": 4.847318321288027e-06, + "loss": 0.165, + "mean_token_accuracy": 0.9649109244346619, + "num_tokens": 2809874779.0, + "step": 26500 + }, + { + "entropy": 1.37953125, + "epoch": 0.6500661084178052, + "grad_norm": 2.984375, + "learning_rate": 4.842529679981474e-06, + "loss": 0.1694, + "mean_token_accuracy": 0.9632159042358398, + "num_tokens": 2814714128.0, + "step": 26550 + }, + { + "entropy": 1.39625, + "epoch": 0.6512903383771608, + "grad_norm": 2.765625, + "learning_rate": 4.8377334887636305e-06, + "loss": 0.1697, + "mean_token_accuracy": 0.9637495183944702, + "num_tokens": 2819740494.0, + "step": 26600 + }, + { + "entropy": 1.39109375, + "epoch": 0.6525145683365163, + "grad_norm": 3.03125, + "learning_rate": 4.8329297672872695e-06, + "loss": 0.1816, + "mean_token_accuracy": 0.9610202670097351, + "num_tokens": 2824966205.0, + "step": 26650 + }, + { + "entropy": 1.37796875, + "epoch": 0.6537387982958719, + "grad_norm": 2.53125, + "learning_rate": 4.828118535236023e-06, + "loss": 0.1742, + "mean_token_accuracy": 0.9625972366333008, + "num_tokens": 2830034251.0, + "step": 26700 + }, + { + "entropy": 1.3953125, + "epoch": 0.6549630282552275, + "grad_norm": 2.28125, + "learning_rate": 4.823299812324291e-06, + "loss": 0.1847, + "mean_token_accuracy": 0.9611959600448609, + "num_tokens": 2835494370.0, + "step": 26750 + }, + { + "entropy": 1.38203125, + "epoch": 0.656187258214583, + "grad_norm": 2.15625, + "learning_rate": 4.818473618297175e-06, + "loss": 0.1728, + "mean_token_accuracy": 0.9636625552177429, + "num_tokens": 2840744565.0, + "step": 26800 + }, + { + "entropy": 1.3696875, + "epoch": 0.6574114881739386, + "grad_norm": 3.671875, + "learning_rate": 4.8136399729303875e-06, + "loss": 0.1599, + "mean_token_accuracy": 0.9664247930049896, + "num_tokens": 2845515500.0, + "step": 26850 + }, + { + "entropy": 1.39671875, + "epoch": 0.6586357181332941, + "grad_norm": 2.140625, + "learning_rate": 4.808798896030171e-06, + "loss": 0.182, + "mean_token_accuracy": 0.9610953998565673, + "num_tokens": 2850746030.0, + "step": 26900 + }, + { + "entropy": 1.38609375, + "epoch": 0.6598599480926497, + "grad_norm": 1.578125, + "learning_rate": 4.803950407433224e-06, + "loss": 0.1774, + "mean_token_accuracy": 0.9627044332027436, + "num_tokens": 2856071580.0, + "step": 26950 + }, + { + "entropy": 1.38640625, + "epoch": 0.6610841780520053, + "grad_norm": 2.359375, + "learning_rate": 4.799094527006611e-06, + "loss": 0.1747, + "mean_token_accuracy": 0.9633591079711914, + "num_tokens": 2861236205.0, + "step": 27000 + }, + { + "entropy": 1.38140625, + "epoch": 0.6623084080113608, + "grad_norm": 2.046875, + "learning_rate": 4.794231274647687e-06, + "loss": 0.175, + "mean_token_accuracy": 0.9629326021671295, + "num_tokens": 2866317531.0, + "step": 27050 + }, + { + "entropy": 1.37421875, + "epoch": 0.6635326379707164, + "grad_norm": 2.765625, + "learning_rate": 4.789360670284014e-06, + "loss": 0.178, + "mean_token_accuracy": 0.962060467004776, + "num_tokens": 2871541131.0, + "step": 27100 + }, + { + "entropy": 1.4078125, + "epoch": 0.6647568679300719, + "grad_norm": 1.921875, + "learning_rate": 4.784482733873279e-06, + "loss": 0.1962, + "mean_token_accuracy": 0.959048901796341, + "num_tokens": 2877146197.0, + "step": 27150 + }, + { + "entropy": 1.3890625, + "epoch": 0.6659810978894275, + "grad_norm": 2.125, + "learning_rate": 4.7795974854032114e-06, + "loss": 0.1823, + "mean_token_accuracy": 0.9619522738456726, + "num_tokens": 2882596630.0, + "step": 27200 + }, + { + "entropy": 1.3603125, + "epoch": 0.6672053278487832, + "grad_norm": 2.421875, + "learning_rate": 4.774704944891505e-06, + "loss": 0.175, + "mean_token_accuracy": 0.9625801253318786, + "num_tokens": 2887948438.0, + "step": 27250 + }, + { + "entropy": 1.39546875, + "epoch": 0.6684295578081387, + "grad_norm": 2.265625, + "learning_rate": 4.769805132385734e-06, + "loss": 0.1879, + "mean_token_accuracy": 0.9613603317737579, + "num_tokens": 2893501173.0, + "step": 27300 + }, + { + "entropy": 1.40875, + "epoch": 0.6696537877674943, + "grad_norm": 2.3125, + "learning_rate": 4.764898067963265e-06, + "loss": 0.1873, + "mean_token_accuracy": 0.9604850566387176, + "num_tokens": 2898869944.0, + "step": 27350 + }, + { + "entropy": 1.37859375, + "epoch": 0.6708780177268499, + "grad_norm": 2.40625, + "learning_rate": 4.759983771731184e-06, + "loss": 0.1679, + "mean_token_accuracy": 0.965053141117096, + "num_tokens": 2903596870.0, + "step": 27400 + }, + { + "entropy": 1.37453125, + "epoch": 0.6721022476862054, + "grad_norm": 2.03125, + "learning_rate": 4.75506226382621e-06, + "loss": 0.1862, + "mean_token_accuracy": 0.9613700366020203, + "num_tokens": 2909474929.0, + "step": 27450 + }, + { + "entropy": 1.36875, + "epoch": 0.673326477645561, + "grad_norm": 2.453125, + "learning_rate": 4.750133564414611e-06, + "loss": 0.1667, + "mean_token_accuracy": 0.9644119250774383, + "num_tokens": 2914673564.0, + "step": 27500 + }, + { + "entropy": 1.396875, + "epoch": 0.6745507076049165, + "grad_norm": 2.796875, + "learning_rate": 4.745197693692121e-06, + "loss": 0.1852, + "mean_token_accuracy": 0.9608116745948792, + "num_tokens": 2920176865.0, + "step": 27550 + }, + { + "entropy": 1.41515625, + "epoch": 0.6757749375642721, + "grad_norm": 1.8359375, + "learning_rate": 4.740254671883864e-06, + "loss": 0.1912, + "mean_token_accuracy": 0.9596376729011535, + "num_tokens": 2925586459.0, + "step": 27600 + }, + { + "entropy": 1.3996875, + "epoch": 0.6769991675236277, + "grad_norm": 3.65625, + "learning_rate": 4.735304519244263e-06, + "loss": 0.1745, + "mean_token_accuracy": 0.9637066113948822, + "num_tokens": 2930825954.0, + "step": 27650 + }, + { + "entropy": 1.3809375, + "epoch": 0.6782233974829832, + "grad_norm": 1.921875, + "learning_rate": 4.73034725605696e-06, + "loss": 0.1658, + "mean_token_accuracy": 0.9653242897987365, + "num_tokens": 2935862959.0, + "step": 27700 + }, + { + "entropy": 1.38953125, + "epoch": 0.6794476274423388, + "grad_norm": 3.0625, + "learning_rate": 4.725382902634733e-06, + "loss": 0.1681, + "mean_token_accuracy": 0.9643997454643249, + "num_tokens": 2940725166.0, + "step": 27750 + }, + { + "entropy": 1.40421875, + "epoch": 0.6806718574016943, + "grad_norm": 2.859375, + "learning_rate": 4.720411479319414e-06, + "loss": 0.1725, + "mean_token_accuracy": 0.9641519057750702, + "num_tokens": 2946188027.0, + "step": 27800 + }, + { + "entropy": 1.40796875, + "epoch": 0.6818960873610499, + "grad_norm": 2.828125, + "learning_rate": 4.7154330064818045e-06, + "loss": 0.1841, + "mean_token_accuracy": 0.9606011056900025, + "num_tokens": 2951612651.0, + "step": 27850 + }, + { + "entropy": 1.395625, + "epoch": 0.6831203173204055, + "grad_norm": 2.96875, + "learning_rate": 4.710447504521588e-06, + "loss": 0.1647, + "mean_token_accuracy": 0.9641698563098907, + "num_tokens": 2956787623.0, + "step": 27900 + }, + { + "entropy": 1.40359375, + "epoch": 0.684344547279761, + "grad_norm": 3.5625, + "learning_rate": 4.705454993867257e-06, + "loss": 0.1751, + "mean_token_accuracy": 0.9634602963924408, + "num_tokens": 2961925459.0, + "step": 27950 + }, + { + "entropy": 1.3925, + "epoch": 0.6855687772391166, + "grad_norm": 1.921875, + "learning_rate": 4.700455494976019e-06, + "loss": 0.1751, + "mean_token_accuracy": 0.9632600677013398, + "num_tokens": 2967274024.0, + "step": 28000 + }, + { + "entropy": 1.3640625, + "epoch": 0.6867930071984721, + "grad_norm": 2.140625, + "learning_rate": 4.695449028333715e-06, + "loss": 0.1581, + "mean_token_accuracy": 0.965574380159378, + "num_tokens": 2972439136.0, + "step": 28050 + }, + { + "entropy": 1.37203125, + "epoch": 0.6880172371578277, + "grad_norm": 2.640625, + "learning_rate": 4.6904356144547405e-06, + "loss": 0.1833, + "mean_token_accuracy": 0.9605630087852478, + "num_tokens": 2977717715.0, + "step": 28100 + }, + { + "entropy": 1.38703125, + "epoch": 0.6892414671171833, + "grad_norm": 2.65625, + "learning_rate": 4.685415273881955e-06, + "loss": 0.1849, + "mean_token_accuracy": 0.9602934348583222, + "num_tokens": 2983019999.0, + "step": 28150 + }, + { + "entropy": 1.36609375, + "epoch": 0.6904656970765388, + "grad_norm": 1.65625, + "learning_rate": 4.6803880271866e-06, + "loss": 0.1635, + "mean_token_accuracy": 0.9659206521511078, + "num_tokens": 2987974089.0, + "step": 28200 + }, + { + "entropy": 1.38875, + "epoch": 0.6916899270358944, + "grad_norm": 2.171875, + "learning_rate": 4.675353894968219e-06, + "loss": 0.1956, + "mean_token_accuracy": 0.958441025018692, + "num_tokens": 2993587967.0, + "step": 28250 + }, + { + "entropy": 1.3828125, + "epoch": 0.6929141569952499, + "grad_norm": 1.796875, + "learning_rate": 4.670312897854568e-06, + "loss": 0.1822, + "mean_token_accuracy": 0.9611673438549042, + "num_tokens": 2999047067.0, + "step": 28300 + }, + { + "entropy": 1.36875, + "epoch": 0.6941383869546055, + "grad_norm": 2.375, + "learning_rate": 4.665265056501529e-06, + "loss": 0.1743, + "mean_token_accuracy": 0.9631416380405426, + "num_tokens": 3004064576.0, + "step": 28350 + }, + { + "entropy": 1.34109375, + "epoch": 0.6953626169139612, + "grad_norm": 3.0625, + "learning_rate": 4.660210391593035e-06, + "loss": 0.1593, + "mean_token_accuracy": 0.9659523034095764, + "num_tokens": 3009178123.0, + "step": 28400 + }, + { + "entropy": 1.36859375, + "epoch": 0.6965868468733167, + "grad_norm": 2.96875, + "learning_rate": 4.655148923840974e-06, + "loss": 0.1848, + "mean_token_accuracy": 0.9613404250144959, + "num_tokens": 3014406061.0, + "step": 28450 + }, + { + "entropy": 1.36828125, + "epoch": 0.6978110768326723, + "grad_norm": 2.234375, + "learning_rate": 4.6500806739851114e-06, + "loss": 0.1754, + "mean_token_accuracy": 0.9632516479492188, + "num_tokens": 3019405252.0, + "step": 28500 + }, + { + "entropy": 1.36640625, + "epoch": 0.6990353067920279, + "grad_norm": 3.265625, + "learning_rate": 4.645005662793002e-06, + "loss": 0.1765, + "mean_token_accuracy": 0.9634008550643921, + "num_tokens": 3024715395.0, + "step": 28550 + }, + { + "entropy": 1.386875, + "epoch": 0.7002595367513834, + "grad_norm": 1.7265625, + "learning_rate": 4.639923911059907e-06, + "loss": 0.1792, + "mean_token_accuracy": 0.9633400416374207, + "num_tokens": 3030214594.0, + "step": 28600 + }, + { + "entropy": 1.36390625, + "epoch": 0.701483766710739, + "grad_norm": 2.828125, + "learning_rate": 4.634835439608706e-06, + "loss": 0.1712, + "mean_token_accuracy": 0.9632709419727326, + "num_tokens": 3035472593.0, + "step": 28650 + }, + { + "entropy": 1.34984375, + "epoch": 0.7027079966700945, + "grad_norm": 2.640625, + "learning_rate": 4.629740269289813e-06, + "loss": 0.1634, + "mean_token_accuracy": 0.9657196223735809, + "num_tokens": 3040576077.0, + "step": 28700 + }, + { + "entropy": 1.37296875, + "epoch": 0.7039322266294501, + "grad_norm": 1.8125, + "learning_rate": 4.6246384209810935e-06, + "loss": 0.1857, + "mean_token_accuracy": 0.9612914025783539, + "num_tokens": 3046057341.0, + "step": 28750 + }, + { + "entropy": 1.35765625, + "epoch": 0.7051564565888057, + "grad_norm": 3.5, + "learning_rate": 4.6195299155877746e-06, + "loss": 0.1752, + "mean_token_accuracy": 0.9628597724437714, + "num_tokens": 3051406159.0, + "step": 28800 + }, + { + "entropy": 1.34625, + "epoch": 0.7063806865481612, + "grad_norm": 2.046875, + "learning_rate": 4.61441477404236e-06, + "loss": 0.1736, + "mean_token_accuracy": 0.963384006023407, + "num_tokens": 3056663844.0, + "step": 28850 + }, + { + "entropy": 1.35421875, + "epoch": 0.7076049165075168, + "grad_norm": 2.546875, + "learning_rate": 4.60929301730455e-06, + "loss": 0.1857, + "mean_token_accuracy": 0.9611174511909485, + "num_tokens": 3062180594.0, + "step": 28900 + }, + { + "entropy": 1.3396875, + "epoch": 0.7088291464668723, + "grad_norm": 2.171875, + "learning_rate": 4.604164666361146e-06, + "loss": 0.1771, + "mean_token_accuracy": 0.9630412280559539, + "num_tokens": 3067629529.0, + "step": 28950 + }, + { + "entropy": 1.3521875, + "epoch": 0.7100533764262279, + "grad_norm": 2.53125, + "learning_rate": 4.599029742225975e-06, + "loss": 0.1854, + "mean_token_accuracy": 0.9603700506687164, + "num_tokens": 3072962675.0, + "step": 29000 + }, + { + "entropy": 1.34265625, + "epoch": 0.7112776063855835, + "grad_norm": 2.578125, + "learning_rate": 4.593888265939793e-06, + "loss": 0.1668, + "mean_token_accuracy": 0.9641862511634827, + "num_tokens": 3078457917.0, + "step": 29050 + }, + { + "entropy": 1.3565625, + "epoch": 0.712501836344939, + "grad_norm": 2.484375, + "learning_rate": 4.5887402585702056e-06, + "loss": 0.1741, + "mean_token_accuracy": 0.9627685403823852, + "num_tokens": 3083722495.0, + "step": 29100 + }, + { + "entropy": 1.3690625, + "epoch": 0.7137260663042946, + "grad_norm": 2.0, + "learning_rate": 4.583585741211583e-06, + "loss": 0.1782, + "mean_token_accuracy": 0.9620171189308167, + "num_tokens": 3089097439.0, + "step": 29150 + }, + { + "entropy": 1.3615625, + "epoch": 0.7149502962636501, + "grad_norm": 2.90625, + "learning_rate": 4.5784247349849666e-06, + "loss": 0.183, + "mean_token_accuracy": 0.9622057628631592, + "num_tokens": 3094373355.0, + "step": 29200 + }, + { + "entropy": 1.3421875, + "epoch": 0.7161745262230057, + "grad_norm": 1.953125, + "learning_rate": 4.57325726103799e-06, + "loss": 0.1771, + "mean_token_accuracy": 0.9627100145816803, + "num_tokens": 3099619006.0, + "step": 29250 + }, + { + "entropy": 1.33015625, + "epoch": 0.7173987561823613, + "grad_norm": 3.296875, + "learning_rate": 4.568083340544785e-06, + "loss": 0.1738, + "mean_token_accuracy": 0.9631901240348816, + "num_tokens": 3104769496.0, + "step": 29300 + }, + { + "entropy": 1.32921875, + "epoch": 0.7186229861417168, + "grad_norm": 2.359375, + "learning_rate": 4.562902994705902e-06, + "loss": 0.1689, + "mean_token_accuracy": 0.9646138906478882, + "num_tokens": 3110079410.0, + "step": 29350 + }, + { + "entropy": 1.3515625, + "epoch": 0.7198472161010724, + "grad_norm": 2.640625, + "learning_rate": 4.557716244748217e-06, + "loss": 0.186, + "mean_token_accuracy": 0.9605904114246369, + "num_tokens": 3115590754.0, + "step": 29400 + }, + { + "entropy": 1.33421875, + "epoch": 0.721071446060428, + "grad_norm": 1.859375, + "learning_rate": 4.55252311192485e-06, + "loss": 0.1727, + "mean_token_accuracy": 0.9634395956993103, + "num_tokens": 3120943769.0, + "step": 29450 + }, + { + "entropy": 1.3384375, + "epoch": 0.7222956760197835, + "grad_norm": 1.8515625, + "learning_rate": 4.547323617515073e-06, + "loss": 0.1754, + "mean_token_accuracy": 0.9623040866851806, + "num_tokens": 3126534469.0, + "step": 29500 + }, + { + "entropy": 1.306875, + "epoch": 0.7235199059791391, + "grad_norm": 3.5, + "learning_rate": 4.542117782824228e-06, + "loss": 0.1649, + "mean_token_accuracy": 0.9650185751914978, + "num_tokens": 3131829007.0, + "step": 29550 + }, + { + "entropy": 1.31984375, + "epoch": 0.7247441359384947, + "grad_norm": 1.7109375, + "learning_rate": 4.536905629183632e-06, + "loss": 0.1844, + "mean_token_accuracy": 0.9605432045459747, + "num_tokens": 3137395527.0, + "step": 29600 + }, + { + "entropy": 1.3121875, + "epoch": 0.7259683658978503, + "grad_norm": 2.3125, + "learning_rate": 4.5316871779505e-06, + "loss": 0.1663, + "mean_token_accuracy": 0.9653282749652863, + "num_tokens": 3142501686.0, + "step": 29650 + }, + { + "entropy": 1.33921875, + "epoch": 0.7271925958572059, + "grad_norm": 1.9765625, + "learning_rate": 4.5264624505078485e-06, + "loss": 0.1796, + "mean_token_accuracy": 0.9623512411117554, + "num_tokens": 3147984109.0, + "step": 29700 + }, + { + "entropy": 1.3259375, + "epoch": 0.7284168258165614, + "grad_norm": 3.671875, + "learning_rate": 4.521231468264411e-06, + "loss": 0.173, + "mean_token_accuracy": 0.9634522151947021, + "num_tokens": 3153428961.0, + "step": 29750 + }, + { + "entropy": 1.339375, + "epoch": 0.729641055775917, + "grad_norm": 1.8046875, + "learning_rate": 4.515994252654552e-06, + "loss": 0.1846, + "mean_token_accuracy": 0.9607186770439148, + "num_tokens": 3158828246.0, + "step": 29800 + }, + { + "entropy": 1.29671875, + "epoch": 0.7308652857352725, + "grad_norm": 3.140625, + "learning_rate": 4.510750825138178e-06, + "loss": 0.1608, + "mean_token_accuracy": 0.9657926094532013, + "num_tokens": 3163804439.0, + "step": 29850 + }, + { + "entropy": 1.3315625, + "epoch": 0.7320895156946281, + "grad_norm": 2.9375, + "learning_rate": 4.505501207200649e-06, + "loss": 0.1818, + "mean_token_accuracy": 0.9619475591182709, + "num_tokens": 3169333412.0, + "step": 29900 + }, + { + "entropy": 1.324375, + "epoch": 0.7333137456539837, + "grad_norm": 2.15625, + "learning_rate": 4.500245420352687e-06, + "loss": 0.1733, + "mean_token_accuracy": 0.963250036239624, + "num_tokens": 3174683947.0, + "step": 29950 + }, + { + "entropy": 1.32015625, + "epoch": 0.7345379756133392, + "grad_norm": 3.171875, + "learning_rate": 4.494983486130298e-06, + "loss": 0.1755, + "mean_token_accuracy": 0.9633795261383057, + "num_tokens": 3179817804.0, + "step": 30000 + }, + { + "epoch": 0.7345379756133392, + "eval_entropy": 1.3244140625, + "eval_loss": 0.1920091211795807, + "eval_mean_token_accuracy": 0.9597868000467619, + "eval_num_tokens": 3179817804.0, + "eval_runtime": 606.2695, + "eval_samples_per_second": 15.927, + "eval_steps_per_second": 0.2, + "step": 30000 + }, + { + "entropy": 1.34265625, + "epoch": 0.7357622055726948, + "grad_norm": 2.828125, + "learning_rate": 4.489715426094674e-06, + "loss": 0.1971, + "mean_token_accuracy": 0.9590841460227967, + "num_tokens": 3185695558.0, + "step": 30050 + }, + { + "entropy": 1.33234375, + "epoch": 0.7369864355320503, + "grad_norm": 2.28125, + "learning_rate": 4.484441261832107e-06, + "loss": 0.1767, + "mean_token_accuracy": 0.9629596638679504, + "num_tokens": 3191177099.0, + "step": 30100 + }, + { + "entropy": 1.3253125, + "epoch": 0.7382106654914059, + "grad_norm": 2.75, + "learning_rate": 4.479161014953903e-06, + "loss": 0.1795, + "mean_token_accuracy": 0.9617591965198516, + "num_tokens": 3196688072.0, + "step": 30150 + }, + { + "entropy": 1.3171875, + "epoch": 0.7394348954507615, + "grad_norm": 2.578125, + "learning_rate": 4.473874707096293e-06, + "loss": 0.185, + "mean_token_accuracy": 0.9615085804462433, + "num_tokens": 3202252950.0, + "step": 30200 + }, + { + "entropy": 1.3203125, + "epoch": 0.740659125410117, + "grad_norm": 3.078125, + "learning_rate": 4.46858235992034e-06, + "loss": 0.1716, + "mean_token_accuracy": 0.9639656889438629, + "num_tokens": 3207720004.0, + "step": 30250 + }, + { + "entropy": 1.33046875, + "epoch": 0.7418833553694726, + "grad_norm": 3.4375, + "learning_rate": 4.463283995111858e-06, + "loss": 0.1909, + "mean_token_accuracy": 0.9597360849380493, + "num_tokens": 3213270190.0, + "step": 30300 + }, + { + "entropy": 1.32171875, + "epoch": 0.7431075853288281, + "grad_norm": 3.671875, + "learning_rate": 4.4579796343813155e-06, + "loss": 0.1746, + "mean_token_accuracy": 0.9631195080280304, + "num_tokens": 3218354333.0, + "step": 30350 + }, + { + "entropy": 1.3359375, + "epoch": 0.7443318152881837, + "grad_norm": 3.15625, + "learning_rate": 4.452669299463749e-06, + "loss": 0.172, + "mean_token_accuracy": 0.963985036611557, + "num_tokens": 3223570126.0, + "step": 30400 + }, + { + "entropy": 1.32640625, + "epoch": 0.7455560452475393, + "grad_norm": 1.8125, + "learning_rate": 4.44735301211868e-06, + "loss": 0.1807, + "mean_token_accuracy": 0.9622200524806976, + "num_tokens": 3228934737.0, + "step": 30450 + }, + { + "entropy": 1.34375, + "epoch": 0.7467802752068948, + "grad_norm": 3.109375, + "learning_rate": 4.442030794130013e-06, + "loss": 0.1719, + "mean_token_accuracy": 0.9641703021526337, + "num_tokens": 3234092609.0, + "step": 30500 + }, + { + "entropy": 1.3525, + "epoch": 0.7480045051662504, + "grad_norm": 2.328125, + "learning_rate": 4.43670266730596e-06, + "loss": 0.1871, + "mean_token_accuracy": 0.9610934937000275, + "num_tokens": 3239470570.0, + "step": 30550 + }, + { + "entropy": 1.35859375, + "epoch": 0.749228735125606, + "grad_norm": 3.234375, + "learning_rate": 4.431368653478943e-06, + "loss": 0.1799, + "mean_token_accuracy": 0.9625358593463897, + "num_tokens": 3245129970.0, + "step": 30600 + }, + { + "entropy": 1.36859375, + "epoch": 0.7504529650849615, + "grad_norm": 1.921875, + "learning_rate": 4.426028774505504e-06, + "loss": 0.1895, + "mean_token_accuracy": 0.9608589220046997, + "num_tokens": 3250417534.0, + "step": 30650 + }, + { + "entropy": 1.37203125, + "epoch": 0.7516771950443171, + "grad_norm": 3.125, + "learning_rate": 4.420683052266223e-06, + "loss": 0.1962, + "mean_token_accuracy": 0.9591640889644623, + "num_tokens": 3256202020.0, + "step": 30700 + }, + { + "entropy": 1.35421875, + "epoch": 0.7529014250036727, + "grad_norm": 2.84375, + "learning_rate": 4.415331508665619e-06, + "loss": 0.1723, + "mean_token_accuracy": 0.9638619077205658, + "num_tokens": 3261559010.0, + "step": 30750 + }, + { + "entropy": 1.36328125, + "epoch": 0.7541256549630283, + "grad_norm": 3.5625, + "learning_rate": 4.409974165632064e-06, + "loss": 0.1819, + "mean_token_accuracy": 0.9618020045757294, + "num_tokens": 3267151095.0, + "step": 30800 + }, + { + "entropy": 1.3546875, + "epoch": 0.7553498849223839, + "grad_norm": 3.484375, + "learning_rate": 4.404611045117696e-06, + "loss": 0.1792, + "mean_token_accuracy": 0.9617926621437073, + "num_tokens": 3272412916.0, + "step": 30850 + }, + { + "entropy": 1.3534375, + "epoch": 0.7565741148817394, + "grad_norm": 2.578125, + "learning_rate": 4.399242169098329e-06, + "loss": 0.1745, + "mean_token_accuracy": 0.9625967741012573, + "num_tokens": 3277577448.0, + "step": 30900 + }, + { + "entropy": 1.35625, + "epoch": 0.757798344841095, + "grad_norm": 2.65625, + "learning_rate": 4.393867559573354e-06, + "loss": 0.1744, + "mean_token_accuracy": 0.9626732635498046, + "num_tokens": 3282706579.0, + "step": 30950 + }, + { + "entropy": 1.36421875, + "epoch": 0.7590225748004505, + "grad_norm": 3.0, + "learning_rate": 4.388487238565661e-06, + "loss": 0.1784, + "mean_token_accuracy": 0.9623777115345001, + "num_tokens": 3287949862.0, + "step": 31000 + }, + { + "entropy": 1.36, + "epoch": 0.7602468047598061, + "grad_norm": 0.0230712890625, + "learning_rate": 4.383101228121541e-06, + "loss": 0.1788, + "mean_token_accuracy": 0.9617887794971466, + "num_tokens": 3293406088.0, + "step": 31050 + }, + { + "entropy": 1.35609375, + "epoch": 0.7614710347191617, + "grad_norm": 2.984375, + "learning_rate": 4.377709550310598e-06, + "loss": 0.1699, + "mean_token_accuracy": 0.9636480760574341, + "num_tokens": 3298608896.0, + "step": 31100 + }, + { + "entropy": 1.35375, + "epoch": 0.7626952646785172, + "grad_norm": 3.65625, + "learning_rate": 4.37231222722566e-06, + "loss": 0.1643, + "mean_token_accuracy": 0.9644955229759217, + "num_tokens": 3303290550.0, + "step": 31150 + }, + { + "entropy": 1.37390625, + "epoch": 0.7639194946378728, + "grad_norm": 2.46875, + "learning_rate": 4.366909280982685e-06, + "loss": 0.1766, + "mean_token_accuracy": 0.9628056597709655, + "num_tokens": 3308295645.0, + "step": 31200 + }, + { + "entropy": 1.36515625, + "epoch": 0.7651437245972283, + "grad_norm": 2.453125, + "learning_rate": 4.361500733720674e-06, + "loss": 0.1662, + "mean_token_accuracy": 0.9649233341217041, + "num_tokens": 3313438478.0, + "step": 31250 + }, + { + "entropy": 1.3575, + "epoch": 0.7663679545565839, + "grad_norm": 4.46875, + "learning_rate": 4.356086607601575e-06, + "loss": 0.1749, + "mean_token_accuracy": 0.9627750849723816, + "num_tokens": 3319025887.0, + "step": 31300 + }, + { + "entropy": 1.34359375, + "epoch": 0.7675921845159395, + "grad_norm": 2.6875, + "learning_rate": 4.350666924810203e-06, + "loss": 0.1647, + "mean_token_accuracy": 0.9644002187252044, + "num_tokens": 3323975976.0, + "step": 31350 + }, + { + "entropy": 1.35203125, + "epoch": 0.768816414475295, + "grad_norm": 3.765625, + "learning_rate": 4.345241707554134e-06, + "loss": 0.1674, + "mean_token_accuracy": 0.9647248589992523, + "num_tokens": 3329356054.0, + "step": 31400 + }, + { + "entropy": 1.36625, + "epoch": 0.7700406444346506, + "grad_norm": 2.6875, + "learning_rate": 4.339810978063626e-06, + "loss": 0.1776, + "mean_token_accuracy": 0.9627327370643616, + "num_tokens": 3334739313.0, + "step": 31450 + }, + { + "entropy": 1.35125, + "epoch": 0.7712648743940062, + "grad_norm": 1.875, + "learning_rate": 4.334374758591524e-06, + "loss": 0.1896, + "mean_token_accuracy": 0.9596246099472046, + "num_tokens": 3340200973.0, + "step": 31500 + }, + { + "entropy": 1.36171875, + "epoch": 0.7724891043533617, + "grad_norm": 2.328125, + "learning_rate": 4.328933071413168e-06, + "loss": 0.1731, + "mean_token_accuracy": 0.9636253571510315, + "num_tokens": 3345689303.0, + "step": 31550 + }, + { + "entropy": 1.36078125, + "epoch": 0.7737133343127173, + "grad_norm": 3.4375, + "learning_rate": 4.323485938826302e-06, + "loss": 0.1896, + "mean_token_accuracy": 0.9603872370719909, + "num_tokens": 3350984033.0, + "step": 31600 + }, + { + "entropy": 1.3403125, + "epoch": 0.7749375642720728, + "grad_norm": 2.5, + "learning_rate": 4.318033383150981e-06, + "loss": 0.1735, + "mean_token_accuracy": 0.9628359317779541, + "num_tokens": 3356162417.0, + "step": 31650 + }, + { + "entropy": 1.34640625, + "epoch": 0.7761617942314284, + "grad_norm": 2.0625, + "learning_rate": 4.312575426729486e-06, + "loss": 0.1848, + "mean_token_accuracy": 0.9605207931995392, + "num_tokens": 3361647453.0, + "step": 31700 + }, + { + "entropy": 1.33171875, + "epoch": 0.777386024190784, + "grad_norm": 1.9453125, + "learning_rate": 4.307112091926226e-06, + "loss": 0.1637, + "mean_token_accuracy": 0.965142446756363, + "num_tokens": 3366481444.0, + "step": 31750 + }, + { + "entropy": 1.37390625, + "epoch": 0.7786102541501395, + "grad_norm": 3.1875, + "learning_rate": 4.301643401127647e-06, + "loss": 0.1778, + "mean_token_accuracy": 0.9628903007507325, + "num_tokens": 3371649682.0, + "step": 31800 + }, + { + "entropy": 1.3721875, + "epoch": 0.7798344841094951, + "grad_norm": 2.625, + "learning_rate": 4.2961693767421435e-06, + "loss": 0.1645, + "mean_token_accuracy": 0.9658307003974914, + "num_tokens": 3376382887.0, + "step": 31850 + }, + { + "entropy": 1.358125, + "epoch": 0.7810587140688507, + "grad_norm": 2.921875, + "learning_rate": 4.290690041199963e-06, + "loss": 0.179, + "mean_token_accuracy": 0.9622143077850341, + "num_tokens": 3381791030.0, + "step": 31900 + }, + { + "entropy": 1.37015625, + "epoch": 0.7822829440282063, + "grad_norm": 2.125, + "learning_rate": 4.285205416953118e-06, + "loss": 0.1876, + "mean_token_accuracy": 0.9609373700618744, + "num_tokens": 3387334981.0, + "step": 31950 + }, + { + "entropy": 1.34765625, + "epoch": 0.7835071739875619, + "grad_norm": 2.515625, + "learning_rate": 4.279715526475289e-06, + "loss": 0.1762, + "mean_token_accuracy": 0.962603681087494, + "num_tokens": 3392713314.0, + "step": 32000 + }, + { + "entropy": 1.3678125, + "epoch": 0.7847314039469174, + "grad_norm": 2.609375, + "learning_rate": 4.274220392261738e-06, + "loss": 0.1887, + "mean_token_accuracy": 0.9606349515914917, + "num_tokens": 3398537796.0, + "step": 32050 + }, + { + "entropy": 1.33734375, + "epoch": 0.785955633906273, + "grad_norm": 2.921875, + "learning_rate": 4.268720036829214e-06, + "loss": 0.1748, + "mean_token_accuracy": 0.964071912765503, + "num_tokens": 3403920236.0, + "step": 32100 + }, + { + "entropy": 1.37, + "epoch": 0.7871798638656285, + "grad_norm": 2.328125, + "learning_rate": 4.263214482715857e-06, + "loss": 0.1654, + "mean_token_accuracy": 0.9644496822357178, + "num_tokens": 3409108918.0, + "step": 32150 + }, + { + "entropy": 1.35046875, + "epoch": 0.7884040938249841, + "grad_norm": 3.125, + "learning_rate": 4.2577037524811104e-06, + "loss": 0.1714, + "mean_token_accuracy": 0.9636311101913452, + "num_tokens": 3414387238.0, + "step": 32200 + }, + { + "entropy": 1.34359375, + "epoch": 0.7896283237843397, + "grad_norm": 2.328125, + "learning_rate": 4.25218786870563e-06, + "loss": 0.1552, + "mean_token_accuracy": 0.965884006023407, + "num_tokens": 3419148471.0, + "step": 32250 + }, + { + "entropy": 1.34875, + "epoch": 0.7908525537436952, + "grad_norm": 0.004241943359375, + "learning_rate": 4.246666853991186e-06, + "loss": 0.1676, + "mean_token_accuracy": 0.9639466750621796, + "num_tokens": 3424295496.0, + "step": 32300 + }, + { + "entropy": 1.364375, + "epoch": 0.7920767837030508, + "grad_norm": 1.6953125, + "learning_rate": 4.241140730960573e-06, + "loss": 0.1829, + "mean_token_accuracy": 0.9615444934368134, + "num_tokens": 3429846223.0, + "step": 32350 + }, + { + "entropy": 1.33828125, + "epoch": 0.7933010136624064, + "grad_norm": 3.53125, + "learning_rate": 4.235609522257517e-06, + "loss": 0.178, + "mean_token_accuracy": 0.9621382772922515, + "num_tokens": 3434814232.0, + "step": 32400 + }, + { + "entropy": 1.37265625, + "epoch": 0.7945252436217619, + "grad_norm": 2.28125, + "learning_rate": 4.230073250546585e-06, + "loss": 0.1854, + "mean_token_accuracy": 0.9616455745697021, + "num_tokens": 3440013747.0, + "step": 32450 + }, + { + "entropy": 1.33484375, + "epoch": 0.7957494735811175, + "grad_norm": 2.828125, + "learning_rate": 4.224531938513088e-06, + "loss": 0.175, + "mean_token_accuracy": 0.9632323062419892, + "num_tokens": 3445299571.0, + "step": 32500 + }, + { + "entropy": 1.34203125, + "epoch": 0.796973703540473, + "grad_norm": 2.421875, + "learning_rate": 4.218985608862992e-06, + "loss": 0.1814, + "mean_token_accuracy": 0.9623367011547088, + "num_tokens": 3450664579.0, + "step": 32550 + }, + { + "entropy": 1.3540625, + "epoch": 0.7981979334998286, + "grad_norm": 2.0625, + "learning_rate": 4.213434284322819e-06, + "loss": 0.1729, + "mean_token_accuracy": 0.9627703261375428, + "num_tokens": 3455979121.0, + "step": 32600 + }, + { + "entropy": 1.33734375, + "epoch": 0.7994221634591842, + "grad_norm": 2.796875, + "learning_rate": 4.207877987639566e-06, + "loss": 0.1764, + "mean_token_accuracy": 0.9627932643890381, + "num_tokens": 3461283678.0, + "step": 32650 + }, + { + "entropy": 1.3596875, + "epoch": 0.8006463934185397, + "grad_norm": 1.8984375, + "learning_rate": 4.202316741580594e-06, + "loss": 0.1854, + "mean_token_accuracy": 0.9612032771110535, + "num_tokens": 3467126201.0, + "step": 32700 + }, + { + "entropy": 1.344375, + "epoch": 0.8018706233778953, + "grad_norm": 2.921875, + "learning_rate": 4.196750568933551e-06, + "loss": 0.1721, + "mean_token_accuracy": 0.9638476753234864, + "num_tokens": 3472599559.0, + "step": 32750 + }, + { + "entropy": 1.3415625, + "epoch": 0.8030948533372508, + "grad_norm": 2.34375, + "learning_rate": 4.191179492506271e-06, + "loss": 0.1754, + "mean_token_accuracy": 0.9628195893764496, + "num_tokens": 3477994415.0, + "step": 32800 + }, + { + "entropy": 1.34953125, + "epoch": 0.8043190832966064, + "grad_norm": 2.15625, + "learning_rate": 4.18560353512668e-06, + "loss": 0.1778, + "mean_token_accuracy": 0.9618653762340545, + "num_tokens": 3483437386.0, + "step": 32850 + }, + { + "entropy": 1.34390625, + "epoch": 0.805543313255962, + "grad_norm": 2.875, + "learning_rate": 4.1800227196427055e-06, + "loss": 0.1751, + "mean_token_accuracy": 0.9623115694522858, + "num_tokens": 3488795577.0, + "step": 32900 + }, + { + "entropy": 1.32609375, + "epoch": 0.8067675432153175, + "grad_norm": 1.9921875, + "learning_rate": 4.17443706892218e-06, + "loss": 0.1766, + "mean_token_accuracy": 0.9626455020904541, + "num_tokens": 3494139245.0, + "step": 32950 + }, + { + "entropy": 1.34953125, + "epoch": 0.8079917731746731, + "grad_norm": 3.640625, + "learning_rate": 4.168846605852751e-06, + "loss": 0.1811, + "mean_token_accuracy": 0.9624789762496948, + "num_tokens": 3499294686.0, + "step": 33000 + }, + { + "entropy": 1.34546875, + "epoch": 0.8092160031340287, + "grad_norm": 3.234375, + "learning_rate": 4.1632513533417825e-06, + "loss": 0.1629, + "mean_token_accuracy": 0.9650925529003144, + "num_tokens": 3504042622.0, + "step": 33050 + }, + { + "entropy": 1.3675, + "epoch": 0.8104402330933843, + "grad_norm": 1.8984375, + "learning_rate": 4.157651334316264e-06, + "loss": 0.159, + "mean_token_accuracy": 0.9659399092197418, + "num_tokens": 3509103882.0, + "step": 33100 + }, + { + "entropy": 1.35625, + "epoch": 0.8116644630527399, + "grad_norm": 1.9765625, + "learning_rate": 4.1520465717227206e-06, + "loss": 0.1782, + "mean_token_accuracy": 0.9628897225856781, + "num_tokens": 3514150747.0, + "step": 33150 + }, + { + "entropy": 1.3603125, + "epoch": 0.8128886930120954, + "grad_norm": 2.859375, + "learning_rate": 4.146437088527108e-06, + "loss": 0.1811, + "mean_token_accuracy": 0.9617001414299011, + "num_tokens": 3519220750.0, + "step": 33200 + }, + { + "entropy": 1.36859375, + "epoch": 0.814112922971451, + "grad_norm": 2.921875, + "learning_rate": 4.140822907714728e-06, + "loss": 0.1885, + "mean_token_accuracy": 0.9607588303089142, + "num_tokens": 3524668178.0, + "step": 33250 + }, + { + "entropy": 1.35484375, + "epoch": 0.8153371529308066, + "grad_norm": 1.6015625, + "learning_rate": 4.135204052290131e-06, + "loss": 0.1645, + "mean_token_accuracy": 0.9654926788806916, + "num_tokens": 3529737924.0, + "step": 33300 + }, + { + "entropy": 1.33109375, + "epoch": 0.8165613828901621, + "grad_norm": 3.0, + "learning_rate": 4.129580545277023e-06, + "loss": 0.1637, + "mean_token_accuracy": 0.9648844826221467, + "num_tokens": 3534673592.0, + "step": 33350 + }, + { + "entropy": 1.33046875, + "epoch": 0.8177856128495177, + "grad_norm": 2.1875, + "learning_rate": 4.123952409718169e-06, + "loss": 0.1705, + "mean_token_accuracy": 0.963813624382019, + "num_tokens": 3539705624.0, + "step": 33400 + }, + { + "entropy": 1.3225, + "epoch": 0.8190098428088732, + "grad_norm": 2.65625, + "learning_rate": 4.118319668675301e-06, + "loss": 0.1607, + "mean_token_accuracy": 0.9656564962863922, + "num_tokens": 3544723634.0, + "step": 33450 + }, + { + "entropy": 1.34328125, + "epoch": 0.8202340727682288, + "grad_norm": 3.625, + "learning_rate": 4.112682345229019e-06, + "loss": 0.1858, + "mean_token_accuracy": 0.9613649821281434, + "num_tokens": 3550196451.0, + "step": 33500 + }, + { + "entropy": 1.34546875, + "epoch": 0.8214583027275844, + "grad_norm": 2.375, + "learning_rate": 4.107040462478706e-06, + "loss": 0.1698, + "mean_token_accuracy": 0.9640332353115082, + "num_tokens": 3555769583.0, + "step": 33550 + }, + { + "entropy": 1.35515625, + "epoch": 0.8226825326869399, + "grad_norm": 4.6875, + "learning_rate": 4.101394043542421e-06, + "loss": 0.1781, + "mean_token_accuracy": 0.9626898431777954, + "num_tokens": 3560775725.0, + "step": 33600 + }, + { + "entropy": 1.37046875, + "epoch": 0.8239067626462955, + "grad_norm": 2.828125, + "learning_rate": 4.095743111556813e-06, + "loss": 0.1822, + "mean_token_accuracy": 0.9615408968925476, + "num_tokens": 3566233997.0, + "step": 33650 + }, + { + "entropy": 1.3565625, + "epoch": 0.825130992605651, + "grad_norm": 2.6875, + "learning_rate": 4.090087689677025e-06, + "loss": 0.1798, + "mean_token_accuracy": 0.9622524130344391, + "num_tokens": 3571629994.0, + "step": 33700 + }, + { + "entropy": 1.35453125, + "epoch": 0.8263552225650066, + "grad_norm": 2.34375, + "learning_rate": 4.084427801076592e-06, + "loss": 0.1631, + "mean_token_accuracy": 0.965935331583023, + "num_tokens": 3576662114.0, + "step": 33750 + }, + { + "entropy": 1.36453125, + "epoch": 0.8275794525243622, + "grad_norm": 2.609375, + "learning_rate": 4.0787634689473605e-06, + "loss": 0.1704, + "mean_token_accuracy": 0.9641584491729737, + "num_tokens": 3581699530.0, + "step": 33800 + }, + { + "entropy": 1.33421875, + "epoch": 0.8288036824837177, + "grad_norm": 3.71875, + "learning_rate": 4.0730947164993775e-06, + "loss": 0.1746, + "mean_token_accuracy": 0.9626482093334198, + "num_tokens": 3586891414.0, + "step": 33850 + }, + { + "entropy": 1.34828125, + "epoch": 0.8300279124430733, + "grad_norm": 2.9375, + "learning_rate": 4.067421566960805e-06, + "loss": 0.173, + "mean_token_accuracy": 0.9637481319904327, + "num_tokens": 3591845863.0, + "step": 33900 + }, + { + "entropy": 1.32796875, + "epoch": 0.8312521424024288, + "grad_norm": 4.3125, + "learning_rate": 4.061744043577822e-06, + "loss": 0.1826, + "mean_token_accuracy": 0.960258857011795, + "num_tokens": 3597325814.0, + "step": 33950 + }, + { + "entropy": 1.343125, + "epoch": 0.8324763723617844, + "grad_norm": 3.65625, + "learning_rate": 4.056062169614533e-06, + "loss": 0.1788, + "mean_token_accuracy": 0.9624998271465302, + "num_tokens": 3602589177.0, + "step": 34000 + }, + { + "entropy": 1.33171875, + "epoch": 0.83370060232114, + "grad_norm": 5.5, + "learning_rate": 4.050375968352865e-06, + "loss": 0.1749, + "mean_token_accuracy": 0.9635978293418884, + "num_tokens": 3607686315.0, + "step": 34050 + }, + { + "entropy": 1.35046875, + "epoch": 0.8349248322804955, + "grad_norm": 2.921875, + "learning_rate": 4.044685463092477e-06, + "loss": 0.1823, + "mean_token_accuracy": 0.9619014573097229, + "num_tokens": 3613032357.0, + "step": 34100 + }, + { + "entropy": 1.3278125, + "epoch": 0.8361490622398511, + "grad_norm": 3.796875, + "learning_rate": 4.0389906771506666e-06, + "loss": 0.1567, + "mean_token_accuracy": 0.9672730362415314, + "num_tokens": 3617947758.0, + "step": 34150 + }, + { + "entropy": 1.3509375, + "epoch": 0.8373732921992068, + "grad_norm": 3.0, + "learning_rate": 4.03329163386227e-06, + "loss": 0.1821, + "mean_token_accuracy": 0.9615289163589478, + "num_tokens": 3623324648.0, + "step": 34200 + }, + { + "entropy": 1.36625, + "epoch": 0.8385975221585623, + "grad_norm": 2.21875, + "learning_rate": 4.027588356579567e-06, + "loss": 0.1807, + "mean_token_accuracy": 0.962299063205719, + "num_tokens": 3628936189.0, + "step": 34250 + }, + { + "entropy": 1.34484375, + "epoch": 0.8398217521179179, + "grad_norm": 1.9375, + "learning_rate": 4.0218808686721884e-06, + "loss": 0.1766, + "mean_token_accuracy": 0.9632388269901275, + "num_tokens": 3634256824.0, + "step": 34300 + }, + { + "entropy": 1.3365625, + "epoch": 0.8410459820772734, + "grad_norm": 3.265625, + "learning_rate": 4.01616919352702e-06, + "loss": 0.1653, + "mean_token_accuracy": 0.9652460610866547, + "num_tokens": 3639058717.0, + "step": 34350 + }, + { + "entropy": 1.3490625, + "epoch": 0.842270212036629, + "grad_norm": 3.53125, + "learning_rate": 4.010453354548101e-06, + "loss": 0.1587, + "mean_token_accuracy": 0.9665447866916657, + "num_tokens": 3644031006.0, + "step": 34400 + }, + { + "entropy": 1.36546875, + "epoch": 0.8434944419959846, + "grad_norm": 2.0625, + "learning_rate": 4.004733375156534e-06, + "loss": 0.1862, + "mean_token_accuracy": 0.9608346676826477, + "num_tokens": 3649652142.0, + "step": 34450 + }, + { + "entropy": 1.36640625, + "epoch": 0.8447186719553401, + "grad_norm": 2.34375, + "learning_rate": 3.999009278790389e-06, + "loss": 0.1692, + "mean_token_accuracy": 0.9642466914653778, + "num_tokens": 3654831381.0, + "step": 34500 + }, + { + "entropy": 1.35890625, + "epoch": 0.8459429019146957, + "grad_norm": 1.96875, + "learning_rate": 3.993281088904603e-06, + "loss": 0.1659, + "mean_token_accuracy": 0.9651599872112274, + "num_tokens": 3659811312.0, + "step": 34550 + }, + { + "entropy": 1.36734375, + "epoch": 0.8471671318740512, + "grad_norm": 3.578125, + "learning_rate": 3.9875488289708895e-06, + "loss": 0.1693, + "mean_token_accuracy": 0.9640548026561737, + "num_tokens": 3665088140.0, + "step": 34600 + }, + { + "entropy": 1.35578125, + "epoch": 0.8483913618334068, + "grad_norm": 2.671875, + "learning_rate": 3.981812522477634e-06, + "loss": 0.1683, + "mean_token_accuracy": 0.9642880761623382, + "num_tokens": 3670199765.0, + "step": 34650 + }, + { + "entropy": 1.371875, + "epoch": 0.8496155917927624, + "grad_norm": 2.125, + "learning_rate": 3.976072192929812e-06, + "loss": 0.1859, + "mean_token_accuracy": 0.961214131116867, + "num_tokens": 3675973370.0, + "step": 34700 + }, + { + "entropy": 1.335, + "epoch": 0.8508398217521179, + "grad_norm": 2.234375, + "learning_rate": 3.970327863848874e-06, + "loss": 0.163, + "mean_token_accuracy": 0.9652379751205444, + "num_tokens": 3680935151.0, + "step": 34750 + }, + { + "entropy": 1.35953125, + "epoch": 0.8520640517114735, + "grad_norm": 3.984375, + "learning_rate": 3.964579558772665e-06, + "loss": 0.1686, + "mean_token_accuracy": 0.9643210101127625, + "num_tokens": 3686151191.0, + "step": 34800 + }, + { + "entropy": 1.35, + "epoch": 0.853288281670829, + "grad_norm": 2.46875, + "learning_rate": 3.95882730125532e-06, + "loss": 0.1755, + "mean_token_accuracy": 0.9624910676479339, + "num_tokens": 3691478654.0, + "step": 34850 + }, + { + "entropy": 1.338125, + "epoch": 0.8545125116301846, + "grad_norm": 2.109375, + "learning_rate": 3.953071114867171e-06, + "loss": 0.1711, + "mean_token_accuracy": 0.9633730280399323, + "num_tokens": 3696633906.0, + "step": 34900 + }, + { + "entropy": 1.34890625, + "epoch": 0.8557367415895402, + "grad_norm": 1.890625, + "learning_rate": 3.947311023194645e-06, + "loss": 0.1804, + "mean_token_accuracy": 0.9618865346908569, + "num_tokens": 3701978753.0, + "step": 34950 + }, + { + "entropy": 1.3384375, + "epoch": 0.8569609715488957, + "grad_norm": 2.203125, + "learning_rate": 3.941547049840176e-06, + "loss": 0.1645, + "mean_token_accuracy": 0.9649497640132904, + "num_tokens": 3706915348.0, + "step": 35000 + }, + { + "entropy": 1.32359375, + "epoch": 0.8581852015082513, + "grad_norm": 2.171875, + "learning_rate": 3.9357792184221005e-06, + "loss": 0.1739, + "mean_token_accuracy": 0.9632923007011414, + "num_tokens": 3712046907.0, + "step": 35050 + }, + { + "entropy": 1.3240625, + "epoch": 0.8594094314676068, + "grad_norm": 3.078125, + "learning_rate": 3.930007552574564e-06, + "loss": 0.1763, + "mean_token_accuracy": 0.9626149117946625, + "num_tokens": 3717274859.0, + "step": 35100 + }, + { + "entropy": 1.33484375, + "epoch": 0.8606336614269624, + "grad_norm": 3.03125, + "learning_rate": 3.924232075947427e-06, + "loss": 0.186, + "mean_token_accuracy": 0.9613423335552216, + "num_tokens": 3722674538.0, + "step": 35150 + }, + { + "entropy": 1.33484375, + "epoch": 0.861857891386318, + "grad_norm": 3.40625, + "learning_rate": 3.918452812206159e-06, + "loss": 0.1777, + "mean_token_accuracy": 0.9628440749645233, + "num_tokens": 3727975730.0, + "step": 35200 + }, + { + "entropy": 1.34125, + "epoch": 0.8630821213456735, + "grad_norm": 1.8359375, + "learning_rate": 3.9126697850317525e-06, + "loss": 0.1761, + "mean_token_accuracy": 0.963371901512146, + "num_tokens": 3733241093.0, + "step": 35250 + }, + { + "entropy": 1.34328125, + "epoch": 0.8643063513050291, + "grad_norm": 2.640625, + "learning_rate": 3.906883018120619e-06, + "loss": 0.1707, + "mean_token_accuracy": 0.9642481172084808, + "num_tokens": 3738164559.0, + "step": 35300 + }, + { + "entropy": 1.3203125, + "epoch": 0.8655305812643848, + "grad_norm": 3.546875, + "learning_rate": 3.901092535184496e-06, + "loss": 0.1713, + "mean_token_accuracy": 0.9637637650966644, + "num_tokens": 3743459921.0, + "step": 35350 + }, + { + "entropy": 1.35578125, + "epoch": 0.8667548112237403, + "grad_norm": 3.40625, + "learning_rate": 3.895298359950343e-06, + "loss": 0.1829, + "mean_token_accuracy": 0.9605180990695953, + "num_tokens": 3748868327.0, + "step": 35400 + }, + { + "entropy": 1.34265625, + "epoch": 0.8679790411830959, + "grad_norm": 2.125, + "learning_rate": 3.889500516160254e-06, + "loss": 0.1715, + "mean_token_accuracy": 0.9643005490303039, + "num_tokens": 3753748677.0, + "step": 35450 + }, + { + "entropy": 1.3384375, + "epoch": 0.8692032711424514, + "grad_norm": 2.375, + "learning_rate": 3.883699027571352e-06, + "loss": 0.1668, + "mean_token_accuracy": 0.965086680650711, + "num_tokens": 3759201853.0, + "step": 35500 + }, + { + "entropy": 1.34390625, + "epoch": 0.870427501101807, + "grad_norm": 3.25, + "learning_rate": 3.8778939179556976e-06, + "loss": 0.1694, + "mean_token_accuracy": 0.9643353164196015, + "num_tokens": 3764158638.0, + "step": 35550 + }, + { + "entropy": 1.33015625, + "epoch": 0.8716517310611626, + "grad_norm": 2.015625, + "learning_rate": 3.872085211100185e-06, + "loss": 0.1621, + "mean_token_accuracy": 0.9657464909553528, + "num_tokens": 3769226815.0, + "step": 35600 + }, + { + "entropy": 1.35078125, + "epoch": 0.8728759610205181, + "grad_norm": 1.890625, + "learning_rate": 3.86627293080645e-06, + "loss": 0.1836, + "mean_token_accuracy": 0.9611875438690185, + "num_tokens": 3774861819.0, + "step": 35650 + }, + { + "entropy": 1.34953125, + "epoch": 0.8741001909798737, + "grad_norm": 3.203125, + "learning_rate": 3.860457100890776e-06, + "loss": 0.1795, + "mean_token_accuracy": 0.9616686987876892, + "num_tokens": 3780181646.0, + "step": 35700 + }, + { + "entropy": 1.34359375, + "epoch": 0.8753244209392292, + "grad_norm": 3.046875, + "learning_rate": 3.854637745183983e-06, + "loss": 0.1762, + "mean_token_accuracy": 0.9630369508266449, + "num_tokens": 3785489246.0, + "step": 35750 + }, + { + "entropy": 1.3425, + "epoch": 0.8765486508985848, + "grad_norm": 1.953125, + "learning_rate": 3.848814887531342e-06, + "loss": 0.1865, + "mean_token_accuracy": 0.9609660315513611, + "num_tokens": 3790970702.0, + "step": 35800 + }, + { + "entropy": 1.3375, + "epoch": 0.8777728808579404, + "grad_norm": 2.890625, + "learning_rate": 3.842988551792473e-06, + "loss": 0.1666, + "mean_token_accuracy": 0.9646478390693665, + "num_tokens": 3796002667.0, + "step": 35850 + }, + { + "entropy": 1.33828125, + "epoch": 0.8789971108172959, + "grad_norm": 3.234375, + "learning_rate": 3.83715876184125e-06, + "loss": 0.1727, + "mean_token_accuracy": 0.9642738771438598, + "num_tokens": 3801134844.0, + "step": 35900 + }, + { + "entropy": 1.33859375, + "epoch": 0.8802213407766515, + "grad_norm": 2.8125, + "learning_rate": 3.831325541565699e-06, + "loss": 0.1714, + "mean_token_accuracy": 0.9640265047550202, + "num_tokens": 3806453829.0, + "step": 35950 + }, + { + "entropy": 1.34015625, + "epoch": 0.881445570736007, + "grad_norm": 3.046875, + "learning_rate": 3.825488914867901e-06, + "loss": 0.1762, + "mean_token_accuracy": 0.9627239561080932, + "num_tokens": 3811628461.0, + "step": 36000 + }, + { + "entropy": 1.35203125, + "epoch": 0.8826698006953626, + "grad_norm": 2.171875, + "learning_rate": 3.8196489056638965e-06, + "loss": 0.1849, + "mean_token_accuracy": 0.9613272595405579, + "num_tokens": 3816892701.0, + "step": 36050 + }, + { + "entropy": 1.34703125, + "epoch": 0.8838940306547182, + "grad_norm": 2.015625, + "learning_rate": 3.813805537883585e-06, + "loss": 0.1711, + "mean_token_accuracy": 0.9637981843948364, + "num_tokens": 3822028448.0, + "step": 36100 + }, + { + "entropy": 1.34875, + "epoch": 0.8851182606140737, + "grad_norm": 1.8671875, + "learning_rate": 3.80795883547063e-06, + "loss": 0.1672, + "mean_token_accuracy": 0.9647044801712036, + "num_tokens": 3827213092.0, + "step": 36150 + }, + { + "entropy": 1.34578125, + "epoch": 0.8863424905734293, + "grad_norm": 2.484375, + "learning_rate": 3.8021088223823558e-06, + "loss": 0.1927, + "mean_token_accuracy": 0.9597675764560699, + "num_tokens": 3832709039.0, + "step": 36200 + }, + { + "entropy": 1.33359375, + "epoch": 0.8875667205327848, + "grad_norm": 2.046875, + "learning_rate": 3.7962555225896563e-06, + "loss": 0.177, + "mean_token_accuracy": 0.9623324680328369, + "num_tokens": 3837879687.0, + "step": 36250 + }, + { + "entropy": 1.33890625, + "epoch": 0.8887909504921404, + "grad_norm": 2.328125, + "learning_rate": 3.790398960076891e-06, + "loss": 0.1769, + "mean_token_accuracy": 0.9629685461521149, + "num_tokens": 3843045671.0, + "step": 36300 + }, + { + "entropy": 1.32703125, + "epoch": 0.890015180451496, + "grad_norm": 3.09375, + "learning_rate": 3.7845391588417876e-06, + "loss": 0.173, + "mean_token_accuracy": 0.9636087584495544, + "num_tokens": 3848206427.0, + "step": 36350 + }, + { + "entropy": 1.32984375, + "epoch": 0.8912394104108515, + "grad_norm": 2.171875, + "learning_rate": 3.778676142895346e-06, + "loss": 0.1734, + "mean_token_accuracy": 0.9632059478759766, + "num_tokens": 3853828427.0, + "step": 36400 + }, + { + "entropy": 1.32390625, + "epoch": 0.8924636403702071, + "grad_norm": 2.5625, + "learning_rate": 3.772809936261739e-06, + "loss": 0.1894, + "mean_token_accuracy": 0.9601573574543, + "num_tokens": 3859273920.0, + "step": 36450 + }, + { + "entropy": 1.3265625, + "epoch": 0.8936878703295627, + "grad_norm": 2.875, + "learning_rate": 3.766940562978211e-06, + "loss": 0.1763, + "mean_token_accuracy": 0.9631186270713806, + "num_tokens": 3864494355.0, + "step": 36500 + }, + { + "entropy": 1.33109375, + "epoch": 0.8949121002889183, + "grad_norm": 2.71875, + "learning_rate": 3.761068047094987e-06, + "loss": 0.1736, + "mean_token_accuracy": 0.963892787694931, + "num_tokens": 3869689661.0, + "step": 36550 + }, + { + "entropy": 1.3115625, + "epoch": 0.8961363302482739, + "grad_norm": 2.625, + "learning_rate": 3.7551924126751624e-06, + "loss": 0.1832, + "mean_token_accuracy": 0.9618776285648346, + "num_tokens": 3875053980.0, + "step": 36600 + }, + { + "entropy": 1.3021875, + "epoch": 0.8973605602076294, + "grad_norm": 2.734375, + "learning_rate": 3.7493136837946177e-06, + "loss": 0.1749, + "mean_token_accuracy": 0.962455780506134, + "num_tokens": 3880568995.0, + "step": 36650 + }, + { + "entropy": 1.3209375, + "epoch": 0.898584790166985, + "grad_norm": 2.90625, + "learning_rate": 3.743431884541909e-06, + "loss": 0.1835, + "mean_token_accuracy": 0.9612640655040741, + "num_tokens": 3885898540.0, + "step": 36700 + }, + { + "entropy": 1.31390625, + "epoch": 0.8998090201263406, + "grad_norm": 2.75, + "learning_rate": 3.737547039018173e-06, + "loss": 0.1664, + "mean_token_accuracy": 0.9649625384807586, + "num_tokens": 3891014489.0, + "step": 36750 + }, + { + "entropy": 1.323125, + "epoch": 0.9010332500856961, + "grad_norm": 2.1875, + "learning_rate": 3.7316591713370315e-06, + "loss": 0.1774, + "mean_token_accuracy": 0.9622565031051635, + "num_tokens": 3896408077.0, + "step": 36800 + }, + { + "entropy": 1.34515625, + "epoch": 0.9022574800450517, + "grad_norm": 1.8203125, + "learning_rate": 3.7257683056244895e-06, + "loss": 0.178, + "mean_token_accuracy": 0.9631640148162842, + "num_tokens": 3901699376.0, + "step": 36850 + }, + { + "entropy": 1.32171875, + "epoch": 0.9034817100044072, + "grad_norm": 2.84375, + "learning_rate": 3.7198744660188347e-06, + "loss": 0.1578, + "mean_token_accuracy": 0.966994469165802, + "num_tokens": 3906644235.0, + "step": 36900 + }, + { + "entropy": 1.3284375, + "epoch": 0.9047059399637628, + "grad_norm": 1.8828125, + "learning_rate": 3.7139776766705433e-06, + "loss": 0.161, + "mean_token_accuracy": 0.9657053291797638, + "num_tokens": 3911529877.0, + "step": 36950 + }, + { + "entropy": 1.320625, + "epoch": 0.9059301699231184, + "grad_norm": 2.640625, + "learning_rate": 3.7080779617421733e-06, + "loss": 0.1663, + "mean_token_accuracy": 0.9647897446155548, + "num_tokens": 3917023608.0, + "step": 37000 + }, + { + "entropy": 1.3315625, + "epoch": 0.9071543998824739, + "grad_norm": 3.078125, + "learning_rate": 3.7021753454082772e-06, + "loss": 0.1851, + "mean_token_accuracy": 0.9609014749526977, + "num_tokens": 3922789580.0, + "step": 37050 + }, + { + "entropy": 1.31453125, + "epoch": 0.9083786298418295, + "grad_norm": 2.484375, + "learning_rate": 3.696269851855292e-06, + "loss": 0.1738, + "mean_token_accuracy": 0.9629218196868896, + "num_tokens": 3927904246.0, + "step": 37100 + }, + { + "entropy": 1.29828125, + "epoch": 0.909602859801185, + "grad_norm": 2.875, + "learning_rate": 3.6903615052814444e-06, + "loss": 0.1723, + "mean_token_accuracy": 0.96382728099823, + "num_tokens": 3933096610.0, + "step": 37150 + }, + { + "entropy": 1.275, + "epoch": 0.9108270897605406, + "grad_norm": 1.640625, + "learning_rate": 3.684450329896653e-06, + "loss": 0.1538, + "mean_token_accuracy": 0.9664675867557526, + "num_tokens": 3938208531.0, + "step": 37200 + }, + { + "entropy": 1.2990625, + "epoch": 0.9120513197198962, + "grad_norm": 1.6640625, + "learning_rate": 3.6785363499224266e-06, + "loss": 0.1676, + "mean_token_accuracy": 0.9638699948787689, + "num_tokens": 3943507764.0, + "step": 37250 + }, + { + "entropy": 1.29953125, + "epoch": 0.9132755496792517, + "grad_norm": 2.921875, + "learning_rate": 3.672619589591768e-06, + "loss": 0.1737, + "mean_token_accuracy": 0.9631060230731964, + "num_tokens": 3948883174.0, + "step": 37300 + }, + { + "entropy": 1.3128125, + "epoch": 0.9144997796386073, + "grad_norm": 3.34375, + "learning_rate": 3.6667000731490695e-06, + "loss": 0.1769, + "mean_token_accuracy": 0.9630844449996948, + "num_tokens": 3954228445.0, + "step": 37350 + }, + { + "entropy": 1.31328125, + "epoch": 0.9157240095979629, + "grad_norm": 2.71875, + "learning_rate": 3.660777824850019e-06, + "loss": 0.178, + "mean_token_accuracy": 0.9625172114372254, + "num_tokens": 3959522338.0, + "step": 37400 + }, + { + "entropy": 1.3109375, + "epoch": 0.9169482395573184, + "grad_norm": 3.875, + "learning_rate": 3.6548528689614985e-06, + "loss": 0.1615, + "mean_token_accuracy": 0.9651338791847229, + "num_tokens": 3964674293.0, + "step": 37450 + }, + { + "entropy": 1.3209375, + "epoch": 0.918172469516674, + "grad_norm": 2.765625, + "learning_rate": 3.6489252297614833e-06, + "loss": 0.1743, + "mean_token_accuracy": 0.9630649185180664, + "num_tokens": 3970201603.0, + "step": 37500 + }, + { + "entropy": 1.33578125, + "epoch": 0.9193966994760295, + "grad_norm": 1.8984375, + "learning_rate": 3.6429949315389455e-06, + "loss": 0.1792, + "mean_token_accuracy": 0.9619642412662506, + "num_tokens": 3975729221.0, + "step": 37550 + }, + { + "entropy": 1.3121875, + "epoch": 0.9206209294353851, + "grad_norm": 2.484375, + "learning_rate": 3.6370619985937513e-06, + "loss": 0.1658, + "mean_token_accuracy": 0.9639672470092774, + "num_tokens": 3980440332.0, + "step": 37600 + }, + { + "entropy": 1.314375, + "epoch": 0.9218451593947407, + "grad_norm": 2.0, + "learning_rate": 3.6311264552365634e-06, + "loss": 0.1748, + "mean_token_accuracy": 0.9630878198146821, + "num_tokens": 3985861602.0, + "step": 37650 + }, + { + "entropy": 1.31109375, + "epoch": 0.9230693893540963, + "grad_norm": 1.671875, + "learning_rate": 3.62518832578874e-06, + "loss": 0.1647, + "mean_token_accuracy": 0.9646557712554932, + "num_tokens": 3991141130.0, + "step": 37700 + }, + { + "entropy": 1.31140625, + "epoch": 0.9242936193134519, + "grad_norm": 1.6328125, + "learning_rate": 3.619247634582238e-06, + "loss": 0.1798, + "mean_token_accuracy": 0.961934734582901, + "num_tokens": 3996774043.0, + "step": 37750 + }, + { + "entropy": 1.326875, + "epoch": 0.9255178492728074, + "grad_norm": 2.078125, + "learning_rate": 3.6133044059595083e-06, + "loss": 0.1817, + "mean_token_accuracy": 0.9612915456295014, + "num_tokens": 4002462308.0, + "step": 37800 + }, + { + "entropy": 1.31359375, + "epoch": 0.926742079232163, + "grad_norm": 3.296875, + "learning_rate": 3.6073586642734027e-06, + "loss": 0.1779, + "mean_token_accuracy": 0.9622733199596405, + "num_tokens": 4007870657.0, + "step": 37850 + }, + { + "entropy": 1.3059375, + "epoch": 0.9279663091915186, + "grad_norm": 1.734375, + "learning_rate": 3.601410433887068e-06, + "loss": 0.1696, + "mean_token_accuracy": 0.9639555370807648, + "num_tokens": 4012925044.0, + "step": 37900 + }, + { + "entropy": 1.30625, + "epoch": 0.9291905391508741, + "grad_norm": 3.203125, + "learning_rate": 3.5954597391738487e-06, + "loss": 0.1749, + "mean_token_accuracy": 0.9627858221530914, + "num_tokens": 4018089645.0, + "step": 37950 + }, + { + "entropy": 1.3059375, + "epoch": 0.9304147691102297, + "grad_norm": 3.0, + "learning_rate": 3.589506604517189e-06, + "loss": 0.1668, + "mean_token_accuracy": 0.9654299330711364, + "num_tokens": 4023139809.0, + "step": 38000 + }, + { + "entropy": 1.32140625, + "epoch": 0.9316389990695852, + "grad_norm": 1.765625, + "learning_rate": 3.583551054310529e-06, + "loss": 0.1743, + "mean_token_accuracy": 0.9638527107238769, + "num_tokens": 4028437262.0, + "step": 38050 + }, + { + "entropy": 1.31890625, + "epoch": 0.9328632290289408, + "grad_norm": 3.03125, + "learning_rate": 3.5775931129572072e-06, + "loss": 0.1658, + "mean_token_accuracy": 0.9640737462043762, + "num_tokens": 4033659635.0, + "step": 38100 + }, + { + "entropy": 1.31625, + "epoch": 0.9340874589882964, + "grad_norm": 4.28125, + "learning_rate": 3.57163280487036e-06, + "loss": 0.1742, + "mean_token_accuracy": 0.9627125465869903, + "num_tokens": 4039135210.0, + "step": 38150 + }, + { + "entropy": 1.31125, + "epoch": 0.9353116889476519, + "grad_norm": 2.546875, + "learning_rate": 3.5656701544728222e-06, + "loss": 0.1739, + "mean_token_accuracy": 0.9629321038722992, + "num_tokens": 4044192912.0, + "step": 38200 + }, + { + "entropy": 1.315625, + "epoch": 0.9365359189070075, + "grad_norm": 2.109375, + "learning_rate": 3.559705186197026e-06, + "loss": 0.1641, + "mean_token_accuracy": 0.9655595874786377, + "num_tokens": 4049649393.0, + "step": 38250 + }, + { + "entropy": 1.31484375, + "epoch": 0.937760148866363, + "grad_norm": 3.0625, + "learning_rate": 3.5537379244849017e-06, + "loss": 0.1739, + "mean_token_accuracy": 0.9634083175659179, + "num_tokens": 4054901732.0, + "step": 38300 + }, + { + "entropy": 1.31140625, + "epoch": 0.9389843788257186, + "grad_norm": 2.5625, + "learning_rate": 3.5477683937877755e-06, + "loss": 0.1694, + "mean_token_accuracy": 0.9634031581878663, + "num_tokens": 4060033796.0, + "step": 38350 + }, + { + "entropy": 1.30640625, + "epoch": 0.9402086087850742, + "grad_norm": 2.71875, + "learning_rate": 3.541796618566273e-06, + "loss": 0.1634, + "mean_token_accuracy": 0.9645454668998719, + "num_tokens": 4065362004.0, + "step": 38400 + }, + { + "entropy": 1.281875, + "epoch": 0.9414328387444297, + "grad_norm": 3.0625, + "learning_rate": 3.535822623290217e-06, + "loss": 0.1456, + "mean_token_accuracy": 0.9695195186138154, + "num_tokens": 4070167345.0, + "step": 38450 + }, + { + "entropy": 1.3134375, + "epoch": 0.9426570687037853, + "grad_norm": 0.004974365234375, + "learning_rate": 3.5298464324385246e-06, + "loss": 0.1636, + "mean_token_accuracy": 0.9643441307544708, + "num_tokens": 4075331852.0, + "step": 38500 + }, + { + "entropy": 1.29109375, + "epoch": 0.9438812986631409, + "grad_norm": 2.609375, + "learning_rate": 3.523868070499112e-06, + "loss": 0.1522, + "mean_token_accuracy": 0.9671092510223389, + "num_tokens": 4080329045.0, + "step": 38550 + }, + { + "entropy": 1.2840625, + "epoch": 0.9451055286224964, + "grad_norm": 3.75, + "learning_rate": 3.517887561968791e-06, + "loss": 0.1616, + "mean_token_accuracy": 0.9650249874591827, + "num_tokens": 4085382254.0, + "step": 38600 + }, + { + "entropy": 1.29203125, + "epoch": 0.946329758581852, + "grad_norm": 3.5, + "learning_rate": 3.5119049313531687e-06, + "loss": 0.1698, + "mean_token_accuracy": 0.9630708813667297, + "num_tokens": 4090736615.0, + "step": 38650 + }, + { + "entropy": 1.29640625, + "epoch": 0.9475539885412075, + "grad_norm": 2.28125, + "learning_rate": 3.5059202031665473e-06, + "loss": 0.1761, + "mean_token_accuracy": 0.962629064321518, + "num_tokens": 4096335023.0, + "step": 38700 + }, + { + "entropy": 1.28390625, + "epoch": 0.9487782185005631, + "grad_norm": 2.53125, + "learning_rate": 3.499933401931826e-06, + "loss": 0.1711, + "mean_token_accuracy": 0.9639296698570251, + "num_tokens": 4101408840.0, + "step": 38750 + }, + { + "entropy": 1.26234375, + "epoch": 0.9500024484599187, + "grad_norm": 3.515625, + "learning_rate": 3.493944552180395e-06, + "loss": 0.1548, + "mean_token_accuracy": 0.9661567640304566, + "num_tokens": 4106422813.0, + "step": 38800 + }, + { + "entropy": 1.2709375, + "epoch": 0.9512266784192743, + "grad_norm": 3.109375, + "learning_rate": 3.487953678452042e-06, + "loss": 0.1544, + "mean_token_accuracy": 0.9669099247455597, + "num_tokens": 4111614226.0, + "step": 38850 + }, + { + "entropy": 1.2715625, + "epoch": 0.9524509083786299, + "grad_norm": 3.1875, + "learning_rate": 3.481960805294847e-06, + "loss": 0.1652, + "mean_token_accuracy": 0.9649276435375214, + "num_tokens": 4116902981.0, + "step": 38900 + }, + { + "entropy": 1.26828125, + "epoch": 0.9536751383379855, + "grad_norm": 3.390625, + "learning_rate": 3.47596595726508e-06, + "loss": 0.1682, + "mean_token_accuracy": 0.9636393487453461, + "num_tokens": 4122056561.0, + "step": 38950 + }, + { + "entropy": 1.270625, + "epoch": 0.954899368297341, + "grad_norm": 2.125, + "learning_rate": 3.4699691589271076e-06, + "loss": 0.1685, + "mean_token_accuracy": 0.9632602989673614, + "num_tokens": 4127685041.0, + "step": 39000 + }, + { + "entropy": 1.2453125, + "epoch": 0.9561235982566966, + "grad_norm": 2.4375, + "learning_rate": 3.463970434853285e-06, + "loss": 0.142, + "mean_token_accuracy": 0.9697425818443298, + "num_tokens": 4132578966.0, + "step": 39050 + }, + { + "entropy": 1.258125, + "epoch": 0.9573478282160521, + "grad_norm": 2.375, + "learning_rate": 3.45796980962386e-06, + "loss": 0.1678, + "mean_token_accuracy": 0.9646705484390259, + "num_tokens": 4138012784.0, + "step": 39100 + }, + { + "entropy": 1.26859375, + "epoch": 0.9585720581754077, + "grad_norm": 1.7890625, + "learning_rate": 3.451967307826869e-06, + "loss": 0.1757, + "mean_token_accuracy": 0.9628133857250214, + "num_tokens": 4143616072.0, + "step": 39150 + }, + { + "entropy": 1.259375, + "epoch": 0.9597962881347633, + "grad_norm": 4.1875, + "learning_rate": 3.445962954058039e-06, + "loss": 0.1752, + "mean_token_accuracy": 0.962674834728241, + "num_tokens": 4148944121.0, + "step": 39200 + }, + { + "entropy": 1.25921875, + "epoch": 0.9610205180941188, + "grad_norm": 3.375, + "learning_rate": 3.439956772920685e-06, + "loss": 0.1648, + "mean_token_accuracy": 0.9645766019821167, + "num_tokens": 4153880493.0, + "step": 39250 + }, + { + "entropy": 1.2525, + "epoch": 0.9622447480534744, + "grad_norm": 3.015625, + "learning_rate": 3.4339487890256097e-06, + "loss": 0.161, + "mean_token_accuracy": 0.965018298625946, + "num_tokens": 4158921325.0, + "step": 39300 + }, + { + "entropy": 1.2459375, + "epoch": 0.9634689780128299, + "grad_norm": 3.5, + "learning_rate": 3.4279390269910033e-06, + "loss": 0.1658, + "mean_token_accuracy": 0.9649594247341156, + "num_tokens": 4163950443.0, + "step": 39350 + }, + { + "entropy": 1.2590625, + "epoch": 0.9646932079721855, + "grad_norm": 2.875, + "learning_rate": 3.421927511442341e-06, + "loss": 0.172, + "mean_token_accuracy": 0.9640387868881226, + "num_tokens": 4169489034.0, + "step": 39400 + }, + { + "entropy": 1.261875, + "epoch": 0.9659174379315411, + "grad_norm": 2.28125, + "learning_rate": 3.4159142670122845e-06, + "loss": 0.1719, + "mean_token_accuracy": 0.9637044394016265, + "num_tokens": 4174842337.0, + "step": 39450 + }, + { + "entropy": 1.26265625, + "epoch": 0.9671416678908966, + "grad_norm": 2.390625, + "learning_rate": 3.4098993183405793e-06, + "loss": 0.1725, + "mean_token_accuracy": 0.9634046721458435, + "num_tokens": 4180354181.0, + "step": 39500 + }, + { + "entropy": 1.26046875, + "epoch": 0.9683658978502522, + "grad_norm": 1.6484375, + "learning_rate": 3.403882690073954e-06, + "loss": 0.1653, + "mean_token_accuracy": 0.9639586913585663, + "num_tokens": 4185417059.0, + "step": 39550 + }, + { + "entropy": 1.27140625, + "epoch": 0.9695901278096077, + "grad_norm": 1.2421875, + "learning_rate": 3.3978644068660175e-06, + "loss": 0.1583, + "mean_token_accuracy": 0.96663733959198, + "num_tokens": 4190550088.0, + "step": 39600 + }, + { + "entropy": 1.28140625, + "epoch": 0.9708143577689633, + "grad_norm": 2.859375, + "learning_rate": 3.3918444933771637e-06, + "loss": 0.1755, + "mean_token_accuracy": 0.9624445605278015, + "num_tokens": 4196306371.0, + "step": 39650 + }, + { + "entropy": 1.27078125, + "epoch": 0.9720385877283189, + "grad_norm": 1.875, + "learning_rate": 3.385822974274465e-06, + "loss": 0.1673, + "mean_token_accuracy": 0.9644521117210388, + "num_tokens": 4201403065.0, + "step": 39700 + }, + { + "entropy": 1.2859375, + "epoch": 0.9732628176876744, + "grad_norm": 2.78125, + "learning_rate": 3.3797998742315724e-06, + "loss": 0.1646, + "mean_token_accuracy": 0.9653528666496277, + "num_tokens": 4206711792.0, + "step": 39750 + }, + { + "entropy": 1.2709375, + "epoch": 0.97448704764703, + "grad_norm": 3.625, + "learning_rate": 3.3737752179286158e-06, + "loss": 0.1694, + "mean_token_accuracy": 0.964444397687912, + "num_tokens": 4212047599.0, + "step": 39800 + }, + { + "entropy": 1.289375, + "epoch": 0.9757112776063855, + "grad_norm": 2.5, + "learning_rate": 3.3677490300521e-06, + "loss": 0.1697, + "mean_token_accuracy": 0.963803733587265, + "num_tokens": 4217477603.0, + "step": 39850 + }, + { + "entropy": 1.27140625, + "epoch": 0.9769355075657411, + "grad_norm": 2.25, + "learning_rate": 3.361721335294809e-06, + "loss": 0.1579, + "mean_token_accuracy": 0.9657166159152984, + "num_tokens": 4222290662.0, + "step": 39900 + }, + { + "entropy": 1.3034375, + "epoch": 0.9781597375250967, + "grad_norm": 2.03125, + "learning_rate": 3.355692158355699e-06, + "loss": 0.1816, + "mean_token_accuracy": 0.9609908378124237, + "num_tokens": 4228024616.0, + "step": 39950 + }, + { + "entropy": 1.2765625, + "epoch": 0.9793839674844523, + "grad_norm": 1.8515625, + "learning_rate": 3.349661523939799e-06, + "loss": 0.1549, + "mean_token_accuracy": 0.9669453859329223, + "num_tokens": 4233080108.0, + "step": 40000 + }, + { + "epoch": 0.9793839674844523, + "eval_entropy": 1.2830078125, + "eval_loss": 0.18154892325401306, + "eval_mean_token_accuracy": 0.9611844887336095, + "eval_num_tokens": 4233080108.0, + "eval_runtime": 601.7254, + "eval_samples_per_second": 16.047, + "eval_steps_per_second": 0.201, + "step": 40000 + }, + { + "entropy": 1.28, + "epoch": 0.9806081974438079, + "grad_norm": 3.40625, + "learning_rate": 3.3436294567581125e-06, + "loss": 0.1685, + "mean_token_accuracy": 0.9643000710010529, + "num_tokens": 4238491459.0, + "step": 40050 + }, + { + "entropy": 1.29109375, + "epoch": 0.9818324274031635, + "grad_norm": 1.9453125, + "learning_rate": 3.3375959815275103e-06, + "loss": 0.1714, + "mean_token_accuracy": 0.9640710878372193, + "num_tokens": 4244109418.0, + "step": 40100 + }, + { + "entropy": 1.27515625, + "epoch": 0.983056657362519, + "grad_norm": 0.029052734375, + "learning_rate": 3.3315611229706377e-06, + "loss": 0.1519, + "mean_token_accuracy": 0.9681409633159638, + "num_tokens": 4249250373.0, + "step": 40150 + }, + { + "entropy": 1.2784375, + "epoch": 0.9842808873218746, + "grad_norm": 2.09375, + "learning_rate": 3.325524905815804e-06, + "loss": 0.1654, + "mean_token_accuracy": 0.9648780179023743, + "num_tokens": 4254623197.0, + "step": 40200 + }, + { + "entropy": 1.2959375, + "epoch": 0.9855051172812301, + "grad_norm": 2.0, + "learning_rate": 3.3194873547968867e-06, + "loss": 0.1667, + "mean_token_accuracy": 0.963757860660553, + "num_tokens": 4260002335.0, + "step": 40250 + }, + { + "entropy": 1.28453125, + "epoch": 0.9867293472405857, + "grad_norm": 0.0078125, + "learning_rate": 3.313448494653232e-06, + "loss": 0.1738, + "mean_token_accuracy": 0.9633991587162017, + "num_tokens": 4265450665.0, + "step": 40300 + }, + { + "entropy": 1.26625, + "epoch": 0.9879535771999413, + "grad_norm": 2.90625, + "learning_rate": 3.3074083501295447e-06, + "loss": 0.1441, + "mean_token_accuracy": 0.9687767088413238, + "num_tokens": 4270155512.0, + "step": 40350 + }, + { + "entropy": 1.2728125, + "epoch": 0.9891778071592968, + "grad_norm": 2.15625, + "learning_rate": 3.3013669459757956e-06, + "loss": 0.1546, + "mean_token_accuracy": 0.9668355488777161, + "num_tokens": 4275174062.0, + "step": 40400 + }, + { + "entropy": 1.28171875, + "epoch": 0.9904020371186524, + "grad_norm": 2.421875, + "learning_rate": 3.2953243069471187e-06, + "loss": 0.1692, + "mean_token_accuracy": 0.9641734325885772, + "num_tokens": 4280291982.0, + "step": 40450 + }, + { + "entropy": 1.29375, + "epoch": 0.9916262670780079, + "grad_norm": 1.8828125, + "learning_rate": 3.2892804578037036e-06, + "loss": 0.1754, + "mean_token_accuracy": 0.9624480056762695, + "num_tokens": 4285827143.0, + "step": 40500 + }, + { + "entropy": 1.29921875, + "epoch": 0.9928504970373635, + "grad_norm": 3.328125, + "learning_rate": 3.2832354233107023e-06, + "loss": 0.1717, + "mean_token_accuracy": 0.9635557103157043, + "num_tokens": 4291196556.0, + "step": 40550 + }, + { + "entropy": 1.29515625, + "epoch": 0.9940747269967191, + "grad_norm": 2.046875, + "learning_rate": 3.2771892282381226e-06, + "loss": 0.1535, + "mean_token_accuracy": 0.9667463576793671, + "num_tokens": 4296297335.0, + "step": 40600 + }, + { + "entropy": 1.2765625, + "epoch": 0.9952989569560746, + "grad_norm": 2.15625, + "learning_rate": 3.2711418973607257e-06, + "loss": 0.1584, + "mean_token_accuracy": 0.9667293214797974, + "num_tokens": 4301506384.0, + "step": 40650 + }, + { + "entropy": 1.27078125, + "epoch": 0.9965231869154302, + "grad_norm": 2.234375, + "learning_rate": 3.2650934554579314e-06, + "loss": 0.1551, + "mean_token_accuracy": 0.9660876715183258, + "num_tokens": 4306603792.0, + "step": 40700 + }, + { + "entropy": 1.27515625, + "epoch": 0.9977474168747857, + "grad_norm": 4.15625, + "learning_rate": 3.2590439273137074e-06, + "loss": 0.1702, + "mean_token_accuracy": 0.9637362861633301, + "num_tokens": 4312148607.0, + "step": 40750 + }, + { + "entropy": 1.266875, + "epoch": 0.9989716468341413, + "grad_norm": 3.546875, + "learning_rate": 3.2529933377164754e-06, + "loss": 0.1498, + "mean_token_accuracy": 0.9686801016330719, + "num_tokens": 4317085828.0, + "step": 40800 + }, + { + "entropy": 1.2784375, + "epoch": 1.0001958767934969, + "grad_norm": 3.578125, + "learning_rate": 3.2469417114590055e-06, + "loss": 0.1627, + "mean_token_accuracy": 0.9648519742488861, + "num_tokens": 4322221289.0, + "step": 40850 + }, + { + "entropy": 1.28578125, + "epoch": 1.0014201067528525, + "grad_norm": 4.375, + "learning_rate": 3.240889073338315e-06, + "loss": 0.1602, + "mean_token_accuracy": 0.9657353925704956, + "num_tokens": 4327372960.0, + "step": 40900 + }, + { + "entropy": 1.268125, + "epoch": 1.002644336712208, + "grad_norm": 2.078125, + "learning_rate": 3.2348354481555692e-06, + "loss": 0.1607, + "mean_token_accuracy": 0.9653881311416626, + "num_tokens": 4332436449.0, + "step": 40950 + }, + { + "entropy": 1.27359375, + "epoch": 1.0038685666715637, + "grad_norm": 0.0013580322265625, + "learning_rate": 3.2287808607159753e-06, + "loss": 0.153, + "mean_token_accuracy": 0.9669638919830322, + "num_tokens": 4337572886.0, + "step": 41000 + }, + { + "entropy": 1.26375, + "epoch": 1.005092796630919, + "grad_norm": 2.609375, + "learning_rate": 3.222725335828685e-06, + "loss": 0.1474, + "mean_token_accuracy": 0.9681554007530212, + "num_tokens": 4342524064.0, + "step": 41050 + }, + { + "entropy": 1.290625, + "epoch": 1.0063170265902748, + "grad_norm": 2.796875, + "learning_rate": 3.216668898306692e-06, + "loss": 0.1723, + "mean_token_accuracy": 0.9632875370979309, + "num_tokens": 4347805365.0, + "step": 41100 + }, + { + "entropy": 1.28421875, + "epoch": 1.0075412565496302, + "grad_norm": 3.546875, + "learning_rate": 3.210611572966728e-06, + "loss": 0.1571, + "mean_token_accuracy": 0.9664819014072418, + "num_tokens": 4352875723.0, + "step": 41150 + }, + { + "entropy": 1.29171875, + "epoch": 1.008765486508986, + "grad_norm": 1.7421875, + "learning_rate": 3.2045533846291643e-06, + "loss": 0.1755, + "mean_token_accuracy": 0.9631037187576293, + "num_tokens": 4358561815.0, + "step": 41200 + }, + { + "entropy": 1.276875, + "epoch": 1.0099897164683413, + "grad_norm": 2.15625, + "learning_rate": 3.1984943581179053e-06, + "loss": 0.1553, + "mean_token_accuracy": 0.9667964303493499, + "num_tokens": 4363644242.0, + "step": 41250 + }, + { + "entropy": 1.30296875, + "epoch": 1.011213946427697, + "grad_norm": 3.96875, + "learning_rate": 3.1924345182602943e-06, + "loss": 0.1749, + "mean_token_accuracy": 0.9630448269844055, + "num_tokens": 4369318393.0, + "step": 41300 + }, + { + "entropy": 1.28875, + "epoch": 1.0124381763870525, + "grad_norm": 3.296875, + "learning_rate": 3.1863738898870033e-06, + "loss": 0.1669, + "mean_token_accuracy": 0.9647123277187347, + "num_tokens": 4374659681.0, + "step": 41350 + }, + { + "entropy": 1.27265625, + "epoch": 1.0136624063464081, + "grad_norm": 3.125, + "learning_rate": 3.180312497831938e-06, + "loss": 0.1567, + "mean_token_accuracy": 0.9661735820770264, + "num_tokens": 4379733438.0, + "step": 41400 + }, + { + "entropy": 1.28484375, + "epoch": 1.0148866363057636, + "grad_norm": 3.046875, + "learning_rate": 3.174250366932133e-06, + "loss": 0.1612, + "mean_token_accuracy": 0.9659793210029602, + "num_tokens": 4384885742.0, + "step": 41450 + }, + { + "entropy": 1.2934375, + "epoch": 1.0161108662651193, + "grad_norm": 3.65625, + "learning_rate": 3.1681875220276487e-06, + "loss": 0.1702, + "mean_token_accuracy": 0.9628891766071319, + "num_tokens": 4390251007.0, + "step": 41500 + }, + { + "entropy": 1.29703125, + "epoch": 1.0173350962244747, + "grad_norm": 2.546875, + "learning_rate": 3.1621239879614722e-06, + "loss": 0.1752, + "mean_token_accuracy": 0.9631851124763489, + "num_tokens": 4395820970.0, + "step": 41550 + }, + { + "entropy": 1.289375, + "epoch": 1.0185593261838304, + "grad_norm": 3.15625, + "learning_rate": 3.1560597895794157e-06, + "loss": 0.1651, + "mean_token_accuracy": 0.9643260395526886, + "num_tokens": 4401284321.0, + "step": 41600 + }, + { + "entropy": 1.3046875, + "epoch": 1.0197835561431858, + "grad_norm": 2.859375, + "learning_rate": 3.149994951730011e-06, + "loss": 0.1879, + "mean_token_accuracy": 0.9601117408275605, + "num_tokens": 4406768060.0, + "step": 41650 + }, + { + "entropy": 1.29484375, + "epoch": 1.0210077861025415, + "grad_norm": 3.0, + "learning_rate": 3.143929499264413e-06, + "loss": 0.1665, + "mean_token_accuracy": 0.9648369300365448, + "num_tokens": 4412201333.0, + "step": 41700 + }, + { + "entropy": 1.28328125, + "epoch": 1.0222320160618972, + "grad_norm": 3.40625, + "learning_rate": 3.137863457036292e-06, + "loss": 0.1533, + "mean_token_accuracy": 0.9676184570789337, + "num_tokens": 4417135073.0, + "step": 41750 + }, + { + "entropy": 1.3009375, + "epoch": 1.0234562460212526, + "grad_norm": 2.828125, + "learning_rate": 3.1317968499017366e-06, + "loss": 0.1742, + "mean_token_accuracy": 0.9627422571182251, + "num_tokens": 4422234270.0, + "step": 41800 + }, + { + "entropy": 1.29265625, + "epoch": 1.0246804759806083, + "grad_norm": 1.6875, + "learning_rate": 3.1257297027191517e-06, + "loss": 0.1579, + "mean_token_accuracy": 0.9664195513725281, + "num_tokens": 4427309878.0, + "step": 41850 + }, + { + "entropy": 1.275, + "epoch": 1.0259047059399637, + "grad_norm": 2.015625, + "learning_rate": 3.1196620403491515e-06, + "loss": 0.1651, + "mean_token_accuracy": 0.9644128286838531, + "num_tokens": 4432672891.0, + "step": 41900 + }, + { + "entropy": 1.2815625, + "epoch": 1.0271289358993194, + "grad_norm": 2.28125, + "learning_rate": 3.113593887654463e-06, + "loss": 0.1513, + "mean_token_accuracy": 0.9673609352111816, + "num_tokens": 4437526358.0, + "step": 41950 + }, + { + "entropy": 1.290625, + "epoch": 1.0283531658586749, + "grad_norm": 3.125, + "learning_rate": 3.107525269499825e-06, + "loss": 0.1706, + "mean_token_accuracy": 0.9627550756931305, + "num_tokens": 4442820350.0, + "step": 42000 + }, + { + "entropy": 1.29484375, + "epoch": 1.0295773958180305, + "grad_norm": 3.4375, + "learning_rate": 3.1014562107518786e-06, + "loss": 0.1684, + "mean_token_accuracy": 0.9646277678012848, + "num_tokens": 4448357734.0, + "step": 42050 + }, + { + "entropy": 1.300625, + "epoch": 1.030801625777386, + "grad_norm": 3.5625, + "learning_rate": 3.0953867362790734e-06, + "loss": 0.1802, + "mean_token_accuracy": 0.9611736404895782, + "num_tokens": 4453928087.0, + "step": 42100 + }, + { + "entropy": 1.29171875, + "epoch": 1.0320258557367417, + "grad_norm": 3.375, + "learning_rate": 3.089316870951562e-06, + "loss": 0.162, + "mean_token_accuracy": 0.9649739050865174, + "num_tokens": 4458946227.0, + "step": 42150 + }, + { + "entropy": 1.289375, + "epoch": 1.033250085696097, + "grad_norm": 3.21875, + "learning_rate": 3.083246639641098e-06, + "loss": 0.1723, + "mean_token_accuracy": 0.9634380388259888, + "num_tokens": 4464192504.0, + "step": 42200 + }, + { + "entropy": 1.3146875, + "epoch": 1.0344743156554528, + "grad_norm": 2.453125, + "learning_rate": 3.077176067220935e-06, + "loss": 0.1793, + "mean_token_accuracy": 0.9617934930324554, + "num_tokens": 4469999689.0, + "step": 42250 + }, + { + "entropy": 1.3025, + "epoch": 1.0356985456148082, + "grad_norm": 2.125, + "learning_rate": 3.0711051785657236e-06, + "loss": 0.1649, + "mean_token_accuracy": 0.964527097940445, + "num_tokens": 4475221088.0, + "step": 42300 + }, + { + "entropy": 1.29015625, + "epoch": 1.036922775574164, + "grad_norm": 2.84375, + "learning_rate": 3.065033998551413e-06, + "loss": 0.1741, + "mean_token_accuracy": 0.9632121896743775, + "num_tokens": 4480484467.0, + "step": 42350 + }, + { + "entropy": 1.29890625, + "epoch": 1.0381470055335194, + "grad_norm": 3.234375, + "learning_rate": 3.0589625520551414e-06, + "loss": 0.168, + "mean_token_accuracy": 0.9637061321735382, + "num_tokens": 4486042679.0, + "step": 42400 + }, + { + "entropy": 1.31703125, + "epoch": 1.039371235492875, + "grad_norm": 2.75, + "learning_rate": 3.0528908639551436e-06, + "loss": 0.1726, + "mean_token_accuracy": 0.9634595859050751, + "num_tokens": 4491749175.0, + "step": 42450 + }, + { + "entropy": 1.280625, + "epoch": 1.0405954654522305, + "grad_norm": 2.59375, + "learning_rate": 3.0468189591306418e-06, + "loss": 0.1637, + "mean_token_accuracy": 0.9648339354991913, + "num_tokens": 4497083391.0, + "step": 42500 + }, + { + "entropy": 1.275, + "epoch": 1.0418196954115861, + "grad_norm": 3.59375, + "learning_rate": 3.040746862461747e-06, + "loss": 0.1573, + "mean_token_accuracy": 0.9660842347145081, + "num_tokens": 4502213588.0, + "step": 42550 + }, + { + "entropy": 1.27265625, + "epoch": 1.0430439253709416, + "grad_norm": 2.078125, + "learning_rate": 3.0346745988293553e-06, + "loss": 0.1638, + "mean_token_accuracy": 0.9644993054866791, + "num_tokens": 4507601887.0, + "step": 42600 + }, + { + "entropy": 1.2703125, + "epoch": 1.0442681553302973, + "grad_norm": 2.234375, + "learning_rate": 3.02860219311505e-06, + "loss": 0.162, + "mean_token_accuracy": 0.965209093093872, + "num_tokens": 4512999351.0, + "step": 42650 + }, + { + "entropy": 1.2659375, + "epoch": 1.0454923852896527, + "grad_norm": 2.84375, + "learning_rate": 3.0225296702009917e-06, + "loss": 0.1708, + "mean_token_accuracy": 0.9636136376857758, + "num_tokens": 4518295845.0, + "step": 42700 + }, + { + "entropy": 1.27453125, + "epoch": 1.0467166152490084, + "grad_norm": 2.609375, + "learning_rate": 3.016457054969827e-06, + "loss": 0.165, + "mean_token_accuracy": 0.9648648130893708, + "num_tokens": 4523705084.0, + "step": 42750 + }, + { + "entropy": 1.27328125, + "epoch": 1.0479408452083638, + "grad_norm": 2.140625, + "learning_rate": 3.0103843723045753e-06, + "loss": 0.1587, + "mean_token_accuracy": 0.9660780084133148, + "num_tokens": 4528928559.0, + "step": 42800 + }, + { + "entropy": 1.27140625, + "epoch": 1.0491650751677195, + "grad_norm": 1.7265625, + "learning_rate": 3.004311647088536e-06, + "loss": 0.1608, + "mean_token_accuracy": 0.9661289596557617, + "num_tokens": 4534161929.0, + "step": 42850 + }, + { + "entropy": 1.28, + "epoch": 1.0503893051270752, + "grad_norm": 2.734375, + "learning_rate": 2.9982389042051802e-06, + "loss": 0.1596, + "mean_token_accuracy": 0.9655217385292053, + "num_tokens": 4539230226.0, + "step": 42900 + }, + { + "entropy": 1.27828125, + "epoch": 1.0516135350864306, + "grad_norm": 1.8359375, + "learning_rate": 2.992166168538055e-06, + "loss": 0.1654, + "mean_token_accuracy": 0.9645612442493439, + "num_tokens": 4544444757.0, + "step": 42950 + }, + { + "entropy": 1.28609375, + "epoch": 1.0528377650457863, + "grad_norm": 3.1875, + "learning_rate": 2.986093464970675e-06, + "loss": 0.1809, + "mean_token_accuracy": 0.961436516046524, + "num_tokens": 4550024290.0, + "step": 43000 + }, + { + "entropy": 1.2921875, + "epoch": 1.0540619950051417, + "grad_norm": 1.8046875, + "learning_rate": 2.9800208183864225e-06, + "loss": 0.1737, + "mean_token_accuracy": 0.9631437683105468, + "num_tokens": 4555846037.0, + "step": 43050 + }, + { + "entropy": 1.29046875, + "epoch": 1.0552862249644974, + "grad_norm": 2.859375, + "learning_rate": 2.97394825366845e-06, + "loss": 0.1824, + "mean_token_accuracy": 0.9611044287681579, + "num_tokens": 4561556919.0, + "step": 43100 + }, + { + "entropy": 1.2615625, + "epoch": 1.0565104549238529, + "grad_norm": 2.578125, + "learning_rate": 2.9678757956995704e-06, + "loss": 0.1519, + "mean_token_accuracy": 0.967376263141632, + "num_tokens": 4566754673.0, + "step": 43150 + }, + { + "entropy": 1.24921875, + "epoch": 1.0577346848832085, + "grad_norm": 3.5, + "learning_rate": 2.9618034693621624e-06, + "loss": 0.1651, + "mean_token_accuracy": 0.9647138011455536, + "num_tokens": 4571961153.0, + "step": 43200 + }, + { + "entropy": 1.27078125, + "epoch": 1.058958914842564, + "grad_norm": 4.0625, + "learning_rate": 2.955731299538065e-06, + "loss": 0.1664, + "mean_token_accuracy": 0.9643959999084473, + "num_tokens": 4577276643.0, + "step": 43250 + }, + { + "entropy": 1.27125, + "epoch": 1.0601831448019197, + "grad_norm": 3.5, + "learning_rate": 2.9496593111084725e-06, + "loss": 0.1764, + "mean_token_accuracy": 0.9621264743804931, + "num_tokens": 4582787780.0, + "step": 43300 + }, + { + "entropy": 1.2503125, + "epoch": 1.0614073747612751, + "grad_norm": 0.06201171875, + "learning_rate": 2.9435875289538397e-06, + "loss": 0.1616, + "mean_token_accuracy": 0.9652257537841797, + "num_tokens": 4587978646.0, + "step": 43350 + }, + { + "entropy": 1.25390625, + "epoch": 1.0626316047206308, + "grad_norm": 2.34375, + "learning_rate": 2.937515977953776e-06, + "loss": 0.1601, + "mean_token_accuracy": 0.9656472432613373, + "num_tokens": 4593105594.0, + "step": 43400 + }, + { + "entropy": 1.235, + "epoch": 1.0638558346799862, + "grad_norm": 1.796875, + "learning_rate": 2.93144468298694e-06, + "loss": 0.1465, + "mean_token_accuracy": 0.9684570038318634, + "num_tokens": 4598082227.0, + "step": 43450 + }, + { + "entropy": 1.2615625, + "epoch": 1.065080064639342, + "grad_norm": 1.75, + "learning_rate": 2.9253736689309453e-06, + "loss": 0.1739, + "mean_token_accuracy": 0.9627693855762481, + "num_tokens": 4603820936.0, + "step": 43500 + }, + { + "entropy": 1.2409375, + "epoch": 1.0663042945986974, + "grad_norm": 2.4375, + "learning_rate": 2.919302960662252e-06, + "loss": 0.1665, + "mean_token_accuracy": 0.9645286548137665, + "num_tokens": 4609111825.0, + "step": 43550 + }, + { + "entropy": 1.251875, + "epoch": 1.067528524558053, + "grad_norm": 2.265625, + "learning_rate": 2.9132325830560694e-06, + "loss": 0.1708, + "mean_token_accuracy": 0.9642206788063049, + "num_tokens": 4614988638.0, + "step": 43600 + }, + { + "entropy": 1.23515625, + "epoch": 1.0687527545174085, + "grad_norm": 3.109375, + "learning_rate": 2.907162560986249e-06, + "loss": 0.1665, + "mean_token_accuracy": 0.9648200106620789, + "num_tokens": 4620258466.0, + "step": 43650 + }, + { + "entropy": 1.23046875, + "epoch": 1.0699769844767641, + "grad_norm": 1.78125, + "learning_rate": 2.9010929193251877e-06, + "loss": 0.1587, + "mean_token_accuracy": 0.9666041648387909, + "num_tokens": 4625541440.0, + "step": 43700 + }, + { + "entropy": 1.23578125, + "epoch": 1.0712012144361196, + "grad_norm": 2.59375, + "learning_rate": 2.8950236829437243e-06, + "loss": 0.1595, + "mean_token_accuracy": 0.9665923917293548, + "num_tokens": 4630862596.0, + "step": 43750 + }, + { + "entropy": 1.24796875, + "epoch": 1.0724254443954753, + "grad_norm": 3.625, + "learning_rate": 2.8889548767110325e-06, + "loss": 0.1726, + "mean_token_accuracy": 0.9622351431846619, + "num_tokens": 4636080162.0, + "step": 43800 + }, + { + "entropy": 1.255, + "epoch": 1.0736496743548307, + "grad_norm": 2.984375, + "learning_rate": 2.882886525494528e-06, + "loss": 0.1677, + "mean_token_accuracy": 0.9641489648818969, + "num_tokens": 4641603830.0, + "step": 43850 + }, + { + "entropy": 1.26390625, + "epoch": 1.0748739043141864, + "grad_norm": 2.203125, + "learning_rate": 2.8768186541597617e-06, + "loss": 0.1803, + "mean_token_accuracy": 0.9621511352062225, + "num_tokens": 4647162733.0, + "step": 43900 + }, + { + "entropy": 1.26953125, + "epoch": 1.0760981342735418, + "grad_norm": 2.359375, + "learning_rate": 2.8707512875703146e-06, + "loss": 0.1724, + "mean_token_accuracy": 0.963198972940445, + "num_tokens": 4652659894.0, + "step": 43950 + }, + { + "entropy": 1.261875, + "epoch": 1.0773223642328975, + "grad_norm": 2.984375, + "learning_rate": 2.8646844505877032e-06, + "loss": 0.1702, + "mean_token_accuracy": 0.963871557712555, + "num_tokens": 4657833019.0, + "step": 44000 + }, + { + "entropy": 1.25171875, + "epoch": 1.078546594192253, + "grad_norm": 3.875, + "learning_rate": 2.8586181680712726e-06, + "loss": 0.1671, + "mean_token_accuracy": 0.9647689509391785, + "num_tokens": 4663099416.0, + "step": 44050 + }, + { + "entropy": 1.2353125, + "epoch": 1.0797708241516086, + "grad_norm": 1.921875, + "learning_rate": 2.852552464878096e-06, + "loss": 0.1626, + "mean_token_accuracy": 0.9649975061416626, + "num_tokens": 4668463403.0, + "step": 44100 + }, + { + "entropy": 1.2309375, + "epoch": 1.0809950541109643, + "grad_norm": 3.515625, + "learning_rate": 2.846487365862872e-06, + "loss": 0.1622, + "mean_token_accuracy": 0.966260347366333, + "num_tokens": 4673588957.0, + "step": 44150 + }, + { + "entropy": 1.2703125, + "epoch": 1.0822192840703198, + "grad_norm": 2.921875, + "learning_rate": 2.840422895877824e-06, + "loss": 0.1829, + "mean_token_accuracy": 0.9611806380748749, + "num_tokens": 4679435999.0, + "step": 44200 + }, + { + "entropy": 1.21671875, + "epoch": 1.0834435140296754, + "grad_norm": 2.515625, + "learning_rate": 2.8343590797725993e-06, + "loss": 0.1595, + "mean_token_accuracy": 0.9657203650474548, + "num_tokens": 4684283427.0, + "step": 44250 + }, + { + "entropy": 1.23546875, + "epoch": 1.0846677439890309, + "grad_norm": 2.296875, + "learning_rate": 2.828295942394163e-06, + "loss": 0.1545, + "mean_token_accuracy": 0.9663613975048065, + "num_tokens": 4689166634.0, + "step": 44300 + }, + { + "entropy": 1.2715625, + "epoch": 1.0858919739483865, + "grad_norm": 2.140625, + "learning_rate": 2.822233508586702e-06, + "loss": 0.1721, + "mean_token_accuracy": 0.9638037300109863, + "num_tokens": 4694728156.0, + "step": 44350 + }, + { + "entropy": 1.246875, + "epoch": 1.087116203907742, + "grad_norm": 1.1171875, + "learning_rate": 2.8161718031915194e-06, + "loss": 0.1629, + "mean_token_accuracy": 0.9652890110015869, + "num_tokens": 4700054529.0, + "step": 44400 + }, + { + "entropy": 1.26203125, + "epoch": 1.0883404338670977, + "grad_norm": 1.9765625, + "learning_rate": 2.8101108510469308e-06, + "loss": 0.1667, + "mean_token_accuracy": 0.9647334861755371, + "num_tokens": 4705521940.0, + "step": 44450 + }, + { + "entropy": 1.26171875, + "epoch": 1.0895646638264531, + "grad_norm": 2.15625, + "learning_rate": 2.804050676988169e-06, + "loss": 0.1764, + "mean_token_accuracy": 0.9625956809520722, + "num_tokens": 4711285057.0, + "step": 44500 + }, + { + "entropy": 1.26828125, + "epoch": 1.0907888937858088, + "grad_norm": 3.21875, + "learning_rate": 2.797991305847279e-06, + "loss": 0.1695, + "mean_token_accuracy": 0.9635378420352936, + "num_tokens": 4716659220.0, + "step": 44550 + }, + { + "entropy": 1.25296875, + "epoch": 1.0920131237451642, + "grad_norm": 2.84375, + "learning_rate": 2.7919327624530105e-06, + "loss": 0.1589, + "mean_token_accuracy": 0.966244969367981, + "num_tokens": 4721738500.0, + "step": 44600 + }, + { + "entropy": 1.25390625, + "epoch": 1.09323735370452, + "grad_norm": 1.5859375, + "learning_rate": 2.7858750716307267e-06, + "loss": 0.1629, + "mean_token_accuracy": 0.9655514645576477, + "num_tokens": 4727007974.0, + "step": 44650 + }, + { + "entropy": 1.261875, + "epoch": 1.0944615836638754, + "grad_norm": 3.15625, + "learning_rate": 2.7798182582022956e-06, + "loss": 0.1666, + "mean_token_accuracy": 0.9647921168804169, + "num_tokens": 4732247570.0, + "step": 44700 + }, + { + "entropy": 1.275, + "epoch": 1.095685813623231, + "grad_norm": 3.0, + "learning_rate": 2.7737623469859904e-06, + "loss": 0.1753, + "mean_token_accuracy": 0.9633481323719024, + "num_tokens": 4737626660.0, + "step": 44750 + }, + { + "entropy": 1.27203125, + "epoch": 1.0969100435825865, + "grad_norm": 3.1875, + "learning_rate": 2.767707362796385e-06, + "loss": 0.1707, + "mean_token_accuracy": 0.9635563850402832, + "num_tokens": 4743127298.0, + "step": 44800 + }, + { + "entropy": 1.2753125, + "epoch": 1.0981342735419422, + "grad_norm": 2.28125, + "learning_rate": 2.7616533304442583e-06, + "loss": 0.1725, + "mean_token_accuracy": 0.9624858343601227, + "num_tokens": 4748930038.0, + "step": 44850 + }, + { + "entropy": 1.251875, + "epoch": 1.0993585035012976, + "grad_norm": 2.828125, + "learning_rate": 2.7556002747364882e-06, + "loss": 0.1618, + "mean_token_accuracy": 0.965050835609436, + "num_tokens": 4754015548.0, + "step": 44900 + }, + { + "entropy": 1.24, + "epoch": 1.1005827334606533, + "grad_norm": 2.515625, + "learning_rate": 2.749548220475947e-06, + "loss": 0.1556, + "mean_token_accuracy": 0.9672428011894226, + "num_tokens": 4759064667.0, + "step": 44950 + }, + { + "entropy": 1.24671875, + "epoch": 1.1018069634200087, + "grad_norm": 3.796875, + "learning_rate": 2.7434971924614085e-06, + "loss": 0.1581, + "mean_token_accuracy": 0.9658971416950226, + "num_tokens": 4764080171.0, + "step": 45000 + }, + { + "entropy": 1.27921875, + "epoch": 1.1030311933793644, + "grad_norm": 1.6953125, + "learning_rate": 2.7374472154874396e-06, + "loss": 0.1743, + "mean_token_accuracy": 0.9628953158855438, + "num_tokens": 4769590544.0, + "step": 45050 + }, + { + "entropy": 1.27578125, + "epoch": 1.1042554233387198, + "grad_norm": 3.203125, + "learning_rate": 2.731398314344298e-06, + "loss": 0.172, + "mean_token_accuracy": 0.9631561875343323, + "num_tokens": 4774983478.0, + "step": 45100 + }, + { + "entropy": 1.26796875, + "epoch": 1.1054796532980755, + "grad_norm": 2.625, + "learning_rate": 2.7253505138178363e-06, + "loss": 0.1626, + "mean_token_accuracy": 0.9651547718048096, + "num_tokens": 4780291854.0, + "step": 45150 + }, + { + "entropy": 1.24359375, + "epoch": 1.1067038832574312, + "grad_norm": 1.7734375, + "learning_rate": 2.719303838689397e-06, + "loss": 0.1586, + "mean_token_accuracy": 0.9661097753047944, + "num_tokens": 4785746067.0, + "step": 45200 + }, + { + "entropy": 1.27703125, + "epoch": 1.1079281132167866, + "grad_norm": 3.21875, + "learning_rate": 2.7132583137357085e-06, + "loss": 0.173, + "mean_token_accuracy": 0.9634435415267945, + "num_tokens": 4791411988.0, + "step": 45250 + }, + { + "entropy": 1.26703125, + "epoch": 1.1091523431761423, + "grad_norm": 2.390625, + "learning_rate": 2.70721396372879e-06, + "loss": 0.1574, + "mean_token_accuracy": 0.9663924646377563, + "num_tokens": 4796839124.0, + "step": 45300 + }, + { + "entropy": 1.251875, + "epoch": 1.1103765731354978, + "grad_norm": 1.7265625, + "learning_rate": 2.7011708134358433e-06, + "loss": 0.1702, + "mean_token_accuracy": 0.963711371421814, + "num_tokens": 4802261281.0, + "step": 45350 + }, + { + "entropy": 1.26109375, + "epoch": 1.1116008030948534, + "grad_norm": 3.34375, + "learning_rate": 2.6951288876191554e-06, + "loss": 0.163, + "mean_token_accuracy": 0.9658736658096313, + "num_tokens": 4807722190.0, + "step": 45400 + }, + { + "entropy": 1.2421875, + "epoch": 1.1128250330542089, + "grad_norm": 2.515625, + "learning_rate": 2.689088211035996e-06, + "loss": 0.1582, + "mean_token_accuracy": 0.9665179479122162, + "num_tokens": 4812528854.0, + "step": 45450 + }, + { + "entropy": 1.26859375, + "epoch": 1.1140492630135646, + "grad_norm": 4.28125, + "learning_rate": 2.6830488084385153e-06, + "loss": 0.1633, + "mean_token_accuracy": 0.9647966718673706, + "num_tokens": 4817654045.0, + "step": 45500 + }, + { + "entropy": 1.276875, + "epoch": 1.11527349297292, + "grad_norm": 3.078125, + "learning_rate": 2.6770107045736457e-06, + "loss": 0.1659, + "mean_token_accuracy": 0.9641125738620758, + "num_tokens": 4823118089.0, + "step": 45550 + }, + { + "entropy": 1.26890625, + "epoch": 1.1164977229322757, + "grad_norm": 3.40625, + "learning_rate": 2.670973924182993e-06, + "loss": 0.1652, + "mean_token_accuracy": 0.965114232301712, + "num_tokens": 4828253691.0, + "step": 45600 + }, + { + "entropy": 1.25203125, + "epoch": 1.1177219528916311, + "grad_norm": 4.03125, + "learning_rate": 2.664938492002745e-06, + "loss": 0.1578, + "mean_token_accuracy": 0.965950778722763, + "num_tokens": 4833456111.0, + "step": 45650 + }, + { + "entropy": 1.27203125, + "epoch": 1.1189461828509868, + "grad_norm": 2.421875, + "learning_rate": 2.658904432763564e-06, + "loss": 0.172, + "mean_token_accuracy": 0.962825288772583, + "num_tokens": 4838982999.0, + "step": 45700 + }, + { + "entropy": 1.2459375, + "epoch": 1.1201704128103422, + "grad_norm": 2.53125, + "learning_rate": 2.6528717711904823e-06, + "loss": 0.1553, + "mean_token_accuracy": 0.9660564112663269, + "num_tokens": 4844057439.0, + "step": 45750 + }, + { + "entropy": 1.2546875, + "epoch": 1.121394642769698, + "grad_norm": 2.78125, + "learning_rate": 2.6468405320028107e-06, + "loss": 0.1758, + "mean_token_accuracy": 0.9631454050540924, + "num_tokens": 4849526204.0, + "step": 45800 + }, + { + "entropy": 1.2471875, + "epoch": 1.1226188727290534, + "grad_norm": 2.375, + "learning_rate": 2.6408107399140297e-06, + "loss": 0.1525, + "mean_token_accuracy": 0.9672383844852448, + "num_tokens": 4854563999.0, + "step": 45850 + }, + { + "entropy": 1.2390625, + "epoch": 1.123843102688409, + "grad_norm": 2.796875, + "learning_rate": 2.6347824196316884e-06, + "loss": 0.1571, + "mean_token_accuracy": 0.9666775286197662, + "num_tokens": 4859889553.0, + "step": 45900 + }, + { + "entropy": 1.251875, + "epoch": 1.1250673326477645, + "grad_norm": 6.1875, + "learning_rate": 2.628755595857308e-06, + "loss": 0.1659, + "mean_token_accuracy": 0.964877005815506, + "num_tokens": 4865439463.0, + "step": 45950 + }, + { + "entropy": 1.26578125, + "epoch": 1.1262915626071202, + "grad_norm": 2.9375, + "learning_rate": 2.622730293286276e-06, + "loss": 0.1663, + "mean_token_accuracy": 0.9647691214084625, + "num_tokens": 4870527275.0, + "step": 46000 + }, + { + "entropy": 1.2690625, + "epoch": 1.1275157925664756, + "grad_norm": 3.84375, + "learning_rate": 2.6167065366077473e-06, + "loss": 0.164, + "mean_token_accuracy": 0.9649512505531311, + "num_tokens": 4875809735.0, + "step": 46050 + }, + { + "entropy": 1.2575, + "epoch": 1.1287400225258313, + "grad_norm": 3.546875, + "learning_rate": 2.6106843505045403e-06, + "loss": 0.1637, + "mean_token_accuracy": 0.9659580600261688, + "num_tokens": 4881072058.0, + "step": 46100 + }, + { + "entropy": 1.2534375, + "epoch": 1.1299642524851867, + "grad_norm": 3.265625, + "learning_rate": 2.6046637596530405e-06, + "loss": 0.1738, + "mean_token_accuracy": 0.9629634070396423, + "num_tokens": 4886211504.0, + "step": 46150 + }, + { + "entropy": 1.255, + "epoch": 1.1311884824445424, + "grad_norm": 2.953125, + "learning_rate": 2.598644788723097e-06, + "loss": 0.1635, + "mean_token_accuracy": 0.964535938501358, + "num_tokens": 4891417957.0, + "step": 46200 + }, + { + "entropy": 1.26296875, + "epoch": 1.132412712403898, + "grad_norm": 2.25, + "learning_rate": 2.5926274623779176e-06, + "loss": 0.1648, + "mean_token_accuracy": 0.9648210310935974, + "num_tokens": 4897027521.0, + "step": 46250 + }, + { + "entropy": 1.2715625, + "epoch": 1.1336369423632535, + "grad_norm": 2.359375, + "learning_rate": 2.5866118052739744e-06, + "loss": 0.1701, + "mean_token_accuracy": 0.9643675744533539, + "num_tokens": 4902630666.0, + "step": 46300 + }, + { + "entropy": 1.2640625, + "epoch": 1.134861172322609, + "grad_norm": 2.921875, + "learning_rate": 2.5805978420608995e-06, + "loss": 0.1588, + "mean_token_accuracy": 0.9654871869087219, + "num_tokens": 4907957609.0, + "step": 46350 + }, + { + "entropy": 1.25765625, + "epoch": 1.1360854022819646, + "grad_norm": 1.96875, + "learning_rate": 2.574585597381383e-06, + "loss": 0.1657, + "mean_token_accuracy": 0.964663782119751, + "num_tokens": 4913108629.0, + "step": 46400 + }, + { + "entropy": 1.26984375, + "epoch": 1.1373096322413203, + "grad_norm": 2.890625, + "learning_rate": 2.5685750958710737e-06, + "loss": 0.1654, + "mean_token_accuracy": 0.9640021121501923, + "num_tokens": 4918622288.0, + "step": 46450 + }, + { + "entropy": 1.2890625, + "epoch": 1.1385338622006758, + "grad_norm": 2.59375, + "learning_rate": 2.5625663621584777e-06, + "loss": 0.1822, + "mean_token_accuracy": 0.9616779792308807, + "num_tokens": 4924224135.0, + "step": 46500 + }, + { + "entropy": 1.2665625, + "epoch": 1.1397580921600314, + "grad_norm": 3.171875, + "learning_rate": 2.5565594208648566e-06, + "loss": 0.1703, + "mean_token_accuracy": 0.9643717563152313, + "num_tokens": 4929573607.0, + "step": 46550 + }, + { + "entropy": 1.2684375, + "epoch": 1.1409823221193869, + "grad_norm": 3.296875, + "learning_rate": 2.5505542966041285e-06, + "loss": 0.1726, + "mean_token_accuracy": 0.9641470229625702, + "num_tokens": 4935198269.0, + "step": 46600 + }, + { + "entropy": 1.2725, + "epoch": 1.1422065520787426, + "grad_norm": 2.484375, + "learning_rate": 2.5445510139827656e-06, + "loss": 0.1731, + "mean_token_accuracy": 0.9628414344787598, + "num_tokens": 4940751379.0, + "step": 46650 + }, + { + "entropy": 1.2690625, + "epoch": 1.143430782038098, + "grad_norm": 1.78125, + "learning_rate": 2.5385495975996952e-06, + "loss": 0.1769, + "mean_token_accuracy": 0.9626391875743866, + "num_tokens": 4946216596.0, + "step": 46700 + }, + { + "entropy": 1.27125, + "epoch": 1.1446550119974537, + "grad_norm": 3.359375, + "learning_rate": 2.532550072046194e-06, + "loss": 0.179, + "mean_token_accuracy": 0.9620010888576508, + "num_tokens": 4951891973.0, + "step": 46750 + }, + { + "entropy": 1.28984375, + "epoch": 1.1458792419568091, + "grad_norm": 1.71875, + "learning_rate": 2.5265524619057936e-06, + "loss": 0.1822, + "mean_token_accuracy": 0.9611503231525421, + "num_tokens": 4957928188.0, + "step": 46800 + }, + { + "entropy": 1.27203125, + "epoch": 1.1471034719161648, + "grad_norm": 2.265625, + "learning_rate": 2.520556791754179e-06, + "loss": 0.1675, + "mean_token_accuracy": 0.9632143163681031, + "num_tokens": 4963189602.0, + "step": 46850 + }, + { + "entropy": 1.2546875, + "epoch": 1.1483277018755202, + "grad_norm": 2.296875, + "learning_rate": 2.5145630861590806e-06, + "loss": 0.1677, + "mean_token_accuracy": 0.9636298882961273, + "num_tokens": 4968384917.0, + "step": 46900 + }, + { + "entropy": 1.234375, + "epoch": 1.149551931834876, + "grad_norm": 2.0, + "learning_rate": 2.5085713696801825e-06, + "loss": 0.1456, + "mean_token_accuracy": 0.9684996688365937, + "num_tokens": 4973304826.0, + "step": 46950 + }, + { + "entropy": 1.2384375, + "epoch": 1.1507761617942314, + "grad_norm": 3.1875, + "learning_rate": 2.5025816668690183e-06, + "loss": 0.1615, + "mean_token_accuracy": 0.9655906355381012, + "num_tokens": 4978583670.0, + "step": 47000 + }, + { + "entropy": 1.23921875, + "epoch": 1.152000391753587, + "grad_norm": 2.796875, + "learning_rate": 2.496594002268869e-06, + "loss": 0.1633, + "mean_token_accuracy": 0.9643825757503509, + "num_tokens": 4983769645.0, + "step": 47050 + }, + { + "entropy": 1.2440625, + "epoch": 1.1532246217129425, + "grad_norm": 2.359375, + "learning_rate": 2.490608400414664e-06, + "loss": 0.1601, + "mean_token_accuracy": 0.9659870672225952, + "num_tokens": 4989133497.0, + "step": 47100 + }, + { + "entropy": 1.2484375, + "epoch": 1.1544488516722982, + "grad_norm": 3.15625, + "learning_rate": 2.484624885832883e-06, + "loss": 0.1618, + "mean_token_accuracy": 0.9654805910587311, + "num_tokens": 4994369533.0, + "step": 47150 + }, + { + "entropy": 1.2490625, + "epoch": 1.1556730816316536, + "grad_norm": 3.109375, + "learning_rate": 2.478643483041449e-06, + "loss": 0.1616, + "mean_token_accuracy": 0.9649089682102203, + "num_tokens": 4999527347.0, + "step": 47200 + }, + { + "entropy": 1.2553125, + "epoch": 1.1568973115910093, + "grad_norm": 2.4375, + "learning_rate": 2.472664216549633e-06, + "loss": 0.1627, + "mean_token_accuracy": 0.9657234275341033, + "num_tokens": 5004961075.0, + "step": 47250 + }, + { + "entropy": 1.24203125, + "epoch": 1.1581215415503647, + "grad_norm": 2.640625, + "learning_rate": 2.466687110857955e-06, + "loss": 0.1533, + "mean_token_accuracy": 0.9676401782035827, + "num_tokens": 5009801621.0, + "step": 47300 + }, + { + "entropy": 1.2534375, + "epoch": 1.1593457715097204, + "grad_norm": 1.875, + "learning_rate": 2.4607121904580796e-06, + "loss": 0.1689, + "mean_token_accuracy": 0.96378169298172, + "num_tokens": 5015019832.0, + "step": 47350 + }, + { + "entropy": 1.285625, + "epoch": 1.1605700014690759, + "grad_norm": 2.6875, + "learning_rate": 2.4547394798327127e-06, + "loss": 0.1824, + "mean_token_accuracy": 0.961477290391922, + "num_tokens": 5020771556.0, + "step": 47400 + }, + { + "entropy": 1.2609375, + "epoch": 1.1617942314284315, + "grad_norm": 0.00445556640625, + "learning_rate": 2.448769003455512e-06, + "loss": 0.1606, + "mean_token_accuracy": 0.9650316751003265, + "num_tokens": 5026174408.0, + "step": 47450 + }, + { + "entropy": 1.24875, + "epoch": 1.1630184613877872, + "grad_norm": 2.96875, + "learning_rate": 2.442800785790977e-06, + "loss": 0.1554, + "mean_token_accuracy": 0.9664806413650513, + "num_tokens": 5031142557.0, + "step": 47500 + }, + { + "entropy": 1.25828125, + "epoch": 1.1642426913471426, + "grad_norm": 2.75, + "learning_rate": 2.436834851294351e-06, + "loss": 0.1731, + "mean_token_accuracy": 0.9635387444496155, + "num_tokens": 5036598656.0, + "step": 47550 + }, + { + "entropy": 1.2440625, + "epoch": 1.165466921306498, + "grad_norm": 3.125, + "learning_rate": 2.4308712244115256e-06, + "loss": 0.1652, + "mean_token_accuracy": 0.9645625805854797, + "num_tokens": 5041932484.0, + "step": 47600 + }, + { + "entropy": 1.23, + "epoch": 1.1666911512658538, + "grad_norm": 4.53125, + "learning_rate": 2.4249099295789315e-06, + "loss": 0.1503, + "mean_token_accuracy": 0.9676901125907897, + "num_tokens": 5047049390.0, + "step": 47650 + }, + { + "entropy": 1.24640625, + "epoch": 1.1679153812252094, + "grad_norm": 1.6953125, + "learning_rate": 2.4189509912234475e-06, + "loss": 0.1754, + "mean_token_accuracy": 0.9623109328746796, + "num_tokens": 5052498083.0, + "step": 47700 + }, + { + "entropy": 1.2228125, + "epoch": 1.1691396111845649, + "grad_norm": 2.21875, + "learning_rate": 2.412994433762295e-06, + "loss": 0.1438, + "mean_token_accuracy": 0.9679240989685058, + "num_tokens": 5057358329.0, + "step": 47750 + }, + { + "entropy": 1.236875, + "epoch": 1.1703638411439206, + "grad_norm": 3.390625, + "learning_rate": 2.407040281602942e-06, + "loss": 0.1549, + "mean_token_accuracy": 0.9666338443756104, + "num_tokens": 5062500243.0, + "step": 47800 + }, + { + "entropy": 1.2196875, + "epoch": 1.171588071103276, + "grad_norm": 0.005096435546875, + "learning_rate": 2.4010885591429955e-06, + "loss": 0.1541, + "mean_token_accuracy": 0.9668021559715271, + "num_tokens": 5067435842.0, + "step": 47850 + }, + { + "entropy": 1.25109375, + "epoch": 1.1728123010626317, + "grad_norm": 3.40625, + "learning_rate": 2.3951392907701115e-06, + "loss": 0.1831, + "mean_token_accuracy": 0.9610938668251038, + "num_tokens": 5073063170.0, + "step": 47900 + }, + { + "entropy": 1.24375, + "epoch": 1.1740365310219871, + "grad_norm": 3.328125, + "learning_rate": 2.389192500861888e-06, + "loss": 0.1754, + "mean_token_accuracy": 0.9621718871593475, + "num_tokens": 5078828458.0, + "step": 47950 + }, + { + "entropy": 1.26078125, + "epoch": 1.1752607609813428, + "grad_norm": 2.578125, + "learning_rate": 2.3832482137857685e-06, + "loss": 0.175, + "mean_token_accuracy": 0.9630187213420868, + "num_tokens": 5084161692.0, + "step": 48000 + }, + { + "entropy": 1.2571875, + "epoch": 1.1764849909406982, + "grad_norm": 1.8046875, + "learning_rate": 2.377306453898938e-06, + "loss": 0.1689, + "mean_token_accuracy": 0.9643845617771148, + "num_tokens": 5089346169.0, + "step": 48050 + }, + { + "entropy": 1.25125, + "epoch": 1.177709220900054, + "grad_norm": 3.34375, + "learning_rate": 2.3713672455482293e-06, + "loss": 0.1609, + "mean_token_accuracy": 0.9652318274974823, + "num_tokens": 5094622581.0, + "step": 48100 + }, + { + "entropy": 1.24921875, + "epoch": 1.1789334508594094, + "grad_norm": 1.953125, + "learning_rate": 2.36543061307002e-06, + "loss": 0.1611, + "mean_token_accuracy": 0.9650622093677521, + "num_tokens": 5099539248.0, + "step": 48150 + }, + { + "entropy": 1.2584375, + "epoch": 1.180157680818765, + "grad_norm": 4.5, + "learning_rate": 2.35949658079013e-06, + "loss": 0.1693, + "mean_token_accuracy": 0.9631922256946563, + "num_tokens": 5104589567.0, + "step": 48200 + }, + { + "entropy": 1.26328125, + "epoch": 1.1813819107781205, + "grad_norm": 3.09375, + "learning_rate": 2.3535651730237275e-06, + "loss": 0.1613, + "mean_token_accuracy": 0.9661449313163757, + "num_tokens": 5109766096.0, + "step": 48250 + }, + { + "entropy": 1.25484375, + "epoch": 1.1826061407374762, + "grad_norm": 3.125, + "learning_rate": 2.3476364140752266e-06, + "loss": 0.1599, + "mean_token_accuracy": 0.9653767657279968, + "num_tokens": 5114683078.0, + "step": 48300 + }, + { + "entropy": 1.28109375, + "epoch": 1.1838303706968316, + "grad_norm": 3.46875, + "learning_rate": 2.341710328238185e-06, + "loss": 0.1725, + "mean_token_accuracy": 0.9629187500476837, + "num_tokens": 5120172628.0, + "step": 48350 + }, + { + "entropy": 1.2590625, + "epoch": 1.1850546006561873, + "grad_norm": 2.125, + "learning_rate": 2.335786939795209e-06, + "loss": 0.1574, + "mean_token_accuracy": 0.966355732679367, + "num_tokens": 5125111521.0, + "step": 48400 + }, + { + "entropy": 1.2721875, + "epoch": 1.1862788306155427, + "grad_norm": 2.046875, + "learning_rate": 2.3298662730178536e-06, + "loss": 0.1635, + "mean_token_accuracy": 0.9648284649848938, + "num_tokens": 5130646209.0, + "step": 48450 + }, + { + "entropy": 1.2484375, + "epoch": 1.1875030605748984, + "grad_norm": 2.703125, + "learning_rate": 2.3239483521665165e-06, + "loss": 0.1529, + "mean_token_accuracy": 0.9668037176132203, + "num_tokens": 5135665531.0, + "step": 48500 + }, + { + "entropy": 1.25546875, + "epoch": 1.188727290534254, + "grad_norm": 1.5703125, + "learning_rate": 2.31803320149035e-06, + "loss": 0.1674, + "mean_token_accuracy": 0.9642703318595887, + "num_tokens": 5140993137.0, + "step": 48550 + }, + { + "entropy": 1.2759375, + "epoch": 1.1899515204936095, + "grad_norm": 1.875, + "learning_rate": 2.312120845227151e-06, + "loss": 0.1682, + "mean_token_accuracy": 0.9635923814773559, + "num_tokens": 5146394110.0, + "step": 48600 + }, + { + "entropy": 1.269375, + "epoch": 1.191175750452965, + "grad_norm": 2.125, + "learning_rate": 2.306211307603269e-06, + "loss": 0.1603, + "mean_token_accuracy": 0.9650293779373169, + "num_tokens": 5151444447.0, + "step": 48650 + }, + { + "entropy": 1.2778125, + "epoch": 1.1923999804123206, + "grad_norm": 3.484375, + "learning_rate": 2.3003046128335004e-06, + "loss": 0.1725, + "mean_token_accuracy": 0.962925443649292, + "num_tokens": 5157164016.0, + "step": 48700 + }, + { + "entropy": 1.2559375, + "epoch": 1.1936242103716763, + "grad_norm": 0.00262451171875, + "learning_rate": 2.2944007851209967e-06, + "loss": 0.1555, + "mean_token_accuracy": 0.9663327503204345, + "num_tokens": 5162287319.0, + "step": 48750 + }, + { + "entropy": 1.25578125, + "epoch": 1.1948484403310318, + "grad_norm": 1.84375, + "learning_rate": 2.2884998486571587e-06, + "loss": 0.1623, + "mean_token_accuracy": 0.9643605947494507, + "num_tokens": 5167697788.0, + "step": 48800 + }, + { + "entropy": 1.2621875, + "epoch": 1.1960726702903874, + "grad_norm": 2.421875, + "learning_rate": 2.2826018276215404e-06, + "loss": 0.1641, + "mean_token_accuracy": 0.9648311936855316, + "num_tokens": 5172726413.0, + "step": 48850 + }, + { + "entropy": 1.256875, + "epoch": 1.197296900249743, + "grad_norm": 3.9375, + "learning_rate": 2.276706746181751e-06, + "loss": 0.1647, + "mean_token_accuracy": 0.9653891062736512, + "num_tokens": 5177807515.0, + "step": 48900 + }, + { + "entropy": 1.24484375, + "epoch": 1.1985211302090986, + "grad_norm": 3.359375, + "learning_rate": 2.2708146284933544e-06, + "loss": 0.1491, + "mean_token_accuracy": 0.9672402215003967, + "num_tokens": 5182682002.0, + "step": 48950 + }, + { + "entropy": 1.2434375, + "epoch": 1.199745360168454, + "grad_norm": 2.09375, + "learning_rate": 2.2649254986997666e-06, + "loss": 0.1625, + "mean_token_accuracy": 0.9646528875827789, + "num_tokens": 5187927187.0, + "step": 49000 + }, + { + "entropy": 1.26171875, + "epoch": 1.2009695901278097, + "grad_norm": 2.140625, + "learning_rate": 2.2590393809321657e-06, + "loss": 0.1601, + "mean_token_accuracy": 0.9654495012760163, + "num_tokens": 5192885819.0, + "step": 49050 + }, + { + "entropy": 1.26296875, + "epoch": 1.2021938200871651, + "grad_norm": 2.015625, + "learning_rate": 2.2531562993093854e-06, + "loss": 0.1631, + "mean_token_accuracy": 0.9647388279438018, + "num_tokens": 5198240652.0, + "step": 49100 + }, + { + "entropy": 1.23265625, + "epoch": 1.2034180500465208, + "grad_norm": 2.5625, + "learning_rate": 2.247276277937817e-06, + "loss": 0.1537, + "mean_token_accuracy": 0.966611897945404, + "num_tokens": 5203287957.0, + "step": 49150 + }, + { + "entropy": 1.24109375, + "epoch": 1.2046422800058763, + "grad_norm": 2.671875, + "learning_rate": 2.241399340911315e-06, + "loss": 0.1582, + "mean_token_accuracy": 0.9648150885105133, + "num_tokens": 5208259781.0, + "step": 49200 + }, + { + "entropy": 1.22828125, + "epoch": 1.205866509965232, + "grad_norm": 1.796875, + "learning_rate": 2.235525512311094e-06, + "loss": 0.1659, + "mean_token_accuracy": 0.9645445287227631, + "num_tokens": 5213559098.0, + "step": 49250 + }, + { + "entropy": 1.23921875, + "epoch": 1.2070907399245874, + "grad_norm": 3.1875, + "learning_rate": 2.229654816205632e-06, + "loss": 0.1694, + "mean_token_accuracy": 0.9639151406288147, + "num_tokens": 5218710994.0, + "step": 49300 + }, + { + "entropy": 1.2425, + "epoch": 1.208314969883943, + "grad_norm": 2.828125, + "learning_rate": 2.2237872766505715e-06, + "loss": 0.1676, + "mean_token_accuracy": 0.9631175470352172, + "num_tokens": 5224096915.0, + "step": 49350 + }, + { + "entropy": 1.25015625, + "epoch": 1.2095391998432985, + "grad_norm": 3.71875, + "learning_rate": 2.2179229176886196e-06, + "loss": 0.1731, + "mean_token_accuracy": 0.9628188860416412, + "num_tokens": 5229833600.0, + "step": 49400 + }, + { + "entropy": 1.24265625, + "epoch": 1.2107634298026542, + "grad_norm": 4.125, + "learning_rate": 2.212061763349454e-06, + "loss": 0.1616, + "mean_token_accuracy": 0.9654302883148194, + "num_tokens": 5235131114.0, + "step": 49450 + }, + { + "entropy": 1.23765625, + "epoch": 1.2119876597620096, + "grad_norm": 1.9375, + "learning_rate": 2.206203837649615e-06, + "loss": 0.1555, + "mean_token_accuracy": 0.9665101909637451, + "num_tokens": 5240317138.0, + "step": 49500 + }, + { + "entropy": 1.24921875, + "epoch": 1.2132118897213653, + "grad_norm": 1.9609375, + "learning_rate": 2.2003491645924195e-06, + "loss": 0.1715, + "mean_token_accuracy": 0.9628171730041504, + "num_tokens": 5245861371.0, + "step": 49550 + }, + { + "entropy": 1.24640625, + "epoch": 1.2144361196807207, + "grad_norm": 2.859375, + "learning_rate": 2.194497768167855e-06, + "loss": 0.1703, + "mean_token_accuracy": 0.9627651238441467, + "num_tokens": 5251350220.0, + "step": 49600 + }, + { + "entropy": 1.2528125, + "epoch": 1.2156603496400764, + "grad_norm": 2.296875, + "learning_rate": 2.188649672352479e-06, + "loss": 0.1707, + "mean_token_accuracy": 0.964025752544403, + "num_tokens": 5256995465.0, + "step": 49650 + }, + { + "entropy": 1.25234375, + "epoch": 1.2168845795994319, + "grad_norm": 2.703125, + "learning_rate": 2.1828049011093286e-06, + "loss": 0.1702, + "mean_token_accuracy": 0.9648704588413238, + "num_tokens": 5262286472.0, + "step": 49700 + }, + { + "entropy": 1.241875, + "epoch": 1.2181088095587875, + "grad_norm": 3.578125, + "learning_rate": 2.1769634783878182e-06, + "loss": 0.1579, + "mean_token_accuracy": 0.9658465564250946, + "num_tokens": 5267436922.0, + "step": 49750 + }, + { + "entropy": 1.26, + "epoch": 1.2193330395181432, + "grad_norm": 3.453125, + "learning_rate": 2.1711254281236373e-06, + "loss": 0.1804, + "mean_token_accuracy": 0.9622203695774079, + "num_tokens": 5273103073.0, + "step": 49800 + }, + { + "entropy": 1.258125, + "epoch": 1.2205572694774987, + "grad_norm": 2.78125, + "learning_rate": 2.1652907742386613e-06, + "loss": 0.178, + "mean_token_accuracy": 0.9619389712810517, + "num_tokens": 5278483949.0, + "step": 49850 + }, + { + "entropy": 1.24796875, + "epoch": 1.221781499436854, + "grad_norm": 1.5546875, + "learning_rate": 2.159459540640847e-06, + "loss": 0.161, + "mean_token_accuracy": 0.9660306286811828, + "num_tokens": 5283427597.0, + "step": 49900 + }, + { + "entropy": 1.27984375, + "epoch": 1.2230057293962098, + "grad_norm": 1.5703125, + "learning_rate": 2.1536317512241348e-06, + "loss": 0.1777, + "mean_token_accuracy": 0.9623690032958985, + "num_tokens": 5288987030.0, + "step": 49950 + }, + { + "entropy": 1.2584375, + "epoch": 1.2242299593555654, + "grad_norm": 2.71875, + "learning_rate": 2.147807429868352e-06, + "loss": 0.1658, + "mean_token_accuracy": 0.9644541823863984, + "num_tokens": 5294529728.0, + "step": 50000 + }, + { + "epoch": 1.2242299593555654, + "eval_entropy": 1.2479817708333334, + "eval_loss": 0.17940963804721832, + "eval_mean_token_accuracy": 0.9616454169154167, + "eval_num_tokens": 5294529728.0, + "eval_runtime": 604.376, + "eval_samples_per_second": 15.977, + "eval_steps_per_second": 0.2, + "step": 50000 + }, + { + "entropy": 1.2259375, + "epoch": 1.225454189314921, + "grad_norm": 2.09375, + "learning_rate": 2.141986600439119e-06, + "loss": 0.153, + "mean_token_accuracy": 0.9670542335510254, + "num_tokens": 5299381949.0, + "step": 50050 + }, + { + "entropy": 1.26140625, + "epoch": 1.2266784192742766, + "grad_norm": 2.625, + "learning_rate": 2.1361692867877455e-06, + "loss": 0.1754, + "mean_token_accuracy": 0.9621517550945282, + "num_tokens": 5304936166.0, + "step": 50100 + }, + { + "entropy": 1.24140625, + "epoch": 1.227902649233632, + "grad_norm": 0.00927734375, + "learning_rate": 2.1303555127511327e-06, + "loss": 0.1545, + "mean_token_accuracy": 0.96613614320755, + "num_tokens": 5310169155.0, + "step": 50150 + }, + { + "entropy": 1.261875, + "epoch": 1.2291268791929877, + "grad_norm": 2.65625, + "learning_rate": 2.124545302151681e-06, + "loss": 0.1693, + "mean_token_accuracy": 0.9642032277584076, + "num_tokens": 5315607723.0, + "step": 50200 + }, + { + "entropy": 1.26796875, + "epoch": 1.2303511091523431, + "grad_norm": 2.640625, + "learning_rate": 2.118738678797191e-06, + "loss": 0.1677, + "mean_token_accuracy": 0.9641611945629119, + "num_tokens": 5321112342.0, + "step": 50250 + }, + { + "entropy": 1.25578125, + "epoch": 1.2315753391116988, + "grad_norm": 3.6875, + "learning_rate": 2.112935666480758e-06, + "loss": 0.1583, + "mean_token_accuracy": 0.965636430978775, + "num_tokens": 5326352547.0, + "step": 50300 + }, + { + "entropy": 1.26484375, + "epoch": 1.2327995690710543, + "grad_norm": 2.046875, + "learning_rate": 2.1071362889806863e-06, + "loss": 0.1729, + "mean_token_accuracy": 0.963402829170227, + "num_tokens": 5331870603.0, + "step": 50350 + }, + { + "entropy": 1.27546875, + "epoch": 1.23402379903041, + "grad_norm": 2.46875, + "learning_rate": 2.101340570060385e-06, + "loss": 0.1711, + "mean_token_accuracy": 0.9636083686351776, + "num_tokens": 5337306717.0, + "step": 50400 + }, + { + "entropy": 1.24609375, + "epoch": 1.2352480289897654, + "grad_norm": 4.125, + "learning_rate": 2.09554853346827e-06, + "loss": 0.1558, + "mean_token_accuracy": 0.9663618934154511, + "num_tokens": 5342628594.0, + "step": 50450 + }, + { + "entropy": 1.2796875, + "epoch": 1.236472258949121, + "grad_norm": 2.1875, + "learning_rate": 2.089760202937671e-06, + "loss": 0.1711, + "mean_token_accuracy": 0.9637987637519836, + "num_tokens": 5348316678.0, + "step": 50500 + }, + { + "entropy": 1.2409375, + "epoch": 1.2376964889084765, + "grad_norm": 4.09375, + "learning_rate": 2.0839756021867306e-06, + "loss": 0.1499, + "mean_token_accuracy": 0.967620609998703, + "num_tokens": 5353095952.0, + "step": 50550 + }, + { + "entropy": 1.26671875, + "epoch": 1.2389207188678322, + "grad_norm": 2.1875, + "learning_rate": 2.07819475491831e-06, + "loss": 0.1675, + "mean_token_accuracy": 0.9643842697143554, + "num_tokens": 5358561384.0, + "step": 50600 + }, + { + "entropy": 1.25734375, + "epoch": 1.2401449488271876, + "grad_norm": 3.546875, + "learning_rate": 2.0724176848198856e-06, + "loss": 0.1578, + "mean_token_accuracy": 0.9659811770915985, + "num_tokens": 5363968041.0, + "step": 50650 + }, + { + "entropy": 1.2559375, + "epoch": 1.2413691787865433, + "grad_norm": 3.265625, + "learning_rate": 2.0666444155634613e-06, + "loss": 0.1678, + "mean_token_accuracy": 0.9649008166790009, + "num_tokens": 5369138043.0, + "step": 50700 + }, + { + "entropy": 1.2790625, + "epoch": 1.2425934087458987, + "grad_norm": 1.796875, + "learning_rate": 2.0608749708054666e-06, + "loss": 0.1717, + "mean_token_accuracy": 0.9624824106693268, + "num_tokens": 5374681050.0, + "step": 50750 + }, + { + "entropy": 1.274375, + "epoch": 1.2438176387052544, + "grad_norm": 3.5, + "learning_rate": 2.0551093741866555e-06, + "loss": 0.1653, + "mean_token_accuracy": 0.964318573474884, + "num_tokens": 5379930328.0, + "step": 50800 + }, + { + "entropy": 1.2709375, + "epoch": 1.24504186866461, + "grad_norm": 2.328125, + "learning_rate": 2.0493476493320182e-06, + "loss": 0.1639, + "mean_token_accuracy": 0.9642879796028138, + "num_tokens": 5385290824.0, + "step": 50850 + }, + { + "entropy": 1.27515625, + "epoch": 1.2462660986239655, + "grad_norm": 3.15625, + "learning_rate": 2.043589819850679e-06, + "loss": 0.1784, + "mean_token_accuracy": 0.9621766293048859, + "num_tokens": 5390915687.0, + "step": 50900 + }, + { + "entropy": 1.26828125, + "epoch": 1.247490328583321, + "grad_norm": 2.671875, + "learning_rate": 2.037835909335799e-06, + "loss": 0.1653, + "mean_token_accuracy": 0.9644598591327668, + "num_tokens": 5396364664.0, + "step": 50950 + }, + { + "entropy": 1.23453125, + "epoch": 1.2487145585426767, + "grad_norm": 3.4375, + "learning_rate": 2.032085941364483e-06, + "loss": 0.1475, + "mean_token_accuracy": 0.9683002579212189, + "num_tokens": 5401284379.0, + "step": 51000 + }, + { + "entropy": 1.264375, + "epoch": 1.2499387885020323, + "grad_norm": 2.671875, + "learning_rate": 2.026339939497681e-06, + "loss": 0.1672, + "mean_token_accuracy": 0.9641962945461273, + "num_tokens": 5406818098.0, + "step": 51050 + }, + { + "entropy": 1.23828125, + "epoch": 1.2511630184613878, + "grad_norm": 0.2138671875, + "learning_rate": 2.020597927280089e-06, + "loss": 0.1498, + "mean_token_accuracy": 0.9685159015655518, + "num_tokens": 5411689647.0, + "step": 51100 + }, + { + "entropy": 1.24640625, + "epoch": 1.2523872484207432, + "grad_norm": 2.640625, + "learning_rate": 2.014859928240058e-06, + "loss": 0.1583, + "mean_token_accuracy": 0.9665188646316528, + "num_tokens": 5416677115.0, + "step": 51150 + }, + { + "entropy": 1.24796875, + "epoch": 1.253611478380099, + "grad_norm": 1.84375, + "learning_rate": 2.0091259658894926e-06, + "loss": 0.1525, + "mean_token_accuracy": 0.9675477313995361, + "num_tokens": 5422071895.0, + "step": 51200 + }, + { + "entropy": 1.27703125, + "epoch": 1.2548357083394546, + "grad_norm": 2.15625, + "learning_rate": 2.00339606372376e-06, + "loss": 0.1796, + "mean_token_accuracy": 0.9615858125686646, + "num_tokens": 5427896152.0, + "step": 51250 + }, + { + "entropy": 1.25203125, + "epoch": 1.25605993829881, + "grad_norm": 2.21875, + "learning_rate": 1.9976702452215846e-06, + "loss": 0.1615, + "mean_token_accuracy": 0.9655699288845062, + "num_tokens": 5432956715.0, + "step": 51300 + }, + { + "entropy": 1.25671875, + "epoch": 1.2572841682581657, + "grad_norm": 2.5, + "learning_rate": 1.9919485338449633e-06, + "loss": 0.1669, + "mean_token_accuracy": 0.963955899477005, + "num_tokens": 5438521726.0, + "step": 51350 + }, + { + "entropy": 1.25890625, + "epoch": 1.2585083982175211, + "grad_norm": 3.671875, + "learning_rate": 1.9862309530390627e-06, + "loss": 0.1604, + "mean_token_accuracy": 0.9649885761737823, + "num_tokens": 5443663826.0, + "step": 51400 + }, + { + "entropy": 1.25375, + "epoch": 1.2597326281768768, + "grad_norm": 1.703125, + "learning_rate": 1.98051752623212e-06, + "loss": 0.1607, + "mean_token_accuracy": 0.9659333276748657, + "num_tokens": 5448801306.0, + "step": 51450 + }, + { + "entropy": 1.26546875, + "epoch": 1.2609568581362323, + "grad_norm": 2.234375, + "learning_rate": 1.9748082768353554e-06, + "loss": 0.1624, + "mean_token_accuracy": 0.9649898850917816, + "num_tokens": 5454048809.0, + "step": 51500 + }, + { + "entropy": 1.2559375, + "epoch": 1.262181088095588, + "grad_norm": 3.40625, + "learning_rate": 1.969103228242872e-06, + "loss": 0.1671, + "mean_token_accuracy": 0.9636943113803863, + "num_tokens": 5459063221.0, + "step": 51550 + }, + { + "entropy": 1.26359375, + "epoch": 1.2634053180549434, + "grad_norm": 0.01025390625, + "learning_rate": 1.9634024038315556e-06, + "loss": 0.1555, + "mean_token_accuracy": 0.9668670952320099, + "num_tokens": 5464218533.0, + "step": 51600 + }, + { + "entropy": 1.26984375, + "epoch": 1.264629548014299, + "grad_norm": 1.796875, + "learning_rate": 1.9577058269609873e-06, + "loss": 0.1677, + "mean_token_accuracy": 0.9646493744850159, + "num_tokens": 5469633751.0, + "step": 51650 + }, + { + "entropy": 1.26015625, + "epoch": 1.2658537779736545, + "grad_norm": 2.5, + "learning_rate": 1.9520135209733434e-06, + "loss": 0.1548, + "mean_token_accuracy": 0.9670298910140991, + "num_tokens": 5474658175.0, + "step": 51700 + }, + { + "entropy": 1.24671875, + "epoch": 1.2670780079330102, + "grad_norm": 2.921875, + "learning_rate": 1.9463255091932946e-06, + "loss": 0.168, + "mean_token_accuracy": 0.9642450773715973, + "num_tokens": 5480009732.0, + "step": 51750 + }, + { + "entropy": 1.25875, + "epoch": 1.2683022378923656, + "grad_norm": 2.703125, + "learning_rate": 1.9406418149279224e-06, + "loss": 0.1667, + "mean_token_accuracy": 0.9646876096725464, + "num_tokens": 5485352642.0, + "step": 51800 + }, + { + "entropy": 1.25078125, + "epoch": 1.2695264678517213, + "grad_norm": 2.40625, + "learning_rate": 1.9349624614666137e-06, + "loss": 0.1599, + "mean_token_accuracy": 0.9663380241394043, + "num_tokens": 5490516069.0, + "step": 51850 + }, + { + "entropy": 1.2540625, + "epoch": 1.270750697811077, + "grad_norm": 2.125, + "learning_rate": 1.9292874720809706e-06, + "loss": 0.1691, + "mean_token_accuracy": 0.9637067282199859, + "num_tokens": 5495858878.0, + "step": 51900 + }, + { + "entropy": 1.2459375, + "epoch": 1.2719749277704324, + "grad_norm": 2.03125, + "learning_rate": 1.9236168700247085e-06, + "loss": 0.1597, + "mean_token_accuracy": 0.9652304399013519, + "num_tokens": 5500992334.0, + "step": 51950 + }, + { + "entropy": 1.26390625, + "epoch": 1.2731991577297879, + "grad_norm": 2.40625, + "learning_rate": 1.9179506785335695e-06, + "loss": 0.1784, + "mean_token_accuracy": 0.9612833940982819, + "num_tokens": 5506364973.0, + "step": 52000 + }, + { + "entropy": 1.2540625, + "epoch": 1.2744233876891435, + "grad_norm": 3.09375, + "learning_rate": 1.912288920825224e-06, + "loss": 0.1668, + "mean_token_accuracy": 0.9639379584789276, + "num_tokens": 5511847363.0, + "step": 52050 + }, + { + "entropy": 1.26140625, + "epoch": 1.2756476176484992, + "grad_norm": 3.59375, + "learning_rate": 1.9066316200991702e-06, + "loss": 0.1739, + "mean_token_accuracy": 0.9622644722461701, + "num_tokens": 5517402202.0, + "step": 52100 + }, + { + "entropy": 1.23109375, + "epoch": 1.2768718476078547, + "grad_norm": 2.5625, + "learning_rate": 1.9009787995366464e-06, + "loss": 0.1571, + "mean_token_accuracy": 0.9665352630615235, + "num_tokens": 5522479618.0, + "step": 52150 + }, + { + "entropy": 1.2396875, + "epoch": 1.27809607756721, + "grad_norm": 1.8046875, + "learning_rate": 1.8953304823005346e-06, + "loss": 0.159, + "mean_token_accuracy": 0.965977475643158, + "num_tokens": 5527761846.0, + "step": 52200 + }, + { + "entropy": 1.24140625, + "epoch": 1.2793203075265658, + "grad_norm": 2.765625, + "learning_rate": 1.889686691535259e-06, + "loss": 0.1713, + "mean_token_accuracy": 0.9641374492645264, + "num_tokens": 5533078395.0, + "step": 52250 + }, + { + "entropy": 1.22796875, + "epoch": 1.2805445374859215, + "grad_norm": 2.0625, + "learning_rate": 1.8840474503667003e-06, + "loss": 0.1613, + "mean_token_accuracy": 0.96567800283432, + "num_tokens": 5538079639.0, + "step": 52300 + }, + { + "entropy": 1.233125, + "epoch": 1.281768767445277, + "grad_norm": 1.828125, + "learning_rate": 1.8784127819020977e-06, + "loss": 0.1696, + "mean_token_accuracy": 0.9639940130710601, + "num_tokens": 5543060468.0, + "step": 52350 + }, + { + "entropy": 1.23828125, + "epoch": 1.2829929974046324, + "grad_norm": 2.40625, + "learning_rate": 1.8727827092299486e-06, + "loss": 0.1713, + "mean_token_accuracy": 0.9634285986423492, + "num_tokens": 5548455628.0, + "step": 52400 + }, + { + "entropy": 1.2671875, + "epoch": 1.284217227363988, + "grad_norm": 1.8515625, + "learning_rate": 1.8671572554199227e-06, + "loss": 0.1745, + "mean_token_accuracy": 0.9630351853370667, + "num_tokens": 5554243712.0, + "step": 52450 + }, + { + "entropy": 1.256875, + "epoch": 1.2854414573233437, + "grad_norm": 3.09375, + "learning_rate": 1.8615364435227627e-06, + "loss": 0.1713, + "mean_token_accuracy": 0.9632880544662475, + "num_tokens": 5559645728.0, + "step": 52500 + }, + { + "entropy": 1.25578125, + "epoch": 1.2866656872826991, + "grad_norm": 2.4375, + "learning_rate": 1.8559202965701921e-06, + "loss": 0.1729, + "mean_token_accuracy": 0.9628579890727997, + "num_tokens": 5565441017.0, + "step": 52550 + }, + { + "entropy": 1.241875, + "epoch": 1.2878899172420548, + "grad_norm": 3.0625, + "learning_rate": 1.850308837574815e-06, + "loss": 0.1567, + "mean_token_accuracy": 0.9662058663368225, + "num_tokens": 5570548727.0, + "step": 52600 + }, + { + "entropy": 1.2465625, + "epoch": 1.2891141472014103, + "grad_norm": 3.671875, + "learning_rate": 1.8447020895300304e-06, + "loss": 0.1627, + "mean_token_accuracy": 0.9654901123046875, + "num_tokens": 5575812384.0, + "step": 52650 + }, + { + "entropy": 1.26609375, + "epoch": 1.290338377160766, + "grad_norm": 2.609375, + "learning_rate": 1.8391000754099329e-06, + "loss": 0.1704, + "mean_token_accuracy": 0.9641706418991088, + "num_tokens": 5581119333.0, + "step": 52700 + }, + { + "entropy": 1.25296875, + "epoch": 1.2915626071201214, + "grad_norm": 3.484375, + "learning_rate": 1.8335028181692183e-06, + "loss": 0.1591, + "mean_token_accuracy": 0.9657709896564484, + "num_tokens": 5586146551.0, + "step": 52750 + }, + { + "entropy": 1.26609375, + "epoch": 1.292786837079477, + "grad_norm": 2.15625, + "learning_rate": 1.8279103407430918e-06, + "loss": 0.1682, + "mean_token_accuracy": 0.9645370328426361, + "num_tokens": 5591535827.0, + "step": 52800 + }, + { + "entropy": 1.24609375, + "epoch": 1.2940110670388325, + "grad_norm": 3.09375, + "learning_rate": 1.822322666047173e-06, + "loss": 0.156, + "mean_token_accuracy": 0.966865359544754, + "num_tokens": 5596513224.0, + "step": 52850 + }, + { + "entropy": 1.23125, + "epoch": 1.2952352969981882, + "grad_norm": 2.59375, + "learning_rate": 1.8167398169774003e-06, + "loss": 0.1562, + "mean_token_accuracy": 0.9663991129398346, + "num_tokens": 5601409756.0, + "step": 52900 + }, + { + "entropy": 1.24203125, + "epoch": 1.2964595269575436, + "grad_norm": 2.09375, + "learning_rate": 1.8111618164099405e-06, + "loss": 0.1586, + "mean_token_accuracy": 0.965841782093048, + "num_tokens": 5606579901.0, + "step": 52950 + }, + { + "entropy": 1.25640625, + "epoch": 1.2976837569168993, + "grad_norm": 4.40625, + "learning_rate": 1.805588687201094e-06, + "loss": 0.1551, + "mean_token_accuracy": 0.9661786913871765, + "num_tokens": 5611890254.0, + "step": 53000 + }, + { + "entropy": 1.27453125, + "epoch": 1.2989079868762547, + "grad_norm": 2.9375, + "learning_rate": 1.8000204521871968e-06, + "loss": 0.1736, + "mean_token_accuracy": 0.9631719040870667, + "num_tokens": 5617317192.0, + "step": 53050 + }, + { + "entropy": 1.254375, + "epoch": 1.3001322168356104, + "grad_norm": 2.0625, + "learning_rate": 1.7944571341845338e-06, + "loss": 0.1735, + "mean_token_accuracy": 0.9628773295879364, + "num_tokens": 5622759860.0, + "step": 53100 + }, + { + "entropy": 1.251875, + "epoch": 1.301356446794966, + "grad_norm": 3.328125, + "learning_rate": 1.788898755989241e-06, + "loss": 0.1544, + "mean_token_accuracy": 0.966829891204834, + "num_tokens": 5628009830.0, + "step": 53150 + }, + { + "entropy": 1.2346875, + "epoch": 1.3025806767543215, + "grad_norm": 1.9140625, + "learning_rate": 1.7833453403772148e-06, + "loss": 0.1496, + "mean_token_accuracy": 0.9679068636894226, + "num_tokens": 5633028331.0, + "step": 53200 + }, + { + "entropy": 1.23625, + "epoch": 1.303804906713677, + "grad_norm": 2.765625, + "learning_rate": 1.7777969101040137e-06, + "loss": 0.1598, + "mean_token_accuracy": 0.9658224785327911, + "num_tokens": 5638192081.0, + "step": 53250 + }, + { + "entropy": 1.25921875, + "epoch": 1.3050291366730327, + "grad_norm": 1.765625, + "learning_rate": 1.7722534879047704e-06, + "loss": 0.1679, + "mean_token_accuracy": 0.9648814105987549, + "num_tokens": 5643678649.0, + "step": 53300 + }, + { + "entropy": 1.23703125, + "epoch": 1.3062533666323883, + "grad_norm": 2.171875, + "learning_rate": 1.7667150964940981e-06, + "loss": 0.1542, + "mean_token_accuracy": 0.9665197932720184, + "num_tokens": 5648865610.0, + "step": 53350 + }, + { + "entropy": 1.2546875, + "epoch": 1.3074775965917438, + "grad_norm": 3.46875, + "learning_rate": 1.7611817585659915e-06, + "loss": 0.1695, + "mean_token_accuracy": 0.96389883518219, + "num_tokens": 5654452208.0, + "step": 53400 + }, + { + "entropy": 1.23046875, + "epoch": 1.3087018265510992, + "grad_norm": 2.9375, + "learning_rate": 1.7556534967937428e-06, + "loss": 0.1477, + "mean_token_accuracy": 0.967578010559082, + "num_tokens": 5659553855.0, + "step": 53450 + }, + { + "entropy": 1.2696875, + "epoch": 1.309926056510455, + "grad_norm": 2.71875, + "learning_rate": 1.750130333829843e-06, + "loss": 0.174, + "mean_token_accuracy": 0.9626197755336762, + "num_tokens": 5665208689.0, + "step": 53500 + }, + { + "entropy": 1.230625, + "epoch": 1.3111502864698106, + "grad_norm": 2.265625, + "learning_rate": 1.744612292305887e-06, + "loss": 0.1488, + "mean_token_accuracy": 0.9678456223011017, + "num_tokens": 5670219320.0, + "step": 53550 + }, + { + "entropy": 1.26109375, + "epoch": 1.312374516429166, + "grad_norm": 3.46875, + "learning_rate": 1.73909939483249e-06, + "loss": 0.176, + "mean_token_accuracy": 0.9616470074653626, + "num_tokens": 5676005681.0, + "step": 53600 + }, + { + "entropy": 1.23359375, + "epoch": 1.3135987463885217, + "grad_norm": 3.46875, + "learning_rate": 1.7335916639991833e-06, + "loss": 0.1579, + "mean_token_accuracy": 0.9656192350387574, + "num_tokens": 5680838804.0, + "step": 53650 + }, + { + "entropy": 1.24828125, + "epoch": 1.3148229763478771, + "grad_norm": 3.25, + "learning_rate": 1.7280891223743347e-06, + "loss": 0.1663, + "mean_token_accuracy": 0.9647430288791656, + "num_tokens": 5686118856.0, + "step": 53700 + }, + { + "entropy": 1.25203125, + "epoch": 1.3160472063072328, + "grad_norm": 1.7890625, + "learning_rate": 1.7225917925050384e-06, + "loss": 0.1808, + "mean_token_accuracy": 0.9617255198955535, + "num_tokens": 5691606584.0, + "step": 53750 + }, + { + "entropy": 1.23875, + "epoch": 1.3172714362665883, + "grad_norm": 2.796875, + "learning_rate": 1.7170996969170434e-06, + "loss": 0.1643, + "mean_token_accuracy": 0.9644413828849793, + "num_tokens": 5697025528.0, + "step": 53800 + }, + { + "entropy": 1.23609375, + "epoch": 1.318495666225944, + "grad_norm": 3.0, + "learning_rate": 1.7116128581146443e-06, + "loss": 0.1579, + "mean_token_accuracy": 0.9660075342655182, + "num_tokens": 5702129646.0, + "step": 53850 + }, + { + "entropy": 1.239375, + "epoch": 1.3197198961852994, + "grad_norm": 2.46875, + "learning_rate": 1.7061312985805986e-06, + "loss": 0.1659, + "mean_token_accuracy": 0.9642334473133087, + "num_tokens": 5707290385.0, + "step": 53900 + }, + { + "entropy": 1.23515625, + "epoch": 1.320944126144655, + "grad_norm": 3.421875, + "learning_rate": 1.7006550407760285e-06, + "loss": 0.1636, + "mean_token_accuracy": 0.9647632312774658, + "num_tokens": 5712555849.0, + "step": 53950 + }, + { + "entropy": 1.2396875, + "epoch": 1.3221683561040105, + "grad_norm": 3.71875, + "learning_rate": 1.695184107140337e-06, + "loss": 0.1682, + "mean_token_accuracy": 0.9639084780216217, + "num_tokens": 5717928890.0, + "step": 54000 + }, + { + "entropy": 1.2246875, + "epoch": 1.3233925860633662, + "grad_norm": 3.921875, + "learning_rate": 1.6897185200911068e-06, + "loss": 0.1468, + "mean_token_accuracy": 0.9690938425064087, + "num_tokens": 5722987021.0, + "step": 54050 + }, + { + "entropy": 1.2565625, + "epoch": 1.3246168160227216, + "grad_norm": 2.875, + "learning_rate": 1.6842583020240137e-06, + "loss": 0.166, + "mean_token_accuracy": 0.9647270548343658, + "num_tokens": 5728523665.0, + "step": 54100 + }, + { + "entropy": 1.2253125, + "epoch": 1.3258410459820773, + "grad_norm": 3.046875, + "learning_rate": 1.6788034753127332e-06, + "loss": 0.1509, + "mean_token_accuracy": 0.9676713216304779, + "num_tokens": 5733724051.0, + "step": 54150 + }, + { + "entropy": 1.2478125, + "epoch": 1.327065275941433, + "grad_norm": 1.7890625, + "learning_rate": 1.6733540623088485e-06, + "loss": 0.1703, + "mean_token_accuracy": 0.9635128057003022, + "num_tokens": 5739544907.0, + "step": 54200 + }, + { + "entropy": 1.245, + "epoch": 1.3282895059007884, + "grad_norm": 2.25, + "learning_rate": 1.6679100853417647e-06, + "loss": 0.1592, + "mean_token_accuracy": 0.9656123912334442, + "num_tokens": 5744896935.0, + "step": 54250 + }, + { + "entropy": 1.25453125, + "epoch": 1.3295137358601439, + "grad_norm": 2.53125, + "learning_rate": 1.6624715667186047e-06, + "loss": 0.1756, + "mean_token_accuracy": 0.962364639043808, + "num_tokens": 5750164763.0, + "step": 54300 + }, + { + "entropy": 1.23609375, + "epoch": 1.3307379658194995, + "grad_norm": 3.15625, + "learning_rate": 1.6570385287241335e-06, + "loss": 0.1577, + "mean_token_accuracy": 0.9660208249092102, + "num_tokens": 5755265140.0, + "step": 54350 + }, + { + "entropy": 1.25390625, + "epoch": 1.3319621957788552, + "grad_norm": 1.640625, + "learning_rate": 1.6516109936206498e-06, + "loss": 0.1756, + "mean_token_accuracy": 0.9626241695880889, + "num_tokens": 5760623089.0, + "step": 54400 + }, + { + "entropy": 1.246875, + "epoch": 1.3331864257382107, + "grad_norm": 2.125, + "learning_rate": 1.646188983647912e-06, + "loss": 0.1734, + "mean_token_accuracy": 0.9631841456890107, + "num_tokens": 5766177496.0, + "step": 54450 + }, + { + "entropy": 1.26140625, + "epoch": 1.3344106556975661, + "grad_norm": 2.921875, + "learning_rate": 1.6407725210230344e-06, + "loss": 0.1766, + "mean_token_accuracy": 0.9622941052913666, + "num_tokens": 5771692920.0, + "step": 54500 + }, + { + "entropy": 1.2415625, + "epoch": 1.3356348856569218, + "grad_norm": 4.0, + "learning_rate": 1.6353616279404013e-06, + "loss": 0.1569, + "mean_token_accuracy": 0.9662493073940277, + "num_tokens": 5777098724.0, + "step": 54550 + }, + { + "entropy": 1.23234375, + "epoch": 1.3368591156162775, + "grad_norm": 1.0, + "learning_rate": 1.6299563265715747e-06, + "loss": 0.148, + "mean_token_accuracy": 0.9682403624057769, + "num_tokens": 5782119917.0, + "step": 54600 + }, + { + "entropy": 1.25578125, + "epoch": 1.338083345575633, + "grad_norm": 3.46875, + "learning_rate": 1.624556639065207e-06, + "loss": 0.1594, + "mean_token_accuracy": 0.9662695753574372, + "num_tokens": 5787291101.0, + "step": 54650 + }, + { + "entropy": 1.24171875, + "epoch": 1.3393075755349884, + "grad_norm": 3.609375, + "learning_rate": 1.6191625875469446e-06, + "loss": 0.157, + "mean_token_accuracy": 0.9663849449157715, + "num_tokens": 5792520283.0, + "step": 54700 + }, + { + "entropy": 1.25046875, + "epoch": 1.340531805494344, + "grad_norm": 1.7734375, + "learning_rate": 1.6137741941193398e-06, + "loss": 0.1495, + "mean_token_accuracy": 0.9671278047561646, + "num_tokens": 5797431576.0, + "step": 54750 + }, + { + "entropy": 1.26546875, + "epoch": 1.3417560354536997, + "grad_norm": 2.734375, + "learning_rate": 1.6083914808617645e-06, + "loss": 0.1765, + "mean_token_accuracy": 0.9622493016719819, + "num_tokens": 5803286714.0, + "step": 54800 + }, + { + "entropy": 1.224375, + "epoch": 1.3429802654130552, + "grad_norm": 3.109375, + "learning_rate": 1.6030144698303079e-06, + "loss": 0.1544, + "mean_token_accuracy": 0.9669049537181854, + "num_tokens": 5807862828.0, + "step": 54850 + }, + { + "entropy": 1.26, + "epoch": 1.3442044953724108, + "grad_norm": 3.0, + "learning_rate": 1.5976431830577022e-06, + "loss": 0.1636, + "mean_token_accuracy": 0.964913833141327, + "num_tokens": 5813034358.0, + "step": 54900 + }, + { + "entropy": 1.25109375, + "epoch": 1.3454287253317663, + "grad_norm": 3.515625, + "learning_rate": 1.5922776425532186e-06, + "loss": 0.1659, + "mean_token_accuracy": 0.9639725112915039, + "num_tokens": 5818413943.0, + "step": 54950 + }, + { + "entropy": 1.2321875, + "epoch": 1.346652955291122, + "grad_norm": 2.453125, + "learning_rate": 1.5869178703025869e-06, + "loss": 0.1489, + "mean_token_accuracy": 0.9674529373645783, + "num_tokens": 5823085402.0, + "step": 55000 + }, + { + "entropy": 1.23859375, + "epoch": 1.3478771852504774, + "grad_norm": 1.7109375, + "learning_rate": 1.5815638882678944e-06, + "loss": 0.1608, + "mean_token_accuracy": 0.9654952967166901, + "num_tokens": 5828359072.0, + "step": 55050 + }, + { + "entropy": 1.2465625, + "epoch": 1.349101415209833, + "grad_norm": 2.8125, + "learning_rate": 1.5762157183875092e-06, + "loss": 0.1618, + "mean_token_accuracy": 0.965077908039093, + "num_tokens": 5833897215.0, + "step": 55100 + }, + { + "entropy": 1.24125, + "epoch": 1.3503256451691885, + "grad_norm": 2.25, + "learning_rate": 1.5708733825759804e-06, + "loss": 0.1597, + "mean_token_accuracy": 0.9658141255378723, + "num_tokens": 5839005187.0, + "step": 55150 + }, + { + "entropy": 1.2690625, + "epoch": 1.3515498751285442, + "grad_norm": 3.0625, + "learning_rate": 1.5655369027239507e-06, + "loss": 0.1728, + "mean_token_accuracy": 0.9630602359771728, + "num_tokens": 5844499544.0, + "step": 55200 + }, + { + "entropy": 1.2484375, + "epoch": 1.3527741050878996, + "grad_norm": 2.734375, + "learning_rate": 1.5602063006980713e-06, + "loss": 0.1606, + "mean_token_accuracy": 0.9662463283538818, + "num_tokens": 5849831304.0, + "step": 55250 + }, + { + "entropy": 1.243125, + "epoch": 1.3539983350472553, + "grad_norm": 4.125, + "learning_rate": 1.5548815983409054e-06, + "loss": 0.1584, + "mean_token_accuracy": 0.9648811197280884, + "num_tokens": 5854831384.0, + "step": 55300 + }, + { + "entropy": 1.2475, + "epoch": 1.3552225650066108, + "grad_norm": 0.0169677734375, + "learning_rate": 1.5495628174708422e-06, + "loss": 0.1583, + "mean_token_accuracy": 0.9666490364074707, + "num_tokens": 5860380821.0, + "step": 55350 + }, + { + "entropy": 1.2428125, + "epoch": 1.3564467949659664, + "grad_norm": 2.96875, + "learning_rate": 1.5442499798820062e-06, + "loss": 0.1636, + "mean_token_accuracy": 0.9649770343303681, + "num_tokens": 5865590076.0, + "step": 55400 + }, + { + "entropy": 1.25265625, + "epoch": 1.357671024925322, + "grad_norm": 5.0625, + "learning_rate": 1.5389431073441742e-06, + "loss": 0.1625, + "mean_token_accuracy": 0.9651528835296631, + "num_tokens": 5870893580.0, + "step": 55450 + }, + { + "entropy": 1.2590625, + "epoch": 1.3588952548846776, + "grad_norm": 3.03125, + "learning_rate": 1.5336422216026717e-06, + "loss": 0.1708, + "mean_token_accuracy": 0.9625674414634705, + "num_tokens": 5876137820.0, + "step": 55500 + }, + { + "entropy": 1.24515625, + "epoch": 1.360119484844033, + "grad_norm": 3.046875, + "learning_rate": 1.5283473443783021e-06, + "loss": 0.1575, + "mean_token_accuracy": 0.9658649146556855, + "num_tokens": 5881136105.0, + "step": 55550 + }, + { + "entropy": 1.2434375, + "epoch": 1.3613437148033887, + "grad_norm": 3.015625, + "learning_rate": 1.5230584973672404e-06, + "loss": 0.1716, + "mean_token_accuracy": 0.9642657494544983, + "num_tokens": 5886333380.0, + "step": 55600 + }, + { + "entropy": 1.2628125, + "epoch": 1.3625679447627443, + "grad_norm": 1.6640625, + "learning_rate": 1.5177757022409606e-06, + "loss": 0.1788, + "mean_token_accuracy": 0.9618762648105621, + "num_tokens": 5892147042.0, + "step": 55650 + }, + { + "entropy": 1.25875, + "epoch": 1.3637921747220998, + "grad_norm": 3.0625, + "learning_rate": 1.5124989806461293e-06, + "loss": 0.1678, + "mean_token_accuracy": 0.9644319689273835, + "num_tokens": 5897583102.0, + "step": 55700 + }, + { + "entropy": 1.2546875, + "epoch": 1.3650164046814552, + "grad_norm": 2.453125, + "learning_rate": 1.5072283542045348e-06, + "loss": 0.1558, + "mean_token_accuracy": 0.9658961379528046, + "num_tokens": 5902701860.0, + "step": 55750 + }, + { + "entropy": 1.2584375, + "epoch": 1.366240634640811, + "grad_norm": 3.03125, + "learning_rate": 1.5019638445129849e-06, + "loss": 0.1656, + "mean_token_accuracy": 0.9642118716239929, + "num_tokens": 5908066266.0, + "step": 55800 + }, + { + "entropy": 1.24375, + "epoch": 1.3674648646001666, + "grad_norm": 1.8203125, + "learning_rate": 1.496705473143224e-06, + "loss": 0.1467, + "mean_token_accuracy": 0.9683407878875733, + "num_tokens": 5913106858.0, + "step": 55850 + }, + { + "entropy": 1.24109375, + "epoch": 1.368689094559522, + "grad_norm": 2.65625, + "learning_rate": 1.4914532616418477e-06, + "loss": 0.1619, + "mean_token_accuracy": 0.9651940071582794, + "num_tokens": 5918299911.0, + "step": 55900 + }, + { + "entropy": 1.24296875, + "epoch": 1.3699133245188777, + "grad_norm": 3.015625, + "learning_rate": 1.486207231530207e-06, + "loss": 0.1533, + "mean_token_accuracy": 0.966886637210846, + "num_tokens": 5923373367.0, + "step": 55950 + }, + { + "entropy": 1.25984375, + "epoch": 1.3711375544782332, + "grad_norm": 2.21875, + "learning_rate": 1.4809674043043262e-06, + "loss": 0.1714, + "mean_token_accuracy": 0.9631552195549011, + "num_tokens": 5928830248.0, + "step": 56000 + }, + { + "entropy": 1.24640625, + "epoch": 1.3723617844375888, + "grad_norm": 2.84375, + "learning_rate": 1.4757338014348108e-06, + "loss": 0.17, + "mean_token_accuracy": 0.9638724672794342, + "num_tokens": 5934360325.0, + "step": 56050 + }, + { + "entropy": 1.2428125, + "epoch": 1.3735860143969443, + "grad_norm": 2.296875, + "learning_rate": 1.4705064443667672e-06, + "loss": 0.1672, + "mean_token_accuracy": 0.9640205073356628, + "num_tokens": 5939749032.0, + "step": 56100 + }, + { + "entropy": 1.2396875, + "epoch": 1.3748102443563, + "grad_norm": 3.140625, + "learning_rate": 1.4652853545196994e-06, + "loss": 0.1698, + "mean_token_accuracy": 0.9635356509685516, + "num_tokens": 5944946908.0, + "step": 56150 + }, + { + "entropy": 1.2471875, + "epoch": 1.3760344743156554, + "grad_norm": 1.9765625, + "learning_rate": 1.4600705532874409e-06, + "loss": 0.1612, + "mean_token_accuracy": 0.9657069194316864, + "num_tokens": 5950153678.0, + "step": 56200 + }, + { + "entropy": 1.2515625, + "epoch": 1.377258704275011, + "grad_norm": 1.6953125, + "learning_rate": 1.45486206203805e-06, + "loss": 0.1694, + "mean_token_accuracy": 0.9643088591098785, + "num_tokens": 5955488321.0, + "step": 56250 + }, + { + "entropy": 1.24984375, + "epoch": 1.3784829342343665, + "grad_norm": 2.296875, + "learning_rate": 1.4496599021137346e-06, + "loss": 0.1802, + "mean_token_accuracy": 0.9621450281143189, + "num_tokens": 5961263793.0, + "step": 56300 + }, + { + "entropy": 1.22625, + "epoch": 1.3797071641937222, + "grad_norm": 2.203125, + "learning_rate": 1.4444640948307554e-06, + "loss": 0.1567, + "mean_token_accuracy": 0.9664753973484039, + "num_tokens": 5966590895.0, + "step": 56350 + }, + { + "entropy": 1.2453125, + "epoch": 1.3809313941530776, + "grad_norm": 1.9921875, + "learning_rate": 1.4392746614793446e-06, + "loss": 0.162, + "mean_token_accuracy": 0.9654717576503754, + "num_tokens": 5972160004.0, + "step": 56400 + }, + { + "entropy": 1.24125, + "epoch": 1.3821556241124333, + "grad_norm": 2.90625, + "learning_rate": 1.4340916233236167e-06, + "loss": 0.1685, + "mean_token_accuracy": 0.9643662881851196, + "num_tokens": 5977855909.0, + "step": 56450 + }, + { + "entropy": 1.2490625, + "epoch": 1.383379854071789, + "grad_norm": 1.6171875, + "learning_rate": 1.4289150016014792e-06, + "loss": 0.1663, + "mean_token_accuracy": 0.9650551450252532, + "num_tokens": 5983284719.0, + "step": 56500 + }, + { + "entropy": 1.245, + "epoch": 1.3846040840311444, + "grad_norm": 1.8046875, + "learning_rate": 1.4237448175245523e-06, + "loss": 0.1565, + "mean_token_accuracy": 0.9658044958114624, + "num_tokens": 5988559128.0, + "step": 56550 + }, + { + "entropy": 1.24140625, + "epoch": 1.3858283139904999, + "grad_norm": 2.234375, + "learning_rate": 1.4185810922780736e-06, + "loss": 0.1665, + "mean_token_accuracy": 0.9643181717395782, + "num_tokens": 5993939256.0, + "step": 56600 + }, + { + "entropy": 1.245, + "epoch": 1.3870525439498556, + "grad_norm": 2.796875, + "learning_rate": 1.413423847020816e-06, + "loss": 0.1721, + "mean_token_accuracy": 0.963967101573944, + "num_tokens": 5999401709.0, + "step": 56650 + }, + { + "entropy": 1.23953125, + "epoch": 1.3882767739092112, + "grad_norm": 3.203125, + "learning_rate": 1.4082731028849995e-06, + "loss": 0.1636, + "mean_token_accuracy": 0.9649562358856201, + "num_tokens": 6004763257.0, + "step": 56700 + }, + { + "entropy": 1.263125, + "epoch": 1.3895010038685667, + "grad_norm": 2.3125, + "learning_rate": 1.4031288809762096e-06, + "loss": 0.1734, + "mean_token_accuracy": 0.9629300630092621, + "num_tokens": 6010451639.0, + "step": 56750 + }, + { + "entropy": 1.23171875, + "epoch": 1.3907252338279221, + "grad_norm": 2.734375, + "learning_rate": 1.397991202373298e-06, + "loss": 0.16, + "mean_token_accuracy": 0.9664403641223908, + "num_tokens": 6015769794.0, + "step": 56800 + }, + { + "entropy": 1.24171875, + "epoch": 1.3919494637872778, + "grad_norm": 1.71875, + "learning_rate": 1.3928600881283135e-06, + "loss": 0.1741, + "mean_token_accuracy": 0.9627274203300477, + "num_tokens": 6020957098.0, + "step": 56850 + }, + { + "entropy": 1.2315625, + "epoch": 1.3931736937466335, + "grad_norm": 0.00994873046875, + "learning_rate": 1.3877355592664005e-06, + "loss": 0.1509, + "mean_token_accuracy": 0.9681152474880218, + "num_tokens": 6026298682.0, + "step": 56900 + }, + { + "entropy": 1.24703125, + "epoch": 1.394397923705989, + "grad_norm": 2.09375, + "learning_rate": 1.3826176367857244e-06, + "loss": 0.1599, + "mean_token_accuracy": 0.9659165751934051, + "num_tokens": 6031577635.0, + "step": 56950 + }, + { + "entropy": 1.23828125, + "epoch": 1.3956221536653444, + "grad_norm": 3.734375, + "learning_rate": 1.3775063416573772e-06, + "loss": 0.1602, + "mean_token_accuracy": 0.9653304886817932, + "num_tokens": 6036759854.0, + "step": 57000 + }, + { + "entropy": 1.23265625, + "epoch": 1.3968463836247, + "grad_norm": 1.6875, + "learning_rate": 1.3724016948252932e-06, + "loss": 0.1561, + "mean_token_accuracy": 0.9671315121650695, + "num_tokens": 6042005844.0, + "step": 57050 + }, + { + "entropy": 1.245, + "epoch": 1.3980706135840557, + "grad_norm": 4.125, + "learning_rate": 1.3673037172061715e-06, + "loss": 0.1645, + "mean_token_accuracy": 0.9652763676643371, + "num_tokens": 6047109956.0, + "step": 57100 + }, + { + "entropy": 1.23859375, + "epoch": 1.3992948435434112, + "grad_norm": 3.53125, + "learning_rate": 1.362212429689374e-06, + "loss": 0.1638, + "mean_token_accuracy": 0.9652803325653077, + "num_tokens": 6052155256.0, + "step": 57150 + }, + { + "entropy": 1.270625, + "epoch": 1.4005190735027668, + "grad_norm": 2.140625, + "learning_rate": 1.3571278531368583e-06, + "loss": 0.1746, + "mean_token_accuracy": 0.9618336653709412, + "num_tokens": 6057754576.0, + "step": 57200 + }, + { + "entropy": 1.25484375, + "epoch": 1.4017433034621223, + "grad_norm": 1.7109375, + "learning_rate": 1.3520500083830786e-06, + "loss": 0.1611, + "mean_token_accuracy": 0.9656724345684051, + "num_tokens": 6063117197.0, + "step": 57250 + }, + { + "entropy": 1.25125, + "epoch": 1.402967533421478, + "grad_norm": 2.5625, + "learning_rate": 1.346978916234905e-06, + "loss": 0.1737, + "mean_token_accuracy": 0.9628279542922974, + "num_tokens": 6068604024.0, + "step": 57300 + }, + { + "entropy": 1.25015625, + "epoch": 1.4041917633808334, + "grad_norm": 3.34375, + "learning_rate": 1.3419145974715394e-06, + "loss": 0.1561, + "mean_token_accuracy": 0.9659430325031281, + "num_tokens": 6073902078.0, + "step": 57350 + }, + { + "entropy": 1.26703125, + "epoch": 1.405415993340189, + "grad_norm": 3.859375, + "learning_rate": 1.3368570728444298e-06, + "loss": 0.1718, + "mean_token_accuracy": 0.9625124716758728, + "num_tokens": 6079405655.0, + "step": 57400 + }, + { + "entropy": 1.2446875, + "epoch": 1.4066402232995445, + "grad_norm": 3.828125, + "learning_rate": 1.331806363077184e-06, + "loss": 0.1662, + "mean_token_accuracy": 0.9648419404029847, + "num_tokens": 6084626144.0, + "step": 57450 + }, + { + "entropy": 1.23234375, + "epoch": 1.4078644532589002, + "grad_norm": 2.5625, + "learning_rate": 1.3267624888654835e-06, + "loss": 0.1479, + "mean_token_accuracy": 0.9676874935626983, + "num_tokens": 6089664069.0, + "step": 57500 + }, + { + "entropy": 1.255625, + "epoch": 1.4090886832182556, + "grad_norm": 2.609375, + "learning_rate": 1.3217254708770053e-06, + "loss": 0.1648, + "mean_token_accuracy": 0.964464715719223, + "num_tokens": 6095025878.0, + "step": 57550 + }, + { + "entropy": 1.25140625, + "epoch": 1.4103129131776113, + "grad_norm": 2.34375, + "learning_rate": 1.3166953297513275e-06, + "loss": 0.1638, + "mean_token_accuracy": 0.9649744808673859, + "num_tokens": 6100414900.0, + "step": 57600 + }, + { + "entropy": 1.24765625, + "epoch": 1.4115371431369668, + "grad_norm": 0.0166015625, + "learning_rate": 1.311672086099852e-06, + "loss": 0.1621, + "mean_token_accuracy": 0.9656559634208679, + "num_tokens": 6105532948.0, + "step": 57650 + }, + { + "entropy": 1.2375, + "epoch": 1.4127613730963224, + "grad_norm": 2.671875, + "learning_rate": 1.3066557605057167e-06, + "loss": 0.1633, + "mean_token_accuracy": 0.9653026688098908, + "num_tokens": 6110851956.0, + "step": 57700 + }, + { + "entropy": 1.26578125, + "epoch": 1.413985603055678, + "grad_norm": 1.9921875, + "learning_rate": 1.3016463735237164e-06, + "loss": 0.1721, + "mean_token_accuracy": 0.9625765991210937, + "num_tokens": 6116317682.0, + "step": 57750 + }, + { + "entropy": 1.2565625, + "epoch": 1.4152098330150336, + "grad_norm": 2.03125, + "learning_rate": 1.2966439456802059e-06, + "loss": 0.1742, + "mean_token_accuracy": 0.9632444334030151, + "num_tokens": 6122164130.0, + "step": 57800 + }, + { + "entropy": 1.24140625, + "epoch": 1.416434062974389, + "grad_norm": 2.234375, + "learning_rate": 1.2916484974730335e-06, + "loss": 0.1672, + "mean_token_accuracy": 0.9641308975219727, + "num_tokens": 6127574306.0, + "step": 57850 + }, + { + "entropy": 1.246875, + "epoch": 1.4176582929337447, + "grad_norm": 2.6875, + "learning_rate": 1.2866600493714425e-06, + "loss": 0.1725, + "mean_token_accuracy": 0.9628300058841706, + "num_tokens": 6133295960.0, + "step": 57900 + }, + { + "entropy": 1.25625, + "epoch": 1.4188825228931004, + "grad_norm": 2.546875, + "learning_rate": 1.281678621815994e-06, + "loss": 0.1727, + "mean_token_accuracy": 0.9640992879867554, + "num_tokens": 6138729294.0, + "step": 57950 + }, + { + "entropy": 1.21765625, + "epoch": 1.4201067528524558, + "grad_norm": 3.34375, + "learning_rate": 1.276704235218481e-06, + "loss": 0.1483, + "mean_token_accuracy": 0.9675537276268006, + "num_tokens": 6143658701.0, + "step": 58000 + }, + { + "entropy": 1.24703125, + "epoch": 1.4213309828118112, + "grad_norm": 3.359375, + "learning_rate": 1.2717369099618487e-06, + "loss": 0.168, + "mean_token_accuracy": 0.9638211143016815, + "num_tokens": 6148836685.0, + "step": 58050 + }, + { + "entropy": 1.2190625, + "epoch": 1.422555212771167, + "grad_norm": 3.296875, + "learning_rate": 1.2667766664001044e-06, + "loss": 0.1527, + "mean_token_accuracy": 0.9670968425273895, + "num_tokens": 6153703845.0, + "step": 58100 + }, + { + "entropy": 1.23734375, + "epoch": 1.4237794427305226, + "grad_norm": 2.15625, + "learning_rate": 1.2618235248582383e-06, + "loss": 0.1583, + "mean_token_accuracy": 0.9668286955356598, + "num_tokens": 6158817391.0, + "step": 58150 + }, + { + "entropy": 1.23171875, + "epoch": 1.425003672689878, + "grad_norm": 3.28125, + "learning_rate": 1.2568775056321422e-06, + "loss": 0.1593, + "mean_token_accuracy": 0.9661485147476196, + "num_tokens": 6163833832.0, + "step": 58200 + }, + { + "entropy": 1.24328125, + "epoch": 1.4262279026492337, + "grad_norm": 1.8515625, + "learning_rate": 1.25193862898852e-06, + "loss": 0.1737, + "mean_token_accuracy": 0.9620695877075195, + "num_tokens": 6169273441.0, + "step": 58250 + }, + { + "entropy": 1.2259375, + "epoch": 1.4274521326085892, + "grad_norm": 4.15625, + "learning_rate": 1.2470069151648105e-06, + "loss": 0.1605, + "mean_token_accuracy": 0.964862027168274, + "num_tokens": 6174358443.0, + "step": 58300 + }, + { + "entropy": 1.23609375, + "epoch": 1.4286763625679448, + "grad_norm": 3.1875, + "learning_rate": 1.2420823843691005e-06, + "loss": 0.1665, + "mean_token_accuracy": 0.9651170766353607, + "num_tokens": 6179906475.0, + "step": 58350 + }, + { + "entropy": 1.2340625, + "epoch": 1.4299005925273003, + "grad_norm": 2.90625, + "learning_rate": 1.2371650567800477e-06, + "loss": 0.1489, + "mean_token_accuracy": 0.967512333393097, + "num_tokens": 6184768923.0, + "step": 58400 + }, + { + "entropy": 1.250625, + "epoch": 1.431124822486656, + "grad_norm": 3.484375, + "learning_rate": 1.2322549525467878e-06, + "loss": 0.1697, + "mean_token_accuracy": 0.9635206353664398, + "num_tokens": 6190151181.0, + "step": 58450 + }, + { + "entropy": 1.23453125, + "epoch": 1.4323490524460114, + "grad_norm": 3.65625, + "learning_rate": 1.2273520917888645e-06, + "loss": 0.1624, + "mean_token_accuracy": 0.9650914788246154, + "num_tokens": 6195374468.0, + "step": 58500 + }, + { + "entropy": 1.24296875, + "epoch": 1.433573282405367, + "grad_norm": 2.046875, + "learning_rate": 1.2224564945961372e-06, + "loss": 0.1738, + "mean_token_accuracy": 0.9630816507339478, + "num_tokens": 6200703908.0, + "step": 58550 + }, + { + "entropy": 1.21984375, + "epoch": 1.4347975123647225, + "grad_norm": 2.96875, + "learning_rate": 1.2175681810287018e-06, + "loss": 0.142, + "mean_token_accuracy": 0.96914306640625, + "num_tokens": 6205730956.0, + "step": 58600 + }, + { + "entropy": 1.24125, + "epoch": 1.4360217423240782, + "grad_norm": 3.109375, + "learning_rate": 1.2126871711168126e-06, + "loss": 0.1744, + "mean_token_accuracy": 0.9625077545642853, + "num_tokens": 6211224150.0, + "step": 58650 + }, + { + "entropy": 1.23828125, + "epoch": 1.4372459722834336, + "grad_norm": 2.890625, + "learning_rate": 1.2078134848607935e-06, + "loss": 0.1578, + "mean_token_accuracy": 0.9665833008289337, + "num_tokens": 6216480413.0, + "step": 58700 + }, + { + "entropy": 1.22734375, + "epoch": 1.4384702022427893, + "grad_norm": 2.328125, + "learning_rate": 1.2029471422309593e-06, + "loss": 0.1592, + "mean_token_accuracy": 0.9655974650382996, + "num_tokens": 6221594113.0, + "step": 58750 + }, + { + "entropy": 1.2396875, + "epoch": 1.4396944322021448, + "grad_norm": 3.15625, + "learning_rate": 1.1980881631675338e-06, + "loss": 0.1642, + "mean_token_accuracy": 0.9646211445331574, + "num_tokens": 6226912535.0, + "step": 58800 + }, + { + "entropy": 1.2421875, + "epoch": 1.4409186621615004, + "grad_norm": 2.953125, + "learning_rate": 1.1932365675805704e-06, + "loss": 0.1704, + "mean_token_accuracy": 0.9632949602603912, + "num_tokens": 6232510565.0, + "step": 58850 + }, + { + "entropy": 1.2271875, + "epoch": 1.442142892120856, + "grad_norm": 2.5625, + "learning_rate": 1.1883923753498652e-06, + "loss": 0.1629, + "mean_token_accuracy": 0.9651079893112182, + "num_tokens": 6237750599.0, + "step": 58900 + }, + { + "entropy": 1.235, + "epoch": 1.4433671220802116, + "grad_norm": 2.4375, + "learning_rate": 1.1835556063248796e-06, + "loss": 0.157, + "mean_token_accuracy": 0.9665428209304809, + "num_tokens": 6243089430.0, + "step": 58950 + }, + { + "entropy": 1.22171875, + "epoch": 1.4445913520395672, + "grad_norm": 1.9453125, + "learning_rate": 1.1787262803246568e-06, + "loss": 0.159, + "mean_token_accuracy": 0.9651802563667298, + "num_tokens": 6248152093.0, + "step": 59000 + }, + { + "entropy": 1.2453125, + "epoch": 1.4458155819989227, + "grad_norm": 2.078125, + "learning_rate": 1.1739044171377455e-06, + "loss": 0.1685, + "mean_token_accuracy": 0.963554357290268, + "num_tokens": 6253653648.0, + "step": 59050 + }, + { + "entropy": 1.24859375, + "epoch": 1.4470398119582781, + "grad_norm": 2.171875, + "learning_rate": 1.1690900365221082e-06, + "loss": 0.1675, + "mean_token_accuracy": 0.9636942827701569, + "num_tokens": 6259395328.0, + "step": 59100 + }, + { + "entropy": 1.23515625, + "epoch": 1.4482640419176338, + "grad_norm": 1.9609375, + "learning_rate": 1.164283158205053e-06, + "loss": 0.163, + "mean_token_accuracy": 0.9648255848884583, + "num_tokens": 6264597318.0, + "step": 59150 + }, + { + "entropy": 1.22296875, + "epoch": 1.4494882718769895, + "grad_norm": 2.890625, + "learning_rate": 1.1594838018831444e-06, + "loss": 0.1506, + "mean_token_accuracy": 0.9675889956951141, + "num_tokens": 6269482590.0, + "step": 59200 + }, + { + "entropy": 1.26875, + "epoch": 1.450712501836345, + "grad_norm": 3.71875, + "learning_rate": 1.1546919872221238e-06, + "loss": 0.1858, + "mean_token_accuracy": 0.9605572533607483, + "num_tokens": 6275753206.0, + "step": 59250 + }, + { + "entropy": 1.235625, + "epoch": 1.4519367317957004, + "grad_norm": 3.34375, + "learning_rate": 1.1499077338568329e-06, + "loss": 0.1589, + "mean_token_accuracy": 0.9655532228946686, + "num_tokens": 6281061992.0, + "step": 59300 + }, + { + "entropy": 1.2371875, + "epoch": 1.453160961755056, + "grad_norm": 2.09375, + "learning_rate": 1.1451310613911282e-06, + "loss": 0.1668, + "mean_token_accuracy": 0.9643084633350373, + "num_tokens": 6286356933.0, + "step": 59350 + }, + { + "entropy": 1.22546875, + "epoch": 1.4543851917144117, + "grad_norm": 0.0068359375, + "learning_rate": 1.1403619893978035e-06, + "loss": 0.1536, + "mean_token_accuracy": 0.9669885611534119, + "num_tokens": 6291298254.0, + "step": 59400 + }, + { + "entropy": 1.22484375, + "epoch": 1.4556094216737672, + "grad_norm": 2.734375, + "learning_rate": 1.1356005374185075e-06, + "loss": 0.1541, + "mean_token_accuracy": 0.9667747104167939, + "num_tokens": 6296386141.0, + "step": 59450 + }, + { + "entropy": 1.233125, + "epoch": 1.4568336516331228, + "grad_norm": 2.890625, + "learning_rate": 1.1308467249636693e-06, + "loss": 0.1546, + "mean_token_accuracy": 0.9666030180454254, + "num_tokens": 6301578433.0, + "step": 59500 + }, + { + "entropy": 1.228125, + "epoch": 1.4580578815924783, + "grad_norm": 2.171875, + "learning_rate": 1.1261005715124106e-06, + "loss": 0.1653, + "mean_token_accuracy": 0.9642830669879914, + "num_tokens": 6306834089.0, + "step": 59550 + }, + { + "entropy": 1.24140625, + "epoch": 1.459282111551834, + "grad_norm": 3.28125, + "learning_rate": 1.1213620965124711e-06, + "loss": 0.1713, + "mean_token_accuracy": 0.9641312193870545, + "num_tokens": 6312270957.0, + "step": 59600 + }, + { + "entropy": 1.238125, + "epoch": 1.4605063415111894, + "grad_norm": 2.46875, + "learning_rate": 1.1166313193801264e-06, + "loss": 0.1717, + "mean_token_accuracy": 0.9619838237762451, + "num_tokens": 6317571444.0, + "step": 59650 + }, + { + "entropy": 1.235, + "epoch": 1.461730571470545, + "grad_norm": 1.6328125, + "learning_rate": 1.1119082595001127e-06, + "loss": 0.1617, + "mean_token_accuracy": 0.9648803687095642, + "num_tokens": 6322810865.0, + "step": 59700 + }, + { + "entropy": 1.24390625, + "epoch": 1.4629548014299005, + "grad_norm": 2.421875, + "learning_rate": 1.1071929362255407e-06, + "loss": 0.1768, + "mean_token_accuracy": 0.9624212658405304, + "num_tokens": 6328065527.0, + "step": 59750 + }, + { + "entropy": 1.2346875, + "epoch": 1.4641790313892562, + "grad_norm": 1.90625, + "learning_rate": 1.102485368877821e-06, + "loss": 0.1547, + "mean_token_accuracy": 0.96669025182724, + "num_tokens": 6332934140.0, + "step": 59800 + }, + { + "entropy": 1.24796875, + "epoch": 1.4654032613486117, + "grad_norm": 2.34375, + "learning_rate": 1.0977855767465834e-06, + "loss": 0.1683, + "mean_token_accuracy": 0.9648297607898713, + "num_tokens": 6338286149.0, + "step": 59850 + }, + { + "entropy": 1.23640625, + "epoch": 1.4666274913079673, + "grad_norm": 2.703125, + "learning_rate": 1.0930935790895982e-06, + "loss": 0.1481, + "mean_token_accuracy": 0.9682129454612732, + "num_tokens": 6343347728.0, + "step": 59900 + }, + { + "entropy": 1.2359375, + "epoch": 1.4678517212673228, + "grad_norm": 1.65625, + "learning_rate": 1.0884093951326982e-06, + "loss": 0.1662, + "mean_token_accuracy": 0.9638714647293091, + "num_tokens": 6348595585.0, + "step": 59950 + }, + { + "entropy": 1.23671875, + "epoch": 1.4690759512266784, + "grad_norm": 2.0625, + "learning_rate": 1.083733044069698e-06, + "loss": 0.1533, + "mean_token_accuracy": 0.9660887753963471, + "num_tokens": 6353539392.0, + "step": 60000 + }, + { + "epoch": 1.4690759512266784, + "eval_entropy": 1.2380208333333333, + "eval_loss": 0.17763087153434753, + "eval_mean_token_accuracy": 0.9620065187414487, + "eval_num_tokens": 6353539392.0, + "eval_runtime": 603.0528, + "eval_samples_per_second": 16.012, + "eval_steps_per_second": 0.201, + "step": 60000 + }, + { + "entropy": 1.229375, + "epoch": 1.4703001811860341, + "grad_norm": 2.96875, + "learning_rate": 1.0790645450623166e-06, + "loss": 0.1552, + "mean_token_accuracy": 0.9666960227489472, + "num_tokens": 6358769999.0, + "step": 60050 + }, + { + "entropy": 1.23546875, + "epoch": 1.4715244111453896, + "grad_norm": 3.0, + "learning_rate": 1.0744039172400965e-06, + "loss": 0.1538, + "mean_token_accuracy": 0.9672531485557556, + "num_tokens": 6363778830.0, + "step": 60100 + }, + { + "entropy": 1.21921875, + "epoch": 1.472748641104745, + "grad_norm": 2.171875, + "learning_rate": 1.0697511797003325e-06, + "loss": 0.1562, + "mean_token_accuracy": 0.9664645326137543, + "num_tokens": 6368813861.0, + "step": 60150 + }, + { + "entropy": 1.2353125, + "epoch": 1.4739728710641007, + "grad_norm": 2.296875, + "learning_rate": 1.0651063515079833e-06, + "loss": 0.1537, + "mean_token_accuracy": 0.9665102756023407, + "num_tokens": 6374106711.0, + "step": 60200 + }, + { + "entropy": 1.22546875, + "epoch": 1.4751971010234564, + "grad_norm": 1.75, + "learning_rate": 1.0604694516956e-06, + "loss": 0.151, + "mean_token_accuracy": 0.9675907123088837, + "num_tokens": 6379244247.0, + "step": 60250 + }, + { + "entropy": 1.22953125, + "epoch": 1.4764213309828118, + "grad_norm": 2.71875, + "learning_rate": 1.055840499263247e-06, + "loss": 0.1624, + "mean_token_accuracy": 0.964186635017395, + "num_tokens": 6384481392.0, + "step": 60300 + }, + { + "entropy": 1.23578125, + "epoch": 1.4776455609421673, + "grad_norm": 3.5625, + "learning_rate": 1.0512195131784247e-06, + "loss": 0.1575, + "mean_token_accuracy": 0.965451090335846, + "num_tokens": 6389460183.0, + "step": 60350 + }, + { + "entropy": 1.2571875, + "epoch": 1.478869790901523, + "grad_norm": 2.359375, + "learning_rate": 1.0466065123759882e-06, + "loss": 0.1706, + "mean_token_accuracy": 0.9634547913074494, + "num_tokens": 6395040346.0, + "step": 60400 + }, + { + "entropy": 1.2428125, + "epoch": 1.4800940208608786, + "grad_norm": 2.578125, + "learning_rate": 1.0420015157580736e-06, + "loss": 0.1614, + "mean_token_accuracy": 0.9662553632259369, + "num_tokens": 6400379406.0, + "step": 60450 + }, + { + "entropy": 1.2384375, + "epoch": 1.481318250820234, + "grad_norm": 2.03125, + "learning_rate": 1.0374045421940215e-06, + "loss": 0.1574, + "mean_token_accuracy": 0.9662669360637665, + "num_tokens": 6405924043.0, + "step": 60500 + }, + { + "entropy": 1.24515625, + "epoch": 1.4825424807795895, + "grad_norm": 2.609375, + "learning_rate": 1.0328156105202916e-06, + "loss": 0.1666, + "mean_token_accuracy": 0.9644035375118256, + "num_tokens": 6411487076.0, + "step": 60550 + }, + { + "entropy": 1.22125, + "epoch": 1.4837667107389452, + "grad_norm": 2.59375, + "learning_rate": 1.0282347395403978e-06, + "loss": 0.1501, + "mean_token_accuracy": 0.9667956507205964, + "num_tokens": 6416699077.0, + "step": 60600 + }, + { + "entropy": 1.22921875, + "epoch": 1.4849909406983008, + "grad_norm": 3.0625, + "learning_rate": 1.0236619480248205e-06, + "loss": 0.1649, + "mean_token_accuracy": 0.9641565144062042, + "num_tokens": 6421663477.0, + "step": 60650 + }, + { + "entropy": 1.223125, + "epoch": 1.4862151706576563, + "grad_norm": 0.012939453125, + "learning_rate": 1.0190972547109352e-06, + "loss": 0.1465, + "mean_token_accuracy": 0.9683601307868958, + "num_tokens": 6426657308.0, + "step": 60700 + }, + { + "entropy": 1.23296875, + "epoch": 1.487439400617012, + "grad_norm": 2.34375, + "learning_rate": 1.0145406783029337e-06, + "loss": 0.1654, + "mean_token_accuracy": 0.9649899744987488, + "num_tokens": 6432023783.0, + "step": 60750 + }, + { + "entropy": 1.2375, + "epoch": 1.4886636305763674, + "grad_norm": 3.046875, + "learning_rate": 1.0099922374717499e-06, + "loss": 0.162, + "mean_token_accuracy": 0.9657110357284546, + "num_tokens": 6437497556.0, + "step": 60800 + }, + { + "entropy": 1.2409375, + "epoch": 1.489887860535723, + "grad_norm": 3.96875, + "learning_rate": 1.0054519508549797e-06, + "loss": 0.177, + "mean_token_accuracy": 0.9619574582576752, + "num_tokens": 6443350702.0, + "step": 60850 + }, + { + "entropy": 1.23125, + "epoch": 1.4911120904950785, + "grad_norm": 2.78125, + "learning_rate": 1.0009198370568066e-06, + "loss": 0.1627, + "mean_token_accuracy": 0.964564654827118, + "num_tokens": 6448658491.0, + "step": 60900 + }, + { + "entropy": 1.2153125, + "epoch": 1.4923363204544342, + "grad_norm": 3.15625, + "learning_rate": 9.96395914647927e-07, + "loss": 0.1507, + "mean_token_accuracy": 0.9671316814422607, + "num_tokens": 6453556941.0, + "step": 60950 + }, + { + "entropy": 1.21390625, + "epoch": 1.4935605504137897, + "grad_norm": 2.5, + "learning_rate": 9.91880202165471e-07, + "loss": 0.1637, + "mean_token_accuracy": 0.964778846502304, + "num_tokens": 6458635664.0, + "step": 61000 + }, + { + "entropy": 1.23234375, + "epoch": 1.4947847803731453, + "grad_norm": 2.671875, + "learning_rate": 9.873727181129275e-07, + "loss": 0.17, + "mean_token_accuracy": 0.9645189070701599, + "num_tokens": 6464088495.0, + "step": 61050 + }, + { + "entropy": 1.23078125, + "epoch": 1.4960090103325008, + "grad_norm": 2.953125, + "learning_rate": 9.828734809600687e-07, + "loss": 0.1594, + "mean_token_accuracy": 0.9656787288188934, + "num_tokens": 6469234194.0, + "step": 61100 + }, + { + "entropy": 1.22921875, + "epoch": 1.4972332402918564, + "grad_norm": 2.25, + "learning_rate": 9.783825091428782e-07, + "loss": 0.1618, + "mean_token_accuracy": 0.965996481180191, + "num_tokens": 6474528140.0, + "step": 61150 + }, + { + "entropy": 1.2309375, + "epoch": 1.498457470251212, + "grad_norm": 2.234375, + "learning_rate": 9.738998210634644e-07, + "loss": 0.1728, + "mean_token_accuracy": 0.9626871156692505, + "num_tokens": 6480082901.0, + "step": 61200 + }, + { + "entropy": 1.23, + "epoch": 1.4996817002105676, + "grad_norm": 3.796875, + "learning_rate": 9.694254350900005e-07, + "loss": 0.1585, + "mean_token_accuracy": 0.9654109764099121, + "num_tokens": 6485373470.0, + "step": 61250 + }, + { + "entropy": 1.24640625, + "epoch": 1.5009059301699232, + "grad_norm": 2.15625, + "learning_rate": 9.649593695566355e-07, + "loss": 0.1673, + "mean_token_accuracy": 0.9639886951446534, + "num_tokens": 6490817618.0, + "step": 61300 + }, + { + "entropy": 1.2215625, + "epoch": 1.5021301601292787, + "grad_norm": 2.34375, + "learning_rate": 9.605016427634272e-07, + "loss": 0.1513, + "mean_token_accuracy": 0.9674781799316406, + "num_tokens": 6495843357.0, + "step": 61350 + }, + { + "entropy": 1.2359375, + "epoch": 1.5033543900886341, + "grad_norm": 1.9609375, + "learning_rate": 9.560522729762628e-07, + "loss": 0.1621, + "mean_token_accuracy": 0.96533607006073, + "num_tokens": 6500949587.0, + "step": 61400 + }, + { + "entropy": 1.2359375, + "epoch": 1.5045786200479898, + "grad_norm": 2.875, + "learning_rate": 9.516112784267896e-07, + "loss": 0.1714, + "mean_token_accuracy": 0.9637338280677795, + "num_tokens": 6506340396.0, + "step": 61450 + }, + { + "entropy": 1.2215625, + "epoch": 1.5058028500073455, + "grad_norm": 4.65625, + "learning_rate": 9.471786773123337e-07, + "loss": 0.1591, + "mean_token_accuracy": 0.9650114715099335, + "num_tokens": 6511689926.0, + "step": 61500 + }, + { + "entropy": 1.243125, + "epoch": 1.507027079966701, + "grad_norm": 2.859375, + "learning_rate": 9.427544877958278e-07, + "loss": 0.1678, + "mean_token_accuracy": 0.9641639375686646, + "num_tokens": 6517204008.0, + "step": 61550 + }, + { + "entropy": 1.21296875, + "epoch": 1.5082513099260564, + "grad_norm": 2.453125, + "learning_rate": 9.383387280057409e-07, + "loss": 0.1615, + "mean_token_accuracy": 0.9646773946285248, + "num_tokens": 6522483140.0, + "step": 61600 + }, + { + "entropy": 1.23234375, + "epoch": 1.509475539885412, + "grad_norm": 2.953125, + "learning_rate": 9.339314160359977e-07, + "loss": 0.1588, + "mean_token_accuracy": 0.9658515179157257, + "num_tokens": 6527644811.0, + "step": 61650 + }, + { + "entropy": 1.233125, + "epoch": 1.5106997698447677, + "grad_norm": 3.125, + "learning_rate": 9.295325699459082e-07, + "loss": 0.1629, + "mean_token_accuracy": 0.9652837121486664, + "num_tokens": 6532774529.0, + "step": 61700 + }, + { + "entropy": 1.22390625, + "epoch": 1.5119239998041232, + "grad_norm": 3.90625, + "learning_rate": 9.251422077600911e-07, + "loss": 0.1658, + "mean_token_accuracy": 0.9642423093318939, + "num_tokens": 6538188895.0, + "step": 61750 + }, + { + "entropy": 1.223125, + "epoch": 1.5131482297634786, + "grad_norm": 2.84375, + "learning_rate": 9.207603474684063e-07, + "loss": 0.1576, + "mean_token_accuracy": 0.9674744582176209, + "num_tokens": 6543389288.0, + "step": 61800 + }, + { + "entropy": 1.22765625, + "epoch": 1.5143724597228343, + "grad_norm": 2.375, + "learning_rate": 9.163870070258698e-07, + "loss": 0.1563, + "mean_token_accuracy": 0.9665237212181091, + "num_tokens": 6548548612.0, + "step": 61850 + }, + { + "entropy": 1.22171875, + "epoch": 1.51559668968219, + "grad_norm": 3.375, + "learning_rate": 9.120222043525931e-07, + "loss": 0.1515, + "mean_token_accuracy": 0.9670096004009247, + "num_tokens": 6553657775.0, + "step": 61900 + }, + { + "entropy": 1.23703125, + "epoch": 1.5168209196415454, + "grad_norm": 2.15625, + "learning_rate": 9.076659573337e-07, + "loss": 0.1619, + "mean_token_accuracy": 0.9654546058177949, + "num_tokens": 6559027325.0, + "step": 61950 + }, + { + "entropy": 1.22515625, + "epoch": 1.5180451496009009, + "grad_norm": 1.8359375, + "learning_rate": 9.033182838192564e-07, + "loss": 0.1595, + "mean_token_accuracy": 0.9660532510280609, + "num_tokens": 6564515287.0, + "step": 62000 + }, + { + "entropy": 1.236875, + "epoch": 1.5192693795602565, + "grad_norm": 1.9375, + "learning_rate": 8.98979201624201e-07, + "loss": 0.161, + "mean_token_accuracy": 0.9655505573749542, + "num_tokens": 6569987402.0, + "step": 62050 + }, + { + "entropy": 1.249375, + "epoch": 1.5204936095196122, + "grad_norm": 3.125, + "learning_rate": 8.946487285282659e-07, + "loss": 0.1724, + "mean_token_accuracy": 0.9626421999931335, + "num_tokens": 6575526706.0, + "step": 62100 + }, + { + "entropy": 1.225, + "epoch": 1.5217178394789679, + "grad_norm": 3.65625, + "learning_rate": 8.903268822759075e-07, + "loss": 0.1615, + "mean_token_accuracy": 0.966062741279602, + "num_tokens": 6580795009.0, + "step": 62150 + }, + { + "entropy": 1.23375, + "epoch": 1.5229420694383233, + "grad_norm": 4.5, + "learning_rate": 8.860136805762319e-07, + "loss": 0.1617, + "mean_token_accuracy": 0.9658437705039978, + "num_tokens": 6586016211.0, + "step": 62200 + }, + { + "entropy": 1.2359375, + "epoch": 1.5241662993976788, + "grad_norm": 2.359375, + "learning_rate": 8.817091411029271e-07, + "loss": 0.1593, + "mean_token_accuracy": 0.966154944896698, + "num_tokens": 6591160444.0, + "step": 62250 + }, + { + "entropy": 1.2134375, + "epoch": 1.5253905293570345, + "grad_norm": 2.390625, + "learning_rate": 8.774132814941828e-07, + "loss": 0.1579, + "mean_token_accuracy": 0.9668165516853332, + "num_tokens": 6596300228.0, + "step": 62300 + }, + { + "entropy": 1.21953125, + "epoch": 1.5266147593163901, + "grad_norm": 2.859375, + "learning_rate": 8.731261193526248e-07, + "loss": 0.1586, + "mean_token_accuracy": 0.9657115602493286, + "num_tokens": 6601689242.0, + "step": 62350 + }, + { + "entropy": 1.2521875, + "epoch": 1.5278389892757456, + "grad_norm": 3.046875, + "learning_rate": 8.688476722452379e-07, + "loss": 0.1732, + "mean_token_accuracy": 0.9633473336696625, + "num_tokens": 6607301069.0, + "step": 62400 + }, + { + "entropy": 1.23234375, + "epoch": 1.529063219235101, + "grad_norm": 2.0, + "learning_rate": 8.645779577033011e-07, + "loss": 0.1655, + "mean_token_accuracy": 0.9651632213592529, + "num_tokens": 6612690182.0, + "step": 62450 + }, + { + "entropy": 1.22234375, + "epoch": 1.5302874491944567, + "grad_norm": 2.234375, + "learning_rate": 8.603169932223042e-07, + "loss": 0.1644, + "mean_token_accuracy": 0.9645105350017548, + "num_tokens": 6618066965.0, + "step": 62500 + }, + { + "entropy": 1.2134375, + "epoch": 1.5315116791538124, + "grad_norm": 3.03125, + "learning_rate": 8.560647962618894e-07, + "loss": 0.1473, + "mean_token_accuracy": 0.9680246078968048, + "num_tokens": 6623009283.0, + "step": 62550 + }, + { + "entropy": 1.2359375, + "epoch": 1.5327359091131678, + "grad_norm": 2.9375, + "learning_rate": 8.518213842457696e-07, + "loss": 0.1684, + "mean_token_accuracy": 0.9639063477516174, + "num_tokens": 6628694150.0, + "step": 62600 + }, + { + "entropy": 1.2365625, + "epoch": 1.5339601390725233, + "grad_norm": 2.15625, + "learning_rate": 8.475867745616605e-07, + "loss": 0.1699, + "mean_token_accuracy": 0.9639629209041596, + "num_tokens": 6634163539.0, + "step": 62650 + }, + { + "entropy": 1.23515625, + "epoch": 1.535184369031879, + "grad_norm": 1.8125, + "learning_rate": 8.433609845612123e-07, + "loss": 0.1681, + "mean_token_accuracy": 0.9637242484092713, + "num_tokens": 6639673078.0, + "step": 62700 + }, + { + "entropy": 1.21796875, + "epoch": 1.5364085989912346, + "grad_norm": 2.1875, + "learning_rate": 8.39144031559933e-07, + "loss": 0.1653, + "mean_token_accuracy": 0.9641383695602417, + "num_tokens": 6645021375.0, + "step": 62750 + }, + { + "entropy": 1.218125, + "epoch": 1.53763282895059, + "grad_norm": 3.65625, + "learning_rate": 8.349359328371241e-07, + "loss": 0.1557, + "mean_token_accuracy": 0.9672486507892608, + "num_tokens": 6650282385.0, + "step": 62800 + }, + { + "entropy": 1.24453125, + "epoch": 1.5388570589099455, + "grad_norm": 2.265625, + "learning_rate": 8.307367056357993e-07, + "loss": 0.1744, + "mean_token_accuracy": 0.9627921509742737, + "num_tokens": 6655617849.0, + "step": 62850 + }, + { + "entropy": 1.2384375, + "epoch": 1.5400812888693012, + "grad_norm": 2.234375, + "learning_rate": 8.265463671626277e-07, + "loss": 0.1643, + "mean_token_accuracy": 0.9646277320384979, + "num_tokens": 6660898400.0, + "step": 62900 + }, + { + "entropy": 1.2315625, + "epoch": 1.5413055188286569, + "grad_norm": 2.53125, + "learning_rate": 8.223649345878521e-07, + "loss": 0.1595, + "mean_token_accuracy": 0.9663047862052917, + "num_tokens": 6666546321.0, + "step": 62950 + }, + { + "entropy": 1.21890625, + "epoch": 1.5425297487880123, + "grad_norm": 0.01312255859375, + "learning_rate": 8.181924250452234e-07, + "loss": 0.1479, + "mean_token_accuracy": 0.9685621929168701, + "num_tokens": 6671900409.0, + "step": 63000 + }, + { + "entropy": 1.22140625, + "epoch": 1.5437539787473677, + "grad_norm": 2.578125, + "learning_rate": 8.140288556319295e-07, + "loss": 0.1564, + "mean_token_accuracy": 0.9663173937797547, + "num_tokens": 6676916235.0, + "step": 63050 + }, + { + "entropy": 1.2315625, + "epoch": 1.5449782087067234, + "grad_norm": 2.171875, + "learning_rate": 8.098742434085274e-07, + "loss": 0.1619, + "mean_token_accuracy": 0.9653417527675628, + "num_tokens": 6681811077.0, + "step": 63100 + }, + { + "entropy": 1.2278125, + "epoch": 1.546202438666079, + "grad_norm": 2.609375, + "learning_rate": 8.057286053988688e-07, + "loss": 0.155, + "mean_token_accuracy": 0.9668863129615783, + "num_tokens": 6687079259.0, + "step": 63150 + }, + { + "entropy": 1.23734375, + "epoch": 1.5474266686254348, + "grad_norm": 2.3125, + "learning_rate": 8.015919585900328e-07, + "loss": 0.1698, + "mean_token_accuracy": 0.9634287714958191, + "num_tokens": 6692413841.0, + "step": 63200 + }, + { + "entropy": 1.20328125, + "epoch": 1.5486508985847902, + "grad_norm": 2.28125, + "learning_rate": 7.974643199322591e-07, + "loss": 0.1459, + "mean_token_accuracy": 0.9686257600784302, + "num_tokens": 6697530112.0, + "step": 63250 + }, + { + "entropy": 1.238125, + "epoch": 1.5498751285441457, + "grad_norm": 2.84375, + "learning_rate": 7.933457063388733e-07, + "loss": 0.171, + "mean_token_accuracy": 0.9629907369613647, + "num_tokens": 6702988908.0, + "step": 63300 + }, + { + "entropy": 1.215625, + "epoch": 1.5510993585035013, + "grad_norm": 1.6953125, + "learning_rate": 7.892361346862206e-07, + "loss": 0.1588, + "mean_token_accuracy": 0.9652132534980774, + "num_tokens": 6708127766.0, + "step": 63350 + }, + { + "entropy": 1.216875, + "epoch": 1.552323588462857, + "grad_norm": 3.546875, + "learning_rate": 7.851356218135953e-07, + "loss": 0.1565, + "mean_token_accuracy": 0.9663667130470276, + "num_tokens": 6713202542.0, + "step": 63400 + }, + { + "entropy": 1.2165625, + "epoch": 1.5535478184222125, + "grad_norm": 3.546875, + "learning_rate": 7.810441845231768e-07, + "loss": 0.1562, + "mean_token_accuracy": 0.9665763390064239, + "num_tokens": 6718170250.0, + "step": 63450 + }, + { + "entropy": 1.239375, + "epoch": 1.554772048381568, + "grad_norm": 2.671875, + "learning_rate": 7.769618395799495e-07, + "loss": 0.1701, + "mean_token_accuracy": 0.9642471766471863, + "num_tokens": 6723417011.0, + "step": 63500 + }, + { + "entropy": 1.20984375, + "epoch": 1.5559962783409236, + "grad_norm": 2.6875, + "learning_rate": 7.728886037116482e-07, + "loss": 0.1445, + "mean_token_accuracy": 0.9684971439838409, + "num_tokens": 6728453094.0, + "step": 63550 + }, + { + "entropy": 1.22625, + "epoch": 1.5572205083002792, + "grad_norm": 3.203125, + "learning_rate": 7.688244936086779e-07, + "loss": 0.1591, + "mean_token_accuracy": 0.9653982555866242, + "num_tokens": 6733460582.0, + "step": 63600 + }, + { + "entropy": 1.23765625, + "epoch": 1.5584447382596347, + "grad_norm": 1.5625, + "learning_rate": 7.64769525924052e-07, + "loss": 0.1631, + "mean_token_accuracy": 0.9650383579730988, + "num_tokens": 6739025377.0, + "step": 63650 + }, + { + "entropy": 1.241875, + "epoch": 1.5596689682189901, + "grad_norm": 1.921875, + "learning_rate": 7.607237172733212e-07, + "loss": 0.1629, + "mean_token_accuracy": 0.9644639611244201, + "num_tokens": 6744632607.0, + "step": 63700 + }, + { + "entropy": 1.21015625, + "epoch": 1.5608931981783458, + "grad_norm": 2.8125, + "learning_rate": 7.566870842345078e-07, + "loss": 0.1438, + "mean_token_accuracy": 0.9694548106193542, + "num_tokens": 6749711105.0, + "step": 63750 + }, + { + "entropy": 1.22625, + "epoch": 1.5621174281377015, + "grad_norm": 2.5625, + "learning_rate": 7.526596433480352e-07, + "loss": 0.162, + "mean_token_accuracy": 0.9650256216526032, + "num_tokens": 6755001114.0, + "step": 63800 + }, + { + "entropy": 1.2203125, + "epoch": 1.563341658097057, + "grad_norm": 2.078125, + "learning_rate": 7.486414111166603e-07, + "loss": 0.1585, + "mean_token_accuracy": 0.9653235769271851, + "num_tokens": 6760148593.0, + "step": 63850 + }, + { + "entropy": 1.22015625, + "epoch": 1.5645658880564124, + "grad_norm": 4.15625, + "learning_rate": 7.446324040054098e-07, + "loss": 0.1545, + "mean_token_accuracy": 0.9676208901405334, + "num_tokens": 6765196202.0, + "step": 63900 + }, + { + "entropy": 1.2396875, + "epoch": 1.565790118015768, + "grad_norm": 1.6640625, + "learning_rate": 7.406326384415069e-07, + "loss": 0.1645, + "mean_token_accuracy": 0.964854439496994, + "num_tokens": 6770864758.0, + "step": 63950 + }, + { + "entropy": 1.23265625, + "epoch": 1.5670143479751237, + "grad_norm": 4.53125, + "learning_rate": 7.366421308143074e-07, + "loss": 0.1678, + "mean_token_accuracy": 0.9636308062076568, + "num_tokens": 6776309871.0, + "step": 64000 + }, + { + "entropy": 1.22203125, + "epoch": 1.5682385779344792, + "grad_norm": 2.484375, + "learning_rate": 7.326608974752318e-07, + "loss": 0.1537, + "mean_token_accuracy": 0.9670063924789428, + "num_tokens": 6781591477.0, + "step": 64050 + }, + { + "entropy": 1.23421875, + "epoch": 1.5694628078938346, + "grad_norm": 3.671875, + "learning_rate": 7.286889547377019e-07, + "loss": 0.1576, + "mean_token_accuracy": 0.9661747896671296, + "num_tokens": 6787008758.0, + "step": 64100 + }, + { + "entropy": 1.2321875, + "epoch": 1.5706870378531903, + "grad_norm": 2.921875, + "learning_rate": 7.247263188770635e-07, + "loss": 0.1658, + "mean_token_accuracy": 0.9641131579875946, + "num_tokens": 6792453198.0, + "step": 64150 + }, + { + "entropy": 1.23484375, + "epoch": 1.571911267812546, + "grad_norm": 3.59375, + "learning_rate": 7.207730061305342e-07, + "loss": 0.1715, + "mean_token_accuracy": 0.9631493031978607, + "num_tokens": 6798199941.0, + "step": 64200 + }, + { + "entropy": 1.2353125, + "epoch": 1.5731354977719014, + "grad_norm": 3.3125, + "learning_rate": 7.168290326971248e-07, + "loss": 0.1629, + "mean_token_accuracy": 0.9649174082279205, + "num_tokens": 6803443062.0, + "step": 64250 + }, + { + "entropy": 1.220625, + "epoch": 1.5743597277312569, + "grad_norm": 2.203125, + "learning_rate": 7.128944147375779e-07, + "loss": 0.1518, + "mean_token_accuracy": 0.967359025478363, + "num_tokens": 6808707076.0, + "step": 64300 + }, + { + "entropy": 1.2209375, + "epoch": 1.5755839576906125, + "grad_norm": 1.9375, + "learning_rate": 7.08969168374304e-07, + "loss": 0.1596, + "mean_token_accuracy": 0.9663796508312226, + "num_tokens": 6813958298.0, + "step": 64350 + }, + { + "entropy": 1.2228125, + "epoch": 1.5768081876499682, + "grad_norm": 3.828125, + "learning_rate": 7.050533096913104e-07, + "loss": 0.162, + "mean_token_accuracy": 0.9654451417922973, + "num_tokens": 6819296053.0, + "step": 64400 + }, + { + "entropy": 1.228125, + "epoch": 1.578032417609324, + "grad_norm": 2.390625, + "learning_rate": 7.011468547341376e-07, + "loss": 0.1488, + "mean_token_accuracy": 0.9677229869365692, + "num_tokens": 6824596472.0, + "step": 64450 + }, + { + "entropy": 1.23953125, + "epoch": 1.5792566475686793, + "grad_norm": 2.84375, + "learning_rate": 6.972498195097937e-07, + "loss": 0.1723, + "mean_token_accuracy": 0.962650990486145, + "num_tokens": 6830407037.0, + "step": 64500 + }, + { + "entropy": 1.230625, + "epoch": 1.5804808775280348, + "grad_norm": 2.375, + "learning_rate": 6.933622199866912e-07, + "loss": 0.1624, + "mean_token_accuracy": 0.9654111993312836, + "num_tokens": 6835900402.0, + "step": 64550 + }, + { + "entropy": 1.234375, + "epoch": 1.5817051074873905, + "grad_norm": 3.0625, + "learning_rate": 6.894840720945754e-07, + "loss": 0.1665, + "mean_token_accuracy": 0.9645081627368927, + "num_tokens": 6841235827.0, + "step": 64600 + }, + { + "entropy": 1.21890625, + "epoch": 1.5829293374467461, + "grad_norm": 2.421875, + "learning_rate": 6.856153917244647e-07, + "loss": 0.1611, + "mean_token_accuracy": 0.9654888653755188, + "num_tokens": 6846579737.0, + "step": 64650 + }, + { + "entropy": 1.2153125, + "epoch": 1.5841535674061016, + "grad_norm": 3.203125, + "learning_rate": 6.81756194728583e-07, + "loss": 0.1546, + "mean_token_accuracy": 0.9667556810379029, + "num_tokens": 6851881949.0, + "step": 64700 + }, + { + "entropy": 1.22421875, + "epoch": 1.585377797365457, + "grad_norm": 3.046875, + "learning_rate": 6.779064969202973e-07, + "loss": 0.1583, + "mean_token_accuracy": 0.966250067949295, + "num_tokens": 6857094183.0, + "step": 64750 + }, + { + "entropy": 1.2265625, + "epoch": 1.5866020273248127, + "grad_norm": 2.96875, + "learning_rate": 6.740663140740467e-07, + "loss": 0.163, + "mean_token_accuracy": 0.9652321350574493, + "num_tokens": 6862381095.0, + "step": 64800 + }, + { + "entropy": 1.2184375, + "epoch": 1.5878262572841684, + "grad_norm": 1.9765625, + "learning_rate": 6.70235661925287e-07, + "loss": 0.1594, + "mean_token_accuracy": 0.965182945728302, + "num_tokens": 6867345829.0, + "step": 64850 + }, + { + "entropy": 1.22640625, + "epoch": 1.5890504872435238, + "grad_norm": 2.6875, + "learning_rate": 6.664145561704173e-07, + "loss": 0.1548, + "mean_token_accuracy": 0.9668359410762787, + "num_tokens": 6872899925.0, + "step": 64900 + }, + { + "entropy": 1.23359375, + "epoch": 1.5902747172028793, + "grad_norm": 2.265625, + "learning_rate": 6.626030124667204e-07, + "loss": 0.1695, + "mean_token_accuracy": 0.9634568047523498, + "num_tokens": 6878428253.0, + "step": 64950 + }, + { + "entropy": 1.22609375, + "epoch": 1.591498947162235, + "grad_norm": 3.40625, + "learning_rate": 6.588010464323006e-07, + "loss": 0.1689, + "mean_token_accuracy": 0.9639648401737213, + "num_tokens": 6883915733.0, + "step": 65000 + }, + { + "entropy": 1.22859375, + "epoch": 1.5927231771215906, + "grad_norm": 2.28125, + "learning_rate": 6.550086736460136e-07, + "loss": 0.1719, + "mean_token_accuracy": 0.9634046721458435, + "num_tokens": 6889133852.0, + "step": 65050 + }, + { + "entropy": 1.23578125, + "epoch": 1.593947407080946, + "grad_norm": 2.984375, + "learning_rate": 6.512259096474075e-07, + "loss": 0.1729, + "mean_token_accuracy": 0.9630839240550995, + "num_tokens": 6894861703.0, + "step": 65100 + }, + { + "entropy": 1.21921875, + "epoch": 1.5951716370403015, + "grad_norm": 2.484375, + "learning_rate": 6.474527699366567e-07, + "loss": 0.1599, + "mean_token_accuracy": 0.965704824924469, + "num_tokens": 6899940861.0, + "step": 65150 + }, + { + "entropy": 1.21625, + "epoch": 1.5963958669996572, + "grad_norm": 2.078125, + "learning_rate": 6.436892699745009e-07, + "loss": 0.1572, + "mean_token_accuracy": 0.9666438150405884, + "num_tokens": 6905083361.0, + "step": 65200 + }, + { + "entropy": 1.2153125, + "epoch": 1.5976200969590129, + "grad_norm": 3.5625, + "learning_rate": 6.399354251821792e-07, + "loss": 0.1554, + "mean_token_accuracy": 0.9674275135993957, + "num_tokens": 6910092703.0, + "step": 65250 + }, + { + "entropy": 1.22984375, + "epoch": 1.5988443269183683, + "grad_norm": 2.828125, + "learning_rate": 6.361912509413676e-07, + "loss": 0.1645, + "mean_token_accuracy": 0.9646131348609924, + "num_tokens": 6915320978.0, + "step": 65300 + }, + { + "entropy": 1.22984375, + "epoch": 1.6000685568777238, + "grad_norm": 2.546875, + "learning_rate": 6.32456762594116e-07, + "loss": 0.1594, + "mean_token_accuracy": 0.9651407063007355, + "num_tokens": 6920827957.0, + "step": 65350 + }, + { + "entropy": 1.21140625, + "epoch": 1.6012927868370794, + "grad_norm": 2.578125, + "learning_rate": 6.287319754427873e-07, + "loss": 0.1533, + "mean_token_accuracy": 0.9665750122070312, + "num_tokens": 6926133415.0, + "step": 65400 + }, + { + "entropy": 1.22109375, + "epoch": 1.602517016796435, + "grad_norm": 2.859375, + "learning_rate": 6.250169047499916e-07, + "loss": 0.1563, + "mean_token_accuracy": 0.9660931730270386, + "num_tokens": 6931165132.0, + "step": 65450 + }, + { + "entropy": 1.2040625, + "epoch": 1.6037412467557908, + "grad_norm": 3.890625, + "learning_rate": 6.213115657385244e-07, + "loss": 0.1473, + "mean_token_accuracy": 0.9677533149719239, + "num_tokens": 6936236474.0, + "step": 65500 + }, + { + "entropy": 1.22515625, + "epoch": 1.6049654767151462, + "grad_norm": 1.9140625, + "learning_rate": 6.176159735913079e-07, + "loss": 0.1698, + "mean_token_accuracy": 0.9640149748325348, + "num_tokens": 6941667389.0, + "step": 65550 + }, + { + "entropy": 1.210625, + "epoch": 1.6061897066745017, + "grad_norm": 2.828125, + "learning_rate": 6.139301434513204e-07, + "loss": 0.1495, + "mean_token_accuracy": 0.9672247707843781, + "num_tokens": 6947023413.0, + "step": 65600 + }, + { + "entropy": 1.21921875, + "epoch": 1.6074139366338573, + "grad_norm": 2.859375, + "learning_rate": 6.102540904215455e-07, + "loss": 0.1579, + "mean_token_accuracy": 0.9656173276901245, + "num_tokens": 6952441096.0, + "step": 65650 + }, + { + "entropy": 1.223125, + "epoch": 1.608638166593213, + "grad_norm": 3.71875, + "learning_rate": 6.065878295649004e-07, + "loss": 0.166, + "mean_token_accuracy": 0.9646958529949188, + "num_tokens": 6957942190.0, + "step": 65700 + }, + { + "entropy": 1.2084375, + "epoch": 1.6098623965525685, + "grad_norm": 2.3125, + "learning_rate": 6.0293137590418e-07, + "loss": 0.15, + "mean_token_accuracy": 0.9669717216491699, + "num_tokens": 6963300846.0, + "step": 65750 + }, + { + "entropy": 1.22921875, + "epoch": 1.611086626511924, + "grad_norm": 2.078125, + "learning_rate": 5.992847444219915e-07, + "loss": 0.1614, + "mean_token_accuracy": 0.9650086772441864, + "num_tokens": 6968779335.0, + "step": 65800 + }, + { + "entropy": 1.22625, + "epoch": 1.6123108564712796, + "grad_norm": 2.78125, + "learning_rate": 5.956479500606977e-07, + "loss": 0.171, + "mean_token_accuracy": 0.9639155077934265, + "num_tokens": 6974202109.0, + "step": 65850 + }, + { + "entropy": 1.21328125, + "epoch": 1.6135350864306353, + "grad_norm": 3.375, + "learning_rate": 5.920210077223508e-07, + "loss": 0.1488, + "mean_token_accuracy": 0.9683645820617676, + "num_tokens": 6979171497.0, + "step": 65900 + }, + { + "entropy": 1.21875, + "epoch": 1.6147593163899907, + "grad_norm": 2.734375, + "learning_rate": 5.884039322686345e-07, + "loss": 0.1593, + "mean_token_accuracy": 0.9662387585639953, + "num_tokens": 6984410380.0, + "step": 65950 + }, + { + "entropy": 1.198125, + "epoch": 1.6159835463493462, + "grad_norm": 2.0, + "learning_rate": 5.847967385208012e-07, + "loss": 0.1521, + "mean_token_accuracy": 0.966891850233078, + "num_tokens": 6989408812.0, + "step": 66000 + }, + { + "entropy": 1.20296875, + "epoch": 1.6172077763087018, + "grad_norm": 2.109375, + "learning_rate": 5.81199441259614e-07, + "loss": 0.1509, + "mean_token_accuracy": 0.9681426680088043, + "num_tokens": 6994432516.0, + "step": 66050 + }, + { + "entropy": 1.225625, + "epoch": 1.6184320062680575, + "grad_norm": 3.140625, + "learning_rate": 5.776120552252833e-07, + "loss": 0.1638, + "mean_token_accuracy": 0.965145457983017, + "num_tokens": 6999763932.0, + "step": 66100 + }, + { + "entropy": 1.22421875, + "epoch": 1.619656236227413, + "grad_norm": 3.078125, + "learning_rate": 5.740345951174062e-07, + "loss": 0.1654, + "mean_token_accuracy": 0.9642065274715423, + "num_tokens": 7005089905.0, + "step": 66150 + }, + { + "entropy": 1.238125, + "epoch": 1.6208804661867684, + "grad_norm": 2.78125, + "learning_rate": 5.704670755949111e-07, + "loss": 0.1742, + "mean_token_accuracy": 0.962605128288269, + "num_tokens": 7010758688.0, + "step": 66200 + }, + { + "entropy": 1.2284375, + "epoch": 1.622104696146124, + "grad_norm": 2.359375, + "learning_rate": 5.669095112759893e-07, + "loss": 0.1699, + "mean_token_accuracy": 0.9639213311672211, + "num_tokens": 7015757555.0, + "step": 66250 + }, + { + "entropy": 1.215, + "epoch": 1.6233289261054797, + "grad_norm": 3.609375, + "learning_rate": 5.633619167380439e-07, + "loss": 0.1542, + "mean_token_accuracy": 0.9669547820091248, + "num_tokens": 7020934918.0, + "step": 66300 + }, + { + "entropy": 1.20421875, + "epoch": 1.6245531560648352, + "grad_norm": 2.609375, + "learning_rate": 5.598243065176243e-07, + "loss": 0.1491, + "mean_token_accuracy": 0.9682400977611542, + "num_tokens": 7026062287.0, + "step": 66350 + }, + { + "entropy": 1.224375, + "epoch": 1.6257773860241906, + "grad_norm": 3.328125, + "learning_rate": 5.56296695110368e-07, + "loss": 0.1563, + "mean_token_accuracy": 0.965864109992981, + "num_tokens": 7031243491.0, + "step": 66400 + }, + { + "entropy": 1.21640625, + "epoch": 1.6270016159835463, + "grad_norm": 1.875, + "learning_rate": 5.527790969709421e-07, + "loss": 0.1591, + "mean_token_accuracy": 0.9661051654815673, + "num_tokens": 7036518719.0, + "step": 66450 + }, + { + "entropy": 1.21765625, + "epoch": 1.628225845942902, + "grad_norm": 2.265625, + "learning_rate": 5.492715265129842e-07, + "loss": 0.1526, + "mean_token_accuracy": 0.967378306388855, + "num_tokens": 7041605356.0, + "step": 66500 + }, + { + "entropy": 1.22578125, + "epoch": 1.6294500759022574, + "grad_norm": 3.25, + "learning_rate": 5.457739981090422e-07, + "loss": 0.1608, + "mean_token_accuracy": 0.965805538892746, + "num_tokens": 7047131119.0, + "step": 66550 + }, + { + "entropy": 1.22296875, + "epoch": 1.6306743058616129, + "grad_norm": 3.9375, + "learning_rate": 5.422865260905141e-07, + "loss": 0.162, + "mean_token_accuracy": 0.9653668451309204, + "num_tokens": 7052461810.0, + "step": 66600 + }, + { + "entropy": 1.2321875, + "epoch": 1.6318985358209686, + "grad_norm": 2.015625, + "learning_rate": 5.388091247475948e-07, + "loss": 0.1674, + "mean_token_accuracy": 0.9641144728660583, + "num_tokens": 7057861665.0, + "step": 66650 + }, + { + "entropy": 1.22, + "epoch": 1.6331227657803242, + "grad_norm": 4.875, + "learning_rate": 5.35341808329211e-07, + "loss": 0.1612, + "mean_token_accuracy": 0.9650032806396485, + "num_tokens": 7063074323.0, + "step": 66700 + }, + { + "entropy": 1.2290625, + "epoch": 1.63434699573968, + "grad_norm": 2.21875, + "learning_rate": 5.31884591042966e-07, + "loss": 0.1642, + "mean_token_accuracy": 0.9645574033260346, + "num_tokens": 7068850662.0, + "step": 66750 + }, + { + "entropy": 1.21140625, + "epoch": 1.6355712256990353, + "grad_norm": 2.21875, + "learning_rate": 5.284374870550806e-07, + "loss": 0.1513, + "mean_token_accuracy": 0.9664854764938354, + "num_tokens": 7073845156.0, + "step": 66800 + }, + { + "entropy": 1.2134375, + "epoch": 1.6367954556583908, + "grad_norm": 2.59375, + "learning_rate": 5.250005104903391e-07, + "loss": 0.1526, + "mean_token_accuracy": 0.9672818171977997, + "num_tokens": 7078890553.0, + "step": 66850 + }, + { + "entropy": 1.21890625, + "epoch": 1.6380196856177465, + "grad_norm": 3.1875, + "learning_rate": 5.215736754320221e-07, + "loss": 0.1559, + "mean_token_accuracy": 0.9661696362495422, + "num_tokens": 7084113116.0, + "step": 66900 + }, + { + "entropy": 1.2209375, + "epoch": 1.6392439155771021, + "grad_norm": 2.578125, + "learning_rate": 5.181569959218593e-07, + "loss": 0.1537, + "mean_token_accuracy": 0.9654488229751587, + "num_tokens": 7089341607.0, + "step": 66950 + }, + { + "entropy": 1.21953125, + "epoch": 1.6404681455364576, + "grad_norm": 2.9375, + "learning_rate": 5.147504859599658e-07, + "loss": 0.1627, + "mean_token_accuracy": 0.9644181895256042, + "num_tokens": 7094625061.0, + "step": 67000 + }, + { + "entropy": 1.226875, + "epoch": 1.641692375495813, + "grad_norm": 2.78125, + "learning_rate": 5.113541595047853e-07, + "loss": 0.1638, + "mean_token_accuracy": 0.9646450591087341, + "num_tokens": 7100017216.0, + "step": 67050 + }, + { + "entropy": 1.22828125, + "epoch": 1.6429166054551687, + "grad_norm": 2.515625, + "learning_rate": 5.079680304730336e-07, + "loss": 0.1632, + "mean_token_accuracy": 0.9642895436286927, + "num_tokens": 7105647390.0, + "step": 67100 + }, + { + "entropy": 1.2190625, + "epoch": 1.6441408354145244, + "grad_norm": 3.1875, + "learning_rate": 5.045921127396446e-07, + "loss": 0.1568, + "mean_token_accuracy": 0.9664517366886138, + "num_tokens": 7111038795.0, + "step": 67150 + }, + { + "entropy": 1.20453125, + "epoch": 1.6453650653738798, + "grad_norm": 3.5625, + "learning_rate": 5.012264201377073e-07, + "loss": 0.1546, + "mean_token_accuracy": 0.9667070829868316, + "num_tokens": 7116213641.0, + "step": 67200 + }, + { + "entropy": 1.22828125, + "epoch": 1.6465892953332353, + "grad_norm": 2.484375, + "learning_rate": 4.978709664584132e-07, + "loss": 0.1502, + "mean_token_accuracy": 0.9669265413284301, + "num_tokens": 7121369080.0, + "step": 67250 + }, + { + "entropy": 1.2240625, + "epoch": 1.647813525292591, + "grad_norm": 2.328125, + "learning_rate": 4.945257654510013e-07, + "loss": 0.1614, + "mean_token_accuracy": 0.966176050901413, + "num_tokens": 7126738052.0, + "step": 67300 + }, + { + "entropy": 1.21375, + "epoch": 1.6490377552519466, + "grad_norm": 3.21875, + "learning_rate": 4.911908308226965e-07, + "loss": 0.1425, + "mean_token_accuracy": 0.969027806520462, + "num_tokens": 7131902692.0, + "step": 67350 + }, + { + "entropy": 1.20609375, + "epoch": 1.650261985211302, + "grad_norm": 2.46875, + "learning_rate": 4.878661762386575e-07, + "loss": 0.1494, + "mean_token_accuracy": 0.966635344028473, + "num_tokens": 7136808281.0, + "step": 67400 + }, + { + "entropy": 1.2134375, + "epoch": 1.6514862151706575, + "grad_norm": 3.921875, + "learning_rate": 4.845518153219194e-07, + "loss": 0.1536, + "mean_token_accuracy": 0.9664989912509918, + "num_tokens": 7141996551.0, + "step": 67450 + }, + { + "entropy": 1.2096875, + "epoch": 1.6527104451300132, + "grad_norm": 2.875, + "learning_rate": 4.812477616533406e-07, + "loss": 0.1517, + "mean_token_accuracy": 0.9665413784980774, + "num_tokens": 7146993092.0, + "step": 67500 + }, + { + "entropy": 1.209375, + "epoch": 1.6539346750893689, + "grad_norm": 3.3125, + "learning_rate": 4.779540287715394e-07, + "loss": 0.1583, + "mean_token_accuracy": 0.965690256357193, + "num_tokens": 7152324580.0, + "step": 67550 + }, + { + "entropy": 1.2259375, + "epoch": 1.6551589050487243, + "grad_norm": 3.828125, + "learning_rate": 4.7467063017285005e-07, + "loss": 0.1632, + "mean_token_accuracy": 0.9648753714561462, + "num_tokens": 7157642715.0, + "step": 67600 + }, + { + "entropy": 1.21328125, + "epoch": 1.6563831350080798, + "grad_norm": 3.4375, + "learning_rate": 4.713975793112569e-07, + "loss": 0.1542, + "mean_token_accuracy": 0.9669430148601532, + "num_tokens": 7162998030.0, + "step": 67650 + }, + { + "entropy": 1.185625, + "epoch": 1.6576073649674354, + "grad_norm": 3.953125, + "learning_rate": 4.681348895983448e-07, + "loss": 0.1379, + "mean_token_accuracy": 0.9700025701522828, + "num_tokens": 7167607013.0, + "step": 67700 + }, + { + "entropy": 1.2225, + "epoch": 1.658831594926791, + "grad_norm": 2.359375, + "learning_rate": 4.648825744032449e-07, + "loss": 0.1614, + "mean_token_accuracy": 0.9637822723388672, + "num_tokens": 7172916071.0, + "step": 67750 + }, + { + "entropy": 1.22109375, + "epoch": 1.6600558248861468, + "grad_norm": 0.003997802734375, + "learning_rate": 4.6164064705257424e-07, + "loss": 0.1604, + "mean_token_accuracy": 0.9653963768482208, + "num_tokens": 7178344100.0, + "step": 67800 + }, + { + "entropy": 1.21921875, + "epoch": 1.6612800548455022, + "grad_norm": 2.453125, + "learning_rate": 4.584091208303891e-07, + "loss": 0.1583, + "mean_token_accuracy": 0.9654520618915557, + "num_tokens": 7183547126.0, + "step": 67850 + }, + { + "entropy": 1.2121875, + "epoch": 1.6625042848048577, + "grad_norm": 1.7578125, + "learning_rate": 4.5518800897812174e-07, + "loss": 0.1521, + "mean_token_accuracy": 0.9661059749126434, + "num_tokens": 7188532212.0, + "step": 67900 + }, + { + "entropy": 1.2209375, + "epoch": 1.6637285147642134, + "grad_norm": 2.734375, + "learning_rate": 4.519773246945349e-07, + "loss": 0.1576, + "mean_token_accuracy": 0.9657674777507782, + "num_tokens": 7193693940.0, + "step": 67950 + }, + { + "entropy": 1.23375, + "epoch": 1.664952744723569, + "grad_norm": 2.953125, + "learning_rate": 4.487770811356612e-07, + "loss": 0.1664, + "mean_token_accuracy": 0.9635096192359924, + "num_tokens": 7199191726.0, + "step": 68000 + }, + { + "entropy": 1.21625, + "epoch": 1.6661769746829245, + "grad_norm": 3.03125, + "learning_rate": 4.455872914147521e-07, + "loss": 0.1614, + "mean_token_accuracy": 0.965412712097168, + "num_tokens": 7204740271.0, + "step": 68050 + }, + { + "entropy": 1.2178125, + "epoch": 1.66740120464228, + "grad_norm": 1.96875, + "learning_rate": 4.424079686022223e-07, + "loss": 0.1647, + "mean_token_accuracy": 0.9641766202449799, + "num_tokens": 7210407120.0, + "step": 68100 + }, + { + "entropy": 1.22875, + "epoch": 1.6686254346016356, + "grad_norm": 2.953125, + "learning_rate": 4.39239125725601e-07, + "loss": 0.162, + "mean_token_accuracy": 0.9659585297107697, + "num_tokens": 7215783474.0, + "step": 68150 + }, + { + "entropy": 1.226875, + "epoch": 1.6698496645609913, + "grad_norm": 2.4375, + "learning_rate": 4.360807757694718e-07, + "loss": 0.1626, + "mean_token_accuracy": 0.9646227335929871, + "num_tokens": 7220993281.0, + "step": 68200 + }, + { + "entropy": 1.19703125, + "epoch": 1.6710738945203467, + "grad_norm": 2.0625, + "learning_rate": 4.329329316754236e-07, + "loss": 0.1441, + "mean_token_accuracy": 0.9685395467281341, + "num_tokens": 7225810836.0, + "step": 68250 + }, + { + "entropy": 1.21875, + "epoch": 1.6722981244797022, + "grad_norm": 2.984375, + "learning_rate": 4.2979560634199754e-07, + "loss": 0.1688, + "mean_token_accuracy": 0.9636458623409271, + "num_tokens": 7231649459.0, + "step": 68300 + }, + { + "entropy": 1.19296875, + "epoch": 1.6735223544390578, + "grad_norm": 2.65625, + "learning_rate": 4.266688126246311e-07, + "loss": 0.1424, + "mean_token_accuracy": 0.9688647317886353, + "num_tokens": 7236848069.0, + "step": 68350 + }, + { + "entropy": 1.2278125, + "epoch": 1.6747465843984135, + "grad_norm": 3.203125, + "learning_rate": 4.235525633356111e-07, + "loss": 0.1676, + "mean_token_accuracy": 0.963608900308609, + "num_tokens": 7242384952.0, + "step": 68400 + }, + { + "entropy": 1.238125, + "epoch": 1.675970814357769, + "grad_norm": 3.359375, + "learning_rate": 4.204468712440144e-07, + "loss": 0.1653, + "mean_token_accuracy": 0.9638743424415588, + "num_tokens": 7247699380.0, + "step": 68450 + }, + { + "entropy": 1.21671875, + "epoch": 1.6771950443171244, + "grad_norm": 3.015625, + "learning_rate": 4.1735174907566234e-07, + "loss": 0.1507, + "mean_token_accuracy": 0.9674655389785767, + "num_tokens": 7252973599.0, + "step": 68500 + }, + { + "entropy": 1.2109375, + "epoch": 1.67841927427648, + "grad_norm": 2.4375, + "learning_rate": 4.142672095130603e-07, + "loss": 0.1488, + "mean_token_accuracy": 0.9676065123081208, + "num_tokens": 7257981736.0, + "step": 68550 + }, + { + "entropy": 1.2084375, + "epoch": 1.6796435042358357, + "grad_norm": 2.765625, + "learning_rate": 4.111932651953554e-07, + "loss": 0.1537, + "mean_token_accuracy": 0.9668715631961823, + "num_tokens": 7263067623.0, + "step": 68600 + }, + { + "entropy": 1.2253125, + "epoch": 1.6808677341951912, + "grad_norm": 1.953125, + "learning_rate": 4.0812992871827737e-07, + "loss": 0.1514, + "mean_token_accuracy": 0.967187968492508, + "num_tokens": 7268515412.0, + "step": 68650 + }, + { + "entropy": 1.2240625, + "epoch": 1.6820919641545466, + "grad_norm": 2.1875, + "learning_rate": 4.0507721263409016e-07, + "loss": 0.155, + "mean_token_accuracy": 0.9657605230808258, + "num_tokens": 7273767424.0, + "step": 68700 + }, + { + "entropy": 1.21890625, + "epoch": 1.6833161941139023, + "grad_norm": 2.078125, + "learning_rate": 4.0203512945153874e-07, + "loss": 0.1501, + "mean_token_accuracy": 0.9671496486663819, + "num_tokens": 7279187672.0, + "step": 68750 + }, + { + "entropy": 1.20953125, + "epoch": 1.684540424073258, + "grad_norm": 3.0625, + "learning_rate": 3.990036916358014e-07, + "loss": 0.1466, + "mean_token_accuracy": 0.9685079550743103, + "num_tokens": 7284104561.0, + "step": 68800 + }, + { + "entropy": 1.21328125, + "epoch": 1.6857646540326134, + "grad_norm": 4.0625, + "learning_rate": 3.9598291160843393e-07, + "loss": 0.1557, + "mean_token_accuracy": 0.9655941009521485, + "num_tokens": 7289492586.0, + "step": 68850 + }, + { + "entropy": 1.18875, + "epoch": 1.686988883991969, + "grad_norm": 1.875, + "learning_rate": 3.929728017473213e-07, + "loss": 0.14, + "mean_token_accuracy": 0.969061805009842, + "num_tokens": 7294671673.0, + "step": 68900 + }, + { + "entropy": 1.21671875, + "epoch": 1.6882131139513246, + "grad_norm": 1.578125, + "learning_rate": 3.8997337438662893e-07, + "loss": 0.1628, + "mean_token_accuracy": 0.9643185365200043, + "num_tokens": 7300014488.0, + "step": 68950 + }, + { + "entropy": 1.22359375, + "epoch": 1.6894373439106802, + "grad_norm": 0.01251220703125, + "learning_rate": 3.869846418167452e-07, + "loss": 0.1521, + "mean_token_accuracy": 0.9664946186542511, + "num_tokens": 7305132050.0, + "step": 69000 + }, + { + "entropy": 1.21640625, + "epoch": 1.690661573870036, + "grad_norm": 2.6875, + "learning_rate": 3.840066162842405e-07, + "loss": 0.1518, + "mean_token_accuracy": 0.9676698422431946, + "num_tokens": 7310341663.0, + "step": 69050 + }, + { + "entropy": 1.22984375, + "epoch": 1.6918858038293914, + "grad_norm": 3.625, + "learning_rate": 3.8103930999180936e-07, + "loss": 0.1685, + "mean_token_accuracy": 0.963647495508194, + "num_tokens": 7315713992.0, + "step": 69100 + }, + { + "entropy": 1.2271875, + "epoch": 1.6931100337887468, + "grad_norm": 2.5625, + "learning_rate": 3.780827350982258e-07, + "loss": 0.1558, + "mean_token_accuracy": 0.9662664186954498, + "num_tokens": 7321152260.0, + "step": 69150 + }, + { + "entropy": 1.21296875, + "epoch": 1.6943342637481025, + "grad_norm": 2.390625, + "learning_rate": 3.751369037182869e-07, + "loss": 0.1532, + "mean_token_accuracy": 0.9662709140777588, + "num_tokens": 7326190569.0, + "step": 69200 + }, + { + "entropy": 1.198125, + "epoch": 1.6955584937074581, + "grad_norm": 2.9375, + "learning_rate": 3.722018279227728e-07, + "loss": 0.1412, + "mean_token_accuracy": 0.9689172983169556, + "num_tokens": 7331368151.0, + "step": 69250 + }, + { + "entropy": 1.21125, + "epoch": 1.6967827236668136, + "grad_norm": 3.25, + "learning_rate": 3.6927751973838777e-07, + "loss": 0.1578, + "mean_token_accuracy": 0.9661315476894379, + "num_tokens": 7336566118.0, + "step": 69300 + }, + { + "entropy": 1.2215625, + "epoch": 1.698006953626169, + "grad_norm": 1.765625, + "learning_rate": 3.66363991147716e-07, + "loss": 0.1577, + "mean_token_accuracy": 0.9653751969337463, + "num_tokens": 7341728443.0, + "step": 69350 + }, + { + "entropy": 1.20796875, + "epoch": 1.6992311835855247, + "grad_norm": 2.53125, + "learning_rate": 3.6346125408917155e-07, + "loss": 0.1497, + "mean_token_accuracy": 0.9668842852115631, + "num_tokens": 7346956092.0, + "step": 69400 + }, + { + "entropy": 1.216875, + "epoch": 1.7004554135448804, + "grad_norm": 3.15625, + "learning_rate": 3.605693204569506e-07, + "loss": 0.1547, + "mean_token_accuracy": 0.967246618270874, + "num_tokens": 7352423947.0, + "step": 69450 + }, + { + "entropy": 1.2075, + "epoch": 1.7016796435042358, + "grad_norm": 2.46875, + "learning_rate": 3.576882021009792e-07, + "loss": 0.1489, + "mean_token_accuracy": 0.9667674267292022, + "num_tokens": 7357669096.0, + "step": 69500 + }, + { + "entropy": 1.19796875, + "epoch": 1.7029038734635913, + "grad_norm": 2.15625, + "learning_rate": 3.5481791082686757e-07, + "loss": 0.1421, + "mean_token_accuracy": 0.9695830595493317, + "num_tokens": 7362784518.0, + "step": 69550 + }, + { + "entropy": 1.2278125, + "epoch": 1.704128103422947, + "grad_norm": 2.15625, + "learning_rate": 3.519584583958636e-07, + "loss": 0.162, + "mean_token_accuracy": 0.9651164734363555, + "num_tokens": 7368275670.0, + "step": 69600 + }, + { + "entropy": 1.21578125, + "epoch": 1.7053523333823026, + "grad_norm": 2.640625, + "learning_rate": 3.4910985652479757e-07, + "loss": 0.1506, + "mean_token_accuracy": 0.9667972207069397, + "num_tokens": 7373607544.0, + "step": 69650 + }, + { + "entropy": 1.20625, + "epoch": 1.706576563341658, + "grad_norm": 4.71875, + "learning_rate": 3.462721168860428e-07, + "loss": 0.1492, + "mean_token_accuracy": 0.9675750434398651, + "num_tokens": 7378823181.0, + "step": 69700 + }, + { + "entropy": 1.2265625, + "epoch": 1.7078007933010135, + "grad_norm": 2.84375, + "learning_rate": 3.4344525110746127e-07, + "loss": 0.1603, + "mean_token_accuracy": 0.965987560749054, + "num_tokens": 7384384951.0, + "step": 69750 + }, + { + "entropy": 1.21953125, + "epoch": 1.7090250232603692, + "grad_norm": 1.640625, + "learning_rate": 3.4062927077236106e-07, + "loss": 0.1574, + "mean_token_accuracy": 0.9660314428806305, + "num_tokens": 7389942384.0, + "step": 69800 + }, + { + "entropy": 1.21640625, + "epoch": 1.7102492532197249, + "grad_norm": 2.109375, + "learning_rate": 3.3782418741944244e-07, + "loss": 0.1629, + "mean_token_accuracy": 0.9638810443878174, + "num_tokens": 7395323756.0, + "step": 69850 + }, + { + "entropy": 1.20765625, + "epoch": 1.7114734831790803, + "grad_norm": 2.625, + "learning_rate": 3.350300125427578e-07, + "loss": 0.1384, + "mean_token_accuracy": 0.9689883410930633, + "num_tokens": 7400575411.0, + "step": 69900 + }, + { + "entropy": 1.20546875, + "epoch": 1.7126977131384358, + "grad_norm": 3.109375, + "learning_rate": 3.3224675759166026e-07, + "loss": 0.1515, + "mean_token_accuracy": 0.9666663575172424, + "num_tokens": 7405984120.0, + "step": 69950 + }, + { + "entropy": 1.2203125, + "epoch": 1.7139219430977914, + "grad_norm": 2.328125, + "learning_rate": 3.294744339707564e-07, + "loss": 0.1566, + "mean_token_accuracy": 0.9662071549892426, + "num_tokens": 7411306216.0, + "step": 70000 + }, + { + "epoch": 1.7139219430977914, + "eval_entropy": 1.2108072916666666, + "eval_loss": 0.17756883800029755, + "eval_mean_token_accuracy": 0.9620932574073474, + "eval_num_tokens": 7411306216.0, + "eval_runtime": 601.9385, + "eval_samples_per_second": 16.042, + "eval_steps_per_second": 0.201, + "step": 70000 + }, + { + "entropy": 1.21734375, + "epoch": 1.7151461730571471, + "grad_norm": 0.0033111572265625, + "learning_rate": 3.2671305303986264e-07, + "loss": 0.1546, + "mean_token_accuracy": 0.9665888488292694, + "num_tokens": 7416539172.0, + "step": 70050 + }, + { + "entropy": 1.21734375, + "epoch": 1.7163704030165026, + "grad_norm": 2.84375, + "learning_rate": 3.23962626113956e-07, + "loss": 0.151, + "mean_token_accuracy": 0.9668701207637787, + "num_tokens": 7421707836.0, + "step": 70100 + }, + { + "entropy": 1.20390625, + "epoch": 1.7175946329758582, + "grad_norm": 2.875, + "learning_rate": 3.212231644631286e-07, + "loss": 0.1522, + "mean_token_accuracy": 0.967432736158371, + "num_tokens": 7427044054.0, + "step": 70150 + }, + { + "entropy": 1.1990625, + "epoch": 1.7188188629352137, + "grad_norm": 2.234375, + "learning_rate": 3.184946793125406e-07, + "loss": 0.1454, + "mean_token_accuracy": 0.9683572733402253, + "num_tokens": 7432165156.0, + "step": 70200 + }, + { + "entropy": 1.22375, + "epoch": 1.7200430928945694, + "grad_norm": 3.15625, + "learning_rate": 3.157771818423778e-07, + "loss": 0.1574, + "mean_token_accuracy": 0.9646234130859375, + "num_tokens": 7437729163.0, + "step": 70250 + }, + { + "entropy": 1.2253125, + "epoch": 1.721267322853925, + "grad_norm": 1.78125, + "learning_rate": 3.130706831877993e-07, + "loss": 0.1583, + "mean_token_accuracy": 0.965836591720581, + "num_tokens": 7443255376.0, + "step": 70300 + }, + { + "entropy": 1.21734375, + "epoch": 1.7224915528132805, + "grad_norm": 3.8125, + "learning_rate": 3.1037519443889927e-07, + "loss": 0.1502, + "mean_token_accuracy": 0.967227201461792, + "num_tokens": 7448723374.0, + "step": 70350 + }, + { + "entropy": 1.1978125, + "epoch": 1.723715782772636, + "grad_norm": 2.15625, + "learning_rate": 3.07690726640655e-07, + "loss": 0.1386, + "mean_token_accuracy": 0.9692979896068573, + "num_tokens": 7453945048.0, + "step": 70400 + }, + { + "entropy": 1.21671875, + "epoch": 1.7249400127319916, + "grad_norm": 3.359375, + "learning_rate": 3.050172907928872e-07, + "loss": 0.1601, + "mean_token_accuracy": 0.9648488080501556, + "num_tokens": 7459709955.0, + "step": 70450 + }, + { + "entropy": 1.194375, + "epoch": 1.7261642426913473, + "grad_norm": 1.2109375, + "learning_rate": 3.0235489785021073e-07, + "loss": 0.1429, + "mean_token_accuracy": 0.968617148399353, + "num_tokens": 7464731391.0, + "step": 70500 + }, + { + "entropy": 1.21328125, + "epoch": 1.7273884726507027, + "grad_norm": 4.1875, + "learning_rate": 2.997035587219911e-07, + "loss": 0.1509, + "mean_token_accuracy": 0.9667483043670654, + "num_tokens": 7470148354.0, + "step": 70550 + }, + { + "entropy": 1.21015625, + "epoch": 1.7286127026100582, + "grad_norm": 2.890625, + "learning_rate": 2.970632842723001e-07, + "loss": 0.1537, + "mean_token_accuracy": 0.9668030095100403, + "num_tokens": 7475597114.0, + "step": 70600 + }, + { + "entropy": 1.21203125, + "epoch": 1.7298369325694138, + "grad_norm": 1.78125, + "learning_rate": 2.944340853198715e-07, + "loss": 0.1489, + "mean_token_accuracy": 0.9677174651622772, + "num_tokens": 7480924480.0, + "step": 70650 + }, + { + "entropy": 1.1978125, + "epoch": 1.7310611625287695, + "grad_norm": 2.578125, + "learning_rate": 2.9181597263805703e-07, + "loss": 0.1381, + "mean_token_accuracy": 0.9692902910709381, + "num_tokens": 7485944672.0, + "step": 70700 + }, + { + "entropy": 1.2234375, + "epoch": 1.732285392488125, + "grad_norm": 3.15625, + "learning_rate": 2.8920895695478036e-07, + "loss": 0.1575, + "mean_token_accuracy": 0.9657765531539917, + "num_tokens": 7491484223.0, + "step": 70750 + }, + { + "entropy": 1.21984375, + "epoch": 1.7335096224474804, + "grad_norm": 1.640625, + "learning_rate": 2.866130489524946e-07, + "loss": 0.1497, + "mean_token_accuracy": 0.9674056577682495, + "num_tokens": 7496915236.0, + "step": 70800 + }, + { + "entropy": 1.2109375, + "epoch": 1.734733852406836, + "grad_norm": 1.9375, + "learning_rate": 2.8402825926813793e-07, + "loss": 0.1541, + "mean_token_accuracy": 0.9666642725467682, + "num_tokens": 7502068005.0, + "step": 70850 + }, + { + "entropy": 1.22796875, + "epoch": 1.7359580823661918, + "grad_norm": 1.171875, + "learning_rate": 2.814545984930923e-07, + "loss": 0.1643, + "mean_token_accuracy": 0.9640646266937256, + "num_tokens": 7507947357.0, + "step": 70900 + }, + { + "entropy": 1.2171875, + "epoch": 1.7371823123255472, + "grad_norm": 3.78125, + "learning_rate": 2.788920771731344e-07, + "loss": 0.1515, + "mean_token_accuracy": 0.96691251039505, + "num_tokens": 7513464788.0, + "step": 70950 + }, + { + "entropy": 1.21421875, + "epoch": 1.7384065422849027, + "grad_norm": 2.828125, + "learning_rate": 2.763407058083999e-07, + "loss": 0.1562, + "mean_token_accuracy": 0.9653972661495209, + "num_tokens": 7518965009.0, + "step": 71000 + }, + { + "entropy": 1.22109375, + "epoch": 1.7396307722442583, + "grad_norm": 3.09375, + "learning_rate": 2.738004948533338e-07, + "loss": 0.1553, + "mean_token_accuracy": 0.9661720776557923, + "num_tokens": 7524509007.0, + "step": 71050 + }, + { + "entropy": 1.2178125, + "epoch": 1.740855002203614, + "grad_norm": 2.640625, + "learning_rate": 2.712714547166534e-07, + "loss": 0.1494, + "mean_token_accuracy": 0.9680777621269226, + "num_tokens": 7529983645.0, + "step": 71100 + }, + { + "entropy": 1.22078125, + "epoch": 1.7420792321629694, + "grad_norm": 2.640625, + "learning_rate": 2.6875359576129975e-07, + "loss": 0.1604, + "mean_token_accuracy": 0.9644283270835876, + "num_tokens": 7535464039.0, + "step": 71150 + }, + { + "entropy": 1.206875, + "epoch": 1.743303462122325, + "grad_norm": 1.609375, + "learning_rate": 2.662469283043991e-07, + "loss": 0.1434, + "mean_token_accuracy": 0.9683542418479919, + "num_tokens": 7540523414.0, + "step": 71200 + }, + { + "entropy": 1.214375, + "epoch": 1.7445276920816806, + "grad_norm": 2.953125, + "learning_rate": 2.637514626172213e-07, + "loss": 0.1549, + "mean_token_accuracy": 0.9665893888473511, + "num_tokens": 7545849728.0, + "step": 71250 + }, + { + "entropy": 1.2040625, + "epoch": 1.7457519220410362, + "grad_norm": 2.765625, + "learning_rate": 2.6126720892513277e-07, + "loss": 0.1487, + "mean_token_accuracy": 0.9680774366855621, + "num_tokens": 7551159210.0, + "step": 71300 + }, + { + "entropy": 1.19421875, + "epoch": 1.746976152000392, + "grad_norm": 2.640625, + "learning_rate": 2.5879417740756093e-07, + "loss": 0.1363, + "mean_token_accuracy": 0.9701401054859161, + "num_tokens": 7556078762.0, + "step": 71350 + }, + { + "entropy": 1.218125, + "epoch": 1.7482003819597474, + "grad_norm": 1.5625, + "learning_rate": 2.563323781979482e-07, + "loss": 0.1656, + "mean_token_accuracy": 0.9642888736724854, + "num_tokens": 7561736323.0, + "step": 71400 + }, + { + "entropy": 1.21859375, + "epoch": 1.7494246119191028, + "grad_norm": 1.9609375, + "learning_rate": 2.5388182138371173e-07, + "loss": 0.1517, + "mean_token_accuracy": 0.966708824634552, + "num_tokens": 7567328811.0, + "step": 71450 + }, + { + "entropy": 1.22109375, + "epoch": 1.7506488418784585, + "grad_norm": 2.3125, + "learning_rate": 2.5144251700620135e-07, + "loss": 0.1629, + "mean_token_accuracy": 0.9650636351108551, + "num_tokens": 7572752827.0, + "step": 71500 + }, + { + "entropy": 1.21, + "epoch": 1.7518730718378142, + "grad_norm": 2.78125, + "learning_rate": 2.4901447506066133e-07, + "loss": 0.1599, + "mean_token_accuracy": 0.9643032836914063, + "num_tokens": 7578362509.0, + "step": 71550 + }, + { + "entropy": 1.2090625, + "epoch": 1.7530973017971696, + "grad_norm": 1.6484375, + "learning_rate": 2.465977054961852e-07, + "loss": 0.1493, + "mean_token_accuracy": 0.9673759829998017, + "num_tokens": 7583839931.0, + "step": 71600 + }, + { + "entropy": 1.21171875, + "epoch": 1.754321531756525, + "grad_norm": 2.828125, + "learning_rate": 2.441922182156775e-07, + "loss": 0.1518, + "mean_token_accuracy": 0.9662256014347076, + "num_tokens": 7589236608.0, + "step": 71650 + }, + { + "entropy": 1.209375, + "epoch": 1.7555457617158807, + "grad_norm": 2.890625, + "learning_rate": 2.4179802307581234e-07, + "loss": 0.1495, + "mean_token_accuracy": 0.9674426424503326, + "num_tokens": 7594652077.0, + "step": 71700 + }, + { + "entropy": 1.20265625, + "epoch": 1.7567699916752364, + "grad_norm": 2.96875, + "learning_rate": 2.394151298869952e-07, + "loss": 0.1451, + "mean_token_accuracy": 0.9673744821548462, + "num_tokens": 7599701409.0, + "step": 71750 + }, + { + "entropy": 1.2153125, + "epoch": 1.7579942216345918, + "grad_norm": 2.71875, + "learning_rate": 2.3704354841331932e-07, + "loss": 0.1505, + "mean_token_accuracy": 0.9669674754142761, + "num_tokens": 7605091932.0, + "step": 71800 + }, + { + "entropy": 1.2065625, + "epoch": 1.7592184515939473, + "grad_norm": 2.1875, + "learning_rate": 2.3468328837252628e-07, + "loss": 0.1478, + "mean_token_accuracy": 0.9676505529880524, + "num_tokens": 7610186489.0, + "step": 71850 + }, + { + "entropy": 1.20890625, + "epoch": 1.760442681553303, + "grad_norm": 1.765625, + "learning_rate": 2.3233435943597114e-07, + "loss": 0.1503, + "mean_token_accuracy": 0.9671880280971528, + "num_tokens": 7615665531.0, + "step": 71900 + }, + { + "entropy": 1.20375, + "epoch": 1.7616669115126586, + "grad_norm": 2.453125, + "learning_rate": 2.299967712285731e-07, + "loss": 0.1423, + "mean_token_accuracy": 0.9683215701580048, + "num_tokens": 7620773654.0, + "step": 71950 + }, + { + "entropy": 1.19234375, + "epoch": 1.762891141472014, + "grad_norm": 3.140625, + "learning_rate": 2.276705333287875e-07, + "loss": 0.1315, + "mean_token_accuracy": 0.9702609395980835, + "num_tokens": 7625470551.0, + "step": 72000 + }, + { + "entropy": 1.21046875, + "epoch": 1.7641153714313695, + "grad_norm": 2.234375, + "learning_rate": 2.253556552685573e-07, + "loss": 0.1433, + "mean_token_accuracy": 0.9681813132762909, + "num_tokens": 7630517430.0, + "step": 72050 + }, + { + "entropy": 1.21, + "epoch": 1.7653396013907252, + "grad_norm": 3.125, + "learning_rate": 2.2305214653327855e-07, + "loss": 0.1406, + "mean_token_accuracy": 0.9686529791355133, + "num_tokens": 7635763079.0, + "step": 72100 + }, + { + "entropy": 1.201875, + "epoch": 1.7665638313500809, + "grad_norm": 1.703125, + "learning_rate": 2.207600165617607e-07, + "loss": 0.1475, + "mean_token_accuracy": 0.9678330075740814, + "num_tokens": 7641423146.0, + "step": 72150 + }, + { + "entropy": 1.176875, + "epoch": 1.7677880613094363, + "grad_norm": 1.8125, + "learning_rate": 2.1847927474618846e-07, + "loss": 0.1314, + "mean_token_accuracy": 0.9702327287197113, + "num_tokens": 7646275038.0, + "step": 72200 + }, + { + "entropy": 1.205, + "epoch": 1.7690122912687918, + "grad_norm": 1.515625, + "learning_rate": 2.1620993043208182e-07, + "loss": 0.1371, + "mean_token_accuracy": 0.9702345824241638, + "num_tokens": 7651591457.0, + "step": 72250 + }, + { + "entropy": 1.2225, + "epoch": 1.7702365212281475, + "grad_norm": 1.6796875, + "learning_rate": 2.139519929182585e-07, + "loss": 0.1507, + "mean_token_accuracy": 0.9666866302490235, + "num_tokens": 7656975261.0, + "step": 72300 + }, + { + "entropy": 1.1996875, + "epoch": 1.7714607511875031, + "grad_norm": 2.46875, + "learning_rate": 2.1170547145679665e-07, + "loss": 0.1492, + "mean_token_accuracy": 0.966531822681427, + "num_tokens": 7662430438.0, + "step": 72350 + }, + { + "entropy": 1.21703125, + "epoch": 1.7726849811468586, + "grad_norm": 1.8203125, + "learning_rate": 2.0947037525299606e-07, + "loss": 0.1501, + "mean_token_accuracy": 0.9673058640956879, + "num_tokens": 7667987024.0, + "step": 72400 + }, + { + "entropy": 1.20890625, + "epoch": 1.7739092111062142, + "grad_norm": 2.640625, + "learning_rate": 2.0724671346533975e-07, + "loss": 0.1483, + "mean_token_accuracy": 0.9672919237613677, + "num_tokens": 7673092874.0, + "step": 72450 + }, + { + "entropy": 1.21171875, + "epoch": 1.7751334410655697, + "grad_norm": 2.421875, + "learning_rate": 2.0503449520545814e-07, + "loss": 0.1454, + "mean_token_accuracy": 0.9677470910549164, + "num_tokens": 7678350890.0, + "step": 72500 + }, + { + "entropy": 1.21125, + "epoch": 1.7763576710249254, + "grad_norm": 4.03125, + "learning_rate": 2.0283372953809187e-07, + "loss": 0.1506, + "mean_token_accuracy": 0.9673129177093506, + "num_tokens": 7683768054.0, + "step": 72550 + }, + { + "entropy": 1.19046875, + "epoch": 1.777581900984281, + "grad_norm": 0.010009765625, + "learning_rate": 2.0064442548105078e-07, + "loss": 0.1311, + "mean_token_accuracy": 0.9706909394264222, + "num_tokens": 7688732517.0, + "step": 72600 + }, + { + "entropy": 1.20234375, + "epoch": 1.7788061309436365, + "grad_norm": 2.625, + "learning_rate": 1.9846659200518323e-07, + "loss": 0.1443, + "mean_token_accuracy": 0.9685131824016571, + "num_tokens": 7693833105.0, + "step": 72650 + }, + { + "entropy": 1.1996875, + "epoch": 1.780030360902992, + "grad_norm": 2.8125, + "learning_rate": 1.963002380343336e-07, + "loss": 0.1372, + "mean_token_accuracy": 0.9696123468875885, + "num_tokens": 7698671416.0, + "step": 72700 + }, + { + "entropy": 1.2096875, + "epoch": 1.7812545908623476, + "grad_norm": 3.46875, + "learning_rate": 1.9414537244530883e-07, + "loss": 0.1447, + "mean_token_accuracy": 0.9681323492527008, + "num_tokens": 7704099695.0, + "step": 72750 + }, + { + "entropy": 1.209375, + "epoch": 1.7824788208217033, + "grad_norm": 3.8125, + "learning_rate": 1.9200200406784084e-07, + "loss": 0.1471, + "mean_token_accuracy": 0.9671408832073212, + "num_tokens": 7709413054.0, + "step": 72800 + }, + { + "entropy": 1.22046875, + "epoch": 1.7837030507810587, + "grad_norm": 2.375, + "learning_rate": 1.8987014168455263e-07, + "loss": 0.1513, + "mean_token_accuracy": 0.9667081344127655, + "num_tokens": 7714999778.0, + "step": 72850 + }, + { + "entropy": 1.21765625, + "epoch": 1.7849272807404142, + "grad_norm": 1.59375, + "learning_rate": 1.8774979403091852e-07, + "loss": 0.1467, + "mean_token_accuracy": 0.9685576283931732, + "num_tokens": 7720722054.0, + "step": 72900 + }, + { + "entropy": 1.18796875, + "epoch": 1.7861515106997699, + "grad_norm": 3.015625, + "learning_rate": 1.8564096979523027e-07, + "loss": 0.1448, + "mean_token_accuracy": 0.9685378670692444, + "num_tokens": 7726037284.0, + "step": 72950 + }, + { + "entropy": 1.21359375, + "epoch": 1.7873757406591255, + "grad_norm": 2.75, + "learning_rate": 1.835436776185634e-07, + "loss": 0.1305, + "mean_token_accuracy": 0.9697797727584839, + "num_tokens": 7731254143.0, + "step": 73000 + }, + { + "entropy": 1.189375, + "epoch": 1.788599970618481, + "grad_norm": 2.71875, + "learning_rate": 1.814579260947379e-07, + "loss": 0.1367, + "mean_token_accuracy": 0.969087952375412, + "num_tokens": 7736558719.0, + "step": 73050 + }, + { + "entropy": 1.20109375, + "epoch": 1.7898242005778364, + "grad_norm": 2.640625, + "learning_rate": 1.7938372377028622e-07, + "loss": 0.1265, + "mean_token_accuracy": 0.9715298664569855, + "num_tokens": 7741441296.0, + "step": 73100 + }, + { + "entropy": 1.1953125, + "epoch": 1.791048430537192, + "grad_norm": 2.078125, + "learning_rate": 1.773210791444161e-07, + "loss": 0.131, + "mean_token_accuracy": 0.9706771004199982, + "num_tokens": 7746461885.0, + "step": 73150 + }, + { + "entropy": 1.2090625, + "epoch": 1.7922726604965478, + "grad_norm": 3.375, + "learning_rate": 1.7527000066897837e-07, + "loss": 0.1469, + "mean_token_accuracy": 0.9673126399517059, + "num_tokens": 7752002392.0, + "step": 73200 + }, + { + "entropy": 1.1975, + "epoch": 1.7934968904559032, + "grad_norm": 1.5, + "learning_rate": 1.7323049674842783e-07, + "loss": 0.1437, + "mean_token_accuracy": 0.9683597016334534, + "num_tokens": 7756991548.0, + "step": 73250 + }, + { + "entropy": 1.2171875, + "epoch": 1.7947211204152587, + "grad_norm": 2.046875, + "learning_rate": 1.7120257573979492e-07, + "loss": 0.1454, + "mean_token_accuracy": 0.968316274881363, + "num_tokens": 7762203324.0, + "step": 73300 + }, + { + "entropy": 1.1959375, + "epoch": 1.7959453503746143, + "grad_norm": 2.109375, + "learning_rate": 1.6918624595264597e-07, + "loss": 0.1366, + "mean_token_accuracy": 0.9702933692932129, + "num_tokens": 7767460924.0, + "step": 73350 + }, + { + "entropy": 1.199375, + "epoch": 1.79716958033397, + "grad_norm": 2.265625, + "learning_rate": 1.671815156490517e-07, + "loss": 0.143, + "mean_token_accuracy": 0.9685783159732818, + "num_tokens": 7772824486.0, + "step": 73400 + }, + { + "entropy": 1.21921875, + "epoch": 1.7983938102933255, + "grad_norm": 2.953125, + "learning_rate": 1.651883930435535e-07, + "loss": 0.1362, + "mean_token_accuracy": 0.9696711504459381, + "num_tokens": 7778088634.0, + "step": 73450 + }, + { + "entropy": 1.2078125, + "epoch": 1.799618040252681, + "grad_norm": 0.004302978515625, + "learning_rate": 1.6320688630312908e-07, + "loss": 0.1363, + "mean_token_accuracy": 0.9695776212215423, + "num_tokens": 7783380087.0, + "step": 73500 + }, + { + "entropy": 1.22859375, + "epoch": 1.8008422702120366, + "grad_norm": 1.5625, + "learning_rate": 1.6123700354716032e-07, + "loss": 0.1559, + "mean_token_accuracy": 0.9663217055797577, + "num_tokens": 7789343726.0, + "step": 73550 + }, + { + "entropy": 1.21328125, + "epoch": 1.8020665001713922, + "grad_norm": 1.65625, + "learning_rate": 1.5927875284739546e-07, + "loss": 0.1356, + "mean_token_accuracy": 0.9702400255203247, + "num_tokens": 7794792440.0, + "step": 73600 + }, + { + "entropy": 1.21484375, + "epoch": 1.803290730130748, + "grad_norm": 1.71875, + "learning_rate": 1.5733214222792392e-07, + "loss": 0.1418, + "mean_token_accuracy": 0.9687067580223083, + "num_tokens": 7800254887.0, + "step": 73650 + }, + { + "entropy": 1.21421875, + "epoch": 1.8045149600901034, + "grad_norm": 3.625, + "learning_rate": 1.5539717966513623e-07, + "loss": 0.1361, + "mean_token_accuracy": 0.969369399547577, + "num_tokens": 7805607043.0, + "step": 73700 + }, + { + "entropy": 1.20984375, + "epoch": 1.8057391900494588, + "grad_norm": 2.609375, + "learning_rate": 1.5347387308769478e-07, + "loss": 0.1326, + "mean_token_accuracy": 0.9703532266616821, + "num_tokens": 7810964969.0, + "step": 73750 + }, + { + "entropy": 1.20515625, + "epoch": 1.8069634200088145, + "grad_norm": 2.234375, + "learning_rate": 1.5156223037649985e-07, + "loss": 0.1506, + "mean_token_accuracy": 0.9663440334796906, + "num_tokens": 7816484836.0, + "step": 73800 + }, + { + "entropy": 1.1890625, + "epoch": 1.8081876499681702, + "grad_norm": 3.03125, + "learning_rate": 1.4966225936465993e-07, + "loss": 0.1304, + "mean_token_accuracy": 0.9708381593227386, + "num_tokens": 7821459721.0, + "step": 73850 + }, + { + "entropy": 1.19953125, + "epoch": 1.8094118799275256, + "grad_norm": 2.1875, + "learning_rate": 1.4777396783745612e-07, + "loss": 0.128, + "mean_token_accuracy": 0.9713588643074036, + "num_tokens": 7826287539.0, + "step": 73900 + }, + { + "entropy": 1.1978125, + "epoch": 1.810636109886881, + "grad_norm": 2.15625, + "learning_rate": 1.4589736353231308e-07, + "loss": 0.1202, + "mean_token_accuracy": 0.9729771482944488, + "num_tokens": 7831387963.0, + "step": 73950 + }, + { + "entropy": 1.195, + "epoch": 1.8118603398462367, + "grad_norm": 2.296875, + "learning_rate": 1.4403245413876486e-07, + "loss": 0.1344, + "mean_token_accuracy": 0.9699731683731079, + "num_tokens": 7836315700.0, + "step": 74000 + }, + { + "entropy": 1.18796875, + "epoch": 1.8130845698055924, + "grad_norm": 2.296875, + "learning_rate": 1.4217924729842513e-07, + "loss": 0.1381, + "mean_token_accuracy": 0.9699892640113831, + "num_tokens": 7841453471.0, + "step": 74050 + }, + { + "entropy": 1.2075, + "epoch": 1.8143087997649479, + "grad_norm": 2.3125, + "learning_rate": 1.403377506049569e-07, + "loss": 0.1451, + "mean_token_accuracy": 0.9681575572490693, + "num_tokens": 7846798475.0, + "step": 74100 + }, + { + "entropy": 1.1890625, + "epoch": 1.8155330297243033, + "grad_norm": 3.328125, + "learning_rate": 1.385079716040376e-07, + "loss": 0.1253, + "mean_token_accuracy": 0.9720281398296357, + "num_tokens": 7851768429.0, + "step": 74150 + }, + { + "entropy": 1.19671875, + "epoch": 1.816757259683659, + "grad_norm": 2.40625, + "learning_rate": 1.3668991779333308e-07, + "loss": 0.1218, + "mean_token_accuracy": 0.9725555181503296, + "num_tokens": 7856881793.0, + "step": 74200 + }, + { + "entropy": 1.19890625, + "epoch": 1.8179814896430146, + "grad_norm": 1.8984375, + "learning_rate": 1.3488359662246087e-07, + "loss": 0.1272, + "mean_token_accuracy": 0.9715735244750977, + "num_tokens": 7861890257.0, + "step": 74250 + }, + { + "entropy": 1.20390625, + "epoch": 1.81920571960237, + "grad_norm": 1.90625, + "learning_rate": 1.3308901549296604e-07, + "loss": 0.1275, + "mean_token_accuracy": 0.9717478513717651, + "num_tokens": 7867074576.0, + "step": 74300 + }, + { + "entropy": 1.20203125, + "epoch": 1.8204299495617255, + "grad_norm": 2.46875, + "learning_rate": 1.3130618175828713e-07, + "loss": 0.1367, + "mean_token_accuracy": 0.9701256167888641, + "num_tokens": 7872381109.0, + "step": 74350 + }, + { + "entropy": 1.20828125, + "epoch": 1.8216541795210812, + "grad_norm": 3.359375, + "learning_rate": 1.2953510272372647e-07, + "loss": 0.1287, + "mean_token_accuracy": 0.9719671607017517, + "num_tokens": 7877881928.0, + "step": 74400 + }, + { + "entropy": 1.199375, + "epoch": 1.822878409480437, + "grad_norm": 2.59375, + "learning_rate": 1.2777578564641969e-07, + "loss": 0.1309, + "mean_token_accuracy": 0.9707298684120178, + "num_tokens": 7882820168.0, + "step": 74450 + }, + { + "entropy": 1.21734375, + "epoch": 1.8241026394397923, + "grad_norm": 2.546875, + "learning_rate": 1.2602823773530915e-07, + "loss": 0.1426, + "mean_token_accuracy": 0.9688560748100281, + "num_tokens": 7888372934.0, + "step": 74500 + }, + { + "entropy": 1.2046875, + "epoch": 1.8253268693991478, + "grad_norm": 2.703125, + "learning_rate": 1.2429246615111024e-07, + "loss": 0.1331, + "mean_token_accuracy": 0.970300270318985, + "num_tokens": 7893801088.0, + "step": 74550 + }, + { + "entropy": 1.21171875, + "epoch": 1.8265510993585035, + "grad_norm": 2.03125, + "learning_rate": 1.2256847800628425e-07, + "loss": 0.1223, + "mean_token_accuracy": 0.973189731836319, + "num_tokens": 7898852778.0, + "step": 74600 + }, + { + "entropy": 1.20671875, + "epoch": 1.8277753293178591, + "grad_norm": 2.078125, + "learning_rate": 1.2085628036501007e-07, + "loss": 0.123, + "mean_token_accuracy": 0.9726410353183746, + "num_tokens": 7903818883.0, + "step": 74650 + }, + { + "entropy": 1.19265625, + "epoch": 1.8289995592772146, + "grad_norm": 3.21875, + "learning_rate": 1.1915588024315194e-07, + "loss": 0.1278, + "mean_token_accuracy": 0.9702788054943084, + "num_tokens": 7908897679.0, + "step": 74700 + }, + { + "entropy": 1.20984375, + "epoch": 1.83022378923657, + "grad_norm": 3.15625, + "learning_rate": 1.1746728460823508e-07, + "loss": 0.1303, + "mean_token_accuracy": 0.9711257565021515, + "num_tokens": 7914006448.0, + "step": 74750 + }, + { + "entropy": 1.2140625, + "epoch": 1.8314480191959257, + "grad_norm": 1.9609375, + "learning_rate": 1.1579050037941275e-07, + "loss": 0.1362, + "mean_token_accuracy": 0.969500253200531, + "num_tokens": 7919510157.0, + "step": 74800 + }, + { + "entropy": 1.21421875, + "epoch": 1.8326722491552814, + "grad_norm": 2.40625, + "learning_rate": 1.1412553442744255e-07, + "loss": 0.132, + "mean_token_accuracy": 0.970678209066391, + "num_tokens": 7924726404.0, + "step": 74850 + }, + { + "entropy": 1.1996875, + "epoch": 1.833896479114637, + "grad_norm": 2.703125, + "learning_rate": 1.1247239357465255e-07, + "loss": 0.13, + "mean_token_accuracy": 0.9713816094398499, + "num_tokens": 7929934384.0, + "step": 74900 + }, + { + "entropy": 1.18921875, + "epoch": 1.8351207090739925, + "grad_norm": 1.9921875, + "learning_rate": 1.1083108459491986e-07, + "loss": 0.1256, + "mean_token_accuracy": 0.9721748220920563, + "num_tokens": 7935196457.0, + "step": 74950 + }, + { + "entropy": 1.2003125, + "epoch": 1.836344939033348, + "grad_norm": 2.703125, + "learning_rate": 1.0920161421363773e-07, + "loss": 0.119, + "mean_token_accuracy": 0.9733594739437104, + "num_tokens": 7940201367.0, + "step": 75000 + }, + { + "entropy": 1.22375, + "epoch": 1.8375691689927036, + "grad_norm": 1.7265625, + "learning_rate": 1.0758398910768951e-07, + "loss": 0.1373, + "mean_token_accuracy": 0.9692693221569061, + "num_tokens": 7945635438.0, + "step": 75050 + }, + { + "entropy": 1.20890625, + "epoch": 1.8387933989520593, + "grad_norm": 1.546875, + "learning_rate": 1.0597821590542211e-07, + "loss": 0.1282, + "mean_token_accuracy": 0.9722434699535369, + "num_tokens": 7951091367.0, + "step": 75100 + }, + { + "entropy": 1.18828125, + "epoch": 1.8400176289114147, + "grad_norm": 0.004425048828125, + "learning_rate": 1.0438430118661924e-07, + "loss": 0.124, + "mean_token_accuracy": 0.9725795328617096, + "num_tokens": 7956255217.0, + "step": 75150 + }, + { + "entropy": 1.1903125, + "epoch": 1.8412418588707702, + "grad_norm": 1.921875, + "learning_rate": 1.0280225148247213e-07, + "loss": 0.1179, + "mean_token_accuracy": 0.9743827605247497, + "num_tokens": 7961236486.0, + "step": 75200 + }, + { + "entropy": 1.1996875, + "epoch": 1.8424660888301259, + "grad_norm": 1.640625, + "learning_rate": 1.0123207327555462e-07, + "loss": 0.1156, + "mean_token_accuracy": 0.9743783438205719, + "num_tokens": 7966324215.0, + "step": 75250 + }, + { + "entropy": 1.2090625, + "epoch": 1.8436903187894815, + "grad_norm": 1.71875, + "learning_rate": 9.967377299979708e-08, + "loss": 0.134, + "mean_token_accuracy": 0.9705902481079102, + "num_tokens": 7971817863.0, + "step": 75300 + }, + { + "entropy": 1.19578125, + "epoch": 1.844914548748837, + "grad_norm": 2.15625, + "learning_rate": 9.812735704045684e-08, + "loss": 0.1185, + "mean_token_accuracy": 0.9737985277175903, + "num_tokens": 7977008142.0, + "step": 75350 + }, + { + "entropy": 1.190625, + "epoch": 1.8461387787081924, + "grad_norm": 1.75, + "learning_rate": 9.65928317340975e-08, + "loss": 0.1201, + "mean_token_accuracy": 0.9731456315517426, + "num_tokens": 7982011592.0, + "step": 75400 + }, + { + "entropy": 1.20875, + "epoch": 1.847363008667548, + "grad_norm": 1.765625, + "learning_rate": 9.507020336855632e-08, + "loss": 0.1221, + "mean_token_accuracy": 0.9724456059932709, + "num_tokens": 7987367141.0, + "step": 75450 + }, + { + "entropy": 1.20234375, + "epoch": 1.8485872386269038, + "grad_norm": 1.625, + "learning_rate": 9.355947818292554e-08, + "loss": 0.1149, + "mean_token_accuracy": 0.9738513994216919, + "num_tokens": 7992500198.0, + "step": 75500 + }, + { + "entropy": 1.21625, + "epoch": 1.8498114685862592, + "grad_norm": 1.78125, + "learning_rate": 9.206066236751943e-08, + "loss": 0.1328, + "mean_token_accuracy": 0.9707795882225037, + "num_tokens": 7998217427.0, + "step": 75550 + }, + { + "entropy": 1.1975, + "epoch": 1.8510356985456147, + "grad_norm": 2.125, + "learning_rate": 9.057376206385559e-08, + "loss": 0.1175, + "mean_token_accuracy": 0.9741839158535004, + "num_tokens": 8003308568.0, + "step": 75600 + }, + { + "entropy": 1.1878125, + "epoch": 1.8522599285049703, + "grad_norm": 3.21875, + "learning_rate": 8.90987833646254e-08, + "loss": 0.1077, + "mean_token_accuracy": 0.9759363722801209, + "num_tokens": 8008259087.0, + "step": 75650 + }, + { + "entropy": 1.20125, + "epoch": 1.853484158464326, + "grad_norm": 2.109375, + "learning_rate": 8.763573231367062e-08, + "loss": 0.1256, + "mean_token_accuracy": 0.9727174258232116, + "num_tokens": 8013653351.0, + "step": 75700 + }, + { + "entropy": 1.20078125, + "epoch": 1.8547083884236815, + "grad_norm": 2.765625, + "learning_rate": 8.618461490595975e-08, + "loss": 0.1214, + "mean_token_accuracy": 0.9735188388824463, + "num_tokens": 8018956628.0, + "step": 75750 + }, + { + "entropy": 1.209375, + "epoch": 1.855932618383037, + "grad_norm": 2.84375, + "learning_rate": 8.474543708756044e-08, + "loss": 0.1225, + "mean_token_accuracy": 0.9721533727645874, + "num_tokens": 8024197226.0, + "step": 75800 + }, + { + "entropy": 1.19015625, + "epoch": 1.8571568483423926, + "grad_norm": 0.005462646484375, + "learning_rate": 8.33182047556178e-08, + "loss": 0.1076, + "mean_token_accuracy": 0.9760002064704895, + "num_tokens": 8029024717.0, + "step": 75850 + }, + { + "entropy": 1.1953125, + "epoch": 1.8583810783017483, + "grad_norm": 1.640625, + "learning_rate": 8.190292375832975e-08, + "loss": 0.1274, + "mean_token_accuracy": 0.971969587802887, + "num_tokens": 8034254868.0, + "step": 75900 + }, + { + "entropy": 1.20546875, + "epoch": 1.859605308261104, + "grad_norm": 2.78125, + "learning_rate": 8.049959989492239e-08, + "loss": 0.1248, + "mean_token_accuracy": 0.9728272747993469, + "num_tokens": 8039555218.0, + "step": 75950 + }, + { + "entropy": 1.21359375, + "epoch": 1.8608295382204594, + "grad_norm": 1.640625, + "learning_rate": 7.910823891562536e-08, + "loss": 0.131, + "mean_token_accuracy": 0.9710195803642273, + "num_tokens": 8044915571.0, + "step": 76000 + }, + { + "entropy": 1.19625, + "epoch": 1.8620537681798148, + "grad_norm": 1.6953125, + "learning_rate": 7.77288465216518e-08, + "loss": 0.1189, + "mean_token_accuracy": 0.9735661280155182, + "num_tokens": 8050222763.0, + "step": 76050 + }, + { + "entropy": 1.1953125, + "epoch": 1.8632779981391705, + "grad_norm": 2.375, + "learning_rate": 7.636142836517013e-08, + "loss": 0.1211, + "mean_token_accuracy": 0.9737051403522492, + "num_tokens": 8055473678.0, + "step": 76100 + }, + { + "entropy": 1.196875, + "epoch": 1.8645022280985262, + "grad_norm": 1.6796875, + "learning_rate": 7.500599004928565e-08, + "loss": 0.1122, + "mean_token_accuracy": 0.974678498506546, + "num_tokens": 8060311800.0, + "step": 76150 + }, + { + "entropy": 1.18984375, + "epoch": 1.8657264580578816, + "grad_norm": 2.5, + "learning_rate": 7.36625371280133e-08, + "loss": 0.1164, + "mean_token_accuracy": 0.9736955296993256, + "num_tokens": 8065567322.0, + "step": 76200 + }, + { + "entropy": 1.211875, + "epoch": 1.866950688017237, + "grad_norm": 2.109375, + "learning_rate": 7.233107510625858e-08, + "loss": 0.1262, + "mean_token_accuracy": 0.9716404461860657, + "num_tokens": 8070882224.0, + "step": 76250 + }, + { + "entropy": 1.20234375, + "epoch": 1.8681749179765927, + "grad_norm": 1.65625, + "learning_rate": 7.101160943979201e-08, + "loss": 0.1242, + "mean_token_accuracy": 0.9728803491592407, + "num_tokens": 8075963376.0, + "step": 76300 + }, + { + "entropy": 1.20921875, + "epoch": 1.8693991479359484, + "grad_norm": 1.625, + "learning_rate": 6.970414553522842e-08, + "loss": 0.1223, + "mean_token_accuracy": 0.9728834819793701, + "num_tokens": 8081448166.0, + "step": 76350 + }, + { + "entropy": 1.1978125, + "epoch": 1.8706233778953039, + "grad_norm": 2.78125, + "learning_rate": 6.840868875000561e-08, + "loss": 0.1146, + "mean_token_accuracy": 0.9747687363624573, + "num_tokens": 8086285902.0, + "step": 76400 + }, + { + "entropy": 1.200625, + "epoch": 1.8718476078546593, + "grad_norm": 2.765625, + "learning_rate": 6.712524439235978e-08, + "loss": 0.1171, + "mean_token_accuracy": 0.9743122577667236, + "num_tokens": 8091436927.0, + "step": 76450 + }, + { + "entropy": 1.211875, + "epoch": 1.873071837814015, + "grad_norm": 2.078125, + "learning_rate": 6.585381772130584e-08, + "loss": 0.1327, + "mean_token_accuracy": 0.9712537932395935, + "num_tokens": 8097048708.0, + "step": 76500 + }, + { + "entropy": 1.2128125, + "epoch": 1.8742960677733707, + "grad_norm": 2.703125, + "learning_rate": 6.459441394661536e-08, + "loss": 0.1342, + "mean_token_accuracy": 0.9702994549274444, + "num_tokens": 8102302631.0, + "step": 76550 + }, + { + "entropy": 1.20875, + "epoch": 1.875520297732726, + "grad_norm": 1.7890625, + "learning_rate": 6.334703822879506e-08, + "loss": 0.1337, + "mean_token_accuracy": 0.970585721731186, + "num_tokens": 8107702374.0, + "step": 76600 + }, + { + "entropy": 1.208125, + "epoch": 1.8767445276920816, + "grad_norm": 2.359375, + "learning_rate": 6.211169567906572e-08, + "loss": 0.1419, + "mean_token_accuracy": 0.9687972629070282, + "num_tokens": 8113119431.0, + "step": 76650 + }, + { + "entropy": 1.20546875, + "epoch": 1.8779687576514372, + "grad_norm": 3.0625, + "learning_rate": 6.08883913593412e-08, + "loss": 0.1354, + "mean_token_accuracy": 0.9701398539543152, + "num_tokens": 8118309412.0, + "step": 76700 + }, + { + "entropy": 1.19796875, + "epoch": 1.879192987610793, + "grad_norm": 2.546875, + "learning_rate": 5.967713028220756e-08, + "loss": 0.1334, + "mean_token_accuracy": 0.9713104116916657, + "num_tokens": 8123346693.0, + "step": 76750 + }, + { + "entropy": 1.2065625, + "epoch": 1.8804172175701483, + "grad_norm": 2.46875, + "learning_rate": 5.8477917410903914e-08, + "loss": 0.1449, + "mean_token_accuracy": 0.968209480047226, + "num_tokens": 8128745782.0, + "step": 76800 + }, + { + "entropy": 1.19703125, + "epoch": 1.8816414475295038, + "grad_norm": 2.78125, + "learning_rate": 5.729075765929925e-08, + "loss": 0.1602, + "mean_token_accuracy": 0.9653090810775757, + "num_tokens": 8133734566.0, + "step": 76850 + }, + { + "entropy": 1.2078125, + "epoch": 1.8828656774888595, + "grad_norm": 3.046875, + "learning_rate": 5.61156558918744e-08, + "loss": 0.1748, + "mean_token_accuracy": 0.9636254405975342, + "num_tokens": 8139112182.0, + "step": 76900 + }, + { + "entropy": 1.19765625, + "epoch": 1.8840899074482151, + "grad_norm": 3.125, + "learning_rate": 5.4952616923703014e-08, + "loss": 0.1508, + "mean_token_accuracy": 0.9667049193382263, + "num_tokens": 8144120297.0, + "step": 76950 + }, + { + "entropy": 1.20921875, + "epoch": 1.8853141374075706, + "grad_norm": 2.8125, + "learning_rate": 5.380164552042832e-08, + "loss": 0.1581, + "mean_token_accuracy": 0.9663659358024597, + "num_tokens": 8149360110.0, + "step": 77000 + }, + { + "entropy": 1.2215625, + "epoch": 1.886538367366926, + "grad_norm": 2.046875, + "learning_rate": 5.266274639824742e-08, + "loss": 0.1807, + "mean_token_accuracy": 0.9613511979579925, + "num_tokens": 8154930968.0, + "step": 77050 + }, + { + "entropy": 1.1940625, + "epoch": 1.8877625973262817, + "grad_norm": 3.390625, + "learning_rate": 5.1535924223889305e-08, + "loss": 0.1593, + "mean_token_accuracy": 0.9654444575309753, + "num_tokens": 8159971112.0, + "step": 77100 + }, + { + "entropy": 1.2128125, + "epoch": 1.8889868272856374, + "grad_norm": 3.328125, + "learning_rate": 5.042118361459724e-08, + "loss": 0.1693, + "mean_token_accuracy": 0.964167617559433, + "num_tokens": 8165136464.0, + "step": 77150 + }, + { + "entropy": 1.20234375, + "epoch": 1.890211057244993, + "grad_norm": 2.84375, + "learning_rate": 4.931852913810875e-08, + "loss": 0.1597, + "mean_token_accuracy": 0.9660988628864289, + "num_tokens": 8170440548.0, + "step": 77200 + }, + { + "entropy": 1.2046875, + "epoch": 1.8914352872043485, + "grad_norm": 2.71875, + "learning_rate": 4.822796531263862e-08, + "loss": 0.163, + "mean_token_accuracy": 0.9647459161281585, + "num_tokens": 8175965156.0, + "step": 77250 + }, + { + "entropy": 1.21484375, + "epoch": 1.892659517163704, + "grad_norm": 3.09375, + "learning_rate": 4.7149496606857966e-08, + "loss": 0.1777, + "mean_token_accuracy": 0.9630069530010223, + "num_tokens": 8181436041.0, + "step": 77300 + }, + { + "entropy": 1.20734375, + "epoch": 1.8938837471230596, + "grad_norm": 3.359375, + "learning_rate": 4.608312743987819e-08, + "loss": 0.1646, + "mean_token_accuracy": 0.9651682090759277, + "num_tokens": 8186577107.0, + "step": 77350 + }, + { + "entropy": 1.2134375, + "epoch": 1.8951079770824153, + "grad_norm": 4.21875, + "learning_rate": 4.50288621812307e-08, + "loss": 0.1701, + "mean_token_accuracy": 0.9638711404800415, + "num_tokens": 8191908989.0, + "step": 77400 + }, + { + "entropy": 1.1978125, + "epoch": 1.8963322070417707, + "grad_norm": 2.921875, + "learning_rate": 4.398670515085157e-08, + "loss": 0.1672, + "mean_token_accuracy": 0.964127391576767, + "num_tokens": 8197252149.0, + "step": 77450 + }, + { + "entropy": 1.2015625, + "epoch": 1.8975564370011262, + "grad_norm": 2.75, + "learning_rate": 4.295666061906156e-08, + "loss": 0.1741, + "mean_token_accuracy": 0.9626425766944885, + "num_tokens": 8202870180.0, + "step": 77500 + }, + { + "entropy": 1.20109375, + "epoch": 1.8987806669604819, + "grad_norm": 4.0625, + "learning_rate": 4.193873280654914e-08, + "loss": 0.1645, + "mean_token_accuracy": 0.964863383769989, + "num_tokens": 8208065173.0, + "step": 77550 + }, + { + "entropy": 1.20234375, + "epoch": 1.9000048969198375, + "grad_norm": 2.28125, + "learning_rate": 4.093292588435549e-08, + "loss": 0.1605, + "mean_token_accuracy": 0.965006741285324, + "num_tokens": 8213242226.0, + "step": 77600 + }, + { + "entropy": 1.20734375, + "epoch": 1.901229126879193, + "grad_norm": 2.0, + "learning_rate": 3.993924397385251e-08, + "loss": 0.1693, + "mean_token_accuracy": 0.9635647284984589, + "num_tokens": 8218628064.0, + "step": 77650 + }, + { + "entropy": 1.21203125, + "epoch": 1.9024533568385484, + "grad_norm": 3.09375, + "learning_rate": 3.895769114673187e-08, + "loss": 0.1657, + "mean_token_accuracy": 0.9649439096450806, + "num_tokens": 8223851321.0, + "step": 77700 + }, + { + "entropy": 1.18859375, + "epoch": 1.903677586797904, + "grad_norm": 1.8203125, + "learning_rate": 3.798827142498329e-08, + "loss": 0.1508, + "mean_token_accuracy": 0.9679539859294891, + "num_tokens": 8228778299.0, + "step": 77750 + }, + { + "entropy": 1.20296875, + "epoch": 1.9049018167572598, + "grad_norm": 3.0625, + "learning_rate": 3.7030988780880957e-08, + "loss": 0.1541, + "mean_token_accuracy": 0.966580958366394, + "num_tokens": 8233727662.0, + "step": 77800 + }, + { + "entropy": 1.21453125, + "epoch": 1.9061260467166152, + "grad_norm": 2.046875, + "learning_rate": 3.6085847136966164e-08, + "loss": 0.1622, + "mean_token_accuracy": 0.9650613677501678, + "num_tokens": 8239365249.0, + "step": 77850 + }, + { + "entropy": 1.22, + "epoch": 1.9073502766759707, + "grad_norm": 2.546875, + "learning_rate": 3.515285036603233e-08, + "loss": 0.1736, + "mean_token_accuracy": 0.9626342761516571, + "num_tokens": 8244922468.0, + "step": 77900 + }, + { + "entropy": 1.21125, + "epoch": 1.9085745066353264, + "grad_norm": 2.65625, + "learning_rate": 3.423200229110701e-08, + "loss": 0.1665, + "mean_token_accuracy": 0.9643622922897339, + "num_tokens": 8250033392.0, + "step": 77950 + }, + { + "entropy": 1.20125, + "epoch": 1.909798736594682, + "grad_norm": 3.546875, + "learning_rate": 3.3323306685437926e-08, + "loss": 0.1587, + "mean_token_accuracy": 0.9665237700939179, + "num_tokens": 8255293579.0, + "step": 78000 + }, + { + "entropy": 1.189375, + "epoch": 1.9110229665540375, + "grad_norm": 3.296875, + "learning_rate": 3.242676727247795e-08, + "loss": 0.146, + "mean_token_accuracy": 0.9674337708950043, + "num_tokens": 8260317228.0, + "step": 78050 + }, + { + "entropy": 1.2103125, + "epoch": 1.912247196513393, + "grad_norm": 4.0, + "learning_rate": 3.1542387725868146e-08, + "loss": 0.1651, + "mean_token_accuracy": 0.9643155598640442, + "num_tokens": 8265716396.0, + "step": 78100 + }, + { + "entropy": 1.20078125, + "epoch": 1.9134714264727486, + "grad_norm": 2.453125, + "learning_rate": 3.0670171669423764e-08, + "loss": 0.1625, + "mean_token_accuracy": 0.9650612294673919, + "num_tokens": 8270999547.0, + "step": 78150 + }, + { + "entropy": 1.2115625, + "epoch": 1.9146956564321043, + "grad_norm": 2.421875, + "learning_rate": 2.981012267711858e-08, + "loss": 0.1725, + "mean_token_accuracy": 0.9635538387298584, + "num_tokens": 8276439622.0, + "step": 78200 + }, + { + "entropy": 1.203125, + "epoch": 1.91591988639146, + "grad_norm": 3.5625, + "learning_rate": 2.896224427307226e-08, + "loss": 0.1649, + "mean_token_accuracy": 0.9643189585208893, + "num_tokens": 8281629841.0, + "step": 78250 + }, + { + "entropy": 1.20921875, + "epoch": 1.9171441163508154, + "grad_norm": 3.5, + "learning_rate": 2.8126539931533023e-08, + "loss": 0.1601, + "mean_token_accuracy": 0.9657320499420166, + "num_tokens": 8286850296.0, + "step": 78300 + }, + { + "entropy": 1.2075, + "epoch": 1.9183683463101708, + "grad_norm": 3.078125, + "learning_rate": 2.7303013076866335e-08, + "loss": 0.1675, + "mean_token_accuracy": 0.964200325012207, + "num_tokens": 8292528304.0, + "step": 78350 + }, + { + "entropy": 1.21671875, + "epoch": 1.9195925762695265, + "grad_norm": 4.125, + "learning_rate": 2.6491667083537896e-08, + "loss": 0.1674, + "mean_token_accuracy": 0.9635697185993195, + "num_tokens": 8297851717.0, + "step": 78400 + }, + { + "entropy": 1.203125, + "epoch": 1.9208168062288822, + "grad_norm": 3.5625, + "learning_rate": 2.5692505276102673e-08, + "loss": 0.1639, + "mean_token_accuracy": 0.9647056591510773, + "num_tokens": 8302822545.0, + "step": 78450 + }, + { + "entropy": 1.20234375, + "epoch": 1.9220410361882376, + "grad_norm": 4.125, + "learning_rate": 2.490553092918957e-08, + "loss": 0.167, + "mean_token_accuracy": 0.9645107495784759, + "num_tokens": 8308044186.0, + "step": 78500 + }, + { + "entropy": 1.20390625, + "epoch": 1.923265266147593, + "grad_norm": 4.1875, + "learning_rate": 2.4130747267488096e-08, + "loss": 0.1587, + "mean_token_accuracy": 0.9651757764816284, + "num_tokens": 8313261711.0, + "step": 78550 + }, + { + "entropy": 1.20625, + "epoch": 1.9244894961069487, + "grad_norm": 2.390625, + "learning_rate": 2.3368157465735727e-08, + "loss": 0.1729, + "mean_token_accuracy": 0.9643122732639313, + "num_tokens": 8318954245.0, + "step": 78600 + }, + { + "entropy": 1.21640625, + "epoch": 1.9257137260663044, + "grad_norm": 5.46875, + "learning_rate": 2.261776464870424e-08, + "loss": 0.1712, + "mean_token_accuracy": 0.9633339118957519, + "num_tokens": 8324544756.0, + "step": 78650 + }, + { + "entropy": 1.21515625, + "epoch": 1.9269379560256599, + "grad_norm": 2.875, + "learning_rate": 2.1879571891188054e-08, + "loss": 0.1751, + "mean_token_accuracy": 0.9626336395740509, + "num_tokens": 8329948691.0, + "step": 78700 + }, + { + "entropy": 1.20515625, + "epoch": 1.9281621859850153, + "grad_norm": 2.71875, + "learning_rate": 2.1153582217990574e-08, + "loss": 0.1655, + "mean_token_accuracy": 0.964772834777832, + "num_tokens": 8335173517.0, + "step": 78750 + }, + { + "entropy": 1.2015625, + "epoch": 1.929386415944371, + "grad_norm": 2.796875, + "learning_rate": 2.043979860391154e-08, + "loss": 0.1711, + "mean_token_accuracy": 0.9635234928131103, + "num_tokens": 8340379735.0, + "step": 78800 + }, + { + "entropy": 1.1909375, + "epoch": 1.9306106459037267, + "grad_norm": 3.703125, + "learning_rate": 1.9738223973735702e-08, + "loss": 0.1559, + "mean_token_accuracy": 0.9672637641429901, + "num_tokens": 8345381104.0, + "step": 78850 + }, + { + "entropy": 1.21375, + "epoch": 1.9318348758630821, + "grad_norm": 2.484375, + "learning_rate": 1.9048861202221823e-08, + "loss": 0.1681, + "mean_token_accuracy": 0.9651576709747315, + "num_tokens": 8350559447.0, + "step": 78900 + }, + { + "entropy": 1.21234375, + "epoch": 1.9330591058224376, + "grad_norm": 3.9375, + "learning_rate": 1.8371713114086697e-08, + "loss": 0.1652, + "mean_token_accuracy": 0.9637591278553009, + "num_tokens": 8355928028.0, + "step": 78950 + }, + { + "entropy": 1.20640625, + "epoch": 1.9342833357817932, + "grad_norm": 0.4453125, + "learning_rate": 1.770678248399982e-08, + "loss": 0.1621, + "mean_token_accuracy": 0.9652046132087707, + "num_tokens": 8361366979.0, + "step": 79000 + }, + { + "entropy": 1.20453125, + "epoch": 1.935507565741149, + "grad_norm": 4.1875, + "learning_rate": 1.7054072036566394e-08, + "loss": 0.1685, + "mean_token_accuracy": 0.9641025936603547, + "num_tokens": 8366288409.0, + "step": 79050 + }, + { + "entropy": 1.2125, + "epoch": 1.9367317957005044, + "grad_norm": 2.234375, + "learning_rate": 1.6413584446319018e-08, + "loss": 0.1632, + "mean_token_accuracy": 0.9653700625896454, + "num_tokens": 8371880970.0, + "step": 79100 + }, + { + "entropy": 1.20578125, + "epoch": 1.9379560256598598, + "grad_norm": 2.609375, + "learning_rate": 1.5785322337706688e-08, + "loss": 0.164, + "mean_token_accuracy": 0.9650757694244385, + "num_tokens": 8377110509.0, + "step": 79150 + }, + { + "entropy": 1.20046875, + "epoch": 1.9391802556192155, + "grad_norm": 2.140625, + "learning_rate": 1.5169288285082793e-08, + "loss": 0.1631, + "mean_token_accuracy": 0.9651304471492768, + "num_tokens": 8382268459.0, + "step": 79200 + }, + { + "entropy": 1.2075, + "epoch": 1.9404044855785711, + "grad_norm": 3.40625, + "learning_rate": 1.4565484812696151e-08, + "loss": 0.155, + "mean_token_accuracy": 0.9661095356941223, + "num_tokens": 8387474552.0, + "step": 79250 + }, + { + "entropy": 1.1728125, + "epoch": 1.9416287155379266, + "grad_norm": 2.578125, + "learning_rate": 1.3973914394678655e-08, + "loss": 0.1379, + "mean_token_accuracy": 0.9702671027183533, + "num_tokens": 8392280218.0, + "step": 79300 + }, + { + "entropy": 1.21859375, + "epoch": 1.942852945497282, + "grad_norm": 3.828125, + "learning_rate": 1.3394579455037637e-08, + "loss": 0.1586, + "mean_token_accuracy": 0.9652379488945008, + "num_tokens": 8397377045.0, + "step": 79350 + }, + { + "entropy": 1.20515625, + "epoch": 1.9440771754566377, + "grad_norm": 2.453125, + "learning_rate": 1.2827482367643862e-08, + "loss": 0.1537, + "mean_token_accuracy": 0.9671237909793854, + "num_tokens": 8402675630.0, + "step": 79400 + }, + { + "entropy": 1.18984375, + "epoch": 1.9453014054159934, + "grad_norm": 2.703125, + "learning_rate": 1.2272625456221875e-08, + "loss": 0.1511, + "mean_token_accuracy": 0.9674056422710419, + "num_tokens": 8407470922.0, + "step": 79450 + }, + { + "entropy": 1.22015625, + "epoch": 1.946525635375349, + "grad_norm": 3.078125, + "learning_rate": 1.1730010994342344e-08, + "loss": 0.1683, + "mean_token_accuracy": 0.963656575679779, + "num_tokens": 8413030681.0, + "step": 79500 + }, + { + "entropy": 1.2075, + "epoch": 1.9477498653347045, + "grad_norm": 3.703125, + "learning_rate": 1.1199641205410727e-08, + "loss": 0.1676, + "mean_token_accuracy": 0.9641730666160584, + "num_tokens": 8418435608.0, + "step": 79550 + }, + { + "entropy": 1.20109375, + "epoch": 1.94897409529406, + "grad_norm": 2.609375, + "learning_rate": 1.0681518262659618e-08, + "loss": 0.1612, + "mean_token_accuracy": 0.9652375304698944, + "num_tokens": 8423410030.0, + "step": 79600 + }, + { + "entropy": 1.19328125, + "epoch": 1.9501983252534156, + "grad_norm": 4.3125, + "learning_rate": 1.0175644289138419e-08, + "loss": 0.1565, + "mean_token_accuracy": 0.9664306437969208, + "num_tokens": 8428505734.0, + "step": 79650 + }, + { + "entropy": 1.19609375, + "epoch": 1.9514225552127713, + "grad_norm": 3.15625, + "learning_rate": 9.682021357706018e-09, + "loss": 0.1491, + "mean_token_accuracy": 0.968139351606369, + "num_tokens": 8433821851.0, + "step": 79700 + }, + { + "entropy": 1.19390625, + "epoch": 1.9526467851721268, + "grad_norm": 3.53125, + "learning_rate": 9.20065149102145e-09, + "loss": 0.1566, + "mean_token_accuracy": 0.9661858582496643, + "num_tokens": 8438988974.0, + "step": 79750 + }, + { + "entropy": 1.20546875, + "epoch": 1.9538710151314822, + "grad_norm": 2.5625, + "learning_rate": 8.731536661535588e-09, + "loss": 0.1691, + "mean_token_accuracy": 0.9629131543636322, + "num_tokens": 8444297546.0, + "step": 79800 + }, + { + "entropy": 1.20671875, + "epoch": 1.9550952450908379, + "grad_norm": 2.640625, + "learning_rate": 8.274678791484136e-09, + "loss": 0.1603, + "mean_token_accuracy": 0.9652000117301941, + "num_tokens": 8449852335.0, + "step": 79850 + }, + { + "entropy": 1.18640625, + "epoch": 1.9563194750501935, + "grad_norm": 3.359375, + "learning_rate": 7.830079752877973e-09, + "loss": 0.1394, + "mean_token_accuracy": 0.9697746348381042, + "num_tokens": 8454775071.0, + "step": 79900 + }, + { + "entropy": 1.2021875, + "epoch": 1.957543705009549, + "grad_norm": 2.734375, + "learning_rate": 7.397741367497157e-09, + "loss": 0.1613, + "mean_token_accuracy": 0.9663744091987609, + "num_tokens": 8460122393.0, + "step": 79950 + }, + { + "entropy": 1.21125, + "epoch": 1.9587679349689044, + "grad_norm": 3.125, + "learning_rate": 6.977665406882272e-09, + "loss": 0.1689, + "mean_token_accuracy": 0.9630868649482727, + "num_tokens": 8465752957.0, + "step": 80000 + }, + { + "epoch": 1.9587679349689044, + "eval_entropy": 1.2009765625, + "eval_loss": 0.17771507799625397, + "eval_mean_token_accuracy": 0.9620104561249415, + "eval_num_tokens": 8465752957.0, + "eval_runtime": 611.9165, + "eval_samples_per_second": 15.78, + "eval_steps_per_second": 0.198, + "step": 80000 + }, + { + "entropy": 1.21203125, + "epoch": 1.9599921649282601, + "grad_norm": 3.421875, + "learning_rate": 6.569853592327757e-09, + "loss": 0.1792, + "mean_token_accuracy": 0.9620552754402161, + "num_tokens": 8471172063.0, + "step": 80050 + }, + { + "entropy": 1.200625, + "epoch": 1.9612163948876158, + "grad_norm": 3.28125, + "learning_rate": 6.174307594874917e-09, + "loss": 0.1558, + "mean_token_accuracy": 0.9663512742519379, + "num_tokens": 8476017366.0, + "step": 80100 + }, + { + "entropy": 1.195, + "epoch": 1.9624406248469712, + "grad_norm": 3.65625, + "learning_rate": 5.7910290353049285e-09, + "loss": 0.1529, + "mean_token_accuracy": 0.9670116317272186, + "num_tokens": 8481042255.0, + "step": 80150 + }, + { + "entropy": 1.18890625, + "epoch": 1.9636648548063267, + "grad_norm": 2.0, + "learning_rate": 5.420019484131844e-09, + "loss": 0.1608, + "mean_token_accuracy": 0.9656985890865326, + "num_tokens": 8486092212.0, + "step": 80200 + }, + { + "entropy": 1.209375, + "epoch": 1.9648890847656824, + "grad_norm": 2.203125, + "learning_rate": 5.061280461596929e-09, + "loss": 0.1747, + "mean_token_accuracy": 0.962527574300766, + "num_tokens": 8491676835.0, + "step": 80250 + }, + { + "entropy": 1.20671875, + "epoch": 1.966113314725038, + "grad_norm": 3.21875, + "learning_rate": 4.714813437661336e-09, + "loss": 0.1636, + "mean_token_accuracy": 0.9649911904335022, + "num_tokens": 8497110970.0, + "step": 80300 + }, + { + "entropy": 1.2090625, + "epoch": 1.9673375446843935, + "grad_norm": 3.078125, + "learning_rate": 4.380619832001775e-09, + "loss": 0.1698, + "mean_token_accuracy": 0.9634296333789826, + "num_tokens": 8502537441.0, + "step": 80350 + }, + { + "entropy": 1.20875, + "epoch": 1.968561774643749, + "grad_norm": 4.9375, + "learning_rate": 4.058701014002187e-09, + "loss": 0.1637, + "mean_token_accuracy": 0.9648308408260345, + "num_tokens": 8507630732.0, + "step": 80400 + }, + { + "entropy": 1.20390625, + "epoch": 1.9697860046031046, + "grad_norm": 2.15625, + "learning_rate": 3.749058302751074e-09, + "loss": 0.1531, + "mean_token_accuracy": 0.9671729254722595, + "num_tokens": 8512848795.0, + "step": 80450 + }, + { + "entropy": 1.2090625, + "epoch": 1.9710102345624603, + "grad_norm": 3.984375, + "learning_rate": 3.451692967033848e-09, + "loss": 0.1643, + "mean_token_accuracy": 0.9643464314937592, + "num_tokens": 8518368412.0, + "step": 80500 + }, + { + "entropy": 1.20875, + "epoch": 1.972234464521816, + "grad_norm": 3.203125, + "learning_rate": 3.1666062253284942e-09, + "loss": 0.1677, + "mean_token_accuracy": 0.9644135737419128, + "num_tokens": 8523653890.0, + "step": 80550 + }, + { + "entropy": 1.2159375, + "epoch": 1.9734586944811714, + "grad_norm": 3.875, + "learning_rate": 2.893799245800244e-09, + "loss": 0.166, + "mean_token_accuracy": 0.9650824117660523, + "num_tokens": 8529015592.0, + "step": 80600 + }, + { + "entropy": 1.2, + "epoch": 1.9746829244405268, + "grad_norm": 2.46875, + "learning_rate": 2.633273146297577e-09, + "loss": 0.1591, + "mean_token_accuracy": 0.9664743864536285, + "num_tokens": 8534375896.0, + "step": 80650 + }, + { + "entropy": 1.21421875, + "epoch": 1.9759071543998825, + "grad_norm": 2.484375, + "learning_rate": 2.385028994346894e-09, + "loss": 0.1685, + "mean_token_accuracy": 0.9637958765029907, + "num_tokens": 8539783916.0, + "step": 80700 + }, + { + "entropy": 1.19375, + "epoch": 1.9771313843592382, + "grad_norm": 2.34375, + "learning_rate": 2.149067807147853e-09, + "loss": 0.1589, + "mean_token_accuracy": 0.9659115636348724, + "num_tokens": 8544696959.0, + "step": 80750 + }, + { + "entropy": 1.21125, + "epoch": 1.9783556143185936, + "grad_norm": 2.25, + "learning_rate": 1.925390551570705e-09, + "loss": 0.1649, + "mean_token_accuracy": 0.9644637072086334, + "num_tokens": 8550159905.0, + "step": 80800 + }, + { + "entropy": 1.19796875, + "epoch": 1.979579844277949, + "grad_norm": 1.8828125, + "learning_rate": 1.7139981441502973e-09, + "loss": 0.1526, + "mean_token_accuracy": 0.9669871032238007, + "num_tokens": 8555193317.0, + "step": 80850 + }, + { + "entropy": 1.1953125, + "epoch": 1.9808040742373048, + "grad_norm": 2.828125, + "learning_rate": 1.514891451083744e-09, + "loss": 0.1676, + "mean_token_accuracy": 0.9646944868564605, + "num_tokens": 8560635630.0, + "step": 80900 + }, + { + "entropy": 1.21921875, + "epoch": 1.9820283041966604, + "grad_norm": 2.90625, + "learning_rate": 1.328071288226762e-09, + "loss": 0.1694, + "mean_token_accuracy": 0.9641423618793488, + "num_tokens": 8566246965.0, + "step": 80950 + }, + { + "entropy": 1.1896875, + "epoch": 1.9832525341560159, + "grad_norm": 1.9453125, + "learning_rate": 1.1535384210893395e-09, + "loss": 0.1436, + "mean_token_accuracy": 0.9696673655509949, + "num_tokens": 8571430149.0, + "step": 81000 + }, + { + "entropy": 1.2009375, + "epoch": 1.9844767641153713, + "grad_norm": 2.015625, + "learning_rate": 9.912935648344057e-10, + "loss": 0.1667, + "mean_token_accuracy": 0.9643544840812683, + "num_tokens": 8576922262.0, + "step": 81050 + }, + { + "entropy": 1.208125, + "epoch": 1.985700994074727, + "grad_norm": 2.71875, + "learning_rate": 8.413373842721672e-10, + "loss": 0.1569, + "mean_token_accuracy": 0.9652732384204864, + "num_tokens": 8582076472.0, + "step": 81100 + }, + { + "entropy": 1.20875, + "epoch": 1.9869252240340827, + "grad_norm": 2.671875, + "learning_rate": 7.036704938611083e-10, + "loss": 0.1691, + "mean_token_accuracy": 0.9644241857528687, + "num_tokens": 8587501762.0, + "step": 81150 + }, + { + "entropy": 1.17515625, + "epoch": 1.9881494539934381, + "grad_norm": 3.0, + "learning_rate": 5.782934577009957e-10, + "loss": 0.1391, + "mean_token_accuracy": 0.9696795284748078, + "num_tokens": 8592212871.0, + "step": 81200 + }, + { + "entropy": 1.195, + "epoch": 1.9893736839527936, + "grad_norm": 2.28125, + "learning_rate": 4.652067895352108e-10, + "loss": 0.1522, + "mean_token_accuracy": 0.9666969799995422, + "num_tokens": 8597378423.0, + "step": 81250 + }, + { + "entropy": 1.198125, + "epoch": 1.9905979139121492, + "grad_norm": 2.828125, + "learning_rate": 3.644109527447537e-10, + "loss": 0.1695, + "mean_token_accuracy": 0.9640089082717895, + "num_tokens": 8602514679.0, + "step": 81300 + }, + { + "entropy": 1.20984375, + "epoch": 1.991822143871505, + "grad_norm": 4.5, + "learning_rate": 2.7590636034857675e-10, + "loss": 0.1634, + "mean_token_accuracy": 0.964598093032837, + "num_tokens": 8607950288.0, + "step": 81350 + }, + { + "entropy": 1.21296875, + "epoch": 1.9930463738308604, + "grad_norm": 3.625, + "learning_rate": 1.9969337500125308e-10, + "loss": 0.166, + "mean_token_accuracy": 0.9647457122802734, + "num_tokens": 8613281748.0, + "step": 81400 + }, + { + "entropy": 1.21078125, + "epoch": 1.9942706037902158, + "grad_norm": 1.78125, + "learning_rate": 1.3577230899197712e-10, + "loss": 0.1541, + "mean_token_accuracy": 0.9666912174224853, + "num_tokens": 8618635116.0, + "step": 81450 + }, + { + "entropy": 1.18859375, + "epoch": 1.9954948337495715, + "grad_norm": 3.09375, + "learning_rate": 8.414342424156729e-11, + "loss": 0.149, + "mean_token_accuracy": 0.9679227757453919, + "num_tokens": 8623674169.0, + "step": 81500 + }, + { + "entropy": 1.18640625, + "epoch": 1.9967190637089272, + "grad_norm": 2.75, + "learning_rate": 4.48069323044642e-11, + "loss": 0.1535, + "mean_token_accuracy": 0.966040461063385, + "num_tokens": 8628848521.0, + "step": 81550 + }, + { + "entropy": 1.1934375, + "epoch": 1.9979432936682826, + "grad_norm": 3.5625, + "learning_rate": 1.776299436406781e-11, + "loss": 0.1668, + "mean_token_accuracy": 0.9646425199508667, + "num_tokens": 8634212914.0, + "step": 81600 + }, + { + "entropy": 1.1825, + "epoch": 1.999167523627638, + "grad_norm": 2.65625, + "learning_rate": 3.0117212357350098e-12, + "loss": 0.147, + "mean_token_accuracy": 0.9685306799411774, + "num_tokens": 8639315101.0, + "step": 81650 + } + ], + "logging_steps": 50, + "max_steps": 81684, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 9.811857454193967e+19, + "train_batch_size": 12, + "trial_name": null, + "trial_params": null +}