ARI-8B / trainer_state.json
EvelynKimmy's picture
Initial model upload
939033c verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 10000,
"global_step": 81684,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.13171875,
"epoch": 0.0012242299593555653,
"grad_norm": 83.0,
"learning_rate": 7.197062423500612e-08,
"loss": 1.0766,
"mean_token_accuracy": 0.8785432744026184,
"num_tokens": 5226465.0,
"step": 50
},
{
"entropy": 1.1265625,
"epoch": 0.0024484599187111307,
"grad_norm": 78.0,
"learning_rate": 1.4541003671970627e-07,
"loss": 1.0658,
"mean_token_accuracy": 0.8774244856834411,
"num_tokens": 10264339.0,
"step": 100
},
{
"entropy": 1.12125,
"epoch": 0.003672689878066696,
"grad_norm": 65.5,
"learning_rate": 2.1884944920440638e-07,
"loss": 1.0161,
"mean_token_accuracy": 0.8803484535217285,
"num_tokens": 15456221.0,
"step": 150
},
{
"entropy": 1.12859375,
"epoch": 0.004896919837422261,
"grad_norm": 73.0,
"learning_rate": 2.922888616891065e-07,
"loss": 0.9905,
"mean_token_accuracy": 0.8795530760288238,
"num_tokens": 20433188.0,
"step": 200
},
{
"entropy": 1.14265625,
"epoch": 0.006121149796777827,
"grad_norm": 69.0,
"learning_rate": 3.6572827417380663e-07,
"loss": 0.8835,
"mean_token_accuracy": 0.884142210483551,
"num_tokens": 25654586.0,
"step": 250
},
{
"entropy": 1.1475,
"epoch": 0.007345379756133392,
"grad_norm": 60.0,
"learning_rate": 4.391676866585067e-07,
"loss": 0.7555,
"mean_token_accuracy": 0.8876442670822143,
"num_tokens": 30682210.0,
"step": 300
},
{
"entropy": 1.14234375,
"epoch": 0.008569609715488957,
"grad_norm": 30.0,
"learning_rate": 5.126070991432069e-07,
"loss": 0.691,
"mean_token_accuracy": 0.8891012752056122,
"num_tokens": 36107614.0,
"step": 350
},
{
"entropy": 1.1603125,
"epoch": 0.009793839674844523,
"grad_norm": 15.6875,
"learning_rate": 5.860465116279069e-07,
"loss": 0.5872,
"mean_token_accuracy": 0.9278727066516876,
"num_tokens": 41528585.0,
"step": 400
},
{
"entropy": 1.20984375,
"epoch": 0.011018069634200088,
"grad_norm": 10.375,
"learning_rate": 6.594859241126071e-07,
"loss": 0.5128,
"mean_token_accuracy": 0.9328850126266479,
"num_tokens": 47205376.0,
"step": 450
},
{
"entropy": 1.23328125,
"epoch": 0.012242299593555654,
"grad_norm": 8.875,
"learning_rate": 7.329253365973072e-07,
"loss": 0.464,
"mean_token_accuracy": 0.9372936522960663,
"num_tokens": 52484312.0,
"step": 500
},
{
"entropy": 1.2515625,
"epoch": 0.013466529552911218,
"grad_norm": 10.375,
"learning_rate": 8.063647490820073e-07,
"loss": 0.4469,
"mean_token_accuracy": 0.9350792992115021,
"num_tokens": 57610761.0,
"step": 550
},
{
"entropy": 1.28046875,
"epoch": 0.014690759512266784,
"grad_norm": 7.8125,
"learning_rate": 8.798041615667075e-07,
"loss": 0.4233,
"mean_token_accuracy": 0.9361693727970123,
"num_tokens": 62744047.0,
"step": 600
},
{
"entropy": 1.30703125,
"epoch": 0.01591498947162235,
"grad_norm": 11.375,
"learning_rate": 9.532435740514075e-07,
"loss": 0.4228,
"mean_token_accuracy": 0.9353253149986267,
"num_tokens": 68113654.0,
"step": 650
},
{
"entropy": 1.3890625,
"epoch": 0.017139219430977914,
"grad_norm": 4.96875,
"learning_rate": 1.0266829865361079e-06,
"loss": 0.3897,
"mean_token_accuracy": 0.934454687833786,
"num_tokens": 73614017.0,
"step": 700
},
{
"entropy": 1.4703125,
"epoch": 0.01836344939033348,
"grad_norm": 4.25,
"learning_rate": 1.100122399020808e-06,
"loss": 0.3618,
"mean_token_accuracy": 0.9344593751430511,
"num_tokens": 79174232.0,
"step": 750
},
{
"entropy": 1.52625,
"epoch": 0.019587679349689045,
"grad_norm": 3.671875,
"learning_rate": 1.173561811505508e-06,
"loss": 0.3804,
"mean_token_accuracy": 0.9336516118049621,
"num_tokens": 84608483.0,
"step": 800
},
{
"entropy": 1.5390625,
"epoch": 0.02081190930904461,
"grad_norm": 3.8125,
"learning_rate": 1.2470012239902082e-06,
"loss": 0.353,
"mean_token_accuracy": 0.9379545438289643,
"num_tokens": 89999996.0,
"step": 850
},
{
"entropy": 1.54515625,
"epoch": 0.022036139268400177,
"grad_norm": 3.296875,
"learning_rate": 1.3204406364749082e-06,
"loss": 0.3294,
"mean_token_accuracy": 0.9422214996814727,
"num_tokens": 95124008.0,
"step": 900
},
{
"entropy": 1.5690625,
"epoch": 0.02326036922775574,
"grad_norm": 4.09375,
"learning_rate": 1.3938800489596082e-06,
"loss": 0.3514,
"mean_token_accuracy": 0.9378081679344177,
"num_tokens": 100136013.0,
"step": 950
},
{
"entropy": 1.55765625,
"epoch": 0.02448459918711131,
"grad_norm": 3.171875,
"learning_rate": 1.4673194614443085e-06,
"loss": 0.3343,
"mean_token_accuracy": 0.9409069657325745,
"num_tokens": 105114554.0,
"step": 1000
},
{
"entropy": 1.53890625,
"epoch": 0.025708829146466872,
"grad_norm": 3.296875,
"learning_rate": 1.5407588739290085e-06,
"loss": 0.3284,
"mean_token_accuracy": 0.9414800906181335,
"num_tokens": 110370176.0,
"step": 1050
},
{
"entropy": 1.5546875,
"epoch": 0.026933059105822436,
"grad_norm": 3.625,
"learning_rate": 1.6141982864137085e-06,
"loss": 0.3183,
"mean_token_accuracy": 0.9426046288013459,
"num_tokens": 115321229.0,
"step": 1100
},
{
"entropy": 1.55046875,
"epoch": 0.028157289065178004,
"grad_norm": 2.515625,
"learning_rate": 1.687637698898409e-06,
"loss": 0.332,
"mean_token_accuracy": 0.9409434747695923,
"num_tokens": 120648053.0,
"step": 1150
},
{
"entropy": 1.54046875,
"epoch": 0.029381519024533568,
"grad_norm": 2.578125,
"learning_rate": 1.761077111383109e-06,
"loss": 0.3266,
"mean_token_accuracy": 0.941448210477829,
"num_tokens": 126142957.0,
"step": 1200
},
{
"entropy": 1.54453125,
"epoch": 0.030605748983889135,
"grad_norm": 3.484375,
"learning_rate": 1.8345165238678093e-06,
"loss": 0.3357,
"mean_token_accuracy": 0.9392308318614959,
"num_tokens": 131721983.0,
"step": 1250
},
{
"entropy": 1.54265625,
"epoch": 0.0318299789432447,
"grad_norm": 3.890625,
"learning_rate": 1.9079559363525093e-06,
"loss": 0.323,
"mean_token_accuracy": 0.9425621521472931,
"num_tokens": 136834110.0,
"step": 1300
},
{
"entropy": 1.55375,
"epoch": 0.03305420890260027,
"grad_norm": 3.046875,
"learning_rate": 1.9813953488372093e-06,
"loss": 0.3103,
"mean_token_accuracy": 0.9435288536548615,
"num_tokens": 142077567.0,
"step": 1350
},
{
"entropy": 1.5815625,
"epoch": 0.03427843886195583,
"grad_norm": 1.703125,
"learning_rate": 2.0548347613219094e-06,
"loss": 0.325,
"mean_token_accuracy": 0.9404278743267059,
"num_tokens": 147938512.0,
"step": 1400
},
{
"entropy": 1.603125,
"epoch": 0.035502668821311395,
"grad_norm": 2.46875,
"learning_rate": 2.1282741738066094e-06,
"loss": 0.292,
"mean_token_accuracy": 0.94657958984375,
"num_tokens": 152967734.0,
"step": 1450
},
{
"entropy": 1.5790625,
"epoch": 0.03672689878066696,
"grad_norm": 4.84375,
"learning_rate": 2.20171358629131e-06,
"loss": 0.3027,
"mean_token_accuracy": 0.9442594313621521,
"num_tokens": 158252140.0,
"step": 1500
},
{
"entropy": 1.5709375,
"epoch": 0.03795112874002252,
"grad_norm": 2.796875,
"learning_rate": 2.27515299877601e-06,
"loss": 0.2935,
"mean_token_accuracy": 0.9451294171810151,
"num_tokens": 163906622.0,
"step": 1550
},
{
"entropy": 1.58890625,
"epoch": 0.03917535869937809,
"grad_norm": 2.703125,
"learning_rate": 2.34859241126071e-06,
"loss": 0.2963,
"mean_token_accuracy": 0.9451341640949249,
"num_tokens": 169532126.0,
"step": 1600
},
{
"entropy": 1.55125,
"epoch": 0.04039958865873366,
"grad_norm": 2.46875,
"learning_rate": 2.42203182374541e-06,
"loss": 0.2704,
"mean_token_accuracy": 0.9490817248821258,
"num_tokens": 174811062.0,
"step": 1650
},
{
"entropy": 1.54515625,
"epoch": 0.04162381861808922,
"grad_norm": 1.9296875,
"learning_rate": 2.49547123623011e-06,
"loss": 0.2704,
"mean_token_accuracy": 0.9497274696826935,
"num_tokens": 180019609.0,
"step": 1700
},
{
"entropy": 1.545625,
"epoch": 0.042848048577444786,
"grad_norm": 2.171875,
"learning_rate": 2.56891064871481e-06,
"loss": 0.2729,
"mean_token_accuracy": 0.9472972440719605,
"num_tokens": 185410342.0,
"step": 1750
},
{
"entropy": 1.54171875,
"epoch": 0.044072278536800354,
"grad_norm": 2.3125,
"learning_rate": 2.6423500611995105e-06,
"loss": 0.2723,
"mean_token_accuracy": 0.9487947750091553,
"num_tokens": 190878398.0,
"step": 1800
},
{
"entropy": 1.54234375,
"epoch": 0.04529650849615592,
"grad_norm": 2.8125,
"learning_rate": 2.715789473684211e-06,
"loss": 0.2761,
"mean_token_accuracy": 0.9490153706073761,
"num_tokens": 196027836.0,
"step": 1850
},
{
"entropy": 1.53703125,
"epoch": 0.04652073845551148,
"grad_norm": 2.578125,
"learning_rate": 2.789228886168911e-06,
"loss": 0.2882,
"mean_token_accuracy": 0.9455779695510864,
"num_tokens": 201616019.0,
"step": 1900
},
{
"entropy": 1.52796875,
"epoch": 0.04774496841486705,
"grad_norm": 1.671875,
"learning_rate": 2.862668298653611e-06,
"loss": 0.2609,
"mean_token_accuracy": 0.9508947324752808,
"num_tokens": 206756502.0,
"step": 1950
},
{
"entropy": 1.51796875,
"epoch": 0.04896919837422262,
"grad_norm": 2.296875,
"learning_rate": 2.936107711138311e-06,
"loss": 0.2627,
"mean_token_accuracy": 0.9504942214488983,
"num_tokens": 211966544.0,
"step": 2000
},
{
"entropy": 1.5309375,
"epoch": 0.05019342833357818,
"grad_norm": 2.84375,
"learning_rate": 3.0095471236230106e-06,
"loss": 0.2622,
"mean_token_accuracy": 0.9505769073963165,
"num_tokens": 217058889.0,
"step": 2050
},
{
"entropy": 1.52125,
"epoch": 0.051417658292933745,
"grad_norm": 2.203125,
"learning_rate": 3.082986536107711e-06,
"loss": 0.271,
"mean_token_accuracy": 0.9492247033119202,
"num_tokens": 222302364.0,
"step": 2100
},
{
"entropy": 1.5103125,
"epoch": 0.05264188825228931,
"grad_norm": 2.046875,
"learning_rate": 3.1564259485924115e-06,
"loss": 0.2836,
"mean_token_accuracy": 0.9467552089691162,
"num_tokens": 227892169.0,
"step": 2150
},
{
"entropy": 1.5121875,
"epoch": 0.05386611821164487,
"grad_norm": 1.6796875,
"learning_rate": 3.2298653610771116e-06,
"loss": 0.2772,
"mean_token_accuracy": 0.9473760116100312,
"num_tokens": 233522252.0,
"step": 2200
},
{
"entropy": 1.51453125,
"epoch": 0.05509034817100044,
"grad_norm": 2.46875,
"learning_rate": 3.303304773561812e-06,
"loss": 0.2814,
"mean_token_accuracy": 0.9471155571937561,
"num_tokens": 239241678.0,
"step": 2250
},
{
"entropy": 1.50359375,
"epoch": 0.05631457813035601,
"grad_norm": 6.65625,
"learning_rate": 3.3767441860465116e-06,
"loss": 0.252,
"mean_token_accuracy": 0.9517535066604614,
"num_tokens": 244573352.0,
"step": 2300
},
{
"entropy": 1.489375,
"epoch": 0.05753880808971157,
"grad_norm": 2.578125,
"learning_rate": 3.450183598531212e-06,
"loss": 0.2686,
"mean_token_accuracy": 0.9490506839752197,
"num_tokens": 249799704.0,
"step": 2350
},
{
"entropy": 1.5084375,
"epoch": 0.058763038049067136,
"grad_norm": 2.859375,
"learning_rate": 3.5236230110159117e-06,
"loss": 0.2593,
"mean_token_accuracy": 0.951296364068985,
"num_tokens": 255107263.0,
"step": 2400
},
{
"entropy": 1.49984375,
"epoch": 0.0599872680084227,
"grad_norm": 3.03125,
"learning_rate": 3.597062423500612e-06,
"loss": 0.2734,
"mean_token_accuracy": 0.9485626530647278,
"num_tokens": 260533835.0,
"step": 2450
},
{
"entropy": 1.48375,
"epoch": 0.06121149796777827,
"grad_norm": 1.0390625,
"learning_rate": 3.670501835985312e-06,
"loss": 0.2529,
"mean_token_accuracy": 0.9517404818534851,
"num_tokens": 265773085.0,
"step": 2500
},
{
"entropy": 1.4821875,
"epoch": 0.06243572792713383,
"grad_norm": 2.703125,
"learning_rate": 3.743941248470012e-06,
"loss": 0.2616,
"mean_token_accuracy": 0.9500162851810455,
"num_tokens": 271036305.0,
"step": 2550
},
{
"entropy": 1.47765625,
"epoch": 0.0636599578864894,
"grad_norm": 1.953125,
"learning_rate": 3.817380660954712e-06,
"loss": 0.2462,
"mean_token_accuracy": 0.9525675988197326,
"num_tokens": 275952458.0,
"step": 2600
},
{
"entropy": 1.48328125,
"epoch": 0.06488418784584496,
"grad_norm": 2.125,
"learning_rate": 3.890820073439412e-06,
"loss": 0.2592,
"mean_token_accuracy": 0.9498057246208191,
"num_tokens": 281644324.0,
"step": 2650
},
{
"entropy": 1.47390625,
"epoch": 0.06610841780520053,
"grad_norm": 1.90625,
"learning_rate": 3.964259485924113e-06,
"loss": 0.2416,
"mean_token_accuracy": 0.9530806469917298,
"num_tokens": 286839400.0,
"step": 2700
},
{
"entropy": 1.471875,
"epoch": 0.0673326477645561,
"grad_norm": 1.9296875,
"learning_rate": 4.037698898408813e-06,
"loss": 0.2483,
"mean_token_accuracy": 0.9517722308635712,
"num_tokens": 292713093.0,
"step": 2750
},
{
"entropy": 1.47,
"epoch": 0.06855687772391166,
"grad_norm": 1.8984375,
"learning_rate": 4.111138310893514e-06,
"loss": 0.2357,
"mean_token_accuracy": 0.9542278277873993,
"num_tokens": 297994633.0,
"step": 2800
},
{
"entropy": 1.48640625,
"epoch": 0.06978110768326723,
"grad_norm": 2.328125,
"learning_rate": 4.184577723378213e-06,
"loss": 0.2434,
"mean_token_accuracy": 0.9529701387882232,
"num_tokens": 303305735.0,
"step": 2850
},
{
"entropy": 1.46640625,
"epoch": 0.07100533764262279,
"grad_norm": 1.9765625,
"learning_rate": 4.258017135862914e-06,
"loss": 0.2228,
"mean_token_accuracy": 0.9564117324352265,
"num_tokens": 308665099.0,
"step": 2900
},
{
"entropy": 1.47671875,
"epoch": 0.07222956760197835,
"grad_norm": 2.546875,
"learning_rate": 4.331456548347613e-06,
"loss": 0.2485,
"mean_token_accuracy": 0.9520271122455597,
"num_tokens": 313894105.0,
"step": 2950
},
{
"entropy": 1.46984375,
"epoch": 0.07345379756133393,
"grad_norm": 2.125,
"learning_rate": 4.404895960832314e-06,
"loss": 0.2354,
"mean_token_accuracy": 0.9531759965419769,
"num_tokens": 319439357.0,
"step": 3000
},
{
"entropy": 1.479375,
"epoch": 0.07467802752068949,
"grad_norm": 1.96875,
"learning_rate": 4.478335373317013e-06,
"loss": 0.2506,
"mean_token_accuracy": 0.9517410743236542,
"num_tokens": 325090760.0,
"step": 3050
},
{
"entropy": 1.475,
"epoch": 0.07590225748004505,
"grad_norm": 1.6796875,
"learning_rate": 4.551774785801714e-06,
"loss": 0.2273,
"mean_token_accuracy": 0.955747674703598,
"num_tokens": 330405470.0,
"step": 3100
},
{
"entropy": 1.47546875,
"epoch": 0.07712648743940062,
"grad_norm": 1.8828125,
"learning_rate": 4.6252141982864134e-06,
"loss": 0.2391,
"mean_token_accuracy": 0.9522111368179321,
"num_tokens": 335678826.0,
"step": 3150
},
{
"entropy": 1.4603125,
"epoch": 0.07835071739875618,
"grad_norm": 1.53125,
"learning_rate": 4.698653610771114e-06,
"loss": 0.2344,
"mean_token_accuracy": 0.9539849495887757,
"num_tokens": 340918671.0,
"step": 3200
},
{
"entropy": 1.4509375,
"epoch": 0.07957494735811174,
"grad_norm": 2.5625,
"learning_rate": 4.7720930232558135e-06,
"loss": 0.2191,
"mean_token_accuracy": 0.9559646666049957,
"num_tokens": 346171106.0,
"step": 3250
},
{
"entropy": 1.454375,
"epoch": 0.08079917731746732,
"grad_norm": 5.6875,
"learning_rate": 4.845532435740514e-06,
"loss": 0.2356,
"mean_token_accuracy": 0.9528819477558136,
"num_tokens": 351560226.0,
"step": 3300
},
{
"entropy": 1.46609375,
"epoch": 0.08202340727682288,
"grad_norm": 1.859375,
"learning_rate": 4.918971848225214e-06,
"loss": 0.2387,
"mean_token_accuracy": 0.9533221650123597,
"num_tokens": 357311606.0,
"step": 3350
},
{
"entropy": 1.45046875,
"epoch": 0.08324763723617844,
"grad_norm": 3.0625,
"learning_rate": 4.992411260709914e-06,
"loss": 0.218,
"mean_token_accuracy": 0.9566865241527558,
"num_tokens": 362184714.0,
"step": 3400
},
{
"entropy": 1.44765625,
"epoch": 0.08447186719553401,
"grad_norm": 3.03125,
"learning_rate": 5.0658506731946145e-06,
"loss": 0.2163,
"mean_token_accuracy": 0.9571156585216523,
"num_tokens": 367118033.0,
"step": 3450
},
{
"entropy": 1.4721875,
"epoch": 0.08569609715488957,
"grad_norm": 1.90625,
"learning_rate": 5.139290085679315e-06,
"loss": 0.2269,
"mean_token_accuracy": 0.9551365935802459,
"num_tokens": 372554179.0,
"step": 3500
},
{
"entropy": 1.43546875,
"epoch": 0.08692032711424515,
"grad_norm": 2.65625,
"learning_rate": 5.212729498164015e-06,
"loss": 0.2235,
"mean_token_accuracy": 0.9559626686573028,
"num_tokens": 377909880.0,
"step": 3550
},
{
"entropy": 1.4384375,
"epoch": 0.08814455707360071,
"grad_norm": 1.7578125,
"learning_rate": 5.286168910648715e-06,
"loss": 0.2151,
"mean_token_accuracy": 0.9575206100940704,
"num_tokens": 383194488.0,
"step": 3600
},
{
"entropy": 1.42265625,
"epoch": 0.08936878703295627,
"grad_norm": 1.9609375,
"learning_rate": 5.3596083231334154e-06,
"loss": 0.229,
"mean_token_accuracy": 0.9538651633262635,
"num_tokens": 389073618.0,
"step": 3650
},
{
"entropy": 1.429375,
"epoch": 0.09059301699231184,
"grad_norm": 2.15625,
"learning_rate": 5.433047735618115e-06,
"loss": 0.2294,
"mean_token_accuracy": 0.9545065891742707,
"num_tokens": 394553347.0,
"step": 3700
},
{
"entropy": 1.42375,
"epoch": 0.0918172469516674,
"grad_norm": 2.078125,
"learning_rate": 5.5064871481028155e-06,
"loss": 0.2085,
"mean_token_accuracy": 0.9575728678703308,
"num_tokens": 399579739.0,
"step": 3750
},
{
"entropy": 1.411875,
"epoch": 0.09304147691102296,
"grad_norm": 1.6484375,
"learning_rate": 5.579926560587515e-06,
"loss": 0.2211,
"mean_token_accuracy": 0.9557280552387237,
"num_tokens": 404841496.0,
"step": 3800
},
{
"entropy": 1.40765625,
"epoch": 0.09426570687037854,
"grad_norm": 2.015625,
"learning_rate": 5.6533659730722156e-06,
"loss": 0.2125,
"mean_token_accuracy": 0.9576599287986756,
"num_tokens": 410023001.0,
"step": 3850
},
{
"entropy": 1.42984375,
"epoch": 0.0954899368297341,
"grad_norm": 2.640625,
"learning_rate": 5.726805385556916e-06,
"loss": 0.2279,
"mean_token_accuracy": 0.9547258257865906,
"num_tokens": 415549547.0,
"step": 3900
},
{
"entropy": 1.3978125,
"epoch": 0.09671416678908966,
"grad_norm": 2.59375,
"learning_rate": 5.800244798041616e-06,
"loss": 0.2232,
"mean_token_accuracy": 0.9551710951328277,
"num_tokens": 421034105.0,
"step": 3950
},
{
"entropy": 1.38796875,
"epoch": 0.09793839674844523,
"grad_norm": 1.515625,
"learning_rate": 5.873684210526316e-06,
"loss": 0.2162,
"mean_token_accuracy": 0.9557711553573608,
"num_tokens": 426688731.0,
"step": 4000
},
{
"entropy": 1.3903125,
"epoch": 0.0991626267078008,
"grad_norm": 10.25,
"learning_rate": 5.947123623011016e-06,
"loss": 0.2102,
"mean_token_accuracy": 0.9573217809200287,
"num_tokens": 431945587.0,
"step": 4050
},
{
"entropy": 1.37515625,
"epoch": 0.10038685666715635,
"grad_norm": 2.703125,
"learning_rate": 5.9999995181245345e-06,
"loss": 0.2068,
"mean_token_accuracy": 0.9580986511707306,
"num_tokens": 436945746.0,
"step": 4100
},
{
"entropy": 1.3790625,
"epoch": 0.10161108662651193,
"grad_norm": 2.171875,
"learning_rate": 5.999989929791556e-06,
"loss": 0.2008,
"mean_token_accuracy": 0.9594962692260742,
"num_tokens": 441913649.0,
"step": 4150
},
{
"entropy": 1.39890625,
"epoch": 0.10283531658586749,
"grad_norm": 2.25,
"learning_rate": 5.9999680487622435e-06,
"loss": 0.2158,
"mean_token_accuracy": 0.9564687287807465,
"num_tokens": 447263639.0,
"step": 4200
},
{
"entropy": 1.39796875,
"epoch": 0.10405954654522305,
"grad_norm": 2.359375,
"learning_rate": 5.999933875126256e-06,
"loss": 0.2235,
"mean_token_accuracy": 0.9537206184864044,
"num_tokens": 452831245.0,
"step": 4250
},
{
"entropy": 1.40046875,
"epoch": 0.10528377650457862,
"grad_norm": 2.484375,
"learning_rate": 5.999887409023625e-06,
"loss": 0.1983,
"mean_token_accuracy": 0.9605963575839996,
"num_tokens": 457920235.0,
"step": 4300
},
{
"entropy": 1.37109375,
"epoch": 0.10650800646393419,
"grad_norm": 2.46875,
"learning_rate": 5.9998286506447455e-06,
"loss": 0.1985,
"mean_token_accuracy": 0.9589159560203552,
"num_tokens": 463428491.0,
"step": 4350
},
{
"entropy": 1.393125,
"epoch": 0.10773223642328975,
"grad_norm": 2.4375,
"learning_rate": 5.999757600230387e-06,
"loss": 0.2181,
"mean_token_accuracy": 0.9564608442783356,
"num_tokens": 469183579.0,
"step": 4400
},
{
"entropy": 1.40828125,
"epoch": 0.10895646638264532,
"grad_norm": 1.953125,
"learning_rate": 5.999674258071684e-06,
"loss": 0.1997,
"mean_token_accuracy": 0.9596063613891601,
"num_tokens": 474548123.0,
"step": 4450
},
{
"entropy": 1.38171875,
"epoch": 0.11018069634200088,
"grad_norm": 2.25,
"learning_rate": 5.999578624510137e-06,
"loss": 0.2113,
"mean_token_accuracy": 0.9565052735805512,
"num_tokens": 480099691.0,
"step": 4500
},
{
"entropy": 1.39328125,
"epoch": 0.11140492630135644,
"grad_norm": 2.328125,
"learning_rate": 5.9994706999376126e-06,
"loss": 0.2096,
"mean_token_accuracy": 0.9578315222263336,
"num_tokens": 485485141.0,
"step": 4550
},
{
"entropy": 1.39828125,
"epoch": 0.11262915626071202,
"grad_norm": 2.125,
"learning_rate": 5.999350484796339e-06,
"loss": 0.1935,
"mean_token_accuracy": 0.9609186232089997,
"num_tokens": 490314941.0,
"step": 4600
},
{
"entropy": 1.41859375,
"epoch": 0.11385338622006758,
"grad_norm": 2.28125,
"learning_rate": 5.999217979578909e-06,
"loss": 0.2132,
"mean_token_accuracy": 0.9569031345844269,
"num_tokens": 495604676.0,
"step": 4650
},
{
"entropy": 1.41984375,
"epoch": 0.11507761617942314,
"grad_norm": 1.90625,
"learning_rate": 5.999073184828273e-06,
"loss": 0.1948,
"mean_token_accuracy": 0.9596328222751618,
"num_tokens": 500772718.0,
"step": 4700
},
{
"entropy": 1.42,
"epoch": 0.11630184613877871,
"grad_norm": 2.75,
"learning_rate": 5.998916101137737e-06,
"loss": 0.2128,
"mean_token_accuracy": 0.9574012553691864,
"num_tokens": 506105312.0,
"step": 4750
},
{
"entropy": 1.40890625,
"epoch": 0.11752607609813427,
"grad_norm": 2.671875,
"learning_rate": 5.998746729150967e-06,
"loss": 0.2019,
"mean_token_accuracy": 0.958700270652771,
"num_tokens": 511311990.0,
"step": 4800
},
{
"entropy": 1.41671875,
"epoch": 0.11875030605748983,
"grad_norm": 1.515625,
"learning_rate": 5.998565069561976e-06,
"loss": 0.2044,
"mean_token_accuracy": 0.9582890093326568,
"num_tokens": 516615202.0,
"step": 4850
},
{
"entropy": 1.4115625,
"epoch": 0.1199745360168454,
"grad_norm": 1.828125,
"learning_rate": 5.998371123115128e-06,
"loss": 0.207,
"mean_token_accuracy": 0.9571990466117859,
"num_tokens": 521934656.0,
"step": 4900
},
{
"entropy": 1.396875,
"epoch": 0.12119876597620097,
"grad_norm": 2.140625,
"learning_rate": 5.9981648906051355e-06,
"loss": 0.2069,
"mean_token_accuracy": 0.9578309345245362,
"num_tokens": 527328179.0,
"step": 4950
},
{
"entropy": 1.41046875,
"epoch": 0.12242299593555654,
"grad_norm": 2.484375,
"learning_rate": 5.9979463728770525e-06,
"loss": 0.1965,
"mean_token_accuracy": 0.9601268231868744,
"num_tokens": 532420262.0,
"step": 5000
},
{
"entropy": 1.3953125,
"epoch": 0.1236472258949121,
"grad_norm": 2.46875,
"learning_rate": 5.997715570826272e-06,
"loss": 0.1938,
"mean_token_accuracy": 0.9605181181430816,
"num_tokens": 537756232.0,
"step": 5050
},
{
"entropy": 1.390625,
"epoch": 0.12487145585426766,
"grad_norm": 1.5703125,
"learning_rate": 5.997472485398524e-06,
"loss": 0.2038,
"mean_token_accuracy": 0.9585963201522827,
"num_tokens": 543281806.0,
"step": 5100
},
{
"entropy": 1.4215625,
"epoch": 0.12609568581362324,
"grad_norm": 1.75,
"learning_rate": 5.99721711758987e-06,
"loss": 0.1969,
"mean_token_accuracy": 0.9599570655822753,
"num_tokens": 548233812.0,
"step": 5150
},
{
"entropy": 1.40515625,
"epoch": 0.1273199157729788,
"grad_norm": 2.375,
"learning_rate": 5.9969494684466985e-06,
"loss": 0.2041,
"mean_token_accuracy": 0.9577370703220367,
"num_tokens": 553736654.0,
"step": 5200
},
{
"entropy": 1.3990625,
"epoch": 0.12854414573233436,
"grad_norm": 2.140625,
"learning_rate": 5.996669539065727e-06,
"loss": 0.1945,
"mean_token_accuracy": 0.9612773549556732,
"num_tokens": 558856334.0,
"step": 5250
},
{
"entropy": 1.40203125,
"epoch": 0.12976837569168992,
"grad_norm": 1.7734375,
"learning_rate": 5.996377330593983e-06,
"loss": 0.2145,
"mean_token_accuracy": 0.9565242063999176,
"num_tokens": 564032272.0,
"step": 5300
},
{
"entropy": 1.39671875,
"epoch": 0.13099260565104548,
"grad_norm": 2.09375,
"learning_rate": 5.9960728442288186e-06,
"loss": 0.1992,
"mean_token_accuracy": 0.958374012708664,
"num_tokens": 569306892.0,
"step": 5350
},
{
"entropy": 1.38578125,
"epoch": 0.13221683561040107,
"grad_norm": 2.6875,
"learning_rate": 5.995756081217889e-06,
"loss": 0.1979,
"mean_token_accuracy": 0.9593621265888214,
"num_tokens": 574741752.0,
"step": 5400
},
{
"entropy": 1.38234375,
"epoch": 0.13344106556975663,
"grad_norm": 2.15625,
"learning_rate": 5.9954270428591555e-06,
"loss": 0.2003,
"mean_token_accuracy": 0.9591895163059234,
"num_tokens": 580457265.0,
"step": 5450
},
{
"entropy": 1.394375,
"epoch": 0.1346652955291122,
"grad_norm": 2.078125,
"learning_rate": 5.995085730500878e-06,
"loss": 0.1896,
"mean_token_accuracy": 0.9607266175746918,
"num_tokens": 585705175.0,
"step": 5500
},
{
"entropy": 1.39078125,
"epoch": 0.13588952548846775,
"grad_norm": 1.5234375,
"learning_rate": 5.994732145541613e-06,
"loss": 0.2003,
"mean_token_accuracy": 0.9587921166419983,
"num_tokens": 590923544.0,
"step": 5550
},
{
"entropy": 1.380625,
"epoch": 0.1371137554478233,
"grad_norm": 3.265625,
"learning_rate": 5.9943662894302e-06,
"loss": 0.1945,
"mean_token_accuracy": 0.9587338602542878,
"num_tokens": 596469221.0,
"step": 5600
},
{
"entropy": 1.4028125,
"epoch": 0.1383379854071789,
"grad_norm": 1.5859375,
"learning_rate": 5.993988163665767e-06,
"loss": 0.2225,
"mean_token_accuracy": 0.9551014530658722,
"num_tokens": 602167038.0,
"step": 5650
},
{
"entropy": 1.3846875,
"epoch": 0.13956221536653446,
"grad_norm": 2.640625,
"learning_rate": 5.9935977697977114e-06,
"loss": 0.201,
"mean_token_accuracy": 0.958451042175293,
"num_tokens": 607292638.0,
"step": 5700
},
{
"entropy": 1.3784375,
"epoch": 0.14078644532589002,
"grad_norm": 2.203125,
"learning_rate": 5.993195109425705e-06,
"loss": 0.2112,
"mean_token_accuracy": 0.9564135050773621,
"num_tokens": 613202323.0,
"step": 5750
},
{
"entropy": 1.38828125,
"epoch": 0.14201067528524558,
"grad_norm": 2.40625,
"learning_rate": 5.9927801841996784e-06,
"loss": 0.1937,
"mean_token_accuracy": 0.9602376103401185,
"num_tokens": 618640198.0,
"step": 5800
},
{
"entropy": 1.385,
"epoch": 0.14323490524460114,
"grad_norm": 2.609375,
"learning_rate": 5.992352995819822e-06,
"loss": 0.2075,
"mean_token_accuracy": 0.9579639828205109,
"num_tokens": 623893423.0,
"step": 5850
},
{
"entropy": 1.375625,
"epoch": 0.1444591352039567,
"grad_norm": 2.84375,
"learning_rate": 5.991913546036574e-06,
"loss": 0.2106,
"mean_token_accuracy": 0.9564978110790253,
"num_tokens": 629592369.0,
"step": 5900
},
{
"entropy": 1.37296875,
"epoch": 0.1456833651633123,
"grad_norm": 2.078125,
"learning_rate": 5.991461836650615e-06,
"loss": 0.211,
"mean_token_accuracy": 0.9563369131088257,
"num_tokens": 635736307.0,
"step": 5950
},
{
"entropy": 1.38203125,
"epoch": 0.14690759512266785,
"grad_norm": 3.0,
"learning_rate": 5.990997869512859e-06,
"loss": 0.1961,
"mean_token_accuracy": 0.9592690026760101,
"num_tokens": 641116233.0,
"step": 6000
},
{
"entropy": 1.378125,
"epoch": 0.1481318250820234,
"grad_norm": 2.65625,
"learning_rate": 5.990521646524447e-06,
"loss": 0.2008,
"mean_token_accuracy": 0.9585745882987976,
"num_tokens": 646167116.0,
"step": 6050
},
{
"entropy": 1.37140625,
"epoch": 0.14935605504137897,
"grad_norm": 2.25,
"learning_rate": 5.990033169636744e-06,
"loss": 0.1783,
"mean_token_accuracy": 0.962623051404953,
"num_tokens": 651158602.0,
"step": 6100
},
{
"entropy": 1.38609375,
"epoch": 0.15058028500073453,
"grad_norm": 2.390625,
"learning_rate": 5.989532440851319e-06,
"loss": 0.1925,
"mean_token_accuracy": 0.9600079596042633,
"num_tokens": 656353157.0,
"step": 6150
},
{
"entropy": 1.375625,
"epoch": 0.1518045149600901,
"grad_norm": 2.09375,
"learning_rate": 5.98901946221995e-06,
"loss": 0.1956,
"mean_token_accuracy": 0.9591733336448669,
"num_tokens": 661516084.0,
"step": 6200
},
{
"entropy": 1.3775,
"epoch": 0.15302874491944568,
"grad_norm": 2.59375,
"learning_rate": 5.988494235844608e-06,
"loss": 0.1857,
"mean_token_accuracy": 0.9618037152290344,
"num_tokens": 666952800.0,
"step": 6250
},
{
"entropy": 1.3721875,
"epoch": 0.15425297487880124,
"grad_norm": 1.546875,
"learning_rate": 5.987956763877448e-06,
"loss": 0.1994,
"mean_token_accuracy": 0.9587778007984161,
"num_tokens": 672306196.0,
"step": 6300
},
{
"entropy": 1.390625,
"epoch": 0.1554772048381568,
"grad_norm": 2.1875,
"learning_rate": 5.987407048520806e-06,
"loss": 0.1843,
"mean_token_accuracy": 0.9617053723335266,
"num_tokens": 677399978.0,
"step": 6350
},
{
"entropy": 1.38171875,
"epoch": 0.15670143479751236,
"grad_norm": 1.8671875,
"learning_rate": 5.986845092027181e-06,
"loss": 0.1937,
"mean_token_accuracy": 0.9602959334850312,
"num_tokens": 682747630.0,
"step": 6400
},
{
"entropy": 1.38578125,
"epoch": 0.15792566475686792,
"grad_norm": 2.671875,
"learning_rate": 5.986270896699237e-06,
"loss": 0.177,
"mean_token_accuracy": 0.964161764383316,
"num_tokens": 687573308.0,
"step": 6450
},
{
"entropy": 1.394375,
"epoch": 0.15914989471622348,
"grad_norm": 2.15625,
"learning_rate": 5.985684464889784e-06,
"loss": 0.1956,
"mean_token_accuracy": 0.9590267181396485,
"num_tokens": 692719553.0,
"step": 6500
},
{
"entropy": 1.4165625,
"epoch": 0.16037412467557907,
"grad_norm": 2.640625,
"learning_rate": 5.985085799001773e-06,
"loss": 0.21,
"mean_token_accuracy": 0.9567484962940216,
"num_tokens": 698446523.0,
"step": 6550
},
{
"entropy": 1.39546875,
"epoch": 0.16159835463493463,
"grad_norm": 1.8984375,
"learning_rate": 5.984474901488284e-06,
"loss": 0.1936,
"mean_token_accuracy": 0.9587848937511444,
"num_tokens": 703964383.0,
"step": 6600
},
{
"entropy": 1.3865625,
"epoch": 0.1628225845942902,
"grad_norm": 2.5625,
"learning_rate": 5.983851774852519e-06,
"loss": 0.1814,
"mean_token_accuracy": 0.9620046615600586,
"num_tokens": 708987822.0,
"step": 6650
},
{
"entropy": 1.38390625,
"epoch": 0.16404681455364575,
"grad_norm": 1.6015625,
"learning_rate": 5.983216421647789e-06,
"loss": 0.1997,
"mean_token_accuracy": 0.9585830473899841,
"num_tokens": 714405287.0,
"step": 6700
},
{
"entropy": 1.37453125,
"epoch": 0.16527104451300131,
"grad_norm": 2.40625,
"learning_rate": 5.982568844477502e-06,
"loss": 0.1944,
"mean_token_accuracy": 0.9597526073455811,
"num_tokens": 719693246.0,
"step": 6750
},
{
"entropy": 1.34859375,
"epoch": 0.16649527447235687,
"grad_norm": 2.265625,
"learning_rate": 5.9819090459951595e-06,
"loss": 0.1792,
"mean_token_accuracy": 0.9628249955177307,
"num_tokens": 724856885.0,
"step": 6800
},
{
"entropy": 1.37203125,
"epoch": 0.16771950443171246,
"grad_norm": 1.921875,
"learning_rate": 5.981237028904336e-06,
"loss": 0.2106,
"mean_token_accuracy": 0.9559559297561645,
"num_tokens": 730337882.0,
"step": 6850
},
{
"entropy": 1.3596875,
"epoch": 0.16894373439106802,
"grad_norm": 2.78125,
"learning_rate": 5.980552795958676e-06,
"loss": 0.1715,
"mean_token_accuracy": 0.964083902835846,
"num_tokens": 735194384.0,
"step": 6900
},
{
"entropy": 1.37875,
"epoch": 0.17016796435042358,
"grad_norm": 2.890625,
"learning_rate": 5.979856349961876e-06,
"loss": 0.1884,
"mean_token_accuracy": 0.961032167673111,
"num_tokens": 740456561.0,
"step": 6950
},
{
"entropy": 1.34078125,
"epoch": 0.17139219430977914,
"grad_norm": 1.875,
"learning_rate": 5.979147693767682e-06,
"loss": 0.1824,
"mean_token_accuracy": 0.9612845265865326,
"num_tokens": 745438122.0,
"step": 7000
},
{
"entropy": 1.35234375,
"epoch": 0.1726164242691347,
"grad_norm": 1.8828125,
"learning_rate": 5.978426830279867e-06,
"loss": 0.2001,
"mean_token_accuracy": 0.9585837364196778,
"num_tokens": 750857417.0,
"step": 7050
},
{
"entropy": 1.35828125,
"epoch": 0.1738406542284903,
"grad_norm": 1.5703125,
"learning_rate": 5.977693762452226e-06,
"loss": 0.2077,
"mean_token_accuracy": 0.956944135427475,
"num_tokens": 756565585.0,
"step": 7100
},
{
"entropy": 1.37453125,
"epoch": 0.17506488418784585,
"grad_norm": 1.59375,
"learning_rate": 5.976948493288563e-06,
"loss": 0.1978,
"mean_token_accuracy": 0.9594669210910797,
"num_tokens": 762042483.0,
"step": 7150
},
{
"entropy": 1.38609375,
"epoch": 0.17628911414720141,
"grad_norm": 1.96875,
"learning_rate": 5.976191025842678e-06,
"loss": 0.1967,
"mean_token_accuracy": 0.9588606441020966,
"num_tokens": 767082096.0,
"step": 7200
},
{
"entropy": 1.3721875,
"epoch": 0.17751334410655698,
"grad_norm": 2.4375,
"learning_rate": 5.975421363218352e-06,
"loss": 0.1896,
"mean_token_accuracy": 0.9610229313373566,
"num_tokens": 772416657.0,
"step": 7250
},
{
"entropy": 1.37078125,
"epoch": 0.17873757406591254,
"grad_norm": 2.46875,
"learning_rate": 5.97463950856934e-06,
"loss": 0.187,
"mean_token_accuracy": 0.9611088275909424,
"num_tokens": 777391863.0,
"step": 7300
},
{
"entropy": 1.3696875,
"epoch": 0.1799618040252681,
"grad_norm": 2.9375,
"learning_rate": 5.973845465099352e-06,
"loss": 0.196,
"mean_token_accuracy": 0.9594384169578553,
"num_tokens": 782502134.0,
"step": 7350
},
{
"entropy": 1.3825,
"epoch": 0.18118603398462368,
"grad_norm": 3.296875,
"learning_rate": 5.973039236062047e-06,
"loss": 0.1826,
"mean_token_accuracy": 0.9621104383468628,
"num_tokens": 787376887.0,
"step": 7400
},
{
"entropy": 1.3746875,
"epoch": 0.18241026394397925,
"grad_norm": 2.609375,
"learning_rate": 5.9722208247610095e-06,
"loss": 0.1904,
"mean_token_accuracy": 0.9605046558380127,
"num_tokens": 792554125.0,
"step": 7450
},
{
"entropy": 1.39890625,
"epoch": 0.1836344939033348,
"grad_norm": 2.375,
"learning_rate": 5.971390234549746e-06,
"loss": 0.1981,
"mean_token_accuracy": 0.9588062584400177,
"num_tokens": 797990011.0,
"step": 7500
},
{
"entropy": 1.39328125,
"epoch": 0.18485872386269037,
"grad_norm": 2.1875,
"learning_rate": 5.970547468831664e-06,
"loss": 0.1827,
"mean_token_accuracy": 0.9626439011096954,
"num_tokens": 802985973.0,
"step": 7550
},
{
"entropy": 1.40375,
"epoch": 0.18608295382204593,
"grad_norm": 2.140625,
"learning_rate": 5.969692531060065e-06,
"loss": 0.1851,
"mean_token_accuracy": 0.9621277391910553,
"num_tokens": 808398744.0,
"step": 7600
},
{
"entropy": 1.391875,
"epoch": 0.1873071837814015,
"grad_norm": 1.421875,
"learning_rate": 5.9688254247381225e-06,
"loss": 0.1859,
"mean_token_accuracy": 0.9607931089401245,
"num_tokens": 813549741.0,
"step": 7650
},
{
"entropy": 1.3784375,
"epoch": 0.18853141374075708,
"grad_norm": 3.171875,
"learning_rate": 5.967946153418875e-06,
"loss": 0.1862,
"mean_token_accuracy": 0.9606724309921265,
"num_tokens": 818604872.0,
"step": 7700
},
{
"entropy": 1.3865625,
"epoch": 0.18975564370011264,
"grad_norm": 2.046875,
"learning_rate": 5.967054720705204e-06,
"loss": 0.1934,
"mean_token_accuracy": 0.9598609590530396,
"num_tokens": 824064581.0,
"step": 7750
},
{
"entropy": 1.39875,
"epoch": 0.1909798736594682,
"grad_norm": 2.53125,
"learning_rate": 5.966151130249828e-06,
"loss": 0.1926,
"mean_token_accuracy": 0.9593923246860504,
"num_tokens": 829369830.0,
"step": 7800
},
{
"entropy": 1.3865625,
"epoch": 0.19220410361882376,
"grad_norm": 2.28125,
"learning_rate": 5.965235385755279e-06,
"loss": 0.1926,
"mean_token_accuracy": 0.9593356001377106,
"num_tokens": 834877335.0,
"step": 7850
},
{
"entropy": 1.39328125,
"epoch": 0.19342833357817932,
"grad_norm": 9.0,
"learning_rate": 5.9643074909738936e-06,
"loss": 0.1847,
"mean_token_accuracy": 0.9613538563251496,
"num_tokens": 840076176.0,
"step": 7900
},
{
"entropy": 1.38703125,
"epoch": 0.19465256353753488,
"grad_norm": 2.3125,
"learning_rate": 5.963367449707793e-06,
"loss": 0.1815,
"mean_token_accuracy": 0.9614927160739899,
"num_tokens": 845350867.0,
"step": 7950
},
{
"entropy": 1.39875,
"epoch": 0.19587679349689047,
"grad_norm": 1.8359375,
"learning_rate": 5.962415265808872e-06,
"loss": 0.1921,
"mean_token_accuracy": 0.9596588695049286,
"num_tokens": 850547684.0,
"step": 8000
},
{
"entropy": 1.3890625,
"epoch": 0.19710102345624603,
"grad_norm": 2.6875,
"learning_rate": 5.961450943178779e-06,
"loss": 0.1915,
"mean_token_accuracy": 0.9603916919231414,
"num_tokens": 855721426.0,
"step": 8050
},
{
"entropy": 1.37421875,
"epoch": 0.1983252534156016,
"grad_norm": 2.734375,
"learning_rate": 5.960474485768902e-06,
"loss": 0.1722,
"mean_token_accuracy": 0.963141576051712,
"num_tokens": 860509090.0,
"step": 8100
},
{
"entropy": 1.34984375,
"epoch": 0.19954948337495715,
"grad_norm": 2.109375,
"learning_rate": 5.959485897580353e-06,
"loss": 0.1799,
"mean_token_accuracy": 0.9624167239665985,
"num_tokens": 865732499.0,
"step": 8150
},
{
"entropy": 1.37765625,
"epoch": 0.2007737133343127,
"grad_norm": 2.875,
"learning_rate": 5.95848518266395e-06,
"loss": 0.1955,
"mean_token_accuracy": 0.9592999804019928,
"num_tokens": 870715442.0,
"step": 8200
},
{
"entropy": 1.3496875,
"epoch": 0.20199794329366827,
"grad_norm": 1.8359375,
"learning_rate": 5.957472345120202e-06,
"loss": 0.1826,
"mean_token_accuracy": 0.9611281609535217,
"num_tokens": 875976771.0,
"step": 8250
},
{
"entropy": 1.331875,
"epoch": 0.20322217325302386,
"grad_norm": 2.34375,
"learning_rate": 5.95644738909929e-06,
"loss": 0.1801,
"mean_token_accuracy": 0.9619064545631408,
"num_tokens": 881030532.0,
"step": 8300
},
{
"entropy": 1.33828125,
"epoch": 0.20444640321237942,
"grad_norm": 2.3125,
"learning_rate": 5.9554103188010544e-06,
"loss": 0.1844,
"mean_token_accuracy": 0.9607453966140747,
"num_tokens": 886102364.0,
"step": 8350
},
{
"entropy": 1.33625,
"epoch": 0.20567063317173498,
"grad_norm": 2.59375,
"learning_rate": 5.9543611384749716e-06,
"loss": 0.1896,
"mean_token_accuracy": 0.9599519455432892,
"num_tokens": 891339628.0,
"step": 8400
},
{
"entropy": 1.3515625,
"epoch": 0.20689486313109054,
"grad_norm": 3.1875,
"learning_rate": 5.953299852420142e-06,
"loss": 0.1963,
"mean_token_accuracy": 0.9594342112541199,
"num_tokens": 896598491.0,
"step": 8450
},
{
"entropy": 1.3475,
"epoch": 0.2081190930904461,
"grad_norm": 1.6171875,
"learning_rate": 5.952226464985268e-06,
"loss": 0.1876,
"mean_token_accuracy": 0.9601819491386414,
"num_tokens": 901857034.0,
"step": 8500
},
{
"entropy": 1.34546875,
"epoch": 0.2093433230498017,
"grad_norm": 2.484375,
"learning_rate": 5.951140980568639e-06,
"loss": 0.2025,
"mean_token_accuracy": 0.9580735051631928,
"num_tokens": 907672007.0,
"step": 8550
},
{
"entropy": 1.3434375,
"epoch": 0.21056755300915725,
"grad_norm": 2.859375,
"learning_rate": 5.950043403618116e-06,
"loss": 0.182,
"mean_token_accuracy": 0.9620107614994049,
"num_tokens": 912959621.0,
"step": 8600
},
{
"entropy": 1.34140625,
"epoch": 0.2117917829685128,
"grad_norm": 2.015625,
"learning_rate": 5.948933738631106e-06,
"loss": 0.182,
"mean_token_accuracy": 0.9617352223396302,
"num_tokens": 918075673.0,
"step": 8650
},
{
"entropy": 1.3446875,
"epoch": 0.21301601292786837,
"grad_norm": 2.625,
"learning_rate": 5.9478119901545485e-06,
"loss": 0.1863,
"mean_token_accuracy": 0.960466115474701,
"num_tokens": 923511470.0,
"step": 8700
},
{
"entropy": 1.3490625,
"epoch": 0.21424024288722393,
"grad_norm": 2.4375,
"learning_rate": 5.946678162784898e-06,
"loss": 0.1997,
"mean_token_accuracy": 0.9574442803859711,
"num_tokens": 929168035.0,
"step": 8750
},
{
"entropy": 1.3559375,
"epoch": 0.2154644728465795,
"grad_norm": 2.59375,
"learning_rate": 5.945532261168101e-06,
"loss": 0.188,
"mean_token_accuracy": 0.9608505368232727,
"num_tokens": 934643696.0,
"step": 8800
},
{
"entropy": 1.37,
"epoch": 0.21668870280593508,
"grad_norm": 2.84375,
"learning_rate": 5.9443742899995815e-06,
"loss": 0.1987,
"mean_token_accuracy": 0.9590060126781463,
"num_tokens": 940012909.0,
"step": 8850
},
{
"entropy": 1.360625,
"epoch": 0.21791293276529064,
"grad_norm": 2.28125,
"learning_rate": 5.943204254024216e-06,
"loss": 0.1835,
"mean_token_accuracy": 0.9617989957332611,
"num_tokens": 945384360.0,
"step": 8900
},
{
"entropy": 1.3675,
"epoch": 0.2191371627246462,
"grad_norm": 3.03125,
"learning_rate": 5.942022158036322e-06,
"loss": 0.1955,
"mean_token_accuracy": 0.9601530432701111,
"num_tokens": 950833742.0,
"step": 8950
},
{
"entropy": 1.38125,
"epoch": 0.22036139268400176,
"grad_norm": 2.578125,
"learning_rate": 5.9408280068796286e-06,
"loss": 0.2066,
"mean_token_accuracy": 0.9570643317699432,
"num_tokens": 956401892.0,
"step": 9000
},
{
"entropy": 1.37234375,
"epoch": 0.22158562264335732,
"grad_norm": 1.71875,
"learning_rate": 5.939621805447267e-06,
"loss": 0.1804,
"mean_token_accuracy": 0.9623953711986541,
"num_tokens": 961223140.0,
"step": 9050
},
{
"entropy": 1.391875,
"epoch": 0.22280985260271288,
"grad_norm": 2.15625,
"learning_rate": 5.938403558681743e-06,
"loss": 0.202,
"mean_token_accuracy": 0.9580870044231414,
"num_tokens": 966771629.0,
"step": 9100
},
{
"entropy": 1.36703125,
"epoch": 0.22403408256206847,
"grad_norm": 2.609375,
"learning_rate": 5.9371732715749175e-06,
"loss": 0.1866,
"mean_token_accuracy": 0.9609157121181489,
"num_tokens": 972305399.0,
"step": 9150
},
{
"entropy": 1.35140625,
"epoch": 0.22525831252142403,
"grad_norm": 1.6796875,
"learning_rate": 5.935930949167991e-06,
"loss": 0.1815,
"mean_token_accuracy": 0.9617423331737518,
"num_tokens": 977370470.0,
"step": 9200
},
{
"entropy": 1.36953125,
"epoch": 0.2264825424807796,
"grad_norm": 2.140625,
"learning_rate": 5.934676596551477e-06,
"loss": 0.1884,
"mean_token_accuracy": 0.9609754991531372,
"num_tokens": 982652269.0,
"step": 9250
},
{
"entropy": 1.363125,
"epoch": 0.22770677244013515,
"grad_norm": 2.484375,
"learning_rate": 5.933410218865186e-06,
"loss": 0.1858,
"mean_token_accuracy": 0.9611955726146698,
"num_tokens": 988014138.0,
"step": 9300
},
{
"entropy": 1.37265625,
"epoch": 0.2289310023994907,
"grad_norm": 2.53125,
"learning_rate": 5.932131821298198e-06,
"loss": 0.1856,
"mean_token_accuracy": 0.9616758930683136,
"num_tokens": 993370242.0,
"step": 9350
},
{
"entropy": 1.38515625,
"epoch": 0.23015523235884627,
"grad_norm": 2.34375,
"learning_rate": 5.930841409088853e-06,
"loss": 0.1906,
"mean_token_accuracy": 0.9603582990169525,
"num_tokens": 998918502.0,
"step": 9400
},
{
"entropy": 1.39,
"epoch": 0.23137946231820186,
"grad_norm": 2.578125,
"learning_rate": 5.929538987524712e-06,
"loss": 0.1854,
"mean_token_accuracy": 0.9604568040370941,
"num_tokens": 1004326538.0,
"step": 9450
},
{
"entropy": 1.3890625,
"epoch": 0.23260369227755742,
"grad_norm": 2.75,
"learning_rate": 5.928224561942554e-06,
"loss": 0.1812,
"mean_token_accuracy": 0.9616895508766174,
"num_tokens": 1009603548.0,
"step": 9500
},
{
"entropy": 1.3871875,
"epoch": 0.23382792223691298,
"grad_norm": 2.3125,
"learning_rate": 5.92689813772834e-06,
"loss": 0.1963,
"mean_token_accuracy": 0.9590861582756043,
"num_tokens": 1015070964.0,
"step": 9550
},
{
"entropy": 1.36609375,
"epoch": 0.23505215219626854,
"grad_norm": 3.109375,
"learning_rate": 5.9255597203172e-06,
"loss": 0.1828,
"mean_token_accuracy": 0.9619620275497437,
"num_tokens": 1020492153.0,
"step": 9600
},
{
"entropy": 1.38609375,
"epoch": 0.2362763821556241,
"grad_norm": 2.421875,
"learning_rate": 5.924209315193405e-06,
"loss": 0.1845,
"mean_token_accuracy": 0.961515667438507,
"num_tokens": 1025864529.0,
"step": 9650
},
{
"entropy": 1.3715625,
"epoch": 0.23750061211497966,
"grad_norm": 2.296875,
"learning_rate": 5.922846927890345e-06,
"loss": 0.1797,
"mean_token_accuracy": 0.9618804860115051,
"num_tokens": 1031024359.0,
"step": 9700
},
{
"entropy": 1.36359375,
"epoch": 0.23872484207433525,
"grad_norm": 2.46875,
"learning_rate": 5.9214725639905115e-06,
"loss": 0.1863,
"mean_token_accuracy": 0.9610350334644318,
"num_tokens": 1036377471.0,
"step": 9750
},
{
"entropy": 1.3715625,
"epoch": 0.2399490720336908,
"grad_norm": 2.859375,
"learning_rate": 5.92008622912547e-06,
"loss": 0.1831,
"mean_token_accuracy": 0.9612818145751953,
"num_tokens": 1041703688.0,
"step": 9800
},
{
"entropy": 1.35671875,
"epoch": 0.24117330199304637,
"grad_norm": 2.6875,
"learning_rate": 5.918687928975836e-06,
"loss": 0.1839,
"mean_token_accuracy": 0.9616091656684875,
"num_tokens": 1046917985.0,
"step": 9850
},
{
"entropy": 1.39015625,
"epoch": 0.24239753195240193,
"grad_norm": 1.8046875,
"learning_rate": 5.9172776692712575e-06,
"loss": 0.1965,
"mean_token_accuracy": 0.9584881782531738,
"num_tokens": 1052482737.0,
"step": 9900
},
{
"entropy": 1.38703125,
"epoch": 0.2436217619117575,
"grad_norm": 2.6875,
"learning_rate": 5.915855455790381e-06,
"loss": 0.1884,
"mean_token_accuracy": 0.9608153140544892,
"num_tokens": 1057868410.0,
"step": 9950
},
{
"entropy": 1.395,
"epoch": 0.24484599187111308,
"grad_norm": 2.8125,
"learning_rate": 5.914421294360843e-06,
"loss": 0.1904,
"mean_token_accuracy": 0.9597806739807129,
"num_tokens": 1063175179.0,
"step": 10000
},
{
"epoch": 0.24484599187111308,
"eval_entropy": 1.359765625,
"eval_loss": 0.20250044763088226,
"eval_mean_token_accuracy": 0.9580152039726575,
"eval_num_tokens": 1063175179.0,
"eval_runtime": 600.0597,
"eval_samples_per_second": 16.092,
"eval_steps_per_second": 0.202,
"step": 10000
},
{
"entropy": 1.3840625,
"epoch": 0.24607022183046864,
"grad_norm": 2.28125,
"learning_rate": 5.912975190859232e-06,
"loss": 0.195,
"mean_token_accuracy": 0.9596641564369202,
"num_tokens": 1068741854.0,
"step": 10050
},
{
"entropy": 1.3790625,
"epoch": 0.2472944517898242,
"grad_norm": 2.484375,
"learning_rate": 5.9115171512110714e-06,
"loss": 0.1854,
"mean_token_accuracy": 0.9604480576515197,
"num_tokens": 1074116479.0,
"step": 10100
},
{
"entropy": 1.36453125,
"epoch": 0.24851868174917977,
"grad_norm": 2.171875,
"learning_rate": 5.910047181390794e-06,
"loss": 0.1697,
"mean_token_accuracy": 0.9642793035507202,
"num_tokens": 1079159902.0,
"step": 10150
},
{
"entropy": 1.373125,
"epoch": 0.24974291170853533,
"grad_norm": 1.9765625,
"learning_rate": 5.908565287421718e-06,
"loss": 0.1861,
"mean_token_accuracy": 0.9611909198760986,
"num_tokens": 1084521049.0,
"step": 10200
},
{
"entropy": 1.3578125,
"epoch": 0.2509671416678909,
"grad_norm": 2.65625,
"learning_rate": 5.907071475376021e-06,
"loss": 0.1787,
"mean_token_accuracy": 0.9620854771137237,
"num_tokens": 1089493722.0,
"step": 10250
},
{
"entropy": 1.36484375,
"epoch": 0.2521913716272465,
"grad_norm": 2.640625,
"learning_rate": 5.905565751374717e-06,
"loss": 0.1732,
"mean_token_accuracy": 0.9639436435699463,
"num_tokens": 1094338571.0,
"step": 10300
},
{
"entropy": 1.37234375,
"epoch": 0.25341560158660204,
"grad_norm": 2.5625,
"learning_rate": 5.904048121587628e-06,
"loss": 0.1772,
"mean_token_accuracy": 0.9625762343406677,
"num_tokens": 1099742354.0,
"step": 10350
},
{
"entropy": 1.38359375,
"epoch": 0.2546398315459576,
"grad_norm": 1.5078125,
"learning_rate": 5.902518592233363e-06,
"loss": 0.1987,
"mean_token_accuracy": 0.9577878427505493,
"num_tokens": 1105617487.0,
"step": 10400
},
{
"entropy": 1.3615625,
"epoch": 0.25586406150531316,
"grad_norm": 3.234375,
"learning_rate": 5.9009771695792905e-06,
"loss": 0.1811,
"mean_token_accuracy": 0.9621189975738526,
"num_tokens": 1110680544.0,
"step": 10450
},
{
"entropy": 1.37375,
"epoch": 0.2570882914646687,
"grad_norm": 2.140625,
"learning_rate": 5.899423859941511e-06,
"loss": 0.1882,
"mean_token_accuracy": 0.9606586790084839,
"num_tokens": 1116178837.0,
"step": 10500
},
{
"entropy": 1.37484375,
"epoch": 0.2583125214240243,
"grad_norm": 1.7578125,
"learning_rate": 5.897858669684833e-06,
"loss": 0.1893,
"mean_token_accuracy": 0.9598471677303314,
"num_tokens": 1121511467.0,
"step": 10550
},
{
"entropy": 1.3609375,
"epoch": 0.25953675138337984,
"grad_norm": 2.078125,
"learning_rate": 5.896281605222749e-06,
"loss": 0.1806,
"mean_token_accuracy": 0.9624120283126831,
"num_tokens": 1126507233.0,
"step": 10600
},
{
"entropy": 1.34734375,
"epoch": 0.2607609813427354,
"grad_norm": 2.28125,
"learning_rate": 5.8946926730174045e-06,
"loss": 0.1863,
"mean_token_accuracy": 0.9608824181556702,
"num_tokens": 1131912464.0,
"step": 10650
},
{
"entropy": 1.33921875,
"epoch": 0.26198521130209096,
"grad_norm": 2.5625,
"learning_rate": 5.893091879579575e-06,
"loss": 0.1856,
"mean_token_accuracy": 0.9607326745986938,
"num_tokens": 1136882208.0,
"step": 10700
},
{
"entropy": 1.343125,
"epoch": 0.2632094412614466,
"grad_norm": 1.9921875,
"learning_rate": 5.89147923146864e-06,
"loss": 0.1813,
"mean_token_accuracy": 0.9620126748085022,
"num_tokens": 1142095292.0,
"step": 10750
},
{
"entropy": 1.34765625,
"epoch": 0.26443367122080214,
"grad_norm": 3.234375,
"learning_rate": 5.889854735292551e-06,
"loss": 0.1841,
"mean_token_accuracy": 0.9618128108978271,
"num_tokens": 1147363920.0,
"step": 10800
},
{
"entropy": 1.356875,
"epoch": 0.2656579011801577,
"grad_norm": 2.46875,
"learning_rate": 5.888218397707811e-06,
"loss": 0.1742,
"mean_token_accuracy": 0.9638459277153015,
"num_tokens": 1152380705.0,
"step": 10850
},
{
"entropy": 1.32984375,
"epoch": 0.26688213113951326,
"grad_norm": 2.109375,
"learning_rate": 5.886570225419441e-06,
"loss": 0.1865,
"mean_token_accuracy": 0.9608019030094147,
"num_tokens": 1157839898.0,
"step": 10900
},
{
"entropy": 1.34609375,
"epoch": 0.2681063610988688,
"grad_norm": 3.453125,
"learning_rate": 5.88491022518096e-06,
"loss": 0.1918,
"mean_token_accuracy": 0.9609634006023406,
"num_tokens": 1163068506.0,
"step": 10950
},
{
"entropy": 1.32734375,
"epoch": 0.2693305910582244,
"grad_norm": 2.125,
"learning_rate": 5.883238403794349e-06,
"loss": 0.1758,
"mean_token_accuracy": 0.9633646559715271,
"num_tokens": 1168287852.0,
"step": 11000
},
{
"entropy": 1.34375,
"epoch": 0.27055482101757994,
"grad_norm": 2.296875,
"learning_rate": 5.881554768110028e-06,
"loss": 0.1914,
"mean_token_accuracy": 0.9605349290370941,
"num_tokens": 1173597061.0,
"step": 11050
},
{
"entropy": 1.3434375,
"epoch": 0.2717790509769355,
"grad_norm": 3.5,
"learning_rate": 5.879859325026828e-06,
"loss": 0.1864,
"mean_token_accuracy": 0.9604840254783631,
"num_tokens": 1178845621.0,
"step": 11100
},
{
"entropy": 1.35984375,
"epoch": 0.27300328093629106,
"grad_norm": 2.734375,
"learning_rate": 5.878152081491963e-06,
"loss": 0.1925,
"mean_token_accuracy": 0.9589577269554138,
"num_tokens": 1184054388.0,
"step": 11150
},
{
"entropy": 1.34875,
"epoch": 0.2742275108956466,
"grad_norm": 2.625,
"learning_rate": 5.876433044500996e-06,
"loss": 0.1921,
"mean_token_accuracy": 0.9595346593856812,
"num_tokens": 1189697396.0,
"step": 11200
},
{
"entropy": 1.34390625,
"epoch": 0.2754517408550022,
"grad_norm": 2.0,
"learning_rate": 5.874702221097819e-06,
"loss": 0.1882,
"mean_token_accuracy": 0.960370112657547,
"num_tokens": 1195166226.0,
"step": 11250
},
{
"entropy": 1.34515625,
"epoch": 0.2766759708143578,
"grad_norm": 2.734375,
"learning_rate": 5.8729596183746175e-06,
"loss": 0.1805,
"mean_token_accuracy": 0.9621370649337768,
"num_tokens": 1200392905.0,
"step": 11300
},
{
"entropy": 1.3428125,
"epoch": 0.27790020077371336,
"grad_norm": 3.078125,
"learning_rate": 5.871205243471844e-06,
"loss": 0.1841,
"mean_token_accuracy": 0.9613085889816284,
"num_tokens": 1205618541.0,
"step": 11350
},
{
"entropy": 1.35171875,
"epoch": 0.2791244307330689,
"grad_norm": 3.40625,
"learning_rate": 5.869439103578189e-06,
"loss": 0.1852,
"mean_token_accuracy": 0.9616814315319061,
"num_tokens": 1210836329.0,
"step": 11400
},
{
"entropy": 1.3453125,
"epoch": 0.2803486606924245,
"grad_norm": 1.8359375,
"learning_rate": 5.867661205930549e-06,
"loss": 0.1821,
"mean_token_accuracy": 0.9620612812042236,
"num_tokens": 1215867506.0,
"step": 11450
},
{
"entropy": 1.35875,
"epoch": 0.28157289065178004,
"grad_norm": 2.953125,
"learning_rate": 5.865871557814003e-06,
"loss": 0.1915,
"mean_token_accuracy": 0.9604600322246551,
"num_tokens": 1220793244.0,
"step": 11500
},
{
"entropy": 1.353125,
"epoch": 0.2827971206111356,
"grad_norm": 2.796875,
"learning_rate": 5.864070166561775e-06,
"loss": 0.1937,
"mean_token_accuracy": 0.9599918603897095,
"num_tokens": 1226305868.0,
"step": 11550
},
{
"entropy": 1.394375,
"epoch": 0.28402135057049116,
"grad_norm": 2.046875,
"learning_rate": 5.862257039555207e-06,
"loss": 0.1991,
"mean_token_accuracy": 0.9583842658996582,
"num_tokens": 1232013095.0,
"step": 11600
},
{
"entropy": 1.37578125,
"epoch": 0.2852455805298467,
"grad_norm": 2.015625,
"learning_rate": 5.860432184223731e-06,
"loss": 0.1913,
"mean_token_accuracy": 0.9596893274784088,
"num_tokens": 1237458606.0,
"step": 11650
},
{
"entropy": 1.35703125,
"epoch": 0.2864698104892023,
"grad_norm": 2.09375,
"learning_rate": 5.858595608044837e-06,
"loss": 0.1835,
"mean_token_accuracy": 0.9611952984333039,
"num_tokens": 1242972251.0,
"step": 11700
},
{
"entropy": 1.37078125,
"epoch": 0.28769404044855784,
"grad_norm": 3.1875,
"learning_rate": 5.856747318544041e-06,
"loss": 0.1865,
"mean_token_accuracy": 0.9609648621082306,
"num_tokens": 1248318638.0,
"step": 11750
},
{
"entropy": 1.365,
"epoch": 0.2889182704079134,
"grad_norm": 2.15625,
"learning_rate": 5.854887323294856e-06,
"loss": 0.183,
"mean_token_accuracy": 0.9627510058879852,
"num_tokens": 1253680002.0,
"step": 11800
},
{
"entropy": 1.37578125,
"epoch": 0.29014250036726896,
"grad_norm": 1.8828125,
"learning_rate": 5.853015629918759e-06,
"loss": 0.1862,
"mean_token_accuracy": 0.9614068794250489,
"num_tokens": 1258924764.0,
"step": 11850
},
{
"entropy": 1.37796875,
"epoch": 0.2913667303266246,
"grad_norm": 1.90625,
"learning_rate": 5.8511322460851624e-06,
"loss": 0.1832,
"mean_token_accuracy": 0.9620686209201813,
"num_tokens": 1264051390.0,
"step": 11900
},
{
"entropy": 1.37328125,
"epoch": 0.29259096028598014,
"grad_norm": 2.3125,
"learning_rate": 5.849237179511381e-06,
"loss": 0.1769,
"mean_token_accuracy": 0.9628199970722199,
"num_tokens": 1269148836.0,
"step": 11950
},
{
"entropy": 1.376875,
"epoch": 0.2938151902453357,
"grad_norm": 3.125,
"learning_rate": 5.8473304379626e-06,
"loss": 0.1871,
"mean_token_accuracy": 0.9601672506332397,
"num_tokens": 1274348582.0,
"step": 12000
},
{
"entropy": 1.35203125,
"epoch": 0.29503942020469126,
"grad_norm": 2.46875,
"learning_rate": 5.845412029251843e-06,
"loss": 0.1796,
"mean_token_accuracy": 0.9622039210796356,
"num_tokens": 1279184908.0,
"step": 12050
},
{
"entropy": 1.35859375,
"epoch": 0.2962636501640468,
"grad_norm": 2.921875,
"learning_rate": 5.843481961239942e-06,
"loss": 0.1772,
"mean_token_accuracy": 0.9627481973171235,
"num_tokens": 1284410532.0,
"step": 12100
},
{
"entropy": 1.35953125,
"epoch": 0.2974878801234024,
"grad_norm": 7.40625,
"learning_rate": 5.841540241835504e-06,
"loss": 0.1768,
"mean_token_accuracy": 0.9626896047592163,
"num_tokens": 1289768837.0,
"step": 12150
},
{
"entropy": 1.378125,
"epoch": 0.29871211008275794,
"grad_norm": 2.3125,
"learning_rate": 5.8395868789948775e-06,
"loss": 0.1848,
"mean_token_accuracy": 0.9612694227695465,
"num_tokens": 1295005247.0,
"step": 12200
},
{
"entropy": 1.37359375,
"epoch": 0.2999363400421135,
"grad_norm": 2.34375,
"learning_rate": 5.837621880722122e-06,
"loss": 0.1909,
"mean_token_accuracy": 0.9603909432888031,
"num_tokens": 1300316507.0,
"step": 12250
},
{
"entropy": 1.35953125,
"epoch": 0.30116057000146906,
"grad_norm": 2.75,
"learning_rate": 5.835645255068973e-06,
"loss": 0.1838,
"mean_token_accuracy": 0.9617878496646881,
"num_tokens": 1305931141.0,
"step": 12300
},
{
"entropy": 1.34640625,
"epoch": 0.3023847999608246,
"grad_norm": 2.375,
"learning_rate": 5.8336570101348115e-06,
"loss": 0.1651,
"mean_token_accuracy": 0.9648260760307312,
"num_tokens": 1310803906.0,
"step": 12350
},
{
"entropy": 1.358125,
"epoch": 0.3036090299201802,
"grad_norm": 2.84375,
"learning_rate": 5.831657154066629e-06,
"loss": 0.1827,
"mean_token_accuracy": 0.9618698525428772,
"num_tokens": 1315973080.0,
"step": 12400
},
{
"entropy": 1.35328125,
"epoch": 0.30483325987953575,
"grad_norm": 3.578125,
"learning_rate": 5.829645695058992e-06,
"loss": 0.1747,
"mean_token_accuracy": 0.9627145206928254,
"num_tokens": 1321381888.0,
"step": 12450
},
{
"entropy": 1.37859375,
"epoch": 0.30605748983889136,
"grad_norm": 2.609375,
"learning_rate": 5.827622641354014e-06,
"loss": 0.1787,
"mean_token_accuracy": 0.9626282620429992,
"num_tokens": 1326557068.0,
"step": 12500
},
{
"entropy": 1.3759375,
"epoch": 0.3072817197982469,
"grad_norm": 2.328125,
"learning_rate": 5.825588001241318e-06,
"loss": 0.1912,
"mean_token_accuracy": 0.9598784649372101,
"num_tokens": 1332216024.0,
"step": 12550
},
{
"entropy": 1.35890625,
"epoch": 0.3085059497576025,
"grad_norm": 1.8359375,
"learning_rate": 5.823541783058005e-06,
"loss": 0.174,
"mean_token_accuracy": 0.962734831571579,
"num_tokens": 1337390329.0,
"step": 12600
},
{
"entropy": 1.37375,
"epoch": 0.30973017971695804,
"grad_norm": 2.140625,
"learning_rate": 5.821483995188612e-06,
"loss": 0.1881,
"mean_token_accuracy": 0.9605675613880158,
"num_tokens": 1343045143.0,
"step": 12650
},
{
"entropy": 1.3415625,
"epoch": 0.3109544096763136,
"grad_norm": 2.28125,
"learning_rate": 5.81941464606509e-06,
"loss": 0.1666,
"mean_token_accuracy": 0.9643463969230652,
"num_tokens": 1348034262.0,
"step": 12700
},
{
"entropy": 1.3440625,
"epoch": 0.31217863963566916,
"grad_norm": 2.96875,
"learning_rate": 5.817333744166762e-06,
"loss": 0.1921,
"mean_token_accuracy": 0.9586631393432617,
"num_tokens": 1353723053.0,
"step": 12750
},
{
"entropy": 1.3721875,
"epoch": 0.3134028695950247,
"grad_norm": 2.203125,
"learning_rate": 5.815241298020286e-06,
"loss": 0.1846,
"mean_token_accuracy": 0.9600662136077881,
"num_tokens": 1358674728.0,
"step": 12800
},
{
"entropy": 1.365625,
"epoch": 0.3146270995543803,
"grad_norm": 2.6875,
"learning_rate": 5.813137316199628e-06,
"loss": 0.1835,
"mean_token_accuracy": 0.961768034696579,
"num_tokens": 1363933473.0,
"step": 12850
},
{
"entropy": 1.38015625,
"epoch": 0.31585132951373585,
"grad_norm": 2.5,
"learning_rate": 5.811021807326018e-06,
"loss": 0.1982,
"mean_token_accuracy": 0.9590709102153778,
"num_tokens": 1369281803.0,
"step": 12900
},
{
"entropy": 1.37,
"epoch": 0.3170755594730914,
"grad_norm": 3.03125,
"learning_rate": 5.808894780067923e-06,
"loss": 0.1949,
"mean_token_accuracy": 0.9586555528640747,
"num_tokens": 1374853145.0,
"step": 12950
},
{
"entropy": 1.36421875,
"epoch": 0.31829978943244697,
"grad_norm": 2.015625,
"learning_rate": 5.8067562431410045e-06,
"loss": 0.171,
"mean_token_accuracy": 0.9631958258152008,
"num_tokens": 1379934830.0,
"step": 13000
},
{
"entropy": 1.3609375,
"epoch": 0.3195240193918026,
"grad_norm": 2.09375,
"learning_rate": 5.804606205308088e-06,
"loss": 0.1841,
"mean_token_accuracy": 0.9605684506893158,
"num_tokens": 1385105704.0,
"step": 13050
},
{
"entropy": 1.37671875,
"epoch": 0.32074824935115814,
"grad_norm": 2.875,
"learning_rate": 5.802444675379122e-06,
"loss": 0.1947,
"mean_token_accuracy": 0.9595759809017181,
"num_tokens": 1390581041.0,
"step": 13100
},
{
"entropy": 1.37828125,
"epoch": 0.3219724793105137,
"grad_norm": 1.9453125,
"learning_rate": 5.8002716622111485e-06,
"loss": 0.1858,
"mean_token_accuracy": 0.9617175209522247,
"num_tokens": 1395850769.0,
"step": 13150
},
{
"entropy": 1.365,
"epoch": 0.32319670926986926,
"grad_norm": 2.515625,
"learning_rate": 5.79808717470826e-06,
"loss": 0.1676,
"mean_token_accuracy": 0.9655633735656738,
"num_tokens": 1400935540.0,
"step": 13200
},
{
"entropy": 1.3709375,
"epoch": 0.3244209392292248,
"grad_norm": 2.421875,
"learning_rate": 5.795891221821569e-06,
"loss": 0.1807,
"mean_token_accuracy": 0.9624592447280884,
"num_tokens": 1406376315.0,
"step": 13250
},
{
"entropy": 1.34875,
"epoch": 0.3256451691885804,
"grad_norm": 3.09375,
"learning_rate": 5.793683812549162e-06,
"loss": 0.1727,
"mean_token_accuracy": 0.9637568819522858,
"num_tokens": 1411533562.0,
"step": 13300
},
{
"entropy": 1.36421875,
"epoch": 0.32686939914793595,
"grad_norm": 2.703125,
"learning_rate": 5.791464955936077e-06,
"loss": 0.1938,
"mean_token_accuracy": 0.9592576730251312,
"num_tokens": 1417402528.0,
"step": 13350
},
{
"entropy": 1.36109375,
"epoch": 0.3280936291072915,
"grad_norm": 1.7109375,
"learning_rate": 5.789234661074254e-06,
"loss": 0.1744,
"mean_token_accuracy": 0.9627709448337555,
"num_tokens": 1422622878.0,
"step": 13400
},
{
"entropy": 1.3790625,
"epoch": 0.32931785906664707,
"grad_norm": 2.421875,
"learning_rate": 5.786992937102503e-06,
"loss": 0.1959,
"mean_token_accuracy": 0.9586515820026398,
"num_tokens": 1427838914.0,
"step": 13450
},
{
"entropy": 1.36,
"epoch": 0.33054208902600263,
"grad_norm": 3.140625,
"learning_rate": 5.784739793206464e-06,
"loss": 0.1794,
"mean_token_accuracy": 0.9625478911399842,
"num_tokens": 1432973891.0,
"step": 13500
},
{
"entropy": 1.37546875,
"epoch": 0.3317663189853582,
"grad_norm": 2.875,
"learning_rate": 5.782475238618574e-06,
"loss": 0.1952,
"mean_token_accuracy": 0.958906524181366,
"num_tokens": 1438425313.0,
"step": 13550
},
{
"entropy": 1.39109375,
"epoch": 0.33299054894471375,
"grad_norm": 3.15625,
"learning_rate": 5.780199282618026e-06,
"loss": 0.1937,
"mean_token_accuracy": 0.9599265992641449,
"num_tokens": 1443930223.0,
"step": 13600
},
{
"entropy": 1.3784375,
"epoch": 0.33421477890406937,
"grad_norm": 2.359375,
"learning_rate": 5.777911934530726e-06,
"loss": 0.1896,
"mean_token_accuracy": 0.9606879663467407,
"num_tokens": 1449235492.0,
"step": 13650
},
{
"entropy": 1.3740625,
"epoch": 0.3354390088634249,
"grad_norm": 2.25,
"learning_rate": 5.7756132037292665e-06,
"loss": 0.1845,
"mean_token_accuracy": 0.9607800352573395,
"num_tokens": 1454874971.0,
"step": 13700
},
{
"entropy": 1.3565625,
"epoch": 0.3366632388227805,
"grad_norm": 3.0,
"learning_rate": 5.77330309963288e-06,
"loss": 0.1664,
"mean_token_accuracy": 0.9650224351882934,
"num_tokens": 1459910564.0,
"step": 13750
},
{
"entropy": 1.3896875,
"epoch": 0.33788746878213605,
"grad_norm": 2.6875,
"learning_rate": 5.7709816317074e-06,
"loss": 0.1852,
"mean_token_accuracy": 0.9610321772098541,
"num_tokens": 1465214852.0,
"step": 13800
},
{
"entropy": 1.3609375,
"epoch": 0.3391116987414916,
"grad_norm": 2.421875,
"learning_rate": 5.768648809465223e-06,
"loss": 0.173,
"mean_token_accuracy": 0.9646092760562897,
"num_tokens": 1470405224.0,
"step": 13850
},
{
"entropy": 1.3671875,
"epoch": 0.34033592870084717,
"grad_norm": 2.421875,
"learning_rate": 5.766304642465277e-06,
"loss": 0.1684,
"mean_token_accuracy": 0.964150664806366,
"num_tokens": 1475222511.0,
"step": 13900
},
{
"entropy": 1.3615625,
"epoch": 0.34156015866020273,
"grad_norm": 2.015625,
"learning_rate": 5.763949140312969e-06,
"loss": 0.1903,
"mean_token_accuracy": 0.9601925635337829,
"num_tokens": 1480884593.0,
"step": 13950
},
{
"entropy": 1.35734375,
"epoch": 0.3427843886195583,
"grad_norm": 2.859375,
"learning_rate": 5.7615823126601565e-06,
"loss": 0.1853,
"mean_token_accuracy": 0.9617584705352783,
"num_tokens": 1485873672.0,
"step": 14000
},
{
"entropy": 1.37375,
"epoch": 0.34400861857891385,
"grad_norm": 2.3125,
"learning_rate": 5.759204169205102e-06,
"loss": 0.1862,
"mean_token_accuracy": 0.9605587136745453,
"num_tokens": 1490904541.0,
"step": 14050
},
{
"entropy": 1.36359375,
"epoch": 0.3452328485382694,
"grad_norm": 2.140625,
"learning_rate": 5.7568147196924395e-06,
"loss": 0.1891,
"mean_token_accuracy": 0.9609455835819244,
"num_tokens": 1496373059.0,
"step": 14100
},
{
"entropy": 1.35421875,
"epoch": 0.34645707849762497,
"grad_norm": 0.0322265625,
"learning_rate": 5.754413973913126e-06,
"loss": 0.1673,
"mean_token_accuracy": 0.9642012619972229,
"num_tokens": 1500901681.0,
"step": 14150
},
{
"entropy": 1.343125,
"epoch": 0.3476813084569806,
"grad_norm": 1.859375,
"learning_rate": 5.752001941704407e-06,
"loss": 0.1759,
"mean_token_accuracy": 0.9625442051887512,
"num_tokens": 1506040261.0,
"step": 14200
},
{
"entropy": 1.36625,
"epoch": 0.34890553841633615,
"grad_norm": 3.0,
"learning_rate": 5.749578632949776e-06,
"loss": 0.1802,
"mean_token_accuracy": 0.9619328999519348,
"num_tokens": 1511536121.0,
"step": 14250
},
{
"entropy": 1.356875,
"epoch": 0.3501297683756917,
"grad_norm": 3.703125,
"learning_rate": 5.747144057578932e-06,
"loss": 0.1843,
"mean_token_accuracy": 0.9613735234737396,
"num_tokens": 1516899260.0,
"step": 14300
},
{
"entropy": 1.36203125,
"epoch": 0.35135399833504727,
"grad_norm": 3.671875,
"learning_rate": 5.744698225567742e-06,
"loss": 0.1929,
"mean_token_accuracy": 0.9596503937244415,
"num_tokens": 1522277914.0,
"step": 14350
},
{
"entropy": 1.35921875,
"epoch": 0.35257822829440283,
"grad_norm": 2.15625,
"learning_rate": 5.742241146938195e-06,
"loss": 0.18,
"mean_token_accuracy": 0.9617201662063599,
"num_tokens": 1527559983.0,
"step": 14400
},
{
"entropy": 1.3353125,
"epoch": 0.3538024582537584,
"grad_norm": 3.03125,
"learning_rate": 5.739772831758365e-06,
"loss": 0.171,
"mean_token_accuracy": 0.9635174345970153,
"num_tokens": 1532501983.0,
"step": 14450
},
{
"entropy": 1.37234375,
"epoch": 0.35502668821311395,
"grad_norm": 1.84375,
"learning_rate": 5.737293290142369e-06,
"loss": 0.1957,
"mean_token_accuracy": 0.9595348858833312,
"num_tokens": 1538384868.0,
"step": 14500
},
{
"entropy": 1.36453125,
"epoch": 0.3562509181724695,
"grad_norm": 2.734375,
"learning_rate": 5.734802532250327e-06,
"loss": 0.1721,
"mean_token_accuracy": 0.9636399447917938,
"num_tokens": 1543550967.0,
"step": 14550
},
{
"entropy": 1.36703125,
"epoch": 0.35747514813182507,
"grad_norm": 2.390625,
"learning_rate": 5.7323005682883144e-06,
"loss": 0.1817,
"mean_token_accuracy": 0.9614765977859497,
"num_tokens": 1548814643.0,
"step": 14600
},
{
"entropy": 1.37171875,
"epoch": 0.35869937809118063,
"grad_norm": 2.140625,
"learning_rate": 5.729787408508328e-06,
"loss": 0.1854,
"mean_token_accuracy": 0.9606961834430695,
"num_tokens": 1554002337.0,
"step": 14650
},
{
"entropy": 1.363125,
"epoch": 0.3599236080505362,
"grad_norm": 2.359375,
"learning_rate": 5.7272630632082385e-06,
"loss": 0.1788,
"mean_token_accuracy": 0.9617051208019256,
"num_tokens": 1558888261.0,
"step": 14700
},
{
"entropy": 1.3603125,
"epoch": 0.36114783800989175,
"grad_norm": 1.9609375,
"learning_rate": 5.7247275427317515e-06,
"loss": 0.1882,
"mean_token_accuracy": 0.9613351905345917,
"num_tokens": 1564034699.0,
"step": 14750
},
{
"entropy": 1.38765625,
"epoch": 0.36237206796924737,
"grad_norm": 3.90625,
"learning_rate": 5.722180857468361e-06,
"loss": 0.2015,
"mean_token_accuracy": 0.9581510519981384,
"num_tokens": 1569662314.0,
"step": 14800
},
{
"entropy": 1.35671875,
"epoch": 0.36359629792860293,
"grad_norm": 1.875,
"learning_rate": 5.719623017853315e-06,
"loss": 0.1858,
"mean_token_accuracy": 0.9616824269294739,
"num_tokens": 1575167487.0,
"step": 14850
},
{
"entropy": 1.36796875,
"epoch": 0.3648205278879585,
"grad_norm": 2.921875,
"learning_rate": 5.7170540343675596e-06,
"loss": 0.1858,
"mean_token_accuracy": 0.9607573926448822,
"num_tokens": 1580657915.0,
"step": 14900
},
{
"entropy": 1.3684375,
"epoch": 0.36604475784731405,
"grad_norm": 2.578125,
"learning_rate": 5.714473917537712e-06,
"loss": 0.1771,
"mean_token_accuracy": 0.9625304937362671,
"num_tokens": 1585664001.0,
"step": 14950
},
{
"entropy": 1.36109375,
"epoch": 0.3672689878066696,
"grad_norm": 2.546875,
"learning_rate": 5.711882677936003e-06,
"loss": 0.1781,
"mean_token_accuracy": 0.961945322751999,
"num_tokens": 1590920113.0,
"step": 15000
},
{
"entropy": 1.3575,
"epoch": 0.36849321776602517,
"grad_norm": 2.3125,
"learning_rate": 5.709280326180242e-06,
"loss": 0.1737,
"mean_token_accuracy": 0.9629940688610077,
"num_tokens": 1596062396.0,
"step": 15050
},
{
"entropy": 1.37359375,
"epoch": 0.36971744772538073,
"grad_norm": 2.140625,
"learning_rate": 5.7066668729337725e-06,
"loss": 0.1782,
"mean_token_accuracy": 0.9626081240177154,
"num_tokens": 1601254217.0,
"step": 15100
},
{
"entropy": 1.36609375,
"epoch": 0.3709416776847363,
"grad_norm": 2.109375,
"learning_rate": 5.704042328905426e-06,
"loss": 0.1851,
"mean_token_accuracy": 0.9608933937549591,
"num_tokens": 1606561855.0,
"step": 15150
},
{
"entropy": 1.34859375,
"epoch": 0.37216590764409185,
"grad_norm": 1.8515625,
"learning_rate": 5.701406704849479e-06,
"loss": 0.1893,
"mean_token_accuracy": 0.9602335524559021,
"num_tokens": 1612223884.0,
"step": 15200
},
{
"entropy": 1.36765625,
"epoch": 0.3733901376034474,
"grad_norm": 2.703125,
"learning_rate": 5.69876001156561e-06,
"loss": 0.1837,
"mean_token_accuracy": 0.9612676846981049,
"num_tokens": 1617459423.0,
"step": 15250
},
{
"entropy": 1.366875,
"epoch": 0.374614367562803,
"grad_norm": 2.0625,
"learning_rate": 5.696102259898855e-06,
"loss": 0.1895,
"mean_token_accuracy": 0.9605361771583557,
"num_tokens": 1622772691.0,
"step": 15300
},
{
"entropy": 1.3678125,
"epoch": 0.37583859752215854,
"grad_norm": 2.21875,
"learning_rate": 5.693433460739561e-06,
"loss": 0.1794,
"mean_token_accuracy": 0.9623438572883606,
"num_tokens": 1627992421.0,
"step": 15350
},
{
"entropy": 1.385,
"epoch": 0.37706282748151415,
"grad_norm": 2.15625,
"learning_rate": 5.690753625023344e-06,
"loss": 0.1903,
"mean_token_accuracy": 0.9602718544006348,
"num_tokens": 1633295976.0,
"step": 15400
},
{
"entropy": 1.36546875,
"epoch": 0.3782870574408697,
"grad_norm": 2.078125,
"learning_rate": 5.688062763731044e-06,
"loss": 0.2002,
"mean_token_accuracy": 0.9582274675369262,
"num_tokens": 1638988248.0,
"step": 15450
},
{
"entropy": 1.35359375,
"epoch": 0.3795112874002253,
"grad_norm": 1.9921875,
"learning_rate": 5.685360887888677e-06,
"loss": 0.1789,
"mean_token_accuracy": 0.9629680168628693,
"num_tokens": 1644498341.0,
"step": 15500
},
{
"entropy": 1.369375,
"epoch": 0.38073551735958083,
"grad_norm": 2.65625,
"learning_rate": 5.682648008567394e-06,
"loss": 0.1758,
"mean_token_accuracy": 0.9636906123161316,
"num_tokens": 1649900901.0,
"step": 15550
},
{
"entropy": 1.36546875,
"epoch": 0.3819597473189364,
"grad_norm": 2.40625,
"learning_rate": 5.679924136883432e-06,
"loss": 0.1916,
"mean_token_accuracy": 0.9601245021820068,
"num_tokens": 1655743468.0,
"step": 15600
},
{
"entropy": 1.37828125,
"epoch": 0.38318397727829195,
"grad_norm": 2.578125,
"learning_rate": 5.677189283998073e-06,
"loss": 0.1755,
"mean_token_accuracy": 0.963598461151123,
"num_tokens": 1660916320.0,
"step": 15650
},
{
"entropy": 1.35796875,
"epoch": 0.3844082072376475,
"grad_norm": 2.265625,
"learning_rate": 5.674443461117591e-06,
"loss": 0.1778,
"mean_token_accuracy": 0.9613646280765533,
"num_tokens": 1666271922.0,
"step": 15700
},
{
"entropy": 1.3571875,
"epoch": 0.3856324371970031,
"grad_norm": 2.328125,
"learning_rate": 5.671686679493215e-06,
"loss": 0.187,
"mean_token_accuracy": 0.9609103786945343,
"num_tokens": 1671766527.0,
"step": 15750
},
{
"entropy": 1.36625,
"epoch": 0.38685666715635864,
"grad_norm": 1.6328125,
"learning_rate": 5.668918950421074e-06,
"loss": 0.1886,
"mean_token_accuracy": 0.9606494891643524,
"num_tokens": 1677165332.0,
"step": 15800
},
{
"entropy": 1.3475,
"epoch": 0.3880808971157142,
"grad_norm": 3.046875,
"learning_rate": 5.666140285242158e-06,
"loss": 0.1801,
"mean_token_accuracy": 0.9625120401382447,
"num_tokens": 1682494165.0,
"step": 15850
},
{
"entropy": 1.36125,
"epoch": 0.38930512707506976,
"grad_norm": 2.0625,
"learning_rate": 5.663350695342268e-06,
"loss": 0.1892,
"mean_token_accuracy": 0.9604367816448212,
"num_tokens": 1688253134.0,
"step": 15900
},
{
"entropy": 1.35328125,
"epoch": 0.3905293570344254,
"grad_norm": 1.6640625,
"learning_rate": 5.660550192151967e-06,
"loss": 0.1845,
"mean_token_accuracy": 0.9621007204055786,
"num_tokens": 1693632232.0,
"step": 15950
},
{
"entropy": 1.3690625,
"epoch": 0.39175358699378093,
"grad_norm": 1.8359375,
"learning_rate": 5.657738787146543e-06,
"loss": 0.1885,
"mean_token_accuracy": 0.9610405099391938,
"num_tokens": 1698678337.0,
"step": 16000
},
{
"entropy": 1.346875,
"epoch": 0.3929778169531365,
"grad_norm": 2.765625,
"learning_rate": 5.654916491845947e-06,
"loss": 0.1733,
"mean_token_accuracy": 0.9640054357051849,
"num_tokens": 1704187251.0,
"step": 16050
},
{
"entropy": 1.35375,
"epoch": 0.39420204691249205,
"grad_norm": 2.46875,
"learning_rate": 5.652083317814759e-06,
"loss": 0.1745,
"mean_token_accuracy": 0.9634167146682739,
"num_tokens": 1709408694.0,
"step": 16100
},
{
"entropy": 1.34265625,
"epoch": 0.3954262768718476,
"grad_norm": 2.8125,
"learning_rate": 5.649239276662133e-06,
"loss": 0.1724,
"mean_token_accuracy": 0.963241057395935,
"num_tokens": 1714585157.0,
"step": 16150
},
{
"entropy": 1.3303125,
"epoch": 0.3966505068312032,
"grad_norm": 2.578125,
"learning_rate": 5.646384380041755e-06,
"loss": 0.1759,
"mean_token_accuracy": 0.9634040462970733,
"num_tokens": 1719749974.0,
"step": 16200
},
{
"entropy": 1.33890625,
"epoch": 0.39787473679055874,
"grad_norm": 2.296875,
"learning_rate": 5.643518639651789e-06,
"loss": 0.1754,
"mean_token_accuracy": 0.963290364742279,
"num_tokens": 1724935979.0,
"step": 16250
},
{
"entropy": 1.341875,
"epoch": 0.3990989667499143,
"grad_norm": 3.828125,
"learning_rate": 5.640642067234832e-06,
"loss": 0.1869,
"mean_token_accuracy": 0.9608835780620575,
"num_tokens": 1729904911.0,
"step": 16300
},
{
"entropy": 1.3525,
"epoch": 0.40032319670926986,
"grad_norm": 3.015625,
"learning_rate": 5.637754674577869e-06,
"loss": 0.193,
"mean_token_accuracy": 0.9592759358882904,
"num_tokens": 1735603402.0,
"step": 16350
},
{
"entropy": 1.33984375,
"epoch": 0.4015474266686254,
"grad_norm": 2.671875,
"learning_rate": 5.634856473512218e-06,
"loss": 0.1787,
"mean_token_accuracy": 0.9626182532310485,
"num_tokens": 1740876722.0,
"step": 16400
},
{
"entropy": 1.3328125,
"epoch": 0.402771656627981,
"grad_norm": 2.421875,
"learning_rate": 5.631947475913489e-06,
"loss": 0.1951,
"mean_token_accuracy": 0.9596171510219574,
"num_tokens": 1746470991.0,
"step": 16450
},
{
"entropy": 1.31375,
"epoch": 0.40399588658733654,
"grad_norm": 2.734375,
"learning_rate": 5.629027693701531e-06,
"loss": 0.1646,
"mean_token_accuracy": 0.9641488230228424,
"num_tokens": 1751600795.0,
"step": 16500
},
{
"entropy": 1.3459375,
"epoch": 0.40522011654669216,
"grad_norm": 0.01904296875,
"learning_rate": 5.626097138840379e-06,
"loss": 0.1931,
"mean_token_accuracy": 0.9586203134059906,
"num_tokens": 1757280148.0,
"step": 16550
},
{
"entropy": 1.32203125,
"epoch": 0.4064443465060477,
"grad_norm": 3.125,
"learning_rate": 5.623155823338219e-06,
"loss": 0.1845,
"mean_token_accuracy": 0.961804312467575,
"num_tokens": 1762386072.0,
"step": 16600
},
{
"entropy": 1.309375,
"epoch": 0.4076685764654033,
"grad_norm": 1.9609375,
"learning_rate": 5.62020375924732e-06,
"loss": 0.1679,
"mean_token_accuracy": 0.9640087175369263,
"num_tokens": 1767593608.0,
"step": 16650
},
{
"entropy": 1.33890625,
"epoch": 0.40889280642475884,
"grad_norm": 2.296875,
"learning_rate": 5.617240958664e-06,
"loss": 0.1778,
"mean_token_accuracy": 0.9619925379753113,
"num_tokens": 1772859293.0,
"step": 16700
},
{
"entropy": 1.3303125,
"epoch": 0.4101170363841144,
"grad_norm": 2.453125,
"learning_rate": 5.614267433728569e-06,
"loss": 0.1784,
"mean_token_accuracy": 0.9621168851852417,
"num_tokens": 1778176957.0,
"step": 16750
},
{
"entropy": 1.33359375,
"epoch": 0.41134126634346996,
"grad_norm": 2.28125,
"learning_rate": 5.611283196625281e-06,
"loss": 0.1876,
"mean_token_accuracy": 0.9608843457698822,
"num_tokens": 1783513531.0,
"step": 16800
},
{
"entropy": 1.31875,
"epoch": 0.4125654963028255,
"grad_norm": 2.375,
"learning_rate": 5.6082882595822835e-06,
"loss": 0.1743,
"mean_token_accuracy": 0.9634191727638245,
"num_tokens": 1788649179.0,
"step": 16850
},
{
"entropy": 1.34703125,
"epoch": 0.4137897262621811,
"grad_norm": 3.0,
"learning_rate": 5.605282634871569e-06,
"loss": 0.1846,
"mean_token_accuracy": 0.9604820072650909,
"num_tokens": 1794020681.0,
"step": 16900
},
{
"entropy": 1.341875,
"epoch": 0.41501395622153664,
"grad_norm": 2.265625,
"learning_rate": 5.602266334808922e-06,
"loss": 0.1917,
"mean_token_accuracy": 0.9598517632484436,
"num_tokens": 1799786050.0,
"step": 16950
},
{
"entropy": 1.32484375,
"epoch": 0.4162381861808922,
"grad_norm": 2.421875,
"learning_rate": 5.599239371753871e-06,
"loss": 0.1843,
"mean_token_accuracy": 0.9613809895515442,
"num_tokens": 1805308121.0,
"step": 17000
},
{
"entropy": 1.3296875,
"epoch": 0.41746241614024776,
"grad_norm": 2.265625,
"learning_rate": 5.596201758109636e-06,
"loss": 0.1971,
"mean_token_accuracy": 0.9585018038749695,
"num_tokens": 1811016191.0,
"step": 17050
},
{
"entropy": 1.34390625,
"epoch": 0.4186866460996034,
"grad_norm": 2.65625,
"learning_rate": 5.593153506323082e-06,
"loss": 0.1912,
"mean_token_accuracy": 0.9609514188766479,
"num_tokens": 1816538866.0,
"step": 17100
},
{
"entropy": 1.319375,
"epoch": 0.41991087605895894,
"grad_norm": 2.578125,
"learning_rate": 5.59009462888466e-06,
"loss": 0.1692,
"mean_token_accuracy": 0.9638219344615936,
"num_tokens": 1821484676.0,
"step": 17150
},
{
"entropy": 1.3296875,
"epoch": 0.4211351060183145,
"grad_norm": 3.078125,
"learning_rate": 5.587025138328363e-06,
"loss": 0.1855,
"mean_token_accuracy": 0.9604250502586364,
"num_tokens": 1826760752.0,
"step": 17200
},
{
"entropy": 1.32703125,
"epoch": 0.42235933597767006,
"grad_norm": 4.375,
"learning_rate": 5.583945047231672e-06,
"loss": 0.1756,
"mean_token_accuracy": 0.9626831936836243,
"num_tokens": 1831709955.0,
"step": 17250
},
{
"entropy": 1.3278125,
"epoch": 0.4235835659370256,
"grad_norm": 3.578125,
"learning_rate": 5.580854368215504e-06,
"loss": 0.1688,
"mean_token_accuracy": 0.9641677963733674,
"num_tokens": 1836539757.0,
"step": 17300
},
{
"entropy": 1.35453125,
"epoch": 0.4248077958963812,
"grad_norm": 3.203125,
"learning_rate": 5.577753113944161e-06,
"loss": 0.1795,
"mean_token_accuracy": 0.9620350849628448,
"num_tokens": 1841748836.0,
"step": 17350
},
{
"entropy": 1.35484375,
"epoch": 0.42603202585573674,
"grad_norm": 3.046875,
"learning_rate": 5.574641297125277e-06,
"loss": 0.1903,
"mean_token_accuracy": 0.9602237248420715,
"num_tokens": 1846964872.0,
"step": 17400
},
{
"entropy": 1.3465625,
"epoch": 0.4272562558150923,
"grad_norm": 2.375,
"learning_rate": 5.5715189305097705e-06,
"loss": 0.18,
"mean_token_accuracy": 0.9612255036830902,
"num_tokens": 1852195890.0,
"step": 17450
},
{
"entropy": 1.34734375,
"epoch": 0.42848048577444786,
"grad_norm": 1.921875,
"learning_rate": 5.568386026891784e-06,
"loss": 0.1852,
"mean_token_accuracy": 0.9614002680778504,
"num_tokens": 1857781986.0,
"step": 17500
},
{
"entropy": 1.383125,
"epoch": 0.4297047157338034,
"grad_norm": 3.59375,
"learning_rate": 5.565242599108638e-06,
"loss": 0.1733,
"mean_token_accuracy": 0.9632753264904023,
"num_tokens": 1862697378.0,
"step": 17550
},
{
"entropy": 1.37734375,
"epoch": 0.430928945693159,
"grad_norm": 2.578125,
"learning_rate": 5.5620886600407775e-06,
"loss": 0.1793,
"mean_token_accuracy": 0.9618914890289306,
"num_tokens": 1867900164.0,
"step": 17600
},
{
"entropy": 1.37453125,
"epoch": 0.43215317565251454,
"grad_norm": 3.359375,
"learning_rate": 5.558924222611718e-06,
"loss": 0.189,
"mean_token_accuracy": 0.9601231980323791,
"num_tokens": 1873349723.0,
"step": 17650
},
{
"entropy": 1.3796875,
"epoch": 0.43337740561187016,
"grad_norm": 2.125,
"learning_rate": 5.555749299787992e-06,
"loss": 0.183,
"mean_token_accuracy": 0.9612041318416595,
"num_tokens": 1878516011.0,
"step": 17700
},
{
"entropy": 1.36796875,
"epoch": 0.4346016355712257,
"grad_norm": 1.703125,
"learning_rate": 5.552563904579097e-06,
"loss": 0.1666,
"mean_token_accuracy": 0.965571962594986,
"num_tokens": 1883672436.0,
"step": 17750
},
{
"entropy": 1.37421875,
"epoch": 0.4358258655305813,
"grad_norm": 2.140625,
"learning_rate": 5.549368050037442e-06,
"loss": 0.1822,
"mean_token_accuracy": 0.9618594205379486,
"num_tokens": 1889075709.0,
"step": 17800
},
{
"entropy": 1.3753125,
"epoch": 0.43705009548993684,
"grad_norm": 1.703125,
"learning_rate": 5.5461617492582955e-06,
"loss": 0.1847,
"mean_token_accuracy": 0.9609970545768738,
"num_tokens": 1894320611.0,
"step": 17850
},
{
"entropy": 1.35203125,
"epoch": 0.4382743254492924,
"grad_norm": 3.265625,
"learning_rate": 5.542945015379727e-06,
"loss": 0.1819,
"mean_token_accuracy": 0.9610999655723572,
"num_tokens": 1899502888.0,
"step": 17900
},
{
"entropy": 1.3653125,
"epoch": 0.43949855540864796,
"grad_norm": 3.125,
"learning_rate": 5.53971786158256e-06,
"loss": 0.1783,
"mean_token_accuracy": 0.9628078281879425,
"num_tokens": 1904727333.0,
"step": 17950
},
{
"entropy": 1.37265625,
"epoch": 0.4407227853680035,
"grad_norm": 2.15625,
"learning_rate": 5.536480301090311e-06,
"loss": 0.1825,
"mean_token_accuracy": 0.9612684857845306,
"num_tokens": 1910269964.0,
"step": 18000
},
{
"entropy": 1.36875,
"epoch": 0.4419470153273591,
"grad_norm": 2.421875,
"learning_rate": 5.533232347169142e-06,
"loss": 0.1769,
"mean_token_accuracy": 0.9630991363525391,
"num_tokens": 1915481678.0,
"step": 18050
},
{
"entropy": 1.37703125,
"epoch": 0.44317124528671464,
"grad_norm": 0.007720947265625,
"learning_rate": 5.5299740131278e-06,
"loss": 0.1776,
"mean_token_accuracy": 0.9631426560878754,
"num_tokens": 1920892313.0,
"step": 18100
},
{
"entropy": 1.3784375,
"epoch": 0.4443954752460702,
"grad_norm": 2.25,
"learning_rate": 5.5267053123175685e-06,
"loss": 0.1793,
"mean_token_accuracy": 0.9618562459945679,
"num_tokens": 1925855441.0,
"step": 18150
},
{
"entropy": 1.40484375,
"epoch": 0.44561970520542576,
"grad_norm": 2.390625,
"learning_rate": 5.523426258132208e-06,
"loss": 0.1895,
"mean_token_accuracy": 0.9602830135822296,
"num_tokens": 1931433927.0,
"step": 18200
},
{
"entropy": 1.381875,
"epoch": 0.4468439351647813,
"grad_norm": 2.140625,
"learning_rate": 5.520136864007901e-06,
"loss": 0.179,
"mean_token_accuracy": 0.9617183935642243,
"num_tokens": 1937093589.0,
"step": 18250
},
{
"entropy": 1.3784375,
"epoch": 0.44806816512413694,
"grad_norm": 2.890625,
"learning_rate": 5.516837143423201e-06,
"loss": 0.1807,
"mean_token_accuracy": 0.9620720791816711,
"num_tokens": 1942266157.0,
"step": 18300
},
{
"entropy": 1.3815625,
"epoch": 0.4492923950834925,
"grad_norm": 2.734375,
"learning_rate": 5.5135271098989745e-06,
"loss": 0.1739,
"mean_token_accuracy": 0.9636857545375824,
"num_tokens": 1947254229.0,
"step": 18350
},
{
"entropy": 1.39609375,
"epoch": 0.45051662504284806,
"grad_norm": 2.28125,
"learning_rate": 5.510206776998347e-06,
"loss": 0.2004,
"mean_token_accuracy": 0.9576922535896302,
"num_tokens": 1953541405.0,
"step": 18400
},
{
"entropy": 1.38515625,
"epoch": 0.4517408550022036,
"grad_norm": 1.8671875,
"learning_rate": 5.5068761583266446e-06,
"loss": 0.1815,
"mean_token_accuracy": 0.9612382733821869,
"num_tokens": 1958947967.0,
"step": 18450
},
{
"entropy": 1.38546875,
"epoch": 0.4529650849615592,
"grad_norm": 2.609375,
"learning_rate": 5.503535267531341e-06,
"loss": 0.1756,
"mean_token_accuracy": 0.9630067098140717,
"num_tokens": 1964172588.0,
"step": 18500
},
{
"entropy": 1.37171875,
"epoch": 0.45418931492091474,
"grad_norm": 2.453125,
"learning_rate": 5.500184118302001e-06,
"loss": 0.1737,
"mean_token_accuracy": 0.9629046404361725,
"num_tokens": 1969146021.0,
"step": 18550
},
{
"entropy": 1.35796875,
"epoch": 0.4554135448802703,
"grad_norm": 2.390625,
"learning_rate": 5.496822724370225e-06,
"loss": 0.1726,
"mean_token_accuracy": 0.9641622114181518,
"num_tokens": 1974171622.0,
"step": 18600
},
{
"entropy": 1.35109375,
"epoch": 0.45663777483962587,
"grad_norm": 1.9375,
"learning_rate": 5.493451099509589e-06,
"loss": 0.1797,
"mean_token_accuracy": 0.9615970349311829,
"num_tokens": 1979453512.0,
"step": 18650
},
{
"entropy": 1.3515625,
"epoch": 0.4578620047989814,
"grad_norm": 2.421875,
"learning_rate": 5.490069257535595e-06,
"loss": 0.1786,
"mean_token_accuracy": 0.9625794899463653,
"num_tokens": 1984570640.0,
"step": 18700
},
{
"entropy": 1.37140625,
"epoch": 0.459086234758337,
"grad_norm": 2.296875,
"learning_rate": 5.4866772123056055e-06,
"loss": 0.1928,
"mean_token_accuracy": 0.9605653440952301,
"num_tokens": 1990199710.0,
"step": 18750
},
{
"entropy": 1.375625,
"epoch": 0.46031046471769255,
"grad_norm": 2.09375,
"learning_rate": 5.483274977718797e-06,
"loss": 0.1885,
"mean_token_accuracy": 0.9597025084495544,
"num_tokens": 1995518980.0,
"step": 18800
},
{
"entropy": 1.37984375,
"epoch": 0.46153469467704816,
"grad_norm": 2.6875,
"learning_rate": 5.479862567716095e-06,
"loss": 0.1703,
"mean_token_accuracy": 0.9633987152576446,
"num_tokens": 2000479352.0,
"step": 18850
},
{
"entropy": 1.38640625,
"epoch": 0.4627589246364037,
"grad_norm": 3.671875,
"learning_rate": 5.476439996280118e-06,
"loss": 0.1941,
"mean_token_accuracy": 0.959332902431488,
"num_tokens": 2005933401.0,
"step": 18900
},
{
"entropy": 1.3975,
"epoch": 0.4639831545957593,
"grad_norm": 2.5625,
"learning_rate": 5.473007277435125e-06,
"loss": 0.1731,
"mean_token_accuracy": 0.9638979506492614,
"num_tokens": 2010666027.0,
"step": 18950
},
{
"entropy": 1.38140625,
"epoch": 0.46520738455511484,
"grad_norm": 3.640625,
"learning_rate": 5.469564425246953e-06,
"loss": 0.1852,
"mean_token_accuracy": 0.9617711079120635,
"num_tokens": 2016049085.0,
"step": 19000
},
{
"entropy": 1.37015625,
"epoch": 0.4664316145144704,
"grad_norm": 1.71875,
"learning_rate": 5.46611145382296e-06,
"loss": 0.1678,
"mean_token_accuracy": 0.9642109513282776,
"num_tokens": 2021148599.0,
"step": 19050
},
{
"entropy": 1.35875,
"epoch": 0.46765584447382597,
"grad_norm": 1.6875,
"learning_rate": 5.462648377311973e-06,
"loss": 0.1785,
"mean_token_accuracy": 0.9610287690162659,
"num_tokens": 2026306056.0,
"step": 19100
},
{
"entropy": 1.34953125,
"epoch": 0.4688800744331815,
"grad_norm": 2.78125,
"learning_rate": 5.459175209904221e-06,
"loss": 0.1769,
"mean_token_accuracy": 0.9627043080329895,
"num_tokens": 2031493225.0,
"step": 19150
},
{
"entropy": 1.34484375,
"epoch": 0.4701043043925371,
"grad_norm": 1.8671875,
"learning_rate": 5.455691965831281e-06,
"loss": 0.1758,
"mean_token_accuracy": 0.9625547790527343,
"num_tokens": 2036730518.0,
"step": 19200
},
{
"entropy": 1.3490625,
"epoch": 0.47132853435189265,
"grad_norm": 2.546875,
"learning_rate": 5.452198659366023e-06,
"loss": 0.167,
"mean_token_accuracy": 0.9653509867191314,
"num_tokens": 2041648821.0,
"step": 19250
},
{
"entropy": 1.33796875,
"epoch": 0.4725527643112482,
"grad_norm": 1.921875,
"learning_rate": 5.448695304822545e-06,
"loss": 0.1733,
"mean_token_accuracy": 0.9637433886528015,
"num_tokens": 2046695948.0,
"step": 19300
},
{
"entropy": 1.35109375,
"epoch": 0.47377699427060377,
"grad_norm": 3.15625,
"learning_rate": 5.445181916556123e-06,
"loss": 0.1712,
"mean_token_accuracy": 0.96383709192276,
"num_tokens": 2051915262.0,
"step": 19350
},
{
"entropy": 1.3453125,
"epoch": 0.47500122422995933,
"grad_norm": 2.578125,
"learning_rate": 5.4416585089631414e-06,
"loss": 0.163,
"mean_token_accuracy": 0.9646891450881958,
"num_tokens": 2056999566.0,
"step": 19400
},
{
"entropy": 1.36125,
"epoch": 0.47622545418931495,
"grad_norm": 2.875,
"learning_rate": 5.438125096481043e-06,
"loss": 0.1833,
"mean_token_accuracy": 0.96080885887146,
"num_tokens": 2062335975.0,
"step": 19450
},
{
"entropy": 1.368125,
"epoch": 0.4774496841486705,
"grad_norm": 3.140625,
"learning_rate": 5.434581693588263e-06,
"loss": 0.175,
"mean_token_accuracy": 0.9632956290245056,
"num_tokens": 2067247038.0,
"step": 19500
},
{
"entropy": 1.36484375,
"epoch": 0.47867391410802607,
"grad_norm": 2.59375,
"learning_rate": 5.4310283148041775e-06,
"loss": 0.185,
"mean_token_accuracy": 0.9606440508365631,
"num_tokens": 2072775995.0,
"step": 19550
},
{
"entropy": 1.36171875,
"epoch": 0.4798981440673816,
"grad_norm": 2.265625,
"learning_rate": 5.427464974689038e-06,
"loss": 0.1772,
"mean_token_accuracy": 0.963237328529358,
"num_tokens": 2078139054.0,
"step": 19600
},
{
"entropy": 1.35703125,
"epoch": 0.4811223740267372,
"grad_norm": 2.90625,
"learning_rate": 5.42389168784391e-06,
"loss": 0.1726,
"mean_token_accuracy": 0.9635715174674988,
"num_tokens": 2083527202.0,
"step": 19650
},
{
"entropy": 1.37875,
"epoch": 0.48234660398609275,
"grad_norm": 3.3125,
"learning_rate": 5.4203084689106225e-06,
"loss": 0.1927,
"mean_token_accuracy": 0.9599621570110322,
"num_tokens": 2089385771.0,
"step": 19700
},
{
"entropy": 1.34265625,
"epoch": 0.4835708339454483,
"grad_norm": 2.296875,
"learning_rate": 5.4167153325716976e-06,
"loss": 0.1663,
"mean_token_accuracy": 0.9641843712329865,
"num_tokens": 2094456460.0,
"step": 19750
},
{
"entropy": 1.3609375,
"epoch": 0.48479506390480387,
"grad_norm": 3.734375,
"learning_rate": 5.413112293550296e-06,
"loss": 0.181,
"mean_token_accuracy": 0.9612398469448089,
"num_tokens": 2099504284.0,
"step": 19800
},
{
"entropy": 1.3709375,
"epoch": 0.48601929386415943,
"grad_norm": 2.53125,
"learning_rate": 5.409499366610154e-06,
"loss": 0.1699,
"mean_token_accuracy": 0.9642571318149566,
"num_tokens": 2104524371.0,
"step": 19850
},
{
"entropy": 1.378125,
"epoch": 0.487243523823515,
"grad_norm": 5.53125,
"learning_rate": 5.405876566555529e-06,
"loss": 0.181,
"mean_token_accuracy": 0.9618199968338013,
"num_tokens": 2109740174.0,
"step": 19900
},
{
"entropy": 1.40078125,
"epoch": 0.48846775378287055,
"grad_norm": 2.0,
"learning_rate": 5.402243908231129e-06,
"loss": 0.1804,
"mean_token_accuracy": 0.962717422246933,
"num_tokens": 2115362415.0,
"step": 19950
},
{
"entropy": 1.37703125,
"epoch": 0.48969198374222617,
"grad_norm": 3.40625,
"learning_rate": 5.398601406522059e-06,
"loss": 0.19,
"mean_token_accuracy": 0.9599020183086395,
"num_tokens": 2121188022.0,
"step": 20000
},
{
"epoch": 0.48969198374222617,
"eval_entropy": 1.366015625,
"eval_loss": 0.1947789192199707,
"eval_mean_token_accuracy": 0.9590674425164859,
"eval_num_tokens": 2121188022.0,
"eval_runtime": 605.3557,
"eval_samples_per_second": 15.951,
"eval_steps_per_second": 0.2,
"step": 20000
},
{
"entropy": 1.36578125,
"epoch": 0.4909162137015817,
"grad_norm": 2.71875,
"learning_rate": 5.3949490763537594e-06,
"loss": 0.1838,
"mean_token_accuracy": 0.9606946921348571,
"num_tokens": 2126472622.0,
"step": 20050
},
{
"entropy": 1.36359375,
"epoch": 0.4921404436609373,
"grad_norm": 2.21875,
"learning_rate": 5.391286932691941e-06,
"loss": 0.1717,
"mean_token_accuracy": 0.963376579284668,
"num_tokens": 2131377659.0,
"step": 20100
},
{
"entropy": 1.37875,
"epoch": 0.49336467362029285,
"grad_norm": 2.46875,
"learning_rate": 5.38761499054253e-06,
"loss": 0.1855,
"mean_token_accuracy": 0.9612623798847199,
"num_tokens": 2136546167.0,
"step": 20150
},
{
"entropy": 1.37296875,
"epoch": 0.4945889035796484,
"grad_norm": 4.40625,
"learning_rate": 5.383933264951596e-06,
"loss": 0.1826,
"mean_token_accuracy": 0.9621403360366821,
"num_tokens": 2141814792.0,
"step": 20200
},
{
"entropy": 1.37328125,
"epoch": 0.49581313353900397,
"grad_norm": 2.40625,
"learning_rate": 5.3802417710053056e-06,
"loss": 0.1804,
"mean_token_accuracy": 0.9616746437549591,
"num_tokens": 2147071830.0,
"step": 20250
},
{
"entropy": 1.38625,
"epoch": 0.49703736349835953,
"grad_norm": 3.375,
"learning_rate": 5.376540523829846e-06,
"loss": 0.1782,
"mean_token_accuracy": 0.9625440466403962,
"num_tokens": 2152428456.0,
"step": 20300
},
{
"entropy": 1.3896875,
"epoch": 0.4982615934577151,
"grad_norm": 2.203125,
"learning_rate": 5.372829538591368e-06,
"loss": 0.1876,
"mean_token_accuracy": 0.9597011947631836,
"num_tokens": 2157932348.0,
"step": 20350
},
{
"entropy": 1.38671875,
"epoch": 0.49948582341707065,
"grad_norm": 2.78125,
"learning_rate": 5.369108830495932e-06,
"loss": 0.1791,
"mean_token_accuracy": 0.9618503451347351,
"num_tokens": 2163273400.0,
"step": 20400
},
{
"entropy": 1.39640625,
"epoch": 0.5007100533764263,
"grad_norm": 2.1875,
"learning_rate": 5.365378414789431e-06,
"loss": 0.1744,
"mean_token_accuracy": 0.9630714511871338,
"num_tokens": 2168498693.0,
"step": 20450
},
{
"entropy": 1.38453125,
"epoch": 0.5019342833357818,
"grad_norm": 5.0625,
"learning_rate": 5.361638306757539e-06,
"loss": 0.1757,
"mean_token_accuracy": 0.963210039138794,
"num_tokens": 2173679268.0,
"step": 20500
},
{
"entropy": 1.40171875,
"epoch": 0.5031585132951374,
"grad_norm": 2.46875,
"learning_rate": 5.357888521725646e-06,
"loss": 0.1827,
"mean_token_accuracy": 0.9613598906993865,
"num_tokens": 2178826743.0,
"step": 20550
},
{
"entropy": 1.3775,
"epoch": 0.504382743254493,
"grad_norm": 2.546875,
"learning_rate": 5.354129075058793e-06,
"loss": 0.1786,
"mean_token_accuracy": 0.9626466917991638,
"num_tokens": 2184130873.0,
"step": 20600
},
{
"entropy": 1.35796875,
"epoch": 0.5056069732138485,
"grad_norm": 1.546875,
"learning_rate": 5.35035998216161e-06,
"loss": 0.1699,
"mean_token_accuracy": 0.9637439405918121,
"num_tokens": 2189388837.0,
"step": 20650
},
{
"entropy": 1.38328125,
"epoch": 0.5068312031732041,
"grad_norm": 1.703125,
"learning_rate": 5.3465812584782545e-06,
"loss": 0.1964,
"mean_token_accuracy": 0.9594271278381348,
"num_tokens": 2195050047.0,
"step": 20700
},
{
"entropy": 1.34203125,
"epoch": 0.5080554331325596,
"grad_norm": 2.3125,
"learning_rate": 5.342792919492344e-06,
"loss": 0.1749,
"mean_token_accuracy": 0.9626959478855133,
"num_tokens": 2200302347.0,
"step": 20750
},
{
"entropy": 1.356875,
"epoch": 0.5092796630919152,
"grad_norm": 2.09375,
"learning_rate": 5.338994980726901e-06,
"loss": 0.1794,
"mean_token_accuracy": 0.9620554232597351,
"num_tokens": 2205512738.0,
"step": 20800
},
{
"entropy": 1.3575,
"epoch": 0.5105038930512708,
"grad_norm": 2.78125,
"learning_rate": 5.335187457744277e-06,
"loss": 0.1823,
"mean_token_accuracy": 0.9618464136123657,
"num_tokens": 2210651777.0,
"step": 20850
},
{
"entropy": 1.33390625,
"epoch": 0.5117281230106263,
"grad_norm": 1.6875,
"learning_rate": 5.3313703661461e-06,
"loss": 0.1819,
"mean_token_accuracy": 0.9613965570926666,
"num_tokens": 2215880518.0,
"step": 20900
},
{
"entropy": 1.3253125,
"epoch": 0.5129523529699819,
"grad_norm": 2.984375,
"learning_rate": 5.327543721573206e-06,
"loss": 0.1752,
"mean_token_accuracy": 0.9638756012916565,
"num_tokens": 2221245311.0,
"step": 20950
},
{
"entropy": 1.32234375,
"epoch": 0.5141765829293374,
"grad_norm": 3.28125,
"learning_rate": 5.323707539705574e-06,
"loss": 0.1748,
"mean_token_accuracy": 0.963612312078476,
"num_tokens": 2226359631.0,
"step": 21000
},
{
"entropy": 1.30609375,
"epoch": 0.515400812888693,
"grad_norm": 2.15625,
"learning_rate": 5.3198618362622614e-06,
"loss": 0.1702,
"mean_token_accuracy": 0.9639462912082672,
"num_tokens": 2231563334.0,
"step": 21050
},
{
"entropy": 1.31953125,
"epoch": 0.5166250428480486,
"grad_norm": 3.265625,
"learning_rate": 5.316006627001344e-06,
"loss": 0.1805,
"mean_token_accuracy": 0.961728732585907,
"num_tokens": 2236847732.0,
"step": 21100
},
{
"entropy": 1.32125,
"epoch": 0.5178492728074041,
"grad_norm": 2.375,
"learning_rate": 5.312141927719849e-06,
"loss": 0.172,
"mean_token_accuracy": 0.9636801743507385,
"num_tokens": 2242148614.0,
"step": 21150
},
{
"entropy": 1.3134375,
"epoch": 0.5190735027667597,
"grad_norm": 2.546875,
"learning_rate": 5.308267754253684e-06,
"loss": 0.1755,
"mean_token_accuracy": 0.9632048571109771,
"num_tokens": 2247694541.0,
"step": 21200
},
{
"entropy": 1.36203125,
"epoch": 0.5202977327261152,
"grad_norm": 1.8359375,
"learning_rate": 5.304384122477584e-06,
"loss": 0.1983,
"mean_token_accuracy": 0.9583926129341126,
"num_tokens": 2253386473.0,
"step": 21250
},
{
"entropy": 1.34703125,
"epoch": 0.5215219626854708,
"grad_norm": 2.140625,
"learning_rate": 5.300491048305037e-06,
"loss": 0.1753,
"mean_token_accuracy": 0.9633457577228546,
"num_tokens": 2258591416.0,
"step": 21300
},
{
"entropy": 1.3553125,
"epoch": 0.5227461926448264,
"grad_norm": 3.140625,
"learning_rate": 5.296588547688221e-06,
"loss": 0.1809,
"mean_token_accuracy": 0.9621423208713531,
"num_tokens": 2263908714.0,
"step": 21350
},
{
"entropy": 1.35140625,
"epoch": 0.5239704226041819,
"grad_norm": 2.5,
"learning_rate": 5.292676636617946e-06,
"loss": 0.1746,
"mean_token_accuracy": 0.9637291979789734,
"num_tokens": 2269014561.0,
"step": 21400
},
{
"entropy": 1.3440625,
"epoch": 0.5251946525635376,
"grad_norm": 2.5625,
"learning_rate": 5.2887553311235736e-06,
"loss": 0.1753,
"mean_token_accuracy": 0.963253127336502,
"num_tokens": 2274143387.0,
"step": 21450
},
{
"entropy": 1.34984375,
"epoch": 0.5264188825228932,
"grad_norm": 1.8203125,
"learning_rate": 5.284824647272965e-06,
"loss": 0.1751,
"mean_token_accuracy": 0.9633476626873017,
"num_tokens": 2279551937.0,
"step": 21500
},
{
"entropy": 1.3815625,
"epoch": 0.5276431124822487,
"grad_norm": 1.765625,
"learning_rate": 5.280884601172408e-06,
"loss": 0.1901,
"mean_token_accuracy": 0.9609255039691925,
"num_tokens": 2284998091.0,
"step": 21550
},
{
"entropy": 1.37375,
"epoch": 0.5288673424416043,
"grad_norm": 2.078125,
"learning_rate": 5.276935208966554e-06,
"loss": 0.1805,
"mean_token_accuracy": 0.9621355581283569,
"num_tokens": 2290404419.0,
"step": 21600
},
{
"entropy": 1.35875,
"epoch": 0.5300915724009598,
"grad_norm": 2.546875,
"learning_rate": 5.272976486838349e-06,
"loss": 0.1839,
"mean_token_accuracy": 0.9618707728385926,
"num_tokens": 2295855308.0,
"step": 21650
},
{
"entropy": 1.34296875,
"epoch": 0.5313158023603154,
"grad_norm": 3.84375,
"learning_rate": 5.269008451008974e-06,
"loss": 0.1683,
"mean_token_accuracy": 0.9649140095710754,
"num_tokens": 2300888682.0,
"step": 21700
},
{
"entropy": 1.3709375,
"epoch": 0.532540032319671,
"grad_norm": 2.046875,
"learning_rate": 5.265031117737765e-06,
"loss": 0.1856,
"mean_token_accuracy": 0.9606757354736328,
"num_tokens": 2306530067.0,
"step": 21750
},
{
"entropy": 1.3528125,
"epoch": 0.5337642622790265,
"grad_norm": 2.984375,
"learning_rate": 5.261044503322165e-06,
"loss": 0.1826,
"mean_token_accuracy": 0.9615514528751373,
"num_tokens": 2312022301.0,
"step": 21800
},
{
"entropy": 1.35828125,
"epoch": 0.5349884922383821,
"grad_norm": 2.5,
"learning_rate": 5.257048624097639e-06,
"loss": 0.1826,
"mean_token_accuracy": 0.9617948019504547,
"num_tokens": 2317336429.0,
"step": 21850
},
{
"entropy": 1.365625,
"epoch": 0.5362127221977376,
"grad_norm": 3.25,
"learning_rate": 5.253043496437619e-06,
"loss": 0.1875,
"mean_token_accuracy": 0.9604008531570435,
"num_tokens": 2322605855.0,
"step": 21900
},
{
"entropy": 1.3403125,
"epoch": 0.5374369521570932,
"grad_norm": 1.1171875,
"learning_rate": 5.249029136753436e-06,
"loss": 0.1757,
"mean_token_accuracy": 0.9632094752788544,
"num_tokens": 2328163176.0,
"step": 21950
},
{
"entropy": 1.3684375,
"epoch": 0.5386611821164488,
"grad_norm": 2.484375,
"learning_rate": 5.245005561494242e-06,
"loss": 0.1804,
"mean_token_accuracy": 0.9627390444278717,
"num_tokens": 2333245056.0,
"step": 22000
},
{
"entropy": 1.384375,
"epoch": 0.5398854120758043,
"grad_norm": 2.859375,
"learning_rate": 5.2409727871469585e-06,
"loss": 0.1926,
"mean_token_accuracy": 0.9592073571681976,
"num_tokens": 2338758359.0,
"step": 22050
},
{
"entropy": 1.35546875,
"epoch": 0.5411096420351599,
"grad_norm": 2.90625,
"learning_rate": 5.236930830236195e-06,
"loss": 0.179,
"mean_token_accuracy": 0.9627534210681915,
"num_tokens": 2344276248.0,
"step": 22100
},
{
"entropy": 1.34953125,
"epoch": 0.5423338719945154,
"grad_norm": 2.078125,
"learning_rate": 5.232879707324194e-06,
"loss": 0.1634,
"mean_token_accuracy": 0.965645101070404,
"num_tokens": 2349615408.0,
"step": 22150
},
{
"entropy": 1.37578125,
"epoch": 0.543558101953871,
"grad_norm": 2.34375,
"learning_rate": 5.228819435010749e-06,
"loss": 0.1678,
"mean_token_accuracy": 0.9645935368537902,
"num_tokens": 2354669027.0,
"step": 22200
},
{
"entropy": 1.3884375,
"epoch": 0.5447823319132266,
"grad_norm": 3.109375,
"learning_rate": 5.224750029933149e-06,
"loss": 0.1811,
"mean_token_accuracy": 0.9621996486186981,
"num_tokens": 2359585884.0,
"step": 22250
},
{
"entropy": 1.38390625,
"epoch": 0.5460065618725821,
"grad_norm": 2.375,
"learning_rate": 5.220671508766104e-06,
"loss": 0.1716,
"mean_token_accuracy": 0.9631420743465423,
"num_tokens": 2364818902.0,
"step": 22300
},
{
"entropy": 1.40234375,
"epoch": 0.5472307918319377,
"grad_norm": 2.03125,
"learning_rate": 5.216583888221676e-06,
"loss": 0.1888,
"mean_token_accuracy": 0.9602623808383942,
"num_tokens": 2370249320.0,
"step": 22350
},
{
"entropy": 1.3871875,
"epoch": 0.5484550217912932,
"grad_norm": 2.078125,
"learning_rate": 5.212487185049215e-06,
"loss": 0.1656,
"mean_token_accuracy": 0.9649445843696595,
"num_tokens": 2375353386.0,
"step": 22400
},
{
"entropy": 1.415625,
"epoch": 0.5496792517506488,
"grad_norm": 2.09375,
"learning_rate": 5.208381416035286e-06,
"loss": 0.1863,
"mean_token_accuracy": 0.9609400224685669,
"num_tokens": 2380836963.0,
"step": 22450
},
{
"entropy": 1.395,
"epoch": 0.5509034817100044,
"grad_norm": 0.00396728515625,
"learning_rate": 5.204266598003604e-06,
"loss": 0.1759,
"mean_token_accuracy": 0.9629833257198334,
"num_tokens": 2385836401.0,
"step": 22500
},
{
"entropy": 1.39046875,
"epoch": 0.5521277116693599,
"grad_norm": 3.671875,
"learning_rate": 5.20014274781496e-06,
"loss": 0.176,
"mean_token_accuracy": 0.9624341118335724,
"num_tokens": 2391023729.0,
"step": 22550
},
{
"entropy": 1.410625,
"epoch": 0.5533519416287156,
"grad_norm": 2.59375,
"learning_rate": 5.196009882367158e-06,
"loss": 0.175,
"mean_token_accuracy": 0.9633600628376007,
"num_tokens": 2396091073.0,
"step": 22600
},
{
"entropy": 1.40546875,
"epoch": 0.5545761715880712,
"grad_norm": 1.640625,
"learning_rate": 5.191868018594941e-06,
"loss": 0.1828,
"mean_token_accuracy": 0.9620015740394592,
"num_tokens": 2401188218.0,
"step": 22650
},
{
"entropy": 1.4009375,
"epoch": 0.5558004015474267,
"grad_norm": 3.328125,
"learning_rate": 5.187717173469924e-06,
"loss": 0.1711,
"mean_token_accuracy": 0.9637360453605652,
"num_tokens": 2406245988.0,
"step": 22700
},
{
"entropy": 1.39234375,
"epoch": 0.5570246315067823,
"grad_norm": 2.0625,
"learning_rate": 5.183557364000523e-06,
"loss": 0.1737,
"mean_token_accuracy": 0.9634659576416016,
"num_tokens": 2411368109.0,
"step": 22750
},
{
"entropy": 1.40296875,
"epoch": 0.5582488614661378,
"grad_norm": 2.265625,
"learning_rate": 5.179388607231889e-06,
"loss": 0.1728,
"mean_token_accuracy": 0.9633192873001098,
"num_tokens": 2416689928.0,
"step": 22800
},
{
"entropy": 1.410625,
"epoch": 0.5594730914254934,
"grad_norm": 2.4375,
"learning_rate": 5.17521092024583e-06,
"loss": 0.1867,
"mean_token_accuracy": 0.9608077311515808,
"num_tokens": 2422352742.0,
"step": 22850
},
{
"entropy": 1.39109375,
"epoch": 0.560697321384849,
"grad_norm": 0.08642578125,
"learning_rate": 5.171024320160752e-06,
"loss": 0.1667,
"mean_token_accuracy": 0.9654168891906738,
"num_tokens": 2427576584.0,
"step": 22900
},
{
"entropy": 1.38734375,
"epoch": 0.5619215513442045,
"grad_norm": 2.75,
"learning_rate": 5.166828824131578e-06,
"loss": 0.1696,
"mean_token_accuracy": 0.9640141320228577,
"num_tokens": 2432765937.0,
"step": 22950
},
{
"entropy": 1.3884375,
"epoch": 0.5631457813035601,
"grad_norm": 2.75,
"learning_rate": 5.162624449349686e-06,
"loss": 0.1801,
"mean_token_accuracy": 0.9613782787322998,
"num_tokens": 2437980184.0,
"step": 23000
},
{
"entropy": 1.3728125,
"epoch": 0.5643700112629156,
"grad_norm": 2.953125,
"learning_rate": 5.158411213042835e-06,
"loss": 0.1675,
"mean_token_accuracy": 0.9656554198265076,
"num_tokens": 2443001633.0,
"step": 23050
},
{
"entropy": 1.39265625,
"epoch": 0.5655942412222712,
"grad_norm": 2.140625,
"learning_rate": 5.154189132475095e-06,
"loss": 0.1826,
"mean_token_accuracy": 0.9614216196537018,
"num_tokens": 2448599009.0,
"step": 23100
},
{
"entropy": 1.3725,
"epoch": 0.5668184711816268,
"grad_norm": 3.34375,
"learning_rate": 5.149958224946776e-06,
"loss": 0.1871,
"mean_token_accuracy": 0.9604478991031646,
"num_tokens": 2454134698.0,
"step": 23150
},
{
"entropy": 1.3503125,
"epoch": 0.5680427011409823,
"grad_norm": 3.140625,
"learning_rate": 5.145718507794354e-06,
"loss": 0.1725,
"mean_token_accuracy": 0.9635867273807526,
"num_tokens": 2459430485.0,
"step": 23200
},
{
"entropy": 1.3696875,
"epoch": 0.5692669311003379,
"grad_norm": 2.0,
"learning_rate": 5.141469998390408e-06,
"loss": 0.1778,
"mean_token_accuracy": 0.9624897265434265,
"num_tokens": 2464814573.0,
"step": 23250
},
{
"entropy": 1.34359375,
"epoch": 0.5704911610596934,
"grad_norm": 3.109375,
"learning_rate": 5.1372127141435415e-06,
"loss": 0.1866,
"mean_token_accuracy": 0.961111787557602,
"num_tokens": 2470288053.0,
"step": 23300
},
{
"entropy": 1.36140625,
"epoch": 0.571715391019049,
"grad_norm": 2.609375,
"learning_rate": 5.132946672498313e-06,
"loss": 0.1847,
"mean_token_accuracy": 0.9609505522251129,
"num_tokens": 2475912972.0,
"step": 23350
},
{
"entropy": 1.3640625,
"epoch": 0.5729396209784046,
"grad_norm": 2.015625,
"learning_rate": 5.128671890935168e-06,
"loss": 0.1868,
"mean_token_accuracy": 0.9606727063655853,
"num_tokens": 2481260397.0,
"step": 23400
},
{
"entropy": 1.36171875,
"epoch": 0.5741638509377601,
"grad_norm": 3.0625,
"learning_rate": 5.12438838697036e-06,
"loss": 0.1667,
"mean_token_accuracy": 0.9649614369869233,
"num_tokens": 2486480334.0,
"step": 23450
},
{
"entropy": 1.34078125,
"epoch": 0.5753880808971157,
"grad_norm": 2.453125,
"learning_rate": 5.120096178155887e-06,
"loss": 0.1739,
"mean_token_accuracy": 0.9637984907627106,
"num_tokens": 2491784273.0,
"step": 23500
},
{
"entropy": 1.37375,
"epoch": 0.5766123108564712,
"grad_norm": 2.796875,
"learning_rate": 5.115795282079414e-06,
"loss": 0.1825,
"mean_token_accuracy": 0.9622078704833984,
"num_tokens": 2496936761.0,
"step": 23550
},
{
"entropy": 1.37890625,
"epoch": 0.5778365408158268,
"grad_norm": 2.578125,
"learning_rate": 5.111485716364204e-06,
"loss": 0.1713,
"mean_token_accuracy": 0.9633621573448181,
"num_tokens": 2502372671.0,
"step": 23600
},
{
"entropy": 1.37671875,
"epoch": 0.5790607707751824,
"grad_norm": 2.34375,
"learning_rate": 5.107167498669044e-06,
"loss": 0.1888,
"mean_token_accuracy": 0.9600040495395661,
"num_tokens": 2508248084.0,
"step": 23650
},
{
"entropy": 1.3646875,
"epoch": 0.5802850007345379,
"grad_norm": 3.296875,
"learning_rate": 5.102840646688173e-06,
"loss": 0.1778,
"mean_token_accuracy": 0.9631288397312164,
"num_tokens": 2513722383.0,
"step": 23700
},
{
"entropy": 1.3534375,
"epoch": 0.5815092306938935,
"grad_norm": 1.7890625,
"learning_rate": 5.0985051781512076e-06,
"loss": 0.1853,
"mean_token_accuracy": 0.9618443667888641,
"num_tokens": 2518947610.0,
"step": 23750
},
{
"entropy": 1.34390625,
"epoch": 0.5827334606532492,
"grad_norm": 2.65625,
"learning_rate": 5.094161110823076e-06,
"loss": 0.178,
"mean_token_accuracy": 0.963310706615448,
"num_tokens": 2524269424.0,
"step": 23800
},
{
"entropy": 1.35328125,
"epoch": 0.5839576906126047,
"grad_norm": 2.59375,
"learning_rate": 5.089808462503938e-06,
"loss": 0.1839,
"mean_token_accuracy": 0.9614792597293854,
"num_tokens": 2529803600.0,
"step": 23850
},
{
"entropy": 1.3525,
"epoch": 0.5851819205719603,
"grad_norm": 3.046875,
"learning_rate": 5.085447251029113e-06,
"loss": 0.1721,
"mean_token_accuracy": 0.963988184928894,
"num_tokens": 2534916174.0,
"step": 23900
},
{
"entropy": 1.35859375,
"epoch": 0.5864061505313158,
"grad_norm": 2.140625,
"learning_rate": 5.081077494269013e-06,
"loss": 0.1857,
"mean_token_accuracy": 0.9612233006954193,
"num_tokens": 2540205630.0,
"step": 23950
},
{
"entropy": 1.35015625,
"epoch": 0.5876303804906714,
"grad_norm": 2.125,
"learning_rate": 5.076699210129059e-06,
"loss": 0.1741,
"mean_token_accuracy": 0.9633960282802582,
"num_tokens": 2545114709.0,
"step": 24000
},
{
"entropy": 1.346875,
"epoch": 0.588854610450027,
"grad_norm": 2.265625,
"learning_rate": 5.072312416549619e-06,
"loss": 0.171,
"mean_token_accuracy": 0.9637422835826874,
"num_tokens": 2550645548.0,
"step": 24050
},
{
"entropy": 1.35140625,
"epoch": 0.5900788404093825,
"grad_norm": 1.8046875,
"learning_rate": 5.067917131505928e-06,
"loss": 0.186,
"mean_token_accuracy": 0.9609566831588745,
"num_tokens": 2556096356.0,
"step": 24100
},
{
"entropy": 1.34828125,
"epoch": 0.5913030703687381,
"grad_norm": 2.375,
"learning_rate": 5.063513373008014e-06,
"loss": 0.1874,
"mean_token_accuracy": 0.9602975726127625,
"num_tokens": 2561716691.0,
"step": 24150
},
{
"entropy": 1.36828125,
"epoch": 0.5925273003280936,
"grad_norm": 1.7578125,
"learning_rate": 5.059101159100625e-06,
"loss": 0.1911,
"mean_token_accuracy": 0.9601788830757141,
"num_tokens": 2566995725.0,
"step": 24200
},
{
"entropy": 1.36234375,
"epoch": 0.5937515302874492,
"grad_norm": 2.671875,
"learning_rate": 5.054680507863158e-06,
"loss": 0.196,
"mean_token_accuracy": 0.9593268644809723,
"num_tokens": 2572823278.0,
"step": 24250
},
{
"entropy": 1.36125,
"epoch": 0.5949757602468048,
"grad_norm": 2.375,
"learning_rate": 5.050251437409581e-06,
"loss": 0.1746,
"mean_token_accuracy": 0.9630362141132355,
"num_tokens": 2577835467.0,
"step": 24300
},
{
"entropy": 1.365625,
"epoch": 0.5961999902061603,
"grad_norm": 3.140625,
"learning_rate": 5.045813965888362e-06,
"loss": 0.184,
"mean_token_accuracy": 0.9621260786056518,
"num_tokens": 2582930120.0,
"step": 24350
},
{
"entropy": 1.355625,
"epoch": 0.5974242201655159,
"grad_norm": 3.40625,
"learning_rate": 5.04136811148239e-06,
"loss": 0.1697,
"mean_token_accuracy": 0.963900375366211,
"num_tokens": 2587853502.0,
"step": 24400
},
{
"entropy": 1.36140625,
"epoch": 0.5986484501248714,
"grad_norm": 2.4375,
"learning_rate": 5.036913892408908e-06,
"loss": 0.1837,
"mean_token_accuracy": 0.9621051216125488,
"num_tokens": 2593227737.0,
"step": 24450
},
{
"entropy": 1.3525,
"epoch": 0.599872680084227,
"grad_norm": 2.203125,
"learning_rate": 5.032451326919429e-06,
"loss": 0.1799,
"mean_token_accuracy": 0.962098822593689,
"num_tokens": 2598591436.0,
"step": 24500
},
{
"entropy": 1.34015625,
"epoch": 0.6010969100435826,
"grad_norm": 2.53125,
"learning_rate": 5.027980433299671e-06,
"loss": 0.1758,
"mean_token_accuracy": 0.9619297671318054,
"num_tokens": 2604000565.0,
"step": 24550
},
{
"entropy": 1.3484375,
"epoch": 0.6023211400029381,
"grad_norm": 2.71875,
"learning_rate": 5.023501229869474e-06,
"loss": 0.1737,
"mean_token_accuracy": 0.9643021488189697,
"num_tokens": 2608991683.0,
"step": 24600
},
{
"entropy": 1.33015625,
"epoch": 0.6035453699622937,
"grad_norm": 1.9765625,
"learning_rate": 5.0190137349827266e-06,
"loss": 0.1665,
"mean_token_accuracy": 0.9643359172344208,
"num_tokens": 2614123184.0,
"step": 24650
},
{
"entropy": 1.344375,
"epoch": 0.6047695999216492,
"grad_norm": 2.96875,
"learning_rate": 5.014517967027297e-06,
"loss": 0.1805,
"mean_token_accuracy": 0.962350081205368,
"num_tokens": 2619309044.0,
"step": 24700
},
{
"entropy": 1.3540625,
"epoch": 0.6059938298810048,
"grad_norm": 2.734375,
"learning_rate": 5.01001394442495e-06,
"loss": 0.1776,
"mean_token_accuracy": 0.9621638679504394,
"num_tokens": 2624919047.0,
"step": 24750
},
{
"entropy": 1.34859375,
"epoch": 0.6072180598403604,
"grad_norm": 3.03125,
"learning_rate": 5.005501685631273e-06,
"loss": 0.1733,
"mean_token_accuracy": 0.9635497546195984,
"num_tokens": 2630407723.0,
"step": 24800
},
{
"entropy": 1.3534375,
"epoch": 0.6084422897997159,
"grad_norm": 1.5390625,
"learning_rate": 5.000981209135607e-06,
"loss": 0.1781,
"mean_token_accuracy": 0.9629986727237702,
"num_tokens": 2635671685.0,
"step": 24850
},
{
"entropy": 1.3459375,
"epoch": 0.6096665197590715,
"grad_norm": 3.71875,
"learning_rate": 4.9964525334609604e-06,
"loss": 0.174,
"mean_token_accuracy": 0.9627162063121796,
"num_tokens": 2641068693.0,
"step": 24900
},
{
"entropy": 1.35453125,
"epoch": 0.6108907497184272,
"grad_norm": 2.75,
"learning_rate": 4.99191567716394e-06,
"loss": 0.1796,
"mean_token_accuracy": 0.9617865860462189,
"num_tokens": 2646610014.0,
"step": 24950
},
{
"entropy": 1.37453125,
"epoch": 0.6121149796777827,
"grad_norm": 3.109375,
"learning_rate": 4.987370658834675e-06,
"loss": 0.1833,
"mean_token_accuracy": 0.9610668885707855,
"num_tokens": 2651951764.0,
"step": 25000
},
{
"entropy": 1.40046875,
"epoch": 0.6133392096371383,
"grad_norm": 3.828125,
"learning_rate": 4.982817497096737e-06,
"loss": 0.1758,
"mean_token_accuracy": 0.9631572890281678,
"num_tokens": 2657065776.0,
"step": 25050
},
{
"entropy": 1.38859375,
"epoch": 0.6145634395964938,
"grad_norm": 3.0625,
"learning_rate": 4.978256210607068e-06,
"loss": 0.1738,
"mean_token_accuracy": 0.9639844071865081,
"num_tokens": 2662222291.0,
"step": 25100
},
{
"entropy": 1.3496875,
"epoch": 0.6157876695558494,
"grad_norm": 3.21875,
"learning_rate": 4.973686818055901e-06,
"loss": 0.1684,
"mean_token_accuracy": 0.9642084753513336,
"num_tokens": 2667209443.0,
"step": 25150
},
{
"entropy": 1.36375,
"epoch": 0.617011899515205,
"grad_norm": 1.859375,
"learning_rate": 4.969109338166683e-06,
"loss": 0.1719,
"mean_token_accuracy": 0.9646093189716339,
"num_tokens": 2672346139.0,
"step": 25200
},
{
"entropy": 1.38625,
"epoch": 0.6182361294745605,
"grad_norm": 2.40625,
"learning_rate": 4.964523789695999e-06,
"loss": 0.1855,
"mean_token_accuracy": 0.9612112033367157,
"num_tokens": 2677709139.0,
"step": 25250
},
{
"entropy": 1.38171875,
"epoch": 0.6194603594339161,
"grad_norm": 2.90625,
"learning_rate": 4.959930191433498e-06,
"loss": 0.1832,
"mean_token_accuracy": 0.9613463747501373,
"num_tokens": 2682889432.0,
"step": 25300
},
{
"entropy": 1.39375,
"epoch": 0.6206845893932716,
"grad_norm": 2.8125,
"learning_rate": 4.955328562201814e-06,
"loss": 0.1953,
"mean_token_accuracy": 0.959397931098938,
"num_tokens": 2688531671.0,
"step": 25350
},
{
"entropy": 1.396875,
"epoch": 0.6219088193526272,
"grad_norm": 1.8984375,
"learning_rate": 4.950718920856486e-06,
"loss": 0.1882,
"mean_token_accuracy": 0.9605313742160797,
"num_tokens": 2693586026.0,
"step": 25400
},
{
"entropy": 1.38203125,
"epoch": 0.6231330493119828,
"grad_norm": 2.328125,
"learning_rate": 4.946101286285884e-06,
"loss": 0.1708,
"mean_token_accuracy": 0.9638578796386719,
"num_tokens": 2698728829.0,
"step": 25450
},
{
"entropy": 1.3803125,
"epoch": 0.6243572792713383,
"grad_norm": 3.53125,
"learning_rate": 4.9414756774111335e-06,
"loss": 0.167,
"mean_token_accuracy": 0.9648666107654571,
"num_tokens": 2703894118.0,
"step": 25500
},
{
"entropy": 1.4071875,
"epoch": 0.6255815092306939,
"grad_norm": 3.46875,
"learning_rate": 4.93684211318603e-06,
"loss": 0.1782,
"mean_token_accuracy": 0.962544618844986,
"num_tokens": 2709087928.0,
"step": 25550
},
{
"entropy": 1.40078125,
"epoch": 0.6268057391900494,
"grad_norm": 3.28125,
"learning_rate": 4.932200612596974e-06,
"loss": 0.1757,
"mean_token_accuracy": 0.963033629655838,
"num_tokens": 2714244664.0,
"step": 25600
},
{
"entropy": 1.401875,
"epoch": 0.628029969149405,
"grad_norm": 3.859375,
"learning_rate": 4.927551194662878e-06,
"loss": 0.1701,
"mean_token_accuracy": 0.9642516016960144,
"num_tokens": 2719276387.0,
"step": 25650
},
{
"entropy": 1.4296875,
"epoch": 0.6292541991087606,
"grad_norm": 2.625,
"learning_rate": 4.922893878435101e-06,
"loss": 0.1877,
"mean_token_accuracy": 0.9612637603282929,
"num_tokens": 2724924886.0,
"step": 25700
},
{
"entropy": 1.40390625,
"epoch": 0.6304784290681161,
"grad_norm": 2.546875,
"learning_rate": 4.918228682997367e-06,
"loss": 0.1751,
"mean_token_accuracy": 0.9626137948036194,
"num_tokens": 2730190384.0,
"step": 25750
},
{
"entropy": 1.4384375,
"epoch": 0.6317026590274717,
"grad_norm": 1.7421875,
"learning_rate": 4.9135556274656825e-06,
"loss": 0.1921,
"mean_token_accuracy": 0.9599238002300262,
"num_tokens": 2735642568.0,
"step": 25800
},
{
"entropy": 1.43296875,
"epoch": 0.6329268889868273,
"grad_norm": 2.609375,
"learning_rate": 4.908874730988262e-06,
"loss": 0.1859,
"mean_token_accuracy": 0.9601176917552948,
"num_tokens": 2741009627.0,
"step": 25850
},
{
"entropy": 1.42296875,
"epoch": 0.6341511189461828,
"grad_norm": 2.171875,
"learning_rate": 4.904186012745451e-06,
"loss": 0.1836,
"mean_token_accuracy": 0.9604202997684479,
"num_tokens": 2746576865.0,
"step": 25900
},
{
"entropy": 1.42078125,
"epoch": 0.6353753489055384,
"grad_norm": 3.109375,
"learning_rate": 4.899489491949643e-06,
"loss": 0.1678,
"mean_token_accuracy": 0.9639356219768525,
"num_tokens": 2751636571.0,
"step": 25950
},
{
"entropy": 1.43125,
"epoch": 0.6365995788648939,
"grad_norm": 3.328125,
"learning_rate": 4.894785187845203e-06,
"loss": 0.1763,
"mean_token_accuracy": 0.9626227140426635,
"num_tokens": 2756749043.0,
"step": 26000
},
{
"entropy": 1.41953125,
"epoch": 0.6378238088242495,
"grad_norm": 1.921875,
"learning_rate": 4.890073119708392e-06,
"loss": 0.1716,
"mean_token_accuracy": 0.9636380136013031,
"num_tokens": 2761887971.0,
"step": 26050
},
{
"entropy": 1.42109375,
"epoch": 0.6390480387836052,
"grad_norm": 2.0625,
"learning_rate": 4.88535330684728e-06,
"loss": 0.1754,
"mean_token_accuracy": 0.9623912250995637,
"num_tokens": 2767051370.0,
"step": 26100
},
{
"entropy": 1.4259375,
"epoch": 0.6402722687429607,
"grad_norm": 2.546875,
"learning_rate": 4.880625768601674e-06,
"loss": 0.1781,
"mean_token_accuracy": 0.9622378349304199,
"num_tokens": 2772481902.0,
"step": 26150
},
{
"entropy": 1.4315625,
"epoch": 0.6414964987023163,
"grad_norm": 2.484375,
"learning_rate": 4.87589052434304e-06,
"loss": 0.1874,
"mean_token_accuracy": 0.9602720224857331,
"num_tokens": 2777927527.0,
"step": 26200
},
{
"entropy": 1.4140625,
"epoch": 0.6427207286616718,
"grad_norm": 2.421875,
"learning_rate": 4.871147593474412e-06,
"loss": 0.184,
"mean_token_accuracy": 0.9599432504177093,
"num_tokens": 2783446389.0,
"step": 26250
},
{
"entropy": 1.4053125,
"epoch": 0.6439449586210274,
"grad_norm": 2.40625,
"learning_rate": 4.866396995430328e-06,
"loss": 0.1786,
"mean_token_accuracy": 0.9628067684173583,
"num_tokens": 2788980882.0,
"step": 26300
},
{
"entropy": 1.38875,
"epoch": 0.645169188580383,
"grad_norm": 2.71875,
"learning_rate": 4.861638749676737e-06,
"loss": 0.1677,
"mean_token_accuracy": 0.9639978551864624,
"num_tokens": 2793955184.0,
"step": 26350
},
{
"entropy": 1.4034375,
"epoch": 0.6463934185397385,
"grad_norm": 1.6953125,
"learning_rate": 4.85687287571093e-06,
"loss": 0.1721,
"mean_token_accuracy": 0.9636970722675323,
"num_tokens": 2799185455.0,
"step": 26400
},
{
"entropy": 1.40828125,
"epoch": 0.6476176484990941,
"grad_norm": 3.640625,
"learning_rate": 4.852099393061452e-06,
"loss": 0.1818,
"mean_token_accuracy": 0.962208844423294,
"num_tokens": 2804463803.0,
"step": 26450
},
{
"entropy": 1.38484375,
"epoch": 0.6488418784584497,
"grad_norm": 1.75,
"learning_rate": 4.847318321288027e-06,
"loss": 0.165,
"mean_token_accuracy": 0.9649109244346619,
"num_tokens": 2809874779.0,
"step": 26500
},
{
"entropy": 1.37953125,
"epoch": 0.6500661084178052,
"grad_norm": 2.984375,
"learning_rate": 4.842529679981474e-06,
"loss": 0.1694,
"mean_token_accuracy": 0.9632159042358398,
"num_tokens": 2814714128.0,
"step": 26550
},
{
"entropy": 1.39625,
"epoch": 0.6512903383771608,
"grad_norm": 2.765625,
"learning_rate": 4.8377334887636305e-06,
"loss": 0.1697,
"mean_token_accuracy": 0.9637495183944702,
"num_tokens": 2819740494.0,
"step": 26600
},
{
"entropy": 1.39109375,
"epoch": 0.6525145683365163,
"grad_norm": 3.03125,
"learning_rate": 4.8329297672872695e-06,
"loss": 0.1816,
"mean_token_accuracy": 0.9610202670097351,
"num_tokens": 2824966205.0,
"step": 26650
},
{
"entropy": 1.37796875,
"epoch": 0.6537387982958719,
"grad_norm": 2.53125,
"learning_rate": 4.828118535236023e-06,
"loss": 0.1742,
"mean_token_accuracy": 0.9625972366333008,
"num_tokens": 2830034251.0,
"step": 26700
},
{
"entropy": 1.3953125,
"epoch": 0.6549630282552275,
"grad_norm": 2.28125,
"learning_rate": 4.823299812324291e-06,
"loss": 0.1847,
"mean_token_accuracy": 0.9611959600448609,
"num_tokens": 2835494370.0,
"step": 26750
},
{
"entropy": 1.38203125,
"epoch": 0.656187258214583,
"grad_norm": 2.15625,
"learning_rate": 4.818473618297175e-06,
"loss": 0.1728,
"mean_token_accuracy": 0.9636625552177429,
"num_tokens": 2840744565.0,
"step": 26800
},
{
"entropy": 1.3696875,
"epoch": 0.6574114881739386,
"grad_norm": 3.671875,
"learning_rate": 4.8136399729303875e-06,
"loss": 0.1599,
"mean_token_accuracy": 0.9664247930049896,
"num_tokens": 2845515500.0,
"step": 26850
},
{
"entropy": 1.39671875,
"epoch": 0.6586357181332941,
"grad_norm": 2.140625,
"learning_rate": 4.808798896030171e-06,
"loss": 0.182,
"mean_token_accuracy": 0.9610953998565673,
"num_tokens": 2850746030.0,
"step": 26900
},
{
"entropy": 1.38609375,
"epoch": 0.6598599480926497,
"grad_norm": 1.578125,
"learning_rate": 4.803950407433224e-06,
"loss": 0.1774,
"mean_token_accuracy": 0.9627044332027436,
"num_tokens": 2856071580.0,
"step": 26950
},
{
"entropy": 1.38640625,
"epoch": 0.6610841780520053,
"grad_norm": 2.359375,
"learning_rate": 4.799094527006611e-06,
"loss": 0.1747,
"mean_token_accuracy": 0.9633591079711914,
"num_tokens": 2861236205.0,
"step": 27000
},
{
"entropy": 1.38140625,
"epoch": 0.6623084080113608,
"grad_norm": 2.046875,
"learning_rate": 4.794231274647687e-06,
"loss": 0.175,
"mean_token_accuracy": 0.9629326021671295,
"num_tokens": 2866317531.0,
"step": 27050
},
{
"entropy": 1.37421875,
"epoch": 0.6635326379707164,
"grad_norm": 2.765625,
"learning_rate": 4.789360670284014e-06,
"loss": 0.178,
"mean_token_accuracy": 0.962060467004776,
"num_tokens": 2871541131.0,
"step": 27100
},
{
"entropy": 1.4078125,
"epoch": 0.6647568679300719,
"grad_norm": 1.921875,
"learning_rate": 4.784482733873279e-06,
"loss": 0.1962,
"mean_token_accuracy": 0.959048901796341,
"num_tokens": 2877146197.0,
"step": 27150
},
{
"entropy": 1.3890625,
"epoch": 0.6659810978894275,
"grad_norm": 2.125,
"learning_rate": 4.7795974854032114e-06,
"loss": 0.1823,
"mean_token_accuracy": 0.9619522738456726,
"num_tokens": 2882596630.0,
"step": 27200
},
{
"entropy": 1.3603125,
"epoch": 0.6672053278487832,
"grad_norm": 2.421875,
"learning_rate": 4.774704944891505e-06,
"loss": 0.175,
"mean_token_accuracy": 0.9625801253318786,
"num_tokens": 2887948438.0,
"step": 27250
},
{
"entropy": 1.39546875,
"epoch": 0.6684295578081387,
"grad_norm": 2.265625,
"learning_rate": 4.769805132385734e-06,
"loss": 0.1879,
"mean_token_accuracy": 0.9613603317737579,
"num_tokens": 2893501173.0,
"step": 27300
},
{
"entropy": 1.40875,
"epoch": 0.6696537877674943,
"grad_norm": 2.3125,
"learning_rate": 4.764898067963265e-06,
"loss": 0.1873,
"mean_token_accuracy": 0.9604850566387176,
"num_tokens": 2898869944.0,
"step": 27350
},
{
"entropy": 1.37859375,
"epoch": 0.6708780177268499,
"grad_norm": 2.40625,
"learning_rate": 4.759983771731184e-06,
"loss": 0.1679,
"mean_token_accuracy": 0.965053141117096,
"num_tokens": 2903596870.0,
"step": 27400
},
{
"entropy": 1.37453125,
"epoch": 0.6721022476862054,
"grad_norm": 2.03125,
"learning_rate": 4.75506226382621e-06,
"loss": 0.1862,
"mean_token_accuracy": 0.9613700366020203,
"num_tokens": 2909474929.0,
"step": 27450
},
{
"entropy": 1.36875,
"epoch": 0.673326477645561,
"grad_norm": 2.453125,
"learning_rate": 4.750133564414611e-06,
"loss": 0.1667,
"mean_token_accuracy": 0.9644119250774383,
"num_tokens": 2914673564.0,
"step": 27500
},
{
"entropy": 1.396875,
"epoch": 0.6745507076049165,
"grad_norm": 2.796875,
"learning_rate": 4.745197693692121e-06,
"loss": 0.1852,
"mean_token_accuracy": 0.9608116745948792,
"num_tokens": 2920176865.0,
"step": 27550
},
{
"entropy": 1.41515625,
"epoch": 0.6757749375642721,
"grad_norm": 1.8359375,
"learning_rate": 4.740254671883864e-06,
"loss": 0.1912,
"mean_token_accuracy": 0.9596376729011535,
"num_tokens": 2925586459.0,
"step": 27600
},
{
"entropy": 1.3996875,
"epoch": 0.6769991675236277,
"grad_norm": 3.65625,
"learning_rate": 4.735304519244263e-06,
"loss": 0.1745,
"mean_token_accuracy": 0.9637066113948822,
"num_tokens": 2930825954.0,
"step": 27650
},
{
"entropy": 1.3809375,
"epoch": 0.6782233974829832,
"grad_norm": 1.921875,
"learning_rate": 4.73034725605696e-06,
"loss": 0.1658,
"mean_token_accuracy": 0.9653242897987365,
"num_tokens": 2935862959.0,
"step": 27700
},
{
"entropy": 1.38953125,
"epoch": 0.6794476274423388,
"grad_norm": 3.0625,
"learning_rate": 4.725382902634733e-06,
"loss": 0.1681,
"mean_token_accuracy": 0.9643997454643249,
"num_tokens": 2940725166.0,
"step": 27750
},
{
"entropy": 1.40421875,
"epoch": 0.6806718574016943,
"grad_norm": 2.859375,
"learning_rate": 4.720411479319414e-06,
"loss": 0.1725,
"mean_token_accuracy": 0.9641519057750702,
"num_tokens": 2946188027.0,
"step": 27800
},
{
"entropy": 1.40796875,
"epoch": 0.6818960873610499,
"grad_norm": 2.828125,
"learning_rate": 4.7154330064818045e-06,
"loss": 0.1841,
"mean_token_accuracy": 0.9606011056900025,
"num_tokens": 2951612651.0,
"step": 27850
},
{
"entropy": 1.395625,
"epoch": 0.6831203173204055,
"grad_norm": 2.96875,
"learning_rate": 4.710447504521588e-06,
"loss": 0.1647,
"mean_token_accuracy": 0.9641698563098907,
"num_tokens": 2956787623.0,
"step": 27900
},
{
"entropy": 1.40359375,
"epoch": 0.684344547279761,
"grad_norm": 3.5625,
"learning_rate": 4.705454993867257e-06,
"loss": 0.1751,
"mean_token_accuracy": 0.9634602963924408,
"num_tokens": 2961925459.0,
"step": 27950
},
{
"entropy": 1.3925,
"epoch": 0.6855687772391166,
"grad_norm": 1.921875,
"learning_rate": 4.700455494976019e-06,
"loss": 0.1751,
"mean_token_accuracy": 0.9632600677013398,
"num_tokens": 2967274024.0,
"step": 28000
},
{
"entropy": 1.3640625,
"epoch": 0.6867930071984721,
"grad_norm": 2.140625,
"learning_rate": 4.695449028333715e-06,
"loss": 0.1581,
"mean_token_accuracy": 0.965574380159378,
"num_tokens": 2972439136.0,
"step": 28050
},
{
"entropy": 1.37203125,
"epoch": 0.6880172371578277,
"grad_norm": 2.640625,
"learning_rate": 4.6904356144547405e-06,
"loss": 0.1833,
"mean_token_accuracy": 0.9605630087852478,
"num_tokens": 2977717715.0,
"step": 28100
},
{
"entropy": 1.38703125,
"epoch": 0.6892414671171833,
"grad_norm": 2.65625,
"learning_rate": 4.685415273881955e-06,
"loss": 0.1849,
"mean_token_accuracy": 0.9602934348583222,
"num_tokens": 2983019999.0,
"step": 28150
},
{
"entropy": 1.36609375,
"epoch": 0.6904656970765388,
"grad_norm": 1.65625,
"learning_rate": 4.6803880271866e-06,
"loss": 0.1635,
"mean_token_accuracy": 0.9659206521511078,
"num_tokens": 2987974089.0,
"step": 28200
},
{
"entropy": 1.38875,
"epoch": 0.6916899270358944,
"grad_norm": 2.171875,
"learning_rate": 4.675353894968219e-06,
"loss": 0.1956,
"mean_token_accuracy": 0.958441025018692,
"num_tokens": 2993587967.0,
"step": 28250
},
{
"entropy": 1.3828125,
"epoch": 0.6929141569952499,
"grad_norm": 1.796875,
"learning_rate": 4.670312897854568e-06,
"loss": 0.1822,
"mean_token_accuracy": 0.9611673438549042,
"num_tokens": 2999047067.0,
"step": 28300
},
{
"entropy": 1.36875,
"epoch": 0.6941383869546055,
"grad_norm": 2.375,
"learning_rate": 4.665265056501529e-06,
"loss": 0.1743,
"mean_token_accuracy": 0.9631416380405426,
"num_tokens": 3004064576.0,
"step": 28350
},
{
"entropy": 1.34109375,
"epoch": 0.6953626169139612,
"grad_norm": 3.0625,
"learning_rate": 4.660210391593035e-06,
"loss": 0.1593,
"mean_token_accuracy": 0.9659523034095764,
"num_tokens": 3009178123.0,
"step": 28400
},
{
"entropy": 1.36859375,
"epoch": 0.6965868468733167,
"grad_norm": 2.96875,
"learning_rate": 4.655148923840974e-06,
"loss": 0.1848,
"mean_token_accuracy": 0.9613404250144959,
"num_tokens": 3014406061.0,
"step": 28450
},
{
"entropy": 1.36828125,
"epoch": 0.6978110768326723,
"grad_norm": 2.234375,
"learning_rate": 4.6500806739851114e-06,
"loss": 0.1754,
"mean_token_accuracy": 0.9632516479492188,
"num_tokens": 3019405252.0,
"step": 28500
},
{
"entropy": 1.36640625,
"epoch": 0.6990353067920279,
"grad_norm": 3.265625,
"learning_rate": 4.645005662793002e-06,
"loss": 0.1765,
"mean_token_accuracy": 0.9634008550643921,
"num_tokens": 3024715395.0,
"step": 28550
},
{
"entropy": 1.386875,
"epoch": 0.7002595367513834,
"grad_norm": 1.7265625,
"learning_rate": 4.639923911059907e-06,
"loss": 0.1792,
"mean_token_accuracy": 0.9633400416374207,
"num_tokens": 3030214594.0,
"step": 28600
},
{
"entropy": 1.36390625,
"epoch": 0.701483766710739,
"grad_norm": 2.828125,
"learning_rate": 4.634835439608706e-06,
"loss": 0.1712,
"mean_token_accuracy": 0.9632709419727326,
"num_tokens": 3035472593.0,
"step": 28650
},
{
"entropy": 1.34984375,
"epoch": 0.7027079966700945,
"grad_norm": 2.640625,
"learning_rate": 4.629740269289813e-06,
"loss": 0.1634,
"mean_token_accuracy": 0.9657196223735809,
"num_tokens": 3040576077.0,
"step": 28700
},
{
"entropy": 1.37296875,
"epoch": 0.7039322266294501,
"grad_norm": 1.8125,
"learning_rate": 4.6246384209810935e-06,
"loss": 0.1857,
"mean_token_accuracy": 0.9612914025783539,
"num_tokens": 3046057341.0,
"step": 28750
},
{
"entropy": 1.35765625,
"epoch": 0.7051564565888057,
"grad_norm": 3.5,
"learning_rate": 4.6195299155877746e-06,
"loss": 0.1752,
"mean_token_accuracy": 0.9628597724437714,
"num_tokens": 3051406159.0,
"step": 28800
},
{
"entropy": 1.34625,
"epoch": 0.7063806865481612,
"grad_norm": 2.046875,
"learning_rate": 4.61441477404236e-06,
"loss": 0.1736,
"mean_token_accuracy": 0.963384006023407,
"num_tokens": 3056663844.0,
"step": 28850
},
{
"entropy": 1.35421875,
"epoch": 0.7076049165075168,
"grad_norm": 2.546875,
"learning_rate": 4.60929301730455e-06,
"loss": 0.1857,
"mean_token_accuracy": 0.9611174511909485,
"num_tokens": 3062180594.0,
"step": 28900
},
{
"entropy": 1.3396875,
"epoch": 0.7088291464668723,
"grad_norm": 2.171875,
"learning_rate": 4.604164666361146e-06,
"loss": 0.1771,
"mean_token_accuracy": 0.9630412280559539,
"num_tokens": 3067629529.0,
"step": 28950
},
{
"entropy": 1.3521875,
"epoch": 0.7100533764262279,
"grad_norm": 2.53125,
"learning_rate": 4.599029742225975e-06,
"loss": 0.1854,
"mean_token_accuracy": 0.9603700506687164,
"num_tokens": 3072962675.0,
"step": 29000
},
{
"entropy": 1.34265625,
"epoch": 0.7112776063855835,
"grad_norm": 2.578125,
"learning_rate": 4.593888265939793e-06,
"loss": 0.1668,
"mean_token_accuracy": 0.9641862511634827,
"num_tokens": 3078457917.0,
"step": 29050
},
{
"entropy": 1.3565625,
"epoch": 0.712501836344939,
"grad_norm": 2.484375,
"learning_rate": 4.5887402585702056e-06,
"loss": 0.1741,
"mean_token_accuracy": 0.9627685403823852,
"num_tokens": 3083722495.0,
"step": 29100
},
{
"entropy": 1.3690625,
"epoch": 0.7137260663042946,
"grad_norm": 2.0,
"learning_rate": 4.583585741211583e-06,
"loss": 0.1782,
"mean_token_accuracy": 0.9620171189308167,
"num_tokens": 3089097439.0,
"step": 29150
},
{
"entropy": 1.3615625,
"epoch": 0.7149502962636501,
"grad_norm": 2.90625,
"learning_rate": 4.5784247349849666e-06,
"loss": 0.183,
"mean_token_accuracy": 0.9622057628631592,
"num_tokens": 3094373355.0,
"step": 29200
},
{
"entropy": 1.3421875,
"epoch": 0.7161745262230057,
"grad_norm": 1.953125,
"learning_rate": 4.57325726103799e-06,
"loss": 0.1771,
"mean_token_accuracy": 0.9627100145816803,
"num_tokens": 3099619006.0,
"step": 29250
},
{
"entropy": 1.33015625,
"epoch": 0.7173987561823613,
"grad_norm": 3.296875,
"learning_rate": 4.568083340544785e-06,
"loss": 0.1738,
"mean_token_accuracy": 0.9631901240348816,
"num_tokens": 3104769496.0,
"step": 29300
},
{
"entropy": 1.32921875,
"epoch": 0.7186229861417168,
"grad_norm": 2.359375,
"learning_rate": 4.562902994705902e-06,
"loss": 0.1689,
"mean_token_accuracy": 0.9646138906478882,
"num_tokens": 3110079410.0,
"step": 29350
},
{
"entropy": 1.3515625,
"epoch": 0.7198472161010724,
"grad_norm": 2.640625,
"learning_rate": 4.557716244748217e-06,
"loss": 0.186,
"mean_token_accuracy": 0.9605904114246369,
"num_tokens": 3115590754.0,
"step": 29400
},
{
"entropy": 1.33421875,
"epoch": 0.721071446060428,
"grad_norm": 1.859375,
"learning_rate": 4.55252311192485e-06,
"loss": 0.1727,
"mean_token_accuracy": 0.9634395956993103,
"num_tokens": 3120943769.0,
"step": 29450
},
{
"entropy": 1.3384375,
"epoch": 0.7222956760197835,
"grad_norm": 1.8515625,
"learning_rate": 4.547323617515073e-06,
"loss": 0.1754,
"mean_token_accuracy": 0.9623040866851806,
"num_tokens": 3126534469.0,
"step": 29500
},
{
"entropy": 1.306875,
"epoch": 0.7235199059791391,
"grad_norm": 3.5,
"learning_rate": 4.542117782824228e-06,
"loss": 0.1649,
"mean_token_accuracy": 0.9650185751914978,
"num_tokens": 3131829007.0,
"step": 29550
},
{
"entropy": 1.31984375,
"epoch": 0.7247441359384947,
"grad_norm": 1.7109375,
"learning_rate": 4.536905629183632e-06,
"loss": 0.1844,
"mean_token_accuracy": 0.9605432045459747,
"num_tokens": 3137395527.0,
"step": 29600
},
{
"entropy": 1.3121875,
"epoch": 0.7259683658978503,
"grad_norm": 2.3125,
"learning_rate": 4.5316871779505e-06,
"loss": 0.1663,
"mean_token_accuracy": 0.9653282749652863,
"num_tokens": 3142501686.0,
"step": 29650
},
{
"entropy": 1.33921875,
"epoch": 0.7271925958572059,
"grad_norm": 1.9765625,
"learning_rate": 4.5264624505078485e-06,
"loss": 0.1796,
"mean_token_accuracy": 0.9623512411117554,
"num_tokens": 3147984109.0,
"step": 29700
},
{
"entropy": 1.3259375,
"epoch": 0.7284168258165614,
"grad_norm": 3.671875,
"learning_rate": 4.521231468264411e-06,
"loss": 0.173,
"mean_token_accuracy": 0.9634522151947021,
"num_tokens": 3153428961.0,
"step": 29750
},
{
"entropy": 1.339375,
"epoch": 0.729641055775917,
"grad_norm": 1.8046875,
"learning_rate": 4.515994252654552e-06,
"loss": 0.1846,
"mean_token_accuracy": 0.9607186770439148,
"num_tokens": 3158828246.0,
"step": 29800
},
{
"entropy": 1.29671875,
"epoch": 0.7308652857352725,
"grad_norm": 3.140625,
"learning_rate": 4.510750825138178e-06,
"loss": 0.1608,
"mean_token_accuracy": 0.9657926094532013,
"num_tokens": 3163804439.0,
"step": 29850
},
{
"entropy": 1.3315625,
"epoch": 0.7320895156946281,
"grad_norm": 2.9375,
"learning_rate": 4.505501207200649e-06,
"loss": 0.1818,
"mean_token_accuracy": 0.9619475591182709,
"num_tokens": 3169333412.0,
"step": 29900
},
{
"entropy": 1.324375,
"epoch": 0.7333137456539837,
"grad_norm": 2.15625,
"learning_rate": 4.500245420352687e-06,
"loss": 0.1733,
"mean_token_accuracy": 0.963250036239624,
"num_tokens": 3174683947.0,
"step": 29950
},
{
"entropy": 1.32015625,
"epoch": 0.7345379756133392,
"grad_norm": 3.171875,
"learning_rate": 4.494983486130298e-06,
"loss": 0.1755,
"mean_token_accuracy": 0.9633795261383057,
"num_tokens": 3179817804.0,
"step": 30000
},
{
"epoch": 0.7345379756133392,
"eval_entropy": 1.3244140625,
"eval_loss": 0.1920091211795807,
"eval_mean_token_accuracy": 0.9597868000467619,
"eval_num_tokens": 3179817804.0,
"eval_runtime": 606.2695,
"eval_samples_per_second": 15.927,
"eval_steps_per_second": 0.2,
"step": 30000
},
{
"entropy": 1.34265625,
"epoch": 0.7357622055726948,
"grad_norm": 2.828125,
"learning_rate": 4.489715426094674e-06,
"loss": 0.1971,
"mean_token_accuracy": 0.9590841460227967,
"num_tokens": 3185695558.0,
"step": 30050
},
{
"entropy": 1.33234375,
"epoch": 0.7369864355320503,
"grad_norm": 2.28125,
"learning_rate": 4.484441261832107e-06,
"loss": 0.1767,
"mean_token_accuracy": 0.9629596638679504,
"num_tokens": 3191177099.0,
"step": 30100
},
{
"entropy": 1.3253125,
"epoch": 0.7382106654914059,
"grad_norm": 2.75,
"learning_rate": 4.479161014953903e-06,
"loss": 0.1795,
"mean_token_accuracy": 0.9617591965198516,
"num_tokens": 3196688072.0,
"step": 30150
},
{
"entropy": 1.3171875,
"epoch": 0.7394348954507615,
"grad_norm": 2.578125,
"learning_rate": 4.473874707096293e-06,
"loss": 0.185,
"mean_token_accuracy": 0.9615085804462433,
"num_tokens": 3202252950.0,
"step": 30200
},
{
"entropy": 1.3203125,
"epoch": 0.740659125410117,
"grad_norm": 3.078125,
"learning_rate": 4.46858235992034e-06,
"loss": 0.1716,
"mean_token_accuracy": 0.9639656889438629,
"num_tokens": 3207720004.0,
"step": 30250
},
{
"entropy": 1.33046875,
"epoch": 0.7418833553694726,
"grad_norm": 3.4375,
"learning_rate": 4.463283995111858e-06,
"loss": 0.1909,
"mean_token_accuracy": 0.9597360849380493,
"num_tokens": 3213270190.0,
"step": 30300
},
{
"entropy": 1.32171875,
"epoch": 0.7431075853288281,
"grad_norm": 3.671875,
"learning_rate": 4.4579796343813155e-06,
"loss": 0.1746,
"mean_token_accuracy": 0.9631195080280304,
"num_tokens": 3218354333.0,
"step": 30350
},
{
"entropy": 1.3359375,
"epoch": 0.7443318152881837,
"grad_norm": 3.15625,
"learning_rate": 4.452669299463749e-06,
"loss": 0.172,
"mean_token_accuracy": 0.963985036611557,
"num_tokens": 3223570126.0,
"step": 30400
},
{
"entropy": 1.32640625,
"epoch": 0.7455560452475393,
"grad_norm": 1.8125,
"learning_rate": 4.44735301211868e-06,
"loss": 0.1807,
"mean_token_accuracy": 0.9622200524806976,
"num_tokens": 3228934737.0,
"step": 30450
},
{
"entropy": 1.34375,
"epoch": 0.7467802752068948,
"grad_norm": 3.109375,
"learning_rate": 4.442030794130013e-06,
"loss": 0.1719,
"mean_token_accuracy": 0.9641703021526337,
"num_tokens": 3234092609.0,
"step": 30500
},
{
"entropy": 1.3525,
"epoch": 0.7480045051662504,
"grad_norm": 2.328125,
"learning_rate": 4.43670266730596e-06,
"loss": 0.1871,
"mean_token_accuracy": 0.9610934937000275,
"num_tokens": 3239470570.0,
"step": 30550
},
{
"entropy": 1.35859375,
"epoch": 0.749228735125606,
"grad_norm": 3.234375,
"learning_rate": 4.431368653478943e-06,
"loss": 0.1799,
"mean_token_accuracy": 0.9625358593463897,
"num_tokens": 3245129970.0,
"step": 30600
},
{
"entropy": 1.36859375,
"epoch": 0.7504529650849615,
"grad_norm": 1.921875,
"learning_rate": 4.426028774505504e-06,
"loss": 0.1895,
"mean_token_accuracy": 0.9608589220046997,
"num_tokens": 3250417534.0,
"step": 30650
},
{
"entropy": 1.37203125,
"epoch": 0.7516771950443171,
"grad_norm": 3.125,
"learning_rate": 4.420683052266223e-06,
"loss": 0.1962,
"mean_token_accuracy": 0.9591640889644623,
"num_tokens": 3256202020.0,
"step": 30700
},
{
"entropy": 1.35421875,
"epoch": 0.7529014250036727,
"grad_norm": 2.84375,
"learning_rate": 4.415331508665619e-06,
"loss": 0.1723,
"mean_token_accuracy": 0.9638619077205658,
"num_tokens": 3261559010.0,
"step": 30750
},
{
"entropy": 1.36328125,
"epoch": 0.7541256549630283,
"grad_norm": 3.5625,
"learning_rate": 4.409974165632064e-06,
"loss": 0.1819,
"mean_token_accuracy": 0.9618020045757294,
"num_tokens": 3267151095.0,
"step": 30800
},
{
"entropy": 1.3546875,
"epoch": 0.7553498849223839,
"grad_norm": 3.484375,
"learning_rate": 4.404611045117696e-06,
"loss": 0.1792,
"mean_token_accuracy": 0.9617926621437073,
"num_tokens": 3272412916.0,
"step": 30850
},
{
"entropy": 1.3534375,
"epoch": 0.7565741148817394,
"grad_norm": 2.578125,
"learning_rate": 4.399242169098329e-06,
"loss": 0.1745,
"mean_token_accuracy": 0.9625967741012573,
"num_tokens": 3277577448.0,
"step": 30900
},
{
"entropy": 1.35625,
"epoch": 0.757798344841095,
"grad_norm": 2.65625,
"learning_rate": 4.393867559573354e-06,
"loss": 0.1744,
"mean_token_accuracy": 0.9626732635498046,
"num_tokens": 3282706579.0,
"step": 30950
},
{
"entropy": 1.36421875,
"epoch": 0.7590225748004505,
"grad_norm": 3.0,
"learning_rate": 4.388487238565661e-06,
"loss": 0.1784,
"mean_token_accuracy": 0.9623777115345001,
"num_tokens": 3287949862.0,
"step": 31000
},
{
"entropy": 1.36,
"epoch": 0.7602468047598061,
"grad_norm": 0.0230712890625,
"learning_rate": 4.383101228121541e-06,
"loss": 0.1788,
"mean_token_accuracy": 0.9617887794971466,
"num_tokens": 3293406088.0,
"step": 31050
},
{
"entropy": 1.35609375,
"epoch": 0.7614710347191617,
"grad_norm": 2.984375,
"learning_rate": 4.377709550310598e-06,
"loss": 0.1699,
"mean_token_accuracy": 0.9636480760574341,
"num_tokens": 3298608896.0,
"step": 31100
},
{
"entropy": 1.35375,
"epoch": 0.7626952646785172,
"grad_norm": 3.65625,
"learning_rate": 4.37231222722566e-06,
"loss": 0.1643,
"mean_token_accuracy": 0.9644955229759217,
"num_tokens": 3303290550.0,
"step": 31150
},
{
"entropy": 1.37390625,
"epoch": 0.7639194946378728,
"grad_norm": 2.46875,
"learning_rate": 4.366909280982685e-06,
"loss": 0.1766,
"mean_token_accuracy": 0.9628056597709655,
"num_tokens": 3308295645.0,
"step": 31200
},
{
"entropy": 1.36515625,
"epoch": 0.7651437245972283,
"grad_norm": 2.453125,
"learning_rate": 4.361500733720674e-06,
"loss": 0.1662,
"mean_token_accuracy": 0.9649233341217041,
"num_tokens": 3313438478.0,
"step": 31250
},
{
"entropy": 1.3575,
"epoch": 0.7663679545565839,
"grad_norm": 4.46875,
"learning_rate": 4.356086607601575e-06,
"loss": 0.1749,
"mean_token_accuracy": 0.9627750849723816,
"num_tokens": 3319025887.0,
"step": 31300
},
{
"entropy": 1.34359375,
"epoch": 0.7675921845159395,
"grad_norm": 2.6875,
"learning_rate": 4.350666924810203e-06,
"loss": 0.1647,
"mean_token_accuracy": 0.9644002187252044,
"num_tokens": 3323975976.0,
"step": 31350
},
{
"entropy": 1.35203125,
"epoch": 0.768816414475295,
"grad_norm": 3.765625,
"learning_rate": 4.345241707554134e-06,
"loss": 0.1674,
"mean_token_accuracy": 0.9647248589992523,
"num_tokens": 3329356054.0,
"step": 31400
},
{
"entropy": 1.36625,
"epoch": 0.7700406444346506,
"grad_norm": 2.6875,
"learning_rate": 4.339810978063626e-06,
"loss": 0.1776,
"mean_token_accuracy": 0.9627327370643616,
"num_tokens": 3334739313.0,
"step": 31450
},
{
"entropy": 1.35125,
"epoch": 0.7712648743940062,
"grad_norm": 1.875,
"learning_rate": 4.334374758591524e-06,
"loss": 0.1896,
"mean_token_accuracy": 0.9596246099472046,
"num_tokens": 3340200973.0,
"step": 31500
},
{
"entropy": 1.36171875,
"epoch": 0.7724891043533617,
"grad_norm": 2.328125,
"learning_rate": 4.328933071413168e-06,
"loss": 0.1731,
"mean_token_accuracy": 0.9636253571510315,
"num_tokens": 3345689303.0,
"step": 31550
},
{
"entropy": 1.36078125,
"epoch": 0.7737133343127173,
"grad_norm": 3.4375,
"learning_rate": 4.323485938826302e-06,
"loss": 0.1896,
"mean_token_accuracy": 0.9603872370719909,
"num_tokens": 3350984033.0,
"step": 31600
},
{
"entropy": 1.3403125,
"epoch": 0.7749375642720728,
"grad_norm": 2.5,
"learning_rate": 4.318033383150981e-06,
"loss": 0.1735,
"mean_token_accuracy": 0.9628359317779541,
"num_tokens": 3356162417.0,
"step": 31650
},
{
"entropy": 1.34640625,
"epoch": 0.7761617942314284,
"grad_norm": 2.0625,
"learning_rate": 4.312575426729486e-06,
"loss": 0.1848,
"mean_token_accuracy": 0.9605207931995392,
"num_tokens": 3361647453.0,
"step": 31700
},
{
"entropy": 1.33171875,
"epoch": 0.777386024190784,
"grad_norm": 1.9453125,
"learning_rate": 4.307112091926226e-06,
"loss": 0.1637,
"mean_token_accuracy": 0.965142446756363,
"num_tokens": 3366481444.0,
"step": 31750
},
{
"entropy": 1.37390625,
"epoch": 0.7786102541501395,
"grad_norm": 3.1875,
"learning_rate": 4.301643401127647e-06,
"loss": 0.1778,
"mean_token_accuracy": 0.9628903007507325,
"num_tokens": 3371649682.0,
"step": 31800
},
{
"entropy": 1.3721875,
"epoch": 0.7798344841094951,
"grad_norm": 2.625,
"learning_rate": 4.2961693767421435e-06,
"loss": 0.1645,
"mean_token_accuracy": 0.9658307003974914,
"num_tokens": 3376382887.0,
"step": 31850
},
{
"entropy": 1.358125,
"epoch": 0.7810587140688507,
"grad_norm": 2.921875,
"learning_rate": 4.290690041199963e-06,
"loss": 0.179,
"mean_token_accuracy": 0.9622143077850341,
"num_tokens": 3381791030.0,
"step": 31900
},
{
"entropy": 1.37015625,
"epoch": 0.7822829440282063,
"grad_norm": 2.125,
"learning_rate": 4.285205416953118e-06,
"loss": 0.1876,
"mean_token_accuracy": 0.9609373700618744,
"num_tokens": 3387334981.0,
"step": 31950
},
{
"entropy": 1.34765625,
"epoch": 0.7835071739875619,
"grad_norm": 2.515625,
"learning_rate": 4.279715526475289e-06,
"loss": 0.1762,
"mean_token_accuracy": 0.962603681087494,
"num_tokens": 3392713314.0,
"step": 32000
},
{
"entropy": 1.3678125,
"epoch": 0.7847314039469174,
"grad_norm": 2.609375,
"learning_rate": 4.274220392261738e-06,
"loss": 0.1887,
"mean_token_accuracy": 0.9606349515914917,
"num_tokens": 3398537796.0,
"step": 32050
},
{
"entropy": 1.33734375,
"epoch": 0.785955633906273,
"grad_norm": 2.921875,
"learning_rate": 4.268720036829214e-06,
"loss": 0.1748,
"mean_token_accuracy": 0.964071912765503,
"num_tokens": 3403920236.0,
"step": 32100
},
{
"entropy": 1.37,
"epoch": 0.7871798638656285,
"grad_norm": 2.328125,
"learning_rate": 4.263214482715857e-06,
"loss": 0.1654,
"mean_token_accuracy": 0.9644496822357178,
"num_tokens": 3409108918.0,
"step": 32150
},
{
"entropy": 1.35046875,
"epoch": 0.7884040938249841,
"grad_norm": 3.125,
"learning_rate": 4.2577037524811104e-06,
"loss": 0.1714,
"mean_token_accuracy": 0.9636311101913452,
"num_tokens": 3414387238.0,
"step": 32200
},
{
"entropy": 1.34359375,
"epoch": 0.7896283237843397,
"grad_norm": 2.328125,
"learning_rate": 4.25218786870563e-06,
"loss": 0.1552,
"mean_token_accuracy": 0.965884006023407,
"num_tokens": 3419148471.0,
"step": 32250
},
{
"entropy": 1.34875,
"epoch": 0.7908525537436952,
"grad_norm": 0.004241943359375,
"learning_rate": 4.246666853991186e-06,
"loss": 0.1676,
"mean_token_accuracy": 0.9639466750621796,
"num_tokens": 3424295496.0,
"step": 32300
},
{
"entropy": 1.364375,
"epoch": 0.7920767837030508,
"grad_norm": 1.6953125,
"learning_rate": 4.241140730960573e-06,
"loss": 0.1829,
"mean_token_accuracy": 0.9615444934368134,
"num_tokens": 3429846223.0,
"step": 32350
},
{
"entropy": 1.33828125,
"epoch": 0.7933010136624064,
"grad_norm": 3.53125,
"learning_rate": 4.235609522257517e-06,
"loss": 0.178,
"mean_token_accuracy": 0.9621382772922515,
"num_tokens": 3434814232.0,
"step": 32400
},
{
"entropy": 1.37265625,
"epoch": 0.7945252436217619,
"grad_norm": 2.28125,
"learning_rate": 4.230073250546585e-06,
"loss": 0.1854,
"mean_token_accuracy": 0.9616455745697021,
"num_tokens": 3440013747.0,
"step": 32450
},
{
"entropy": 1.33484375,
"epoch": 0.7957494735811175,
"grad_norm": 2.828125,
"learning_rate": 4.224531938513088e-06,
"loss": 0.175,
"mean_token_accuracy": 0.9632323062419892,
"num_tokens": 3445299571.0,
"step": 32500
},
{
"entropy": 1.34203125,
"epoch": 0.796973703540473,
"grad_norm": 2.421875,
"learning_rate": 4.218985608862992e-06,
"loss": 0.1814,
"mean_token_accuracy": 0.9623367011547088,
"num_tokens": 3450664579.0,
"step": 32550
},
{
"entropy": 1.3540625,
"epoch": 0.7981979334998286,
"grad_norm": 2.0625,
"learning_rate": 4.213434284322819e-06,
"loss": 0.1729,
"mean_token_accuracy": 0.9627703261375428,
"num_tokens": 3455979121.0,
"step": 32600
},
{
"entropy": 1.33734375,
"epoch": 0.7994221634591842,
"grad_norm": 2.796875,
"learning_rate": 4.207877987639566e-06,
"loss": 0.1764,
"mean_token_accuracy": 0.9627932643890381,
"num_tokens": 3461283678.0,
"step": 32650
},
{
"entropy": 1.3596875,
"epoch": 0.8006463934185397,
"grad_norm": 1.8984375,
"learning_rate": 4.202316741580594e-06,
"loss": 0.1854,
"mean_token_accuracy": 0.9612032771110535,
"num_tokens": 3467126201.0,
"step": 32700
},
{
"entropy": 1.344375,
"epoch": 0.8018706233778953,
"grad_norm": 2.921875,
"learning_rate": 4.196750568933551e-06,
"loss": 0.1721,
"mean_token_accuracy": 0.9638476753234864,
"num_tokens": 3472599559.0,
"step": 32750
},
{
"entropy": 1.3415625,
"epoch": 0.8030948533372508,
"grad_norm": 2.34375,
"learning_rate": 4.191179492506271e-06,
"loss": 0.1754,
"mean_token_accuracy": 0.9628195893764496,
"num_tokens": 3477994415.0,
"step": 32800
},
{
"entropy": 1.34953125,
"epoch": 0.8043190832966064,
"grad_norm": 2.15625,
"learning_rate": 4.18560353512668e-06,
"loss": 0.1778,
"mean_token_accuracy": 0.9618653762340545,
"num_tokens": 3483437386.0,
"step": 32850
},
{
"entropy": 1.34390625,
"epoch": 0.805543313255962,
"grad_norm": 2.875,
"learning_rate": 4.1800227196427055e-06,
"loss": 0.1751,
"mean_token_accuracy": 0.9623115694522858,
"num_tokens": 3488795577.0,
"step": 32900
},
{
"entropy": 1.32609375,
"epoch": 0.8067675432153175,
"grad_norm": 1.9921875,
"learning_rate": 4.17443706892218e-06,
"loss": 0.1766,
"mean_token_accuracy": 0.9626455020904541,
"num_tokens": 3494139245.0,
"step": 32950
},
{
"entropy": 1.34953125,
"epoch": 0.8079917731746731,
"grad_norm": 3.640625,
"learning_rate": 4.168846605852751e-06,
"loss": 0.1811,
"mean_token_accuracy": 0.9624789762496948,
"num_tokens": 3499294686.0,
"step": 33000
},
{
"entropy": 1.34546875,
"epoch": 0.8092160031340287,
"grad_norm": 3.234375,
"learning_rate": 4.1632513533417825e-06,
"loss": 0.1629,
"mean_token_accuracy": 0.9650925529003144,
"num_tokens": 3504042622.0,
"step": 33050
},
{
"entropy": 1.3675,
"epoch": 0.8104402330933843,
"grad_norm": 1.8984375,
"learning_rate": 4.157651334316264e-06,
"loss": 0.159,
"mean_token_accuracy": 0.9659399092197418,
"num_tokens": 3509103882.0,
"step": 33100
},
{
"entropy": 1.35625,
"epoch": 0.8116644630527399,
"grad_norm": 1.9765625,
"learning_rate": 4.1520465717227206e-06,
"loss": 0.1782,
"mean_token_accuracy": 0.9628897225856781,
"num_tokens": 3514150747.0,
"step": 33150
},
{
"entropy": 1.3603125,
"epoch": 0.8128886930120954,
"grad_norm": 2.859375,
"learning_rate": 4.146437088527108e-06,
"loss": 0.1811,
"mean_token_accuracy": 0.9617001414299011,
"num_tokens": 3519220750.0,
"step": 33200
},
{
"entropy": 1.36859375,
"epoch": 0.814112922971451,
"grad_norm": 2.921875,
"learning_rate": 4.140822907714728e-06,
"loss": 0.1885,
"mean_token_accuracy": 0.9607588303089142,
"num_tokens": 3524668178.0,
"step": 33250
},
{
"entropy": 1.35484375,
"epoch": 0.8153371529308066,
"grad_norm": 1.6015625,
"learning_rate": 4.135204052290131e-06,
"loss": 0.1645,
"mean_token_accuracy": 0.9654926788806916,
"num_tokens": 3529737924.0,
"step": 33300
},
{
"entropy": 1.33109375,
"epoch": 0.8165613828901621,
"grad_norm": 3.0,
"learning_rate": 4.129580545277023e-06,
"loss": 0.1637,
"mean_token_accuracy": 0.9648844826221467,
"num_tokens": 3534673592.0,
"step": 33350
},
{
"entropy": 1.33046875,
"epoch": 0.8177856128495177,
"grad_norm": 2.1875,
"learning_rate": 4.123952409718169e-06,
"loss": 0.1705,
"mean_token_accuracy": 0.963813624382019,
"num_tokens": 3539705624.0,
"step": 33400
},
{
"entropy": 1.3225,
"epoch": 0.8190098428088732,
"grad_norm": 2.65625,
"learning_rate": 4.118319668675301e-06,
"loss": 0.1607,
"mean_token_accuracy": 0.9656564962863922,
"num_tokens": 3544723634.0,
"step": 33450
},
{
"entropy": 1.34328125,
"epoch": 0.8202340727682288,
"grad_norm": 3.625,
"learning_rate": 4.112682345229019e-06,
"loss": 0.1858,
"mean_token_accuracy": 0.9613649821281434,
"num_tokens": 3550196451.0,
"step": 33500
},
{
"entropy": 1.34546875,
"epoch": 0.8214583027275844,
"grad_norm": 2.375,
"learning_rate": 4.107040462478706e-06,
"loss": 0.1698,
"mean_token_accuracy": 0.9640332353115082,
"num_tokens": 3555769583.0,
"step": 33550
},
{
"entropy": 1.35515625,
"epoch": 0.8226825326869399,
"grad_norm": 4.6875,
"learning_rate": 4.101394043542421e-06,
"loss": 0.1781,
"mean_token_accuracy": 0.9626898431777954,
"num_tokens": 3560775725.0,
"step": 33600
},
{
"entropy": 1.37046875,
"epoch": 0.8239067626462955,
"grad_norm": 2.828125,
"learning_rate": 4.095743111556813e-06,
"loss": 0.1822,
"mean_token_accuracy": 0.9615408968925476,
"num_tokens": 3566233997.0,
"step": 33650
},
{
"entropy": 1.3565625,
"epoch": 0.825130992605651,
"grad_norm": 2.6875,
"learning_rate": 4.090087689677025e-06,
"loss": 0.1798,
"mean_token_accuracy": 0.9622524130344391,
"num_tokens": 3571629994.0,
"step": 33700
},
{
"entropy": 1.35453125,
"epoch": 0.8263552225650066,
"grad_norm": 2.34375,
"learning_rate": 4.084427801076592e-06,
"loss": 0.1631,
"mean_token_accuracy": 0.965935331583023,
"num_tokens": 3576662114.0,
"step": 33750
},
{
"entropy": 1.36453125,
"epoch": 0.8275794525243622,
"grad_norm": 2.609375,
"learning_rate": 4.0787634689473605e-06,
"loss": 0.1704,
"mean_token_accuracy": 0.9641584491729737,
"num_tokens": 3581699530.0,
"step": 33800
},
{
"entropy": 1.33421875,
"epoch": 0.8288036824837177,
"grad_norm": 3.71875,
"learning_rate": 4.0730947164993775e-06,
"loss": 0.1746,
"mean_token_accuracy": 0.9626482093334198,
"num_tokens": 3586891414.0,
"step": 33850
},
{
"entropy": 1.34828125,
"epoch": 0.8300279124430733,
"grad_norm": 2.9375,
"learning_rate": 4.067421566960805e-06,
"loss": 0.173,
"mean_token_accuracy": 0.9637481319904327,
"num_tokens": 3591845863.0,
"step": 33900
},
{
"entropy": 1.32796875,
"epoch": 0.8312521424024288,
"grad_norm": 4.3125,
"learning_rate": 4.061744043577822e-06,
"loss": 0.1826,
"mean_token_accuracy": 0.960258857011795,
"num_tokens": 3597325814.0,
"step": 33950
},
{
"entropy": 1.343125,
"epoch": 0.8324763723617844,
"grad_norm": 3.65625,
"learning_rate": 4.056062169614533e-06,
"loss": 0.1788,
"mean_token_accuracy": 0.9624998271465302,
"num_tokens": 3602589177.0,
"step": 34000
},
{
"entropy": 1.33171875,
"epoch": 0.83370060232114,
"grad_norm": 5.5,
"learning_rate": 4.050375968352865e-06,
"loss": 0.1749,
"mean_token_accuracy": 0.9635978293418884,
"num_tokens": 3607686315.0,
"step": 34050
},
{
"entropy": 1.35046875,
"epoch": 0.8349248322804955,
"grad_norm": 2.921875,
"learning_rate": 4.044685463092477e-06,
"loss": 0.1823,
"mean_token_accuracy": 0.9619014573097229,
"num_tokens": 3613032357.0,
"step": 34100
},
{
"entropy": 1.3278125,
"epoch": 0.8361490622398511,
"grad_norm": 3.796875,
"learning_rate": 4.0389906771506666e-06,
"loss": 0.1567,
"mean_token_accuracy": 0.9672730362415314,
"num_tokens": 3617947758.0,
"step": 34150
},
{
"entropy": 1.3509375,
"epoch": 0.8373732921992068,
"grad_norm": 3.0,
"learning_rate": 4.03329163386227e-06,
"loss": 0.1821,
"mean_token_accuracy": 0.9615289163589478,
"num_tokens": 3623324648.0,
"step": 34200
},
{
"entropy": 1.36625,
"epoch": 0.8385975221585623,
"grad_norm": 2.21875,
"learning_rate": 4.027588356579567e-06,
"loss": 0.1807,
"mean_token_accuracy": 0.962299063205719,
"num_tokens": 3628936189.0,
"step": 34250
},
{
"entropy": 1.34484375,
"epoch": 0.8398217521179179,
"grad_norm": 1.9375,
"learning_rate": 4.0218808686721884e-06,
"loss": 0.1766,
"mean_token_accuracy": 0.9632388269901275,
"num_tokens": 3634256824.0,
"step": 34300
},
{
"entropy": 1.3365625,
"epoch": 0.8410459820772734,
"grad_norm": 3.265625,
"learning_rate": 4.01616919352702e-06,
"loss": 0.1653,
"mean_token_accuracy": 0.9652460610866547,
"num_tokens": 3639058717.0,
"step": 34350
},
{
"entropy": 1.3490625,
"epoch": 0.842270212036629,
"grad_norm": 3.53125,
"learning_rate": 4.010453354548101e-06,
"loss": 0.1587,
"mean_token_accuracy": 0.9665447866916657,
"num_tokens": 3644031006.0,
"step": 34400
},
{
"entropy": 1.36546875,
"epoch": 0.8434944419959846,
"grad_norm": 2.0625,
"learning_rate": 4.004733375156534e-06,
"loss": 0.1862,
"mean_token_accuracy": 0.9608346676826477,
"num_tokens": 3649652142.0,
"step": 34450
},
{
"entropy": 1.36640625,
"epoch": 0.8447186719553401,
"grad_norm": 2.34375,
"learning_rate": 3.999009278790389e-06,
"loss": 0.1692,
"mean_token_accuracy": 0.9642466914653778,
"num_tokens": 3654831381.0,
"step": 34500
},
{
"entropy": 1.35890625,
"epoch": 0.8459429019146957,
"grad_norm": 1.96875,
"learning_rate": 3.993281088904603e-06,
"loss": 0.1659,
"mean_token_accuracy": 0.9651599872112274,
"num_tokens": 3659811312.0,
"step": 34550
},
{
"entropy": 1.36734375,
"epoch": 0.8471671318740512,
"grad_norm": 3.578125,
"learning_rate": 3.9875488289708895e-06,
"loss": 0.1693,
"mean_token_accuracy": 0.9640548026561737,
"num_tokens": 3665088140.0,
"step": 34600
},
{
"entropy": 1.35578125,
"epoch": 0.8483913618334068,
"grad_norm": 2.671875,
"learning_rate": 3.981812522477634e-06,
"loss": 0.1683,
"mean_token_accuracy": 0.9642880761623382,
"num_tokens": 3670199765.0,
"step": 34650
},
{
"entropy": 1.371875,
"epoch": 0.8496155917927624,
"grad_norm": 2.125,
"learning_rate": 3.976072192929812e-06,
"loss": 0.1859,
"mean_token_accuracy": 0.961214131116867,
"num_tokens": 3675973370.0,
"step": 34700
},
{
"entropy": 1.335,
"epoch": 0.8508398217521179,
"grad_norm": 2.234375,
"learning_rate": 3.970327863848874e-06,
"loss": 0.163,
"mean_token_accuracy": 0.9652379751205444,
"num_tokens": 3680935151.0,
"step": 34750
},
{
"entropy": 1.35953125,
"epoch": 0.8520640517114735,
"grad_norm": 3.984375,
"learning_rate": 3.964579558772665e-06,
"loss": 0.1686,
"mean_token_accuracy": 0.9643210101127625,
"num_tokens": 3686151191.0,
"step": 34800
},
{
"entropy": 1.35,
"epoch": 0.853288281670829,
"grad_norm": 2.46875,
"learning_rate": 3.95882730125532e-06,
"loss": 0.1755,
"mean_token_accuracy": 0.9624910676479339,
"num_tokens": 3691478654.0,
"step": 34850
},
{
"entropy": 1.338125,
"epoch": 0.8545125116301846,
"grad_norm": 2.109375,
"learning_rate": 3.953071114867171e-06,
"loss": 0.1711,
"mean_token_accuracy": 0.9633730280399323,
"num_tokens": 3696633906.0,
"step": 34900
},
{
"entropy": 1.34890625,
"epoch": 0.8557367415895402,
"grad_norm": 1.890625,
"learning_rate": 3.947311023194645e-06,
"loss": 0.1804,
"mean_token_accuracy": 0.9618865346908569,
"num_tokens": 3701978753.0,
"step": 34950
},
{
"entropy": 1.3384375,
"epoch": 0.8569609715488957,
"grad_norm": 2.203125,
"learning_rate": 3.941547049840176e-06,
"loss": 0.1645,
"mean_token_accuracy": 0.9649497640132904,
"num_tokens": 3706915348.0,
"step": 35000
},
{
"entropy": 1.32359375,
"epoch": 0.8581852015082513,
"grad_norm": 2.171875,
"learning_rate": 3.9357792184221005e-06,
"loss": 0.1739,
"mean_token_accuracy": 0.9632923007011414,
"num_tokens": 3712046907.0,
"step": 35050
},
{
"entropy": 1.3240625,
"epoch": 0.8594094314676068,
"grad_norm": 3.078125,
"learning_rate": 3.930007552574564e-06,
"loss": 0.1763,
"mean_token_accuracy": 0.9626149117946625,
"num_tokens": 3717274859.0,
"step": 35100
},
{
"entropy": 1.33484375,
"epoch": 0.8606336614269624,
"grad_norm": 3.03125,
"learning_rate": 3.924232075947427e-06,
"loss": 0.186,
"mean_token_accuracy": 0.9613423335552216,
"num_tokens": 3722674538.0,
"step": 35150
},
{
"entropy": 1.33484375,
"epoch": 0.861857891386318,
"grad_norm": 3.40625,
"learning_rate": 3.918452812206159e-06,
"loss": 0.1777,
"mean_token_accuracy": 0.9628440749645233,
"num_tokens": 3727975730.0,
"step": 35200
},
{
"entropy": 1.34125,
"epoch": 0.8630821213456735,
"grad_norm": 1.8359375,
"learning_rate": 3.9126697850317525e-06,
"loss": 0.1761,
"mean_token_accuracy": 0.963371901512146,
"num_tokens": 3733241093.0,
"step": 35250
},
{
"entropy": 1.34328125,
"epoch": 0.8643063513050291,
"grad_norm": 2.640625,
"learning_rate": 3.906883018120619e-06,
"loss": 0.1707,
"mean_token_accuracy": 0.9642481172084808,
"num_tokens": 3738164559.0,
"step": 35300
},
{
"entropy": 1.3203125,
"epoch": 0.8655305812643848,
"grad_norm": 3.546875,
"learning_rate": 3.901092535184496e-06,
"loss": 0.1713,
"mean_token_accuracy": 0.9637637650966644,
"num_tokens": 3743459921.0,
"step": 35350
},
{
"entropy": 1.35578125,
"epoch": 0.8667548112237403,
"grad_norm": 3.40625,
"learning_rate": 3.895298359950343e-06,
"loss": 0.1829,
"mean_token_accuracy": 0.9605180990695953,
"num_tokens": 3748868327.0,
"step": 35400
},
{
"entropy": 1.34265625,
"epoch": 0.8679790411830959,
"grad_norm": 2.125,
"learning_rate": 3.889500516160254e-06,
"loss": 0.1715,
"mean_token_accuracy": 0.9643005490303039,
"num_tokens": 3753748677.0,
"step": 35450
},
{
"entropy": 1.3384375,
"epoch": 0.8692032711424514,
"grad_norm": 2.375,
"learning_rate": 3.883699027571352e-06,
"loss": 0.1668,
"mean_token_accuracy": 0.965086680650711,
"num_tokens": 3759201853.0,
"step": 35500
},
{
"entropy": 1.34390625,
"epoch": 0.870427501101807,
"grad_norm": 3.25,
"learning_rate": 3.8778939179556976e-06,
"loss": 0.1694,
"mean_token_accuracy": 0.9643353164196015,
"num_tokens": 3764158638.0,
"step": 35550
},
{
"entropy": 1.33015625,
"epoch": 0.8716517310611626,
"grad_norm": 2.015625,
"learning_rate": 3.872085211100185e-06,
"loss": 0.1621,
"mean_token_accuracy": 0.9657464909553528,
"num_tokens": 3769226815.0,
"step": 35600
},
{
"entropy": 1.35078125,
"epoch": 0.8728759610205181,
"grad_norm": 1.890625,
"learning_rate": 3.86627293080645e-06,
"loss": 0.1836,
"mean_token_accuracy": 0.9611875438690185,
"num_tokens": 3774861819.0,
"step": 35650
},
{
"entropy": 1.34953125,
"epoch": 0.8741001909798737,
"grad_norm": 3.203125,
"learning_rate": 3.860457100890776e-06,
"loss": 0.1795,
"mean_token_accuracy": 0.9616686987876892,
"num_tokens": 3780181646.0,
"step": 35700
},
{
"entropy": 1.34359375,
"epoch": 0.8753244209392292,
"grad_norm": 3.046875,
"learning_rate": 3.854637745183983e-06,
"loss": 0.1762,
"mean_token_accuracy": 0.9630369508266449,
"num_tokens": 3785489246.0,
"step": 35750
},
{
"entropy": 1.3425,
"epoch": 0.8765486508985848,
"grad_norm": 1.953125,
"learning_rate": 3.848814887531342e-06,
"loss": 0.1865,
"mean_token_accuracy": 0.9609660315513611,
"num_tokens": 3790970702.0,
"step": 35800
},
{
"entropy": 1.3375,
"epoch": 0.8777728808579404,
"grad_norm": 2.890625,
"learning_rate": 3.842988551792473e-06,
"loss": 0.1666,
"mean_token_accuracy": 0.9646478390693665,
"num_tokens": 3796002667.0,
"step": 35850
},
{
"entropy": 1.33828125,
"epoch": 0.8789971108172959,
"grad_norm": 3.234375,
"learning_rate": 3.83715876184125e-06,
"loss": 0.1727,
"mean_token_accuracy": 0.9642738771438598,
"num_tokens": 3801134844.0,
"step": 35900
},
{
"entropy": 1.33859375,
"epoch": 0.8802213407766515,
"grad_norm": 2.8125,
"learning_rate": 3.831325541565699e-06,
"loss": 0.1714,
"mean_token_accuracy": 0.9640265047550202,
"num_tokens": 3806453829.0,
"step": 35950
},
{
"entropy": 1.34015625,
"epoch": 0.881445570736007,
"grad_norm": 3.046875,
"learning_rate": 3.825488914867901e-06,
"loss": 0.1762,
"mean_token_accuracy": 0.9627239561080932,
"num_tokens": 3811628461.0,
"step": 36000
},
{
"entropy": 1.35203125,
"epoch": 0.8826698006953626,
"grad_norm": 2.171875,
"learning_rate": 3.8196489056638965e-06,
"loss": 0.1849,
"mean_token_accuracy": 0.9613272595405579,
"num_tokens": 3816892701.0,
"step": 36050
},
{
"entropy": 1.34703125,
"epoch": 0.8838940306547182,
"grad_norm": 2.015625,
"learning_rate": 3.813805537883585e-06,
"loss": 0.1711,
"mean_token_accuracy": 0.9637981843948364,
"num_tokens": 3822028448.0,
"step": 36100
},
{
"entropy": 1.34875,
"epoch": 0.8851182606140737,
"grad_norm": 1.8671875,
"learning_rate": 3.80795883547063e-06,
"loss": 0.1672,
"mean_token_accuracy": 0.9647044801712036,
"num_tokens": 3827213092.0,
"step": 36150
},
{
"entropy": 1.34578125,
"epoch": 0.8863424905734293,
"grad_norm": 2.484375,
"learning_rate": 3.8021088223823558e-06,
"loss": 0.1927,
"mean_token_accuracy": 0.9597675764560699,
"num_tokens": 3832709039.0,
"step": 36200
},
{
"entropy": 1.33359375,
"epoch": 0.8875667205327848,
"grad_norm": 2.046875,
"learning_rate": 3.7962555225896563e-06,
"loss": 0.177,
"mean_token_accuracy": 0.9623324680328369,
"num_tokens": 3837879687.0,
"step": 36250
},
{
"entropy": 1.33890625,
"epoch": 0.8887909504921404,
"grad_norm": 2.328125,
"learning_rate": 3.790398960076891e-06,
"loss": 0.1769,
"mean_token_accuracy": 0.9629685461521149,
"num_tokens": 3843045671.0,
"step": 36300
},
{
"entropy": 1.32703125,
"epoch": 0.890015180451496,
"grad_norm": 3.09375,
"learning_rate": 3.7845391588417876e-06,
"loss": 0.173,
"mean_token_accuracy": 0.9636087584495544,
"num_tokens": 3848206427.0,
"step": 36350
},
{
"entropy": 1.32984375,
"epoch": 0.8912394104108515,
"grad_norm": 2.171875,
"learning_rate": 3.778676142895346e-06,
"loss": 0.1734,
"mean_token_accuracy": 0.9632059478759766,
"num_tokens": 3853828427.0,
"step": 36400
},
{
"entropy": 1.32390625,
"epoch": 0.8924636403702071,
"grad_norm": 2.5625,
"learning_rate": 3.772809936261739e-06,
"loss": 0.1894,
"mean_token_accuracy": 0.9601573574543,
"num_tokens": 3859273920.0,
"step": 36450
},
{
"entropy": 1.3265625,
"epoch": 0.8936878703295627,
"grad_norm": 2.875,
"learning_rate": 3.766940562978211e-06,
"loss": 0.1763,
"mean_token_accuracy": 0.9631186270713806,
"num_tokens": 3864494355.0,
"step": 36500
},
{
"entropy": 1.33109375,
"epoch": 0.8949121002889183,
"grad_norm": 2.71875,
"learning_rate": 3.761068047094987e-06,
"loss": 0.1736,
"mean_token_accuracy": 0.963892787694931,
"num_tokens": 3869689661.0,
"step": 36550
},
{
"entropy": 1.3115625,
"epoch": 0.8961363302482739,
"grad_norm": 2.625,
"learning_rate": 3.7551924126751624e-06,
"loss": 0.1832,
"mean_token_accuracy": 0.9618776285648346,
"num_tokens": 3875053980.0,
"step": 36600
},
{
"entropy": 1.3021875,
"epoch": 0.8973605602076294,
"grad_norm": 2.734375,
"learning_rate": 3.7493136837946177e-06,
"loss": 0.1749,
"mean_token_accuracy": 0.962455780506134,
"num_tokens": 3880568995.0,
"step": 36650
},
{
"entropy": 1.3209375,
"epoch": 0.898584790166985,
"grad_norm": 2.90625,
"learning_rate": 3.743431884541909e-06,
"loss": 0.1835,
"mean_token_accuracy": 0.9612640655040741,
"num_tokens": 3885898540.0,
"step": 36700
},
{
"entropy": 1.31390625,
"epoch": 0.8998090201263406,
"grad_norm": 2.75,
"learning_rate": 3.737547039018173e-06,
"loss": 0.1664,
"mean_token_accuracy": 0.9649625384807586,
"num_tokens": 3891014489.0,
"step": 36750
},
{
"entropy": 1.323125,
"epoch": 0.9010332500856961,
"grad_norm": 2.1875,
"learning_rate": 3.7316591713370315e-06,
"loss": 0.1774,
"mean_token_accuracy": 0.9622565031051635,
"num_tokens": 3896408077.0,
"step": 36800
},
{
"entropy": 1.34515625,
"epoch": 0.9022574800450517,
"grad_norm": 1.8203125,
"learning_rate": 3.7257683056244895e-06,
"loss": 0.178,
"mean_token_accuracy": 0.9631640148162842,
"num_tokens": 3901699376.0,
"step": 36850
},
{
"entropy": 1.32171875,
"epoch": 0.9034817100044072,
"grad_norm": 2.84375,
"learning_rate": 3.7198744660188347e-06,
"loss": 0.1578,
"mean_token_accuracy": 0.966994469165802,
"num_tokens": 3906644235.0,
"step": 36900
},
{
"entropy": 1.3284375,
"epoch": 0.9047059399637628,
"grad_norm": 1.8828125,
"learning_rate": 3.7139776766705433e-06,
"loss": 0.161,
"mean_token_accuracy": 0.9657053291797638,
"num_tokens": 3911529877.0,
"step": 36950
},
{
"entropy": 1.320625,
"epoch": 0.9059301699231184,
"grad_norm": 2.640625,
"learning_rate": 3.7080779617421733e-06,
"loss": 0.1663,
"mean_token_accuracy": 0.9647897446155548,
"num_tokens": 3917023608.0,
"step": 37000
},
{
"entropy": 1.3315625,
"epoch": 0.9071543998824739,
"grad_norm": 3.078125,
"learning_rate": 3.7021753454082772e-06,
"loss": 0.1851,
"mean_token_accuracy": 0.9609014749526977,
"num_tokens": 3922789580.0,
"step": 37050
},
{
"entropy": 1.31453125,
"epoch": 0.9083786298418295,
"grad_norm": 2.484375,
"learning_rate": 3.696269851855292e-06,
"loss": 0.1738,
"mean_token_accuracy": 0.9629218196868896,
"num_tokens": 3927904246.0,
"step": 37100
},
{
"entropy": 1.29828125,
"epoch": 0.909602859801185,
"grad_norm": 2.875,
"learning_rate": 3.6903615052814444e-06,
"loss": 0.1723,
"mean_token_accuracy": 0.96382728099823,
"num_tokens": 3933096610.0,
"step": 37150
},
{
"entropy": 1.275,
"epoch": 0.9108270897605406,
"grad_norm": 1.640625,
"learning_rate": 3.684450329896653e-06,
"loss": 0.1538,
"mean_token_accuracy": 0.9664675867557526,
"num_tokens": 3938208531.0,
"step": 37200
},
{
"entropy": 1.2990625,
"epoch": 0.9120513197198962,
"grad_norm": 1.6640625,
"learning_rate": 3.6785363499224266e-06,
"loss": 0.1676,
"mean_token_accuracy": 0.9638699948787689,
"num_tokens": 3943507764.0,
"step": 37250
},
{
"entropy": 1.29953125,
"epoch": 0.9132755496792517,
"grad_norm": 2.921875,
"learning_rate": 3.672619589591768e-06,
"loss": 0.1737,
"mean_token_accuracy": 0.9631060230731964,
"num_tokens": 3948883174.0,
"step": 37300
},
{
"entropy": 1.3128125,
"epoch": 0.9144997796386073,
"grad_norm": 3.34375,
"learning_rate": 3.6667000731490695e-06,
"loss": 0.1769,
"mean_token_accuracy": 0.9630844449996948,
"num_tokens": 3954228445.0,
"step": 37350
},
{
"entropy": 1.31328125,
"epoch": 0.9157240095979629,
"grad_norm": 2.71875,
"learning_rate": 3.660777824850019e-06,
"loss": 0.178,
"mean_token_accuracy": 0.9625172114372254,
"num_tokens": 3959522338.0,
"step": 37400
},
{
"entropy": 1.3109375,
"epoch": 0.9169482395573184,
"grad_norm": 3.875,
"learning_rate": 3.6548528689614985e-06,
"loss": 0.1615,
"mean_token_accuracy": 0.9651338791847229,
"num_tokens": 3964674293.0,
"step": 37450
},
{
"entropy": 1.3209375,
"epoch": 0.918172469516674,
"grad_norm": 2.765625,
"learning_rate": 3.6489252297614833e-06,
"loss": 0.1743,
"mean_token_accuracy": 0.9630649185180664,
"num_tokens": 3970201603.0,
"step": 37500
},
{
"entropy": 1.33578125,
"epoch": 0.9193966994760295,
"grad_norm": 1.8984375,
"learning_rate": 3.6429949315389455e-06,
"loss": 0.1792,
"mean_token_accuracy": 0.9619642412662506,
"num_tokens": 3975729221.0,
"step": 37550
},
{
"entropy": 1.3121875,
"epoch": 0.9206209294353851,
"grad_norm": 2.484375,
"learning_rate": 3.6370619985937513e-06,
"loss": 0.1658,
"mean_token_accuracy": 0.9639672470092774,
"num_tokens": 3980440332.0,
"step": 37600
},
{
"entropy": 1.314375,
"epoch": 0.9218451593947407,
"grad_norm": 2.0,
"learning_rate": 3.6311264552365634e-06,
"loss": 0.1748,
"mean_token_accuracy": 0.9630878198146821,
"num_tokens": 3985861602.0,
"step": 37650
},
{
"entropy": 1.31109375,
"epoch": 0.9230693893540963,
"grad_norm": 1.671875,
"learning_rate": 3.62518832578874e-06,
"loss": 0.1647,
"mean_token_accuracy": 0.9646557712554932,
"num_tokens": 3991141130.0,
"step": 37700
},
{
"entropy": 1.31140625,
"epoch": 0.9242936193134519,
"grad_norm": 1.6328125,
"learning_rate": 3.619247634582238e-06,
"loss": 0.1798,
"mean_token_accuracy": 0.961934734582901,
"num_tokens": 3996774043.0,
"step": 37750
},
{
"entropy": 1.326875,
"epoch": 0.9255178492728074,
"grad_norm": 2.078125,
"learning_rate": 3.6133044059595083e-06,
"loss": 0.1817,
"mean_token_accuracy": 0.9612915456295014,
"num_tokens": 4002462308.0,
"step": 37800
},
{
"entropy": 1.31359375,
"epoch": 0.926742079232163,
"grad_norm": 3.296875,
"learning_rate": 3.6073586642734027e-06,
"loss": 0.1779,
"mean_token_accuracy": 0.9622733199596405,
"num_tokens": 4007870657.0,
"step": 37850
},
{
"entropy": 1.3059375,
"epoch": 0.9279663091915186,
"grad_norm": 1.734375,
"learning_rate": 3.601410433887068e-06,
"loss": 0.1696,
"mean_token_accuracy": 0.9639555370807648,
"num_tokens": 4012925044.0,
"step": 37900
},
{
"entropy": 1.30625,
"epoch": 0.9291905391508741,
"grad_norm": 3.203125,
"learning_rate": 3.5954597391738487e-06,
"loss": 0.1749,
"mean_token_accuracy": 0.9627858221530914,
"num_tokens": 4018089645.0,
"step": 37950
},
{
"entropy": 1.3059375,
"epoch": 0.9304147691102297,
"grad_norm": 3.0,
"learning_rate": 3.589506604517189e-06,
"loss": 0.1668,
"mean_token_accuracy": 0.9654299330711364,
"num_tokens": 4023139809.0,
"step": 38000
},
{
"entropy": 1.32140625,
"epoch": 0.9316389990695852,
"grad_norm": 1.765625,
"learning_rate": 3.583551054310529e-06,
"loss": 0.1743,
"mean_token_accuracy": 0.9638527107238769,
"num_tokens": 4028437262.0,
"step": 38050
},
{
"entropy": 1.31890625,
"epoch": 0.9328632290289408,
"grad_norm": 3.03125,
"learning_rate": 3.5775931129572072e-06,
"loss": 0.1658,
"mean_token_accuracy": 0.9640737462043762,
"num_tokens": 4033659635.0,
"step": 38100
},
{
"entropy": 1.31625,
"epoch": 0.9340874589882964,
"grad_norm": 4.28125,
"learning_rate": 3.57163280487036e-06,
"loss": 0.1742,
"mean_token_accuracy": 0.9627125465869903,
"num_tokens": 4039135210.0,
"step": 38150
},
{
"entropy": 1.31125,
"epoch": 0.9353116889476519,
"grad_norm": 2.546875,
"learning_rate": 3.5656701544728222e-06,
"loss": 0.1739,
"mean_token_accuracy": 0.9629321038722992,
"num_tokens": 4044192912.0,
"step": 38200
},
{
"entropy": 1.315625,
"epoch": 0.9365359189070075,
"grad_norm": 2.109375,
"learning_rate": 3.559705186197026e-06,
"loss": 0.1641,
"mean_token_accuracy": 0.9655595874786377,
"num_tokens": 4049649393.0,
"step": 38250
},
{
"entropy": 1.31484375,
"epoch": 0.937760148866363,
"grad_norm": 3.0625,
"learning_rate": 3.5537379244849017e-06,
"loss": 0.1739,
"mean_token_accuracy": 0.9634083175659179,
"num_tokens": 4054901732.0,
"step": 38300
},
{
"entropy": 1.31140625,
"epoch": 0.9389843788257186,
"grad_norm": 2.5625,
"learning_rate": 3.5477683937877755e-06,
"loss": 0.1694,
"mean_token_accuracy": 0.9634031581878663,
"num_tokens": 4060033796.0,
"step": 38350
},
{
"entropy": 1.30640625,
"epoch": 0.9402086087850742,
"grad_norm": 2.71875,
"learning_rate": 3.541796618566273e-06,
"loss": 0.1634,
"mean_token_accuracy": 0.9645454668998719,
"num_tokens": 4065362004.0,
"step": 38400
},
{
"entropy": 1.281875,
"epoch": 0.9414328387444297,
"grad_norm": 3.0625,
"learning_rate": 3.535822623290217e-06,
"loss": 0.1456,
"mean_token_accuracy": 0.9695195186138154,
"num_tokens": 4070167345.0,
"step": 38450
},
{
"entropy": 1.3134375,
"epoch": 0.9426570687037853,
"grad_norm": 0.004974365234375,
"learning_rate": 3.5298464324385246e-06,
"loss": 0.1636,
"mean_token_accuracy": 0.9643441307544708,
"num_tokens": 4075331852.0,
"step": 38500
},
{
"entropy": 1.29109375,
"epoch": 0.9438812986631409,
"grad_norm": 2.609375,
"learning_rate": 3.523868070499112e-06,
"loss": 0.1522,
"mean_token_accuracy": 0.9671092510223389,
"num_tokens": 4080329045.0,
"step": 38550
},
{
"entropy": 1.2840625,
"epoch": 0.9451055286224964,
"grad_norm": 3.75,
"learning_rate": 3.517887561968791e-06,
"loss": 0.1616,
"mean_token_accuracy": 0.9650249874591827,
"num_tokens": 4085382254.0,
"step": 38600
},
{
"entropy": 1.29203125,
"epoch": 0.946329758581852,
"grad_norm": 3.5,
"learning_rate": 3.5119049313531687e-06,
"loss": 0.1698,
"mean_token_accuracy": 0.9630708813667297,
"num_tokens": 4090736615.0,
"step": 38650
},
{
"entropy": 1.29640625,
"epoch": 0.9475539885412075,
"grad_norm": 2.28125,
"learning_rate": 3.5059202031665473e-06,
"loss": 0.1761,
"mean_token_accuracy": 0.962629064321518,
"num_tokens": 4096335023.0,
"step": 38700
},
{
"entropy": 1.28390625,
"epoch": 0.9487782185005631,
"grad_norm": 2.53125,
"learning_rate": 3.499933401931826e-06,
"loss": 0.1711,
"mean_token_accuracy": 0.9639296698570251,
"num_tokens": 4101408840.0,
"step": 38750
},
{
"entropy": 1.26234375,
"epoch": 0.9500024484599187,
"grad_norm": 3.515625,
"learning_rate": 3.493944552180395e-06,
"loss": 0.1548,
"mean_token_accuracy": 0.9661567640304566,
"num_tokens": 4106422813.0,
"step": 38800
},
{
"entropy": 1.2709375,
"epoch": 0.9512266784192743,
"grad_norm": 3.109375,
"learning_rate": 3.487953678452042e-06,
"loss": 0.1544,
"mean_token_accuracy": 0.9669099247455597,
"num_tokens": 4111614226.0,
"step": 38850
},
{
"entropy": 1.2715625,
"epoch": 0.9524509083786299,
"grad_norm": 3.1875,
"learning_rate": 3.481960805294847e-06,
"loss": 0.1652,
"mean_token_accuracy": 0.9649276435375214,
"num_tokens": 4116902981.0,
"step": 38900
},
{
"entropy": 1.26828125,
"epoch": 0.9536751383379855,
"grad_norm": 3.390625,
"learning_rate": 3.47596595726508e-06,
"loss": 0.1682,
"mean_token_accuracy": 0.9636393487453461,
"num_tokens": 4122056561.0,
"step": 38950
},
{
"entropy": 1.270625,
"epoch": 0.954899368297341,
"grad_norm": 2.125,
"learning_rate": 3.4699691589271076e-06,
"loss": 0.1685,
"mean_token_accuracy": 0.9632602989673614,
"num_tokens": 4127685041.0,
"step": 39000
},
{
"entropy": 1.2453125,
"epoch": 0.9561235982566966,
"grad_norm": 2.4375,
"learning_rate": 3.463970434853285e-06,
"loss": 0.142,
"mean_token_accuracy": 0.9697425818443298,
"num_tokens": 4132578966.0,
"step": 39050
},
{
"entropy": 1.258125,
"epoch": 0.9573478282160521,
"grad_norm": 2.375,
"learning_rate": 3.45796980962386e-06,
"loss": 0.1678,
"mean_token_accuracy": 0.9646705484390259,
"num_tokens": 4138012784.0,
"step": 39100
},
{
"entropy": 1.26859375,
"epoch": 0.9585720581754077,
"grad_norm": 1.7890625,
"learning_rate": 3.451967307826869e-06,
"loss": 0.1757,
"mean_token_accuracy": 0.9628133857250214,
"num_tokens": 4143616072.0,
"step": 39150
},
{
"entropy": 1.259375,
"epoch": 0.9597962881347633,
"grad_norm": 4.1875,
"learning_rate": 3.445962954058039e-06,
"loss": 0.1752,
"mean_token_accuracy": 0.962674834728241,
"num_tokens": 4148944121.0,
"step": 39200
},
{
"entropy": 1.25921875,
"epoch": 0.9610205180941188,
"grad_norm": 3.375,
"learning_rate": 3.439956772920685e-06,
"loss": 0.1648,
"mean_token_accuracy": 0.9645766019821167,
"num_tokens": 4153880493.0,
"step": 39250
},
{
"entropy": 1.2525,
"epoch": 0.9622447480534744,
"grad_norm": 3.015625,
"learning_rate": 3.4339487890256097e-06,
"loss": 0.161,
"mean_token_accuracy": 0.965018298625946,
"num_tokens": 4158921325.0,
"step": 39300
},
{
"entropy": 1.2459375,
"epoch": 0.9634689780128299,
"grad_norm": 3.5,
"learning_rate": 3.4279390269910033e-06,
"loss": 0.1658,
"mean_token_accuracy": 0.9649594247341156,
"num_tokens": 4163950443.0,
"step": 39350
},
{
"entropy": 1.2590625,
"epoch": 0.9646932079721855,
"grad_norm": 2.875,
"learning_rate": 3.421927511442341e-06,
"loss": 0.172,
"mean_token_accuracy": 0.9640387868881226,
"num_tokens": 4169489034.0,
"step": 39400
},
{
"entropy": 1.261875,
"epoch": 0.9659174379315411,
"grad_norm": 2.28125,
"learning_rate": 3.4159142670122845e-06,
"loss": 0.1719,
"mean_token_accuracy": 0.9637044394016265,
"num_tokens": 4174842337.0,
"step": 39450
},
{
"entropy": 1.26265625,
"epoch": 0.9671416678908966,
"grad_norm": 2.390625,
"learning_rate": 3.4098993183405793e-06,
"loss": 0.1725,
"mean_token_accuracy": 0.9634046721458435,
"num_tokens": 4180354181.0,
"step": 39500
},
{
"entropy": 1.26046875,
"epoch": 0.9683658978502522,
"grad_norm": 1.6484375,
"learning_rate": 3.403882690073954e-06,
"loss": 0.1653,
"mean_token_accuracy": 0.9639586913585663,
"num_tokens": 4185417059.0,
"step": 39550
},
{
"entropy": 1.27140625,
"epoch": 0.9695901278096077,
"grad_norm": 1.2421875,
"learning_rate": 3.3978644068660175e-06,
"loss": 0.1583,
"mean_token_accuracy": 0.96663733959198,
"num_tokens": 4190550088.0,
"step": 39600
},
{
"entropy": 1.28140625,
"epoch": 0.9708143577689633,
"grad_norm": 2.859375,
"learning_rate": 3.3918444933771637e-06,
"loss": 0.1755,
"mean_token_accuracy": 0.9624445605278015,
"num_tokens": 4196306371.0,
"step": 39650
},
{
"entropy": 1.27078125,
"epoch": 0.9720385877283189,
"grad_norm": 1.875,
"learning_rate": 3.385822974274465e-06,
"loss": 0.1673,
"mean_token_accuracy": 0.9644521117210388,
"num_tokens": 4201403065.0,
"step": 39700
},
{
"entropy": 1.2859375,
"epoch": 0.9732628176876744,
"grad_norm": 2.78125,
"learning_rate": 3.3797998742315724e-06,
"loss": 0.1646,
"mean_token_accuracy": 0.9653528666496277,
"num_tokens": 4206711792.0,
"step": 39750
},
{
"entropy": 1.2709375,
"epoch": 0.97448704764703,
"grad_norm": 3.625,
"learning_rate": 3.3737752179286158e-06,
"loss": 0.1694,
"mean_token_accuracy": 0.964444397687912,
"num_tokens": 4212047599.0,
"step": 39800
},
{
"entropy": 1.289375,
"epoch": 0.9757112776063855,
"grad_norm": 2.5,
"learning_rate": 3.3677490300521e-06,
"loss": 0.1697,
"mean_token_accuracy": 0.963803733587265,
"num_tokens": 4217477603.0,
"step": 39850
},
{
"entropy": 1.27140625,
"epoch": 0.9769355075657411,
"grad_norm": 2.25,
"learning_rate": 3.361721335294809e-06,
"loss": 0.1579,
"mean_token_accuracy": 0.9657166159152984,
"num_tokens": 4222290662.0,
"step": 39900
},
{
"entropy": 1.3034375,
"epoch": 0.9781597375250967,
"grad_norm": 2.03125,
"learning_rate": 3.355692158355699e-06,
"loss": 0.1816,
"mean_token_accuracy": 0.9609908378124237,
"num_tokens": 4228024616.0,
"step": 39950
},
{
"entropy": 1.2765625,
"epoch": 0.9793839674844523,
"grad_norm": 1.8515625,
"learning_rate": 3.349661523939799e-06,
"loss": 0.1549,
"mean_token_accuracy": 0.9669453859329223,
"num_tokens": 4233080108.0,
"step": 40000
},
{
"epoch": 0.9793839674844523,
"eval_entropy": 1.2830078125,
"eval_loss": 0.18154892325401306,
"eval_mean_token_accuracy": 0.9611844887336095,
"eval_num_tokens": 4233080108.0,
"eval_runtime": 601.7254,
"eval_samples_per_second": 16.047,
"eval_steps_per_second": 0.201,
"step": 40000
},
{
"entropy": 1.28,
"epoch": 0.9806081974438079,
"grad_norm": 3.40625,
"learning_rate": 3.3436294567581125e-06,
"loss": 0.1685,
"mean_token_accuracy": 0.9643000710010529,
"num_tokens": 4238491459.0,
"step": 40050
},
{
"entropy": 1.29109375,
"epoch": 0.9818324274031635,
"grad_norm": 1.9453125,
"learning_rate": 3.3375959815275103e-06,
"loss": 0.1714,
"mean_token_accuracy": 0.9640710878372193,
"num_tokens": 4244109418.0,
"step": 40100
},
{
"entropy": 1.27515625,
"epoch": 0.983056657362519,
"grad_norm": 0.029052734375,
"learning_rate": 3.3315611229706377e-06,
"loss": 0.1519,
"mean_token_accuracy": 0.9681409633159638,
"num_tokens": 4249250373.0,
"step": 40150
},
{
"entropy": 1.2784375,
"epoch": 0.9842808873218746,
"grad_norm": 2.09375,
"learning_rate": 3.325524905815804e-06,
"loss": 0.1654,
"mean_token_accuracy": 0.9648780179023743,
"num_tokens": 4254623197.0,
"step": 40200
},
{
"entropy": 1.2959375,
"epoch": 0.9855051172812301,
"grad_norm": 2.0,
"learning_rate": 3.3194873547968867e-06,
"loss": 0.1667,
"mean_token_accuracy": 0.963757860660553,
"num_tokens": 4260002335.0,
"step": 40250
},
{
"entropy": 1.28453125,
"epoch": 0.9867293472405857,
"grad_norm": 0.0078125,
"learning_rate": 3.313448494653232e-06,
"loss": 0.1738,
"mean_token_accuracy": 0.9633991587162017,
"num_tokens": 4265450665.0,
"step": 40300
},
{
"entropy": 1.26625,
"epoch": 0.9879535771999413,
"grad_norm": 2.90625,
"learning_rate": 3.3074083501295447e-06,
"loss": 0.1441,
"mean_token_accuracy": 0.9687767088413238,
"num_tokens": 4270155512.0,
"step": 40350
},
{
"entropy": 1.2728125,
"epoch": 0.9891778071592968,
"grad_norm": 2.15625,
"learning_rate": 3.3013669459757956e-06,
"loss": 0.1546,
"mean_token_accuracy": 0.9668355488777161,
"num_tokens": 4275174062.0,
"step": 40400
},
{
"entropy": 1.28171875,
"epoch": 0.9904020371186524,
"grad_norm": 2.421875,
"learning_rate": 3.2953243069471187e-06,
"loss": 0.1692,
"mean_token_accuracy": 0.9641734325885772,
"num_tokens": 4280291982.0,
"step": 40450
},
{
"entropy": 1.29375,
"epoch": 0.9916262670780079,
"grad_norm": 1.8828125,
"learning_rate": 3.2892804578037036e-06,
"loss": 0.1754,
"mean_token_accuracy": 0.9624480056762695,
"num_tokens": 4285827143.0,
"step": 40500
},
{
"entropy": 1.29921875,
"epoch": 0.9928504970373635,
"grad_norm": 3.328125,
"learning_rate": 3.2832354233107023e-06,
"loss": 0.1717,
"mean_token_accuracy": 0.9635557103157043,
"num_tokens": 4291196556.0,
"step": 40550
},
{
"entropy": 1.29515625,
"epoch": 0.9940747269967191,
"grad_norm": 2.046875,
"learning_rate": 3.2771892282381226e-06,
"loss": 0.1535,
"mean_token_accuracy": 0.9667463576793671,
"num_tokens": 4296297335.0,
"step": 40600
},
{
"entropy": 1.2765625,
"epoch": 0.9952989569560746,
"grad_norm": 2.15625,
"learning_rate": 3.2711418973607257e-06,
"loss": 0.1584,
"mean_token_accuracy": 0.9667293214797974,
"num_tokens": 4301506384.0,
"step": 40650
},
{
"entropy": 1.27078125,
"epoch": 0.9965231869154302,
"grad_norm": 2.234375,
"learning_rate": 3.2650934554579314e-06,
"loss": 0.1551,
"mean_token_accuracy": 0.9660876715183258,
"num_tokens": 4306603792.0,
"step": 40700
},
{
"entropy": 1.27515625,
"epoch": 0.9977474168747857,
"grad_norm": 4.15625,
"learning_rate": 3.2590439273137074e-06,
"loss": 0.1702,
"mean_token_accuracy": 0.9637362861633301,
"num_tokens": 4312148607.0,
"step": 40750
},
{
"entropy": 1.266875,
"epoch": 0.9989716468341413,
"grad_norm": 3.546875,
"learning_rate": 3.2529933377164754e-06,
"loss": 0.1498,
"mean_token_accuracy": 0.9686801016330719,
"num_tokens": 4317085828.0,
"step": 40800
},
{
"entropy": 1.2784375,
"epoch": 1.0001958767934969,
"grad_norm": 3.578125,
"learning_rate": 3.2469417114590055e-06,
"loss": 0.1627,
"mean_token_accuracy": 0.9648519742488861,
"num_tokens": 4322221289.0,
"step": 40850
},
{
"entropy": 1.28578125,
"epoch": 1.0014201067528525,
"grad_norm": 4.375,
"learning_rate": 3.240889073338315e-06,
"loss": 0.1602,
"mean_token_accuracy": 0.9657353925704956,
"num_tokens": 4327372960.0,
"step": 40900
},
{
"entropy": 1.268125,
"epoch": 1.002644336712208,
"grad_norm": 2.078125,
"learning_rate": 3.2348354481555692e-06,
"loss": 0.1607,
"mean_token_accuracy": 0.9653881311416626,
"num_tokens": 4332436449.0,
"step": 40950
},
{
"entropy": 1.27359375,
"epoch": 1.0038685666715637,
"grad_norm": 0.0013580322265625,
"learning_rate": 3.2287808607159753e-06,
"loss": 0.153,
"mean_token_accuracy": 0.9669638919830322,
"num_tokens": 4337572886.0,
"step": 41000
},
{
"entropy": 1.26375,
"epoch": 1.005092796630919,
"grad_norm": 2.609375,
"learning_rate": 3.222725335828685e-06,
"loss": 0.1474,
"mean_token_accuracy": 0.9681554007530212,
"num_tokens": 4342524064.0,
"step": 41050
},
{
"entropy": 1.290625,
"epoch": 1.0063170265902748,
"grad_norm": 2.796875,
"learning_rate": 3.216668898306692e-06,
"loss": 0.1723,
"mean_token_accuracy": 0.9632875370979309,
"num_tokens": 4347805365.0,
"step": 41100
},
{
"entropy": 1.28421875,
"epoch": 1.0075412565496302,
"grad_norm": 3.546875,
"learning_rate": 3.210611572966728e-06,
"loss": 0.1571,
"mean_token_accuracy": 0.9664819014072418,
"num_tokens": 4352875723.0,
"step": 41150
},
{
"entropy": 1.29171875,
"epoch": 1.008765486508986,
"grad_norm": 1.7421875,
"learning_rate": 3.2045533846291643e-06,
"loss": 0.1755,
"mean_token_accuracy": 0.9631037187576293,
"num_tokens": 4358561815.0,
"step": 41200
},
{
"entropy": 1.276875,
"epoch": 1.0099897164683413,
"grad_norm": 2.15625,
"learning_rate": 3.1984943581179053e-06,
"loss": 0.1553,
"mean_token_accuracy": 0.9667964303493499,
"num_tokens": 4363644242.0,
"step": 41250
},
{
"entropy": 1.30296875,
"epoch": 1.011213946427697,
"grad_norm": 3.96875,
"learning_rate": 3.1924345182602943e-06,
"loss": 0.1749,
"mean_token_accuracy": 0.9630448269844055,
"num_tokens": 4369318393.0,
"step": 41300
},
{
"entropy": 1.28875,
"epoch": 1.0124381763870525,
"grad_norm": 3.296875,
"learning_rate": 3.1863738898870033e-06,
"loss": 0.1669,
"mean_token_accuracy": 0.9647123277187347,
"num_tokens": 4374659681.0,
"step": 41350
},
{
"entropy": 1.27265625,
"epoch": 1.0136624063464081,
"grad_norm": 3.125,
"learning_rate": 3.180312497831938e-06,
"loss": 0.1567,
"mean_token_accuracy": 0.9661735820770264,
"num_tokens": 4379733438.0,
"step": 41400
},
{
"entropy": 1.28484375,
"epoch": 1.0148866363057636,
"grad_norm": 3.046875,
"learning_rate": 3.174250366932133e-06,
"loss": 0.1612,
"mean_token_accuracy": 0.9659793210029602,
"num_tokens": 4384885742.0,
"step": 41450
},
{
"entropy": 1.2934375,
"epoch": 1.0161108662651193,
"grad_norm": 3.65625,
"learning_rate": 3.1681875220276487e-06,
"loss": 0.1702,
"mean_token_accuracy": 0.9628891766071319,
"num_tokens": 4390251007.0,
"step": 41500
},
{
"entropy": 1.29703125,
"epoch": 1.0173350962244747,
"grad_norm": 2.546875,
"learning_rate": 3.1621239879614722e-06,
"loss": 0.1752,
"mean_token_accuracy": 0.9631851124763489,
"num_tokens": 4395820970.0,
"step": 41550
},
{
"entropy": 1.289375,
"epoch": 1.0185593261838304,
"grad_norm": 3.15625,
"learning_rate": 3.1560597895794157e-06,
"loss": 0.1651,
"mean_token_accuracy": 0.9643260395526886,
"num_tokens": 4401284321.0,
"step": 41600
},
{
"entropy": 1.3046875,
"epoch": 1.0197835561431858,
"grad_norm": 2.859375,
"learning_rate": 3.149994951730011e-06,
"loss": 0.1879,
"mean_token_accuracy": 0.9601117408275605,
"num_tokens": 4406768060.0,
"step": 41650
},
{
"entropy": 1.29484375,
"epoch": 1.0210077861025415,
"grad_norm": 3.0,
"learning_rate": 3.143929499264413e-06,
"loss": 0.1665,
"mean_token_accuracy": 0.9648369300365448,
"num_tokens": 4412201333.0,
"step": 41700
},
{
"entropy": 1.28328125,
"epoch": 1.0222320160618972,
"grad_norm": 3.40625,
"learning_rate": 3.137863457036292e-06,
"loss": 0.1533,
"mean_token_accuracy": 0.9676184570789337,
"num_tokens": 4417135073.0,
"step": 41750
},
{
"entropy": 1.3009375,
"epoch": 1.0234562460212526,
"grad_norm": 2.828125,
"learning_rate": 3.1317968499017366e-06,
"loss": 0.1742,
"mean_token_accuracy": 0.9627422571182251,
"num_tokens": 4422234270.0,
"step": 41800
},
{
"entropy": 1.29265625,
"epoch": 1.0246804759806083,
"grad_norm": 1.6875,
"learning_rate": 3.1257297027191517e-06,
"loss": 0.1579,
"mean_token_accuracy": 0.9664195513725281,
"num_tokens": 4427309878.0,
"step": 41850
},
{
"entropy": 1.275,
"epoch": 1.0259047059399637,
"grad_norm": 2.015625,
"learning_rate": 3.1196620403491515e-06,
"loss": 0.1651,
"mean_token_accuracy": 0.9644128286838531,
"num_tokens": 4432672891.0,
"step": 41900
},
{
"entropy": 1.2815625,
"epoch": 1.0271289358993194,
"grad_norm": 2.28125,
"learning_rate": 3.113593887654463e-06,
"loss": 0.1513,
"mean_token_accuracy": 0.9673609352111816,
"num_tokens": 4437526358.0,
"step": 41950
},
{
"entropy": 1.290625,
"epoch": 1.0283531658586749,
"grad_norm": 3.125,
"learning_rate": 3.107525269499825e-06,
"loss": 0.1706,
"mean_token_accuracy": 0.9627550756931305,
"num_tokens": 4442820350.0,
"step": 42000
},
{
"entropy": 1.29484375,
"epoch": 1.0295773958180305,
"grad_norm": 3.4375,
"learning_rate": 3.1014562107518786e-06,
"loss": 0.1684,
"mean_token_accuracy": 0.9646277678012848,
"num_tokens": 4448357734.0,
"step": 42050
},
{
"entropy": 1.300625,
"epoch": 1.030801625777386,
"grad_norm": 3.5625,
"learning_rate": 3.0953867362790734e-06,
"loss": 0.1802,
"mean_token_accuracy": 0.9611736404895782,
"num_tokens": 4453928087.0,
"step": 42100
},
{
"entropy": 1.29171875,
"epoch": 1.0320258557367417,
"grad_norm": 3.375,
"learning_rate": 3.089316870951562e-06,
"loss": 0.162,
"mean_token_accuracy": 0.9649739050865174,
"num_tokens": 4458946227.0,
"step": 42150
},
{
"entropy": 1.289375,
"epoch": 1.033250085696097,
"grad_norm": 3.21875,
"learning_rate": 3.083246639641098e-06,
"loss": 0.1723,
"mean_token_accuracy": 0.9634380388259888,
"num_tokens": 4464192504.0,
"step": 42200
},
{
"entropy": 1.3146875,
"epoch": 1.0344743156554528,
"grad_norm": 2.453125,
"learning_rate": 3.077176067220935e-06,
"loss": 0.1793,
"mean_token_accuracy": 0.9617934930324554,
"num_tokens": 4469999689.0,
"step": 42250
},
{
"entropy": 1.3025,
"epoch": 1.0356985456148082,
"grad_norm": 2.125,
"learning_rate": 3.0711051785657236e-06,
"loss": 0.1649,
"mean_token_accuracy": 0.964527097940445,
"num_tokens": 4475221088.0,
"step": 42300
},
{
"entropy": 1.29015625,
"epoch": 1.036922775574164,
"grad_norm": 2.84375,
"learning_rate": 3.065033998551413e-06,
"loss": 0.1741,
"mean_token_accuracy": 0.9632121896743775,
"num_tokens": 4480484467.0,
"step": 42350
},
{
"entropy": 1.29890625,
"epoch": 1.0381470055335194,
"grad_norm": 3.234375,
"learning_rate": 3.0589625520551414e-06,
"loss": 0.168,
"mean_token_accuracy": 0.9637061321735382,
"num_tokens": 4486042679.0,
"step": 42400
},
{
"entropy": 1.31703125,
"epoch": 1.039371235492875,
"grad_norm": 2.75,
"learning_rate": 3.0528908639551436e-06,
"loss": 0.1726,
"mean_token_accuracy": 0.9634595859050751,
"num_tokens": 4491749175.0,
"step": 42450
},
{
"entropy": 1.280625,
"epoch": 1.0405954654522305,
"grad_norm": 2.59375,
"learning_rate": 3.0468189591306418e-06,
"loss": 0.1637,
"mean_token_accuracy": 0.9648339354991913,
"num_tokens": 4497083391.0,
"step": 42500
},
{
"entropy": 1.275,
"epoch": 1.0418196954115861,
"grad_norm": 3.59375,
"learning_rate": 3.040746862461747e-06,
"loss": 0.1573,
"mean_token_accuracy": 0.9660842347145081,
"num_tokens": 4502213588.0,
"step": 42550
},
{
"entropy": 1.27265625,
"epoch": 1.0430439253709416,
"grad_norm": 2.078125,
"learning_rate": 3.0346745988293553e-06,
"loss": 0.1638,
"mean_token_accuracy": 0.9644993054866791,
"num_tokens": 4507601887.0,
"step": 42600
},
{
"entropy": 1.2703125,
"epoch": 1.0442681553302973,
"grad_norm": 2.234375,
"learning_rate": 3.02860219311505e-06,
"loss": 0.162,
"mean_token_accuracy": 0.965209093093872,
"num_tokens": 4512999351.0,
"step": 42650
},
{
"entropy": 1.2659375,
"epoch": 1.0454923852896527,
"grad_norm": 2.84375,
"learning_rate": 3.0225296702009917e-06,
"loss": 0.1708,
"mean_token_accuracy": 0.9636136376857758,
"num_tokens": 4518295845.0,
"step": 42700
},
{
"entropy": 1.27453125,
"epoch": 1.0467166152490084,
"grad_norm": 2.609375,
"learning_rate": 3.016457054969827e-06,
"loss": 0.165,
"mean_token_accuracy": 0.9648648130893708,
"num_tokens": 4523705084.0,
"step": 42750
},
{
"entropy": 1.27328125,
"epoch": 1.0479408452083638,
"grad_norm": 2.140625,
"learning_rate": 3.0103843723045753e-06,
"loss": 0.1587,
"mean_token_accuracy": 0.9660780084133148,
"num_tokens": 4528928559.0,
"step": 42800
},
{
"entropy": 1.27140625,
"epoch": 1.0491650751677195,
"grad_norm": 1.7265625,
"learning_rate": 3.004311647088536e-06,
"loss": 0.1608,
"mean_token_accuracy": 0.9661289596557617,
"num_tokens": 4534161929.0,
"step": 42850
},
{
"entropy": 1.28,
"epoch": 1.0503893051270752,
"grad_norm": 2.734375,
"learning_rate": 2.9982389042051802e-06,
"loss": 0.1596,
"mean_token_accuracy": 0.9655217385292053,
"num_tokens": 4539230226.0,
"step": 42900
},
{
"entropy": 1.27828125,
"epoch": 1.0516135350864306,
"grad_norm": 1.8359375,
"learning_rate": 2.992166168538055e-06,
"loss": 0.1654,
"mean_token_accuracy": 0.9645612442493439,
"num_tokens": 4544444757.0,
"step": 42950
},
{
"entropy": 1.28609375,
"epoch": 1.0528377650457863,
"grad_norm": 3.1875,
"learning_rate": 2.986093464970675e-06,
"loss": 0.1809,
"mean_token_accuracy": 0.961436516046524,
"num_tokens": 4550024290.0,
"step": 43000
},
{
"entropy": 1.2921875,
"epoch": 1.0540619950051417,
"grad_norm": 1.8046875,
"learning_rate": 2.9800208183864225e-06,
"loss": 0.1737,
"mean_token_accuracy": 0.9631437683105468,
"num_tokens": 4555846037.0,
"step": 43050
},
{
"entropy": 1.29046875,
"epoch": 1.0552862249644974,
"grad_norm": 2.859375,
"learning_rate": 2.97394825366845e-06,
"loss": 0.1824,
"mean_token_accuracy": 0.9611044287681579,
"num_tokens": 4561556919.0,
"step": 43100
},
{
"entropy": 1.2615625,
"epoch": 1.0565104549238529,
"grad_norm": 2.578125,
"learning_rate": 2.9678757956995704e-06,
"loss": 0.1519,
"mean_token_accuracy": 0.967376263141632,
"num_tokens": 4566754673.0,
"step": 43150
},
{
"entropy": 1.24921875,
"epoch": 1.0577346848832085,
"grad_norm": 3.5,
"learning_rate": 2.9618034693621624e-06,
"loss": 0.1651,
"mean_token_accuracy": 0.9647138011455536,
"num_tokens": 4571961153.0,
"step": 43200
},
{
"entropy": 1.27078125,
"epoch": 1.058958914842564,
"grad_norm": 4.0625,
"learning_rate": 2.955731299538065e-06,
"loss": 0.1664,
"mean_token_accuracy": 0.9643959999084473,
"num_tokens": 4577276643.0,
"step": 43250
},
{
"entropy": 1.27125,
"epoch": 1.0601831448019197,
"grad_norm": 3.5,
"learning_rate": 2.9496593111084725e-06,
"loss": 0.1764,
"mean_token_accuracy": 0.9621264743804931,
"num_tokens": 4582787780.0,
"step": 43300
},
{
"entropy": 1.2503125,
"epoch": 1.0614073747612751,
"grad_norm": 0.06201171875,
"learning_rate": 2.9435875289538397e-06,
"loss": 0.1616,
"mean_token_accuracy": 0.9652257537841797,
"num_tokens": 4587978646.0,
"step": 43350
},
{
"entropy": 1.25390625,
"epoch": 1.0626316047206308,
"grad_norm": 2.34375,
"learning_rate": 2.937515977953776e-06,
"loss": 0.1601,
"mean_token_accuracy": 0.9656472432613373,
"num_tokens": 4593105594.0,
"step": 43400
},
{
"entropy": 1.235,
"epoch": 1.0638558346799862,
"grad_norm": 1.796875,
"learning_rate": 2.93144468298694e-06,
"loss": 0.1465,
"mean_token_accuracy": 0.9684570038318634,
"num_tokens": 4598082227.0,
"step": 43450
},
{
"entropy": 1.2615625,
"epoch": 1.065080064639342,
"grad_norm": 1.75,
"learning_rate": 2.9253736689309453e-06,
"loss": 0.1739,
"mean_token_accuracy": 0.9627693855762481,
"num_tokens": 4603820936.0,
"step": 43500
},
{
"entropy": 1.2409375,
"epoch": 1.0663042945986974,
"grad_norm": 2.4375,
"learning_rate": 2.919302960662252e-06,
"loss": 0.1665,
"mean_token_accuracy": 0.9645286548137665,
"num_tokens": 4609111825.0,
"step": 43550
},
{
"entropy": 1.251875,
"epoch": 1.067528524558053,
"grad_norm": 2.265625,
"learning_rate": 2.9132325830560694e-06,
"loss": 0.1708,
"mean_token_accuracy": 0.9642206788063049,
"num_tokens": 4614988638.0,
"step": 43600
},
{
"entropy": 1.23515625,
"epoch": 1.0687527545174085,
"grad_norm": 3.109375,
"learning_rate": 2.907162560986249e-06,
"loss": 0.1665,
"mean_token_accuracy": 0.9648200106620789,
"num_tokens": 4620258466.0,
"step": 43650
},
{
"entropy": 1.23046875,
"epoch": 1.0699769844767641,
"grad_norm": 1.78125,
"learning_rate": 2.9010929193251877e-06,
"loss": 0.1587,
"mean_token_accuracy": 0.9666041648387909,
"num_tokens": 4625541440.0,
"step": 43700
},
{
"entropy": 1.23578125,
"epoch": 1.0712012144361196,
"grad_norm": 2.59375,
"learning_rate": 2.8950236829437243e-06,
"loss": 0.1595,
"mean_token_accuracy": 0.9665923917293548,
"num_tokens": 4630862596.0,
"step": 43750
},
{
"entropy": 1.24796875,
"epoch": 1.0724254443954753,
"grad_norm": 3.625,
"learning_rate": 2.8889548767110325e-06,
"loss": 0.1726,
"mean_token_accuracy": 0.9622351431846619,
"num_tokens": 4636080162.0,
"step": 43800
},
{
"entropy": 1.255,
"epoch": 1.0736496743548307,
"grad_norm": 2.984375,
"learning_rate": 2.882886525494528e-06,
"loss": 0.1677,
"mean_token_accuracy": 0.9641489648818969,
"num_tokens": 4641603830.0,
"step": 43850
},
{
"entropy": 1.26390625,
"epoch": 1.0748739043141864,
"grad_norm": 2.203125,
"learning_rate": 2.8768186541597617e-06,
"loss": 0.1803,
"mean_token_accuracy": 0.9621511352062225,
"num_tokens": 4647162733.0,
"step": 43900
},
{
"entropy": 1.26953125,
"epoch": 1.0760981342735418,
"grad_norm": 2.359375,
"learning_rate": 2.8707512875703146e-06,
"loss": 0.1724,
"mean_token_accuracy": 0.963198972940445,
"num_tokens": 4652659894.0,
"step": 43950
},
{
"entropy": 1.261875,
"epoch": 1.0773223642328975,
"grad_norm": 2.984375,
"learning_rate": 2.8646844505877032e-06,
"loss": 0.1702,
"mean_token_accuracy": 0.963871557712555,
"num_tokens": 4657833019.0,
"step": 44000
},
{
"entropy": 1.25171875,
"epoch": 1.078546594192253,
"grad_norm": 3.875,
"learning_rate": 2.8586181680712726e-06,
"loss": 0.1671,
"mean_token_accuracy": 0.9647689509391785,
"num_tokens": 4663099416.0,
"step": 44050
},
{
"entropy": 1.2353125,
"epoch": 1.0797708241516086,
"grad_norm": 1.921875,
"learning_rate": 2.852552464878096e-06,
"loss": 0.1626,
"mean_token_accuracy": 0.9649975061416626,
"num_tokens": 4668463403.0,
"step": 44100
},
{
"entropy": 1.2309375,
"epoch": 1.0809950541109643,
"grad_norm": 3.515625,
"learning_rate": 2.846487365862872e-06,
"loss": 0.1622,
"mean_token_accuracy": 0.966260347366333,
"num_tokens": 4673588957.0,
"step": 44150
},
{
"entropy": 1.2703125,
"epoch": 1.0822192840703198,
"grad_norm": 2.921875,
"learning_rate": 2.840422895877824e-06,
"loss": 0.1829,
"mean_token_accuracy": 0.9611806380748749,
"num_tokens": 4679435999.0,
"step": 44200
},
{
"entropy": 1.21671875,
"epoch": 1.0834435140296754,
"grad_norm": 2.515625,
"learning_rate": 2.8343590797725993e-06,
"loss": 0.1595,
"mean_token_accuracy": 0.9657203650474548,
"num_tokens": 4684283427.0,
"step": 44250
},
{
"entropy": 1.23546875,
"epoch": 1.0846677439890309,
"grad_norm": 2.296875,
"learning_rate": 2.828295942394163e-06,
"loss": 0.1545,
"mean_token_accuracy": 0.9663613975048065,
"num_tokens": 4689166634.0,
"step": 44300
},
{
"entropy": 1.2715625,
"epoch": 1.0858919739483865,
"grad_norm": 2.140625,
"learning_rate": 2.822233508586702e-06,
"loss": 0.1721,
"mean_token_accuracy": 0.9638037300109863,
"num_tokens": 4694728156.0,
"step": 44350
},
{
"entropy": 1.246875,
"epoch": 1.087116203907742,
"grad_norm": 1.1171875,
"learning_rate": 2.8161718031915194e-06,
"loss": 0.1629,
"mean_token_accuracy": 0.9652890110015869,
"num_tokens": 4700054529.0,
"step": 44400
},
{
"entropy": 1.26203125,
"epoch": 1.0883404338670977,
"grad_norm": 1.9765625,
"learning_rate": 2.8101108510469308e-06,
"loss": 0.1667,
"mean_token_accuracy": 0.9647334861755371,
"num_tokens": 4705521940.0,
"step": 44450
},
{
"entropy": 1.26171875,
"epoch": 1.0895646638264531,
"grad_norm": 2.15625,
"learning_rate": 2.804050676988169e-06,
"loss": 0.1764,
"mean_token_accuracy": 0.9625956809520722,
"num_tokens": 4711285057.0,
"step": 44500
},
{
"entropy": 1.26828125,
"epoch": 1.0907888937858088,
"grad_norm": 3.21875,
"learning_rate": 2.797991305847279e-06,
"loss": 0.1695,
"mean_token_accuracy": 0.9635378420352936,
"num_tokens": 4716659220.0,
"step": 44550
},
{
"entropy": 1.25296875,
"epoch": 1.0920131237451642,
"grad_norm": 2.84375,
"learning_rate": 2.7919327624530105e-06,
"loss": 0.1589,
"mean_token_accuracy": 0.966244969367981,
"num_tokens": 4721738500.0,
"step": 44600
},
{
"entropy": 1.25390625,
"epoch": 1.09323735370452,
"grad_norm": 1.5859375,
"learning_rate": 2.7858750716307267e-06,
"loss": 0.1629,
"mean_token_accuracy": 0.9655514645576477,
"num_tokens": 4727007974.0,
"step": 44650
},
{
"entropy": 1.261875,
"epoch": 1.0944615836638754,
"grad_norm": 3.15625,
"learning_rate": 2.7798182582022956e-06,
"loss": 0.1666,
"mean_token_accuracy": 0.9647921168804169,
"num_tokens": 4732247570.0,
"step": 44700
},
{
"entropy": 1.275,
"epoch": 1.095685813623231,
"grad_norm": 3.0,
"learning_rate": 2.7737623469859904e-06,
"loss": 0.1753,
"mean_token_accuracy": 0.9633481323719024,
"num_tokens": 4737626660.0,
"step": 44750
},
{
"entropy": 1.27203125,
"epoch": 1.0969100435825865,
"grad_norm": 3.1875,
"learning_rate": 2.767707362796385e-06,
"loss": 0.1707,
"mean_token_accuracy": 0.9635563850402832,
"num_tokens": 4743127298.0,
"step": 44800
},
{
"entropy": 1.2753125,
"epoch": 1.0981342735419422,
"grad_norm": 2.28125,
"learning_rate": 2.7616533304442583e-06,
"loss": 0.1725,
"mean_token_accuracy": 0.9624858343601227,
"num_tokens": 4748930038.0,
"step": 44850
},
{
"entropy": 1.251875,
"epoch": 1.0993585035012976,
"grad_norm": 2.828125,
"learning_rate": 2.7556002747364882e-06,
"loss": 0.1618,
"mean_token_accuracy": 0.965050835609436,
"num_tokens": 4754015548.0,
"step": 44900
},
{
"entropy": 1.24,
"epoch": 1.1005827334606533,
"grad_norm": 2.515625,
"learning_rate": 2.749548220475947e-06,
"loss": 0.1556,
"mean_token_accuracy": 0.9672428011894226,
"num_tokens": 4759064667.0,
"step": 44950
},
{
"entropy": 1.24671875,
"epoch": 1.1018069634200087,
"grad_norm": 3.796875,
"learning_rate": 2.7434971924614085e-06,
"loss": 0.1581,
"mean_token_accuracy": 0.9658971416950226,
"num_tokens": 4764080171.0,
"step": 45000
},
{
"entropy": 1.27921875,
"epoch": 1.1030311933793644,
"grad_norm": 1.6953125,
"learning_rate": 2.7374472154874396e-06,
"loss": 0.1743,
"mean_token_accuracy": 0.9628953158855438,
"num_tokens": 4769590544.0,
"step": 45050
},
{
"entropy": 1.27578125,
"epoch": 1.1042554233387198,
"grad_norm": 3.203125,
"learning_rate": 2.731398314344298e-06,
"loss": 0.172,
"mean_token_accuracy": 0.9631561875343323,
"num_tokens": 4774983478.0,
"step": 45100
},
{
"entropy": 1.26796875,
"epoch": 1.1054796532980755,
"grad_norm": 2.625,
"learning_rate": 2.7253505138178363e-06,
"loss": 0.1626,
"mean_token_accuracy": 0.9651547718048096,
"num_tokens": 4780291854.0,
"step": 45150
},
{
"entropy": 1.24359375,
"epoch": 1.1067038832574312,
"grad_norm": 1.7734375,
"learning_rate": 2.719303838689397e-06,
"loss": 0.1586,
"mean_token_accuracy": 0.9661097753047944,
"num_tokens": 4785746067.0,
"step": 45200
},
{
"entropy": 1.27703125,
"epoch": 1.1079281132167866,
"grad_norm": 3.21875,
"learning_rate": 2.7132583137357085e-06,
"loss": 0.173,
"mean_token_accuracy": 0.9634435415267945,
"num_tokens": 4791411988.0,
"step": 45250
},
{
"entropy": 1.26703125,
"epoch": 1.1091523431761423,
"grad_norm": 2.390625,
"learning_rate": 2.70721396372879e-06,
"loss": 0.1574,
"mean_token_accuracy": 0.9663924646377563,
"num_tokens": 4796839124.0,
"step": 45300
},
{
"entropy": 1.251875,
"epoch": 1.1103765731354978,
"grad_norm": 1.7265625,
"learning_rate": 2.7011708134358433e-06,
"loss": 0.1702,
"mean_token_accuracy": 0.963711371421814,
"num_tokens": 4802261281.0,
"step": 45350
},
{
"entropy": 1.26109375,
"epoch": 1.1116008030948534,
"grad_norm": 3.34375,
"learning_rate": 2.6951288876191554e-06,
"loss": 0.163,
"mean_token_accuracy": 0.9658736658096313,
"num_tokens": 4807722190.0,
"step": 45400
},
{
"entropy": 1.2421875,
"epoch": 1.1128250330542089,
"grad_norm": 2.515625,
"learning_rate": 2.689088211035996e-06,
"loss": 0.1582,
"mean_token_accuracy": 0.9665179479122162,
"num_tokens": 4812528854.0,
"step": 45450
},
{
"entropy": 1.26859375,
"epoch": 1.1140492630135646,
"grad_norm": 4.28125,
"learning_rate": 2.6830488084385153e-06,
"loss": 0.1633,
"mean_token_accuracy": 0.9647966718673706,
"num_tokens": 4817654045.0,
"step": 45500
},
{
"entropy": 1.276875,
"epoch": 1.11527349297292,
"grad_norm": 3.078125,
"learning_rate": 2.6770107045736457e-06,
"loss": 0.1659,
"mean_token_accuracy": 0.9641125738620758,
"num_tokens": 4823118089.0,
"step": 45550
},
{
"entropy": 1.26890625,
"epoch": 1.1164977229322757,
"grad_norm": 3.40625,
"learning_rate": 2.670973924182993e-06,
"loss": 0.1652,
"mean_token_accuracy": 0.965114232301712,
"num_tokens": 4828253691.0,
"step": 45600
},
{
"entropy": 1.25203125,
"epoch": 1.1177219528916311,
"grad_norm": 4.03125,
"learning_rate": 2.664938492002745e-06,
"loss": 0.1578,
"mean_token_accuracy": 0.965950778722763,
"num_tokens": 4833456111.0,
"step": 45650
},
{
"entropy": 1.27203125,
"epoch": 1.1189461828509868,
"grad_norm": 2.421875,
"learning_rate": 2.658904432763564e-06,
"loss": 0.172,
"mean_token_accuracy": 0.962825288772583,
"num_tokens": 4838982999.0,
"step": 45700
},
{
"entropy": 1.2459375,
"epoch": 1.1201704128103422,
"grad_norm": 2.53125,
"learning_rate": 2.6528717711904823e-06,
"loss": 0.1553,
"mean_token_accuracy": 0.9660564112663269,
"num_tokens": 4844057439.0,
"step": 45750
},
{
"entropy": 1.2546875,
"epoch": 1.121394642769698,
"grad_norm": 2.78125,
"learning_rate": 2.6468405320028107e-06,
"loss": 0.1758,
"mean_token_accuracy": 0.9631454050540924,
"num_tokens": 4849526204.0,
"step": 45800
},
{
"entropy": 1.2471875,
"epoch": 1.1226188727290534,
"grad_norm": 2.375,
"learning_rate": 2.6408107399140297e-06,
"loss": 0.1525,
"mean_token_accuracy": 0.9672383844852448,
"num_tokens": 4854563999.0,
"step": 45850
},
{
"entropy": 1.2390625,
"epoch": 1.123843102688409,
"grad_norm": 2.796875,
"learning_rate": 2.6347824196316884e-06,
"loss": 0.1571,
"mean_token_accuracy": 0.9666775286197662,
"num_tokens": 4859889553.0,
"step": 45900
},
{
"entropy": 1.251875,
"epoch": 1.1250673326477645,
"grad_norm": 6.1875,
"learning_rate": 2.628755595857308e-06,
"loss": 0.1659,
"mean_token_accuracy": 0.964877005815506,
"num_tokens": 4865439463.0,
"step": 45950
},
{
"entropy": 1.26578125,
"epoch": 1.1262915626071202,
"grad_norm": 2.9375,
"learning_rate": 2.622730293286276e-06,
"loss": 0.1663,
"mean_token_accuracy": 0.9647691214084625,
"num_tokens": 4870527275.0,
"step": 46000
},
{
"entropy": 1.2690625,
"epoch": 1.1275157925664756,
"grad_norm": 3.84375,
"learning_rate": 2.6167065366077473e-06,
"loss": 0.164,
"mean_token_accuracy": 0.9649512505531311,
"num_tokens": 4875809735.0,
"step": 46050
},
{
"entropy": 1.2575,
"epoch": 1.1287400225258313,
"grad_norm": 3.546875,
"learning_rate": 2.6106843505045403e-06,
"loss": 0.1637,
"mean_token_accuracy": 0.9659580600261688,
"num_tokens": 4881072058.0,
"step": 46100
},
{
"entropy": 1.2534375,
"epoch": 1.1299642524851867,
"grad_norm": 3.265625,
"learning_rate": 2.6046637596530405e-06,
"loss": 0.1738,
"mean_token_accuracy": 0.9629634070396423,
"num_tokens": 4886211504.0,
"step": 46150
},
{
"entropy": 1.255,
"epoch": 1.1311884824445424,
"grad_norm": 2.953125,
"learning_rate": 2.598644788723097e-06,
"loss": 0.1635,
"mean_token_accuracy": 0.964535938501358,
"num_tokens": 4891417957.0,
"step": 46200
},
{
"entropy": 1.26296875,
"epoch": 1.132412712403898,
"grad_norm": 2.25,
"learning_rate": 2.5926274623779176e-06,
"loss": 0.1648,
"mean_token_accuracy": 0.9648210310935974,
"num_tokens": 4897027521.0,
"step": 46250
},
{
"entropy": 1.2715625,
"epoch": 1.1336369423632535,
"grad_norm": 2.359375,
"learning_rate": 2.5866118052739744e-06,
"loss": 0.1701,
"mean_token_accuracy": 0.9643675744533539,
"num_tokens": 4902630666.0,
"step": 46300
},
{
"entropy": 1.2640625,
"epoch": 1.134861172322609,
"grad_norm": 2.921875,
"learning_rate": 2.5805978420608995e-06,
"loss": 0.1588,
"mean_token_accuracy": 0.9654871869087219,
"num_tokens": 4907957609.0,
"step": 46350
},
{
"entropy": 1.25765625,
"epoch": 1.1360854022819646,
"grad_norm": 1.96875,
"learning_rate": 2.574585597381383e-06,
"loss": 0.1657,
"mean_token_accuracy": 0.964663782119751,
"num_tokens": 4913108629.0,
"step": 46400
},
{
"entropy": 1.26984375,
"epoch": 1.1373096322413203,
"grad_norm": 2.890625,
"learning_rate": 2.5685750958710737e-06,
"loss": 0.1654,
"mean_token_accuracy": 0.9640021121501923,
"num_tokens": 4918622288.0,
"step": 46450
},
{
"entropy": 1.2890625,
"epoch": 1.1385338622006758,
"grad_norm": 2.59375,
"learning_rate": 2.5625663621584777e-06,
"loss": 0.1822,
"mean_token_accuracy": 0.9616779792308807,
"num_tokens": 4924224135.0,
"step": 46500
},
{
"entropy": 1.2665625,
"epoch": 1.1397580921600314,
"grad_norm": 3.171875,
"learning_rate": 2.5565594208648566e-06,
"loss": 0.1703,
"mean_token_accuracy": 0.9643717563152313,
"num_tokens": 4929573607.0,
"step": 46550
},
{
"entropy": 1.2684375,
"epoch": 1.1409823221193869,
"grad_norm": 3.296875,
"learning_rate": 2.5505542966041285e-06,
"loss": 0.1726,
"mean_token_accuracy": 0.9641470229625702,
"num_tokens": 4935198269.0,
"step": 46600
},
{
"entropy": 1.2725,
"epoch": 1.1422065520787426,
"grad_norm": 2.484375,
"learning_rate": 2.5445510139827656e-06,
"loss": 0.1731,
"mean_token_accuracy": 0.9628414344787598,
"num_tokens": 4940751379.0,
"step": 46650
},
{
"entropy": 1.2690625,
"epoch": 1.143430782038098,
"grad_norm": 1.78125,
"learning_rate": 2.5385495975996952e-06,
"loss": 0.1769,
"mean_token_accuracy": 0.9626391875743866,
"num_tokens": 4946216596.0,
"step": 46700
},
{
"entropy": 1.27125,
"epoch": 1.1446550119974537,
"grad_norm": 3.359375,
"learning_rate": 2.532550072046194e-06,
"loss": 0.179,
"mean_token_accuracy": 0.9620010888576508,
"num_tokens": 4951891973.0,
"step": 46750
},
{
"entropy": 1.28984375,
"epoch": 1.1458792419568091,
"grad_norm": 1.71875,
"learning_rate": 2.5265524619057936e-06,
"loss": 0.1822,
"mean_token_accuracy": 0.9611503231525421,
"num_tokens": 4957928188.0,
"step": 46800
},
{
"entropy": 1.27203125,
"epoch": 1.1471034719161648,
"grad_norm": 2.265625,
"learning_rate": 2.520556791754179e-06,
"loss": 0.1675,
"mean_token_accuracy": 0.9632143163681031,
"num_tokens": 4963189602.0,
"step": 46850
},
{
"entropy": 1.2546875,
"epoch": 1.1483277018755202,
"grad_norm": 2.296875,
"learning_rate": 2.5145630861590806e-06,
"loss": 0.1677,
"mean_token_accuracy": 0.9636298882961273,
"num_tokens": 4968384917.0,
"step": 46900
},
{
"entropy": 1.234375,
"epoch": 1.149551931834876,
"grad_norm": 2.0,
"learning_rate": 2.5085713696801825e-06,
"loss": 0.1456,
"mean_token_accuracy": 0.9684996688365937,
"num_tokens": 4973304826.0,
"step": 46950
},
{
"entropy": 1.2384375,
"epoch": 1.1507761617942314,
"grad_norm": 3.1875,
"learning_rate": 2.5025816668690183e-06,
"loss": 0.1615,
"mean_token_accuracy": 0.9655906355381012,
"num_tokens": 4978583670.0,
"step": 47000
},
{
"entropy": 1.23921875,
"epoch": 1.152000391753587,
"grad_norm": 2.796875,
"learning_rate": 2.496594002268869e-06,
"loss": 0.1633,
"mean_token_accuracy": 0.9643825757503509,
"num_tokens": 4983769645.0,
"step": 47050
},
{
"entropy": 1.2440625,
"epoch": 1.1532246217129425,
"grad_norm": 2.359375,
"learning_rate": 2.490608400414664e-06,
"loss": 0.1601,
"mean_token_accuracy": 0.9659870672225952,
"num_tokens": 4989133497.0,
"step": 47100
},
{
"entropy": 1.2484375,
"epoch": 1.1544488516722982,
"grad_norm": 3.15625,
"learning_rate": 2.484624885832883e-06,
"loss": 0.1618,
"mean_token_accuracy": 0.9654805910587311,
"num_tokens": 4994369533.0,
"step": 47150
},
{
"entropy": 1.2490625,
"epoch": 1.1556730816316536,
"grad_norm": 3.109375,
"learning_rate": 2.478643483041449e-06,
"loss": 0.1616,
"mean_token_accuracy": 0.9649089682102203,
"num_tokens": 4999527347.0,
"step": 47200
},
{
"entropy": 1.2553125,
"epoch": 1.1568973115910093,
"grad_norm": 2.4375,
"learning_rate": 2.472664216549633e-06,
"loss": 0.1627,
"mean_token_accuracy": 0.9657234275341033,
"num_tokens": 5004961075.0,
"step": 47250
},
{
"entropy": 1.24203125,
"epoch": 1.1581215415503647,
"grad_norm": 2.640625,
"learning_rate": 2.466687110857955e-06,
"loss": 0.1533,
"mean_token_accuracy": 0.9676401782035827,
"num_tokens": 5009801621.0,
"step": 47300
},
{
"entropy": 1.2534375,
"epoch": 1.1593457715097204,
"grad_norm": 1.875,
"learning_rate": 2.4607121904580796e-06,
"loss": 0.1689,
"mean_token_accuracy": 0.96378169298172,
"num_tokens": 5015019832.0,
"step": 47350
},
{
"entropy": 1.285625,
"epoch": 1.1605700014690759,
"grad_norm": 2.6875,
"learning_rate": 2.4547394798327127e-06,
"loss": 0.1824,
"mean_token_accuracy": 0.961477290391922,
"num_tokens": 5020771556.0,
"step": 47400
},
{
"entropy": 1.2609375,
"epoch": 1.1617942314284315,
"grad_norm": 0.00445556640625,
"learning_rate": 2.448769003455512e-06,
"loss": 0.1606,
"mean_token_accuracy": 0.9650316751003265,
"num_tokens": 5026174408.0,
"step": 47450
},
{
"entropy": 1.24875,
"epoch": 1.1630184613877872,
"grad_norm": 2.96875,
"learning_rate": 2.442800785790977e-06,
"loss": 0.1554,
"mean_token_accuracy": 0.9664806413650513,
"num_tokens": 5031142557.0,
"step": 47500
},
{
"entropy": 1.25828125,
"epoch": 1.1642426913471426,
"grad_norm": 2.75,
"learning_rate": 2.436834851294351e-06,
"loss": 0.1731,
"mean_token_accuracy": 0.9635387444496155,
"num_tokens": 5036598656.0,
"step": 47550
},
{
"entropy": 1.2440625,
"epoch": 1.165466921306498,
"grad_norm": 3.125,
"learning_rate": 2.4308712244115256e-06,
"loss": 0.1652,
"mean_token_accuracy": 0.9645625805854797,
"num_tokens": 5041932484.0,
"step": 47600
},
{
"entropy": 1.23,
"epoch": 1.1666911512658538,
"grad_norm": 4.53125,
"learning_rate": 2.4249099295789315e-06,
"loss": 0.1503,
"mean_token_accuracy": 0.9676901125907897,
"num_tokens": 5047049390.0,
"step": 47650
},
{
"entropy": 1.24640625,
"epoch": 1.1679153812252094,
"grad_norm": 1.6953125,
"learning_rate": 2.4189509912234475e-06,
"loss": 0.1754,
"mean_token_accuracy": 0.9623109328746796,
"num_tokens": 5052498083.0,
"step": 47700
},
{
"entropy": 1.2228125,
"epoch": 1.1691396111845649,
"grad_norm": 2.21875,
"learning_rate": 2.412994433762295e-06,
"loss": 0.1438,
"mean_token_accuracy": 0.9679240989685058,
"num_tokens": 5057358329.0,
"step": 47750
},
{
"entropy": 1.236875,
"epoch": 1.1703638411439206,
"grad_norm": 3.390625,
"learning_rate": 2.407040281602942e-06,
"loss": 0.1549,
"mean_token_accuracy": 0.9666338443756104,
"num_tokens": 5062500243.0,
"step": 47800
},
{
"entropy": 1.2196875,
"epoch": 1.171588071103276,
"grad_norm": 0.005096435546875,
"learning_rate": 2.4010885591429955e-06,
"loss": 0.1541,
"mean_token_accuracy": 0.9668021559715271,
"num_tokens": 5067435842.0,
"step": 47850
},
{
"entropy": 1.25109375,
"epoch": 1.1728123010626317,
"grad_norm": 3.40625,
"learning_rate": 2.3951392907701115e-06,
"loss": 0.1831,
"mean_token_accuracy": 0.9610938668251038,
"num_tokens": 5073063170.0,
"step": 47900
},
{
"entropy": 1.24375,
"epoch": 1.1740365310219871,
"grad_norm": 3.328125,
"learning_rate": 2.389192500861888e-06,
"loss": 0.1754,
"mean_token_accuracy": 0.9621718871593475,
"num_tokens": 5078828458.0,
"step": 47950
},
{
"entropy": 1.26078125,
"epoch": 1.1752607609813428,
"grad_norm": 2.578125,
"learning_rate": 2.3832482137857685e-06,
"loss": 0.175,
"mean_token_accuracy": 0.9630187213420868,
"num_tokens": 5084161692.0,
"step": 48000
},
{
"entropy": 1.2571875,
"epoch": 1.1764849909406982,
"grad_norm": 1.8046875,
"learning_rate": 2.377306453898938e-06,
"loss": 0.1689,
"mean_token_accuracy": 0.9643845617771148,
"num_tokens": 5089346169.0,
"step": 48050
},
{
"entropy": 1.25125,
"epoch": 1.177709220900054,
"grad_norm": 3.34375,
"learning_rate": 2.3713672455482293e-06,
"loss": 0.1609,
"mean_token_accuracy": 0.9652318274974823,
"num_tokens": 5094622581.0,
"step": 48100
},
{
"entropy": 1.24921875,
"epoch": 1.1789334508594094,
"grad_norm": 1.953125,
"learning_rate": 2.36543061307002e-06,
"loss": 0.1611,
"mean_token_accuracy": 0.9650622093677521,
"num_tokens": 5099539248.0,
"step": 48150
},
{
"entropy": 1.2584375,
"epoch": 1.180157680818765,
"grad_norm": 4.5,
"learning_rate": 2.35949658079013e-06,
"loss": 0.1693,
"mean_token_accuracy": 0.9631922256946563,
"num_tokens": 5104589567.0,
"step": 48200
},
{
"entropy": 1.26328125,
"epoch": 1.1813819107781205,
"grad_norm": 3.09375,
"learning_rate": 2.3535651730237275e-06,
"loss": 0.1613,
"mean_token_accuracy": 0.9661449313163757,
"num_tokens": 5109766096.0,
"step": 48250
},
{
"entropy": 1.25484375,
"epoch": 1.1826061407374762,
"grad_norm": 3.125,
"learning_rate": 2.3476364140752266e-06,
"loss": 0.1599,
"mean_token_accuracy": 0.9653767657279968,
"num_tokens": 5114683078.0,
"step": 48300
},
{
"entropy": 1.28109375,
"epoch": 1.1838303706968316,
"grad_norm": 3.46875,
"learning_rate": 2.341710328238185e-06,
"loss": 0.1725,
"mean_token_accuracy": 0.9629187500476837,
"num_tokens": 5120172628.0,
"step": 48350
},
{
"entropy": 1.2590625,
"epoch": 1.1850546006561873,
"grad_norm": 2.125,
"learning_rate": 2.335786939795209e-06,
"loss": 0.1574,
"mean_token_accuracy": 0.966355732679367,
"num_tokens": 5125111521.0,
"step": 48400
},
{
"entropy": 1.2721875,
"epoch": 1.1862788306155427,
"grad_norm": 2.046875,
"learning_rate": 2.3298662730178536e-06,
"loss": 0.1635,
"mean_token_accuracy": 0.9648284649848938,
"num_tokens": 5130646209.0,
"step": 48450
},
{
"entropy": 1.2484375,
"epoch": 1.1875030605748984,
"grad_norm": 2.703125,
"learning_rate": 2.3239483521665165e-06,
"loss": 0.1529,
"mean_token_accuracy": 0.9668037176132203,
"num_tokens": 5135665531.0,
"step": 48500
},
{
"entropy": 1.25546875,
"epoch": 1.188727290534254,
"grad_norm": 1.5703125,
"learning_rate": 2.31803320149035e-06,
"loss": 0.1674,
"mean_token_accuracy": 0.9642703318595887,
"num_tokens": 5140993137.0,
"step": 48550
},
{
"entropy": 1.2759375,
"epoch": 1.1899515204936095,
"grad_norm": 1.875,
"learning_rate": 2.312120845227151e-06,
"loss": 0.1682,
"mean_token_accuracy": 0.9635923814773559,
"num_tokens": 5146394110.0,
"step": 48600
},
{
"entropy": 1.269375,
"epoch": 1.191175750452965,
"grad_norm": 2.125,
"learning_rate": 2.306211307603269e-06,
"loss": 0.1603,
"mean_token_accuracy": 0.9650293779373169,
"num_tokens": 5151444447.0,
"step": 48650
},
{
"entropy": 1.2778125,
"epoch": 1.1923999804123206,
"grad_norm": 3.484375,
"learning_rate": 2.3003046128335004e-06,
"loss": 0.1725,
"mean_token_accuracy": 0.962925443649292,
"num_tokens": 5157164016.0,
"step": 48700
},
{
"entropy": 1.2559375,
"epoch": 1.1936242103716763,
"grad_norm": 0.00262451171875,
"learning_rate": 2.2944007851209967e-06,
"loss": 0.1555,
"mean_token_accuracy": 0.9663327503204345,
"num_tokens": 5162287319.0,
"step": 48750
},
{
"entropy": 1.25578125,
"epoch": 1.1948484403310318,
"grad_norm": 1.84375,
"learning_rate": 2.2884998486571587e-06,
"loss": 0.1623,
"mean_token_accuracy": 0.9643605947494507,
"num_tokens": 5167697788.0,
"step": 48800
},
{
"entropy": 1.2621875,
"epoch": 1.1960726702903874,
"grad_norm": 2.421875,
"learning_rate": 2.2826018276215404e-06,
"loss": 0.1641,
"mean_token_accuracy": 0.9648311936855316,
"num_tokens": 5172726413.0,
"step": 48850
},
{
"entropy": 1.256875,
"epoch": 1.197296900249743,
"grad_norm": 3.9375,
"learning_rate": 2.276706746181751e-06,
"loss": 0.1647,
"mean_token_accuracy": 0.9653891062736512,
"num_tokens": 5177807515.0,
"step": 48900
},
{
"entropy": 1.24484375,
"epoch": 1.1985211302090986,
"grad_norm": 3.359375,
"learning_rate": 2.2708146284933544e-06,
"loss": 0.1491,
"mean_token_accuracy": 0.9672402215003967,
"num_tokens": 5182682002.0,
"step": 48950
},
{
"entropy": 1.2434375,
"epoch": 1.199745360168454,
"grad_norm": 2.09375,
"learning_rate": 2.2649254986997666e-06,
"loss": 0.1625,
"mean_token_accuracy": 0.9646528875827789,
"num_tokens": 5187927187.0,
"step": 49000
},
{
"entropy": 1.26171875,
"epoch": 1.2009695901278097,
"grad_norm": 2.140625,
"learning_rate": 2.2590393809321657e-06,
"loss": 0.1601,
"mean_token_accuracy": 0.9654495012760163,
"num_tokens": 5192885819.0,
"step": 49050
},
{
"entropy": 1.26296875,
"epoch": 1.2021938200871651,
"grad_norm": 2.015625,
"learning_rate": 2.2531562993093854e-06,
"loss": 0.1631,
"mean_token_accuracy": 0.9647388279438018,
"num_tokens": 5198240652.0,
"step": 49100
},
{
"entropy": 1.23265625,
"epoch": 1.2034180500465208,
"grad_norm": 2.5625,
"learning_rate": 2.247276277937817e-06,
"loss": 0.1537,
"mean_token_accuracy": 0.966611897945404,
"num_tokens": 5203287957.0,
"step": 49150
},
{
"entropy": 1.24109375,
"epoch": 1.2046422800058763,
"grad_norm": 2.671875,
"learning_rate": 2.241399340911315e-06,
"loss": 0.1582,
"mean_token_accuracy": 0.9648150885105133,
"num_tokens": 5208259781.0,
"step": 49200
},
{
"entropy": 1.22828125,
"epoch": 1.205866509965232,
"grad_norm": 1.796875,
"learning_rate": 2.235525512311094e-06,
"loss": 0.1659,
"mean_token_accuracy": 0.9645445287227631,
"num_tokens": 5213559098.0,
"step": 49250
},
{
"entropy": 1.23921875,
"epoch": 1.2070907399245874,
"grad_norm": 3.1875,
"learning_rate": 2.229654816205632e-06,
"loss": 0.1694,
"mean_token_accuracy": 0.9639151406288147,
"num_tokens": 5218710994.0,
"step": 49300
},
{
"entropy": 1.2425,
"epoch": 1.208314969883943,
"grad_norm": 2.828125,
"learning_rate": 2.2237872766505715e-06,
"loss": 0.1676,
"mean_token_accuracy": 0.9631175470352172,
"num_tokens": 5224096915.0,
"step": 49350
},
{
"entropy": 1.25015625,
"epoch": 1.2095391998432985,
"grad_norm": 3.71875,
"learning_rate": 2.2179229176886196e-06,
"loss": 0.1731,
"mean_token_accuracy": 0.9628188860416412,
"num_tokens": 5229833600.0,
"step": 49400
},
{
"entropy": 1.24265625,
"epoch": 1.2107634298026542,
"grad_norm": 4.125,
"learning_rate": 2.212061763349454e-06,
"loss": 0.1616,
"mean_token_accuracy": 0.9654302883148194,
"num_tokens": 5235131114.0,
"step": 49450
},
{
"entropy": 1.23765625,
"epoch": 1.2119876597620096,
"grad_norm": 1.9375,
"learning_rate": 2.206203837649615e-06,
"loss": 0.1555,
"mean_token_accuracy": 0.9665101909637451,
"num_tokens": 5240317138.0,
"step": 49500
},
{
"entropy": 1.24921875,
"epoch": 1.2132118897213653,
"grad_norm": 1.9609375,
"learning_rate": 2.2003491645924195e-06,
"loss": 0.1715,
"mean_token_accuracy": 0.9628171730041504,
"num_tokens": 5245861371.0,
"step": 49550
},
{
"entropy": 1.24640625,
"epoch": 1.2144361196807207,
"grad_norm": 2.859375,
"learning_rate": 2.194497768167855e-06,
"loss": 0.1703,
"mean_token_accuracy": 0.9627651238441467,
"num_tokens": 5251350220.0,
"step": 49600
},
{
"entropy": 1.2528125,
"epoch": 1.2156603496400764,
"grad_norm": 2.296875,
"learning_rate": 2.188649672352479e-06,
"loss": 0.1707,
"mean_token_accuracy": 0.964025752544403,
"num_tokens": 5256995465.0,
"step": 49650
},
{
"entropy": 1.25234375,
"epoch": 1.2168845795994319,
"grad_norm": 2.703125,
"learning_rate": 2.1828049011093286e-06,
"loss": 0.1702,
"mean_token_accuracy": 0.9648704588413238,
"num_tokens": 5262286472.0,
"step": 49700
},
{
"entropy": 1.241875,
"epoch": 1.2181088095587875,
"grad_norm": 3.578125,
"learning_rate": 2.1769634783878182e-06,
"loss": 0.1579,
"mean_token_accuracy": 0.9658465564250946,
"num_tokens": 5267436922.0,
"step": 49750
},
{
"entropy": 1.26,
"epoch": 1.2193330395181432,
"grad_norm": 3.453125,
"learning_rate": 2.1711254281236373e-06,
"loss": 0.1804,
"mean_token_accuracy": 0.9622203695774079,
"num_tokens": 5273103073.0,
"step": 49800
},
{
"entropy": 1.258125,
"epoch": 1.2205572694774987,
"grad_norm": 2.78125,
"learning_rate": 2.1652907742386613e-06,
"loss": 0.178,
"mean_token_accuracy": 0.9619389712810517,
"num_tokens": 5278483949.0,
"step": 49850
},
{
"entropy": 1.24796875,
"epoch": 1.221781499436854,
"grad_norm": 1.5546875,
"learning_rate": 2.159459540640847e-06,
"loss": 0.161,
"mean_token_accuracy": 0.9660306286811828,
"num_tokens": 5283427597.0,
"step": 49900
},
{
"entropy": 1.27984375,
"epoch": 1.2230057293962098,
"grad_norm": 1.5703125,
"learning_rate": 2.1536317512241348e-06,
"loss": 0.1777,
"mean_token_accuracy": 0.9623690032958985,
"num_tokens": 5288987030.0,
"step": 49950
},
{
"entropy": 1.2584375,
"epoch": 1.2242299593555654,
"grad_norm": 2.71875,
"learning_rate": 2.147807429868352e-06,
"loss": 0.1658,
"mean_token_accuracy": 0.9644541823863984,
"num_tokens": 5294529728.0,
"step": 50000
},
{
"epoch": 1.2242299593555654,
"eval_entropy": 1.2479817708333334,
"eval_loss": 0.17940963804721832,
"eval_mean_token_accuracy": 0.9616454169154167,
"eval_num_tokens": 5294529728.0,
"eval_runtime": 604.376,
"eval_samples_per_second": 15.977,
"eval_steps_per_second": 0.2,
"step": 50000
},
{
"entropy": 1.2259375,
"epoch": 1.225454189314921,
"grad_norm": 2.09375,
"learning_rate": 2.141986600439119e-06,
"loss": 0.153,
"mean_token_accuracy": 0.9670542335510254,
"num_tokens": 5299381949.0,
"step": 50050
},
{
"entropy": 1.26140625,
"epoch": 1.2266784192742766,
"grad_norm": 2.625,
"learning_rate": 2.1361692867877455e-06,
"loss": 0.1754,
"mean_token_accuracy": 0.9621517550945282,
"num_tokens": 5304936166.0,
"step": 50100
},
{
"entropy": 1.24140625,
"epoch": 1.227902649233632,
"grad_norm": 0.00927734375,
"learning_rate": 2.1303555127511327e-06,
"loss": 0.1545,
"mean_token_accuracy": 0.96613614320755,
"num_tokens": 5310169155.0,
"step": 50150
},
{
"entropy": 1.261875,
"epoch": 1.2291268791929877,
"grad_norm": 2.65625,
"learning_rate": 2.124545302151681e-06,
"loss": 0.1693,
"mean_token_accuracy": 0.9642032277584076,
"num_tokens": 5315607723.0,
"step": 50200
},
{
"entropy": 1.26796875,
"epoch": 1.2303511091523431,
"grad_norm": 2.640625,
"learning_rate": 2.118738678797191e-06,
"loss": 0.1677,
"mean_token_accuracy": 0.9641611945629119,
"num_tokens": 5321112342.0,
"step": 50250
},
{
"entropy": 1.25578125,
"epoch": 1.2315753391116988,
"grad_norm": 3.6875,
"learning_rate": 2.112935666480758e-06,
"loss": 0.1583,
"mean_token_accuracy": 0.965636430978775,
"num_tokens": 5326352547.0,
"step": 50300
},
{
"entropy": 1.26484375,
"epoch": 1.2327995690710543,
"grad_norm": 2.046875,
"learning_rate": 2.1071362889806863e-06,
"loss": 0.1729,
"mean_token_accuracy": 0.963402829170227,
"num_tokens": 5331870603.0,
"step": 50350
},
{
"entropy": 1.27546875,
"epoch": 1.23402379903041,
"grad_norm": 2.46875,
"learning_rate": 2.101340570060385e-06,
"loss": 0.1711,
"mean_token_accuracy": 0.9636083686351776,
"num_tokens": 5337306717.0,
"step": 50400
},
{
"entropy": 1.24609375,
"epoch": 1.2352480289897654,
"grad_norm": 4.125,
"learning_rate": 2.09554853346827e-06,
"loss": 0.1558,
"mean_token_accuracy": 0.9663618934154511,
"num_tokens": 5342628594.0,
"step": 50450
},
{
"entropy": 1.2796875,
"epoch": 1.236472258949121,
"grad_norm": 2.1875,
"learning_rate": 2.089760202937671e-06,
"loss": 0.1711,
"mean_token_accuracy": 0.9637987637519836,
"num_tokens": 5348316678.0,
"step": 50500
},
{
"entropy": 1.2409375,
"epoch": 1.2376964889084765,
"grad_norm": 4.09375,
"learning_rate": 2.0839756021867306e-06,
"loss": 0.1499,
"mean_token_accuracy": 0.967620609998703,
"num_tokens": 5353095952.0,
"step": 50550
},
{
"entropy": 1.26671875,
"epoch": 1.2389207188678322,
"grad_norm": 2.1875,
"learning_rate": 2.07819475491831e-06,
"loss": 0.1675,
"mean_token_accuracy": 0.9643842697143554,
"num_tokens": 5358561384.0,
"step": 50600
},
{
"entropy": 1.25734375,
"epoch": 1.2401449488271876,
"grad_norm": 3.546875,
"learning_rate": 2.0724176848198856e-06,
"loss": 0.1578,
"mean_token_accuracy": 0.9659811770915985,
"num_tokens": 5363968041.0,
"step": 50650
},
{
"entropy": 1.2559375,
"epoch": 1.2413691787865433,
"grad_norm": 3.265625,
"learning_rate": 2.0666444155634613e-06,
"loss": 0.1678,
"mean_token_accuracy": 0.9649008166790009,
"num_tokens": 5369138043.0,
"step": 50700
},
{
"entropy": 1.2790625,
"epoch": 1.2425934087458987,
"grad_norm": 1.796875,
"learning_rate": 2.0608749708054666e-06,
"loss": 0.1717,
"mean_token_accuracy": 0.9624824106693268,
"num_tokens": 5374681050.0,
"step": 50750
},
{
"entropy": 1.274375,
"epoch": 1.2438176387052544,
"grad_norm": 3.5,
"learning_rate": 2.0551093741866555e-06,
"loss": 0.1653,
"mean_token_accuracy": 0.964318573474884,
"num_tokens": 5379930328.0,
"step": 50800
},
{
"entropy": 1.2709375,
"epoch": 1.24504186866461,
"grad_norm": 2.328125,
"learning_rate": 2.0493476493320182e-06,
"loss": 0.1639,
"mean_token_accuracy": 0.9642879796028138,
"num_tokens": 5385290824.0,
"step": 50850
},
{
"entropy": 1.27515625,
"epoch": 1.2462660986239655,
"grad_norm": 3.15625,
"learning_rate": 2.043589819850679e-06,
"loss": 0.1784,
"mean_token_accuracy": 0.9621766293048859,
"num_tokens": 5390915687.0,
"step": 50900
},
{
"entropy": 1.26828125,
"epoch": 1.247490328583321,
"grad_norm": 2.671875,
"learning_rate": 2.037835909335799e-06,
"loss": 0.1653,
"mean_token_accuracy": 0.9644598591327668,
"num_tokens": 5396364664.0,
"step": 50950
},
{
"entropy": 1.23453125,
"epoch": 1.2487145585426767,
"grad_norm": 3.4375,
"learning_rate": 2.032085941364483e-06,
"loss": 0.1475,
"mean_token_accuracy": 0.9683002579212189,
"num_tokens": 5401284379.0,
"step": 51000
},
{
"entropy": 1.264375,
"epoch": 1.2499387885020323,
"grad_norm": 2.671875,
"learning_rate": 2.026339939497681e-06,
"loss": 0.1672,
"mean_token_accuracy": 0.9641962945461273,
"num_tokens": 5406818098.0,
"step": 51050
},
{
"entropy": 1.23828125,
"epoch": 1.2511630184613878,
"grad_norm": 0.2138671875,
"learning_rate": 2.020597927280089e-06,
"loss": 0.1498,
"mean_token_accuracy": 0.9685159015655518,
"num_tokens": 5411689647.0,
"step": 51100
},
{
"entropy": 1.24640625,
"epoch": 1.2523872484207432,
"grad_norm": 2.640625,
"learning_rate": 2.014859928240058e-06,
"loss": 0.1583,
"mean_token_accuracy": 0.9665188646316528,
"num_tokens": 5416677115.0,
"step": 51150
},
{
"entropy": 1.24796875,
"epoch": 1.253611478380099,
"grad_norm": 1.84375,
"learning_rate": 2.0091259658894926e-06,
"loss": 0.1525,
"mean_token_accuracy": 0.9675477313995361,
"num_tokens": 5422071895.0,
"step": 51200
},
{
"entropy": 1.27703125,
"epoch": 1.2548357083394546,
"grad_norm": 2.15625,
"learning_rate": 2.00339606372376e-06,
"loss": 0.1796,
"mean_token_accuracy": 0.9615858125686646,
"num_tokens": 5427896152.0,
"step": 51250
},
{
"entropy": 1.25203125,
"epoch": 1.25605993829881,
"grad_norm": 2.21875,
"learning_rate": 1.9976702452215846e-06,
"loss": 0.1615,
"mean_token_accuracy": 0.9655699288845062,
"num_tokens": 5432956715.0,
"step": 51300
},
{
"entropy": 1.25671875,
"epoch": 1.2572841682581657,
"grad_norm": 2.5,
"learning_rate": 1.9919485338449633e-06,
"loss": 0.1669,
"mean_token_accuracy": 0.963955899477005,
"num_tokens": 5438521726.0,
"step": 51350
},
{
"entropy": 1.25890625,
"epoch": 1.2585083982175211,
"grad_norm": 3.671875,
"learning_rate": 1.9862309530390627e-06,
"loss": 0.1604,
"mean_token_accuracy": 0.9649885761737823,
"num_tokens": 5443663826.0,
"step": 51400
},
{
"entropy": 1.25375,
"epoch": 1.2597326281768768,
"grad_norm": 1.703125,
"learning_rate": 1.98051752623212e-06,
"loss": 0.1607,
"mean_token_accuracy": 0.9659333276748657,
"num_tokens": 5448801306.0,
"step": 51450
},
{
"entropy": 1.26546875,
"epoch": 1.2609568581362323,
"grad_norm": 2.234375,
"learning_rate": 1.9748082768353554e-06,
"loss": 0.1624,
"mean_token_accuracy": 0.9649898850917816,
"num_tokens": 5454048809.0,
"step": 51500
},
{
"entropy": 1.2559375,
"epoch": 1.262181088095588,
"grad_norm": 3.40625,
"learning_rate": 1.969103228242872e-06,
"loss": 0.1671,
"mean_token_accuracy": 0.9636943113803863,
"num_tokens": 5459063221.0,
"step": 51550
},
{
"entropy": 1.26359375,
"epoch": 1.2634053180549434,
"grad_norm": 0.01025390625,
"learning_rate": 1.9634024038315556e-06,
"loss": 0.1555,
"mean_token_accuracy": 0.9668670952320099,
"num_tokens": 5464218533.0,
"step": 51600
},
{
"entropy": 1.26984375,
"epoch": 1.264629548014299,
"grad_norm": 1.796875,
"learning_rate": 1.9577058269609873e-06,
"loss": 0.1677,
"mean_token_accuracy": 0.9646493744850159,
"num_tokens": 5469633751.0,
"step": 51650
},
{
"entropy": 1.26015625,
"epoch": 1.2658537779736545,
"grad_norm": 2.5,
"learning_rate": 1.9520135209733434e-06,
"loss": 0.1548,
"mean_token_accuracy": 0.9670298910140991,
"num_tokens": 5474658175.0,
"step": 51700
},
{
"entropy": 1.24671875,
"epoch": 1.2670780079330102,
"grad_norm": 2.921875,
"learning_rate": 1.9463255091932946e-06,
"loss": 0.168,
"mean_token_accuracy": 0.9642450773715973,
"num_tokens": 5480009732.0,
"step": 51750
},
{
"entropy": 1.25875,
"epoch": 1.2683022378923656,
"grad_norm": 2.703125,
"learning_rate": 1.9406418149279224e-06,
"loss": 0.1667,
"mean_token_accuracy": 0.9646876096725464,
"num_tokens": 5485352642.0,
"step": 51800
},
{
"entropy": 1.25078125,
"epoch": 1.2695264678517213,
"grad_norm": 2.40625,
"learning_rate": 1.9349624614666137e-06,
"loss": 0.1599,
"mean_token_accuracy": 0.9663380241394043,
"num_tokens": 5490516069.0,
"step": 51850
},
{
"entropy": 1.2540625,
"epoch": 1.270750697811077,
"grad_norm": 2.125,
"learning_rate": 1.9292874720809706e-06,
"loss": 0.1691,
"mean_token_accuracy": 0.9637067282199859,
"num_tokens": 5495858878.0,
"step": 51900
},
{
"entropy": 1.2459375,
"epoch": 1.2719749277704324,
"grad_norm": 2.03125,
"learning_rate": 1.9236168700247085e-06,
"loss": 0.1597,
"mean_token_accuracy": 0.9652304399013519,
"num_tokens": 5500992334.0,
"step": 51950
},
{
"entropy": 1.26390625,
"epoch": 1.2731991577297879,
"grad_norm": 2.40625,
"learning_rate": 1.9179506785335695e-06,
"loss": 0.1784,
"mean_token_accuracy": 0.9612833940982819,
"num_tokens": 5506364973.0,
"step": 52000
},
{
"entropy": 1.2540625,
"epoch": 1.2744233876891435,
"grad_norm": 3.09375,
"learning_rate": 1.912288920825224e-06,
"loss": 0.1668,
"mean_token_accuracy": 0.9639379584789276,
"num_tokens": 5511847363.0,
"step": 52050
},
{
"entropy": 1.26140625,
"epoch": 1.2756476176484992,
"grad_norm": 3.59375,
"learning_rate": 1.9066316200991702e-06,
"loss": 0.1739,
"mean_token_accuracy": 0.9622644722461701,
"num_tokens": 5517402202.0,
"step": 52100
},
{
"entropy": 1.23109375,
"epoch": 1.2768718476078547,
"grad_norm": 2.5625,
"learning_rate": 1.9009787995366464e-06,
"loss": 0.1571,
"mean_token_accuracy": 0.9665352630615235,
"num_tokens": 5522479618.0,
"step": 52150
},
{
"entropy": 1.2396875,
"epoch": 1.27809607756721,
"grad_norm": 1.8046875,
"learning_rate": 1.8953304823005346e-06,
"loss": 0.159,
"mean_token_accuracy": 0.965977475643158,
"num_tokens": 5527761846.0,
"step": 52200
},
{
"entropy": 1.24140625,
"epoch": 1.2793203075265658,
"grad_norm": 2.765625,
"learning_rate": 1.889686691535259e-06,
"loss": 0.1713,
"mean_token_accuracy": 0.9641374492645264,
"num_tokens": 5533078395.0,
"step": 52250
},
{
"entropy": 1.22796875,
"epoch": 1.2805445374859215,
"grad_norm": 2.0625,
"learning_rate": 1.8840474503667003e-06,
"loss": 0.1613,
"mean_token_accuracy": 0.96567800283432,
"num_tokens": 5538079639.0,
"step": 52300
},
{
"entropy": 1.233125,
"epoch": 1.281768767445277,
"grad_norm": 1.828125,
"learning_rate": 1.8784127819020977e-06,
"loss": 0.1696,
"mean_token_accuracy": 0.9639940130710601,
"num_tokens": 5543060468.0,
"step": 52350
},
{
"entropy": 1.23828125,
"epoch": 1.2829929974046324,
"grad_norm": 2.40625,
"learning_rate": 1.8727827092299486e-06,
"loss": 0.1713,
"mean_token_accuracy": 0.9634285986423492,
"num_tokens": 5548455628.0,
"step": 52400
},
{
"entropy": 1.2671875,
"epoch": 1.284217227363988,
"grad_norm": 1.8515625,
"learning_rate": 1.8671572554199227e-06,
"loss": 0.1745,
"mean_token_accuracy": 0.9630351853370667,
"num_tokens": 5554243712.0,
"step": 52450
},
{
"entropy": 1.256875,
"epoch": 1.2854414573233437,
"grad_norm": 3.09375,
"learning_rate": 1.8615364435227627e-06,
"loss": 0.1713,
"mean_token_accuracy": 0.9632880544662475,
"num_tokens": 5559645728.0,
"step": 52500
},
{
"entropy": 1.25578125,
"epoch": 1.2866656872826991,
"grad_norm": 2.4375,
"learning_rate": 1.8559202965701921e-06,
"loss": 0.1729,
"mean_token_accuracy": 0.9628579890727997,
"num_tokens": 5565441017.0,
"step": 52550
},
{
"entropy": 1.241875,
"epoch": 1.2878899172420548,
"grad_norm": 3.0625,
"learning_rate": 1.850308837574815e-06,
"loss": 0.1567,
"mean_token_accuracy": 0.9662058663368225,
"num_tokens": 5570548727.0,
"step": 52600
},
{
"entropy": 1.2465625,
"epoch": 1.2891141472014103,
"grad_norm": 3.671875,
"learning_rate": 1.8447020895300304e-06,
"loss": 0.1627,
"mean_token_accuracy": 0.9654901123046875,
"num_tokens": 5575812384.0,
"step": 52650
},
{
"entropy": 1.26609375,
"epoch": 1.290338377160766,
"grad_norm": 2.609375,
"learning_rate": 1.8391000754099329e-06,
"loss": 0.1704,
"mean_token_accuracy": 0.9641706418991088,
"num_tokens": 5581119333.0,
"step": 52700
},
{
"entropy": 1.25296875,
"epoch": 1.2915626071201214,
"grad_norm": 3.484375,
"learning_rate": 1.8335028181692183e-06,
"loss": 0.1591,
"mean_token_accuracy": 0.9657709896564484,
"num_tokens": 5586146551.0,
"step": 52750
},
{
"entropy": 1.26609375,
"epoch": 1.292786837079477,
"grad_norm": 2.15625,
"learning_rate": 1.8279103407430918e-06,
"loss": 0.1682,
"mean_token_accuracy": 0.9645370328426361,
"num_tokens": 5591535827.0,
"step": 52800
},
{
"entropy": 1.24609375,
"epoch": 1.2940110670388325,
"grad_norm": 3.09375,
"learning_rate": 1.822322666047173e-06,
"loss": 0.156,
"mean_token_accuracy": 0.966865359544754,
"num_tokens": 5596513224.0,
"step": 52850
},
{
"entropy": 1.23125,
"epoch": 1.2952352969981882,
"grad_norm": 2.59375,
"learning_rate": 1.8167398169774003e-06,
"loss": 0.1562,
"mean_token_accuracy": 0.9663991129398346,
"num_tokens": 5601409756.0,
"step": 52900
},
{
"entropy": 1.24203125,
"epoch": 1.2964595269575436,
"grad_norm": 2.09375,
"learning_rate": 1.8111618164099405e-06,
"loss": 0.1586,
"mean_token_accuracy": 0.965841782093048,
"num_tokens": 5606579901.0,
"step": 52950
},
{
"entropy": 1.25640625,
"epoch": 1.2976837569168993,
"grad_norm": 4.40625,
"learning_rate": 1.805588687201094e-06,
"loss": 0.1551,
"mean_token_accuracy": 0.9661786913871765,
"num_tokens": 5611890254.0,
"step": 53000
},
{
"entropy": 1.27453125,
"epoch": 1.2989079868762547,
"grad_norm": 2.9375,
"learning_rate": 1.8000204521871968e-06,
"loss": 0.1736,
"mean_token_accuracy": 0.9631719040870667,
"num_tokens": 5617317192.0,
"step": 53050
},
{
"entropy": 1.254375,
"epoch": 1.3001322168356104,
"grad_norm": 2.0625,
"learning_rate": 1.7944571341845338e-06,
"loss": 0.1735,
"mean_token_accuracy": 0.9628773295879364,
"num_tokens": 5622759860.0,
"step": 53100
},
{
"entropy": 1.251875,
"epoch": 1.301356446794966,
"grad_norm": 3.328125,
"learning_rate": 1.788898755989241e-06,
"loss": 0.1544,
"mean_token_accuracy": 0.966829891204834,
"num_tokens": 5628009830.0,
"step": 53150
},
{
"entropy": 1.2346875,
"epoch": 1.3025806767543215,
"grad_norm": 1.9140625,
"learning_rate": 1.7833453403772148e-06,
"loss": 0.1496,
"mean_token_accuracy": 0.9679068636894226,
"num_tokens": 5633028331.0,
"step": 53200
},
{
"entropy": 1.23625,
"epoch": 1.303804906713677,
"grad_norm": 2.765625,
"learning_rate": 1.7777969101040137e-06,
"loss": 0.1598,
"mean_token_accuracy": 0.9658224785327911,
"num_tokens": 5638192081.0,
"step": 53250
},
{
"entropy": 1.25921875,
"epoch": 1.3050291366730327,
"grad_norm": 1.765625,
"learning_rate": 1.7722534879047704e-06,
"loss": 0.1679,
"mean_token_accuracy": 0.9648814105987549,
"num_tokens": 5643678649.0,
"step": 53300
},
{
"entropy": 1.23703125,
"epoch": 1.3062533666323883,
"grad_norm": 2.171875,
"learning_rate": 1.7667150964940981e-06,
"loss": 0.1542,
"mean_token_accuracy": 0.9665197932720184,
"num_tokens": 5648865610.0,
"step": 53350
},
{
"entropy": 1.2546875,
"epoch": 1.3074775965917438,
"grad_norm": 3.46875,
"learning_rate": 1.7611817585659915e-06,
"loss": 0.1695,
"mean_token_accuracy": 0.96389883518219,
"num_tokens": 5654452208.0,
"step": 53400
},
{
"entropy": 1.23046875,
"epoch": 1.3087018265510992,
"grad_norm": 2.9375,
"learning_rate": 1.7556534967937428e-06,
"loss": 0.1477,
"mean_token_accuracy": 0.967578010559082,
"num_tokens": 5659553855.0,
"step": 53450
},
{
"entropy": 1.2696875,
"epoch": 1.309926056510455,
"grad_norm": 2.71875,
"learning_rate": 1.750130333829843e-06,
"loss": 0.174,
"mean_token_accuracy": 0.9626197755336762,
"num_tokens": 5665208689.0,
"step": 53500
},
{
"entropy": 1.230625,
"epoch": 1.3111502864698106,
"grad_norm": 2.265625,
"learning_rate": 1.744612292305887e-06,
"loss": 0.1488,
"mean_token_accuracy": 0.9678456223011017,
"num_tokens": 5670219320.0,
"step": 53550
},
{
"entropy": 1.26109375,
"epoch": 1.312374516429166,
"grad_norm": 3.46875,
"learning_rate": 1.73909939483249e-06,
"loss": 0.176,
"mean_token_accuracy": 0.9616470074653626,
"num_tokens": 5676005681.0,
"step": 53600
},
{
"entropy": 1.23359375,
"epoch": 1.3135987463885217,
"grad_norm": 3.46875,
"learning_rate": 1.7335916639991833e-06,
"loss": 0.1579,
"mean_token_accuracy": 0.9656192350387574,
"num_tokens": 5680838804.0,
"step": 53650
},
{
"entropy": 1.24828125,
"epoch": 1.3148229763478771,
"grad_norm": 3.25,
"learning_rate": 1.7280891223743347e-06,
"loss": 0.1663,
"mean_token_accuracy": 0.9647430288791656,
"num_tokens": 5686118856.0,
"step": 53700
},
{
"entropy": 1.25203125,
"epoch": 1.3160472063072328,
"grad_norm": 1.7890625,
"learning_rate": 1.7225917925050384e-06,
"loss": 0.1808,
"mean_token_accuracy": 0.9617255198955535,
"num_tokens": 5691606584.0,
"step": 53750
},
{
"entropy": 1.23875,
"epoch": 1.3172714362665883,
"grad_norm": 2.796875,
"learning_rate": 1.7170996969170434e-06,
"loss": 0.1643,
"mean_token_accuracy": 0.9644413828849793,
"num_tokens": 5697025528.0,
"step": 53800
},
{
"entropy": 1.23609375,
"epoch": 1.318495666225944,
"grad_norm": 3.0,
"learning_rate": 1.7116128581146443e-06,
"loss": 0.1579,
"mean_token_accuracy": 0.9660075342655182,
"num_tokens": 5702129646.0,
"step": 53850
},
{
"entropy": 1.239375,
"epoch": 1.3197198961852994,
"grad_norm": 2.46875,
"learning_rate": 1.7061312985805986e-06,
"loss": 0.1659,
"mean_token_accuracy": 0.9642334473133087,
"num_tokens": 5707290385.0,
"step": 53900
},
{
"entropy": 1.23515625,
"epoch": 1.320944126144655,
"grad_norm": 3.421875,
"learning_rate": 1.7006550407760285e-06,
"loss": 0.1636,
"mean_token_accuracy": 0.9647632312774658,
"num_tokens": 5712555849.0,
"step": 53950
},
{
"entropy": 1.2396875,
"epoch": 1.3221683561040105,
"grad_norm": 3.71875,
"learning_rate": 1.695184107140337e-06,
"loss": 0.1682,
"mean_token_accuracy": 0.9639084780216217,
"num_tokens": 5717928890.0,
"step": 54000
},
{
"entropy": 1.2246875,
"epoch": 1.3233925860633662,
"grad_norm": 3.921875,
"learning_rate": 1.6897185200911068e-06,
"loss": 0.1468,
"mean_token_accuracy": 0.9690938425064087,
"num_tokens": 5722987021.0,
"step": 54050
},
{
"entropy": 1.2565625,
"epoch": 1.3246168160227216,
"grad_norm": 2.875,
"learning_rate": 1.6842583020240137e-06,
"loss": 0.166,
"mean_token_accuracy": 0.9647270548343658,
"num_tokens": 5728523665.0,
"step": 54100
},
{
"entropy": 1.2253125,
"epoch": 1.3258410459820773,
"grad_norm": 3.046875,
"learning_rate": 1.6788034753127332e-06,
"loss": 0.1509,
"mean_token_accuracy": 0.9676713216304779,
"num_tokens": 5733724051.0,
"step": 54150
},
{
"entropy": 1.2478125,
"epoch": 1.327065275941433,
"grad_norm": 1.7890625,
"learning_rate": 1.6733540623088485e-06,
"loss": 0.1703,
"mean_token_accuracy": 0.9635128057003022,
"num_tokens": 5739544907.0,
"step": 54200
},
{
"entropy": 1.245,
"epoch": 1.3282895059007884,
"grad_norm": 2.25,
"learning_rate": 1.6679100853417647e-06,
"loss": 0.1592,
"mean_token_accuracy": 0.9656123912334442,
"num_tokens": 5744896935.0,
"step": 54250
},
{
"entropy": 1.25453125,
"epoch": 1.3295137358601439,
"grad_norm": 2.53125,
"learning_rate": 1.6624715667186047e-06,
"loss": 0.1756,
"mean_token_accuracy": 0.962364639043808,
"num_tokens": 5750164763.0,
"step": 54300
},
{
"entropy": 1.23609375,
"epoch": 1.3307379658194995,
"grad_norm": 3.15625,
"learning_rate": 1.6570385287241335e-06,
"loss": 0.1577,
"mean_token_accuracy": 0.9660208249092102,
"num_tokens": 5755265140.0,
"step": 54350
},
{
"entropy": 1.25390625,
"epoch": 1.3319621957788552,
"grad_norm": 1.640625,
"learning_rate": 1.6516109936206498e-06,
"loss": 0.1756,
"mean_token_accuracy": 0.9626241695880889,
"num_tokens": 5760623089.0,
"step": 54400
},
{
"entropy": 1.246875,
"epoch": 1.3331864257382107,
"grad_norm": 2.125,
"learning_rate": 1.646188983647912e-06,
"loss": 0.1734,
"mean_token_accuracy": 0.9631841456890107,
"num_tokens": 5766177496.0,
"step": 54450
},
{
"entropy": 1.26140625,
"epoch": 1.3344106556975661,
"grad_norm": 2.921875,
"learning_rate": 1.6407725210230344e-06,
"loss": 0.1766,
"mean_token_accuracy": 0.9622941052913666,
"num_tokens": 5771692920.0,
"step": 54500
},
{
"entropy": 1.2415625,
"epoch": 1.3356348856569218,
"grad_norm": 4.0,
"learning_rate": 1.6353616279404013e-06,
"loss": 0.1569,
"mean_token_accuracy": 0.9662493073940277,
"num_tokens": 5777098724.0,
"step": 54550
},
{
"entropy": 1.23234375,
"epoch": 1.3368591156162775,
"grad_norm": 1.0,
"learning_rate": 1.6299563265715747e-06,
"loss": 0.148,
"mean_token_accuracy": 0.9682403624057769,
"num_tokens": 5782119917.0,
"step": 54600
},
{
"entropy": 1.25578125,
"epoch": 1.338083345575633,
"grad_norm": 3.46875,
"learning_rate": 1.624556639065207e-06,
"loss": 0.1594,
"mean_token_accuracy": 0.9662695753574372,
"num_tokens": 5787291101.0,
"step": 54650
},
{
"entropy": 1.24171875,
"epoch": 1.3393075755349884,
"grad_norm": 3.609375,
"learning_rate": 1.6191625875469446e-06,
"loss": 0.157,
"mean_token_accuracy": 0.9663849449157715,
"num_tokens": 5792520283.0,
"step": 54700
},
{
"entropy": 1.25046875,
"epoch": 1.340531805494344,
"grad_norm": 1.7734375,
"learning_rate": 1.6137741941193398e-06,
"loss": 0.1495,
"mean_token_accuracy": 0.9671278047561646,
"num_tokens": 5797431576.0,
"step": 54750
},
{
"entropy": 1.26546875,
"epoch": 1.3417560354536997,
"grad_norm": 2.734375,
"learning_rate": 1.6083914808617645e-06,
"loss": 0.1765,
"mean_token_accuracy": 0.9622493016719819,
"num_tokens": 5803286714.0,
"step": 54800
},
{
"entropy": 1.224375,
"epoch": 1.3429802654130552,
"grad_norm": 3.109375,
"learning_rate": 1.6030144698303079e-06,
"loss": 0.1544,
"mean_token_accuracy": 0.9669049537181854,
"num_tokens": 5807862828.0,
"step": 54850
},
{
"entropy": 1.26,
"epoch": 1.3442044953724108,
"grad_norm": 3.0,
"learning_rate": 1.5976431830577022e-06,
"loss": 0.1636,
"mean_token_accuracy": 0.964913833141327,
"num_tokens": 5813034358.0,
"step": 54900
},
{
"entropy": 1.25109375,
"epoch": 1.3454287253317663,
"grad_norm": 3.515625,
"learning_rate": 1.5922776425532186e-06,
"loss": 0.1659,
"mean_token_accuracy": 0.9639725112915039,
"num_tokens": 5818413943.0,
"step": 54950
},
{
"entropy": 1.2321875,
"epoch": 1.346652955291122,
"grad_norm": 2.453125,
"learning_rate": 1.5869178703025869e-06,
"loss": 0.1489,
"mean_token_accuracy": 0.9674529373645783,
"num_tokens": 5823085402.0,
"step": 55000
},
{
"entropy": 1.23859375,
"epoch": 1.3478771852504774,
"grad_norm": 1.7109375,
"learning_rate": 1.5815638882678944e-06,
"loss": 0.1608,
"mean_token_accuracy": 0.9654952967166901,
"num_tokens": 5828359072.0,
"step": 55050
},
{
"entropy": 1.2465625,
"epoch": 1.349101415209833,
"grad_norm": 2.8125,
"learning_rate": 1.5762157183875092e-06,
"loss": 0.1618,
"mean_token_accuracy": 0.965077908039093,
"num_tokens": 5833897215.0,
"step": 55100
},
{
"entropy": 1.24125,
"epoch": 1.3503256451691885,
"grad_norm": 2.25,
"learning_rate": 1.5708733825759804e-06,
"loss": 0.1597,
"mean_token_accuracy": 0.9658141255378723,
"num_tokens": 5839005187.0,
"step": 55150
},
{
"entropy": 1.2690625,
"epoch": 1.3515498751285442,
"grad_norm": 3.0625,
"learning_rate": 1.5655369027239507e-06,
"loss": 0.1728,
"mean_token_accuracy": 0.9630602359771728,
"num_tokens": 5844499544.0,
"step": 55200
},
{
"entropy": 1.2484375,
"epoch": 1.3527741050878996,
"grad_norm": 2.734375,
"learning_rate": 1.5602063006980713e-06,
"loss": 0.1606,
"mean_token_accuracy": 0.9662463283538818,
"num_tokens": 5849831304.0,
"step": 55250
},
{
"entropy": 1.243125,
"epoch": 1.3539983350472553,
"grad_norm": 4.125,
"learning_rate": 1.5548815983409054e-06,
"loss": 0.1584,
"mean_token_accuracy": 0.9648811197280884,
"num_tokens": 5854831384.0,
"step": 55300
},
{
"entropy": 1.2475,
"epoch": 1.3552225650066108,
"grad_norm": 0.0169677734375,
"learning_rate": 1.5495628174708422e-06,
"loss": 0.1583,
"mean_token_accuracy": 0.9666490364074707,
"num_tokens": 5860380821.0,
"step": 55350
},
{
"entropy": 1.2428125,
"epoch": 1.3564467949659664,
"grad_norm": 2.96875,
"learning_rate": 1.5442499798820062e-06,
"loss": 0.1636,
"mean_token_accuracy": 0.9649770343303681,
"num_tokens": 5865590076.0,
"step": 55400
},
{
"entropy": 1.25265625,
"epoch": 1.357671024925322,
"grad_norm": 5.0625,
"learning_rate": 1.5389431073441742e-06,
"loss": 0.1625,
"mean_token_accuracy": 0.9651528835296631,
"num_tokens": 5870893580.0,
"step": 55450
},
{
"entropy": 1.2590625,
"epoch": 1.3588952548846776,
"grad_norm": 3.03125,
"learning_rate": 1.5336422216026717e-06,
"loss": 0.1708,
"mean_token_accuracy": 0.9625674414634705,
"num_tokens": 5876137820.0,
"step": 55500
},
{
"entropy": 1.24515625,
"epoch": 1.360119484844033,
"grad_norm": 3.046875,
"learning_rate": 1.5283473443783021e-06,
"loss": 0.1575,
"mean_token_accuracy": 0.9658649146556855,
"num_tokens": 5881136105.0,
"step": 55550
},
{
"entropy": 1.2434375,
"epoch": 1.3613437148033887,
"grad_norm": 3.015625,
"learning_rate": 1.5230584973672404e-06,
"loss": 0.1716,
"mean_token_accuracy": 0.9642657494544983,
"num_tokens": 5886333380.0,
"step": 55600
},
{
"entropy": 1.2628125,
"epoch": 1.3625679447627443,
"grad_norm": 1.6640625,
"learning_rate": 1.5177757022409606e-06,
"loss": 0.1788,
"mean_token_accuracy": 0.9618762648105621,
"num_tokens": 5892147042.0,
"step": 55650
},
{
"entropy": 1.25875,
"epoch": 1.3637921747220998,
"grad_norm": 3.0625,
"learning_rate": 1.5124989806461293e-06,
"loss": 0.1678,
"mean_token_accuracy": 0.9644319689273835,
"num_tokens": 5897583102.0,
"step": 55700
},
{
"entropy": 1.2546875,
"epoch": 1.3650164046814552,
"grad_norm": 2.453125,
"learning_rate": 1.5072283542045348e-06,
"loss": 0.1558,
"mean_token_accuracy": 0.9658961379528046,
"num_tokens": 5902701860.0,
"step": 55750
},
{
"entropy": 1.2584375,
"epoch": 1.366240634640811,
"grad_norm": 3.03125,
"learning_rate": 1.5019638445129849e-06,
"loss": 0.1656,
"mean_token_accuracy": 0.9642118716239929,
"num_tokens": 5908066266.0,
"step": 55800
},
{
"entropy": 1.24375,
"epoch": 1.3674648646001666,
"grad_norm": 1.8203125,
"learning_rate": 1.496705473143224e-06,
"loss": 0.1467,
"mean_token_accuracy": 0.9683407878875733,
"num_tokens": 5913106858.0,
"step": 55850
},
{
"entropy": 1.24109375,
"epoch": 1.368689094559522,
"grad_norm": 2.65625,
"learning_rate": 1.4914532616418477e-06,
"loss": 0.1619,
"mean_token_accuracy": 0.9651940071582794,
"num_tokens": 5918299911.0,
"step": 55900
},
{
"entropy": 1.24296875,
"epoch": 1.3699133245188777,
"grad_norm": 3.015625,
"learning_rate": 1.486207231530207e-06,
"loss": 0.1533,
"mean_token_accuracy": 0.966886637210846,
"num_tokens": 5923373367.0,
"step": 55950
},
{
"entropy": 1.25984375,
"epoch": 1.3711375544782332,
"grad_norm": 2.21875,
"learning_rate": 1.4809674043043262e-06,
"loss": 0.1714,
"mean_token_accuracy": 0.9631552195549011,
"num_tokens": 5928830248.0,
"step": 56000
},
{
"entropy": 1.24640625,
"epoch": 1.3723617844375888,
"grad_norm": 2.84375,
"learning_rate": 1.4757338014348108e-06,
"loss": 0.17,
"mean_token_accuracy": 0.9638724672794342,
"num_tokens": 5934360325.0,
"step": 56050
},
{
"entropy": 1.2428125,
"epoch": 1.3735860143969443,
"grad_norm": 2.296875,
"learning_rate": 1.4705064443667672e-06,
"loss": 0.1672,
"mean_token_accuracy": 0.9640205073356628,
"num_tokens": 5939749032.0,
"step": 56100
},
{
"entropy": 1.2396875,
"epoch": 1.3748102443563,
"grad_norm": 3.140625,
"learning_rate": 1.4652853545196994e-06,
"loss": 0.1698,
"mean_token_accuracy": 0.9635356509685516,
"num_tokens": 5944946908.0,
"step": 56150
},
{
"entropy": 1.2471875,
"epoch": 1.3760344743156554,
"grad_norm": 1.9765625,
"learning_rate": 1.4600705532874409e-06,
"loss": 0.1612,
"mean_token_accuracy": 0.9657069194316864,
"num_tokens": 5950153678.0,
"step": 56200
},
{
"entropy": 1.2515625,
"epoch": 1.377258704275011,
"grad_norm": 1.6953125,
"learning_rate": 1.45486206203805e-06,
"loss": 0.1694,
"mean_token_accuracy": 0.9643088591098785,
"num_tokens": 5955488321.0,
"step": 56250
},
{
"entropy": 1.24984375,
"epoch": 1.3784829342343665,
"grad_norm": 2.296875,
"learning_rate": 1.4496599021137346e-06,
"loss": 0.1802,
"mean_token_accuracy": 0.9621450281143189,
"num_tokens": 5961263793.0,
"step": 56300
},
{
"entropy": 1.22625,
"epoch": 1.3797071641937222,
"grad_norm": 2.203125,
"learning_rate": 1.4444640948307554e-06,
"loss": 0.1567,
"mean_token_accuracy": 0.9664753973484039,
"num_tokens": 5966590895.0,
"step": 56350
},
{
"entropy": 1.2453125,
"epoch": 1.3809313941530776,
"grad_norm": 1.9921875,
"learning_rate": 1.4392746614793446e-06,
"loss": 0.162,
"mean_token_accuracy": 0.9654717576503754,
"num_tokens": 5972160004.0,
"step": 56400
},
{
"entropy": 1.24125,
"epoch": 1.3821556241124333,
"grad_norm": 2.90625,
"learning_rate": 1.4340916233236167e-06,
"loss": 0.1685,
"mean_token_accuracy": 0.9643662881851196,
"num_tokens": 5977855909.0,
"step": 56450
},
{
"entropy": 1.2490625,
"epoch": 1.383379854071789,
"grad_norm": 1.6171875,
"learning_rate": 1.4289150016014792e-06,
"loss": 0.1663,
"mean_token_accuracy": 0.9650551450252532,
"num_tokens": 5983284719.0,
"step": 56500
},
{
"entropy": 1.245,
"epoch": 1.3846040840311444,
"grad_norm": 1.8046875,
"learning_rate": 1.4237448175245523e-06,
"loss": 0.1565,
"mean_token_accuracy": 0.9658044958114624,
"num_tokens": 5988559128.0,
"step": 56550
},
{
"entropy": 1.24140625,
"epoch": 1.3858283139904999,
"grad_norm": 2.234375,
"learning_rate": 1.4185810922780736e-06,
"loss": 0.1665,
"mean_token_accuracy": 0.9643181717395782,
"num_tokens": 5993939256.0,
"step": 56600
},
{
"entropy": 1.245,
"epoch": 1.3870525439498556,
"grad_norm": 2.796875,
"learning_rate": 1.413423847020816e-06,
"loss": 0.1721,
"mean_token_accuracy": 0.963967101573944,
"num_tokens": 5999401709.0,
"step": 56650
},
{
"entropy": 1.23953125,
"epoch": 1.3882767739092112,
"grad_norm": 3.203125,
"learning_rate": 1.4082731028849995e-06,
"loss": 0.1636,
"mean_token_accuracy": 0.9649562358856201,
"num_tokens": 6004763257.0,
"step": 56700
},
{
"entropy": 1.263125,
"epoch": 1.3895010038685667,
"grad_norm": 2.3125,
"learning_rate": 1.4031288809762096e-06,
"loss": 0.1734,
"mean_token_accuracy": 0.9629300630092621,
"num_tokens": 6010451639.0,
"step": 56750
},
{
"entropy": 1.23171875,
"epoch": 1.3907252338279221,
"grad_norm": 2.734375,
"learning_rate": 1.397991202373298e-06,
"loss": 0.16,
"mean_token_accuracy": 0.9664403641223908,
"num_tokens": 6015769794.0,
"step": 56800
},
{
"entropy": 1.24171875,
"epoch": 1.3919494637872778,
"grad_norm": 1.71875,
"learning_rate": 1.3928600881283135e-06,
"loss": 0.1741,
"mean_token_accuracy": 0.9627274203300477,
"num_tokens": 6020957098.0,
"step": 56850
},
{
"entropy": 1.2315625,
"epoch": 1.3931736937466335,
"grad_norm": 0.00994873046875,
"learning_rate": 1.3877355592664005e-06,
"loss": 0.1509,
"mean_token_accuracy": 0.9681152474880218,
"num_tokens": 6026298682.0,
"step": 56900
},
{
"entropy": 1.24703125,
"epoch": 1.394397923705989,
"grad_norm": 2.09375,
"learning_rate": 1.3826176367857244e-06,
"loss": 0.1599,
"mean_token_accuracy": 0.9659165751934051,
"num_tokens": 6031577635.0,
"step": 56950
},
{
"entropy": 1.23828125,
"epoch": 1.3956221536653444,
"grad_norm": 3.734375,
"learning_rate": 1.3775063416573772e-06,
"loss": 0.1602,
"mean_token_accuracy": 0.9653304886817932,
"num_tokens": 6036759854.0,
"step": 57000
},
{
"entropy": 1.23265625,
"epoch": 1.3968463836247,
"grad_norm": 1.6875,
"learning_rate": 1.3724016948252932e-06,
"loss": 0.1561,
"mean_token_accuracy": 0.9671315121650695,
"num_tokens": 6042005844.0,
"step": 57050
},
{
"entropy": 1.245,
"epoch": 1.3980706135840557,
"grad_norm": 4.125,
"learning_rate": 1.3673037172061715e-06,
"loss": 0.1645,
"mean_token_accuracy": 0.9652763676643371,
"num_tokens": 6047109956.0,
"step": 57100
},
{
"entropy": 1.23859375,
"epoch": 1.3992948435434112,
"grad_norm": 3.53125,
"learning_rate": 1.362212429689374e-06,
"loss": 0.1638,
"mean_token_accuracy": 0.9652803325653077,
"num_tokens": 6052155256.0,
"step": 57150
},
{
"entropy": 1.270625,
"epoch": 1.4005190735027668,
"grad_norm": 2.140625,
"learning_rate": 1.3571278531368583e-06,
"loss": 0.1746,
"mean_token_accuracy": 0.9618336653709412,
"num_tokens": 6057754576.0,
"step": 57200
},
{
"entropy": 1.25484375,
"epoch": 1.4017433034621223,
"grad_norm": 1.7109375,
"learning_rate": 1.3520500083830786e-06,
"loss": 0.1611,
"mean_token_accuracy": 0.9656724345684051,
"num_tokens": 6063117197.0,
"step": 57250
},
{
"entropy": 1.25125,
"epoch": 1.402967533421478,
"grad_norm": 2.5625,
"learning_rate": 1.346978916234905e-06,
"loss": 0.1737,
"mean_token_accuracy": 0.9628279542922974,
"num_tokens": 6068604024.0,
"step": 57300
},
{
"entropy": 1.25015625,
"epoch": 1.4041917633808334,
"grad_norm": 3.34375,
"learning_rate": 1.3419145974715394e-06,
"loss": 0.1561,
"mean_token_accuracy": 0.9659430325031281,
"num_tokens": 6073902078.0,
"step": 57350
},
{
"entropy": 1.26703125,
"epoch": 1.405415993340189,
"grad_norm": 3.859375,
"learning_rate": 1.3368570728444298e-06,
"loss": 0.1718,
"mean_token_accuracy": 0.9625124716758728,
"num_tokens": 6079405655.0,
"step": 57400
},
{
"entropy": 1.2446875,
"epoch": 1.4066402232995445,
"grad_norm": 3.828125,
"learning_rate": 1.331806363077184e-06,
"loss": 0.1662,
"mean_token_accuracy": 0.9648419404029847,
"num_tokens": 6084626144.0,
"step": 57450
},
{
"entropy": 1.23234375,
"epoch": 1.4078644532589002,
"grad_norm": 2.5625,
"learning_rate": 1.3267624888654835e-06,
"loss": 0.1479,
"mean_token_accuracy": 0.9676874935626983,
"num_tokens": 6089664069.0,
"step": 57500
},
{
"entropy": 1.255625,
"epoch": 1.4090886832182556,
"grad_norm": 2.609375,
"learning_rate": 1.3217254708770053e-06,
"loss": 0.1648,
"mean_token_accuracy": 0.964464715719223,
"num_tokens": 6095025878.0,
"step": 57550
},
{
"entropy": 1.25140625,
"epoch": 1.4103129131776113,
"grad_norm": 2.34375,
"learning_rate": 1.3166953297513275e-06,
"loss": 0.1638,
"mean_token_accuracy": 0.9649744808673859,
"num_tokens": 6100414900.0,
"step": 57600
},
{
"entropy": 1.24765625,
"epoch": 1.4115371431369668,
"grad_norm": 0.0166015625,
"learning_rate": 1.311672086099852e-06,
"loss": 0.1621,
"mean_token_accuracy": 0.9656559634208679,
"num_tokens": 6105532948.0,
"step": 57650
},
{
"entropy": 1.2375,
"epoch": 1.4127613730963224,
"grad_norm": 2.671875,
"learning_rate": 1.3066557605057167e-06,
"loss": 0.1633,
"mean_token_accuracy": 0.9653026688098908,
"num_tokens": 6110851956.0,
"step": 57700
},
{
"entropy": 1.26578125,
"epoch": 1.413985603055678,
"grad_norm": 1.9921875,
"learning_rate": 1.3016463735237164e-06,
"loss": 0.1721,
"mean_token_accuracy": 0.9625765991210937,
"num_tokens": 6116317682.0,
"step": 57750
},
{
"entropy": 1.2565625,
"epoch": 1.4152098330150336,
"grad_norm": 2.03125,
"learning_rate": 1.2966439456802059e-06,
"loss": 0.1742,
"mean_token_accuracy": 0.9632444334030151,
"num_tokens": 6122164130.0,
"step": 57800
},
{
"entropy": 1.24140625,
"epoch": 1.416434062974389,
"grad_norm": 2.234375,
"learning_rate": 1.2916484974730335e-06,
"loss": 0.1672,
"mean_token_accuracy": 0.9641308975219727,
"num_tokens": 6127574306.0,
"step": 57850
},
{
"entropy": 1.246875,
"epoch": 1.4176582929337447,
"grad_norm": 2.6875,
"learning_rate": 1.2866600493714425e-06,
"loss": 0.1725,
"mean_token_accuracy": 0.9628300058841706,
"num_tokens": 6133295960.0,
"step": 57900
},
{
"entropy": 1.25625,
"epoch": 1.4188825228931004,
"grad_norm": 2.546875,
"learning_rate": 1.281678621815994e-06,
"loss": 0.1727,
"mean_token_accuracy": 0.9640992879867554,
"num_tokens": 6138729294.0,
"step": 57950
},
{
"entropy": 1.21765625,
"epoch": 1.4201067528524558,
"grad_norm": 3.34375,
"learning_rate": 1.276704235218481e-06,
"loss": 0.1483,
"mean_token_accuracy": 0.9675537276268006,
"num_tokens": 6143658701.0,
"step": 58000
},
{
"entropy": 1.24703125,
"epoch": 1.4213309828118112,
"grad_norm": 3.359375,
"learning_rate": 1.2717369099618487e-06,
"loss": 0.168,
"mean_token_accuracy": 0.9638211143016815,
"num_tokens": 6148836685.0,
"step": 58050
},
{
"entropy": 1.2190625,
"epoch": 1.422555212771167,
"grad_norm": 3.296875,
"learning_rate": 1.2667766664001044e-06,
"loss": 0.1527,
"mean_token_accuracy": 0.9670968425273895,
"num_tokens": 6153703845.0,
"step": 58100
},
{
"entropy": 1.23734375,
"epoch": 1.4237794427305226,
"grad_norm": 2.15625,
"learning_rate": 1.2618235248582383e-06,
"loss": 0.1583,
"mean_token_accuracy": 0.9668286955356598,
"num_tokens": 6158817391.0,
"step": 58150
},
{
"entropy": 1.23171875,
"epoch": 1.425003672689878,
"grad_norm": 3.28125,
"learning_rate": 1.2568775056321422e-06,
"loss": 0.1593,
"mean_token_accuracy": 0.9661485147476196,
"num_tokens": 6163833832.0,
"step": 58200
},
{
"entropy": 1.24328125,
"epoch": 1.4262279026492337,
"grad_norm": 1.8515625,
"learning_rate": 1.25193862898852e-06,
"loss": 0.1737,
"mean_token_accuracy": 0.9620695877075195,
"num_tokens": 6169273441.0,
"step": 58250
},
{
"entropy": 1.2259375,
"epoch": 1.4274521326085892,
"grad_norm": 4.15625,
"learning_rate": 1.2470069151648105e-06,
"loss": 0.1605,
"mean_token_accuracy": 0.964862027168274,
"num_tokens": 6174358443.0,
"step": 58300
},
{
"entropy": 1.23609375,
"epoch": 1.4286763625679448,
"grad_norm": 3.1875,
"learning_rate": 1.2420823843691005e-06,
"loss": 0.1665,
"mean_token_accuracy": 0.9651170766353607,
"num_tokens": 6179906475.0,
"step": 58350
},
{
"entropy": 1.2340625,
"epoch": 1.4299005925273003,
"grad_norm": 2.90625,
"learning_rate": 1.2371650567800477e-06,
"loss": 0.1489,
"mean_token_accuracy": 0.967512333393097,
"num_tokens": 6184768923.0,
"step": 58400
},
{
"entropy": 1.250625,
"epoch": 1.431124822486656,
"grad_norm": 3.484375,
"learning_rate": 1.2322549525467878e-06,
"loss": 0.1697,
"mean_token_accuracy": 0.9635206353664398,
"num_tokens": 6190151181.0,
"step": 58450
},
{
"entropy": 1.23453125,
"epoch": 1.4323490524460114,
"grad_norm": 3.65625,
"learning_rate": 1.2273520917888645e-06,
"loss": 0.1624,
"mean_token_accuracy": 0.9650914788246154,
"num_tokens": 6195374468.0,
"step": 58500
},
{
"entropy": 1.24296875,
"epoch": 1.433573282405367,
"grad_norm": 2.046875,
"learning_rate": 1.2224564945961372e-06,
"loss": 0.1738,
"mean_token_accuracy": 0.9630816507339478,
"num_tokens": 6200703908.0,
"step": 58550
},
{
"entropy": 1.21984375,
"epoch": 1.4347975123647225,
"grad_norm": 2.96875,
"learning_rate": 1.2175681810287018e-06,
"loss": 0.142,
"mean_token_accuracy": 0.96914306640625,
"num_tokens": 6205730956.0,
"step": 58600
},
{
"entropy": 1.24125,
"epoch": 1.4360217423240782,
"grad_norm": 3.109375,
"learning_rate": 1.2126871711168126e-06,
"loss": 0.1744,
"mean_token_accuracy": 0.9625077545642853,
"num_tokens": 6211224150.0,
"step": 58650
},
{
"entropy": 1.23828125,
"epoch": 1.4372459722834336,
"grad_norm": 2.890625,
"learning_rate": 1.2078134848607935e-06,
"loss": 0.1578,
"mean_token_accuracy": 0.9665833008289337,
"num_tokens": 6216480413.0,
"step": 58700
},
{
"entropy": 1.22734375,
"epoch": 1.4384702022427893,
"grad_norm": 2.328125,
"learning_rate": 1.2029471422309593e-06,
"loss": 0.1592,
"mean_token_accuracy": 0.9655974650382996,
"num_tokens": 6221594113.0,
"step": 58750
},
{
"entropy": 1.2396875,
"epoch": 1.4396944322021448,
"grad_norm": 3.15625,
"learning_rate": 1.1980881631675338e-06,
"loss": 0.1642,
"mean_token_accuracy": 0.9646211445331574,
"num_tokens": 6226912535.0,
"step": 58800
},
{
"entropy": 1.2421875,
"epoch": 1.4409186621615004,
"grad_norm": 2.953125,
"learning_rate": 1.1932365675805704e-06,
"loss": 0.1704,
"mean_token_accuracy": 0.9632949602603912,
"num_tokens": 6232510565.0,
"step": 58850
},
{
"entropy": 1.2271875,
"epoch": 1.442142892120856,
"grad_norm": 2.5625,
"learning_rate": 1.1883923753498652e-06,
"loss": 0.1629,
"mean_token_accuracy": 0.9651079893112182,
"num_tokens": 6237750599.0,
"step": 58900
},
{
"entropy": 1.235,
"epoch": 1.4433671220802116,
"grad_norm": 2.4375,
"learning_rate": 1.1835556063248796e-06,
"loss": 0.157,
"mean_token_accuracy": 0.9665428209304809,
"num_tokens": 6243089430.0,
"step": 58950
},
{
"entropy": 1.22171875,
"epoch": 1.4445913520395672,
"grad_norm": 1.9453125,
"learning_rate": 1.1787262803246568e-06,
"loss": 0.159,
"mean_token_accuracy": 0.9651802563667298,
"num_tokens": 6248152093.0,
"step": 59000
},
{
"entropy": 1.2453125,
"epoch": 1.4458155819989227,
"grad_norm": 2.078125,
"learning_rate": 1.1739044171377455e-06,
"loss": 0.1685,
"mean_token_accuracy": 0.963554357290268,
"num_tokens": 6253653648.0,
"step": 59050
},
{
"entropy": 1.24859375,
"epoch": 1.4470398119582781,
"grad_norm": 2.171875,
"learning_rate": 1.1690900365221082e-06,
"loss": 0.1675,
"mean_token_accuracy": 0.9636942827701569,
"num_tokens": 6259395328.0,
"step": 59100
},
{
"entropy": 1.23515625,
"epoch": 1.4482640419176338,
"grad_norm": 1.9609375,
"learning_rate": 1.164283158205053e-06,
"loss": 0.163,
"mean_token_accuracy": 0.9648255848884583,
"num_tokens": 6264597318.0,
"step": 59150
},
{
"entropy": 1.22296875,
"epoch": 1.4494882718769895,
"grad_norm": 2.890625,
"learning_rate": 1.1594838018831444e-06,
"loss": 0.1506,
"mean_token_accuracy": 0.9675889956951141,
"num_tokens": 6269482590.0,
"step": 59200
},
{
"entropy": 1.26875,
"epoch": 1.450712501836345,
"grad_norm": 3.71875,
"learning_rate": 1.1546919872221238e-06,
"loss": 0.1858,
"mean_token_accuracy": 0.9605572533607483,
"num_tokens": 6275753206.0,
"step": 59250
},
{
"entropy": 1.235625,
"epoch": 1.4519367317957004,
"grad_norm": 3.34375,
"learning_rate": 1.1499077338568329e-06,
"loss": 0.1589,
"mean_token_accuracy": 0.9655532228946686,
"num_tokens": 6281061992.0,
"step": 59300
},
{
"entropy": 1.2371875,
"epoch": 1.453160961755056,
"grad_norm": 2.09375,
"learning_rate": 1.1451310613911282e-06,
"loss": 0.1668,
"mean_token_accuracy": 0.9643084633350373,
"num_tokens": 6286356933.0,
"step": 59350
},
{
"entropy": 1.22546875,
"epoch": 1.4543851917144117,
"grad_norm": 0.0068359375,
"learning_rate": 1.1403619893978035e-06,
"loss": 0.1536,
"mean_token_accuracy": 0.9669885611534119,
"num_tokens": 6291298254.0,
"step": 59400
},
{
"entropy": 1.22484375,
"epoch": 1.4556094216737672,
"grad_norm": 2.734375,
"learning_rate": 1.1356005374185075e-06,
"loss": 0.1541,
"mean_token_accuracy": 0.9667747104167939,
"num_tokens": 6296386141.0,
"step": 59450
},
{
"entropy": 1.233125,
"epoch": 1.4568336516331228,
"grad_norm": 2.890625,
"learning_rate": 1.1308467249636693e-06,
"loss": 0.1546,
"mean_token_accuracy": 0.9666030180454254,
"num_tokens": 6301578433.0,
"step": 59500
},
{
"entropy": 1.228125,
"epoch": 1.4580578815924783,
"grad_norm": 2.171875,
"learning_rate": 1.1261005715124106e-06,
"loss": 0.1653,
"mean_token_accuracy": 0.9642830669879914,
"num_tokens": 6306834089.0,
"step": 59550
},
{
"entropy": 1.24140625,
"epoch": 1.459282111551834,
"grad_norm": 3.28125,
"learning_rate": 1.1213620965124711e-06,
"loss": 0.1713,
"mean_token_accuracy": 0.9641312193870545,
"num_tokens": 6312270957.0,
"step": 59600
},
{
"entropy": 1.238125,
"epoch": 1.4605063415111894,
"grad_norm": 2.46875,
"learning_rate": 1.1166313193801264e-06,
"loss": 0.1717,
"mean_token_accuracy": 0.9619838237762451,
"num_tokens": 6317571444.0,
"step": 59650
},
{
"entropy": 1.235,
"epoch": 1.461730571470545,
"grad_norm": 1.6328125,
"learning_rate": 1.1119082595001127e-06,
"loss": 0.1617,
"mean_token_accuracy": 0.9648803687095642,
"num_tokens": 6322810865.0,
"step": 59700
},
{
"entropy": 1.24390625,
"epoch": 1.4629548014299005,
"grad_norm": 2.421875,
"learning_rate": 1.1071929362255407e-06,
"loss": 0.1768,
"mean_token_accuracy": 0.9624212658405304,
"num_tokens": 6328065527.0,
"step": 59750
},
{
"entropy": 1.2346875,
"epoch": 1.4641790313892562,
"grad_norm": 1.90625,
"learning_rate": 1.102485368877821e-06,
"loss": 0.1547,
"mean_token_accuracy": 0.96669025182724,
"num_tokens": 6332934140.0,
"step": 59800
},
{
"entropy": 1.24796875,
"epoch": 1.4654032613486117,
"grad_norm": 2.34375,
"learning_rate": 1.0977855767465834e-06,
"loss": 0.1683,
"mean_token_accuracy": 0.9648297607898713,
"num_tokens": 6338286149.0,
"step": 59850
},
{
"entropy": 1.23640625,
"epoch": 1.4666274913079673,
"grad_norm": 2.703125,
"learning_rate": 1.0930935790895982e-06,
"loss": 0.1481,
"mean_token_accuracy": 0.9682129454612732,
"num_tokens": 6343347728.0,
"step": 59900
},
{
"entropy": 1.2359375,
"epoch": 1.4678517212673228,
"grad_norm": 1.65625,
"learning_rate": 1.0884093951326982e-06,
"loss": 0.1662,
"mean_token_accuracy": 0.9638714647293091,
"num_tokens": 6348595585.0,
"step": 59950
},
{
"entropy": 1.23671875,
"epoch": 1.4690759512266784,
"grad_norm": 2.0625,
"learning_rate": 1.083733044069698e-06,
"loss": 0.1533,
"mean_token_accuracy": 0.9660887753963471,
"num_tokens": 6353539392.0,
"step": 60000
},
{
"epoch": 1.4690759512266784,
"eval_entropy": 1.2380208333333333,
"eval_loss": 0.17763087153434753,
"eval_mean_token_accuracy": 0.9620065187414487,
"eval_num_tokens": 6353539392.0,
"eval_runtime": 603.0528,
"eval_samples_per_second": 16.012,
"eval_steps_per_second": 0.201,
"step": 60000
},
{
"entropy": 1.229375,
"epoch": 1.4703001811860341,
"grad_norm": 2.96875,
"learning_rate": 1.0790645450623166e-06,
"loss": 0.1552,
"mean_token_accuracy": 0.9666960227489472,
"num_tokens": 6358769999.0,
"step": 60050
},
{
"entropy": 1.23546875,
"epoch": 1.4715244111453896,
"grad_norm": 3.0,
"learning_rate": 1.0744039172400965e-06,
"loss": 0.1538,
"mean_token_accuracy": 0.9672531485557556,
"num_tokens": 6363778830.0,
"step": 60100
},
{
"entropy": 1.21921875,
"epoch": 1.472748641104745,
"grad_norm": 2.171875,
"learning_rate": 1.0697511797003325e-06,
"loss": 0.1562,
"mean_token_accuracy": 0.9664645326137543,
"num_tokens": 6368813861.0,
"step": 60150
},
{
"entropy": 1.2353125,
"epoch": 1.4739728710641007,
"grad_norm": 2.296875,
"learning_rate": 1.0651063515079833e-06,
"loss": 0.1537,
"mean_token_accuracy": 0.9665102756023407,
"num_tokens": 6374106711.0,
"step": 60200
},
{
"entropy": 1.22546875,
"epoch": 1.4751971010234564,
"grad_norm": 1.75,
"learning_rate": 1.0604694516956e-06,
"loss": 0.151,
"mean_token_accuracy": 0.9675907123088837,
"num_tokens": 6379244247.0,
"step": 60250
},
{
"entropy": 1.22953125,
"epoch": 1.4764213309828118,
"grad_norm": 2.71875,
"learning_rate": 1.055840499263247e-06,
"loss": 0.1624,
"mean_token_accuracy": 0.964186635017395,
"num_tokens": 6384481392.0,
"step": 60300
},
{
"entropy": 1.23578125,
"epoch": 1.4776455609421673,
"grad_norm": 3.5625,
"learning_rate": 1.0512195131784247e-06,
"loss": 0.1575,
"mean_token_accuracy": 0.965451090335846,
"num_tokens": 6389460183.0,
"step": 60350
},
{
"entropy": 1.2571875,
"epoch": 1.478869790901523,
"grad_norm": 2.359375,
"learning_rate": 1.0466065123759882e-06,
"loss": 0.1706,
"mean_token_accuracy": 0.9634547913074494,
"num_tokens": 6395040346.0,
"step": 60400
},
{
"entropy": 1.2428125,
"epoch": 1.4800940208608786,
"grad_norm": 2.578125,
"learning_rate": 1.0420015157580736e-06,
"loss": 0.1614,
"mean_token_accuracy": 0.9662553632259369,
"num_tokens": 6400379406.0,
"step": 60450
},
{
"entropy": 1.2384375,
"epoch": 1.481318250820234,
"grad_norm": 2.03125,
"learning_rate": 1.0374045421940215e-06,
"loss": 0.1574,
"mean_token_accuracy": 0.9662669360637665,
"num_tokens": 6405924043.0,
"step": 60500
},
{
"entropy": 1.24515625,
"epoch": 1.4825424807795895,
"grad_norm": 2.609375,
"learning_rate": 1.0328156105202916e-06,
"loss": 0.1666,
"mean_token_accuracy": 0.9644035375118256,
"num_tokens": 6411487076.0,
"step": 60550
},
{
"entropy": 1.22125,
"epoch": 1.4837667107389452,
"grad_norm": 2.59375,
"learning_rate": 1.0282347395403978e-06,
"loss": 0.1501,
"mean_token_accuracy": 0.9667956507205964,
"num_tokens": 6416699077.0,
"step": 60600
},
{
"entropy": 1.22921875,
"epoch": 1.4849909406983008,
"grad_norm": 3.0625,
"learning_rate": 1.0236619480248205e-06,
"loss": 0.1649,
"mean_token_accuracy": 0.9641565144062042,
"num_tokens": 6421663477.0,
"step": 60650
},
{
"entropy": 1.223125,
"epoch": 1.4862151706576563,
"grad_norm": 0.012939453125,
"learning_rate": 1.0190972547109352e-06,
"loss": 0.1465,
"mean_token_accuracy": 0.9683601307868958,
"num_tokens": 6426657308.0,
"step": 60700
},
{
"entropy": 1.23296875,
"epoch": 1.487439400617012,
"grad_norm": 2.34375,
"learning_rate": 1.0145406783029337e-06,
"loss": 0.1654,
"mean_token_accuracy": 0.9649899744987488,
"num_tokens": 6432023783.0,
"step": 60750
},
{
"entropy": 1.2375,
"epoch": 1.4886636305763674,
"grad_norm": 3.046875,
"learning_rate": 1.0099922374717499e-06,
"loss": 0.162,
"mean_token_accuracy": 0.9657110357284546,
"num_tokens": 6437497556.0,
"step": 60800
},
{
"entropy": 1.2409375,
"epoch": 1.489887860535723,
"grad_norm": 3.96875,
"learning_rate": 1.0054519508549797e-06,
"loss": 0.177,
"mean_token_accuracy": 0.9619574582576752,
"num_tokens": 6443350702.0,
"step": 60850
},
{
"entropy": 1.23125,
"epoch": 1.4911120904950785,
"grad_norm": 2.78125,
"learning_rate": 1.0009198370568066e-06,
"loss": 0.1627,
"mean_token_accuracy": 0.964564654827118,
"num_tokens": 6448658491.0,
"step": 60900
},
{
"entropy": 1.2153125,
"epoch": 1.4923363204544342,
"grad_norm": 3.15625,
"learning_rate": 9.96395914647927e-07,
"loss": 0.1507,
"mean_token_accuracy": 0.9671316814422607,
"num_tokens": 6453556941.0,
"step": 60950
},
{
"entropy": 1.21390625,
"epoch": 1.4935605504137897,
"grad_norm": 2.5,
"learning_rate": 9.91880202165471e-07,
"loss": 0.1637,
"mean_token_accuracy": 0.964778846502304,
"num_tokens": 6458635664.0,
"step": 61000
},
{
"entropy": 1.23234375,
"epoch": 1.4947847803731453,
"grad_norm": 2.671875,
"learning_rate": 9.873727181129275e-07,
"loss": 0.17,
"mean_token_accuracy": 0.9645189070701599,
"num_tokens": 6464088495.0,
"step": 61050
},
{
"entropy": 1.23078125,
"epoch": 1.4960090103325008,
"grad_norm": 2.953125,
"learning_rate": 9.828734809600687e-07,
"loss": 0.1594,
"mean_token_accuracy": 0.9656787288188934,
"num_tokens": 6469234194.0,
"step": 61100
},
{
"entropy": 1.22921875,
"epoch": 1.4972332402918564,
"grad_norm": 2.25,
"learning_rate": 9.783825091428782e-07,
"loss": 0.1618,
"mean_token_accuracy": 0.965996481180191,
"num_tokens": 6474528140.0,
"step": 61150
},
{
"entropy": 1.2309375,
"epoch": 1.498457470251212,
"grad_norm": 2.234375,
"learning_rate": 9.738998210634644e-07,
"loss": 0.1728,
"mean_token_accuracy": 0.9626871156692505,
"num_tokens": 6480082901.0,
"step": 61200
},
{
"entropy": 1.23,
"epoch": 1.4996817002105676,
"grad_norm": 3.796875,
"learning_rate": 9.694254350900005e-07,
"loss": 0.1585,
"mean_token_accuracy": 0.9654109764099121,
"num_tokens": 6485373470.0,
"step": 61250
},
{
"entropy": 1.24640625,
"epoch": 1.5009059301699232,
"grad_norm": 2.15625,
"learning_rate": 9.649593695566355e-07,
"loss": 0.1673,
"mean_token_accuracy": 0.9639886951446534,
"num_tokens": 6490817618.0,
"step": 61300
},
{
"entropy": 1.2215625,
"epoch": 1.5021301601292787,
"grad_norm": 2.34375,
"learning_rate": 9.605016427634272e-07,
"loss": 0.1513,
"mean_token_accuracy": 0.9674781799316406,
"num_tokens": 6495843357.0,
"step": 61350
},
{
"entropy": 1.2359375,
"epoch": 1.5033543900886341,
"grad_norm": 1.9609375,
"learning_rate": 9.560522729762628e-07,
"loss": 0.1621,
"mean_token_accuracy": 0.96533607006073,
"num_tokens": 6500949587.0,
"step": 61400
},
{
"entropy": 1.2359375,
"epoch": 1.5045786200479898,
"grad_norm": 2.875,
"learning_rate": 9.516112784267896e-07,
"loss": 0.1714,
"mean_token_accuracy": 0.9637338280677795,
"num_tokens": 6506340396.0,
"step": 61450
},
{
"entropy": 1.2215625,
"epoch": 1.5058028500073455,
"grad_norm": 4.65625,
"learning_rate": 9.471786773123337e-07,
"loss": 0.1591,
"mean_token_accuracy": 0.9650114715099335,
"num_tokens": 6511689926.0,
"step": 61500
},
{
"entropy": 1.243125,
"epoch": 1.507027079966701,
"grad_norm": 2.859375,
"learning_rate": 9.427544877958278e-07,
"loss": 0.1678,
"mean_token_accuracy": 0.9641639375686646,
"num_tokens": 6517204008.0,
"step": 61550
},
{
"entropy": 1.21296875,
"epoch": 1.5082513099260564,
"grad_norm": 2.453125,
"learning_rate": 9.383387280057409e-07,
"loss": 0.1615,
"mean_token_accuracy": 0.9646773946285248,
"num_tokens": 6522483140.0,
"step": 61600
},
{
"entropy": 1.23234375,
"epoch": 1.509475539885412,
"grad_norm": 2.953125,
"learning_rate": 9.339314160359977e-07,
"loss": 0.1588,
"mean_token_accuracy": 0.9658515179157257,
"num_tokens": 6527644811.0,
"step": 61650
},
{
"entropy": 1.233125,
"epoch": 1.5106997698447677,
"grad_norm": 3.125,
"learning_rate": 9.295325699459082e-07,
"loss": 0.1629,
"mean_token_accuracy": 0.9652837121486664,
"num_tokens": 6532774529.0,
"step": 61700
},
{
"entropy": 1.22390625,
"epoch": 1.5119239998041232,
"grad_norm": 3.90625,
"learning_rate": 9.251422077600911e-07,
"loss": 0.1658,
"mean_token_accuracy": 0.9642423093318939,
"num_tokens": 6538188895.0,
"step": 61750
},
{
"entropy": 1.223125,
"epoch": 1.5131482297634786,
"grad_norm": 2.84375,
"learning_rate": 9.207603474684063e-07,
"loss": 0.1576,
"mean_token_accuracy": 0.9674744582176209,
"num_tokens": 6543389288.0,
"step": 61800
},
{
"entropy": 1.22765625,
"epoch": 1.5143724597228343,
"grad_norm": 2.375,
"learning_rate": 9.163870070258698e-07,
"loss": 0.1563,
"mean_token_accuracy": 0.9665237212181091,
"num_tokens": 6548548612.0,
"step": 61850
},
{
"entropy": 1.22171875,
"epoch": 1.51559668968219,
"grad_norm": 3.375,
"learning_rate": 9.120222043525931e-07,
"loss": 0.1515,
"mean_token_accuracy": 0.9670096004009247,
"num_tokens": 6553657775.0,
"step": 61900
},
{
"entropy": 1.23703125,
"epoch": 1.5168209196415454,
"grad_norm": 2.15625,
"learning_rate": 9.076659573337e-07,
"loss": 0.1619,
"mean_token_accuracy": 0.9654546058177949,
"num_tokens": 6559027325.0,
"step": 61950
},
{
"entropy": 1.22515625,
"epoch": 1.5180451496009009,
"grad_norm": 1.8359375,
"learning_rate": 9.033182838192564e-07,
"loss": 0.1595,
"mean_token_accuracy": 0.9660532510280609,
"num_tokens": 6564515287.0,
"step": 62000
},
{
"entropy": 1.236875,
"epoch": 1.5192693795602565,
"grad_norm": 1.9375,
"learning_rate": 8.98979201624201e-07,
"loss": 0.161,
"mean_token_accuracy": 0.9655505573749542,
"num_tokens": 6569987402.0,
"step": 62050
},
{
"entropy": 1.249375,
"epoch": 1.5204936095196122,
"grad_norm": 3.125,
"learning_rate": 8.946487285282659e-07,
"loss": 0.1724,
"mean_token_accuracy": 0.9626421999931335,
"num_tokens": 6575526706.0,
"step": 62100
},
{
"entropy": 1.225,
"epoch": 1.5217178394789679,
"grad_norm": 3.65625,
"learning_rate": 8.903268822759075e-07,
"loss": 0.1615,
"mean_token_accuracy": 0.966062741279602,
"num_tokens": 6580795009.0,
"step": 62150
},
{
"entropy": 1.23375,
"epoch": 1.5229420694383233,
"grad_norm": 4.5,
"learning_rate": 8.860136805762319e-07,
"loss": 0.1617,
"mean_token_accuracy": 0.9658437705039978,
"num_tokens": 6586016211.0,
"step": 62200
},
{
"entropy": 1.2359375,
"epoch": 1.5241662993976788,
"grad_norm": 2.359375,
"learning_rate": 8.817091411029271e-07,
"loss": 0.1593,
"mean_token_accuracy": 0.966154944896698,
"num_tokens": 6591160444.0,
"step": 62250
},
{
"entropy": 1.2134375,
"epoch": 1.5253905293570345,
"grad_norm": 2.390625,
"learning_rate": 8.774132814941828e-07,
"loss": 0.1579,
"mean_token_accuracy": 0.9668165516853332,
"num_tokens": 6596300228.0,
"step": 62300
},
{
"entropy": 1.21953125,
"epoch": 1.5266147593163901,
"grad_norm": 2.859375,
"learning_rate": 8.731261193526248e-07,
"loss": 0.1586,
"mean_token_accuracy": 0.9657115602493286,
"num_tokens": 6601689242.0,
"step": 62350
},
{
"entropy": 1.2521875,
"epoch": 1.5278389892757456,
"grad_norm": 3.046875,
"learning_rate": 8.688476722452379e-07,
"loss": 0.1732,
"mean_token_accuracy": 0.9633473336696625,
"num_tokens": 6607301069.0,
"step": 62400
},
{
"entropy": 1.23234375,
"epoch": 1.529063219235101,
"grad_norm": 2.0,
"learning_rate": 8.645779577033011e-07,
"loss": 0.1655,
"mean_token_accuracy": 0.9651632213592529,
"num_tokens": 6612690182.0,
"step": 62450
},
{
"entropy": 1.22234375,
"epoch": 1.5302874491944567,
"grad_norm": 2.234375,
"learning_rate": 8.603169932223042e-07,
"loss": 0.1644,
"mean_token_accuracy": 0.9645105350017548,
"num_tokens": 6618066965.0,
"step": 62500
},
{
"entropy": 1.2134375,
"epoch": 1.5315116791538124,
"grad_norm": 3.03125,
"learning_rate": 8.560647962618894e-07,
"loss": 0.1473,
"mean_token_accuracy": 0.9680246078968048,
"num_tokens": 6623009283.0,
"step": 62550
},
{
"entropy": 1.2359375,
"epoch": 1.5327359091131678,
"grad_norm": 2.9375,
"learning_rate": 8.518213842457696e-07,
"loss": 0.1684,
"mean_token_accuracy": 0.9639063477516174,
"num_tokens": 6628694150.0,
"step": 62600
},
{
"entropy": 1.2365625,
"epoch": 1.5339601390725233,
"grad_norm": 2.15625,
"learning_rate": 8.475867745616605e-07,
"loss": 0.1699,
"mean_token_accuracy": 0.9639629209041596,
"num_tokens": 6634163539.0,
"step": 62650
},
{
"entropy": 1.23515625,
"epoch": 1.535184369031879,
"grad_norm": 1.8125,
"learning_rate": 8.433609845612123e-07,
"loss": 0.1681,
"mean_token_accuracy": 0.9637242484092713,
"num_tokens": 6639673078.0,
"step": 62700
},
{
"entropy": 1.21796875,
"epoch": 1.5364085989912346,
"grad_norm": 2.1875,
"learning_rate": 8.39144031559933e-07,
"loss": 0.1653,
"mean_token_accuracy": 0.9641383695602417,
"num_tokens": 6645021375.0,
"step": 62750
},
{
"entropy": 1.218125,
"epoch": 1.53763282895059,
"grad_norm": 3.65625,
"learning_rate": 8.349359328371241e-07,
"loss": 0.1557,
"mean_token_accuracy": 0.9672486507892608,
"num_tokens": 6650282385.0,
"step": 62800
},
{
"entropy": 1.24453125,
"epoch": 1.5388570589099455,
"grad_norm": 2.265625,
"learning_rate": 8.307367056357993e-07,
"loss": 0.1744,
"mean_token_accuracy": 0.9627921509742737,
"num_tokens": 6655617849.0,
"step": 62850
},
{
"entropy": 1.2384375,
"epoch": 1.5400812888693012,
"grad_norm": 2.234375,
"learning_rate": 8.265463671626277e-07,
"loss": 0.1643,
"mean_token_accuracy": 0.9646277320384979,
"num_tokens": 6660898400.0,
"step": 62900
},
{
"entropy": 1.2315625,
"epoch": 1.5413055188286569,
"grad_norm": 2.53125,
"learning_rate": 8.223649345878521e-07,
"loss": 0.1595,
"mean_token_accuracy": 0.9663047862052917,
"num_tokens": 6666546321.0,
"step": 62950
},
{
"entropy": 1.21890625,
"epoch": 1.5425297487880123,
"grad_norm": 0.01312255859375,
"learning_rate": 8.181924250452234e-07,
"loss": 0.1479,
"mean_token_accuracy": 0.9685621929168701,
"num_tokens": 6671900409.0,
"step": 63000
},
{
"entropy": 1.22140625,
"epoch": 1.5437539787473677,
"grad_norm": 2.578125,
"learning_rate": 8.140288556319295e-07,
"loss": 0.1564,
"mean_token_accuracy": 0.9663173937797547,
"num_tokens": 6676916235.0,
"step": 63050
},
{
"entropy": 1.2315625,
"epoch": 1.5449782087067234,
"grad_norm": 2.171875,
"learning_rate": 8.098742434085274e-07,
"loss": 0.1619,
"mean_token_accuracy": 0.9653417527675628,
"num_tokens": 6681811077.0,
"step": 63100
},
{
"entropy": 1.2278125,
"epoch": 1.546202438666079,
"grad_norm": 2.609375,
"learning_rate": 8.057286053988688e-07,
"loss": 0.155,
"mean_token_accuracy": 0.9668863129615783,
"num_tokens": 6687079259.0,
"step": 63150
},
{
"entropy": 1.23734375,
"epoch": 1.5474266686254348,
"grad_norm": 2.3125,
"learning_rate": 8.015919585900328e-07,
"loss": 0.1698,
"mean_token_accuracy": 0.9634287714958191,
"num_tokens": 6692413841.0,
"step": 63200
},
{
"entropy": 1.20328125,
"epoch": 1.5486508985847902,
"grad_norm": 2.28125,
"learning_rate": 7.974643199322591e-07,
"loss": 0.1459,
"mean_token_accuracy": 0.9686257600784302,
"num_tokens": 6697530112.0,
"step": 63250
},
{
"entropy": 1.238125,
"epoch": 1.5498751285441457,
"grad_norm": 2.84375,
"learning_rate": 7.933457063388733e-07,
"loss": 0.171,
"mean_token_accuracy": 0.9629907369613647,
"num_tokens": 6702988908.0,
"step": 63300
},
{
"entropy": 1.215625,
"epoch": 1.5510993585035013,
"grad_norm": 1.6953125,
"learning_rate": 7.892361346862206e-07,
"loss": 0.1588,
"mean_token_accuracy": 0.9652132534980774,
"num_tokens": 6708127766.0,
"step": 63350
},
{
"entropy": 1.216875,
"epoch": 1.552323588462857,
"grad_norm": 3.546875,
"learning_rate": 7.851356218135953e-07,
"loss": 0.1565,
"mean_token_accuracy": 0.9663667130470276,
"num_tokens": 6713202542.0,
"step": 63400
},
{
"entropy": 1.2165625,
"epoch": 1.5535478184222125,
"grad_norm": 3.546875,
"learning_rate": 7.810441845231768e-07,
"loss": 0.1562,
"mean_token_accuracy": 0.9665763390064239,
"num_tokens": 6718170250.0,
"step": 63450
},
{
"entropy": 1.239375,
"epoch": 1.554772048381568,
"grad_norm": 2.671875,
"learning_rate": 7.769618395799495e-07,
"loss": 0.1701,
"mean_token_accuracy": 0.9642471766471863,
"num_tokens": 6723417011.0,
"step": 63500
},
{
"entropy": 1.20984375,
"epoch": 1.5559962783409236,
"grad_norm": 2.6875,
"learning_rate": 7.728886037116482e-07,
"loss": 0.1445,
"mean_token_accuracy": 0.9684971439838409,
"num_tokens": 6728453094.0,
"step": 63550
},
{
"entropy": 1.22625,
"epoch": 1.5572205083002792,
"grad_norm": 3.203125,
"learning_rate": 7.688244936086779e-07,
"loss": 0.1591,
"mean_token_accuracy": 0.9653982555866242,
"num_tokens": 6733460582.0,
"step": 63600
},
{
"entropy": 1.23765625,
"epoch": 1.5584447382596347,
"grad_norm": 1.5625,
"learning_rate": 7.64769525924052e-07,
"loss": 0.1631,
"mean_token_accuracy": 0.9650383579730988,
"num_tokens": 6739025377.0,
"step": 63650
},
{
"entropy": 1.241875,
"epoch": 1.5596689682189901,
"grad_norm": 1.921875,
"learning_rate": 7.607237172733212e-07,
"loss": 0.1629,
"mean_token_accuracy": 0.9644639611244201,
"num_tokens": 6744632607.0,
"step": 63700
},
{
"entropy": 1.21015625,
"epoch": 1.5608931981783458,
"grad_norm": 2.8125,
"learning_rate": 7.566870842345078e-07,
"loss": 0.1438,
"mean_token_accuracy": 0.9694548106193542,
"num_tokens": 6749711105.0,
"step": 63750
},
{
"entropy": 1.22625,
"epoch": 1.5621174281377015,
"grad_norm": 2.5625,
"learning_rate": 7.526596433480352e-07,
"loss": 0.162,
"mean_token_accuracy": 0.9650256216526032,
"num_tokens": 6755001114.0,
"step": 63800
},
{
"entropy": 1.2203125,
"epoch": 1.563341658097057,
"grad_norm": 2.078125,
"learning_rate": 7.486414111166603e-07,
"loss": 0.1585,
"mean_token_accuracy": 0.9653235769271851,
"num_tokens": 6760148593.0,
"step": 63850
},
{
"entropy": 1.22015625,
"epoch": 1.5645658880564124,
"grad_norm": 4.15625,
"learning_rate": 7.446324040054098e-07,
"loss": 0.1545,
"mean_token_accuracy": 0.9676208901405334,
"num_tokens": 6765196202.0,
"step": 63900
},
{
"entropy": 1.2396875,
"epoch": 1.565790118015768,
"grad_norm": 1.6640625,
"learning_rate": 7.406326384415069e-07,
"loss": 0.1645,
"mean_token_accuracy": 0.964854439496994,
"num_tokens": 6770864758.0,
"step": 63950
},
{
"entropy": 1.23265625,
"epoch": 1.5670143479751237,
"grad_norm": 4.53125,
"learning_rate": 7.366421308143074e-07,
"loss": 0.1678,
"mean_token_accuracy": 0.9636308062076568,
"num_tokens": 6776309871.0,
"step": 64000
},
{
"entropy": 1.22203125,
"epoch": 1.5682385779344792,
"grad_norm": 2.484375,
"learning_rate": 7.326608974752318e-07,
"loss": 0.1537,
"mean_token_accuracy": 0.9670063924789428,
"num_tokens": 6781591477.0,
"step": 64050
},
{
"entropy": 1.23421875,
"epoch": 1.5694628078938346,
"grad_norm": 3.671875,
"learning_rate": 7.286889547377019e-07,
"loss": 0.1576,
"mean_token_accuracy": 0.9661747896671296,
"num_tokens": 6787008758.0,
"step": 64100
},
{
"entropy": 1.2321875,
"epoch": 1.5706870378531903,
"grad_norm": 2.921875,
"learning_rate": 7.247263188770635e-07,
"loss": 0.1658,
"mean_token_accuracy": 0.9641131579875946,
"num_tokens": 6792453198.0,
"step": 64150
},
{
"entropy": 1.23484375,
"epoch": 1.571911267812546,
"grad_norm": 3.59375,
"learning_rate": 7.207730061305342e-07,
"loss": 0.1715,
"mean_token_accuracy": 0.9631493031978607,
"num_tokens": 6798199941.0,
"step": 64200
},
{
"entropy": 1.2353125,
"epoch": 1.5731354977719014,
"grad_norm": 3.3125,
"learning_rate": 7.168290326971248e-07,
"loss": 0.1629,
"mean_token_accuracy": 0.9649174082279205,
"num_tokens": 6803443062.0,
"step": 64250
},
{
"entropy": 1.220625,
"epoch": 1.5743597277312569,
"grad_norm": 2.203125,
"learning_rate": 7.128944147375779e-07,
"loss": 0.1518,
"mean_token_accuracy": 0.967359025478363,
"num_tokens": 6808707076.0,
"step": 64300
},
{
"entropy": 1.2209375,
"epoch": 1.5755839576906125,
"grad_norm": 1.9375,
"learning_rate": 7.08969168374304e-07,
"loss": 0.1596,
"mean_token_accuracy": 0.9663796508312226,
"num_tokens": 6813958298.0,
"step": 64350
},
{
"entropy": 1.2228125,
"epoch": 1.5768081876499682,
"grad_norm": 3.828125,
"learning_rate": 7.050533096913104e-07,
"loss": 0.162,
"mean_token_accuracy": 0.9654451417922973,
"num_tokens": 6819296053.0,
"step": 64400
},
{
"entropy": 1.228125,
"epoch": 1.578032417609324,
"grad_norm": 2.390625,
"learning_rate": 7.011468547341376e-07,
"loss": 0.1488,
"mean_token_accuracy": 0.9677229869365692,
"num_tokens": 6824596472.0,
"step": 64450
},
{
"entropy": 1.23953125,
"epoch": 1.5792566475686793,
"grad_norm": 2.84375,
"learning_rate": 6.972498195097937e-07,
"loss": 0.1723,
"mean_token_accuracy": 0.962650990486145,
"num_tokens": 6830407037.0,
"step": 64500
},
{
"entropy": 1.230625,
"epoch": 1.5804808775280348,
"grad_norm": 2.375,
"learning_rate": 6.933622199866912e-07,
"loss": 0.1624,
"mean_token_accuracy": 0.9654111993312836,
"num_tokens": 6835900402.0,
"step": 64550
},
{
"entropy": 1.234375,
"epoch": 1.5817051074873905,
"grad_norm": 3.0625,
"learning_rate": 6.894840720945754e-07,
"loss": 0.1665,
"mean_token_accuracy": 0.9645081627368927,
"num_tokens": 6841235827.0,
"step": 64600
},
{
"entropy": 1.21890625,
"epoch": 1.5829293374467461,
"grad_norm": 2.421875,
"learning_rate": 6.856153917244647e-07,
"loss": 0.1611,
"mean_token_accuracy": 0.9654888653755188,
"num_tokens": 6846579737.0,
"step": 64650
},
{
"entropy": 1.2153125,
"epoch": 1.5841535674061016,
"grad_norm": 3.203125,
"learning_rate": 6.81756194728583e-07,
"loss": 0.1546,
"mean_token_accuracy": 0.9667556810379029,
"num_tokens": 6851881949.0,
"step": 64700
},
{
"entropy": 1.22421875,
"epoch": 1.585377797365457,
"grad_norm": 3.046875,
"learning_rate": 6.779064969202973e-07,
"loss": 0.1583,
"mean_token_accuracy": 0.966250067949295,
"num_tokens": 6857094183.0,
"step": 64750
},
{
"entropy": 1.2265625,
"epoch": 1.5866020273248127,
"grad_norm": 2.96875,
"learning_rate": 6.740663140740467e-07,
"loss": 0.163,
"mean_token_accuracy": 0.9652321350574493,
"num_tokens": 6862381095.0,
"step": 64800
},
{
"entropy": 1.2184375,
"epoch": 1.5878262572841684,
"grad_norm": 1.9765625,
"learning_rate": 6.70235661925287e-07,
"loss": 0.1594,
"mean_token_accuracy": 0.965182945728302,
"num_tokens": 6867345829.0,
"step": 64850
},
{
"entropy": 1.22640625,
"epoch": 1.5890504872435238,
"grad_norm": 2.6875,
"learning_rate": 6.664145561704173e-07,
"loss": 0.1548,
"mean_token_accuracy": 0.9668359410762787,
"num_tokens": 6872899925.0,
"step": 64900
},
{
"entropy": 1.23359375,
"epoch": 1.5902747172028793,
"grad_norm": 2.265625,
"learning_rate": 6.626030124667204e-07,
"loss": 0.1695,
"mean_token_accuracy": 0.9634568047523498,
"num_tokens": 6878428253.0,
"step": 64950
},
{
"entropy": 1.22609375,
"epoch": 1.591498947162235,
"grad_norm": 3.40625,
"learning_rate": 6.588010464323006e-07,
"loss": 0.1689,
"mean_token_accuracy": 0.9639648401737213,
"num_tokens": 6883915733.0,
"step": 65000
},
{
"entropy": 1.22859375,
"epoch": 1.5927231771215906,
"grad_norm": 2.28125,
"learning_rate": 6.550086736460136e-07,
"loss": 0.1719,
"mean_token_accuracy": 0.9634046721458435,
"num_tokens": 6889133852.0,
"step": 65050
},
{
"entropy": 1.23578125,
"epoch": 1.593947407080946,
"grad_norm": 2.984375,
"learning_rate": 6.512259096474075e-07,
"loss": 0.1729,
"mean_token_accuracy": 0.9630839240550995,
"num_tokens": 6894861703.0,
"step": 65100
},
{
"entropy": 1.21921875,
"epoch": 1.5951716370403015,
"grad_norm": 2.484375,
"learning_rate": 6.474527699366567e-07,
"loss": 0.1599,
"mean_token_accuracy": 0.965704824924469,
"num_tokens": 6899940861.0,
"step": 65150
},
{
"entropy": 1.21625,
"epoch": 1.5963958669996572,
"grad_norm": 2.078125,
"learning_rate": 6.436892699745009e-07,
"loss": 0.1572,
"mean_token_accuracy": 0.9666438150405884,
"num_tokens": 6905083361.0,
"step": 65200
},
{
"entropy": 1.2153125,
"epoch": 1.5976200969590129,
"grad_norm": 3.5625,
"learning_rate": 6.399354251821792e-07,
"loss": 0.1554,
"mean_token_accuracy": 0.9674275135993957,
"num_tokens": 6910092703.0,
"step": 65250
},
{
"entropy": 1.22984375,
"epoch": 1.5988443269183683,
"grad_norm": 2.828125,
"learning_rate": 6.361912509413676e-07,
"loss": 0.1645,
"mean_token_accuracy": 0.9646131348609924,
"num_tokens": 6915320978.0,
"step": 65300
},
{
"entropy": 1.22984375,
"epoch": 1.6000685568777238,
"grad_norm": 2.546875,
"learning_rate": 6.32456762594116e-07,
"loss": 0.1594,
"mean_token_accuracy": 0.9651407063007355,
"num_tokens": 6920827957.0,
"step": 65350
},
{
"entropy": 1.21140625,
"epoch": 1.6012927868370794,
"grad_norm": 2.578125,
"learning_rate": 6.287319754427873e-07,
"loss": 0.1533,
"mean_token_accuracy": 0.9665750122070312,
"num_tokens": 6926133415.0,
"step": 65400
},
{
"entropy": 1.22109375,
"epoch": 1.602517016796435,
"grad_norm": 2.859375,
"learning_rate": 6.250169047499916e-07,
"loss": 0.1563,
"mean_token_accuracy": 0.9660931730270386,
"num_tokens": 6931165132.0,
"step": 65450
},
{
"entropy": 1.2040625,
"epoch": 1.6037412467557908,
"grad_norm": 3.890625,
"learning_rate": 6.213115657385244e-07,
"loss": 0.1473,
"mean_token_accuracy": 0.9677533149719239,
"num_tokens": 6936236474.0,
"step": 65500
},
{
"entropy": 1.22515625,
"epoch": 1.6049654767151462,
"grad_norm": 1.9140625,
"learning_rate": 6.176159735913079e-07,
"loss": 0.1698,
"mean_token_accuracy": 0.9640149748325348,
"num_tokens": 6941667389.0,
"step": 65550
},
{
"entropy": 1.210625,
"epoch": 1.6061897066745017,
"grad_norm": 2.828125,
"learning_rate": 6.139301434513204e-07,
"loss": 0.1495,
"mean_token_accuracy": 0.9672247707843781,
"num_tokens": 6947023413.0,
"step": 65600
},
{
"entropy": 1.21921875,
"epoch": 1.6074139366338573,
"grad_norm": 2.859375,
"learning_rate": 6.102540904215455e-07,
"loss": 0.1579,
"mean_token_accuracy": 0.9656173276901245,
"num_tokens": 6952441096.0,
"step": 65650
},
{
"entropy": 1.223125,
"epoch": 1.608638166593213,
"grad_norm": 3.71875,
"learning_rate": 6.065878295649004e-07,
"loss": 0.166,
"mean_token_accuracy": 0.9646958529949188,
"num_tokens": 6957942190.0,
"step": 65700
},
{
"entropy": 1.2084375,
"epoch": 1.6098623965525685,
"grad_norm": 2.3125,
"learning_rate": 6.0293137590418e-07,
"loss": 0.15,
"mean_token_accuracy": 0.9669717216491699,
"num_tokens": 6963300846.0,
"step": 65750
},
{
"entropy": 1.22921875,
"epoch": 1.611086626511924,
"grad_norm": 2.078125,
"learning_rate": 5.992847444219915e-07,
"loss": 0.1614,
"mean_token_accuracy": 0.9650086772441864,
"num_tokens": 6968779335.0,
"step": 65800
},
{
"entropy": 1.22625,
"epoch": 1.6123108564712796,
"grad_norm": 2.78125,
"learning_rate": 5.956479500606977e-07,
"loss": 0.171,
"mean_token_accuracy": 0.9639155077934265,
"num_tokens": 6974202109.0,
"step": 65850
},
{
"entropy": 1.21328125,
"epoch": 1.6135350864306353,
"grad_norm": 3.375,
"learning_rate": 5.920210077223508e-07,
"loss": 0.1488,
"mean_token_accuracy": 0.9683645820617676,
"num_tokens": 6979171497.0,
"step": 65900
},
{
"entropy": 1.21875,
"epoch": 1.6147593163899907,
"grad_norm": 2.734375,
"learning_rate": 5.884039322686345e-07,
"loss": 0.1593,
"mean_token_accuracy": 0.9662387585639953,
"num_tokens": 6984410380.0,
"step": 65950
},
{
"entropy": 1.198125,
"epoch": 1.6159835463493462,
"grad_norm": 2.0,
"learning_rate": 5.847967385208012e-07,
"loss": 0.1521,
"mean_token_accuracy": 0.966891850233078,
"num_tokens": 6989408812.0,
"step": 66000
},
{
"entropy": 1.20296875,
"epoch": 1.6172077763087018,
"grad_norm": 2.109375,
"learning_rate": 5.81199441259614e-07,
"loss": 0.1509,
"mean_token_accuracy": 0.9681426680088043,
"num_tokens": 6994432516.0,
"step": 66050
},
{
"entropy": 1.225625,
"epoch": 1.6184320062680575,
"grad_norm": 3.140625,
"learning_rate": 5.776120552252833e-07,
"loss": 0.1638,
"mean_token_accuracy": 0.965145457983017,
"num_tokens": 6999763932.0,
"step": 66100
},
{
"entropy": 1.22421875,
"epoch": 1.619656236227413,
"grad_norm": 3.078125,
"learning_rate": 5.740345951174062e-07,
"loss": 0.1654,
"mean_token_accuracy": 0.9642065274715423,
"num_tokens": 7005089905.0,
"step": 66150
},
{
"entropy": 1.238125,
"epoch": 1.6208804661867684,
"grad_norm": 2.78125,
"learning_rate": 5.704670755949111e-07,
"loss": 0.1742,
"mean_token_accuracy": 0.962605128288269,
"num_tokens": 7010758688.0,
"step": 66200
},
{
"entropy": 1.2284375,
"epoch": 1.622104696146124,
"grad_norm": 2.359375,
"learning_rate": 5.669095112759893e-07,
"loss": 0.1699,
"mean_token_accuracy": 0.9639213311672211,
"num_tokens": 7015757555.0,
"step": 66250
},
{
"entropy": 1.215,
"epoch": 1.6233289261054797,
"grad_norm": 3.609375,
"learning_rate": 5.633619167380439e-07,
"loss": 0.1542,
"mean_token_accuracy": 0.9669547820091248,
"num_tokens": 7020934918.0,
"step": 66300
},
{
"entropy": 1.20421875,
"epoch": 1.6245531560648352,
"grad_norm": 2.609375,
"learning_rate": 5.598243065176243e-07,
"loss": 0.1491,
"mean_token_accuracy": 0.9682400977611542,
"num_tokens": 7026062287.0,
"step": 66350
},
{
"entropy": 1.224375,
"epoch": 1.6257773860241906,
"grad_norm": 3.328125,
"learning_rate": 5.56296695110368e-07,
"loss": 0.1563,
"mean_token_accuracy": 0.965864109992981,
"num_tokens": 7031243491.0,
"step": 66400
},
{
"entropy": 1.21640625,
"epoch": 1.6270016159835463,
"grad_norm": 1.875,
"learning_rate": 5.527790969709421e-07,
"loss": 0.1591,
"mean_token_accuracy": 0.9661051654815673,
"num_tokens": 7036518719.0,
"step": 66450
},
{
"entropy": 1.21765625,
"epoch": 1.628225845942902,
"grad_norm": 2.265625,
"learning_rate": 5.492715265129842e-07,
"loss": 0.1526,
"mean_token_accuracy": 0.967378306388855,
"num_tokens": 7041605356.0,
"step": 66500
},
{
"entropy": 1.22578125,
"epoch": 1.6294500759022574,
"grad_norm": 3.25,
"learning_rate": 5.457739981090422e-07,
"loss": 0.1608,
"mean_token_accuracy": 0.965805538892746,
"num_tokens": 7047131119.0,
"step": 66550
},
{
"entropy": 1.22296875,
"epoch": 1.6306743058616129,
"grad_norm": 3.9375,
"learning_rate": 5.422865260905141e-07,
"loss": 0.162,
"mean_token_accuracy": 0.9653668451309204,
"num_tokens": 7052461810.0,
"step": 66600
},
{
"entropy": 1.2321875,
"epoch": 1.6318985358209686,
"grad_norm": 2.015625,
"learning_rate": 5.388091247475948e-07,
"loss": 0.1674,
"mean_token_accuracy": 0.9641144728660583,
"num_tokens": 7057861665.0,
"step": 66650
},
{
"entropy": 1.22,
"epoch": 1.6331227657803242,
"grad_norm": 4.875,
"learning_rate": 5.35341808329211e-07,
"loss": 0.1612,
"mean_token_accuracy": 0.9650032806396485,
"num_tokens": 7063074323.0,
"step": 66700
},
{
"entropy": 1.2290625,
"epoch": 1.63434699573968,
"grad_norm": 2.21875,
"learning_rate": 5.31884591042966e-07,
"loss": 0.1642,
"mean_token_accuracy": 0.9645574033260346,
"num_tokens": 7068850662.0,
"step": 66750
},
{
"entropy": 1.21140625,
"epoch": 1.6355712256990353,
"grad_norm": 2.21875,
"learning_rate": 5.284374870550806e-07,
"loss": 0.1513,
"mean_token_accuracy": 0.9664854764938354,
"num_tokens": 7073845156.0,
"step": 66800
},
{
"entropy": 1.2134375,
"epoch": 1.6367954556583908,
"grad_norm": 2.59375,
"learning_rate": 5.250005104903391e-07,
"loss": 0.1526,
"mean_token_accuracy": 0.9672818171977997,
"num_tokens": 7078890553.0,
"step": 66850
},
{
"entropy": 1.21890625,
"epoch": 1.6380196856177465,
"grad_norm": 3.1875,
"learning_rate": 5.215736754320221e-07,
"loss": 0.1559,
"mean_token_accuracy": 0.9661696362495422,
"num_tokens": 7084113116.0,
"step": 66900
},
{
"entropy": 1.2209375,
"epoch": 1.6392439155771021,
"grad_norm": 2.578125,
"learning_rate": 5.181569959218593e-07,
"loss": 0.1537,
"mean_token_accuracy": 0.9654488229751587,
"num_tokens": 7089341607.0,
"step": 66950
},
{
"entropy": 1.21953125,
"epoch": 1.6404681455364576,
"grad_norm": 2.9375,
"learning_rate": 5.147504859599658e-07,
"loss": 0.1627,
"mean_token_accuracy": 0.9644181895256042,
"num_tokens": 7094625061.0,
"step": 67000
},
{
"entropy": 1.226875,
"epoch": 1.641692375495813,
"grad_norm": 2.78125,
"learning_rate": 5.113541595047853e-07,
"loss": 0.1638,
"mean_token_accuracy": 0.9646450591087341,
"num_tokens": 7100017216.0,
"step": 67050
},
{
"entropy": 1.22828125,
"epoch": 1.6429166054551687,
"grad_norm": 2.515625,
"learning_rate": 5.079680304730336e-07,
"loss": 0.1632,
"mean_token_accuracy": 0.9642895436286927,
"num_tokens": 7105647390.0,
"step": 67100
},
{
"entropy": 1.2190625,
"epoch": 1.6441408354145244,
"grad_norm": 3.1875,
"learning_rate": 5.045921127396446e-07,
"loss": 0.1568,
"mean_token_accuracy": 0.9664517366886138,
"num_tokens": 7111038795.0,
"step": 67150
},
{
"entropy": 1.20453125,
"epoch": 1.6453650653738798,
"grad_norm": 3.5625,
"learning_rate": 5.012264201377073e-07,
"loss": 0.1546,
"mean_token_accuracy": 0.9667070829868316,
"num_tokens": 7116213641.0,
"step": 67200
},
{
"entropy": 1.22828125,
"epoch": 1.6465892953332353,
"grad_norm": 2.484375,
"learning_rate": 4.978709664584132e-07,
"loss": 0.1502,
"mean_token_accuracy": 0.9669265413284301,
"num_tokens": 7121369080.0,
"step": 67250
},
{
"entropy": 1.2240625,
"epoch": 1.647813525292591,
"grad_norm": 2.328125,
"learning_rate": 4.945257654510013e-07,
"loss": 0.1614,
"mean_token_accuracy": 0.966176050901413,
"num_tokens": 7126738052.0,
"step": 67300
},
{
"entropy": 1.21375,
"epoch": 1.6490377552519466,
"grad_norm": 3.21875,
"learning_rate": 4.911908308226965e-07,
"loss": 0.1425,
"mean_token_accuracy": 0.969027806520462,
"num_tokens": 7131902692.0,
"step": 67350
},
{
"entropy": 1.20609375,
"epoch": 1.650261985211302,
"grad_norm": 2.46875,
"learning_rate": 4.878661762386575e-07,
"loss": 0.1494,
"mean_token_accuracy": 0.966635344028473,
"num_tokens": 7136808281.0,
"step": 67400
},
{
"entropy": 1.2134375,
"epoch": 1.6514862151706575,
"grad_norm": 3.921875,
"learning_rate": 4.845518153219194e-07,
"loss": 0.1536,
"mean_token_accuracy": 0.9664989912509918,
"num_tokens": 7141996551.0,
"step": 67450
},
{
"entropy": 1.2096875,
"epoch": 1.6527104451300132,
"grad_norm": 2.875,
"learning_rate": 4.812477616533406e-07,
"loss": 0.1517,
"mean_token_accuracy": 0.9665413784980774,
"num_tokens": 7146993092.0,
"step": 67500
},
{
"entropy": 1.209375,
"epoch": 1.6539346750893689,
"grad_norm": 3.3125,
"learning_rate": 4.779540287715394e-07,
"loss": 0.1583,
"mean_token_accuracy": 0.965690256357193,
"num_tokens": 7152324580.0,
"step": 67550
},
{
"entropy": 1.2259375,
"epoch": 1.6551589050487243,
"grad_norm": 3.828125,
"learning_rate": 4.7467063017285005e-07,
"loss": 0.1632,
"mean_token_accuracy": 0.9648753714561462,
"num_tokens": 7157642715.0,
"step": 67600
},
{
"entropy": 1.21328125,
"epoch": 1.6563831350080798,
"grad_norm": 3.4375,
"learning_rate": 4.713975793112569e-07,
"loss": 0.1542,
"mean_token_accuracy": 0.9669430148601532,
"num_tokens": 7162998030.0,
"step": 67650
},
{
"entropy": 1.185625,
"epoch": 1.6576073649674354,
"grad_norm": 3.953125,
"learning_rate": 4.681348895983448e-07,
"loss": 0.1379,
"mean_token_accuracy": 0.9700025701522828,
"num_tokens": 7167607013.0,
"step": 67700
},
{
"entropy": 1.2225,
"epoch": 1.658831594926791,
"grad_norm": 2.359375,
"learning_rate": 4.648825744032449e-07,
"loss": 0.1614,
"mean_token_accuracy": 0.9637822723388672,
"num_tokens": 7172916071.0,
"step": 67750
},
{
"entropy": 1.22109375,
"epoch": 1.6600558248861468,
"grad_norm": 0.003997802734375,
"learning_rate": 4.6164064705257424e-07,
"loss": 0.1604,
"mean_token_accuracy": 0.9653963768482208,
"num_tokens": 7178344100.0,
"step": 67800
},
{
"entropy": 1.21921875,
"epoch": 1.6612800548455022,
"grad_norm": 2.453125,
"learning_rate": 4.584091208303891e-07,
"loss": 0.1583,
"mean_token_accuracy": 0.9654520618915557,
"num_tokens": 7183547126.0,
"step": 67850
},
{
"entropy": 1.2121875,
"epoch": 1.6625042848048577,
"grad_norm": 1.7578125,
"learning_rate": 4.5518800897812174e-07,
"loss": 0.1521,
"mean_token_accuracy": 0.9661059749126434,
"num_tokens": 7188532212.0,
"step": 67900
},
{
"entropy": 1.2209375,
"epoch": 1.6637285147642134,
"grad_norm": 2.734375,
"learning_rate": 4.519773246945349e-07,
"loss": 0.1576,
"mean_token_accuracy": 0.9657674777507782,
"num_tokens": 7193693940.0,
"step": 67950
},
{
"entropy": 1.23375,
"epoch": 1.664952744723569,
"grad_norm": 2.953125,
"learning_rate": 4.487770811356612e-07,
"loss": 0.1664,
"mean_token_accuracy": 0.9635096192359924,
"num_tokens": 7199191726.0,
"step": 68000
},
{
"entropy": 1.21625,
"epoch": 1.6661769746829245,
"grad_norm": 3.03125,
"learning_rate": 4.455872914147521e-07,
"loss": 0.1614,
"mean_token_accuracy": 0.965412712097168,
"num_tokens": 7204740271.0,
"step": 68050
},
{
"entropy": 1.2178125,
"epoch": 1.66740120464228,
"grad_norm": 1.96875,
"learning_rate": 4.424079686022223e-07,
"loss": 0.1647,
"mean_token_accuracy": 0.9641766202449799,
"num_tokens": 7210407120.0,
"step": 68100
},
{
"entropy": 1.22875,
"epoch": 1.6686254346016356,
"grad_norm": 2.953125,
"learning_rate": 4.39239125725601e-07,
"loss": 0.162,
"mean_token_accuracy": 0.9659585297107697,
"num_tokens": 7215783474.0,
"step": 68150
},
{
"entropy": 1.226875,
"epoch": 1.6698496645609913,
"grad_norm": 2.4375,
"learning_rate": 4.360807757694718e-07,
"loss": 0.1626,
"mean_token_accuracy": 0.9646227335929871,
"num_tokens": 7220993281.0,
"step": 68200
},
{
"entropy": 1.19703125,
"epoch": 1.6710738945203467,
"grad_norm": 2.0625,
"learning_rate": 4.329329316754236e-07,
"loss": 0.1441,
"mean_token_accuracy": 0.9685395467281341,
"num_tokens": 7225810836.0,
"step": 68250
},
{
"entropy": 1.21875,
"epoch": 1.6722981244797022,
"grad_norm": 2.984375,
"learning_rate": 4.2979560634199754e-07,
"loss": 0.1688,
"mean_token_accuracy": 0.9636458623409271,
"num_tokens": 7231649459.0,
"step": 68300
},
{
"entropy": 1.19296875,
"epoch": 1.6735223544390578,
"grad_norm": 2.65625,
"learning_rate": 4.266688126246311e-07,
"loss": 0.1424,
"mean_token_accuracy": 0.9688647317886353,
"num_tokens": 7236848069.0,
"step": 68350
},
{
"entropy": 1.2278125,
"epoch": 1.6747465843984135,
"grad_norm": 3.203125,
"learning_rate": 4.235525633356111e-07,
"loss": 0.1676,
"mean_token_accuracy": 0.963608900308609,
"num_tokens": 7242384952.0,
"step": 68400
},
{
"entropy": 1.238125,
"epoch": 1.675970814357769,
"grad_norm": 3.359375,
"learning_rate": 4.204468712440144e-07,
"loss": 0.1653,
"mean_token_accuracy": 0.9638743424415588,
"num_tokens": 7247699380.0,
"step": 68450
},
{
"entropy": 1.21671875,
"epoch": 1.6771950443171244,
"grad_norm": 3.015625,
"learning_rate": 4.1735174907566234e-07,
"loss": 0.1507,
"mean_token_accuracy": 0.9674655389785767,
"num_tokens": 7252973599.0,
"step": 68500
},
{
"entropy": 1.2109375,
"epoch": 1.67841927427648,
"grad_norm": 2.4375,
"learning_rate": 4.142672095130603e-07,
"loss": 0.1488,
"mean_token_accuracy": 0.9676065123081208,
"num_tokens": 7257981736.0,
"step": 68550
},
{
"entropy": 1.2084375,
"epoch": 1.6796435042358357,
"grad_norm": 2.765625,
"learning_rate": 4.111932651953554e-07,
"loss": 0.1537,
"mean_token_accuracy": 0.9668715631961823,
"num_tokens": 7263067623.0,
"step": 68600
},
{
"entropy": 1.2253125,
"epoch": 1.6808677341951912,
"grad_norm": 1.953125,
"learning_rate": 4.0812992871827737e-07,
"loss": 0.1514,
"mean_token_accuracy": 0.967187968492508,
"num_tokens": 7268515412.0,
"step": 68650
},
{
"entropy": 1.2240625,
"epoch": 1.6820919641545466,
"grad_norm": 2.1875,
"learning_rate": 4.0507721263409016e-07,
"loss": 0.155,
"mean_token_accuracy": 0.9657605230808258,
"num_tokens": 7273767424.0,
"step": 68700
},
{
"entropy": 1.21890625,
"epoch": 1.6833161941139023,
"grad_norm": 2.078125,
"learning_rate": 4.0203512945153874e-07,
"loss": 0.1501,
"mean_token_accuracy": 0.9671496486663819,
"num_tokens": 7279187672.0,
"step": 68750
},
{
"entropy": 1.20953125,
"epoch": 1.684540424073258,
"grad_norm": 3.0625,
"learning_rate": 3.990036916358014e-07,
"loss": 0.1466,
"mean_token_accuracy": 0.9685079550743103,
"num_tokens": 7284104561.0,
"step": 68800
},
{
"entropy": 1.21328125,
"epoch": 1.6857646540326134,
"grad_norm": 4.0625,
"learning_rate": 3.9598291160843393e-07,
"loss": 0.1557,
"mean_token_accuracy": 0.9655941009521485,
"num_tokens": 7289492586.0,
"step": 68850
},
{
"entropy": 1.18875,
"epoch": 1.686988883991969,
"grad_norm": 1.875,
"learning_rate": 3.929728017473213e-07,
"loss": 0.14,
"mean_token_accuracy": 0.969061805009842,
"num_tokens": 7294671673.0,
"step": 68900
},
{
"entropy": 1.21671875,
"epoch": 1.6882131139513246,
"grad_norm": 1.578125,
"learning_rate": 3.8997337438662893e-07,
"loss": 0.1628,
"mean_token_accuracy": 0.9643185365200043,
"num_tokens": 7300014488.0,
"step": 68950
},
{
"entropy": 1.22359375,
"epoch": 1.6894373439106802,
"grad_norm": 0.01251220703125,
"learning_rate": 3.869846418167452e-07,
"loss": 0.1521,
"mean_token_accuracy": 0.9664946186542511,
"num_tokens": 7305132050.0,
"step": 69000
},
{
"entropy": 1.21640625,
"epoch": 1.690661573870036,
"grad_norm": 2.6875,
"learning_rate": 3.840066162842405e-07,
"loss": 0.1518,
"mean_token_accuracy": 0.9676698422431946,
"num_tokens": 7310341663.0,
"step": 69050
},
{
"entropy": 1.22984375,
"epoch": 1.6918858038293914,
"grad_norm": 3.625,
"learning_rate": 3.8103930999180936e-07,
"loss": 0.1685,
"mean_token_accuracy": 0.963647495508194,
"num_tokens": 7315713992.0,
"step": 69100
},
{
"entropy": 1.2271875,
"epoch": 1.6931100337887468,
"grad_norm": 2.5625,
"learning_rate": 3.780827350982258e-07,
"loss": 0.1558,
"mean_token_accuracy": 0.9662664186954498,
"num_tokens": 7321152260.0,
"step": 69150
},
{
"entropy": 1.21296875,
"epoch": 1.6943342637481025,
"grad_norm": 2.390625,
"learning_rate": 3.751369037182869e-07,
"loss": 0.1532,
"mean_token_accuracy": 0.9662709140777588,
"num_tokens": 7326190569.0,
"step": 69200
},
{
"entropy": 1.198125,
"epoch": 1.6955584937074581,
"grad_norm": 2.9375,
"learning_rate": 3.722018279227728e-07,
"loss": 0.1412,
"mean_token_accuracy": 0.9689172983169556,
"num_tokens": 7331368151.0,
"step": 69250
},
{
"entropy": 1.21125,
"epoch": 1.6967827236668136,
"grad_norm": 3.25,
"learning_rate": 3.6927751973838777e-07,
"loss": 0.1578,
"mean_token_accuracy": 0.9661315476894379,
"num_tokens": 7336566118.0,
"step": 69300
},
{
"entropy": 1.2215625,
"epoch": 1.698006953626169,
"grad_norm": 1.765625,
"learning_rate": 3.66363991147716e-07,
"loss": 0.1577,
"mean_token_accuracy": 0.9653751969337463,
"num_tokens": 7341728443.0,
"step": 69350
},
{
"entropy": 1.20796875,
"epoch": 1.6992311835855247,
"grad_norm": 2.53125,
"learning_rate": 3.6346125408917155e-07,
"loss": 0.1497,
"mean_token_accuracy": 0.9668842852115631,
"num_tokens": 7346956092.0,
"step": 69400
},
{
"entropy": 1.216875,
"epoch": 1.7004554135448804,
"grad_norm": 3.15625,
"learning_rate": 3.605693204569506e-07,
"loss": 0.1547,
"mean_token_accuracy": 0.967246618270874,
"num_tokens": 7352423947.0,
"step": 69450
},
{
"entropy": 1.2075,
"epoch": 1.7016796435042358,
"grad_norm": 2.46875,
"learning_rate": 3.576882021009792e-07,
"loss": 0.1489,
"mean_token_accuracy": 0.9667674267292022,
"num_tokens": 7357669096.0,
"step": 69500
},
{
"entropy": 1.19796875,
"epoch": 1.7029038734635913,
"grad_norm": 2.15625,
"learning_rate": 3.5481791082686757e-07,
"loss": 0.1421,
"mean_token_accuracy": 0.9695830595493317,
"num_tokens": 7362784518.0,
"step": 69550
},
{
"entropy": 1.2278125,
"epoch": 1.704128103422947,
"grad_norm": 2.15625,
"learning_rate": 3.519584583958636e-07,
"loss": 0.162,
"mean_token_accuracy": 0.9651164734363555,
"num_tokens": 7368275670.0,
"step": 69600
},
{
"entropy": 1.21578125,
"epoch": 1.7053523333823026,
"grad_norm": 2.640625,
"learning_rate": 3.4910985652479757e-07,
"loss": 0.1506,
"mean_token_accuracy": 0.9667972207069397,
"num_tokens": 7373607544.0,
"step": 69650
},
{
"entropy": 1.20625,
"epoch": 1.706576563341658,
"grad_norm": 4.71875,
"learning_rate": 3.462721168860428e-07,
"loss": 0.1492,
"mean_token_accuracy": 0.9675750434398651,
"num_tokens": 7378823181.0,
"step": 69700
},
{
"entropy": 1.2265625,
"epoch": 1.7078007933010135,
"grad_norm": 2.84375,
"learning_rate": 3.4344525110746127e-07,
"loss": 0.1603,
"mean_token_accuracy": 0.965987560749054,
"num_tokens": 7384384951.0,
"step": 69750
},
{
"entropy": 1.21953125,
"epoch": 1.7090250232603692,
"grad_norm": 1.640625,
"learning_rate": 3.4062927077236106e-07,
"loss": 0.1574,
"mean_token_accuracy": 0.9660314428806305,
"num_tokens": 7389942384.0,
"step": 69800
},
{
"entropy": 1.21640625,
"epoch": 1.7102492532197249,
"grad_norm": 2.109375,
"learning_rate": 3.3782418741944244e-07,
"loss": 0.1629,
"mean_token_accuracy": 0.9638810443878174,
"num_tokens": 7395323756.0,
"step": 69850
},
{
"entropy": 1.20765625,
"epoch": 1.7114734831790803,
"grad_norm": 2.625,
"learning_rate": 3.350300125427578e-07,
"loss": 0.1384,
"mean_token_accuracy": 0.9689883410930633,
"num_tokens": 7400575411.0,
"step": 69900
},
{
"entropy": 1.20546875,
"epoch": 1.7126977131384358,
"grad_norm": 3.109375,
"learning_rate": 3.3224675759166026e-07,
"loss": 0.1515,
"mean_token_accuracy": 0.9666663575172424,
"num_tokens": 7405984120.0,
"step": 69950
},
{
"entropy": 1.2203125,
"epoch": 1.7139219430977914,
"grad_norm": 2.328125,
"learning_rate": 3.294744339707564e-07,
"loss": 0.1566,
"mean_token_accuracy": 0.9662071549892426,
"num_tokens": 7411306216.0,
"step": 70000
},
{
"epoch": 1.7139219430977914,
"eval_entropy": 1.2108072916666666,
"eval_loss": 0.17756883800029755,
"eval_mean_token_accuracy": 0.9620932574073474,
"eval_num_tokens": 7411306216.0,
"eval_runtime": 601.9385,
"eval_samples_per_second": 16.042,
"eval_steps_per_second": 0.201,
"step": 70000
},
{
"entropy": 1.21734375,
"epoch": 1.7151461730571471,
"grad_norm": 0.0033111572265625,
"learning_rate": 3.2671305303986264e-07,
"loss": 0.1546,
"mean_token_accuracy": 0.9665888488292694,
"num_tokens": 7416539172.0,
"step": 70050
},
{
"entropy": 1.21734375,
"epoch": 1.7163704030165026,
"grad_norm": 2.84375,
"learning_rate": 3.23962626113956e-07,
"loss": 0.151,
"mean_token_accuracy": 0.9668701207637787,
"num_tokens": 7421707836.0,
"step": 70100
},
{
"entropy": 1.20390625,
"epoch": 1.7175946329758582,
"grad_norm": 2.875,
"learning_rate": 3.212231644631286e-07,
"loss": 0.1522,
"mean_token_accuracy": 0.967432736158371,
"num_tokens": 7427044054.0,
"step": 70150
},
{
"entropy": 1.1990625,
"epoch": 1.7188188629352137,
"grad_norm": 2.234375,
"learning_rate": 3.184946793125406e-07,
"loss": 0.1454,
"mean_token_accuracy": 0.9683572733402253,
"num_tokens": 7432165156.0,
"step": 70200
},
{
"entropy": 1.22375,
"epoch": 1.7200430928945694,
"grad_norm": 3.15625,
"learning_rate": 3.157771818423778e-07,
"loss": 0.1574,
"mean_token_accuracy": 0.9646234130859375,
"num_tokens": 7437729163.0,
"step": 70250
},
{
"entropy": 1.2253125,
"epoch": 1.721267322853925,
"grad_norm": 1.78125,
"learning_rate": 3.130706831877993e-07,
"loss": 0.1583,
"mean_token_accuracy": 0.965836591720581,
"num_tokens": 7443255376.0,
"step": 70300
},
{
"entropy": 1.21734375,
"epoch": 1.7224915528132805,
"grad_norm": 3.8125,
"learning_rate": 3.1037519443889927e-07,
"loss": 0.1502,
"mean_token_accuracy": 0.967227201461792,
"num_tokens": 7448723374.0,
"step": 70350
},
{
"entropy": 1.1978125,
"epoch": 1.723715782772636,
"grad_norm": 2.15625,
"learning_rate": 3.07690726640655e-07,
"loss": 0.1386,
"mean_token_accuracy": 0.9692979896068573,
"num_tokens": 7453945048.0,
"step": 70400
},
{
"entropy": 1.21671875,
"epoch": 1.7249400127319916,
"grad_norm": 3.359375,
"learning_rate": 3.050172907928872e-07,
"loss": 0.1601,
"mean_token_accuracy": 0.9648488080501556,
"num_tokens": 7459709955.0,
"step": 70450
},
{
"entropy": 1.194375,
"epoch": 1.7261642426913473,
"grad_norm": 1.2109375,
"learning_rate": 3.0235489785021073e-07,
"loss": 0.1429,
"mean_token_accuracy": 0.968617148399353,
"num_tokens": 7464731391.0,
"step": 70500
},
{
"entropy": 1.21328125,
"epoch": 1.7273884726507027,
"grad_norm": 4.1875,
"learning_rate": 2.997035587219911e-07,
"loss": 0.1509,
"mean_token_accuracy": 0.9667483043670654,
"num_tokens": 7470148354.0,
"step": 70550
},
{
"entropy": 1.21015625,
"epoch": 1.7286127026100582,
"grad_norm": 2.890625,
"learning_rate": 2.970632842723001e-07,
"loss": 0.1537,
"mean_token_accuracy": 0.9668030095100403,
"num_tokens": 7475597114.0,
"step": 70600
},
{
"entropy": 1.21203125,
"epoch": 1.7298369325694138,
"grad_norm": 1.78125,
"learning_rate": 2.944340853198715e-07,
"loss": 0.1489,
"mean_token_accuracy": 0.9677174651622772,
"num_tokens": 7480924480.0,
"step": 70650
},
{
"entropy": 1.1978125,
"epoch": 1.7310611625287695,
"grad_norm": 2.578125,
"learning_rate": 2.9181597263805703e-07,
"loss": 0.1381,
"mean_token_accuracy": 0.9692902910709381,
"num_tokens": 7485944672.0,
"step": 70700
},
{
"entropy": 1.2234375,
"epoch": 1.732285392488125,
"grad_norm": 3.15625,
"learning_rate": 2.8920895695478036e-07,
"loss": 0.1575,
"mean_token_accuracy": 0.9657765531539917,
"num_tokens": 7491484223.0,
"step": 70750
},
{
"entropy": 1.21984375,
"epoch": 1.7335096224474804,
"grad_norm": 1.640625,
"learning_rate": 2.866130489524946e-07,
"loss": 0.1497,
"mean_token_accuracy": 0.9674056577682495,
"num_tokens": 7496915236.0,
"step": 70800
},
{
"entropy": 1.2109375,
"epoch": 1.734733852406836,
"grad_norm": 1.9375,
"learning_rate": 2.8402825926813793e-07,
"loss": 0.1541,
"mean_token_accuracy": 0.9666642725467682,
"num_tokens": 7502068005.0,
"step": 70850
},
{
"entropy": 1.22796875,
"epoch": 1.7359580823661918,
"grad_norm": 1.171875,
"learning_rate": 2.814545984930923e-07,
"loss": 0.1643,
"mean_token_accuracy": 0.9640646266937256,
"num_tokens": 7507947357.0,
"step": 70900
},
{
"entropy": 1.2171875,
"epoch": 1.7371823123255472,
"grad_norm": 3.78125,
"learning_rate": 2.788920771731344e-07,
"loss": 0.1515,
"mean_token_accuracy": 0.96691251039505,
"num_tokens": 7513464788.0,
"step": 70950
},
{
"entropy": 1.21421875,
"epoch": 1.7384065422849027,
"grad_norm": 2.828125,
"learning_rate": 2.763407058083999e-07,
"loss": 0.1562,
"mean_token_accuracy": 0.9653972661495209,
"num_tokens": 7518965009.0,
"step": 71000
},
{
"entropy": 1.22109375,
"epoch": 1.7396307722442583,
"grad_norm": 3.09375,
"learning_rate": 2.738004948533338e-07,
"loss": 0.1553,
"mean_token_accuracy": 0.9661720776557923,
"num_tokens": 7524509007.0,
"step": 71050
},
{
"entropy": 1.2178125,
"epoch": 1.740855002203614,
"grad_norm": 2.640625,
"learning_rate": 2.712714547166534e-07,
"loss": 0.1494,
"mean_token_accuracy": 0.9680777621269226,
"num_tokens": 7529983645.0,
"step": 71100
},
{
"entropy": 1.22078125,
"epoch": 1.7420792321629694,
"grad_norm": 2.640625,
"learning_rate": 2.6875359576129975e-07,
"loss": 0.1604,
"mean_token_accuracy": 0.9644283270835876,
"num_tokens": 7535464039.0,
"step": 71150
},
{
"entropy": 1.206875,
"epoch": 1.743303462122325,
"grad_norm": 1.609375,
"learning_rate": 2.662469283043991e-07,
"loss": 0.1434,
"mean_token_accuracy": 0.9683542418479919,
"num_tokens": 7540523414.0,
"step": 71200
},
{
"entropy": 1.214375,
"epoch": 1.7445276920816806,
"grad_norm": 2.953125,
"learning_rate": 2.637514626172213e-07,
"loss": 0.1549,
"mean_token_accuracy": 0.9665893888473511,
"num_tokens": 7545849728.0,
"step": 71250
},
{
"entropy": 1.2040625,
"epoch": 1.7457519220410362,
"grad_norm": 2.765625,
"learning_rate": 2.6126720892513277e-07,
"loss": 0.1487,
"mean_token_accuracy": 0.9680774366855621,
"num_tokens": 7551159210.0,
"step": 71300
},
{
"entropy": 1.19421875,
"epoch": 1.746976152000392,
"grad_norm": 2.640625,
"learning_rate": 2.5879417740756093e-07,
"loss": 0.1363,
"mean_token_accuracy": 0.9701401054859161,
"num_tokens": 7556078762.0,
"step": 71350
},
{
"entropy": 1.218125,
"epoch": 1.7482003819597474,
"grad_norm": 1.5625,
"learning_rate": 2.563323781979482e-07,
"loss": 0.1656,
"mean_token_accuracy": 0.9642888736724854,
"num_tokens": 7561736323.0,
"step": 71400
},
{
"entropy": 1.21859375,
"epoch": 1.7494246119191028,
"grad_norm": 1.9609375,
"learning_rate": 2.5388182138371173e-07,
"loss": 0.1517,
"mean_token_accuracy": 0.966708824634552,
"num_tokens": 7567328811.0,
"step": 71450
},
{
"entropy": 1.22109375,
"epoch": 1.7506488418784585,
"grad_norm": 2.3125,
"learning_rate": 2.5144251700620135e-07,
"loss": 0.1629,
"mean_token_accuracy": 0.9650636351108551,
"num_tokens": 7572752827.0,
"step": 71500
},
{
"entropy": 1.21,
"epoch": 1.7518730718378142,
"grad_norm": 2.78125,
"learning_rate": 2.4901447506066133e-07,
"loss": 0.1599,
"mean_token_accuracy": 0.9643032836914063,
"num_tokens": 7578362509.0,
"step": 71550
},
{
"entropy": 1.2090625,
"epoch": 1.7530973017971696,
"grad_norm": 1.6484375,
"learning_rate": 2.465977054961852e-07,
"loss": 0.1493,
"mean_token_accuracy": 0.9673759829998017,
"num_tokens": 7583839931.0,
"step": 71600
},
{
"entropy": 1.21171875,
"epoch": 1.754321531756525,
"grad_norm": 2.828125,
"learning_rate": 2.441922182156775e-07,
"loss": 0.1518,
"mean_token_accuracy": 0.9662256014347076,
"num_tokens": 7589236608.0,
"step": 71650
},
{
"entropy": 1.209375,
"epoch": 1.7555457617158807,
"grad_norm": 2.890625,
"learning_rate": 2.4179802307581234e-07,
"loss": 0.1495,
"mean_token_accuracy": 0.9674426424503326,
"num_tokens": 7594652077.0,
"step": 71700
},
{
"entropy": 1.20265625,
"epoch": 1.7567699916752364,
"grad_norm": 2.96875,
"learning_rate": 2.394151298869952e-07,
"loss": 0.1451,
"mean_token_accuracy": 0.9673744821548462,
"num_tokens": 7599701409.0,
"step": 71750
},
{
"entropy": 1.2153125,
"epoch": 1.7579942216345918,
"grad_norm": 2.71875,
"learning_rate": 2.3704354841331932e-07,
"loss": 0.1505,
"mean_token_accuracy": 0.9669674754142761,
"num_tokens": 7605091932.0,
"step": 71800
},
{
"entropy": 1.2065625,
"epoch": 1.7592184515939473,
"grad_norm": 2.1875,
"learning_rate": 2.3468328837252628e-07,
"loss": 0.1478,
"mean_token_accuracy": 0.9676505529880524,
"num_tokens": 7610186489.0,
"step": 71850
},
{
"entropy": 1.20890625,
"epoch": 1.760442681553303,
"grad_norm": 1.765625,
"learning_rate": 2.3233435943597114e-07,
"loss": 0.1503,
"mean_token_accuracy": 0.9671880280971528,
"num_tokens": 7615665531.0,
"step": 71900
},
{
"entropy": 1.20375,
"epoch": 1.7616669115126586,
"grad_norm": 2.453125,
"learning_rate": 2.299967712285731e-07,
"loss": 0.1423,
"mean_token_accuracy": 0.9683215701580048,
"num_tokens": 7620773654.0,
"step": 71950
},
{
"entropy": 1.19234375,
"epoch": 1.762891141472014,
"grad_norm": 3.140625,
"learning_rate": 2.276705333287875e-07,
"loss": 0.1315,
"mean_token_accuracy": 0.9702609395980835,
"num_tokens": 7625470551.0,
"step": 72000
},
{
"entropy": 1.21046875,
"epoch": 1.7641153714313695,
"grad_norm": 2.234375,
"learning_rate": 2.253556552685573e-07,
"loss": 0.1433,
"mean_token_accuracy": 0.9681813132762909,
"num_tokens": 7630517430.0,
"step": 72050
},
{
"entropy": 1.21,
"epoch": 1.7653396013907252,
"grad_norm": 3.125,
"learning_rate": 2.2305214653327855e-07,
"loss": 0.1406,
"mean_token_accuracy": 0.9686529791355133,
"num_tokens": 7635763079.0,
"step": 72100
},
{
"entropy": 1.201875,
"epoch": 1.7665638313500809,
"grad_norm": 1.703125,
"learning_rate": 2.207600165617607e-07,
"loss": 0.1475,
"mean_token_accuracy": 0.9678330075740814,
"num_tokens": 7641423146.0,
"step": 72150
},
{
"entropy": 1.176875,
"epoch": 1.7677880613094363,
"grad_norm": 1.8125,
"learning_rate": 2.1847927474618846e-07,
"loss": 0.1314,
"mean_token_accuracy": 0.9702327287197113,
"num_tokens": 7646275038.0,
"step": 72200
},
{
"entropy": 1.205,
"epoch": 1.7690122912687918,
"grad_norm": 1.515625,
"learning_rate": 2.1620993043208182e-07,
"loss": 0.1371,
"mean_token_accuracy": 0.9702345824241638,
"num_tokens": 7651591457.0,
"step": 72250
},
{
"entropy": 1.2225,
"epoch": 1.7702365212281475,
"grad_norm": 1.6796875,
"learning_rate": 2.139519929182585e-07,
"loss": 0.1507,
"mean_token_accuracy": 0.9666866302490235,
"num_tokens": 7656975261.0,
"step": 72300
},
{
"entropy": 1.1996875,
"epoch": 1.7714607511875031,
"grad_norm": 2.46875,
"learning_rate": 2.1170547145679665e-07,
"loss": 0.1492,
"mean_token_accuracy": 0.966531822681427,
"num_tokens": 7662430438.0,
"step": 72350
},
{
"entropy": 1.21703125,
"epoch": 1.7726849811468586,
"grad_norm": 1.8203125,
"learning_rate": 2.0947037525299606e-07,
"loss": 0.1501,
"mean_token_accuracy": 0.9673058640956879,
"num_tokens": 7667987024.0,
"step": 72400
},
{
"entropy": 1.20890625,
"epoch": 1.7739092111062142,
"grad_norm": 2.640625,
"learning_rate": 2.0724671346533975e-07,
"loss": 0.1483,
"mean_token_accuracy": 0.9672919237613677,
"num_tokens": 7673092874.0,
"step": 72450
},
{
"entropy": 1.21171875,
"epoch": 1.7751334410655697,
"grad_norm": 2.421875,
"learning_rate": 2.0503449520545814e-07,
"loss": 0.1454,
"mean_token_accuracy": 0.9677470910549164,
"num_tokens": 7678350890.0,
"step": 72500
},
{
"entropy": 1.21125,
"epoch": 1.7763576710249254,
"grad_norm": 4.03125,
"learning_rate": 2.0283372953809187e-07,
"loss": 0.1506,
"mean_token_accuracy": 0.9673129177093506,
"num_tokens": 7683768054.0,
"step": 72550
},
{
"entropy": 1.19046875,
"epoch": 1.777581900984281,
"grad_norm": 0.010009765625,
"learning_rate": 2.0064442548105078e-07,
"loss": 0.1311,
"mean_token_accuracy": 0.9706909394264222,
"num_tokens": 7688732517.0,
"step": 72600
},
{
"entropy": 1.20234375,
"epoch": 1.7788061309436365,
"grad_norm": 2.625,
"learning_rate": 1.9846659200518323e-07,
"loss": 0.1443,
"mean_token_accuracy": 0.9685131824016571,
"num_tokens": 7693833105.0,
"step": 72650
},
{
"entropy": 1.1996875,
"epoch": 1.780030360902992,
"grad_norm": 2.8125,
"learning_rate": 1.963002380343336e-07,
"loss": 0.1372,
"mean_token_accuracy": 0.9696123468875885,
"num_tokens": 7698671416.0,
"step": 72700
},
{
"entropy": 1.2096875,
"epoch": 1.7812545908623476,
"grad_norm": 3.46875,
"learning_rate": 1.9414537244530883e-07,
"loss": 0.1447,
"mean_token_accuracy": 0.9681323492527008,
"num_tokens": 7704099695.0,
"step": 72750
},
{
"entropy": 1.209375,
"epoch": 1.7824788208217033,
"grad_norm": 3.8125,
"learning_rate": 1.9200200406784084e-07,
"loss": 0.1471,
"mean_token_accuracy": 0.9671408832073212,
"num_tokens": 7709413054.0,
"step": 72800
},
{
"entropy": 1.22046875,
"epoch": 1.7837030507810587,
"grad_norm": 2.375,
"learning_rate": 1.8987014168455263e-07,
"loss": 0.1513,
"mean_token_accuracy": 0.9667081344127655,
"num_tokens": 7714999778.0,
"step": 72850
},
{
"entropy": 1.21765625,
"epoch": 1.7849272807404142,
"grad_norm": 1.59375,
"learning_rate": 1.8774979403091852e-07,
"loss": 0.1467,
"mean_token_accuracy": 0.9685576283931732,
"num_tokens": 7720722054.0,
"step": 72900
},
{
"entropy": 1.18796875,
"epoch": 1.7861515106997699,
"grad_norm": 3.015625,
"learning_rate": 1.8564096979523027e-07,
"loss": 0.1448,
"mean_token_accuracy": 0.9685378670692444,
"num_tokens": 7726037284.0,
"step": 72950
},
{
"entropy": 1.21359375,
"epoch": 1.7873757406591255,
"grad_norm": 2.75,
"learning_rate": 1.835436776185634e-07,
"loss": 0.1305,
"mean_token_accuracy": 0.9697797727584839,
"num_tokens": 7731254143.0,
"step": 73000
},
{
"entropy": 1.189375,
"epoch": 1.788599970618481,
"grad_norm": 2.71875,
"learning_rate": 1.814579260947379e-07,
"loss": 0.1367,
"mean_token_accuracy": 0.969087952375412,
"num_tokens": 7736558719.0,
"step": 73050
},
{
"entropy": 1.20109375,
"epoch": 1.7898242005778364,
"grad_norm": 2.640625,
"learning_rate": 1.7938372377028622e-07,
"loss": 0.1265,
"mean_token_accuracy": 0.9715298664569855,
"num_tokens": 7741441296.0,
"step": 73100
},
{
"entropy": 1.1953125,
"epoch": 1.791048430537192,
"grad_norm": 2.078125,
"learning_rate": 1.773210791444161e-07,
"loss": 0.131,
"mean_token_accuracy": 0.9706771004199982,
"num_tokens": 7746461885.0,
"step": 73150
},
{
"entropy": 1.2090625,
"epoch": 1.7922726604965478,
"grad_norm": 3.375,
"learning_rate": 1.7527000066897837e-07,
"loss": 0.1469,
"mean_token_accuracy": 0.9673126399517059,
"num_tokens": 7752002392.0,
"step": 73200
},
{
"entropy": 1.1975,
"epoch": 1.7934968904559032,
"grad_norm": 1.5,
"learning_rate": 1.7323049674842783e-07,
"loss": 0.1437,
"mean_token_accuracy": 0.9683597016334534,
"num_tokens": 7756991548.0,
"step": 73250
},
{
"entropy": 1.2171875,
"epoch": 1.7947211204152587,
"grad_norm": 2.046875,
"learning_rate": 1.7120257573979492e-07,
"loss": 0.1454,
"mean_token_accuracy": 0.968316274881363,
"num_tokens": 7762203324.0,
"step": 73300
},
{
"entropy": 1.1959375,
"epoch": 1.7959453503746143,
"grad_norm": 2.109375,
"learning_rate": 1.6918624595264597e-07,
"loss": 0.1366,
"mean_token_accuracy": 0.9702933692932129,
"num_tokens": 7767460924.0,
"step": 73350
},
{
"entropy": 1.199375,
"epoch": 1.79716958033397,
"grad_norm": 2.265625,
"learning_rate": 1.671815156490517e-07,
"loss": 0.143,
"mean_token_accuracy": 0.9685783159732818,
"num_tokens": 7772824486.0,
"step": 73400
},
{
"entropy": 1.21921875,
"epoch": 1.7983938102933255,
"grad_norm": 2.953125,
"learning_rate": 1.651883930435535e-07,
"loss": 0.1362,
"mean_token_accuracy": 0.9696711504459381,
"num_tokens": 7778088634.0,
"step": 73450
},
{
"entropy": 1.2078125,
"epoch": 1.799618040252681,
"grad_norm": 0.004302978515625,
"learning_rate": 1.6320688630312908e-07,
"loss": 0.1363,
"mean_token_accuracy": 0.9695776212215423,
"num_tokens": 7783380087.0,
"step": 73500
},
{
"entropy": 1.22859375,
"epoch": 1.8008422702120366,
"grad_norm": 1.5625,
"learning_rate": 1.6123700354716032e-07,
"loss": 0.1559,
"mean_token_accuracy": 0.9663217055797577,
"num_tokens": 7789343726.0,
"step": 73550
},
{
"entropy": 1.21328125,
"epoch": 1.8020665001713922,
"grad_norm": 1.65625,
"learning_rate": 1.5927875284739546e-07,
"loss": 0.1356,
"mean_token_accuracy": 0.9702400255203247,
"num_tokens": 7794792440.0,
"step": 73600
},
{
"entropy": 1.21484375,
"epoch": 1.803290730130748,
"grad_norm": 1.71875,
"learning_rate": 1.5733214222792392e-07,
"loss": 0.1418,
"mean_token_accuracy": 0.9687067580223083,
"num_tokens": 7800254887.0,
"step": 73650
},
{
"entropy": 1.21421875,
"epoch": 1.8045149600901034,
"grad_norm": 3.625,
"learning_rate": 1.5539717966513623e-07,
"loss": 0.1361,
"mean_token_accuracy": 0.969369399547577,
"num_tokens": 7805607043.0,
"step": 73700
},
{
"entropy": 1.20984375,
"epoch": 1.8057391900494588,
"grad_norm": 2.609375,
"learning_rate": 1.5347387308769478e-07,
"loss": 0.1326,
"mean_token_accuracy": 0.9703532266616821,
"num_tokens": 7810964969.0,
"step": 73750
},
{
"entropy": 1.20515625,
"epoch": 1.8069634200088145,
"grad_norm": 2.234375,
"learning_rate": 1.5156223037649985e-07,
"loss": 0.1506,
"mean_token_accuracy": 0.9663440334796906,
"num_tokens": 7816484836.0,
"step": 73800
},
{
"entropy": 1.1890625,
"epoch": 1.8081876499681702,
"grad_norm": 3.03125,
"learning_rate": 1.4966225936465993e-07,
"loss": 0.1304,
"mean_token_accuracy": 0.9708381593227386,
"num_tokens": 7821459721.0,
"step": 73850
},
{
"entropy": 1.19953125,
"epoch": 1.8094118799275256,
"grad_norm": 2.1875,
"learning_rate": 1.4777396783745612e-07,
"loss": 0.128,
"mean_token_accuracy": 0.9713588643074036,
"num_tokens": 7826287539.0,
"step": 73900
},
{
"entropy": 1.1978125,
"epoch": 1.810636109886881,
"grad_norm": 2.15625,
"learning_rate": 1.4589736353231308e-07,
"loss": 0.1202,
"mean_token_accuracy": 0.9729771482944488,
"num_tokens": 7831387963.0,
"step": 73950
},
{
"entropy": 1.195,
"epoch": 1.8118603398462367,
"grad_norm": 2.296875,
"learning_rate": 1.4403245413876486e-07,
"loss": 0.1344,
"mean_token_accuracy": 0.9699731683731079,
"num_tokens": 7836315700.0,
"step": 74000
},
{
"entropy": 1.18796875,
"epoch": 1.8130845698055924,
"grad_norm": 2.296875,
"learning_rate": 1.4217924729842513e-07,
"loss": 0.1381,
"mean_token_accuracy": 0.9699892640113831,
"num_tokens": 7841453471.0,
"step": 74050
},
{
"entropy": 1.2075,
"epoch": 1.8143087997649479,
"grad_norm": 2.3125,
"learning_rate": 1.403377506049569e-07,
"loss": 0.1451,
"mean_token_accuracy": 0.9681575572490693,
"num_tokens": 7846798475.0,
"step": 74100
},
{
"entropy": 1.1890625,
"epoch": 1.8155330297243033,
"grad_norm": 3.328125,
"learning_rate": 1.385079716040376e-07,
"loss": 0.1253,
"mean_token_accuracy": 0.9720281398296357,
"num_tokens": 7851768429.0,
"step": 74150
},
{
"entropy": 1.19671875,
"epoch": 1.816757259683659,
"grad_norm": 2.40625,
"learning_rate": 1.3668991779333308e-07,
"loss": 0.1218,
"mean_token_accuracy": 0.9725555181503296,
"num_tokens": 7856881793.0,
"step": 74200
},
{
"entropy": 1.19890625,
"epoch": 1.8179814896430146,
"grad_norm": 1.8984375,
"learning_rate": 1.3488359662246087e-07,
"loss": 0.1272,
"mean_token_accuracy": 0.9715735244750977,
"num_tokens": 7861890257.0,
"step": 74250
},
{
"entropy": 1.20390625,
"epoch": 1.81920571960237,
"grad_norm": 1.90625,
"learning_rate": 1.3308901549296604e-07,
"loss": 0.1275,
"mean_token_accuracy": 0.9717478513717651,
"num_tokens": 7867074576.0,
"step": 74300
},
{
"entropy": 1.20203125,
"epoch": 1.8204299495617255,
"grad_norm": 2.46875,
"learning_rate": 1.3130618175828713e-07,
"loss": 0.1367,
"mean_token_accuracy": 0.9701256167888641,
"num_tokens": 7872381109.0,
"step": 74350
},
{
"entropy": 1.20828125,
"epoch": 1.8216541795210812,
"grad_norm": 3.359375,
"learning_rate": 1.2953510272372647e-07,
"loss": 0.1287,
"mean_token_accuracy": 0.9719671607017517,
"num_tokens": 7877881928.0,
"step": 74400
},
{
"entropy": 1.199375,
"epoch": 1.822878409480437,
"grad_norm": 2.59375,
"learning_rate": 1.2777578564641969e-07,
"loss": 0.1309,
"mean_token_accuracy": 0.9707298684120178,
"num_tokens": 7882820168.0,
"step": 74450
},
{
"entropy": 1.21734375,
"epoch": 1.8241026394397923,
"grad_norm": 2.546875,
"learning_rate": 1.2602823773530915e-07,
"loss": 0.1426,
"mean_token_accuracy": 0.9688560748100281,
"num_tokens": 7888372934.0,
"step": 74500
},
{
"entropy": 1.2046875,
"epoch": 1.8253268693991478,
"grad_norm": 2.703125,
"learning_rate": 1.2429246615111024e-07,
"loss": 0.1331,
"mean_token_accuracy": 0.970300270318985,
"num_tokens": 7893801088.0,
"step": 74550
},
{
"entropy": 1.21171875,
"epoch": 1.8265510993585035,
"grad_norm": 2.03125,
"learning_rate": 1.2256847800628425e-07,
"loss": 0.1223,
"mean_token_accuracy": 0.973189731836319,
"num_tokens": 7898852778.0,
"step": 74600
},
{
"entropy": 1.20671875,
"epoch": 1.8277753293178591,
"grad_norm": 2.078125,
"learning_rate": 1.2085628036501007e-07,
"loss": 0.123,
"mean_token_accuracy": 0.9726410353183746,
"num_tokens": 7903818883.0,
"step": 74650
},
{
"entropy": 1.19265625,
"epoch": 1.8289995592772146,
"grad_norm": 3.21875,
"learning_rate": 1.1915588024315194e-07,
"loss": 0.1278,
"mean_token_accuracy": 0.9702788054943084,
"num_tokens": 7908897679.0,
"step": 74700
},
{
"entropy": 1.20984375,
"epoch": 1.83022378923657,
"grad_norm": 3.15625,
"learning_rate": 1.1746728460823508e-07,
"loss": 0.1303,
"mean_token_accuracy": 0.9711257565021515,
"num_tokens": 7914006448.0,
"step": 74750
},
{
"entropy": 1.2140625,
"epoch": 1.8314480191959257,
"grad_norm": 1.9609375,
"learning_rate": 1.1579050037941275e-07,
"loss": 0.1362,
"mean_token_accuracy": 0.969500253200531,
"num_tokens": 7919510157.0,
"step": 74800
},
{
"entropy": 1.21421875,
"epoch": 1.8326722491552814,
"grad_norm": 2.40625,
"learning_rate": 1.1412553442744255e-07,
"loss": 0.132,
"mean_token_accuracy": 0.970678209066391,
"num_tokens": 7924726404.0,
"step": 74850
},
{
"entropy": 1.1996875,
"epoch": 1.833896479114637,
"grad_norm": 2.703125,
"learning_rate": 1.1247239357465255e-07,
"loss": 0.13,
"mean_token_accuracy": 0.9713816094398499,
"num_tokens": 7929934384.0,
"step": 74900
},
{
"entropy": 1.18921875,
"epoch": 1.8351207090739925,
"grad_norm": 1.9921875,
"learning_rate": 1.1083108459491986e-07,
"loss": 0.1256,
"mean_token_accuracy": 0.9721748220920563,
"num_tokens": 7935196457.0,
"step": 74950
},
{
"entropy": 1.2003125,
"epoch": 1.836344939033348,
"grad_norm": 2.703125,
"learning_rate": 1.0920161421363773e-07,
"loss": 0.119,
"mean_token_accuracy": 0.9733594739437104,
"num_tokens": 7940201367.0,
"step": 75000
},
{
"entropy": 1.22375,
"epoch": 1.8375691689927036,
"grad_norm": 1.7265625,
"learning_rate": 1.0758398910768951e-07,
"loss": 0.1373,
"mean_token_accuracy": 0.9692693221569061,
"num_tokens": 7945635438.0,
"step": 75050
},
{
"entropy": 1.20890625,
"epoch": 1.8387933989520593,
"grad_norm": 1.546875,
"learning_rate": 1.0597821590542211e-07,
"loss": 0.1282,
"mean_token_accuracy": 0.9722434699535369,
"num_tokens": 7951091367.0,
"step": 75100
},
{
"entropy": 1.18828125,
"epoch": 1.8400176289114147,
"grad_norm": 0.004425048828125,
"learning_rate": 1.0438430118661924e-07,
"loss": 0.124,
"mean_token_accuracy": 0.9725795328617096,
"num_tokens": 7956255217.0,
"step": 75150
},
{
"entropy": 1.1903125,
"epoch": 1.8412418588707702,
"grad_norm": 1.921875,
"learning_rate": 1.0280225148247213e-07,
"loss": 0.1179,
"mean_token_accuracy": 0.9743827605247497,
"num_tokens": 7961236486.0,
"step": 75200
},
{
"entropy": 1.1996875,
"epoch": 1.8424660888301259,
"grad_norm": 1.640625,
"learning_rate": 1.0123207327555462e-07,
"loss": 0.1156,
"mean_token_accuracy": 0.9743783438205719,
"num_tokens": 7966324215.0,
"step": 75250
},
{
"entropy": 1.2090625,
"epoch": 1.8436903187894815,
"grad_norm": 1.71875,
"learning_rate": 9.967377299979708e-08,
"loss": 0.134,
"mean_token_accuracy": 0.9705902481079102,
"num_tokens": 7971817863.0,
"step": 75300
},
{
"entropy": 1.19578125,
"epoch": 1.844914548748837,
"grad_norm": 2.15625,
"learning_rate": 9.812735704045684e-08,
"loss": 0.1185,
"mean_token_accuracy": 0.9737985277175903,
"num_tokens": 7977008142.0,
"step": 75350
},
{
"entropy": 1.190625,
"epoch": 1.8461387787081924,
"grad_norm": 1.75,
"learning_rate": 9.65928317340975e-08,
"loss": 0.1201,
"mean_token_accuracy": 0.9731456315517426,
"num_tokens": 7982011592.0,
"step": 75400
},
{
"entropy": 1.20875,
"epoch": 1.847363008667548,
"grad_norm": 1.765625,
"learning_rate": 9.507020336855632e-08,
"loss": 0.1221,
"mean_token_accuracy": 0.9724456059932709,
"num_tokens": 7987367141.0,
"step": 75450
},
{
"entropy": 1.20234375,
"epoch": 1.8485872386269038,
"grad_norm": 1.625,
"learning_rate": 9.355947818292554e-08,
"loss": 0.1149,
"mean_token_accuracy": 0.9738513994216919,
"num_tokens": 7992500198.0,
"step": 75500
},
{
"entropy": 1.21625,
"epoch": 1.8498114685862592,
"grad_norm": 1.78125,
"learning_rate": 9.206066236751943e-08,
"loss": 0.1328,
"mean_token_accuracy": 0.9707795882225037,
"num_tokens": 7998217427.0,
"step": 75550
},
{
"entropy": 1.1975,
"epoch": 1.8510356985456147,
"grad_norm": 2.125,
"learning_rate": 9.057376206385559e-08,
"loss": 0.1175,
"mean_token_accuracy": 0.9741839158535004,
"num_tokens": 8003308568.0,
"step": 75600
},
{
"entropy": 1.1878125,
"epoch": 1.8522599285049703,
"grad_norm": 3.21875,
"learning_rate": 8.90987833646254e-08,
"loss": 0.1077,
"mean_token_accuracy": 0.9759363722801209,
"num_tokens": 8008259087.0,
"step": 75650
},
{
"entropy": 1.20125,
"epoch": 1.853484158464326,
"grad_norm": 2.109375,
"learning_rate": 8.763573231367062e-08,
"loss": 0.1256,
"mean_token_accuracy": 0.9727174258232116,
"num_tokens": 8013653351.0,
"step": 75700
},
{
"entropy": 1.20078125,
"epoch": 1.8547083884236815,
"grad_norm": 2.765625,
"learning_rate": 8.618461490595975e-08,
"loss": 0.1214,
"mean_token_accuracy": 0.9735188388824463,
"num_tokens": 8018956628.0,
"step": 75750
},
{
"entropy": 1.209375,
"epoch": 1.855932618383037,
"grad_norm": 2.84375,
"learning_rate": 8.474543708756044e-08,
"loss": 0.1225,
"mean_token_accuracy": 0.9721533727645874,
"num_tokens": 8024197226.0,
"step": 75800
},
{
"entropy": 1.19015625,
"epoch": 1.8571568483423926,
"grad_norm": 0.005462646484375,
"learning_rate": 8.33182047556178e-08,
"loss": 0.1076,
"mean_token_accuracy": 0.9760002064704895,
"num_tokens": 8029024717.0,
"step": 75850
},
{
"entropy": 1.1953125,
"epoch": 1.8583810783017483,
"grad_norm": 1.640625,
"learning_rate": 8.190292375832975e-08,
"loss": 0.1274,
"mean_token_accuracy": 0.971969587802887,
"num_tokens": 8034254868.0,
"step": 75900
},
{
"entropy": 1.20546875,
"epoch": 1.859605308261104,
"grad_norm": 2.78125,
"learning_rate": 8.049959989492239e-08,
"loss": 0.1248,
"mean_token_accuracy": 0.9728272747993469,
"num_tokens": 8039555218.0,
"step": 75950
},
{
"entropy": 1.21359375,
"epoch": 1.8608295382204594,
"grad_norm": 1.640625,
"learning_rate": 7.910823891562536e-08,
"loss": 0.131,
"mean_token_accuracy": 0.9710195803642273,
"num_tokens": 8044915571.0,
"step": 76000
},
{
"entropy": 1.19625,
"epoch": 1.8620537681798148,
"grad_norm": 1.6953125,
"learning_rate": 7.77288465216518e-08,
"loss": 0.1189,
"mean_token_accuracy": 0.9735661280155182,
"num_tokens": 8050222763.0,
"step": 76050
},
{
"entropy": 1.1953125,
"epoch": 1.8632779981391705,
"grad_norm": 2.375,
"learning_rate": 7.636142836517013e-08,
"loss": 0.1211,
"mean_token_accuracy": 0.9737051403522492,
"num_tokens": 8055473678.0,
"step": 76100
},
{
"entropy": 1.196875,
"epoch": 1.8645022280985262,
"grad_norm": 1.6796875,
"learning_rate": 7.500599004928565e-08,
"loss": 0.1122,
"mean_token_accuracy": 0.974678498506546,
"num_tokens": 8060311800.0,
"step": 76150
},
{
"entropy": 1.18984375,
"epoch": 1.8657264580578816,
"grad_norm": 2.5,
"learning_rate": 7.36625371280133e-08,
"loss": 0.1164,
"mean_token_accuracy": 0.9736955296993256,
"num_tokens": 8065567322.0,
"step": 76200
},
{
"entropy": 1.211875,
"epoch": 1.866950688017237,
"grad_norm": 2.109375,
"learning_rate": 7.233107510625858e-08,
"loss": 0.1262,
"mean_token_accuracy": 0.9716404461860657,
"num_tokens": 8070882224.0,
"step": 76250
},
{
"entropy": 1.20234375,
"epoch": 1.8681749179765927,
"grad_norm": 1.65625,
"learning_rate": 7.101160943979201e-08,
"loss": 0.1242,
"mean_token_accuracy": 0.9728803491592407,
"num_tokens": 8075963376.0,
"step": 76300
},
{
"entropy": 1.20921875,
"epoch": 1.8693991479359484,
"grad_norm": 1.625,
"learning_rate": 6.970414553522842e-08,
"loss": 0.1223,
"mean_token_accuracy": 0.9728834819793701,
"num_tokens": 8081448166.0,
"step": 76350
},
{
"entropy": 1.1978125,
"epoch": 1.8706233778953039,
"grad_norm": 2.78125,
"learning_rate": 6.840868875000561e-08,
"loss": 0.1146,
"mean_token_accuracy": 0.9747687363624573,
"num_tokens": 8086285902.0,
"step": 76400
},
{
"entropy": 1.200625,
"epoch": 1.8718476078546593,
"grad_norm": 2.765625,
"learning_rate": 6.712524439235978e-08,
"loss": 0.1171,
"mean_token_accuracy": 0.9743122577667236,
"num_tokens": 8091436927.0,
"step": 76450
},
{
"entropy": 1.211875,
"epoch": 1.873071837814015,
"grad_norm": 2.078125,
"learning_rate": 6.585381772130584e-08,
"loss": 0.1327,
"mean_token_accuracy": 0.9712537932395935,
"num_tokens": 8097048708.0,
"step": 76500
},
{
"entropy": 1.2128125,
"epoch": 1.8742960677733707,
"grad_norm": 2.703125,
"learning_rate": 6.459441394661536e-08,
"loss": 0.1342,
"mean_token_accuracy": 0.9702994549274444,
"num_tokens": 8102302631.0,
"step": 76550
},
{
"entropy": 1.20875,
"epoch": 1.875520297732726,
"grad_norm": 1.7890625,
"learning_rate": 6.334703822879506e-08,
"loss": 0.1337,
"mean_token_accuracy": 0.970585721731186,
"num_tokens": 8107702374.0,
"step": 76600
},
{
"entropy": 1.208125,
"epoch": 1.8767445276920816,
"grad_norm": 2.359375,
"learning_rate": 6.211169567906572e-08,
"loss": 0.1419,
"mean_token_accuracy": 0.9687972629070282,
"num_tokens": 8113119431.0,
"step": 76650
},
{
"entropy": 1.20546875,
"epoch": 1.8779687576514372,
"grad_norm": 3.0625,
"learning_rate": 6.08883913593412e-08,
"loss": 0.1354,
"mean_token_accuracy": 0.9701398539543152,
"num_tokens": 8118309412.0,
"step": 76700
},
{
"entropy": 1.19796875,
"epoch": 1.879192987610793,
"grad_norm": 2.546875,
"learning_rate": 5.967713028220756e-08,
"loss": 0.1334,
"mean_token_accuracy": 0.9713104116916657,
"num_tokens": 8123346693.0,
"step": 76750
},
{
"entropy": 1.2065625,
"epoch": 1.8804172175701483,
"grad_norm": 2.46875,
"learning_rate": 5.8477917410903914e-08,
"loss": 0.1449,
"mean_token_accuracy": 0.968209480047226,
"num_tokens": 8128745782.0,
"step": 76800
},
{
"entropy": 1.19703125,
"epoch": 1.8816414475295038,
"grad_norm": 2.78125,
"learning_rate": 5.729075765929925e-08,
"loss": 0.1602,
"mean_token_accuracy": 0.9653090810775757,
"num_tokens": 8133734566.0,
"step": 76850
},
{
"entropy": 1.2078125,
"epoch": 1.8828656774888595,
"grad_norm": 3.046875,
"learning_rate": 5.61156558918744e-08,
"loss": 0.1748,
"mean_token_accuracy": 0.9636254405975342,
"num_tokens": 8139112182.0,
"step": 76900
},
{
"entropy": 1.19765625,
"epoch": 1.8840899074482151,
"grad_norm": 3.125,
"learning_rate": 5.4952616923703014e-08,
"loss": 0.1508,
"mean_token_accuracy": 0.9667049193382263,
"num_tokens": 8144120297.0,
"step": 76950
},
{
"entropy": 1.20921875,
"epoch": 1.8853141374075706,
"grad_norm": 2.8125,
"learning_rate": 5.380164552042832e-08,
"loss": 0.1581,
"mean_token_accuracy": 0.9663659358024597,
"num_tokens": 8149360110.0,
"step": 77000
},
{
"entropy": 1.2215625,
"epoch": 1.886538367366926,
"grad_norm": 2.046875,
"learning_rate": 5.266274639824742e-08,
"loss": 0.1807,
"mean_token_accuracy": 0.9613511979579925,
"num_tokens": 8154930968.0,
"step": 77050
},
{
"entropy": 1.1940625,
"epoch": 1.8877625973262817,
"grad_norm": 3.390625,
"learning_rate": 5.1535924223889305e-08,
"loss": 0.1593,
"mean_token_accuracy": 0.9654444575309753,
"num_tokens": 8159971112.0,
"step": 77100
},
{
"entropy": 1.2128125,
"epoch": 1.8889868272856374,
"grad_norm": 3.328125,
"learning_rate": 5.042118361459724e-08,
"loss": 0.1693,
"mean_token_accuracy": 0.964167617559433,
"num_tokens": 8165136464.0,
"step": 77150
},
{
"entropy": 1.20234375,
"epoch": 1.890211057244993,
"grad_norm": 2.84375,
"learning_rate": 4.931852913810875e-08,
"loss": 0.1597,
"mean_token_accuracy": 0.9660988628864289,
"num_tokens": 8170440548.0,
"step": 77200
},
{
"entropy": 1.2046875,
"epoch": 1.8914352872043485,
"grad_norm": 2.71875,
"learning_rate": 4.822796531263862e-08,
"loss": 0.163,
"mean_token_accuracy": 0.9647459161281585,
"num_tokens": 8175965156.0,
"step": 77250
},
{
"entropy": 1.21484375,
"epoch": 1.892659517163704,
"grad_norm": 3.09375,
"learning_rate": 4.7149496606857966e-08,
"loss": 0.1777,
"mean_token_accuracy": 0.9630069530010223,
"num_tokens": 8181436041.0,
"step": 77300
},
{
"entropy": 1.20734375,
"epoch": 1.8938837471230596,
"grad_norm": 3.359375,
"learning_rate": 4.608312743987819e-08,
"loss": 0.1646,
"mean_token_accuracy": 0.9651682090759277,
"num_tokens": 8186577107.0,
"step": 77350
},
{
"entropy": 1.2134375,
"epoch": 1.8951079770824153,
"grad_norm": 4.21875,
"learning_rate": 4.50288621812307e-08,
"loss": 0.1701,
"mean_token_accuracy": 0.9638711404800415,
"num_tokens": 8191908989.0,
"step": 77400
},
{
"entropy": 1.1978125,
"epoch": 1.8963322070417707,
"grad_norm": 2.921875,
"learning_rate": 4.398670515085157e-08,
"loss": 0.1672,
"mean_token_accuracy": 0.964127391576767,
"num_tokens": 8197252149.0,
"step": 77450
},
{
"entropy": 1.2015625,
"epoch": 1.8975564370011262,
"grad_norm": 2.75,
"learning_rate": 4.295666061906156e-08,
"loss": 0.1741,
"mean_token_accuracy": 0.9626425766944885,
"num_tokens": 8202870180.0,
"step": 77500
},
{
"entropy": 1.20109375,
"epoch": 1.8987806669604819,
"grad_norm": 4.0625,
"learning_rate": 4.193873280654914e-08,
"loss": 0.1645,
"mean_token_accuracy": 0.964863383769989,
"num_tokens": 8208065173.0,
"step": 77550
},
{
"entropy": 1.20234375,
"epoch": 1.9000048969198375,
"grad_norm": 2.28125,
"learning_rate": 4.093292588435549e-08,
"loss": 0.1605,
"mean_token_accuracy": 0.965006741285324,
"num_tokens": 8213242226.0,
"step": 77600
},
{
"entropy": 1.20734375,
"epoch": 1.901229126879193,
"grad_norm": 2.0,
"learning_rate": 3.993924397385251e-08,
"loss": 0.1693,
"mean_token_accuracy": 0.9635647284984589,
"num_tokens": 8218628064.0,
"step": 77650
},
{
"entropy": 1.21203125,
"epoch": 1.9024533568385484,
"grad_norm": 3.09375,
"learning_rate": 3.895769114673187e-08,
"loss": 0.1657,
"mean_token_accuracy": 0.9649439096450806,
"num_tokens": 8223851321.0,
"step": 77700
},
{
"entropy": 1.18859375,
"epoch": 1.903677586797904,
"grad_norm": 1.8203125,
"learning_rate": 3.798827142498329e-08,
"loss": 0.1508,
"mean_token_accuracy": 0.9679539859294891,
"num_tokens": 8228778299.0,
"step": 77750
},
{
"entropy": 1.20296875,
"epoch": 1.9049018167572598,
"grad_norm": 3.0625,
"learning_rate": 3.7030988780880957e-08,
"loss": 0.1541,
"mean_token_accuracy": 0.966580958366394,
"num_tokens": 8233727662.0,
"step": 77800
},
{
"entropy": 1.21453125,
"epoch": 1.9061260467166152,
"grad_norm": 2.046875,
"learning_rate": 3.6085847136966164e-08,
"loss": 0.1622,
"mean_token_accuracy": 0.9650613677501678,
"num_tokens": 8239365249.0,
"step": 77850
},
{
"entropy": 1.22,
"epoch": 1.9073502766759707,
"grad_norm": 2.546875,
"learning_rate": 3.515285036603233e-08,
"loss": 0.1736,
"mean_token_accuracy": 0.9626342761516571,
"num_tokens": 8244922468.0,
"step": 77900
},
{
"entropy": 1.21125,
"epoch": 1.9085745066353264,
"grad_norm": 2.65625,
"learning_rate": 3.423200229110701e-08,
"loss": 0.1665,
"mean_token_accuracy": 0.9643622922897339,
"num_tokens": 8250033392.0,
"step": 77950
},
{
"entropy": 1.20125,
"epoch": 1.909798736594682,
"grad_norm": 3.546875,
"learning_rate": 3.3323306685437926e-08,
"loss": 0.1587,
"mean_token_accuracy": 0.9665237700939179,
"num_tokens": 8255293579.0,
"step": 78000
},
{
"entropy": 1.189375,
"epoch": 1.9110229665540375,
"grad_norm": 3.296875,
"learning_rate": 3.242676727247795e-08,
"loss": 0.146,
"mean_token_accuracy": 0.9674337708950043,
"num_tokens": 8260317228.0,
"step": 78050
},
{
"entropy": 1.2103125,
"epoch": 1.912247196513393,
"grad_norm": 4.0,
"learning_rate": 3.1542387725868146e-08,
"loss": 0.1651,
"mean_token_accuracy": 0.9643155598640442,
"num_tokens": 8265716396.0,
"step": 78100
},
{
"entropy": 1.20078125,
"epoch": 1.9134714264727486,
"grad_norm": 2.453125,
"learning_rate": 3.0670171669423764e-08,
"loss": 0.1625,
"mean_token_accuracy": 0.9650612294673919,
"num_tokens": 8270999547.0,
"step": 78150
},
{
"entropy": 1.2115625,
"epoch": 1.9146956564321043,
"grad_norm": 2.421875,
"learning_rate": 2.981012267711858e-08,
"loss": 0.1725,
"mean_token_accuracy": 0.9635538387298584,
"num_tokens": 8276439622.0,
"step": 78200
},
{
"entropy": 1.203125,
"epoch": 1.91591988639146,
"grad_norm": 3.5625,
"learning_rate": 2.896224427307226e-08,
"loss": 0.1649,
"mean_token_accuracy": 0.9643189585208893,
"num_tokens": 8281629841.0,
"step": 78250
},
{
"entropy": 1.20921875,
"epoch": 1.9171441163508154,
"grad_norm": 3.5,
"learning_rate": 2.8126539931533023e-08,
"loss": 0.1601,
"mean_token_accuracy": 0.9657320499420166,
"num_tokens": 8286850296.0,
"step": 78300
},
{
"entropy": 1.2075,
"epoch": 1.9183683463101708,
"grad_norm": 3.078125,
"learning_rate": 2.7303013076866335e-08,
"loss": 0.1675,
"mean_token_accuracy": 0.964200325012207,
"num_tokens": 8292528304.0,
"step": 78350
},
{
"entropy": 1.21671875,
"epoch": 1.9195925762695265,
"grad_norm": 4.125,
"learning_rate": 2.6491667083537896e-08,
"loss": 0.1674,
"mean_token_accuracy": 0.9635697185993195,
"num_tokens": 8297851717.0,
"step": 78400
},
{
"entropy": 1.203125,
"epoch": 1.9208168062288822,
"grad_norm": 3.5625,
"learning_rate": 2.5692505276102673e-08,
"loss": 0.1639,
"mean_token_accuracy": 0.9647056591510773,
"num_tokens": 8302822545.0,
"step": 78450
},
{
"entropy": 1.20234375,
"epoch": 1.9220410361882376,
"grad_norm": 4.125,
"learning_rate": 2.490553092918957e-08,
"loss": 0.167,
"mean_token_accuracy": 0.9645107495784759,
"num_tokens": 8308044186.0,
"step": 78500
},
{
"entropy": 1.20390625,
"epoch": 1.923265266147593,
"grad_norm": 4.1875,
"learning_rate": 2.4130747267488096e-08,
"loss": 0.1587,
"mean_token_accuracy": 0.9651757764816284,
"num_tokens": 8313261711.0,
"step": 78550
},
{
"entropy": 1.20625,
"epoch": 1.9244894961069487,
"grad_norm": 2.390625,
"learning_rate": 2.3368157465735727e-08,
"loss": 0.1729,
"mean_token_accuracy": 0.9643122732639313,
"num_tokens": 8318954245.0,
"step": 78600
},
{
"entropy": 1.21640625,
"epoch": 1.9257137260663044,
"grad_norm": 5.46875,
"learning_rate": 2.261776464870424e-08,
"loss": 0.1712,
"mean_token_accuracy": 0.9633339118957519,
"num_tokens": 8324544756.0,
"step": 78650
},
{
"entropy": 1.21515625,
"epoch": 1.9269379560256599,
"grad_norm": 2.875,
"learning_rate": 2.1879571891188054e-08,
"loss": 0.1751,
"mean_token_accuracy": 0.9626336395740509,
"num_tokens": 8329948691.0,
"step": 78700
},
{
"entropy": 1.20515625,
"epoch": 1.9281621859850153,
"grad_norm": 2.71875,
"learning_rate": 2.1153582217990574e-08,
"loss": 0.1655,
"mean_token_accuracy": 0.964772834777832,
"num_tokens": 8335173517.0,
"step": 78750
},
{
"entropy": 1.2015625,
"epoch": 1.929386415944371,
"grad_norm": 2.796875,
"learning_rate": 2.043979860391154e-08,
"loss": 0.1711,
"mean_token_accuracy": 0.9635234928131103,
"num_tokens": 8340379735.0,
"step": 78800
},
{
"entropy": 1.1909375,
"epoch": 1.9306106459037267,
"grad_norm": 3.703125,
"learning_rate": 1.9738223973735702e-08,
"loss": 0.1559,
"mean_token_accuracy": 0.9672637641429901,
"num_tokens": 8345381104.0,
"step": 78850
},
{
"entropy": 1.21375,
"epoch": 1.9318348758630821,
"grad_norm": 2.484375,
"learning_rate": 1.9048861202221823e-08,
"loss": 0.1681,
"mean_token_accuracy": 0.9651576709747315,
"num_tokens": 8350559447.0,
"step": 78900
},
{
"entropy": 1.21234375,
"epoch": 1.9330591058224376,
"grad_norm": 3.9375,
"learning_rate": 1.8371713114086697e-08,
"loss": 0.1652,
"mean_token_accuracy": 0.9637591278553009,
"num_tokens": 8355928028.0,
"step": 78950
},
{
"entropy": 1.20640625,
"epoch": 1.9342833357817932,
"grad_norm": 0.4453125,
"learning_rate": 1.770678248399982e-08,
"loss": 0.1621,
"mean_token_accuracy": 0.9652046132087707,
"num_tokens": 8361366979.0,
"step": 79000
},
{
"entropy": 1.20453125,
"epoch": 1.935507565741149,
"grad_norm": 4.1875,
"learning_rate": 1.7054072036566394e-08,
"loss": 0.1685,
"mean_token_accuracy": 0.9641025936603547,
"num_tokens": 8366288409.0,
"step": 79050
},
{
"entropy": 1.2125,
"epoch": 1.9367317957005044,
"grad_norm": 2.234375,
"learning_rate": 1.6413584446319018e-08,
"loss": 0.1632,
"mean_token_accuracy": 0.9653700625896454,
"num_tokens": 8371880970.0,
"step": 79100
},
{
"entropy": 1.20578125,
"epoch": 1.9379560256598598,
"grad_norm": 2.609375,
"learning_rate": 1.5785322337706688e-08,
"loss": 0.164,
"mean_token_accuracy": 0.9650757694244385,
"num_tokens": 8377110509.0,
"step": 79150
},
{
"entropy": 1.20046875,
"epoch": 1.9391802556192155,
"grad_norm": 2.140625,
"learning_rate": 1.5169288285082793e-08,
"loss": 0.1631,
"mean_token_accuracy": 0.9651304471492768,
"num_tokens": 8382268459.0,
"step": 79200
},
{
"entropy": 1.2075,
"epoch": 1.9404044855785711,
"grad_norm": 3.40625,
"learning_rate": 1.4565484812696151e-08,
"loss": 0.155,
"mean_token_accuracy": 0.9661095356941223,
"num_tokens": 8387474552.0,
"step": 79250
},
{
"entropy": 1.1728125,
"epoch": 1.9416287155379266,
"grad_norm": 2.578125,
"learning_rate": 1.3973914394678655e-08,
"loss": 0.1379,
"mean_token_accuracy": 0.9702671027183533,
"num_tokens": 8392280218.0,
"step": 79300
},
{
"entropy": 1.21859375,
"epoch": 1.942852945497282,
"grad_norm": 3.828125,
"learning_rate": 1.3394579455037637e-08,
"loss": 0.1586,
"mean_token_accuracy": 0.9652379488945008,
"num_tokens": 8397377045.0,
"step": 79350
},
{
"entropy": 1.20515625,
"epoch": 1.9440771754566377,
"grad_norm": 2.453125,
"learning_rate": 1.2827482367643862e-08,
"loss": 0.1537,
"mean_token_accuracy": 0.9671237909793854,
"num_tokens": 8402675630.0,
"step": 79400
},
{
"entropy": 1.18984375,
"epoch": 1.9453014054159934,
"grad_norm": 2.703125,
"learning_rate": 1.2272625456221875e-08,
"loss": 0.1511,
"mean_token_accuracy": 0.9674056422710419,
"num_tokens": 8407470922.0,
"step": 79450
},
{
"entropy": 1.22015625,
"epoch": 1.946525635375349,
"grad_norm": 3.078125,
"learning_rate": 1.1730010994342344e-08,
"loss": 0.1683,
"mean_token_accuracy": 0.963656575679779,
"num_tokens": 8413030681.0,
"step": 79500
},
{
"entropy": 1.2075,
"epoch": 1.9477498653347045,
"grad_norm": 3.703125,
"learning_rate": 1.1199641205410727e-08,
"loss": 0.1676,
"mean_token_accuracy": 0.9641730666160584,
"num_tokens": 8418435608.0,
"step": 79550
},
{
"entropy": 1.20109375,
"epoch": 1.94897409529406,
"grad_norm": 2.609375,
"learning_rate": 1.0681518262659618e-08,
"loss": 0.1612,
"mean_token_accuracy": 0.9652375304698944,
"num_tokens": 8423410030.0,
"step": 79600
},
{
"entropy": 1.19328125,
"epoch": 1.9501983252534156,
"grad_norm": 4.3125,
"learning_rate": 1.0175644289138419e-08,
"loss": 0.1565,
"mean_token_accuracy": 0.9664306437969208,
"num_tokens": 8428505734.0,
"step": 79650
},
{
"entropy": 1.19609375,
"epoch": 1.9514225552127713,
"grad_norm": 3.15625,
"learning_rate": 9.682021357706018e-09,
"loss": 0.1491,
"mean_token_accuracy": 0.968139351606369,
"num_tokens": 8433821851.0,
"step": 79700
},
{
"entropy": 1.19390625,
"epoch": 1.9526467851721268,
"grad_norm": 3.53125,
"learning_rate": 9.20065149102145e-09,
"loss": 0.1566,
"mean_token_accuracy": 0.9661858582496643,
"num_tokens": 8438988974.0,
"step": 79750
},
{
"entropy": 1.20546875,
"epoch": 1.9538710151314822,
"grad_norm": 2.5625,
"learning_rate": 8.731536661535588e-09,
"loss": 0.1691,
"mean_token_accuracy": 0.9629131543636322,
"num_tokens": 8444297546.0,
"step": 79800
},
{
"entropy": 1.20671875,
"epoch": 1.9550952450908379,
"grad_norm": 2.640625,
"learning_rate": 8.274678791484136e-09,
"loss": 0.1603,
"mean_token_accuracy": 0.9652000117301941,
"num_tokens": 8449852335.0,
"step": 79850
},
{
"entropy": 1.18640625,
"epoch": 1.9563194750501935,
"grad_norm": 3.359375,
"learning_rate": 7.830079752877973e-09,
"loss": 0.1394,
"mean_token_accuracy": 0.9697746348381042,
"num_tokens": 8454775071.0,
"step": 79900
},
{
"entropy": 1.2021875,
"epoch": 1.957543705009549,
"grad_norm": 2.734375,
"learning_rate": 7.397741367497157e-09,
"loss": 0.1613,
"mean_token_accuracy": 0.9663744091987609,
"num_tokens": 8460122393.0,
"step": 79950
},
{
"entropy": 1.21125,
"epoch": 1.9587679349689044,
"grad_norm": 3.125,
"learning_rate": 6.977665406882272e-09,
"loss": 0.1689,
"mean_token_accuracy": 0.9630868649482727,
"num_tokens": 8465752957.0,
"step": 80000
},
{
"epoch": 1.9587679349689044,
"eval_entropy": 1.2009765625,
"eval_loss": 0.17771507799625397,
"eval_mean_token_accuracy": 0.9620104561249415,
"eval_num_tokens": 8465752957.0,
"eval_runtime": 611.9165,
"eval_samples_per_second": 15.78,
"eval_steps_per_second": 0.198,
"step": 80000
},
{
"entropy": 1.21203125,
"epoch": 1.9599921649282601,
"grad_norm": 3.421875,
"learning_rate": 6.569853592327757e-09,
"loss": 0.1792,
"mean_token_accuracy": 0.9620552754402161,
"num_tokens": 8471172063.0,
"step": 80050
},
{
"entropy": 1.200625,
"epoch": 1.9612163948876158,
"grad_norm": 3.28125,
"learning_rate": 6.174307594874917e-09,
"loss": 0.1558,
"mean_token_accuracy": 0.9663512742519379,
"num_tokens": 8476017366.0,
"step": 80100
},
{
"entropy": 1.195,
"epoch": 1.9624406248469712,
"grad_norm": 3.65625,
"learning_rate": 5.7910290353049285e-09,
"loss": 0.1529,
"mean_token_accuracy": 0.9670116317272186,
"num_tokens": 8481042255.0,
"step": 80150
},
{
"entropy": 1.18890625,
"epoch": 1.9636648548063267,
"grad_norm": 2.0,
"learning_rate": 5.420019484131844e-09,
"loss": 0.1608,
"mean_token_accuracy": 0.9656985890865326,
"num_tokens": 8486092212.0,
"step": 80200
},
{
"entropy": 1.209375,
"epoch": 1.9648890847656824,
"grad_norm": 2.203125,
"learning_rate": 5.061280461596929e-09,
"loss": 0.1747,
"mean_token_accuracy": 0.962527574300766,
"num_tokens": 8491676835.0,
"step": 80250
},
{
"entropy": 1.20671875,
"epoch": 1.966113314725038,
"grad_norm": 3.21875,
"learning_rate": 4.714813437661336e-09,
"loss": 0.1636,
"mean_token_accuracy": 0.9649911904335022,
"num_tokens": 8497110970.0,
"step": 80300
},
{
"entropy": 1.2090625,
"epoch": 1.9673375446843935,
"grad_norm": 3.078125,
"learning_rate": 4.380619832001775e-09,
"loss": 0.1698,
"mean_token_accuracy": 0.9634296333789826,
"num_tokens": 8502537441.0,
"step": 80350
},
{
"entropy": 1.20875,
"epoch": 1.968561774643749,
"grad_norm": 4.9375,
"learning_rate": 4.058701014002187e-09,
"loss": 0.1637,
"mean_token_accuracy": 0.9648308408260345,
"num_tokens": 8507630732.0,
"step": 80400
},
{
"entropy": 1.20390625,
"epoch": 1.9697860046031046,
"grad_norm": 2.15625,
"learning_rate": 3.749058302751074e-09,
"loss": 0.1531,
"mean_token_accuracy": 0.9671729254722595,
"num_tokens": 8512848795.0,
"step": 80450
},
{
"entropy": 1.2090625,
"epoch": 1.9710102345624603,
"grad_norm": 3.984375,
"learning_rate": 3.451692967033848e-09,
"loss": 0.1643,
"mean_token_accuracy": 0.9643464314937592,
"num_tokens": 8518368412.0,
"step": 80500
},
{
"entropy": 1.20875,
"epoch": 1.972234464521816,
"grad_norm": 3.203125,
"learning_rate": 3.1666062253284942e-09,
"loss": 0.1677,
"mean_token_accuracy": 0.9644135737419128,
"num_tokens": 8523653890.0,
"step": 80550
},
{
"entropy": 1.2159375,
"epoch": 1.9734586944811714,
"grad_norm": 3.875,
"learning_rate": 2.893799245800244e-09,
"loss": 0.166,
"mean_token_accuracy": 0.9650824117660523,
"num_tokens": 8529015592.0,
"step": 80600
},
{
"entropy": 1.2,
"epoch": 1.9746829244405268,
"grad_norm": 2.46875,
"learning_rate": 2.633273146297577e-09,
"loss": 0.1591,
"mean_token_accuracy": 0.9664743864536285,
"num_tokens": 8534375896.0,
"step": 80650
},
{
"entropy": 1.21421875,
"epoch": 1.9759071543998825,
"grad_norm": 2.484375,
"learning_rate": 2.385028994346894e-09,
"loss": 0.1685,
"mean_token_accuracy": 0.9637958765029907,
"num_tokens": 8539783916.0,
"step": 80700
},
{
"entropy": 1.19375,
"epoch": 1.9771313843592382,
"grad_norm": 2.34375,
"learning_rate": 2.149067807147853e-09,
"loss": 0.1589,
"mean_token_accuracy": 0.9659115636348724,
"num_tokens": 8544696959.0,
"step": 80750
},
{
"entropy": 1.21125,
"epoch": 1.9783556143185936,
"grad_norm": 2.25,
"learning_rate": 1.925390551570705e-09,
"loss": 0.1649,
"mean_token_accuracy": 0.9644637072086334,
"num_tokens": 8550159905.0,
"step": 80800
},
{
"entropy": 1.19796875,
"epoch": 1.979579844277949,
"grad_norm": 1.8828125,
"learning_rate": 1.7139981441502973e-09,
"loss": 0.1526,
"mean_token_accuracy": 0.9669871032238007,
"num_tokens": 8555193317.0,
"step": 80850
},
{
"entropy": 1.1953125,
"epoch": 1.9808040742373048,
"grad_norm": 2.828125,
"learning_rate": 1.514891451083744e-09,
"loss": 0.1676,
"mean_token_accuracy": 0.9646944868564605,
"num_tokens": 8560635630.0,
"step": 80900
},
{
"entropy": 1.21921875,
"epoch": 1.9820283041966604,
"grad_norm": 2.90625,
"learning_rate": 1.328071288226762e-09,
"loss": 0.1694,
"mean_token_accuracy": 0.9641423618793488,
"num_tokens": 8566246965.0,
"step": 80950
},
{
"entropy": 1.1896875,
"epoch": 1.9832525341560159,
"grad_norm": 1.9453125,
"learning_rate": 1.1535384210893395e-09,
"loss": 0.1436,
"mean_token_accuracy": 0.9696673655509949,
"num_tokens": 8571430149.0,
"step": 81000
},
{
"entropy": 1.2009375,
"epoch": 1.9844767641153713,
"grad_norm": 2.015625,
"learning_rate": 9.912935648344057e-10,
"loss": 0.1667,
"mean_token_accuracy": 0.9643544840812683,
"num_tokens": 8576922262.0,
"step": 81050
},
{
"entropy": 1.208125,
"epoch": 1.985700994074727,
"grad_norm": 2.71875,
"learning_rate": 8.413373842721672e-10,
"loss": 0.1569,
"mean_token_accuracy": 0.9652732384204864,
"num_tokens": 8582076472.0,
"step": 81100
},
{
"entropy": 1.20875,
"epoch": 1.9869252240340827,
"grad_norm": 2.671875,
"learning_rate": 7.036704938611083e-10,
"loss": 0.1691,
"mean_token_accuracy": 0.9644241857528687,
"num_tokens": 8587501762.0,
"step": 81150
},
{
"entropy": 1.17515625,
"epoch": 1.9881494539934381,
"grad_norm": 3.0,
"learning_rate": 5.782934577009957e-10,
"loss": 0.1391,
"mean_token_accuracy": 0.9696795284748078,
"num_tokens": 8592212871.0,
"step": 81200
},
{
"entropy": 1.195,
"epoch": 1.9893736839527936,
"grad_norm": 2.28125,
"learning_rate": 4.652067895352108e-10,
"loss": 0.1522,
"mean_token_accuracy": 0.9666969799995422,
"num_tokens": 8597378423.0,
"step": 81250
},
{
"entropy": 1.198125,
"epoch": 1.9905979139121492,
"grad_norm": 2.828125,
"learning_rate": 3.644109527447537e-10,
"loss": 0.1695,
"mean_token_accuracy": 0.9640089082717895,
"num_tokens": 8602514679.0,
"step": 81300
},
{
"entropy": 1.20984375,
"epoch": 1.991822143871505,
"grad_norm": 4.5,
"learning_rate": 2.7590636034857675e-10,
"loss": 0.1634,
"mean_token_accuracy": 0.964598093032837,
"num_tokens": 8607950288.0,
"step": 81350
},
{
"entropy": 1.21296875,
"epoch": 1.9930463738308604,
"grad_norm": 3.625,
"learning_rate": 1.9969337500125308e-10,
"loss": 0.166,
"mean_token_accuracy": 0.9647457122802734,
"num_tokens": 8613281748.0,
"step": 81400
},
{
"entropy": 1.21078125,
"epoch": 1.9942706037902158,
"grad_norm": 1.78125,
"learning_rate": 1.3577230899197712e-10,
"loss": 0.1541,
"mean_token_accuracy": 0.9666912174224853,
"num_tokens": 8618635116.0,
"step": 81450
},
{
"entropy": 1.18859375,
"epoch": 1.9954948337495715,
"grad_norm": 3.09375,
"learning_rate": 8.414342424156729e-11,
"loss": 0.149,
"mean_token_accuracy": 0.9679227757453919,
"num_tokens": 8623674169.0,
"step": 81500
},
{
"entropy": 1.18640625,
"epoch": 1.9967190637089272,
"grad_norm": 2.75,
"learning_rate": 4.48069323044642e-11,
"loss": 0.1535,
"mean_token_accuracy": 0.966040461063385,
"num_tokens": 8628848521.0,
"step": 81550
},
{
"entropy": 1.1934375,
"epoch": 1.9979432936682826,
"grad_norm": 3.5625,
"learning_rate": 1.776299436406781e-11,
"loss": 0.1668,
"mean_token_accuracy": 0.9646425199508667,
"num_tokens": 8634212914.0,
"step": 81600
},
{
"entropy": 1.1825,
"epoch": 1.999167523627638,
"grad_norm": 2.65625,
"learning_rate": 3.0117212357350098e-12,
"loss": 0.147,
"mean_token_accuracy": 0.9685306799411774,
"num_tokens": 8639315101.0,
"step": 81650
}
],
"logging_steps": 50,
"max_steps": 81684,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 10000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.811857454193967e+19,
"train_batch_size": 12,
"trial_name": null,
"trial_params": null
}