e2b-pretrain-eosmask / trainer_state.json
jq's picture
Upload folder using huggingface_hub
c8929fe verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 8302,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.2810336351394653,
"epoch": 0.0012045290291496024,
"grad_norm": 836.0,
"learning_rate": 2.1634615384615387e-06,
"loss": 11.910458374023438,
"mean_token_accuracy": 0.15205802023410797,
"num_tokens": 67588.0,
"step": 10
},
{
"entropy": 1.4772902846336364,
"epoch": 0.002409058058299205,
"grad_norm": 6368.0,
"learning_rate": 4.567307692307692e-06,
"loss": 10.921498107910157,
"mean_token_accuracy": 0.16450221911072732,
"num_tokens": 134788.0,
"step": 20
},
{
"entropy": 1.9750785112380982,
"epoch": 0.0036135870874488074,
"grad_norm": 374.0,
"learning_rate": 6.9711538461538465e-06,
"loss": 8.950369262695313,
"mean_token_accuracy": 0.17044174969196318,
"num_tokens": 205314.0,
"step": 30
},
{
"entropy": 3.4127318382263185,
"epoch": 0.00481811611659841,
"grad_norm": 117.5,
"learning_rate": 9.375000000000001e-06,
"loss": 6.107471084594726,
"mean_token_accuracy": 0.19763734340667724,
"num_tokens": 275377.0,
"step": 40
},
{
"entropy": 4.608194398880005,
"epoch": 0.006022645145748013,
"grad_norm": 145.0,
"learning_rate": 1.1778846153846154e-05,
"loss": 4.998584365844726,
"mean_token_accuracy": 0.23358086347579957,
"num_tokens": 338642.0,
"step": 50
},
{
"entropy": 4.467870378494263,
"epoch": 0.007227174174897615,
"grad_norm": 142.0,
"learning_rate": 1.4182692307692308e-05,
"loss": 4.630952453613281,
"mean_token_accuracy": 0.2546722576022148,
"num_tokens": 409902.0,
"step": 60
},
{
"entropy": 4.305765962600708,
"epoch": 0.008431703204047217,
"grad_norm": 39.25,
"learning_rate": 1.6586538461538463e-05,
"loss": 4.396952819824219,
"mean_token_accuracy": 0.27402625530958175,
"num_tokens": 477073.0,
"step": 70
},
{
"entropy": 4.207392024993896,
"epoch": 0.00963623223319682,
"grad_norm": 42.0,
"learning_rate": 1.8990384615384615e-05,
"loss": 4.2559349060058596,
"mean_token_accuracy": 0.282428053021431,
"num_tokens": 546516.0,
"step": 80
},
{
"entropy": 3.8669612884521483,
"epoch": 0.010840761262346423,
"grad_norm": 864.0,
"learning_rate": 2.139423076923077e-05,
"loss": 3.9113719940185545,
"mean_token_accuracy": 0.319625186920166,
"num_tokens": 616970.0,
"step": 90
},
{
"entropy": 3.7632123231887817,
"epoch": 0.012045290291496025,
"grad_norm": 37.25,
"learning_rate": 2.3798076923076922e-05,
"loss": 3.7912879943847657,
"mean_token_accuracy": 0.3272585391998291,
"num_tokens": 684911.0,
"step": 100
},
{
"epoch": 0.012045290291496025,
"eval_entropy": 3.9685288667678833,
"eval_loss": 4.026243209838867,
"eval_mean_token_accuracy": 0.3059713691473007,
"eval_num_tokens": 684911.0,
"eval_runtime": 0.4169,
"eval_samples_per_second": 38.376,
"eval_steps_per_second": 4.797,
"step": 100
},
{
"entropy": 3.7600659847259523,
"epoch": 0.013249819320645628,
"grad_norm": 31.0,
"learning_rate": 2.620192307692308e-05,
"loss": 3.7905757904052733,
"mean_token_accuracy": 0.3308758944272995,
"num_tokens": 752070.0,
"step": 110
},
{
"entropy": 3.621634840965271,
"epoch": 0.01445434834979523,
"grad_norm": 16.875,
"learning_rate": 2.860576923076923e-05,
"loss": 3.638637161254883,
"mean_token_accuracy": 0.3428749591112137,
"num_tokens": 818059.0,
"step": 120
},
{
"entropy": 3.7145506381988525,
"epoch": 0.015658877378944832,
"grad_norm": 426.0,
"learning_rate": 3.1009615384615384e-05,
"loss": 3.687373733520508,
"mean_token_accuracy": 0.33822024166584014,
"num_tokens": 887211.0,
"step": 130
},
{
"entropy": 3.5624027252197266,
"epoch": 0.016863406408094434,
"grad_norm": 28.625,
"learning_rate": 3.3413461538461536e-05,
"loss": 3.5997623443603515,
"mean_token_accuracy": 0.34836446344852445,
"num_tokens": 957271.0,
"step": 140
},
{
"entropy": 3.424220418930054,
"epoch": 0.018067935437244036,
"grad_norm": 25.0,
"learning_rate": 3.5817307692307695e-05,
"loss": 3.464914321899414,
"mean_token_accuracy": 0.360398867726326,
"num_tokens": 1028063.0,
"step": 150
},
{
"entropy": 3.524876356124878,
"epoch": 0.01927246446639364,
"grad_norm": 14.5625,
"learning_rate": 3.8221153846153846e-05,
"loss": 3.4790149688720704,
"mean_token_accuracy": 0.36009892225265505,
"num_tokens": 1098655.0,
"step": 160
},
{
"entropy": 3.3341711282730104,
"epoch": 0.020476993495543244,
"grad_norm": 18.5,
"learning_rate": 4.0625000000000005e-05,
"loss": 3.3776344299316405,
"mean_token_accuracy": 0.37702774703502656,
"num_tokens": 1164462.0,
"step": 170
},
{
"entropy": 3.3053973436355593,
"epoch": 0.021681522524692846,
"grad_norm": 16.75,
"learning_rate": 4.302884615384616e-05,
"loss": 3.3609580993652344,
"mean_token_accuracy": 0.37871713638305665,
"num_tokens": 1235254.0,
"step": 180
},
{
"entropy": 3.3385438680648805,
"epoch": 0.02288605155384245,
"grad_norm": 15.6875,
"learning_rate": 4.543269230769231e-05,
"loss": 3.347671890258789,
"mean_token_accuracy": 0.37770530581474304,
"num_tokens": 1304364.0,
"step": 190
},
{
"entropy": 3.262368583679199,
"epoch": 0.02409058058299205,
"grad_norm": 18.375,
"learning_rate": 4.783653846153847e-05,
"loss": 3.2868324279785157,
"mean_token_accuracy": 0.3836541771888733,
"num_tokens": 1373184.0,
"step": 200
},
{
"epoch": 0.02409058058299205,
"eval_entropy": 3.401029348373413,
"eval_loss": 3.5328683853149414,
"eval_mean_token_accuracy": 0.3548418879508972,
"eval_num_tokens": 1373184.0,
"eval_runtime": 0.3788,
"eval_samples_per_second": 42.238,
"eval_steps_per_second": 5.28,
"step": 200
},
{
"entropy": 3.1738123893737793,
"epoch": 0.025295109612141653,
"grad_norm": 31.125,
"learning_rate": 5.024038461538462e-05,
"loss": 3.1927379608154296,
"mean_token_accuracy": 0.39391712546348573,
"num_tokens": 1442812.0,
"step": 210
},
{
"entropy": 3.133743643760681,
"epoch": 0.026499638641291255,
"grad_norm": 8.625,
"learning_rate": 5.264423076923077e-05,
"loss": 3.1629384994506835,
"mean_token_accuracy": 0.40554835796356203,
"num_tokens": 1513954.0,
"step": 220
},
{
"entropy": 3.1619156122207643,
"epoch": 0.027704167670440857,
"grad_norm": 14.9375,
"learning_rate": 5.504807692307693e-05,
"loss": 3.1566917419433596,
"mean_token_accuracy": 0.39930360019207,
"num_tokens": 1581774.0,
"step": 230
},
{
"entropy": 3.1904892444610597,
"epoch": 0.02890869669959046,
"grad_norm": 13.4375,
"learning_rate": 5.7451923076923074e-05,
"loss": 3.2271888732910154,
"mean_token_accuracy": 0.38786653280258176,
"num_tokens": 1649999.0,
"step": 240
},
{
"entropy": 3.090328550338745,
"epoch": 0.030113225728740062,
"grad_norm": 12.3125,
"learning_rate": 5.985576923076923e-05,
"loss": 3.1105745315551756,
"mean_token_accuracy": 0.410536527633667,
"num_tokens": 1720878.0,
"step": 250
},
{
"entropy": 3.117401146888733,
"epoch": 0.031317754757889664,
"grad_norm": 8.0625,
"learning_rate": 6.225961538461539e-05,
"loss": 3.120888900756836,
"mean_token_accuracy": 0.40497787594795226,
"num_tokens": 1789793.0,
"step": 260
},
{
"entropy": 3.114533042907715,
"epoch": 0.032522283787039266,
"grad_norm": 6.75,
"learning_rate": 6.466346153846154e-05,
"loss": 3.1246286392211915,
"mean_token_accuracy": 0.40853759050369265,
"num_tokens": 1858569.0,
"step": 270
},
{
"entropy": 2.9943730354309084,
"epoch": 0.03372681281618887,
"grad_norm": 7.40625,
"learning_rate": 6.70673076923077e-05,
"loss": 3.0746133804321287,
"mean_token_accuracy": 0.4096983641386032,
"num_tokens": 1930700.0,
"step": 280
},
{
"entropy": 3.0540405035018923,
"epoch": 0.03493134184533847,
"grad_norm": 17.5,
"learning_rate": 6.947115384615385e-05,
"loss": 3.002737617492676,
"mean_token_accuracy": 0.42073442935943606,
"num_tokens": 1997275.0,
"step": 290
},
{
"entropy": 2.9349114656448365,
"epoch": 0.03613587087448807,
"grad_norm": 97.0,
"learning_rate": 7.1875e-05,
"loss": 2.9756820678710936,
"mean_token_accuracy": 0.4268068581819534,
"num_tokens": 2066990.0,
"step": 300
},
{
"epoch": 0.03613587087448807,
"eval_entropy": 3.4880975484848022,
"eval_loss": 3.4292285442352295,
"eval_mean_token_accuracy": 0.33973701298236847,
"eval_num_tokens": 2066990.0,
"eval_runtime": 0.5504,
"eval_samples_per_second": 29.069,
"eval_steps_per_second": 3.634,
"step": 300
},
{
"entropy": 2.9692421674728395,
"epoch": 0.037340399903637675,
"grad_norm": 5.90625,
"learning_rate": 7.427884615384616e-05,
"loss": 2.9625797271728516,
"mean_token_accuracy": 0.4250692486763,
"num_tokens": 2137171.0,
"step": 310
},
{
"entropy": 2.91152765750885,
"epoch": 0.03854492893278728,
"grad_norm": 43.0,
"learning_rate": 7.668269230769232e-05,
"loss": 2.9240610122680666,
"mean_token_accuracy": 0.4358662247657776,
"num_tokens": 2207592.0,
"step": 320
},
{
"entropy": 2.8902512788772583,
"epoch": 0.03974945796193688,
"grad_norm": 6.1875,
"learning_rate": 7.908653846153847e-05,
"loss": 2.90927677154541,
"mean_token_accuracy": 0.4399706482887268,
"num_tokens": 2273887.0,
"step": 330
},
{
"entropy": 2.9261450290679933,
"epoch": 0.04095398699108649,
"grad_norm": 11.0625,
"learning_rate": 8.149038461538462e-05,
"loss": 2.939039421081543,
"mean_token_accuracy": 0.42746458351612093,
"num_tokens": 2344104.0,
"step": 340
},
{
"entropy": 2.9675798416137695,
"epoch": 0.04215851602023609,
"grad_norm": 4.28125,
"learning_rate": 8.389423076923077e-05,
"loss": 2.9653833389282225,
"mean_token_accuracy": 0.4271013975143433,
"num_tokens": 2413987.0,
"step": 350
},
{
"entropy": 2.7644049644470217,
"epoch": 0.04336304504938569,
"grad_norm": 8.0625,
"learning_rate": 8.629807692307694e-05,
"loss": 2.8332645416259767,
"mean_token_accuracy": 0.45240907967090604,
"num_tokens": 2483166.0,
"step": 360
},
{
"entropy": 2.9669906377792357,
"epoch": 0.044567574078535295,
"grad_norm": 7.0,
"learning_rate": 8.870192307692308e-05,
"loss": 2.9809848785400392,
"mean_token_accuracy": 0.42300455570220946,
"num_tokens": 2553919.0,
"step": 370
},
{
"entropy": 2.926973581314087,
"epoch": 0.0457721031076849,
"grad_norm": 5.875,
"learning_rate": 9.110576923076923e-05,
"loss": 2.9251119613647463,
"mean_token_accuracy": 0.4301867544651031,
"num_tokens": 2621338.0,
"step": 380
},
{
"entropy": 3.042341208457947,
"epoch": 0.0469766321368345,
"grad_norm": 8.0,
"learning_rate": 9.350961538461539e-05,
"loss": 3.0066806793212892,
"mean_token_accuracy": 0.4190910369157791,
"num_tokens": 2693649.0,
"step": 390
},
{
"entropy": 2.855639863014221,
"epoch": 0.0481811611659841,
"grad_norm": 11.25,
"learning_rate": 9.591346153846154e-05,
"loss": 2.925531768798828,
"mean_token_accuracy": 0.43328951895236967,
"num_tokens": 2762855.0,
"step": 400
},
{
"epoch": 0.0481811611659841,
"eval_entropy": 3.120109796524048,
"eval_loss": 3.337949275970459,
"eval_mean_token_accuracy": 0.36517854034900665,
"eval_num_tokens": 2762855.0,
"eval_runtime": 0.4155,
"eval_samples_per_second": 38.509,
"eval_steps_per_second": 4.814,
"step": 400
},
{
"entropy": 2.9335047960281373,
"epoch": 0.049385690195133704,
"grad_norm": 14.5625,
"learning_rate": 9.83173076923077e-05,
"loss": 2.9816661834716798,
"mean_token_accuracy": 0.4285028487443924,
"num_tokens": 2831768.0,
"step": 410
},
{
"entropy": 2.796990966796875,
"epoch": 0.050590219224283306,
"grad_norm": 5.5625,
"learning_rate": 9.999996429174181e-05,
"loss": 2.8231491088867187,
"mean_token_accuracy": 0.4479815810918808,
"num_tokens": 2902774.0,
"step": 420
},
{
"entropy": 2.800985240936279,
"epoch": 0.05179474825343291,
"grad_norm": 7.3125,
"learning_rate": 9.999932947968169e-05,
"loss": 2.816908073425293,
"mean_token_accuracy": 0.44685631394386294,
"num_tokens": 2974752.0,
"step": 430
},
{
"entropy": 2.833960461616516,
"epoch": 0.05299927728258251,
"grad_norm": 4.4375,
"learning_rate": 9.999790116236919e-05,
"loss": 2.8537731170654297,
"mean_token_accuracy": 0.43992237448692323,
"num_tokens": 3040347.0,
"step": 440
},
{
"entropy": 2.80394446849823,
"epoch": 0.05420380631173211,
"grad_norm": 5.0,
"learning_rate": 9.999567936247218e-05,
"loss": 2.8184518814086914,
"mean_token_accuracy": 0.4442470997571945,
"num_tokens": 3108408.0,
"step": 450
},
{
"entropy": 2.8903349876403808,
"epoch": 0.055408335340881715,
"grad_norm": 3.375,
"learning_rate": 9.999266411525132e-05,
"loss": 2.90649471282959,
"mean_token_accuracy": 0.4361670553684235,
"num_tokens": 3178364.0,
"step": 460
},
{
"entropy": 2.770217847824097,
"epoch": 0.05661286437003132,
"grad_norm": 2.671875,
"learning_rate": 9.998885546855956e-05,
"loss": 2.7700069427490233,
"mean_token_accuracy": 0.4557085156440735,
"num_tokens": 3249289.0,
"step": 470
},
{
"entropy": 2.763340950012207,
"epoch": 0.05781739339918092,
"grad_norm": 3.5,
"learning_rate": 9.998425348284132e-05,
"loss": 2.7811700820922853,
"mean_token_accuracy": 0.4531765550374985,
"num_tokens": 3318854.0,
"step": 480
},
{
"entropy": 2.7287254333496094,
"epoch": 0.05902192242833052,
"grad_norm": 2.96875,
"learning_rate": 9.997885823113159e-05,
"loss": 2.790057373046875,
"mean_token_accuracy": 0.44789515137672425,
"num_tokens": 3387544.0,
"step": 490
},
{
"entropy": 2.775154709815979,
"epoch": 0.060226451457480124,
"grad_norm": 6.34375,
"learning_rate": 9.99726697990547e-05,
"loss": 2.7989133834838866,
"mean_token_accuracy": 0.45067694783210754,
"num_tokens": 3457225.0,
"step": 500
},
{
"epoch": 0.060226451457480124,
"eval_entropy": 3.139446258544922,
"eval_loss": 3.338728189468384,
"eval_mean_token_accuracy": 0.3554842323064804,
"eval_num_tokens": 3457225.0,
"eval_runtime": 0.4113,
"eval_samples_per_second": 38.903,
"eval_steps_per_second": 4.863,
"step": 500
},
{
"entropy": 2.824716258049011,
"epoch": 0.061430980486629726,
"grad_norm": 6.21875,
"learning_rate": 9.996568828482307e-05,
"loss": 2.8506168365478515,
"mean_token_accuracy": 0.4408591091632843,
"num_tokens": 3525902.0,
"step": 510
},
{
"entropy": 2.742767095565796,
"epoch": 0.06263550951577933,
"grad_norm": 4.875,
"learning_rate": 9.995791379923553e-05,
"loss": 2.774262237548828,
"mean_token_accuracy": 0.45442442297935487,
"num_tokens": 3596373.0,
"step": 520
},
{
"entropy": 2.6956833362579347,
"epoch": 0.06384003854492894,
"grad_norm": 10.5,
"learning_rate": 9.994934646567564e-05,
"loss": 2.669524383544922,
"mean_token_accuracy": 0.4663479655981064,
"num_tokens": 3664949.0,
"step": 530
},
{
"entropy": 2.7131295919418337,
"epoch": 0.06504456757407853,
"grad_norm": 8.625,
"learning_rate": 9.99399864201097e-05,
"loss": 2.7482681274414062,
"mean_token_accuracy": 0.45665947794914247,
"num_tokens": 3734769.0,
"step": 540
},
{
"entropy": 2.7076703310012817,
"epoch": 0.06624909660322814,
"grad_norm": 4.5,
"learning_rate": 9.992983381108463e-05,
"loss": 2.7302698135375976,
"mean_token_accuracy": 0.4592186540365219,
"num_tokens": 3804019.0,
"step": 550
},
{
"entropy": 2.6755930185317993,
"epoch": 0.06745362563237774,
"grad_norm": 16.25,
"learning_rate": 9.991888879972552e-05,
"loss": 2.713031196594238,
"mean_token_accuracy": 0.4650493025779724,
"num_tokens": 3869181.0,
"step": 560
},
{
"entropy": 2.5656904935836793,
"epoch": 0.06865815466152735,
"grad_norm": 2.75,
"learning_rate": 9.990715155973325e-05,
"loss": 2.5744911193847657,
"mean_token_accuracy": 0.4835150271654129,
"num_tokens": 3936190.0,
"step": 570
},
{
"entropy": 2.6105204105377195,
"epoch": 0.06986268369067694,
"grad_norm": 4.3125,
"learning_rate": 9.989462227738148e-05,
"loss": 2.6728843688964843,
"mean_token_accuracy": 0.46942216753959654,
"num_tokens": 4003274.0,
"step": 580
},
{
"entropy": 2.729413914680481,
"epoch": 0.07106721271982655,
"grad_norm": 3.265625,
"learning_rate": 9.988130115151392e-05,
"loss": 2.7112655639648438,
"mean_token_accuracy": 0.46558586061000823,
"num_tokens": 4074673.0,
"step": 590
},
{
"entropy": 2.57799232006073,
"epoch": 0.07227174174897615,
"grad_norm": 6.40625,
"learning_rate": 9.986718839354111e-05,
"loss": 2.5880659103393553,
"mean_token_accuracy": 0.47919810116291045,
"num_tokens": 4142427.0,
"step": 600
},
{
"epoch": 0.07227174174897615,
"eval_entropy": 3.2087725400924683,
"eval_loss": 3.3046178817749023,
"eval_mean_token_accuracy": 0.35669657588005066,
"eval_num_tokens": 4142427.0,
"eval_runtime": 0.3908,
"eval_samples_per_second": 40.94,
"eval_steps_per_second": 5.118,
"step": 600
},
{
"entropy": 2.6087251901626587,
"epoch": 0.07347627077812575,
"grad_norm": 10.9375,
"learning_rate": 9.985228422743697e-05,
"loss": 2.642395782470703,
"mean_token_accuracy": 0.4696370273828506,
"num_tokens": 4208167.0,
"step": 610
},
{
"entropy": 2.693037247657776,
"epoch": 0.07468079980727535,
"grad_norm": 5.96875,
"learning_rate": 9.983658888973537e-05,
"loss": 2.716563034057617,
"mean_token_accuracy": 0.46087419986724854,
"num_tokens": 4276554.0,
"step": 620
},
{
"entropy": 2.598901891708374,
"epoch": 0.07588532883642496,
"grad_norm": 3.375,
"learning_rate": 9.982010262952629e-05,
"loss": 2.6250751495361326,
"mean_token_accuracy": 0.47428264617919924,
"num_tokens": 4346959.0,
"step": 630
},
{
"entropy": 2.6499593019485475,
"epoch": 0.07708985786557455,
"grad_norm": 6.75,
"learning_rate": 9.980282570845192e-05,
"loss": 2.651329231262207,
"mean_token_accuracy": 0.4706882297992706,
"num_tokens": 4412980.0,
"step": 640
},
{
"entropy": 2.6572098255157472,
"epoch": 0.07829438689472416,
"grad_norm": 2.8125,
"learning_rate": 9.978475840070251e-05,
"loss": 2.6689807891845705,
"mean_token_accuracy": 0.46909240186214446,
"num_tokens": 4482945.0,
"step": 650
},
{
"entropy": 2.64446074962616,
"epoch": 0.07949891592387376,
"grad_norm": 4.40625,
"learning_rate": 9.976590099301197e-05,
"loss": 2.6496517181396486,
"mean_token_accuracy": 0.4731697618961334,
"num_tokens": 4550746.0,
"step": 660
},
{
"entropy": 2.5607208251953124,
"epoch": 0.08070344495302337,
"grad_norm": 12.0,
"learning_rate": 9.974625378465337e-05,
"loss": 2.622728157043457,
"mean_token_accuracy": 0.47670555114746094,
"num_tokens": 4623290.0,
"step": 670
},
{
"entropy": 2.594883131980896,
"epoch": 0.08190797398217298,
"grad_norm": 9.25,
"learning_rate": 9.97258170874341e-05,
"loss": 2.642950248718262,
"mean_token_accuracy": 0.4712438017129898,
"num_tokens": 4692716.0,
"step": 680
},
{
"entropy": 2.5865614652633666,
"epoch": 0.08311250301132257,
"grad_norm": 4.15625,
"learning_rate": 9.970459122569109e-05,
"loss": 2.5889955520629884,
"mean_token_accuracy": 0.48189602196216585,
"num_tokens": 4758620.0,
"step": 690
},
{
"entropy": 2.477654957771301,
"epoch": 0.08431703204047218,
"grad_norm": 4.125,
"learning_rate": 9.96825765362855e-05,
"loss": 2.4992355346679687,
"mean_token_accuracy": 0.49264668226242064,
"num_tokens": 4827946.0,
"step": 700
},
{
"epoch": 0.08431703204047218,
"eval_entropy": 3.0759323835372925,
"eval_loss": 3.2211337089538574,
"eval_mean_token_accuracy": 0.3738161623477936,
"eval_num_tokens": 4827946.0,
"eval_runtime": 0.3902,
"eval_samples_per_second": 41.004,
"eval_steps_per_second": 5.126,
"step": 700
},
{
"entropy": 2.6039110660552978,
"epoch": 0.08552156106962178,
"grad_norm": 4.21875,
"learning_rate": 9.965977336859744e-05,
"loss": 2.637973403930664,
"mean_token_accuracy": 0.4767207413911819,
"num_tokens": 4893902.0,
"step": 710
},
{
"entropy": 2.5275394916534424,
"epoch": 0.08672609009877139,
"grad_norm": 3.390625,
"learning_rate": 9.963618208452044e-05,
"loss": 2.50467529296875,
"mean_token_accuracy": 0.49372389614582063,
"num_tokens": 4961731.0,
"step": 720
},
{
"entropy": 2.5056965589523315,
"epoch": 0.08793061912792098,
"grad_norm": 3.765625,
"learning_rate": 9.961180305845568e-05,
"loss": 2.555381011962891,
"mean_token_accuracy": 0.4927469611167908,
"num_tokens": 5031232.0,
"step": 730
},
{
"entropy": 2.4905244588851927,
"epoch": 0.08913514815707059,
"grad_norm": 3.1875,
"learning_rate": 9.958663667730603e-05,
"loss": 2.4839048385620117,
"mean_token_accuracy": 0.49619937539100645,
"num_tokens": 5103147.0,
"step": 740
},
{
"entropy": 2.492690992355347,
"epoch": 0.09033967718622019,
"grad_norm": 2.59375,
"learning_rate": 9.956068334047e-05,
"loss": 2.5731042861938476,
"mean_token_accuracy": 0.4881134808063507,
"num_tokens": 5171231.0,
"step": 750
},
{
"entropy": 2.4902069091796877,
"epoch": 0.0915442062153698,
"grad_norm": 4.65625,
"learning_rate": 9.953394345983524e-05,
"loss": 2.4940427780151366,
"mean_token_accuracy": 0.49709913730621336,
"num_tokens": 5241133.0,
"step": 760
},
{
"entropy": 2.554565668106079,
"epoch": 0.09274873524451939,
"grad_norm": 3.015625,
"learning_rate": 9.950641745977221e-05,
"loss": 2.5632354736328127,
"mean_token_accuracy": 0.484773388504982,
"num_tokens": 5307282.0,
"step": 770
},
{
"entropy": 2.4865276336669924,
"epoch": 0.093953264273669,
"grad_norm": 2.875,
"learning_rate": 9.947810577712726e-05,
"loss": 2.5425525665283204,
"mean_token_accuracy": 0.49204943478107455,
"num_tokens": 5377403.0,
"step": 780
},
{
"entropy": 2.4226608276367188,
"epoch": 0.0951577933028186,
"grad_norm": 4.1875,
"learning_rate": 9.944900886121577e-05,
"loss": 2.4426912307739257,
"mean_token_accuracy": 0.503141775727272,
"num_tokens": 5445802.0,
"step": 790
},
{
"entropy": 2.4688761949539186,
"epoch": 0.0963623223319682,
"grad_norm": 3.703125,
"learning_rate": 9.941912717381508e-05,
"loss": 2.4866867065429688,
"mean_token_accuracy": 0.49062854051589966,
"num_tokens": 5517621.0,
"step": 800
},
{
"epoch": 0.0963623223319682,
"eval_entropy": 3.0702708959579468,
"eval_loss": 3.130521297454834,
"eval_mean_token_accuracy": 0.3833482414484024,
"eval_num_tokens": 5517621.0,
"eval_runtime": 0.4137,
"eval_samples_per_second": 38.674,
"eval_steps_per_second": 4.834,
"step": 800
},
{
"entropy": 2.4292314291000365,
"epoch": 0.0975668513611178,
"grad_norm": 4.0,
"learning_rate": 9.938846118915706e-05,
"loss": 2.4692073822021485,
"mean_token_accuracy": 0.5002907454967499,
"num_tokens": 5586412.0,
"step": 810
},
{
"entropy": 2.4135414600372314,
"epoch": 0.09877138039026741,
"grad_norm": 3.671875,
"learning_rate": 9.93570113939206e-05,
"loss": 2.4233545303344726,
"mean_token_accuracy": 0.5025378137826919,
"num_tokens": 5655647.0,
"step": 820
},
{
"entropy": 2.394425559043884,
"epoch": 0.099975909419417,
"grad_norm": 2.84375,
"learning_rate": 9.9324778287224e-05,
"loss": 2.443798828125,
"mean_token_accuracy": 0.5025730848312377,
"num_tokens": 5723722.0,
"step": 830
},
{
"entropy": 2.549955868721008,
"epoch": 0.10118043844856661,
"grad_norm": 3.109375,
"learning_rate": 9.929176238061687e-05,
"loss": 2.563458061218262,
"mean_token_accuracy": 0.48627259731292727,
"num_tokens": 5794281.0,
"step": 840
},
{
"entropy": 2.468239426612854,
"epoch": 0.10238496747771621,
"grad_norm": 3.09375,
"learning_rate": 9.925796419807216e-05,
"loss": 2.4876327514648438,
"mean_token_accuracy": 0.49459480941295625,
"num_tokens": 5864854.0,
"step": 850
},
{
"entropy": 2.542571449279785,
"epoch": 0.10358949650686582,
"grad_norm": 33.25,
"learning_rate": 9.922338427597777e-05,
"loss": 2.5742753982543944,
"mean_token_accuracy": 0.48645628094673155,
"num_tokens": 5933244.0,
"step": 860
},
{
"entropy": 2.4776437759399412,
"epoch": 0.10479402553601541,
"grad_norm": 3.734375,
"learning_rate": 9.918802316312806e-05,
"loss": 2.4924917221069336,
"mean_token_accuracy": 0.4953279852867126,
"num_tokens": 5997292.0,
"step": 870
},
{
"entropy": 2.3822479009628297,
"epoch": 0.10599855456516502,
"grad_norm": 2.59375,
"learning_rate": 9.915188142071512e-05,
"loss": 2.4000953674316405,
"mean_token_accuracy": 0.5090537935495376,
"num_tokens": 6072140.0,
"step": 880
},
{
"entropy": 2.4909890413284304,
"epoch": 0.10720308359431463,
"grad_norm": 3.0,
"learning_rate": 9.91149596223199e-05,
"loss": 2.507156753540039,
"mean_token_accuracy": 0.4945272743701935,
"num_tokens": 6139671.0,
"step": 890
},
{
"entropy": 2.3194159507751464,
"epoch": 0.10840761262346423,
"grad_norm": 3.3125,
"learning_rate": 9.907725835390305e-05,
"loss": 2.3592811584472657,
"mean_token_accuracy": 0.5138555377721786,
"num_tokens": 6211544.0,
"step": 900
},
{
"epoch": 0.10840761262346423,
"eval_entropy": 3.031153082847595,
"eval_loss": 3.127545118331909,
"eval_mean_token_accuracy": 0.3903745263814926,
"eval_num_tokens": 6211544.0,
"eval_runtime": 0.4145,
"eval_samples_per_second": 38.602,
"eval_steps_per_second": 4.825,
"step": 900
},
{
"entropy": 2.4157155036926268,
"epoch": 0.10961214165261383,
"grad_norm": 2.96875,
"learning_rate": 9.903877821379573e-05,
"loss": 2.421117401123047,
"mean_token_accuracy": 0.5020445615053177,
"num_tokens": 6278991.0,
"step": 910
},
{
"entropy": 2.4330294132232666,
"epoch": 0.11081667068176343,
"grad_norm": 5.5,
"learning_rate": 9.899951981268995e-05,
"loss": 2.429648590087891,
"mean_token_accuracy": 0.505955895781517,
"num_tokens": 6346983.0,
"step": 920
},
{
"entropy": 2.4191589832305906,
"epoch": 0.11202119971091304,
"grad_norm": 2.375,
"learning_rate": 9.895948377362905e-05,
"loss": 2.4224737167358397,
"mean_token_accuracy": 0.5058120727539063,
"num_tokens": 6414537.0,
"step": 930
},
{
"entropy": 2.4370260000228883,
"epoch": 0.11322572874006263,
"grad_norm": 3.46875,
"learning_rate": 9.891867073199768e-05,
"loss": 2.4694145202636717,
"mean_token_accuracy": 0.4992352068424225,
"num_tokens": 6481676.0,
"step": 940
},
{
"entropy": 2.3773382186889647,
"epoch": 0.11443025776921224,
"grad_norm": 2.125,
"learning_rate": 9.88770813355118e-05,
"loss": 2.416706657409668,
"mean_token_accuracy": 0.5077957957983017,
"num_tokens": 6551157.0,
"step": 950
},
{
"entropy": 2.4528631925582887,
"epoch": 0.11563478679836184,
"grad_norm": 2.609375,
"learning_rate": 9.883471624420832e-05,
"loss": 2.502288818359375,
"mean_token_accuracy": 0.49239401519298553,
"num_tokens": 6620994.0,
"step": 960
},
{
"entropy": 2.4143874883651733,
"epoch": 0.11683931582751145,
"grad_norm": 2.71875,
"learning_rate": 9.879157613043474e-05,
"loss": 2.4089076995849608,
"mean_token_accuracy": 0.5080109655857086,
"num_tokens": 6687906.0,
"step": 970
},
{
"entropy": 2.4254562616348267,
"epoch": 0.11804384485666104,
"grad_norm": 2.046875,
"learning_rate": 9.874766167883836e-05,
"loss": 2.4518817901611327,
"mean_token_accuracy": 0.4991611152887344,
"num_tokens": 6755467.0,
"step": 980
},
{
"entropy": 2.365486240386963,
"epoch": 0.11924837388581065,
"grad_norm": 5.1875,
"learning_rate": 9.870297358635547e-05,
"loss": 2.3928062438964846,
"mean_token_accuracy": 0.5166857928037644,
"num_tokens": 6822936.0,
"step": 990
},
{
"entropy": 2.363088536262512,
"epoch": 0.12045290291496025,
"grad_norm": 4.6875,
"learning_rate": 9.865751256220035e-05,
"loss": 2.4319067001342773,
"mean_token_accuracy": 0.5071658194065094,
"num_tokens": 6890260.0,
"step": 1000
},
{
"epoch": 0.12045290291496025,
"eval_entropy": 3.0976409912109375,
"eval_loss": 3.1172609329223633,
"eval_mean_token_accuracy": 0.3839927762746811,
"eval_num_tokens": 6890260.0,
"eval_runtime": 0.3739,
"eval_samples_per_second": 42.789,
"eval_steps_per_second": 5.349,
"step": 1000
},
{
"entropy": 2.466799020767212,
"epoch": 0.12165743194410986,
"grad_norm": 3.09375,
"learning_rate": 9.861127932785386e-05,
"loss": 2.4565046310424803,
"mean_token_accuracy": 0.4963350534439087,
"num_tokens": 6956554.0,
"step": 1010
},
{
"entropy": 2.450888824462891,
"epoch": 0.12286196097325945,
"grad_norm": 2.5,
"learning_rate": 9.856427461705215e-05,
"loss": 2.460346221923828,
"mean_token_accuracy": 0.4983285039663315,
"num_tokens": 7026362.0,
"step": 1020
},
{
"entropy": 2.4059686422348023,
"epoch": 0.12406649000240906,
"grad_norm": 3.234375,
"learning_rate": 9.851649917577492e-05,
"loss": 2.420361328125,
"mean_token_accuracy": 0.505145075917244,
"num_tokens": 7098658.0,
"step": 1030
},
{
"entropy": 2.386690592765808,
"epoch": 0.12527101903155866,
"grad_norm": 2.9375,
"learning_rate": 9.846795376223358e-05,
"loss": 2.4095857620239256,
"mean_token_accuracy": 0.507191401720047,
"num_tokens": 7168145.0,
"step": 1040
},
{
"entropy": 2.31386137008667,
"epoch": 0.12647554806070826,
"grad_norm": 2.265625,
"learning_rate": 9.841863914685933e-05,
"loss": 2.3461013793945313,
"mean_token_accuracy": 0.5163414418697357,
"num_tokens": 7236180.0,
"step": 1050
},
{
"entropy": 2.3228036403656005,
"epoch": 0.12768007708985787,
"grad_norm": 6.25,
"learning_rate": 9.836855611229074e-05,
"loss": 2.3331735610961912,
"mean_token_accuracy": 0.5201371729373931,
"num_tokens": 7305057.0,
"step": 1060
},
{
"entropy": 2.3945634365081787,
"epoch": 0.12888460611900746,
"grad_norm": 6.96875,
"learning_rate": 9.831770545336151e-05,
"loss": 2.402456855773926,
"mean_token_accuracy": 0.5082537710666657,
"num_tokens": 7375054.0,
"step": 1070
},
{
"entropy": 2.424406051635742,
"epoch": 0.13008913514815706,
"grad_norm": 3.28125,
"learning_rate": 9.826608797708778e-05,
"loss": 2.4312997817993165,
"mean_token_accuracy": 0.5069911390542984,
"num_tokens": 7442873.0,
"step": 1080
},
{
"entropy": 2.3220206022262575,
"epoch": 0.13129366417730667,
"grad_norm": 3.859375,
"learning_rate": 9.821370450265529e-05,
"loss": 2.3608503341674805,
"mean_token_accuracy": 0.5150643199682235,
"num_tokens": 7510091.0,
"step": 1090
},
{
"entropy": 2.3308310985565184,
"epoch": 0.13249819320645628,
"grad_norm": 2.359375,
"learning_rate": 9.81605558614064e-05,
"loss": 2.3494470596313475,
"mean_token_accuracy": 0.5126039475202561,
"num_tokens": 7580058.0,
"step": 1100
},
{
"epoch": 0.13249819320645628,
"eval_entropy": 2.8751864433288574,
"eval_loss": 3.0592124462127686,
"eval_mean_token_accuracy": 0.4051549583673477,
"eval_num_tokens": 7580058.0,
"eval_runtime": 0.4025,
"eval_samples_per_second": 39.749,
"eval_steps_per_second": 4.969,
"step": 1100
},
{
"entropy": 2.207090878486633,
"epoch": 0.1337027222356059,
"grad_norm": 3.359375,
"learning_rate": 9.810664289682699e-05,
"loss": 2.2155645370483397,
"mean_token_accuracy": 0.5430127084255219,
"num_tokens": 7642123.0,
"step": 1110
},
{
"entropy": 2.3662970781326296,
"epoch": 0.13490725126475547,
"grad_norm": 3.46875,
"learning_rate": 9.80519664645329e-05,
"loss": 2.3862329483032227,
"mean_token_accuracy": 0.5138962984085083,
"num_tokens": 7710200.0,
"step": 1120
},
{
"entropy": 2.2497094869613647,
"epoch": 0.13611178029390508,
"grad_norm": 2.21875,
"learning_rate": 9.799652743225654e-05,
"loss": 2.2575557708740233,
"mean_token_accuracy": 0.5318409651517868,
"num_tokens": 7781182.0,
"step": 1130
},
{
"entropy": 2.2917282342910767,
"epoch": 0.1373163093230547,
"grad_norm": 6.21875,
"learning_rate": 9.794032667983293e-05,
"loss": 2.3167720794677735,
"mean_token_accuracy": 0.5249288141727447,
"num_tokens": 7849706.0,
"step": 1140
},
{
"entropy": 2.373197388648987,
"epoch": 0.1385208383522043,
"grad_norm": 2.609375,
"learning_rate": 9.78833650991859e-05,
"loss": 2.3759769439697265,
"mean_token_accuracy": 0.51370949447155,
"num_tokens": 7916623.0,
"step": 1150
},
{
"entropy": 2.3291378736495973,
"epoch": 0.13972536738135388,
"grad_norm": 3.796875,
"learning_rate": 9.782564359431385e-05,
"loss": 2.372611236572266,
"mean_token_accuracy": 0.5120194524526596,
"num_tokens": 7984218.0,
"step": 1160
},
{
"entropy": 2.244597578048706,
"epoch": 0.1409298964105035,
"grad_norm": 2.265625,
"learning_rate": 9.776716308127539e-05,
"loss": 2.2457481384277345,
"mean_token_accuracy": 0.5323881536722184,
"num_tokens": 8053595.0,
"step": 1170
},
{
"entropy": 2.381738781929016,
"epoch": 0.1421344254396531,
"grad_norm": 1.859375,
"learning_rate": 9.770792448817485e-05,
"loss": 2.382204055786133,
"mean_token_accuracy": 0.5139793962240219,
"num_tokens": 8121149.0,
"step": 1180
},
{
"entropy": 2.3032596588134764,
"epoch": 0.1433389544688027,
"grad_norm": 2.046875,
"learning_rate": 9.764792875514756e-05,
"loss": 2.3296224594116213,
"mean_token_accuracy": 0.5209127068519592,
"num_tokens": 8188046.0,
"step": 1190
},
{
"entropy": 2.3109251260757446,
"epoch": 0.1445434834979523,
"grad_norm": 3.3125,
"learning_rate": 9.758717683434484e-05,
"loss": 2.332454872131348,
"mean_token_accuracy": 0.5167243659496308,
"num_tokens": 8258677.0,
"step": 1200
},
{
"epoch": 0.1445434834979523,
"eval_entropy": 3.054787278175354,
"eval_loss": 3.034820556640625,
"eval_mean_token_accuracy": 0.4018402099609375,
"eval_num_tokens": 8258677.0,
"eval_runtime": 0.4108,
"eval_samples_per_second": 38.948,
"eval_steps_per_second": 4.869,
"step": 1200
},
{
"entropy": 2.392910861968994,
"epoch": 0.1457480125271019,
"grad_norm": 2.28125,
"learning_rate": 9.752566968991901e-05,
"loss": 2.428352928161621,
"mean_token_accuracy": 0.508216142654419,
"num_tokens": 8325649.0,
"step": 1210
},
{
"entropy": 2.313650631904602,
"epoch": 0.1469525415562515,
"grad_norm": 3.03125,
"learning_rate": 9.746340829800799e-05,
"loss": 2.3404510498046873,
"mean_token_accuracy": 0.5221627295017243,
"num_tokens": 8394365.0,
"step": 1220
},
{
"entropy": 2.341995906829834,
"epoch": 0.14815707058540112,
"grad_norm": 2.21875,
"learning_rate": 9.740039364671987e-05,
"loss": 2.3398111343383787,
"mean_token_accuracy": 0.5121251910924911,
"num_tokens": 8464333.0,
"step": 1230
},
{
"entropy": 2.259868335723877,
"epoch": 0.1493615996145507,
"grad_norm": 2.484375,
"learning_rate": 9.733662673611719e-05,
"loss": 2.282248878479004,
"mean_token_accuracy": 0.5302321165800095,
"num_tokens": 8531603.0,
"step": 1240
},
{
"entropy": 2.3321637392044066,
"epoch": 0.1505661286437003,
"grad_norm": 3.59375,
"learning_rate": 9.727210857820108e-05,
"loss": 2.371327590942383,
"mean_token_accuracy": 0.516041025519371,
"num_tokens": 8600388.0,
"step": 1250
},
{
"entropy": 2.351721405982971,
"epoch": 0.15177065767284992,
"grad_norm": 1.9375,
"learning_rate": 9.720684019689524e-05,
"loss": 2.351237487792969,
"mean_token_accuracy": 0.5172569751739502,
"num_tokens": 8672129.0,
"step": 1260
},
{
"entropy": 2.2653574705123902,
"epoch": 0.15297518670199953,
"grad_norm": 4.8125,
"learning_rate": 9.714082262802961e-05,
"loss": 2.297392463684082,
"mean_token_accuracy": 0.523885440826416,
"num_tokens": 8743632.0,
"step": 1270
},
{
"entropy": 2.293268084526062,
"epoch": 0.1541797157311491,
"grad_norm": 2.59375,
"learning_rate": 9.707405691932402e-05,
"loss": 2.3110326766967773,
"mean_token_accuracy": 0.5206148803234101,
"num_tokens": 8813353.0,
"step": 1280
},
{
"entropy": 2.387986731529236,
"epoch": 0.15538424476029872,
"grad_norm": 3.484375,
"learning_rate": 9.700654413037144e-05,
"loss": 2.3922283172607424,
"mean_token_accuracy": 0.513523331284523,
"num_tokens": 8878273.0,
"step": 1290
},
{
"entropy": 2.2815721154212953,
"epoch": 0.15658877378944833,
"grad_norm": 3.171875,
"learning_rate": 9.693828533262135e-05,
"loss": 2.2999031066894533,
"mean_token_accuracy": 0.5234899431467056,
"num_tokens": 8947692.0,
"step": 1300
},
{
"epoch": 0.15658877378944833,
"eval_entropy": 2.8758058547973633,
"eval_loss": 2.97363018989563,
"eval_mean_token_accuracy": 0.40523606538772583,
"eval_num_tokens": 8947692.0,
"eval_runtime": 0.4094,
"eval_samples_per_second": 39.084,
"eval_steps_per_second": 4.886,
"step": 1300
},
{
"entropy": 2.253899431228638,
"epoch": 0.15779330281859794,
"grad_norm": 3.015625,
"learning_rate": 9.686928160936252e-05,
"loss": 2.275619316101074,
"mean_token_accuracy": 0.5288082420825958,
"num_tokens": 9016255.0,
"step": 1310
},
{
"entropy": 2.2969882249832154,
"epoch": 0.15899783184774752,
"grad_norm": 1.90625,
"learning_rate": 9.679953405570601e-05,
"loss": 2.3192108154296873,
"mean_token_accuracy": 0.5225553333759307,
"num_tokens": 9085956.0,
"step": 1320
},
{
"entropy": 2.290460777282715,
"epoch": 0.16020236087689713,
"grad_norm": 2.765625,
"learning_rate": 9.672904377856765e-05,
"loss": 2.307216262817383,
"mean_token_accuracy": 0.5200891375541687,
"num_tokens": 9153647.0,
"step": 1330
},
{
"entropy": 2.2761851072311403,
"epoch": 0.16140688990604674,
"grad_norm": 2.84375,
"learning_rate": 9.665781189665052e-05,
"loss": 2.299959182739258,
"mean_token_accuracy": 0.5264901399612427,
"num_tokens": 9225290.0,
"step": 1340
},
{
"entropy": 2.3141539812088014,
"epoch": 0.16261141893519634,
"grad_norm": 2.71875,
"learning_rate": 9.658583954042726e-05,
"loss": 2.3225004196166994,
"mean_token_accuracy": 0.5269528299570083,
"num_tokens": 9291378.0,
"step": 1350
},
{
"entropy": 2.2969261407852173,
"epoch": 0.16381594796434595,
"grad_norm": 1.84375,
"learning_rate": 9.651312785212204e-05,
"loss": 2.3121112823486327,
"mean_token_accuracy": 0.5209438920021057,
"num_tokens": 9360511.0,
"step": 1360
},
{
"entropy": 2.303641176223755,
"epoch": 0.16502047699349554,
"grad_norm": 2.078125,
"learning_rate": 9.643967798569247e-05,
"loss": 2.324795722961426,
"mean_token_accuracy": 0.5191195398569107,
"num_tokens": 9430201.0,
"step": 1370
},
{
"entropy": 2.2442052602767943,
"epoch": 0.16622500602264514,
"grad_norm": 5.59375,
"learning_rate": 9.636549110681125e-05,
"loss": 2.247611427307129,
"mean_token_accuracy": 0.527609360218048,
"num_tokens": 9497142.0,
"step": 1380
},
{
"entropy": 2.2580633640289305,
"epoch": 0.16742953505179475,
"grad_norm": 2.75,
"learning_rate": 9.629056839284778e-05,
"loss": 2.307590866088867,
"mean_token_accuracy": 0.52379210293293,
"num_tokens": 9562440.0,
"step": 1390
},
{
"entropy": 2.3624070405960085,
"epoch": 0.16863406408094436,
"grad_norm": 2.671875,
"learning_rate": 9.62149110328493e-05,
"loss": 2.3632463455200194,
"mean_token_accuracy": 0.5173578560352325,
"num_tokens": 9631227.0,
"step": 1400
},
{
"epoch": 0.16863406408094436,
"eval_entropy": 2.853332757949829,
"eval_loss": 2.9722647666931152,
"eval_mean_token_accuracy": 0.41226455569267273,
"eval_num_tokens": 9631227.0,
"eval_runtime": 0.4149,
"eval_samples_per_second": 38.564,
"eval_steps_per_second": 4.821,
"step": 1400
},
{
"entropy": 2.4244832038879394,
"epoch": 0.16983859311009394,
"grad_norm": 3.828125,
"learning_rate": 9.613852022752217e-05,
"loss": 2.4088199615478514,
"mean_token_accuracy": 0.5088783591985703,
"num_tokens": 9701139.0,
"step": 1410
},
{
"entropy": 2.155960202217102,
"epoch": 0.17104312213924355,
"grad_norm": 3.296875,
"learning_rate": 9.606139718921277e-05,
"loss": 2.2044837951660154,
"mean_token_accuracy": 0.5425004243850708,
"num_tokens": 9768895.0,
"step": 1420
},
{
"entropy": 2.2430294871330263,
"epoch": 0.17224765116839316,
"grad_norm": 2.578125,
"learning_rate": 9.598354314188823e-05,
"loss": 2.2836578369140623,
"mean_token_accuracy": 0.5263251423835754,
"num_tokens": 9843163.0,
"step": 1430
},
{
"entropy": 2.1803964376449585,
"epoch": 0.17345218019754277,
"grad_norm": 1.8671875,
"learning_rate": 9.590495932111703e-05,
"loss": 2.1800159454345702,
"mean_token_accuracy": 0.5434668481349945,
"num_tokens": 9917015.0,
"step": 1440
},
{
"entropy": 2.299999499320984,
"epoch": 0.17465670922669235,
"grad_norm": 2.390625,
"learning_rate": 9.582564697404936e-05,
"loss": 2.309121513366699,
"mean_token_accuracy": 0.5198373407125473,
"num_tokens": 9984193.0,
"step": 1450
},
{
"entropy": 2.2457360506057737,
"epoch": 0.17586123825584196,
"grad_norm": 3.0,
"learning_rate": 9.574560735939742e-05,
"loss": 2.2609222412109373,
"mean_token_accuracy": 0.5328970372676849,
"num_tokens": 10053695.0,
"step": 1460
},
{
"entropy": 2.2804399013519285,
"epoch": 0.17706576728499157,
"grad_norm": 3.046875,
"learning_rate": 9.56648417474153e-05,
"loss": 2.2925643920898438,
"mean_token_accuracy": 0.5243758141994477,
"num_tokens": 10124469.0,
"step": 1470
},
{
"entropy": 2.195032811164856,
"epoch": 0.17827029631414118,
"grad_norm": 1.9296875,
"learning_rate": 9.558335141987895e-05,
"loss": 2.2206499099731447,
"mean_token_accuracy": 0.5339515537023545,
"num_tokens": 10190836.0,
"step": 1480
},
{
"entropy": 2.3091806173324585,
"epoch": 0.17947482534329076,
"grad_norm": 2.84375,
"learning_rate": 9.550113767006578e-05,
"loss": 2.3288848876953123,
"mean_token_accuracy": 0.5248405992984772,
"num_tokens": 10255931.0,
"step": 1490
},
{
"entropy": 2.261967420578003,
"epoch": 0.18067935437244037,
"grad_norm": 4.125,
"learning_rate": 9.541820180273414e-05,
"loss": 2.234074592590332,
"mean_token_accuracy": 0.5378178834915162,
"num_tokens": 10323861.0,
"step": 1500
},
{
"epoch": 0.18067935437244037,
"eval_entropy": 2.846497416496277,
"eval_loss": 2.9528520107269287,
"eval_mean_token_accuracy": 0.41081106662750244,
"eval_num_tokens": 10323861.0,
"eval_runtime": 0.3965,
"eval_samples_per_second": 40.358,
"eval_steps_per_second": 5.045,
"step": 1500
},
{
"entropy": 2.1897090792655947,
"epoch": 0.18188388340158998,
"grad_norm": 2.078125,
"learning_rate": 9.533454513410258e-05,
"loss": 2.2186933517456056,
"mean_token_accuracy": 0.5345078110694885,
"num_tokens": 10393479.0,
"step": 1510
},
{
"entropy": 2.2354114055633545,
"epoch": 0.1830884124307396,
"grad_norm": 5.78125,
"learning_rate": 9.525016899182905e-05,
"loss": 2.2384716033935548,
"mean_token_accuracy": 0.5285776913166046,
"num_tokens": 10460222.0,
"step": 1520
},
{
"entropy": 2.316632866859436,
"epoch": 0.18429294145988917,
"grad_norm": 3.109375,
"learning_rate": 9.516507471498972e-05,
"loss": 2.3353763580322267,
"mean_token_accuracy": 0.5186457633972168,
"num_tokens": 10530315.0,
"step": 1530
},
{
"entropy": 2.2956483721733094,
"epoch": 0.18549747048903878,
"grad_norm": 1.890625,
"learning_rate": 9.507926365405784e-05,
"loss": 2.2976795196533204,
"mean_token_accuracy": 0.5249256074428559,
"num_tokens": 10597431.0,
"step": 1540
},
{
"entropy": 2.3254905223846434,
"epoch": 0.1867019995181884,
"grad_norm": 2.171875,
"learning_rate": 9.499273717088221e-05,
"loss": 2.334694480895996,
"mean_token_accuracy": 0.5172825694084168,
"num_tokens": 10667831.0,
"step": 1550
},
{
"entropy": 2.1734044551849365,
"epoch": 0.187906528547338,
"grad_norm": 2.40625,
"learning_rate": 9.490549663866563e-05,
"loss": 2.20830078125,
"mean_token_accuracy": 0.5397387742996216,
"num_tokens": 10736684.0,
"step": 1560
},
{
"entropy": 2.2810161113739014,
"epoch": 0.1891110575764876,
"grad_norm": 2.265625,
"learning_rate": 9.48175434419431e-05,
"loss": 2.2679985046386717,
"mean_token_accuracy": 0.5284598618745804,
"num_tokens": 10803794.0,
"step": 1570
},
{
"entropy": 2.182316446304321,
"epoch": 0.1903155866056372,
"grad_norm": 2.546875,
"learning_rate": 9.472887897655976e-05,
"loss": 2.210857963562012,
"mean_token_accuracy": 0.5313417106866837,
"num_tokens": 10873181.0,
"step": 1580
},
{
"entropy": 2.236132097244263,
"epoch": 0.1915201156347868,
"grad_norm": 1.9453125,
"learning_rate": 9.463950464964886e-05,
"loss": 2.2548202514648437,
"mean_token_accuracy": 0.5304252445697785,
"num_tokens": 10941894.0,
"step": 1590
},
{
"entropy": 2.134193646907806,
"epoch": 0.1927246446639364,
"grad_norm": 1.7421875,
"learning_rate": 9.454942187960943e-05,
"loss": 2.155434226989746,
"mean_token_accuracy": 0.5477861285209655,
"num_tokens": 11010768.0,
"step": 1600
},
{
"epoch": 0.1927246446639364,
"eval_entropy": 2.864907145500183,
"eval_loss": 2.914454936981201,
"eval_mean_token_accuracy": 0.4183240383863449,
"eval_num_tokens": 11010768.0,
"eval_runtime": 0.4288,
"eval_samples_per_second": 37.316,
"eval_steps_per_second": 4.664,
"step": 1600
},
{
"entropy": 2.188663959503174,
"epoch": 0.19392917369308602,
"grad_norm": 3.0625,
"learning_rate": 9.445863209608364e-05,
"loss": 2.195932960510254,
"mean_token_accuracy": 0.5427460491657257,
"num_tokens": 11082324.0,
"step": 1610
},
{
"entropy": 2.2581432461738586,
"epoch": 0.1951337027222356,
"grad_norm": 2.4375,
"learning_rate": 9.436713673993421e-05,
"loss": 2.2641986846923827,
"mean_token_accuracy": 0.5305707901716232,
"num_tokens": 11153380.0,
"step": 1620
},
{
"entropy": 2.2770965337753295,
"epoch": 0.1963382317513852,
"grad_norm": 17.625,
"learning_rate": 9.427493726322151e-05,
"loss": 2.3116901397705076,
"mean_token_accuracy": 0.5252400994300842,
"num_tokens": 11222740.0,
"step": 1630
},
{
"entropy": 2.2262574195861817,
"epoch": 0.19754276078053482,
"grad_norm": 2.71875,
"learning_rate": 9.418203512918058e-05,
"loss": 2.248627281188965,
"mean_token_accuracy": 0.533080717921257,
"num_tokens": 11292074.0,
"step": 1640
},
{
"entropy": 2.282968246936798,
"epoch": 0.19874728980968442,
"grad_norm": 4.8125,
"learning_rate": 9.40884318121978e-05,
"loss": 2.2738475799560547,
"mean_token_accuracy": 0.5312909007072448,
"num_tokens": 11363360.0,
"step": 1650
},
{
"entropy": 2.1889445066452025,
"epoch": 0.199951818838834,
"grad_norm": 2.0,
"learning_rate": 9.399412879778757e-05,
"loss": 2.1901546478271485,
"mean_token_accuracy": 0.5460667550563812,
"num_tokens": 11434578.0,
"step": 1660
},
{
"entropy": 2.2750983953475954,
"epoch": 0.20115634786798361,
"grad_norm": 2.5,
"learning_rate": 9.389912758256869e-05,
"loss": 2.2871360778808594,
"mean_token_accuracy": 0.5282805800437927,
"num_tokens": 11504260.0,
"step": 1670
},
{
"entropy": 2.1284833669662477,
"epoch": 0.20236087689713322,
"grad_norm": 1.765625,
"learning_rate": 9.380342967424066e-05,
"loss": 2.1631650924682617,
"mean_token_accuracy": 0.5499304831027985,
"num_tokens": 11570786.0,
"step": 1680
},
{
"entropy": 2.210453176498413,
"epoch": 0.20356540592628283,
"grad_norm": 2.09375,
"learning_rate": 9.370703659155969e-05,
"loss": 2.2247928619384765,
"mean_token_accuracy": 0.5347439706325531,
"num_tokens": 11640019.0,
"step": 1690
},
{
"entropy": 2.2434919834136964,
"epoch": 0.20476993495543241,
"grad_norm": 2.03125,
"learning_rate": 9.360994986431465e-05,
"loss": 2.2586835861206054,
"mean_token_accuracy": 0.5289602160453797,
"num_tokens": 11709255.0,
"step": 1700
},
{
"epoch": 0.20476993495543241,
"eval_entropy": 2.8340182304382324,
"eval_loss": 2.9152491092681885,
"eval_mean_token_accuracy": 0.4188874661922455,
"eval_num_tokens": 11709255.0,
"eval_runtime": 0.4315,
"eval_samples_per_second": 37.078,
"eval_steps_per_second": 4.635,
"step": 1700
},
{
"entropy": 2.2331860780715944,
"epoch": 0.20597446398458202,
"grad_norm": 3.78125,
"learning_rate": 9.351217103330276e-05,
"loss": 2.2479530334472657,
"mean_token_accuracy": 0.5310386747121811,
"num_tokens": 11776203.0,
"step": 1710
},
{
"entropy": 2.3182583332061766,
"epoch": 0.20717899301373163,
"grad_norm": 2.0,
"learning_rate": 9.341370165030518e-05,
"loss": 2.3128843307495117,
"mean_token_accuracy": 0.52462497651577,
"num_tokens": 11843796.0,
"step": 1720
},
{
"entropy": 2.193679094314575,
"epoch": 0.20838352204288124,
"grad_norm": 2.0625,
"learning_rate": 9.331454327806228e-05,
"loss": 2.241754722595215,
"mean_token_accuracy": 0.5341065913438797,
"num_tokens": 11912238.0,
"step": 1730
},
{
"entropy": 2.163426327705383,
"epoch": 0.20958805107203082,
"grad_norm": 1.875,
"learning_rate": 9.321469749024895e-05,
"loss": 2.169114875793457,
"mean_token_accuracy": 0.5475439190864563,
"num_tokens": 11981947.0,
"step": 1740
},
{
"entropy": 2.2436394333839416,
"epoch": 0.21079258010118043,
"grad_norm": 2.375,
"learning_rate": 9.311416587144961e-05,
"loss": 2.248809814453125,
"mean_token_accuracy": 0.5373907715082169,
"num_tokens": 12049969.0,
"step": 1750
},
{
"entropy": 2.1564717292785645,
"epoch": 0.21199710913033004,
"grad_norm": 2.6875,
"learning_rate": 9.301295001713298e-05,
"loss": 2.1708261489868166,
"mean_token_accuracy": 0.5387319475412369,
"num_tokens": 12118220.0,
"step": 1760
},
{
"entropy": 2.1580832481384276,
"epoch": 0.21320163815947965,
"grad_norm": 2.390625,
"learning_rate": 9.291105153362685e-05,
"loss": 2.1711267471313476,
"mean_token_accuracy": 0.5511200129985809,
"num_tokens": 12188027.0,
"step": 1770
},
{
"entropy": 2.1681873559951783,
"epoch": 0.21440616718862926,
"grad_norm": 2.640625,
"learning_rate": 9.280847203809254e-05,
"loss": 2.167458152770996,
"mean_token_accuracy": 0.5453263878822326,
"num_tokens": 12257397.0,
"step": 1780
},
{
"entropy": 2.130517268180847,
"epoch": 0.21561069621777884,
"grad_norm": 2.203125,
"learning_rate": 9.270521315849928e-05,
"loss": 2.1511137008666994,
"mean_token_accuracy": 0.5517256915569305,
"num_tokens": 12327732.0,
"step": 1790
},
{
"entropy": 2.1457592606544496,
"epoch": 0.21681522524692845,
"grad_norm": 2.234375,
"learning_rate": 9.260127653359826e-05,
"loss": 2.172018814086914,
"mean_token_accuracy": 0.5436123073101043,
"num_tokens": 12399054.0,
"step": 1800
},
{
"epoch": 0.21681522524692845,
"eval_entropy": 2.959686517715454,
"eval_loss": 2.8896708488464355,
"eval_mean_token_accuracy": 0.42187774181365967,
"eval_num_tokens": 12399054.0,
"eval_runtime": 0.409,
"eval_samples_per_second": 39.117,
"eval_steps_per_second": 4.89,
"step": 1800
},
{
"entropy": 2.2030872344970702,
"epoch": 0.21801975427607806,
"grad_norm": 3.796875,
"learning_rate": 9.249666381289678e-05,
"loss": 2.205454444885254,
"mean_token_accuracy": 0.5406596213579178,
"num_tokens": 12468799.0,
"step": 1810
},
{
"entropy": 2.1178462505340576,
"epoch": 0.21922428330522767,
"grad_norm": 2.65625,
"learning_rate": 9.239137665663201e-05,
"loss": 2.1415390014648437,
"mean_token_accuracy": 0.5523227155208588,
"num_tokens": 12535527.0,
"step": 1820
},
{
"entropy": 2.3107515215873717,
"epoch": 0.22042881233437725,
"grad_norm": 2.765625,
"learning_rate": 9.228541673574453e-05,
"loss": 2.314019775390625,
"mean_token_accuracy": 0.5233344733715057,
"num_tokens": 12604182.0,
"step": 1830
},
{
"entropy": 2.2338968396186827,
"epoch": 0.22163334136352686,
"grad_norm": 3.328125,
"learning_rate": 9.217878573185202e-05,
"loss": 2.26546630859375,
"mean_token_accuracy": 0.5350278943777085,
"num_tokens": 12674202.0,
"step": 1840
},
{
"entropy": 2.2074631214141847,
"epoch": 0.22283787039267647,
"grad_norm": 1.8828125,
"learning_rate": 9.207148533722234e-05,
"loss": 2.2138628005981444,
"mean_token_accuracy": 0.537048852443695,
"num_tokens": 12741365.0,
"step": 1850
},
{
"entropy": 2.224149250984192,
"epoch": 0.22404239942182608,
"grad_norm": 1.734375,
"learning_rate": 9.196351725474693e-05,
"loss": 2.231917381286621,
"mean_token_accuracy": 0.5344621896743774,
"num_tokens": 12807216.0,
"step": 1860
},
{
"entropy": 2.1585075497627257,
"epoch": 0.22524692845097566,
"grad_norm": 3.859375,
"learning_rate": 9.185488319791352e-05,
"loss": 2.1571178436279297,
"mean_token_accuracy": 0.5523216247558593,
"num_tokens": 12873278.0,
"step": 1870
},
{
"entropy": 2.2243062257766724,
"epoch": 0.22645145748012527,
"grad_norm": 2.015625,
"learning_rate": 9.174558489077917e-05,
"loss": 2.2570470809936523,
"mean_token_accuracy": 0.5269607186317444,
"num_tokens": 12944925.0,
"step": 1880
},
{
"entropy": 2.182615005970001,
"epoch": 0.22765598650927488,
"grad_norm": 1.8125,
"learning_rate": 9.163562406794272e-05,
"loss": 2.1952205657958985,
"mean_token_accuracy": 0.5417321711778641,
"num_tokens": 13010487.0,
"step": 1890
},
{
"entropy": 2.111977481842041,
"epoch": 0.22886051553842449,
"grad_norm": 4.90625,
"learning_rate": 9.152500247451743e-05,
"loss": 2.155902290344238,
"mean_token_accuracy": 0.5541462600231171,
"num_tokens": 13080109.0,
"step": 1900
},
{
"epoch": 0.22886051553842449,
"eval_entropy": 2.8878990411758423,
"eval_loss": 2.8635730743408203,
"eval_mean_token_accuracy": 0.4313286989927292,
"eval_num_tokens": 13080109.0,
"eval_runtime": 0.4423,
"eval_samples_per_second": 36.175,
"eval_steps_per_second": 4.522,
"step": 1900
},
{
"entropy": 2.082282876968384,
"epoch": 0.23006504456757407,
"grad_norm": 1.7890625,
"learning_rate": 9.141372186610311e-05,
"loss": 2.0843496322631836,
"mean_token_accuracy": 0.5654984533786773,
"num_tokens": 13146870.0,
"step": 1910
},
{
"entropy": 2.183061623573303,
"epoch": 0.23126957359672368,
"grad_norm": 1.921875,
"learning_rate": 9.13017840087584e-05,
"loss": 2.2101274490356446,
"mean_token_accuracy": 0.5382312715053559,
"num_tokens": 13219259.0,
"step": 1920
},
{
"entropy": 2.2742142200469972,
"epoch": 0.23247410262587329,
"grad_norm": 3.859375,
"learning_rate": 9.118919067897268e-05,
"loss": 2.2773338317871095,
"mean_token_accuracy": 0.5264018833637237,
"num_tokens": 13290411.0,
"step": 1930
},
{
"entropy": 2.0873818516731264,
"epoch": 0.2336786316550229,
"grad_norm": 2.828125,
"learning_rate": 9.107594366363789e-05,
"loss": 2.0962757110595702,
"mean_token_accuracy": 0.5615902185440064,
"num_tokens": 13357767.0,
"step": 1940
},
{
"entropy": 2.0739306569099427,
"epoch": 0.23488316068417248,
"grad_norm": 1.9375,
"learning_rate": 9.096204476002015e-05,
"loss": 2.087767219543457,
"mean_token_accuracy": 0.5554501116275787,
"num_tokens": 13423509.0,
"step": 1950
},
{
"entropy": 2.2131853818893434,
"epoch": 0.23608768971332209,
"grad_norm": 2.515625,
"learning_rate": 9.084749577573128e-05,
"loss": 2.209881401062012,
"mean_token_accuracy": 0.5431510090827942,
"num_tokens": 13491765.0,
"step": 1960
},
{
"entropy": 2.1343575000762938,
"epoch": 0.2372922187424717,
"grad_norm": 2.890625,
"learning_rate": 9.073229852870005e-05,
"loss": 2.1772743225097657,
"mean_token_accuracy": 0.5383218407630921,
"num_tokens": 13559252.0,
"step": 1970
},
{
"entropy": 2.165915012359619,
"epoch": 0.2384967477716213,
"grad_norm": 2.234375,
"learning_rate": 9.06164548471434e-05,
"loss": 2.1949913024902346,
"mean_token_accuracy": 0.5370178014039994,
"num_tokens": 13627185.0,
"step": 1980
},
{
"entropy": 2.219644045829773,
"epoch": 0.23970127680077088,
"grad_norm": 3.8125,
"learning_rate": 9.049996656953741e-05,
"loss": 2.2189016342163086,
"mean_token_accuracy": 0.5392367959022522,
"num_tokens": 13697307.0,
"step": 1990
},
{
"entropy": 2.163391101360321,
"epoch": 0.2409058058299205,
"grad_norm": 3.390625,
"learning_rate": 9.038283554458803e-05,
"loss": 2.1824737548828126,
"mean_token_accuracy": 0.5462636172771453,
"num_tokens": 13761202.0,
"step": 2000
},
{
"epoch": 0.2409058058299205,
"eval_entropy": 2.7357964515686035,
"eval_loss": 2.8540592193603516,
"eval_mean_token_accuracy": 0.426966056227684,
"eval_num_tokens": 13761202.0,
"eval_runtime": 0.4079,
"eval_samples_per_second": 39.222,
"eval_steps_per_second": 4.903,
"step": 2000
},
{
"entropy": 2.2309467077255247,
"epoch": 0.2421103348590701,
"grad_norm": 2.328125,
"learning_rate": 9.026506363120189e-05,
"loss": 2.2542724609375,
"mean_token_accuracy": 0.5328327417373657,
"num_tokens": 13830377.0,
"step": 2010
},
{
"entropy": 2.1426926970481874,
"epoch": 0.2433148638882197,
"grad_norm": 3.265625,
"learning_rate": 9.014665269845666e-05,
"loss": 2.164677047729492,
"mean_token_accuracy": 0.5467932879924774,
"num_tokens": 13900414.0,
"step": 2020
},
{
"entropy": 2.15235835313797,
"epoch": 0.24451939291736932,
"grad_norm": 1.8125,
"learning_rate": 9.002760462557152e-05,
"loss": 2.1506536483764647,
"mean_token_accuracy": 0.5472699105739594,
"num_tokens": 13969474.0,
"step": 2030
},
{
"entropy": 2.112102437019348,
"epoch": 0.2457239219465189,
"grad_norm": 3.03125,
"learning_rate": 8.99079213018772e-05,
"loss": 2.1196800231933595,
"mean_token_accuracy": 0.5518890619277954,
"num_tokens": 14036404.0,
"step": 2040
},
{
"entropy": 2.1371890902519226,
"epoch": 0.2469284509756685,
"grad_norm": 1.7265625,
"learning_rate": 8.978760462678611e-05,
"loss": 2.153955841064453,
"mean_token_accuracy": 0.5453378796577454,
"num_tokens": 14107414.0,
"step": 2050
},
{
"entropy": 2.1680827260017397,
"epoch": 0.24813298000481812,
"grad_norm": 2.28125,
"learning_rate": 8.966665650976209e-05,
"loss": 2.1719219207763674,
"mean_token_accuracy": 0.5435125470161438,
"num_tokens": 14176407.0,
"step": 2060
},
{
"entropy": 2.1545125603675843,
"epoch": 0.24933750903396773,
"grad_norm": 2.140625,
"learning_rate": 8.954507887029026e-05,
"loss": 2.1777198791503904,
"mean_token_accuracy": 0.5442109107971191,
"num_tokens": 14247040.0,
"step": 2070
},
{
"entropy": 2.16602623462677,
"epoch": 0.2505420380631173,
"grad_norm": 1.6875,
"learning_rate": 8.942287363784638e-05,
"loss": 2.1973800659179688,
"mean_token_accuracy": 0.5392501056194305,
"num_tokens": 14316710.0,
"step": 2080
},
{
"entropy": 2.174078607559204,
"epoch": 0.25174656709226695,
"grad_norm": 2.25,
"learning_rate": 8.930004275186634e-05,
"loss": 2.182669258117676,
"mean_token_accuracy": 0.5412575840950012,
"num_tokens": 14385574.0,
"step": 2090
},
{
"entropy": 2.18578360080719,
"epoch": 0.25295109612141653,
"grad_norm": 1.7421875,
"learning_rate": 8.917658816171534e-05,
"loss": 2.2020822525024415,
"mean_token_accuracy": 0.5368836164474488,
"num_tokens": 14458459.0,
"step": 2100
},
{
"epoch": 0.25295109612141653,
"eval_entropy": 2.735661745071411,
"eval_loss": 2.849364757537842,
"eval_mean_token_accuracy": 0.42470361292362213,
"eval_num_tokens": 14458459.0,
"eval_runtime": 0.4067,
"eval_samples_per_second": 39.339,
"eval_steps_per_second": 4.917,
"step": 2100
},
{
"entropy": 2.0653456449508667,
"epoch": 0.2541556251505661,
"grad_norm": 2.5,
"learning_rate": 8.905251182665694e-05,
"loss": 2.071195602416992,
"mean_token_accuracy": 0.5605430185794831,
"num_tokens": 14528612.0,
"step": 2110
},
{
"entropy": 2.1143752455711367,
"epoch": 0.25536015417971575,
"grad_norm": 37.5,
"learning_rate": 8.892781571582209e-05,
"loss": 2.1292278289794924,
"mean_token_accuracy": 0.5506574988365174,
"num_tokens": 14601246.0,
"step": 2120
},
{
"entropy": 2.1357630252838136,
"epoch": 0.25656468320886533,
"grad_norm": 2.984375,
"learning_rate": 8.880250180817765e-05,
"loss": 2.1510900497436523,
"mean_token_accuracy": 0.5498823761940003,
"num_tokens": 14669430.0,
"step": 2130
},
{
"entropy": 2.191697120666504,
"epoch": 0.2577692122380149,
"grad_norm": 1.9375,
"learning_rate": 8.867657209249515e-05,
"loss": 2.210224914550781,
"mean_token_accuracy": 0.542282658815384,
"num_tokens": 14736231.0,
"step": 2140
},
{
"entropy": 2.1863580465316774,
"epoch": 0.25897374126716455,
"grad_norm": 2.265625,
"learning_rate": 8.855002856731927e-05,
"loss": 2.2168277740478515,
"mean_token_accuracy": 0.5371293991804122,
"num_tokens": 14800572.0,
"step": 2150
},
{
"entropy": 2.128365468978882,
"epoch": 0.26017827029631413,
"grad_norm": 1.9921875,
"learning_rate": 8.842287324093594e-05,
"loss": 2.1462024688720702,
"mean_token_accuracy": 0.5463616043329239,
"num_tokens": 14868668.0,
"step": 2160
},
{
"entropy": 2.1524484634399412,
"epoch": 0.26138279932546377,
"grad_norm": 1.9765625,
"learning_rate": 8.829510813134063e-05,
"loss": 2.1351104736328126,
"mean_token_accuracy": 0.5478598177433014,
"num_tokens": 14938978.0,
"step": 2170
},
{
"entropy": 2.198789381980896,
"epoch": 0.26258732835461335,
"grad_norm": 1.859375,
"learning_rate": 8.816673526620622e-05,
"loss": 2.2329919815063475,
"mean_token_accuracy": 0.5322647511959075,
"num_tokens": 15007348.0,
"step": 2180
},
{
"entropy": 2.1342248201370237,
"epoch": 0.26379185738376293,
"grad_norm": 3.375,
"learning_rate": 8.80377566828509e-05,
"loss": 2.148569679260254,
"mean_token_accuracy": 0.5494184136390686,
"num_tokens": 15076908.0,
"step": 2190
},
{
"entropy": 2.0989627480506896,
"epoch": 0.26499638641291257,
"grad_norm": 2.453125,
"learning_rate": 8.790817442820578e-05,
"loss": 2.1174949645996093,
"mean_token_accuracy": 0.5576492011547088,
"num_tokens": 15145467.0,
"step": 2200
},
{
"epoch": 0.26499638641291257,
"eval_entropy": 2.708386778831482,
"eval_loss": 2.8184127807617188,
"eval_mean_token_accuracy": 0.42567259073257446,
"eval_num_tokens": 15145467.0,
"eval_runtime": 0.429,
"eval_samples_per_second": 37.293,
"eval_steps_per_second": 4.662,
"step": 2200
},
{
"entropy": 2.0987525939941407,
"epoch": 0.26620091544206215,
"grad_norm": 2.15625,
"learning_rate": 8.777799055878243e-05,
"loss": 2.1152496337890625,
"mean_token_accuracy": 0.5476809203624725,
"num_tokens": 15215998.0,
"step": 2210
},
{
"entropy": 2.0828201174736023,
"epoch": 0.2674054444712118,
"grad_norm": 2.21875,
"learning_rate": 8.764720714064025e-05,
"loss": 2.0910402297973634,
"mean_token_accuracy": 0.5581553757190705,
"num_tokens": 15284221.0,
"step": 2220
},
{
"entropy": 2.1438043117523193,
"epoch": 0.26860997350036137,
"grad_norm": 84.5,
"learning_rate": 8.751582624935366e-05,
"loss": 2.159671401977539,
"mean_token_accuracy": 0.5466212332248688,
"num_tokens": 15351682.0,
"step": 2230
},
{
"entropy": 2.1918447971343995,
"epoch": 0.26981450252951095,
"grad_norm": 1.90625,
"learning_rate": 8.738384996997917e-05,
"loss": 2.194806671142578,
"mean_token_accuracy": 0.5432702243328095,
"num_tokens": 15419986.0,
"step": 2240
},
{
"entropy": 2.1559926509857177,
"epoch": 0.2710190315586606,
"grad_norm": 2.25,
"learning_rate": 8.725128039702226e-05,
"loss": 2.170298385620117,
"mean_token_accuracy": 0.5422053426504135,
"num_tokens": 15490655.0,
"step": 2250
},
{
"entropy": 2.112238824367523,
"epoch": 0.27222356058781016,
"grad_norm": 2.71875,
"learning_rate": 8.711811963440422e-05,
"loss": 2.1344348907470705,
"mean_token_accuracy": 0.5519571542739868,
"num_tokens": 15558940.0,
"step": 2260
},
{
"entropy": 2.122793698310852,
"epoch": 0.27342808961695975,
"grad_norm": 2.53125,
"learning_rate": 8.698436979542866e-05,
"loss": 2.1378543853759764,
"mean_token_accuracy": 0.5491474151611329,
"num_tokens": 15626777.0,
"step": 2270
},
{
"entropy": 2.1574150919914246,
"epoch": 0.2746326186461094,
"grad_norm": 1.734375,
"learning_rate": 8.685003300274807e-05,
"loss": 2.168122100830078,
"mean_token_accuracy": 0.5528741180896759,
"num_tokens": 15695490.0,
"step": 2280
},
{
"entropy": 2.086066424846649,
"epoch": 0.27583714767525896,
"grad_norm": 2.578125,
"learning_rate": 8.671511138833002e-05,
"loss": 2.095340538024902,
"mean_token_accuracy": 0.5564016461372375,
"num_tokens": 15765669.0,
"step": 2290
},
{
"entropy": 2.1277116537094116,
"epoch": 0.2770416767044086,
"grad_norm": 1.640625,
"learning_rate": 8.657960709342345e-05,
"loss": 2.1454301834106446,
"mean_token_accuracy": 0.5443755030632019,
"num_tokens": 15837031.0,
"step": 2300
},
{
"epoch": 0.2770416767044086,
"eval_entropy": 2.6647491455078125,
"eval_loss": 2.8141632080078125,
"eval_mean_token_accuracy": 0.42728830873966217,
"eval_num_tokens": 15837031.0,
"eval_runtime": 0.5308,
"eval_samples_per_second": 30.146,
"eval_steps_per_second": 3.768,
"step": 2300
},
{
"entropy": 2.0806057333946226,
"epoch": 0.2782462057335582,
"grad_norm": 1.6796875,
"learning_rate": 8.644352226852457e-05,
"loss": 2.100520133972168,
"mean_token_accuracy": 0.5565994262695313,
"num_tokens": 15905623.0,
"step": 2310
},
{
"entropy": 2.071013700962067,
"epoch": 0.27945073476270776,
"grad_norm": 2.109375,
"learning_rate": 8.63068590733428e-05,
"loss": 2.0833396911621094,
"mean_token_accuracy": 0.5573381006717681,
"num_tokens": 15974613.0,
"step": 2320
},
{
"entropy": 2.037881815433502,
"epoch": 0.2806552637918574,
"grad_norm": 1.828125,
"learning_rate": 8.616961967676653e-05,
"loss": 2.0440486907958983,
"mean_token_accuracy": 0.5672590255737304,
"num_tokens": 16043789.0,
"step": 2330
},
{
"entropy": 2.1182125926017763,
"epoch": 0.281859792821007,
"grad_norm": 2.796875,
"learning_rate": 8.603180625682856e-05,
"loss": 2.1264150619506834,
"mean_token_accuracy": 0.5492606639862061,
"num_tokens": 16113815.0,
"step": 2340
},
{
"entropy": 2.1378235816955566,
"epoch": 0.28306432185015656,
"grad_norm": 1.9140625,
"learning_rate": 8.589342100067171e-05,
"loss": 2.1428293228149413,
"mean_token_accuracy": 0.5489014446735382,
"num_tokens": 16184763.0,
"step": 2350
},
{
"entropy": 2.0700816035270693,
"epoch": 0.2842688508793062,
"grad_norm": 3.234375,
"learning_rate": 8.575446610451396e-05,
"loss": 2.101584053039551,
"mean_token_accuracy": 0.5581894755363465,
"num_tokens": 16254530.0,
"step": 2360
},
{
"entropy": 2.141230809688568,
"epoch": 0.2854733799084558,
"grad_norm": 2.234375,
"learning_rate": 8.561494377361371e-05,
"loss": 2.1607639312744142,
"mean_token_accuracy": 0.5493471801280976,
"num_tokens": 16322088.0,
"step": 2370
},
{
"entropy": 2.072463631629944,
"epoch": 0.2866779089376054,
"grad_norm": 2.046875,
"learning_rate": 8.547485622223468e-05,
"loss": 2.0758430480957033,
"mean_token_accuracy": 0.5595114946365356,
"num_tokens": 16393167.0,
"step": 2380
},
{
"entropy": 2.0443200945854185,
"epoch": 0.287882437966755,
"grad_norm": 2.859375,
"learning_rate": 8.533420567361084e-05,
"loss": 2.053724479675293,
"mean_token_accuracy": 0.5662404716014862,
"num_tokens": 16460254.0,
"step": 2390
},
{
"entropy": 2.0922821521759034,
"epoch": 0.2890869669959046,
"grad_norm": 1.96875,
"learning_rate": 8.519299435991111e-05,
"loss": 2.1047943115234373,
"mean_token_accuracy": 0.5579495280981064,
"num_tokens": 16525561.0,
"step": 2400
},
{
"epoch": 0.2890869669959046,
"eval_entropy": 2.6770076751708984,
"eval_loss": 2.8334004878997803,
"eval_mean_token_accuracy": 0.4258348345756531,
"eval_num_tokens": 16525561.0,
"eval_runtime": 0.4008,
"eval_samples_per_second": 39.923,
"eval_steps_per_second": 4.99,
"step": 2400
},
{
"entropy": 2.0860785961151125,
"epoch": 0.2902914960250542,
"grad_norm": 1.8359375,
"learning_rate": 8.505122452220393e-05,
"loss": 2.0898111343383787,
"mean_token_accuracy": 0.5583101451396942,
"num_tokens": 16595852.0,
"step": 2410
},
{
"entropy": 2.0555111885070803,
"epoch": 0.2914960250542038,
"grad_norm": 1.75,
"learning_rate": 8.490889841042167e-05,
"loss": 2.070912742614746,
"mean_token_accuracy": 0.5591276168823243,
"num_tokens": 16666291.0,
"step": 2420
},
{
"entropy": 2.0737984776496887,
"epoch": 0.2927005540833534,
"grad_norm": 4.8125,
"learning_rate": 8.4766018283325e-05,
"loss": 2.1162538528442383,
"mean_token_accuracy": 0.5509154379367829,
"num_tokens": 16736493.0,
"step": 2430
},
{
"entropy": 2.1692824363708496,
"epoch": 0.293905083112503,
"grad_norm": 1.7421875,
"learning_rate": 8.462258640846691e-05,
"loss": 2.1681695938110352,
"mean_token_accuracy": 0.549293601512909,
"num_tokens": 16805384.0,
"step": 2440
},
{
"entropy": 2.118699336051941,
"epoch": 0.2951096121416526,
"grad_norm": 2.359375,
"learning_rate": 8.447860506215691e-05,
"loss": 2.1451501846313477,
"mean_token_accuracy": 0.5516745418310165,
"num_tokens": 16875894.0,
"step": 2450
},
{
"entropy": 2.084539461135864,
"epoch": 0.29631414117080224,
"grad_norm": 2.046875,
"learning_rate": 8.433407652942469e-05,
"loss": 2.085173225402832,
"mean_token_accuracy": 0.560731440782547,
"num_tokens": 16940338.0,
"step": 2460
},
{
"entropy": 2.147885191440582,
"epoch": 0.2975186701999518,
"grad_norm": 2.109375,
"learning_rate": 8.4189003103984e-05,
"loss": 2.170233726501465,
"mean_token_accuracy": 0.5484267741441726,
"num_tokens": 17009690.0,
"step": 2470
},
{
"entropy": 2.0895490050315857,
"epoch": 0.2987231992291014,
"grad_norm": 1.8671875,
"learning_rate": 8.404338708819625e-05,
"loss": 2.098209190368652,
"mean_token_accuracy": 0.5598911285400391,
"num_tokens": 17079699.0,
"step": 2480
},
{
"entropy": 2.047213041782379,
"epoch": 0.29992772825825104,
"grad_norm": 1.890625,
"learning_rate": 8.389723079303387e-05,
"loss": 2.0544561386108398,
"mean_token_accuracy": 0.5627081871032715,
"num_tokens": 17149769.0,
"step": 2490
},
{
"entropy": 2.0825024962425234,
"epoch": 0.3011322572874006,
"grad_norm": 3.59375,
"learning_rate": 8.375053653804373e-05,
"loss": 2.0965812683105467,
"mean_token_accuracy": 0.5609676122665406,
"num_tokens": 17216861.0,
"step": 2500
},
{
"epoch": 0.3011322572874006,
"eval_entropy": 2.6770654916763306,
"eval_loss": 2.7862625122070312,
"eval_mean_token_accuracy": 0.4347201734781265,
"eval_num_tokens": 17216861.0,
"eval_runtime": 0.4051,
"eval_samples_per_second": 39.492,
"eval_steps_per_second": 4.937,
"step": 2500
},
{
"entropy": 2.03642942905426,
"epoch": 0.30233678631655025,
"grad_norm": 2.078125,
"learning_rate": 8.36033066513103e-05,
"loss": 2.076190376281738,
"mean_token_accuracy": 0.565330320596695,
"num_tokens": 17284775.0,
"step": 2510
},
{
"entropy": 2.105399656295776,
"epoch": 0.30354131534569984,
"grad_norm": 2.0,
"learning_rate": 8.345554346941866e-05,
"loss": 2.1103307723999025,
"mean_token_accuracy": 0.556881707906723,
"num_tokens": 17353646.0,
"step": 2520
},
{
"entropy": 1.988648521900177,
"epoch": 0.3047458443748494,
"grad_norm": 2.421875,
"learning_rate": 8.330724933741749e-05,
"loss": 2.002836990356445,
"mean_token_accuracy": 0.5724063634872436,
"num_tokens": 17418925.0,
"step": 2530
},
{
"entropy": 2.1860252976417542,
"epoch": 0.30595037340399905,
"grad_norm": 4.4375,
"learning_rate": 8.315842660878181e-05,
"loss": 2.1974233627319335,
"mean_token_accuracy": 0.5398978978395462,
"num_tokens": 17485994.0,
"step": 2540
},
{
"entropy": 2.1428608536720275,
"epoch": 0.30715490243314864,
"grad_norm": 2.15625,
"learning_rate": 8.300907764537565e-05,
"loss": 2.1552366256713866,
"mean_token_accuracy": 0.5474565207958222,
"num_tokens": 17556847.0,
"step": 2550
},
{
"entropy": 2.0490808844566346,
"epoch": 0.3083594314622982,
"grad_norm": 15.75,
"learning_rate": 8.285920481741448e-05,
"loss": 2.075417137145996,
"mean_token_accuracy": 0.5619370639324188,
"num_tokens": 17623063.0,
"step": 2560
},
{
"entropy": 2.08027925491333,
"epoch": 0.30956396049144785,
"grad_norm": 6.4375,
"learning_rate": 8.270881050342775e-05,
"loss": 2.0900177001953124,
"mean_token_accuracy": 0.5613270044326782,
"num_tokens": 17691470.0,
"step": 2570
},
{
"entropy": 2.0812355279922485,
"epoch": 0.31076848952059744,
"grad_norm": 2.15625,
"learning_rate": 8.255789709022104e-05,
"loss": 2.0924118041992186,
"mean_token_accuracy": 0.560305255651474,
"num_tokens": 17757875.0,
"step": 2580
},
{
"entropy": 1.9864863872528076,
"epoch": 0.31197301854974707,
"grad_norm": 2.171875,
"learning_rate": 8.240646697283818e-05,
"loss": 2.001949119567871,
"mean_token_accuracy": 0.5692925453186035,
"num_tokens": 17825941.0,
"step": 2590
},
{
"entropy": 2.031517136096954,
"epoch": 0.31317754757889665,
"grad_norm": 1.7890625,
"learning_rate": 8.225452255452325e-05,
"loss": 2.0528676986694334,
"mean_token_accuracy": 0.5633283078670501,
"num_tokens": 17899022.0,
"step": 2600
},
{
"epoch": 0.31317754757889665,
"eval_entropy": 2.6427732706069946,
"eval_loss": 2.7711801528930664,
"eval_mean_token_accuracy": 0.4326988756656647,
"eval_num_tokens": 17899022.0,
"eval_runtime": 0.398,
"eval_samples_per_second": 40.197,
"eval_steps_per_second": 5.025,
"step": 2600
},
{
"entropy": 2.148902940750122,
"epoch": 0.31438207660804623,
"grad_norm": 2.421875,
"learning_rate": 8.210206624668249e-05,
"loss": 2.164246940612793,
"mean_token_accuracy": 0.5463122427463531,
"num_tokens": 17968591.0,
"step": 2610
},
{
"entropy": 2.1614266276359557,
"epoch": 0.31558660563719587,
"grad_norm": 1.75,
"learning_rate": 8.194910046884595e-05,
"loss": 2.161795997619629,
"mean_token_accuracy": 0.5490990400314331,
"num_tokens": 18035770.0,
"step": 2620
},
{
"entropy": 2.0735299348831178,
"epoch": 0.31679113466634545,
"grad_norm": 1.8828125,
"learning_rate": 8.179562764862918e-05,
"loss": 2.090145492553711,
"mean_token_accuracy": 0.561646330356598,
"num_tokens": 18102572.0,
"step": 2630
},
{
"entropy": 2.059981143474579,
"epoch": 0.31799566369549503,
"grad_norm": 2.25,
"learning_rate": 8.16416502216946e-05,
"loss": 2.070825958251953,
"mean_token_accuracy": 0.559648597240448,
"num_tokens": 18168404.0,
"step": 2640
},
{
"entropy": 2.0211940169334413,
"epoch": 0.31920019272464467,
"grad_norm": 2.46875,
"learning_rate": 8.148717063171292e-05,
"loss": 2.0402278900146484,
"mean_token_accuracy": 0.5664457201957702,
"num_tokens": 18238822.0,
"step": 2650
},
{
"entropy": 2.115056884288788,
"epoch": 0.32040472175379425,
"grad_norm": 2.453125,
"learning_rate": 8.133219133032432e-05,
"loss": 2.125631904602051,
"mean_token_accuracy": 0.5588094294071198,
"num_tokens": 18304127.0,
"step": 2660
},
{
"entropy": 2.096850049495697,
"epoch": 0.3216092507829439,
"grad_norm": 2.875,
"learning_rate": 8.117671477709962e-05,
"loss": 2.1195550918579102,
"mean_token_accuracy": 0.5529998481273651,
"num_tokens": 18368937.0,
"step": 2670
},
{
"entropy": 2.0709152102470396,
"epoch": 0.32281377981209347,
"grad_norm": 1.6875,
"learning_rate": 8.102074343950113e-05,
"loss": 2.0848411560058593,
"mean_token_accuracy": 0.5579396069049836,
"num_tokens": 18438067.0,
"step": 2680
},
{
"entropy": 2.151025879383087,
"epoch": 0.32401830884124305,
"grad_norm": 2.0625,
"learning_rate": 8.086427979284352e-05,
"loss": 2.1543193817138673,
"mean_token_accuracy": 0.5485984563827515,
"num_tokens": 18508650.0,
"step": 2690
},
{
"entropy": 2.074105453491211,
"epoch": 0.3252228378703927,
"grad_norm": 1.7734375,
"learning_rate": 8.070732632025464e-05,
"loss": 2.0990238189697266,
"mean_token_accuracy": 0.558820104598999,
"num_tokens": 18578127.0,
"step": 2700
},
{
"epoch": 0.3252228378703927,
"eval_entropy": 2.59474778175354,
"eval_loss": 2.784816265106201,
"eval_mean_token_accuracy": 0.43576808273792267,
"eval_num_tokens": 18578127.0,
"eval_runtime": 0.4052,
"eval_samples_per_second": 39.487,
"eval_steps_per_second": 4.936,
"step": 2700
},
{
"entropy": 2.059889006614685,
"epoch": 0.32642736689954227,
"grad_norm": 1.828125,
"learning_rate": 8.054988551263596e-05,
"loss": 2.0819252014160154,
"mean_token_accuracy": 0.5544521391391755,
"num_tokens": 18648090.0,
"step": 2710
},
{
"entropy": 2.0784488558769225,
"epoch": 0.3276318959286919,
"grad_norm": 2.234375,
"learning_rate": 8.039195986862317e-05,
"loss": 2.0795106887817383,
"mean_token_accuracy": 0.5613572120666503,
"num_tokens": 18716753.0,
"step": 2720
},
{
"entropy": 2.0666807651519776,
"epoch": 0.3288364249578415,
"grad_norm": 2.234375,
"learning_rate": 8.023355189454645e-05,
"loss": 2.084706497192383,
"mean_token_accuracy": 0.5587452292442322,
"num_tokens": 18787606.0,
"step": 2730
},
{
"entropy": 1.9979267120361328,
"epoch": 0.33004095398699107,
"grad_norm": 1.734375,
"learning_rate": 8.007466410439065e-05,
"loss": 2.005443000793457,
"mean_token_accuracy": 0.570879477262497,
"num_tokens": 18851977.0,
"step": 2740
},
{
"entropy": 2.0454527735710144,
"epoch": 0.3312454830161407,
"grad_norm": 2.921875,
"learning_rate": 7.991529901975557e-05,
"loss": 2.0477304458618164,
"mean_token_accuracy": 0.567040553689003,
"num_tokens": 18922821.0,
"step": 2750
},
{
"entropy": 2.019996762275696,
"epoch": 0.3324500120452903,
"grad_norm": 4.46875,
"learning_rate": 7.97554591698157e-05,
"loss": 2.023045539855957,
"mean_token_accuracy": 0.5663474082946778,
"num_tokens": 18992620.0,
"step": 2760
},
{
"entropy": 2.016635000705719,
"epoch": 0.33365454107443987,
"grad_norm": 1.8125,
"learning_rate": 7.95951470912803e-05,
"loss": 2.0342525482177733,
"mean_token_accuracy": 0.5686039090156555,
"num_tokens": 19059719.0,
"step": 2770
},
{
"entropy": 2.0925203800201415,
"epoch": 0.3348590701035895,
"grad_norm": 2.53125,
"learning_rate": 7.943436532835304e-05,
"loss": 2.0941793441772463,
"mean_token_accuracy": 0.5635069251060486,
"num_tokens": 19128166.0,
"step": 2780
},
{
"entropy": 2.011734688282013,
"epoch": 0.3360635991327391,
"grad_norm": 2.234375,
"learning_rate": 7.927311643269157e-05,
"loss": 2.0319175720214844,
"mean_token_accuracy": 0.5642287373542786,
"num_tokens": 19198350.0,
"step": 2790
},
{
"entropy": 2.161832857131958,
"epoch": 0.3372681281618887,
"grad_norm": 2.0625,
"learning_rate": 7.911140296336712e-05,
"loss": 2.171921730041504,
"mean_token_accuracy": 0.544024670124054,
"num_tokens": 19265054.0,
"step": 2800
},
{
"epoch": 0.3372681281618887,
"eval_entropy": 2.61099910736084,
"eval_loss": 2.7639856338500977,
"eval_mean_token_accuracy": 0.4430377185344696,
"eval_num_tokens": 19265054.0,
"eval_runtime": 0.3884,
"eval_samples_per_second": 41.198,
"eval_steps_per_second": 5.15,
"step": 2800
},
{
"entropy": 1.932697057723999,
"epoch": 0.3384726571910383,
"grad_norm": 2.21875,
"learning_rate": 7.894922748682387e-05,
"loss": 1.9408197402954102,
"mean_token_accuracy": 0.5819644033908844,
"num_tokens": 19335074.0,
"step": 2810
},
{
"entropy": 2.092177450656891,
"epoch": 0.3396771862201879,
"grad_norm": 1.9921875,
"learning_rate": 7.878659257683819e-05,
"loss": 2.1017913818359375,
"mean_token_accuracy": 0.5499498665332794,
"num_tokens": 19404061.0,
"step": 2820
},
{
"entropy": 2.06910434961319,
"epoch": 0.3408817152493375,
"grad_norm": 2.21875,
"learning_rate": 7.862350081447777e-05,
"loss": 2.0780174255371096,
"mean_token_accuracy": 0.5591690957546234,
"num_tokens": 19470099.0,
"step": 2830
},
{
"entropy": 1.980467677116394,
"epoch": 0.3420862442784871,
"grad_norm": 2.46875,
"learning_rate": 7.845995478806075e-05,
"loss": 1.994614601135254,
"mean_token_accuracy": 0.5735546886920929,
"num_tokens": 19534144.0,
"step": 2840
},
{
"entropy": 2.00383620262146,
"epoch": 0.3432907733076367,
"grad_norm": 3.234375,
"learning_rate": 7.829595709311454e-05,
"loss": 2.006473159790039,
"mean_token_accuracy": 0.5768690049648285,
"num_tokens": 19603676.0,
"step": 2850
},
{
"entropy": 2.020020580291748,
"epoch": 0.3444953023367863,
"grad_norm": 1.9375,
"learning_rate": 7.813151033233469e-05,
"loss": 2.046031951904297,
"mean_token_accuracy": 0.5671627938747406,
"num_tokens": 19671521.0,
"step": 2860
},
{
"entropy": 2.0426268339157105,
"epoch": 0.3456998313659359,
"grad_norm": 2.421875,
"learning_rate": 7.796661711554358e-05,
"loss": 2.051487350463867,
"mean_token_accuracy": 0.5657259941101074,
"num_tokens": 19740842.0,
"step": 2870
},
{
"entropy": 2.0123592138290407,
"epoch": 0.34690436039508554,
"grad_norm": 1.96875,
"learning_rate": 7.780128005964897e-05,
"loss": 2.030988311767578,
"mean_token_accuracy": 0.569215327501297,
"num_tokens": 19810966.0,
"step": 2880
},
{
"entropy": 1.9815311908721924,
"epoch": 0.3481088894242351,
"grad_norm": 2.0625,
"learning_rate": 7.763550178860249e-05,
"loss": 1.9935127258300782,
"mean_token_accuracy": 0.5781533777713775,
"num_tokens": 19882666.0,
"step": 2890
},
{
"entropy": 2.0456610202789305,
"epoch": 0.3493134184533847,
"grad_norm": 1.6171875,
"learning_rate": 7.746928493335798e-05,
"loss": 2.040866470336914,
"mean_token_accuracy": 0.5613146841526031,
"num_tokens": 19952153.0,
"step": 2900
},
{
"epoch": 0.3493134184533847,
"eval_entropy": 2.573462128639221,
"eval_loss": 2.7504706382751465,
"eval_mean_token_accuracy": 0.446028009057045,
"eval_num_tokens": 19952153.0,
"eval_runtime": 0.4463,
"eval_samples_per_second": 35.853,
"eval_steps_per_second": 4.482,
"step": 2900
},
{
"entropy": 2.051629662513733,
"epoch": 0.35051794748253434,
"grad_norm": 2.296875,
"learning_rate": 7.73026321318298e-05,
"loss": 2.0937450408935545,
"mean_token_accuracy": 0.559838205575943,
"num_tokens": 20018290.0,
"step": 2910
},
{
"entropy": 2.0276250004768372,
"epoch": 0.3517224765116839,
"grad_norm": 3.4375,
"learning_rate": 7.713554602885086e-05,
"loss": 2.0289745330810547,
"mean_token_accuracy": 0.569722706079483,
"num_tokens": 20086770.0,
"step": 2920
},
{
"entropy": 2.0052628755569457,
"epoch": 0.35292700554083356,
"grad_norm": 1.8671875,
"learning_rate": 7.696802927613077e-05,
"loss": 2.0254388809204102,
"mean_token_accuracy": 0.5706159889698028,
"num_tokens": 20157059.0,
"step": 2930
},
{
"entropy": 2.06208598613739,
"epoch": 0.35413153456998314,
"grad_norm": 1.7890625,
"learning_rate": 7.68000845322136e-05,
"loss": 2.074476432800293,
"mean_token_accuracy": 0.5622613191604614,
"num_tokens": 20225520.0,
"step": 2940
},
{
"entropy": 2.0250381588935853,
"epoch": 0.3553360635991327,
"grad_norm": 2.9375,
"learning_rate": 7.663171446243582e-05,
"loss": 2.0442649841308596,
"mean_token_accuracy": 0.56498042345047,
"num_tokens": 20292984.0,
"step": 2950
},
{
"entropy": 2.0137596130371094,
"epoch": 0.35654059262828236,
"grad_norm": 2.109375,
"learning_rate": 7.646292173888399e-05,
"loss": 2.0237215042114256,
"mean_token_accuracy": 0.569210535287857,
"num_tokens": 20364322.0,
"step": 2960
},
{
"entropy": 2.076921796798706,
"epoch": 0.35774512165743194,
"grad_norm": 2.65625,
"learning_rate": 7.629370904035227e-05,
"loss": 2.0826812744140626,
"mean_token_accuracy": 0.5618605375289917,
"num_tokens": 20427686.0,
"step": 2970
},
{
"entropy": 2.03226181268692,
"epoch": 0.3589496506865815,
"grad_norm": 2.0,
"learning_rate": 7.612407905229996e-05,
"loss": 2.067717361450195,
"mean_token_accuracy": 0.558159738779068,
"num_tokens": 20498265.0,
"step": 2980
},
{
"entropy": 1.9998491644859313,
"epoch": 0.36015417971573116,
"grad_norm": 1.90625,
"learning_rate": 7.595403446680894e-05,
"loss": 2.007455825805664,
"mean_token_accuracy": 0.5721205115318299,
"num_tokens": 20564625.0,
"step": 2990
},
{
"entropy": 2.021931827068329,
"epoch": 0.36135870874488074,
"grad_norm": 3.03125,
"learning_rate": 7.578357798254076e-05,
"loss": 2.0407316207885744,
"mean_token_accuracy": 0.5663074970245361,
"num_tokens": 20632253.0,
"step": 3000
},
{
"epoch": 0.36135870874488074,
"eval_entropy": 2.6078274250030518,
"eval_loss": 2.7409777641296387,
"eval_mean_token_accuracy": 0.4436866343021393,
"eval_num_tokens": 20632253.0,
"eval_runtime": 0.505,
"eval_samples_per_second": 31.685,
"eval_steps_per_second": 3.961,
"step": 3000
},
{
"entropy": 2.0366801381111146,
"epoch": 0.3625632377740304,
"grad_norm": 2.171875,
"learning_rate": 7.561271230469409e-05,
"loss": 2.0368415832519533,
"mean_token_accuracy": 0.5711302816867828,
"num_tokens": 20700182.0,
"step": 3010
},
{
"entropy": 2.064214277267456,
"epoch": 0.36376776680317996,
"grad_norm": 2.28125,
"learning_rate": 7.544144014496148e-05,
"loss": 2.0829124450683594,
"mean_token_accuracy": 0.5591930508613586,
"num_tokens": 20769809.0,
"step": 3020
},
{
"entropy": 2.0133578896522524,
"epoch": 0.36497229583232954,
"grad_norm": 2.546875,
"learning_rate": 7.52697642214866e-05,
"loss": 2.0229619979858398,
"mean_token_accuracy": 0.5690375864505768,
"num_tokens": 20839028.0,
"step": 3030
},
{
"entropy": 1.9779277443885803,
"epoch": 0.3661768248614792,
"grad_norm": 1.65625,
"learning_rate": 7.50976872588209e-05,
"loss": 2.0208606719970703,
"mean_token_accuracy": 0.5698799788951874,
"num_tokens": 20908277.0,
"step": 3040
},
{
"entropy": 2.101050305366516,
"epoch": 0.36738135389062876,
"grad_norm": 1.84375,
"learning_rate": 7.492521198788049e-05,
"loss": 2.086177444458008,
"mean_token_accuracy": 0.5555986344814301,
"num_tokens": 20975369.0,
"step": 3050
},
{
"entropy": 1.9677135944366455,
"epoch": 0.36858588291977834,
"grad_norm": 1.6171875,
"learning_rate": 7.475234114590272e-05,
"loss": 1.979222297668457,
"mean_token_accuracy": 0.5747052133083344,
"num_tokens": 21044671.0,
"step": 3060
},
{
"entropy": 1.9906654000282287,
"epoch": 0.369790411948928,
"grad_norm": 1.8984375,
"learning_rate": 7.457907747640285e-05,
"loss": 2.018314170837402,
"mean_token_accuracy": 0.5706595480442047,
"num_tokens": 21114968.0,
"step": 3070
},
{
"entropy": 2.0226004838943483,
"epoch": 0.37099494097807756,
"grad_norm": 2.890625,
"learning_rate": 7.440542372913035e-05,
"loss": 2.0287120819091795,
"mean_token_accuracy": 0.5678030729293824,
"num_tokens": 21184669.0,
"step": 3080
},
{
"entropy": 2.075468158721924,
"epoch": 0.3721994700072272,
"grad_norm": 2.1875,
"learning_rate": 7.42313826600254e-05,
"loss": 2.071501922607422,
"mean_token_accuracy": 0.5608702659606933,
"num_tokens": 21251886.0,
"step": 3090
},
{
"entropy": 1.97612144947052,
"epoch": 0.3734039990363768,
"grad_norm": 2.0625,
"learning_rate": 7.40569570311751e-05,
"loss": 2.0007991790771484,
"mean_token_accuracy": 0.5756966292858123,
"num_tokens": 21321340.0,
"step": 3100
},
{
"epoch": 0.3734039990363768,
"eval_entropy": 2.5279862880706787,
"eval_loss": 2.7285852432250977,
"eval_mean_token_accuracy": 0.4447345584630966,
"eval_num_tokens": 21321340.0,
"eval_runtime": 0.4146,
"eval_samples_per_second": 38.592,
"eval_steps_per_second": 4.824,
"step": 3100
},
{
"entropy": 2.1051839351654054,
"epoch": 0.37460852806552636,
"grad_norm": 1.59375,
"learning_rate": 7.388214961076961e-05,
"loss": 2.1164289474487306,
"mean_token_accuracy": 0.5555289804935455,
"num_tokens": 21388828.0,
"step": 3110
},
{
"entropy": 1.9983577013015748,
"epoch": 0.375813057094676,
"grad_norm": 3.96875,
"learning_rate": 7.370696317305828e-05,
"loss": 2.0253278732299806,
"mean_token_accuracy": 0.5709738373756409,
"num_tokens": 21456866.0,
"step": 3120
},
{
"entropy": 1.9110666394233704,
"epoch": 0.3770175861238256,
"grad_norm": 1.953125,
"learning_rate": 7.353140049830552e-05,
"loss": 1.926797103881836,
"mean_token_accuracy": 0.5828823685646057,
"num_tokens": 21526510.0,
"step": 3130
},
{
"entropy": 2.082789492607117,
"epoch": 0.3782221151529752,
"grad_norm": 1.8125,
"learning_rate": 7.335546437274684e-05,
"loss": 2.072815704345703,
"mean_token_accuracy": 0.5597695082426071,
"num_tokens": 21593938.0,
"step": 3140
},
{
"entropy": 1.9497243881225585,
"epoch": 0.3794266441821248,
"grad_norm": 1.78125,
"learning_rate": 7.317915758854445e-05,
"loss": 1.986886978149414,
"mean_token_accuracy": 0.5727407991886139,
"num_tokens": 21659848.0,
"step": 3150
},
{
"entropy": 2.0966204047203063,
"epoch": 0.3806311732112744,
"grad_norm": 1.8359375,
"learning_rate": 7.300248294374305e-05,
"loss": 2.077589225769043,
"mean_token_accuracy": 0.5576497077941894,
"num_tokens": 21730692.0,
"step": 3160
},
{
"entropy": 1.9753968596458436,
"epoch": 0.381835702240424,
"grad_norm": 2.1875,
"learning_rate": 7.282544324222544e-05,
"loss": 2.011469268798828,
"mean_token_accuracy": 0.5723488807678223,
"num_tokens": 21799339.0,
"step": 3170
},
{
"entropy": 2.0465827584266663,
"epoch": 0.3830402312695736,
"grad_norm": 2.796875,
"learning_rate": 7.264804129366796e-05,
"loss": 2.0382017135620116,
"mean_token_accuracy": 0.562414237856865,
"num_tokens": 21869480.0,
"step": 3180
},
{
"entropy": 1.9501657962799073,
"epoch": 0.3842447602987232,
"grad_norm": 2.046875,
"learning_rate": 7.24702799134959e-05,
"loss": 1.9487823486328124,
"mean_token_accuracy": 0.5793662428855896,
"num_tokens": 21937348.0,
"step": 3190
},
{
"entropy": 1.964065945148468,
"epoch": 0.3854492893278728,
"grad_norm": 1.6953125,
"learning_rate": 7.229216192283887e-05,
"loss": 1.9955894470214843,
"mean_token_accuracy": 0.5740731418132782,
"num_tokens": 22007091.0,
"step": 3200
},
{
"epoch": 0.3854492893278728,
"eval_entropy": 2.611129403114319,
"eval_loss": 2.7415380477905273,
"eval_mean_token_accuracy": 0.4391617476940155,
"eval_num_tokens": 22007091.0,
"eval_runtime": 0.404,
"eval_samples_per_second": 39.603,
"eval_steps_per_second": 4.95,
"step": 3200
},
{
"entropy": 2.006621611118317,
"epoch": 0.3866538183570224,
"grad_norm": 1.8203125,
"learning_rate": 7.211369014848601e-05,
"loss": 1.9990266799926757,
"mean_token_accuracy": 0.5689637005329132,
"num_tokens": 22074714.0,
"step": 3210
},
{
"entropy": 2.0335186243057253,
"epoch": 0.38785834738617203,
"grad_norm": 1.8203125,
"learning_rate": 7.193486742284112e-05,
"loss": 2.0472951889038087,
"mean_token_accuracy": 0.5649256110191345,
"num_tokens": 22143715.0,
"step": 3220
},
{
"entropy": 1.8772284269332886,
"epoch": 0.3890628764153216,
"grad_norm": 2.125,
"learning_rate": 7.175569658387769e-05,
"loss": 1.885722541809082,
"mean_token_accuracy": 0.5915811598300934,
"num_tokens": 22212593.0,
"step": 3230
},
{
"entropy": 2.106627869606018,
"epoch": 0.3902674054444712,
"grad_norm": 2.4375,
"learning_rate": 7.157618047509387e-05,
"loss": 2.1436986923217773,
"mean_token_accuracy": 0.5498576521873474,
"num_tokens": 22277182.0,
"step": 3240
},
{
"entropy": 2.0270182132720946,
"epoch": 0.39147193447362083,
"grad_norm": 2.921875,
"learning_rate": 7.139632194546742e-05,
"loss": 2.0177675247192384,
"mean_token_accuracy": 0.5683903455734253,
"num_tokens": 22346583.0,
"step": 3250
},
{
"entropy": 1.972713053226471,
"epoch": 0.3926764635027704,
"grad_norm": 1.796875,
"learning_rate": 7.121612384941033e-05,
"loss": 1.9979860305786132,
"mean_token_accuracy": 0.5702839136123657,
"num_tokens": 22417261.0,
"step": 3260
},
{
"entropy": 2.0097543239593505,
"epoch": 0.39388099253192,
"grad_norm": 2.609375,
"learning_rate": 7.103558904672368e-05,
"loss": 2.0144615173339844,
"mean_token_accuracy": 0.5682908892631531,
"num_tokens": 22483947.0,
"step": 3270
},
{
"entropy": 1.9494592428207398,
"epoch": 0.39508552156106963,
"grad_norm": 2.046875,
"learning_rate": 7.085472040255218e-05,
"loss": 1.9564752578735352,
"mean_token_accuracy": 0.5814142465591431,
"num_tokens": 22552472.0,
"step": 3280
},
{
"entropy": 1.9892062902450562,
"epoch": 0.3962900505902192,
"grad_norm": 2.109375,
"learning_rate": 7.067352078733872e-05,
"loss": 2.0238758087158204,
"mean_token_accuracy": 0.5674407482147217,
"num_tokens": 22624444.0,
"step": 3290
},
{
"entropy": 2.102202832698822,
"epoch": 0.39749457961936885,
"grad_norm": 2.078125,
"learning_rate": 7.049199307677876e-05,
"loss": 2.093918800354004,
"mean_token_accuracy": 0.5605535507202148,
"num_tokens": 22693651.0,
"step": 3300
},
{
"epoch": 0.39749457961936885,
"eval_entropy": 2.5247026681900024,
"eval_loss": 2.729149341583252,
"eval_mean_token_accuracy": 0.4524097591638565,
"eval_num_tokens": 22693651.0,
"eval_runtime": 0.4863,
"eval_samples_per_second": 32.903,
"eval_steps_per_second": 4.113,
"step": 3300
},
{
"entropy": 1.9703481793403625,
"epoch": 0.39869910864851843,
"grad_norm": 1.8046875,
"learning_rate": 7.031014015177478e-05,
"loss": 1.9685224533081054,
"mean_token_accuracy": 0.571619188785553,
"num_tokens": 22763765.0,
"step": 3310
},
{
"entropy": 1.9734365105628968,
"epoch": 0.399903637677668,
"grad_norm": 1.9765625,
"learning_rate": 7.012796489839053e-05,
"loss": 2.009431838989258,
"mean_token_accuracy": 0.5690336644649505,
"num_tokens": 22833017.0,
"step": 3320
},
{
"entropy": 1.9694549560546875,
"epoch": 0.40110816670681765,
"grad_norm": 2.03125,
"learning_rate": 6.994547020780516e-05,
"loss": 1.9559648513793946,
"mean_token_accuracy": 0.5767580509185791,
"num_tokens": 22900580.0,
"step": 3330
},
{
"entropy": 2.001281261444092,
"epoch": 0.40231269573596723,
"grad_norm": 2.140625,
"learning_rate": 6.976265897626743e-05,
"loss": 2.033053398132324,
"mean_token_accuracy": 0.5716820240020752,
"num_tokens": 22968600.0,
"step": 3340
},
{
"entropy": 2.025617945194244,
"epoch": 0.40351722476511687,
"grad_norm": 1.9609375,
"learning_rate": 6.95795341050497e-05,
"loss": 2.0243719100952147,
"mean_token_accuracy": 0.5736408472061157,
"num_tokens": 23034468.0,
"step": 3350
},
{
"entropy": 2.0567389011383055,
"epoch": 0.40472175379426645,
"grad_norm": 2.453125,
"learning_rate": 6.93960985004019e-05,
"loss": 2.0726577758789064,
"mean_token_accuracy": 0.5588089168071747,
"num_tokens": 23100522.0,
"step": 3360
},
{
"entropy": 2.011142539978027,
"epoch": 0.40592628282341603,
"grad_norm": 2.125,
"learning_rate": 6.921235507350536e-05,
"loss": 2.0313907623291017,
"mean_token_accuracy": 0.5747258961200714,
"num_tokens": 23171707.0,
"step": 3370
},
{
"entropy": 1.941812264919281,
"epoch": 0.40713081185256567,
"grad_norm": 2.0625,
"learning_rate": 6.902830674042667e-05,
"loss": 1.9646072387695312,
"mean_token_accuracy": 0.5840674757957458,
"num_tokens": 23237006.0,
"step": 3380
},
{
"entropy": 2.0653809905052185,
"epoch": 0.40833534088171525,
"grad_norm": 2.21875,
"learning_rate": 6.884395642207141e-05,
"loss": 2.0873439788818358,
"mean_token_accuracy": 0.5598014950752258,
"num_tokens": 23303618.0,
"step": 3390
},
{
"entropy": 2.088254952430725,
"epoch": 0.40953986991086483,
"grad_norm": 2.109375,
"learning_rate": 6.865930704413771e-05,
"loss": 2.0937089920043945,
"mean_token_accuracy": 0.5576972186565399,
"num_tokens": 23371466.0,
"step": 3400
},
{
"epoch": 0.40953986991086483,
"eval_entropy": 2.4787381887435913,
"eval_loss": 2.6963882446289062,
"eval_mean_token_accuracy": 0.45208749175071716,
"eval_num_tokens": 23371466.0,
"eval_runtime": 0.416,
"eval_samples_per_second": 38.462,
"eval_steps_per_second": 4.808,
"step": 3400
},
{
"entropy": 1.9383289098739624,
"epoch": 0.41074439894001447,
"grad_norm": 1.71875,
"learning_rate": 6.84743615370699e-05,
"loss": 1.9510780334472657,
"mean_token_accuracy": 0.584078460931778,
"num_tokens": 23438546.0,
"step": 3410
},
{
"entropy": 1.9791313052177428,
"epoch": 0.41194892796916405,
"grad_norm": 2.5625,
"learning_rate": 6.828912283601195e-05,
"loss": 1.9753469467163085,
"mean_token_accuracy": 0.5759646952152252,
"num_tokens": 23508413.0,
"step": 3420
},
{
"entropy": 1.9467034935951233,
"epoch": 0.4131534569983137,
"grad_norm": 1.71875,
"learning_rate": 6.810359388076097e-05,
"loss": 1.9725639343261718,
"mean_token_accuracy": 0.5774512171745301,
"num_tokens": 23578028.0,
"step": 3430
},
{
"entropy": 2.0062270283699037,
"epoch": 0.41435798602746327,
"grad_norm": 2.421875,
"learning_rate": 6.79177776157204e-05,
"loss": 2.0015352249145506,
"mean_token_accuracy": 0.5665505826473236,
"num_tokens": 23648449.0,
"step": 3440
},
{
"entropy": 2.0732170820236204,
"epoch": 0.41556251505661285,
"grad_norm": 2.328125,
"learning_rate": 6.773167698985348e-05,
"loss": 2.0981536865234376,
"mean_token_accuracy": 0.5579517900943756,
"num_tokens": 23717754.0,
"step": 3450
},
{
"entropy": 2.0526692628860475,
"epoch": 0.4167670440857625,
"grad_norm": 2.640625,
"learning_rate": 6.754529495663627e-05,
"loss": 2.053252601623535,
"mean_token_accuracy": 0.5652489423751831,
"num_tokens": 23785096.0,
"step": 3460
},
{
"entropy": 2.0952977895736695,
"epoch": 0.41797157311491206,
"grad_norm": 1.890625,
"learning_rate": 6.73586344740109e-05,
"loss": 2.1027814865112306,
"mean_token_accuracy": 0.5595345795154572,
"num_tokens": 23853741.0,
"step": 3470
},
{
"entropy": 2.0081144332885743,
"epoch": 0.41917610214406165,
"grad_norm": 1.671875,
"learning_rate": 6.717169850433857e-05,
"loss": 2.0238536834716796,
"mean_token_accuracy": 0.5671063840389252,
"num_tokens": 23922684.0,
"step": 3480
},
{
"entropy": 1.9314401149749756,
"epoch": 0.4203806311732113,
"grad_norm": 5.875,
"learning_rate": 6.698449001435251e-05,
"loss": 1.9350923538208007,
"mean_token_accuracy": 0.5847425639629364,
"num_tokens": 23991091.0,
"step": 3490
},
{
"entropy": 1.9933564424514771,
"epoch": 0.42158516020236086,
"grad_norm": 2.40625,
"learning_rate": 6.679701197511098e-05,
"loss": 2.0253509521484374,
"mean_token_accuracy": 0.5672946512699127,
"num_tokens": 24062333.0,
"step": 3500
},
{
"epoch": 0.42158516020236086,
"eval_entropy": 2.4498034715652466,
"eval_loss": 2.6938982009887695,
"eval_mean_token_accuracy": 0.45604458451271057,
"eval_num_tokens": 24062333.0,
"eval_runtime": 0.4009,
"eval_samples_per_second": 39.915,
"eval_steps_per_second": 4.989,
"step": 3500
},
{
"entropy": 1.993247103691101,
"epoch": 0.4227896892315105,
"grad_norm": 2.0,
"learning_rate": 6.660926736195007e-05,
"loss": 1.985020065307617,
"mean_token_accuracy": 0.572957593202591,
"num_tokens": 24130009.0,
"step": 3510
},
{
"entropy": 2.004629743099213,
"epoch": 0.4239942182606601,
"grad_norm": 1.5859375,
"learning_rate": 6.642125915443646e-05,
"loss": 2.0138731002807617,
"mean_token_accuracy": 0.5689170718193054,
"num_tokens": 24198224.0,
"step": 3520
},
{
"entropy": 1.992073893547058,
"epoch": 0.42519874728980966,
"grad_norm": 1.8203125,
"learning_rate": 6.623299033632015e-05,
"loss": 2.003081703186035,
"mean_token_accuracy": 0.5710993528366088,
"num_tokens": 24269236.0,
"step": 3530
},
{
"entropy": 1.9667566299438477,
"epoch": 0.4264032763189593,
"grad_norm": 2.625,
"learning_rate": 6.604446389548718e-05,
"loss": 1.9854969024658202,
"mean_token_accuracy": 0.5752442955970765,
"num_tokens": 24336698.0,
"step": 3540
},
{
"entropy": 1.9823364615440369,
"epoch": 0.4276078053481089,
"grad_norm": 7.03125,
"learning_rate": 6.585568282391202e-05,
"loss": 2.002846336364746,
"mean_token_accuracy": 0.5752040565013885,
"num_tokens": 24404740.0,
"step": 3550
},
{
"entropy": 1.9646108746528625,
"epoch": 0.4288123343772585,
"grad_norm": 2.484375,
"learning_rate": 6.566665011761036e-05,
"loss": 1.9754671096801757,
"mean_token_accuracy": 0.5784339427947998,
"num_tokens": 24471666.0,
"step": 3560
},
{
"entropy": 1.9380438804626465,
"epoch": 0.4300168634064081,
"grad_norm": 2.015625,
"learning_rate": 6.547736877659129e-05,
"loss": 1.948002815246582,
"mean_token_accuracy": 0.5778752982616424,
"num_tokens": 24541106.0,
"step": 3570
},
{
"entropy": 1.9597272753715516,
"epoch": 0.4312213924355577,
"grad_norm": 2.046875,
"learning_rate": 6.528784180480987e-05,
"loss": 1.9658893585205077,
"mean_token_accuracy": 0.5809767782688141,
"num_tokens": 24610669.0,
"step": 3580
},
{
"entropy": 2.016139495372772,
"epoch": 0.4324259214647073,
"grad_norm": 1.90625,
"learning_rate": 6.509807221011939e-05,
"loss": 2.0279006958007812,
"mean_token_accuracy": 0.5701642394065857,
"num_tokens": 24677668.0,
"step": 3590
},
{
"entropy": 1.9099430084228515,
"epoch": 0.4336304504938569,
"grad_norm": 2.171875,
"learning_rate": 6.490806300422363e-05,
"loss": 1.9234348297119142,
"mean_token_accuracy": 0.5862063884735107,
"num_tokens": 24747672.0,
"step": 3600
},
{
"epoch": 0.4336304504938569,
"eval_entropy": 2.477397084236145,
"eval_loss": 2.695096969604492,
"eval_mean_token_accuracy": 0.463316410779953,
"eval_num_tokens": 24747672.0,
"eval_runtime": 0.4717,
"eval_samples_per_second": 33.919,
"eval_steps_per_second": 4.24,
"step": 3600
},
{
"entropy": 2.0054628014564515,
"epoch": 0.4348349795230065,
"grad_norm": 1.8203125,
"learning_rate": 6.47178172026291e-05,
"loss": 2.007233238220215,
"mean_token_accuracy": 0.5705864608287812,
"num_tokens": 24813790.0,
"step": 3610
},
{
"entropy": 2.009073185920715,
"epoch": 0.4360395085521561,
"grad_norm": 1.65625,
"learning_rate": 6.452733782459717e-05,
"loss": 2.029523468017578,
"mean_token_accuracy": 0.5658802688121796,
"num_tokens": 24883706.0,
"step": 3620
},
{
"entropy": 1.9586613178253174,
"epoch": 0.4372440375813057,
"grad_norm": 2.15625,
"learning_rate": 6.433662789309605e-05,
"loss": 1.9708032608032227,
"mean_token_accuracy": 0.5791431427001953,
"num_tokens": 24951765.0,
"step": 3630
},
{
"entropy": 1.9156463623046875,
"epoch": 0.43844856661045534,
"grad_norm": 1.625,
"learning_rate": 6.414569043475305e-05,
"loss": 1.9143449783325195,
"mean_token_accuracy": 0.5888265132904053,
"num_tokens": 25019817.0,
"step": 3640
},
{
"entropy": 1.9117112517356873,
"epoch": 0.4396530956396049,
"grad_norm": 1.578125,
"learning_rate": 6.395452847980628e-05,
"loss": 1.9290433883666993,
"mean_token_accuracy": 0.5843783140182495,
"num_tokens": 25088811.0,
"step": 3650
},
{
"entropy": 1.9448886394500733,
"epoch": 0.4408576246687545,
"grad_norm": 6.375,
"learning_rate": 6.376314506205675e-05,
"loss": 1.9512279510498047,
"mean_token_accuracy": 0.585245794057846,
"num_tokens": 25154490.0,
"step": 3660
},
{
"entropy": 1.9380404233932496,
"epoch": 0.44206215369790414,
"grad_norm": 1.8125,
"learning_rate": 6.357154321882012e-05,
"loss": 1.9418190002441407,
"mean_token_accuracy": 0.5806894898414612,
"num_tokens": 25222490.0,
"step": 3670
},
{
"entropy": 2.007444739341736,
"epoch": 0.4432666827270537,
"grad_norm": 2.90625,
"learning_rate": 6.337972599087857e-05,
"loss": 2.007956123352051,
"mean_token_accuracy": 0.5707302153110504,
"num_tokens": 25288925.0,
"step": 3680
},
{
"entropy": 1.9364651441574097,
"epoch": 0.4444712117562033,
"grad_norm": 1.953125,
"learning_rate": 6.318769642243245e-05,
"loss": 1.9320077896118164,
"mean_token_accuracy": 0.5831962287425995,
"num_tokens": 25357070.0,
"step": 3690
},
{
"entropy": 1.9276177763938904,
"epoch": 0.44567574078535294,
"grad_norm": 2.625,
"learning_rate": 6.299545756105209e-05,
"loss": 1.9602447509765626,
"mean_token_accuracy": 0.5792842030525207,
"num_tokens": 25426661.0,
"step": 3700
},
{
"epoch": 0.44567574078535294,
"eval_entropy": 2.525130867958069,
"eval_loss": 2.6856164932250977,
"eval_mean_token_accuracy": 0.46145734190940857,
"eval_num_tokens": 25426661.0,
"eval_runtime": 0.4119,
"eval_samples_per_second": 38.84,
"eval_steps_per_second": 4.855,
"step": 3700
},
{
"entropy": 1.9861109852790833,
"epoch": 0.4468802698145025,
"grad_norm": 2.390625,
"learning_rate": 6.280301245762929e-05,
"loss": 1.9737564086914063,
"mean_token_accuracy": 0.5805804789066314,
"num_tokens": 25493623.0,
"step": 3710
},
{
"entropy": 2.0100439190864563,
"epoch": 0.44808479884365215,
"grad_norm": 2.078125,
"learning_rate": 6.261036416632906e-05,
"loss": 2.036077117919922,
"mean_token_accuracy": 0.5688071370124816,
"num_tokens": 25564579.0,
"step": 3720
},
{
"entropy": 2.0535164713859557,
"epoch": 0.44928932787280174,
"grad_norm": 1.671875,
"learning_rate": 6.241751574454098e-05,
"loss": 2.054555320739746,
"mean_token_accuracy": 0.565549087524414,
"num_tokens": 25633048.0,
"step": 3730
},
{
"entropy": 2.027570879459381,
"epoch": 0.4504938569019513,
"grad_norm": 1.734375,
"learning_rate": 6.222447025283082e-05,
"loss": 2.04293270111084,
"mean_token_accuracy": 0.5656031697988511,
"num_tokens": 25702607.0,
"step": 3740
},
{
"entropy": 2.0254459023475646,
"epoch": 0.45169838593110095,
"grad_norm": 1.7734375,
"learning_rate": 6.203123075489191e-05,
"loss": 2.023523139953613,
"mean_token_accuracy": 0.5712070286273956,
"num_tokens": 25771055.0,
"step": 3750
},
{
"entropy": 1.8324352741241454,
"epoch": 0.45290291496025054,
"grad_norm": 1.890625,
"learning_rate": 6.183780031749649e-05,
"loss": 1.8483991622924805,
"mean_token_accuracy": 0.5993020892143249,
"num_tokens": 25837460.0,
"step": 3760
},
{
"entropy": 1.9764248132705688,
"epoch": 0.4541074439894001,
"grad_norm": 1.734375,
"learning_rate": 6.164418201044709e-05,
"loss": 1.9906301498413086,
"mean_token_accuracy": 0.5752259314060211,
"num_tokens": 25908374.0,
"step": 3770
},
{
"entropy": 2.0057262182235718,
"epoch": 0.45531197301854975,
"grad_norm": 3.0,
"learning_rate": 6.145037890652777e-05,
"loss": 2.0180091857910156,
"mean_token_accuracy": 0.5674101829528808,
"num_tokens": 25976856.0,
"step": 3780
},
{
"entropy": 2.0065826296806337,
"epoch": 0.45651650204769934,
"grad_norm": 1.7734375,
"learning_rate": 6.125639408145545e-05,
"loss": 2.0031196594238283,
"mean_token_accuracy": 0.5769789397716523,
"num_tokens": 26044133.0,
"step": 3790
},
{
"entropy": 2.0009281516075133,
"epoch": 0.45772103107684897,
"grad_norm": 2.0625,
"learning_rate": 6.106223061383093e-05,
"loss": 2.026101303100586,
"mean_token_accuracy": 0.5688745856285096,
"num_tokens": 26113453.0,
"step": 3800
},
{
"epoch": 0.45772103107684897,
"eval_entropy": 2.4917021989822388,
"eval_loss": 2.6710691452026367,
"eval_mean_token_accuracy": 0.45515669882297516,
"eval_num_tokens": 26113453.0,
"eval_runtime": 0.5084,
"eval_samples_per_second": 31.474,
"eval_steps_per_second": 3.934,
"step": 3800
},
{
"entropy": 1.9381158471107482,
"epoch": 0.45892556010599855,
"grad_norm": 2.25,
"learning_rate": 6.0867891585090166e-05,
"loss": 1.9284444808959962,
"mean_token_accuracy": 0.5864272952079773,
"num_tokens": 26184767.0,
"step": 3810
},
{
"entropy": 1.9594375610351562,
"epoch": 0.46013008913514813,
"grad_norm": 1.75,
"learning_rate": 6.067338007945531e-05,
"loss": 1.9852970123291016,
"mean_token_accuracy": 0.5773785710334778,
"num_tokens": 26254484.0,
"step": 3820
},
{
"entropy": 1.9799617648124694,
"epoch": 0.46133461816429777,
"grad_norm": 1.9765625,
"learning_rate": 6.04786991838858e-05,
"loss": 1.9897350311279296,
"mean_token_accuracy": 0.5737548470497131,
"num_tokens": 26324694.0,
"step": 3830
},
{
"entropy": 1.9124466061592102,
"epoch": 0.46253914719344735,
"grad_norm": 1.796875,
"learning_rate": 6.028385198802935e-05,
"loss": 1.9315099716186523,
"mean_token_accuracy": 0.5868638217449188,
"num_tokens": 26393506.0,
"step": 3840
},
{
"entropy": 1.9165910959243775,
"epoch": 0.463743676222597,
"grad_norm": 1.6640625,
"learning_rate": 6.008884158417285e-05,
"loss": 1.9327356338500976,
"mean_token_accuracy": 0.5816730856895447,
"num_tokens": 26463539.0,
"step": 3850
},
{
"entropy": 1.947556233406067,
"epoch": 0.46494820525174657,
"grad_norm": 1.8828125,
"learning_rate": 5.989367106719342e-05,
"loss": 1.9421436309814453,
"mean_token_accuracy": 0.5770792663097382,
"num_tokens": 26533658.0,
"step": 3860
},
{
"entropy": 1.9143320441246032,
"epoch": 0.46615273428089615,
"grad_norm": 1.84375,
"learning_rate": 5.9698343534509206e-05,
"loss": 1.9342830657958985,
"mean_token_accuracy": 0.5813970685005188,
"num_tokens": 26603082.0,
"step": 3870
},
{
"entropy": 1.9642464518547058,
"epoch": 0.4673572633100458,
"grad_norm": 1.8046875,
"learning_rate": 5.9502862086030255e-05,
"loss": 1.967822265625,
"mean_token_accuracy": 0.5837168216705322,
"num_tokens": 26670867.0,
"step": 3880
},
{
"entropy": 2.036736857891083,
"epoch": 0.46856179233919537,
"grad_norm": 1.6171875,
"learning_rate": 5.930722982410928e-05,
"loss": 2.0509645462036135,
"mean_token_accuracy": 0.564321780204773,
"num_tokens": 26739725.0,
"step": 3890
},
{
"entropy": 1.9816170811653138,
"epoch": 0.46976632136834495,
"grad_norm": 4.0,
"learning_rate": 5.911144985349245e-05,
"loss": 1.9887372970581054,
"mean_token_accuracy": 0.5746063709259033,
"num_tokens": 26805376.0,
"step": 3900
},
{
"epoch": 0.46976632136834495,
"eval_entropy": 2.439876079559326,
"eval_loss": 2.6857075691223145,
"eval_mean_token_accuracy": 0.45507559180259705,
"eval_num_tokens": 26805376.0,
"eval_runtime": 0.5052,
"eval_samples_per_second": 31.67,
"eval_steps_per_second": 3.959,
"step": 3900
},
{
"entropy": 1.9576510548591615,
"epoch": 0.4709708503974946,
"grad_norm": 2.234375,
"learning_rate": 5.891552528127015e-05,
"loss": 1.9680488586425782,
"mean_token_accuracy": 0.5752190470695495,
"num_tokens": 26874810.0,
"step": 3910
},
{
"entropy": 1.9396398186683654,
"epoch": 0.47217537942664417,
"grad_norm": 2.015625,
"learning_rate": 5.871945921682762e-05,
"loss": 1.9553556442260742,
"mean_token_accuracy": 0.5795332670211792,
"num_tokens": 26947352.0,
"step": 3920
},
{
"entropy": 1.981644630432129,
"epoch": 0.4733799084557938,
"grad_norm": 1.9453125,
"learning_rate": 5.8523254771795635e-05,
"loss": 2.001514434814453,
"mean_token_accuracy": 0.5716517567634583,
"num_tokens": 27016298.0,
"step": 3930
},
{
"entropy": 2.0159306645393373,
"epoch": 0.4745844374849434,
"grad_norm": 1.921875,
"learning_rate": 5.8326915060001076e-05,
"loss": 2.0092771530151365,
"mean_token_accuracy": 0.5699867486953736,
"num_tokens": 27087211.0,
"step": 3940
},
{
"entropy": 1.8825679063796996,
"epoch": 0.47578896651409297,
"grad_norm": 1.96875,
"learning_rate": 5.81304431974176e-05,
"loss": 1.9066276550292969,
"mean_token_accuracy": 0.5872237503528595,
"num_tokens": 27155504.0,
"step": 3950
},
{
"entropy": 2.0146188974380492,
"epoch": 0.4769934955432426,
"grad_norm": 2.34375,
"learning_rate": 5.793384230211611e-05,
"loss": 2.0064815521240233,
"mean_token_accuracy": 0.5711189568042755,
"num_tokens": 27221744.0,
"step": 3960
},
{
"entropy": 1.9668913722038268,
"epoch": 0.4781980245723922,
"grad_norm": 1.8125,
"learning_rate": 5.7737115494215353e-05,
"loss": 1.993480110168457,
"mean_token_accuracy": 0.5775595366954803,
"num_tokens": 27288564.0,
"step": 3970
},
{
"entropy": 1.9702805399894714,
"epoch": 0.47940255360154177,
"grad_norm": 1.8203125,
"learning_rate": 5.754026589583224e-05,
"loss": 1.9741327285766601,
"mean_token_accuracy": 0.5786505699157715,
"num_tokens": 27353144.0,
"step": 3980
},
{
"entropy": 1.8980592966079712,
"epoch": 0.4806070826306914,
"grad_norm": 1.921875,
"learning_rate": 5.734329663103252e-05,
"loss": 1.9262420654296875,
"mean_token_accuracy": 0.5835274875164032,
"num_tokens": 27420105.0,
"step": 3990
},
{
"entropy": 1.9941051006317139,
"epoch": 0.481811611659841,
"grad_norm": 1.859375,
"learning_rate": 5.7146210825781e-05,
"loss": 1.980280303955078,
"mean_token_accuracy": 0.577644807100296,
"num_tokens": 27486818.0,
"step": 4000
},
{
"epoch": 0.481811611659841,
"eval_entropy": 2.42073392868042,
"eval_loss": 2.6700968742370605,
"eval_mean_token_accuracy": 0.4570968747138977,
"eval_num_tokens": 27486818.0,
"eval_runtime": 0.5345,
"eval_samples_per_second": 29.933,
"eval_steps_per_second": 3.742,
"step": 4000
},
{
"entropy": 1.917995858192444,
"epoch": 0.4830161406889906,
"grad_norm": 2.375,
"learning_rate": 5.694901160789209e-05,
"loss": 1.9306228637695313,
"mean_token_accuracy": 0.5779500126838684,
"num_tokens": 27555652.0,
"step": 4010
},
{
"entropy": 1.9785538196563721,
"epoch": 0.4842206697181402,
"grad_norm": 2.578125,
"learning_rate": 5.6751702106980044e-05,
"loss": 2.0008047103881834,
"mean_token_accuracy": 0.5703692853450775,
"num_tokens": 27623554.0,
"step": 4020
},
{
"entropy": 1.9732365250587462,
"epoch": 0.4854251987472898,
"grad_norm": 1.9765625,
"learning_rate": 5.655428545440936e-05,
"loss": 1.9656476974487305,
"mean_token_accuracy": 0.5797830998897553,
"num_tokens": 27692863.0,
"step": 4030
},
{
"entropy": 1.9249773025512695,
"epoch": 0.4866297277764394,
"grad_norm": 2.4375,
"learning_rate": 5.6356764783245075e-05,
"loss": 1.9510303497314454,
"mean_token_accuracy": 0.5802127003669739,
"num_tokens": 27761499.0,
"step": 4040
},
{
"entropy": 2.0097981095314026,
"epoch": 0.487834256805589,
"grad_norm": 1.609375,
"learning_rate": 5.6159143228203016e-05,
"loss": 1.999547576904297,
"mean_token_accuracy": 0.5743436753749848,
"num_tokens": 27829573.0,
"step": 4050
},
{
"entropy": 1.8779245853424071,
"epoch": 0.48903878583473864,
"grad_norm": 2.421875,
"learning_rate": 5.59614239256001e-05,
"loss": 1.9048309326171875,
"mean_token_accuracy": 0.5902513444423676,
"num_tokens": 27897536.0,
"step": 4060
},
{
"entropy": 2.007873523235321,
"epoch": 0.4902433148638882,
"grad_norm": 1.7578125,
"learning_rate": 5.576361001330451e-05,
"loss": 2.0190666198730467,
"mean_token_accuracy": 0.5717886984348297,
"num_tokens": 27965109.0,
"step": 4070
},
{
"entropy": 2.008230710029602,
"epoch": 0.4914478438930378,
"grad_norm": 2.125,
"learning_rate": 5.5565704630685886e-05,
"loss": 2.011882209777832,
"mean_token_accuracy": 0.5669187545776367,
"num_tokens": 28031616.0,
"step": 4080
},
{
"entropy": 1.9069723725318908,
"epoch": 0.49265237292218744,
"grad_norm": 2.171875,
"learning_rate": 5.536771091856559e-05,
"loss": 1.9020198822021483,
"mean_token_accuracy": 0.5873600125312806,
"num_tokens": 28099184.0,
"step": 4090
},
{
"entropy": 1.9309583067893983,
"epoch": 0.493856901951337,
"grad_norm": 2.625,
"learning_rate": 5.516963201916674e-05,
"loss": 1.967962646484375,
"mean_token_accuracy": 0.5801371574401856,
"num_tokens": 28169833.0,
"step": 4100
},
{
"epoch": 0.493856901951337,
"eval_entropy": 2.4127514362335205,
"eval_loss": 2.6565656661987305,
"eval_mean_token_accuracy": 0.45418770611286163,
"eval_num_tokens": 28169833.0,
"eval_runtime": 0.3983,
"eval_samples_per_second": 40.173,
"eval_steps_per_second": 5.022,
"step": 4100
},
{
"entropy": 1.9569150805473328,
"epoch": 0.4950614309804866,
"grad_norm": 1.6328125,
"learning_rate": 5.4971471076064475e-05,
"loss": 1.9444543838500976,
"mean_token_accuracy": 0.5801873207092285,
"num_tokens": 28237930.0,
"step": 4110
},
{
"entropy": 1.8962748527526856,
"epoch": 0.49626596000963624,
"grad_norm": 2.671875,
"learning_rate": 5.4773231234135916e-05,
"loss": 1.9189611434936524,
"mean_token_accuracy": 0.5859391510486602,
"num_tokens": 28306591.0,
"step": 4120
},
{
"entropy": 1.9929224729537964,
"epoch": 0.4974704890387858,
"grad_norm": 5.40625,
"learning_rate": 5.457491563951037e-05,
"loss": 2.0031759262084963,
"mean_token_accuracy": 0.57361621260643,
"num_tokens": 28376098.0,
"step": 4130
},
{
"entropy": 2.015347754955292,
"epoch": 0.49867501806793546,
"grad_norm": 1.84375,
"learning_rate": 5.4376527439519376e-05,
"loss": 2.0239486694335938,
"mean_token_accuracy": 0.5746548473834991,
"num_tokens": 28443069.0,
"step": 4140
},
{
"entropy": 1.9625335097312928,
"epoch": 0.49987954709708504,
"grad_norm": 2.671875,
"learning_rate": 5.417806978264673e-05,
"loss": 1.9675872802734375,
"mean_token_accuracy": 0.5780832052230835,
"num_tokens": 28512775.0,
"step": 4150
},
{
"entropy": 1.9938682675361634,
"epoch": 0.5010840761262346,
"grad_norm": 2.625,
"learning_rate": 5.397954581847855e-05,
"loss": 1.9934600830078124,
"mean_token_accuracy": 0.5703189671039581,
"num_tokens": 28579579.0,
"step": 4160
},
{
"entropy": 1.9511204719543458,
"epoch": 0.5022886051553842,
"grad_norm": 1.6875,
"learning_rate": 5.378095869765323e-05,
"loss": 1.9708211898803711,
"mean_token_accuracy": 0.5744782328605652,
"num_tokens": 28646564.0,
"step": 4170
},
{
"entropy": 1.9872597217559815,
"epoch": 0.5034931341845339,
"grad_norm": 1.90625,
"learning_rate": 5.358231157181149e-05,
"loss": 1.9806531906127929,
"mean_token_accuracy": 0.5780558466911316,
"num_tokens": 28716562.0,
"step": 4180
},
{
"entropy": 1.9043650150299072,
"epoch": 0.5046976632136835,
"grad_norm": 2.25,
"learning_rate": 5.338360759354639e-05,
"loss": 1.921070671081543,
"mean_token_accuracy": 0.5862109303474426,
"num_tokens": 28782253.0,
"step": 4190
},
{
"entropy": 1.9266673088073731,
"epoch": 0.5059021922428331,
"grad_norm": 1.703125,
"learning_rate": 5.318484991635323e-05,
"loss": 1.9321710586547851,
"mean_token_accuracy": 0.5806420803070068,
"num_tokens": 28853279.0,
"step": 4200
},
{
"epoch": 0.5059021922428331,
"eval_entropy": 2.398097038269043,
"eval_loss": 2.66256046295166,
"eval_mean_token_accuracy": 0.4587104022502899,
"eval_num_tokens": 28853279.0,
"eval_runtime": 0.4258,
"eval_samples_per_second": 37.58,
"eval_steps_per_second": 4.698,
"step": 4200
},
{
"entropy": 1.9215782046318055,
"epoch": 0.5071067212719826,
"grad_norm": 1.546875,
"learning_rate": 5.29860416945795e-05,
"loss": 1.927357292175293,
"mean_token_accuracy": 0.588092303276062,
"num_tokens": 28921713.0,
"step": 4210
},
{
"entropy": 1.890907645225525,
"epoch": 0.5083112503011322,
"grad_norm": 1.765625,
"learning_rate": 5.278718608337489e-05,
"loss": 1.911885643005371,
"mean_token_accuracy": 0.5910214900970459,
"num_tokens": 28991493.0,
"step": 4220
},
{
"entropy": 1.9491180300712585,
"epoch": 0.5095157793302819,
"grad_norm": 2.59375,
"learning_rate": 5.2588286238641146e-05,
"loss": 1.9617496490478517,
"mean_token_accuracy": 0.579128873348236,
"num_tokens": 29059104.0,
"step": 4230
},
{
"entropy": 2.0040785312652587,
"epoch": 0.5107203083594315,
"grad_norm": 2.015625,
"learning_rate": 5.238934531698206e-05,
"loss": 2.024494171142578,
"mean_token_accuracy": 0.5720134794712066,
"num_tokens": 29126675.0,
"step": 4240
},
{
"entropy": 1.9998656392097474,
"epoch": 0.5119248373885811,
"grad_norm": 2.140625,
"learning_rate": 5.21903664756533e-05,
"loss": 2.0020214080810548,
"mean_token_accuracy": 0.5735575735569001,
"num_tokens": 29195874.0,
"step": 4250
},
{
"entropy": 1.95628924369812,
"epoch": 0.5131293664177307,
"grad_norm": 3.25,
"learning_rate": 5.199135287251229e-05,
"loss": 1.9562814712524415,
"mean_token_accuracy": 0.585600209236145,
"num_tokens": 29262468.0,
"step": 4260
},
{
"entropy": 1.9958741307258605,
"epoch": 0.5143338954468802,
"grad_norm": 3.125,
"learning_rate": 5.1792307665968184e-05,
"loss": 2.0193979263305666,
"mean_token_accuracy": 0.5735933542251587,
"num_tokens": 29328578.0,
"step": 4270
},
{
"entropy": 1.9700656414031983,
"epoch": 0.5155384244760298,
"grad_norm": 1.6796875,
"learning_rate": 5.15932340149317e-05,
"loss": 1.963376808166504,
"mean_token_accuracy": 0.5765927374362946,
"num_tokens": 29395680.0,
"step": 4280
},
{
"entropy": 1.9745967507362365,
"epoch": 0.5167429535051795,
"grad_norm": 2.140625,
"learning_rate": 5.139413507876495e-05,
"loss": 1.9744836807250976,
"mean_token_accuracy": 0.5818488031625748,
"num_tokens": 29463513.0,
"step": 4290
},
{
"entropy": 1.9099359273910523,
"epoch": 0.5179474825343291,
"grad_norm": 2.40625,
"learning_rate": 5.1195014017231346e-05,
"loss": 1.9302894592285156,
"mean_token_accuracy": 0.5796979129314422,
"num_tokens": 29533253.0,
"step": 4300
},
{
"epoch": 0.5179474825343291,
"eval_entropy": 2.387773633003235,
"eval_loss": 2.6579198837280273,
"eval_mean_token_accuracy": 0.459276020526886,
"eval_num_tokens": 29533253.0,
"eval_runtime": 0.3914,
"eval_samples_per_second": 40.882,
"eval_steps_per_second": 5.11,
"step": 4300
},
{
"entropy": 2.002520430088043,
"epoch": 0.5191520115634787,
"grad_norm": 3.078125,
"learning_rate": 5.099587399044542e-05,
"loss": 2.0029012680053713,
"mean_token_accuracy": 0.5740825355052948,
"num_tokens": 29603758.0,
"step": 4310
},
{
"entropy": 1.9531593322753906,
"epoch": 0.5203565405926283,
"grad_norm": 2.171875,
"learning_rate": 5.0796718158822686e-05,
"loss": 1.9717971801757812,
"mean_token_accuracy": 0.5774181842803955,
"num_tokens": 29672871.0,
"step": 4320
},
{
"entropy": 1.9271060705184937,
"epoch": 0.5215610696217778,
"grad_norm": 1.9765625,
"learning_rate": 5.059754968302953e-05,
"loss": 1.9339466094970703,
"mean_token_accuracy": 0.5867386102676392,
"num_tokens": 29743463.0,
"step": 4330
},
{
"entropy": 1.918910849094391,
"epoch": 0.5227655986509275,
"grad_norm": 2.125,
"learning_rate": 5.039837172393297e-05,
"loss": 1.9289640426635741,
"mean_token_accuracy": 0.5850243151187897,
"num_tokens": 29812178.0,
"step": 4340
},
{
"entropy": 1.936630415916443,
"epoch": 0.5239701276800771,
"grad_norm": 1.7109375,
"learning_rate": 5.01991874425505e-05,
"loss": 1.945786476135254,
"mean_token_accuracy": 0.5822522580623627,
"num_tokens": 29881488.0,
"step": 4350
},
{
"entropy": 1.9208004593849182,
"epoch": 0.5251746567092267,
"grad_norm": 2.265625,
"learning_rate": 5e-05,
"loss": 1.9185230255126953,
"mean_token_accuracy": 0.5845731794834137,
"num_tokens": 29950167.0,
"step": 4360
},
{
"entropy": 1.9142470717430116,
"epoch": 0.5263791857383763,
"grad_norm": 1.6796875,
"learning_rate": 4.980081255744951e-05,
"loss": 1.944696044921875,
"mean_token_accuracy": 0.5828209400177002,
"num_tokens": 30018932.0,
"step": 4370
},
{
"entropy": 1.9780327558517456,
"epoch": 0.5275837147675259,
"grad_norm": 2.203125,
"learning_rate": 4.9601628276067044e-05,
"loss": 1.9682683944702148,
"mean_token_accuracy": 0.5764804124832154,
"num_tokens": 30087597.0,
"step": 4380
},
{
"entropy": 1.9604371428489684,
"epoch": 0.5287882437966756,
"grad_norm": 1.78125,
"learning_rate": 4.940245031697047e-05,
"loss": 1.9709638595581054,
"mean_token_accuracy": 0.5788785338401794,
"num_tokens": 30157904.0,
"step": 4390
},
{
"entropy": 1.850344479084015,
"epoch": 0.5299927728258251,
"grad_norm": 2.40625,
"learning_rate": 4.920328184117731e-05,
"loss": 1.858159065246582,
"mean_token_accuracy": 0.5930382966995239,
"num_tokens": 30228125.0,
"step": 4400
},
{
"epoch": 0.5299927728258251,
"eval_entropy": 2.3644343614578247,
"eval_loss": 2.6451668739318848,
"eval_mean_token_accuracy": 0.46767687797546387,
"eval_num_tokens": 30228125.0,
"eval_runtime": 0.4767,
"eval_samples_per_second": 33.567,
"eval_steps_per_second": 4.196,
"step": 4400
},
{
"entropy": 1.911184024810791,
"epoch": 0.5311973018549747,
"grad_norm": 2.09375,
"learning_rate": 4.9004126009554605e-05,
"loss": 1.928396987915039,
"mean_token_accuracy": 0.5842796444892884,
"num_tokens": 30297819.0,
"step": 4410
},
{
"entropy": 1.9067941188812256,
"epoch": 0.5324018308841243,
"grad_norm": 1.953125,
"learning_rate": 4.880498598276867e-05,
"loss": 1.940384864807129,
"mean_token_accuracy": 0.5868594408035278,
"num_tokens": 30363867.0,
"step": 4420
},
{
"entropy": 2.005669319629669,
"epoch": 0.5336063599132739,
"grad_norm": 1.921875,
"learning_rate": 4.860586492123506e-05,
"loss": 1.997481346130371,
"mean_token_accuracy": 0.5730924725532531,
"num_tokens": 30428552.0,
"step": 4430
},
{
"entropy": 1.8843210101127625,
"epoch": 0.5348108889424236,
"grad_norm": 2.9375,
"learning_rate": 4.8406765985068306e-05,
"loss": 1.9016788482666016,
"mean_token_accuracy": 0.5884897232055664,
"num_tokens": 30499828.0,
"step": 4440
},
{
"entropy": 1.9235470294952393,
"epoch": 0.5360154179715731,
"grad_norm": 1.8515625,
"learning_rate": 4.820769233403182e-05,
"loss": 1.9310134887695312,
"mean_token_accuracy": 0.5902485966682434,
"num_tokens": 30570663.0,
"step": 4450
},
{
"entropy": 2.002137005329132,
"epoch": 0.5372199470007227,
"grad_norm": 1.6640625,
"learning_rate": 4.800864712748773e-05,
"loss": 2.013439178466797,
"mean_token_accuracy": 0.5734929382801056,
"num_tokens": 30639768.0,
"step": 4460
},
{
"entropy": 1.9583630681037902,
"epoch": 0.5384244760298723,
"grad_norm": 2.078125,
"learning_rate": 4.7809633524346714e-05,
"loss": 1.9639848709106444,
"mean_token_accuracy": 0.581292986869812,
"num_tokens": 30709439.0,
"step": 4470
},
{
"entropy": 1.918102788925171,
"epoch": 0.5396290050590219,
"grad_norm": 1.671875,
"learning_rate": 4.7610654683017935e-05,
"loss": 1.93857421875,
"mean_token_accuracy": 0.5793308973312378,
"num_tokens": 30778575.0,
"step": 4480
},
{
"entropy": 2.008819043636322,
"epoch": 0.5408335340881715,
"grad_norm": 2.5625,
"learning_rate": 4.741171376135885e-05,
"loss": 2.0007816314697267,
"mean_token_accuracy": 0.5684065818786621,
"num_tokens": 30849170.0,
"step": 4490
},
{
"entropy": 1.9800901770591737,
"epoch": 0.5420380631173212,
"grad_norm": 2.46875,
"learning_rate": 4.721281391662513e-05,
"loss": 1.9896121978759767,
"mean_token_accuracy": 0.578561270236969,
"num_tokens": 30920009.0,
"step": 4500
},
{
"epoch": 0.5420380631173212,
"eval_entropy": 2.363973617553711,
"eval_loss": 2.643784523010254,
"eval_mean_token_accuracy": 0.46993932127952576,
"eval_num_tokens": 30920009.0,
"eval_runtime": 0.4236,
"eval_samples_per_second": 37.772,
"eval_steps_per_second": 4.722,
"step": 4500
},
{
"entropy": 1.9288830518722535,
"epoch": 0.5432425921464707,
"grad_norm": 1.859375,
"learning_rate": 4.701395830542052e-05,
"loss": 1.9468812942504883,
"mean_token_accuracy": 0.5802593350410461,
"num_tokens": 30988375.0,
"step": 4510
},
{
"entropy": 1.976506495475769,
"epoch": 0.5444471211756203,
"grad_norm": 2.21875,
"learning_rate": 4.681515008364679e-05,
"loss": 1.9802528381347657,
"mean_token_accuracy": 0.5758872270584107,
"num_tokens": 31060076.0,
"step": 4520
},
{
"entropy": 1.9826183795928956,
"epoch": 0.5456516502047699,
"grad_norm": 2.234375,
"learning_rate": 4.661639240645362e-05,
"loss": 1.9977703094482422,
"mean_token_accuracy": 0.5760325610637664,
"num_tokens": 31129178.0,
"step": 4530
},
{
"entropy": 1.920491099357605,
"epoch": 0.5468561792339195,
"grad_norm": 1.75,
"learning_rate": 4.641768842818852e-05,
"loss": 1.9400859832763673,
"mean_token_accuracy": 0.5816328704357148,
"num_tokens": 31195811.0,
"step": 4540
},
{
"entropy": 1.9592141389846802,
"epoch": 0.5480607082630692,
"grad_norm": 5.59375,
"learning_rate": 4.621904130234678e-05,
"loss": 1.9684932708740235,
"mean_token_accuracy": 0.5793033003807068,
"num_tokens": 31269666.0,
"step": 4550
},
{
"entropy": 1.9467454433441163,
"epoch": 0.5492652372922188,
"grad_norm": 4.03125,
"learning_rate": 4.6020454181521456e-05,
"loss": 1.9361564636230468,
"mean_token_accuracy": 0.5842253565788269,
"num_tokens": 31334999.0,
"step": 4560
},
{
"entropy": 1.9928619265556335,
"epoch": 0.5504697663213683,
"grad_norm": 1.96875,
"learning_rate": 4.582193021735327e-05,
"loss": 2.0216325759887694,
"mean_token_accuracy": 0.5709339559078217,
"num_tokens": 31400897.0,
"step": 4570
},
{
"entropy": 1.9685459375381469,
"epoch": 0.5516742953505179,
"grad_norm": 1.6328125,
"learning_rate": 4.562347256048062e-05,
"loss": 1.9661026000976562,
"mean_token_accuracy": 0.5800518572330475,
"num_tokens": 31463610.0,
"step": 4580
},
{
"entropy": 1.8976296782493591,
"epoch": 0.5528788243796675,
"grad_norm": 1.8125,
"learning_rate": 4.542508436048964e-05,
"loss": 1.901803970336914,
"mean_token_accuracy": 0.5900285243988037,
"num_tokens": 31531288.0,
"step": 4590
},
{
"entropy": 1.8951862573623657,
"epoch": 0.5540833534088172,
"grad_norm": 1.9375,
"learning_rate": 4.5226768765864116e-05,
"loss": 1.9238201141357423,
"mean_token_accuracy": 0.5814629673957825,
"num_tokens": 31600252.0,
"step": 4600
},
{
"epoch": 0.5540833534088172,
"eval_entropy": 2.4053670167922974,
"eval_loss": 2.6405816078186035,
"eval_mean_token_accuracy": 0.46436651051044464,
"eval_num_tokens": 31600252.0,
"eval_runtime": 0.4015,
"eval_samples_per_second": 39.851,
"eval_steps_per_second": 4.981,
"step": 4600
},
{
"entropy": 1.9277650356292724,
"epoch": 0.5552878824379668,
"grad_norm": 2.0,
"learning_rate": 4.502852892393555e-05,
"loss": 1.9219816207885743,
"mean_token_accuracy": 0.5852508783340454,
"num_tokens": 31669817.0,
"step": 4610
},
{
"entropy": 1.8948925614356995,
"epoch": 0.5564924114671164,
"grad_norm": 1.8984375,
"learning_rate": 4.483036798083327e-05,
"loss": 1.9002313613891602,
"mean_token_accuracy": 0.5929959654808045,
"num_tokens": 31736160.0,
"step": 4620
},
{
"entropy": 2.036375272274017,
"epoch": 0.557696940496266,
"grad_norm": 2.625,
"learning_rate": 4.4632289081434425e-05,
"loss": 2.051458168029785,
"mean_token_accuracy": 0.5620020091533661,
"num_tokens": 31803863.0,
"step": 4630
},
{
"entropy": 1.998021125793457,
"epoch": 0.5589014695254155,
"grad_norm": 1.859375,
"learning_rate": 4.443429536931412e-05,
"loss": 2.0050621032714844,
"mean_token_accuracy": 0.5741972327232361,
"num_tokens": 31872612.0,
"step": 4640
},
{
"entropy": 1.9433504939079285,
"epoch": 0.5601059985545651,
"grad_norm": 2.015625,
"learning_rate": 4.4236389986695506e-05,
"loss": 1.9620183944702148,
"mean_token_accuracy": 0.5792094707489014,
"num_tokens": 31942349.0,
"step": 4650
},
{
"entropy": 1.8521189928054809,
"epoch": 0.5613105275837148,
"grad_norm": 2.59375,
"learning_rate": 4.40385760743999e-05,
"loss": 1.8720680236816407,
"mean_token_accuracy": 0.596925801038742,
"num_tokens": 32012514.0,
"step": 4660
},
{
"entropy": 1.8461775422096252,
"epoch": 0.5625150566128644,
"grad_norm": 1.7109375,
"learning_rate": 4.384085677179698e-05,
"loss": 1.8293455123901368,
"mean_token_accuracy": 0.599132776260376,
"num_tokens": 32078200.0,
"step": 4670
},
{
"entropy": 1.8661665320396423,
"epoch": 0.563719585642014,
"grad_norm": 2.828125,
"learning_rate": 4.3643235216754937e-05,
"loss": 1.900370216369629,
"mean_token_accuracy": 0.5908865690231323,
"num_tokens": 32146391.0,
"step": 4680
},
{
"entropy": 1.9501501083374024,
"epoch": 0.5649241146711635,
"grad_norm": 1.9609375,
"learning_rate": 4.344571454559066e-05,
"loss": 1.9497032165527344,
"mean_token_accuracy": 0.5843803644180298,
"num_tokens": 32215263.0,
"step": 4690
},
{
"entropy": 1.999093735218048,
"epoch": 0.5661286437003131,
"grad_norm": 1.8046875,
"learning_rate": 4.3248297893019974e-05,
"loss": 1.9907657623291015,
"mean_token_accuracy": 0.5720624268054962,
"num_tokens": 32286557.0,
"step": 4700
},
{
"epoch": 0.5661286437003131,
"eval_entropy": 2.3873685598373413,
"eval_loss": 2.64916729927063,
"eval_mean_token_accuracy": 0.4610539674758911,
"eval_num_tokens": 32286557.0,
"eval_runtime": 0.3893,
"eval_samples_per_second": 41.1,
"eval_steps_per_second": 5.137,
"step": 4700
},
{
"entropy": 1.8912575364112854,
"epoch": 0.5673331727294628,
"grad_norm": 2.078125,
"learning_rate": 4.305098839210793e-05,
"loss": 1.9253583908081056,
"mean_token_accuracy": 0.5839666426181793,
"num_tokens": 32353787.0,
"step": 4710
},
{
"entropy": 2.0117050647735595,
"epoch": 0.5685377017586124,
"grad_norm": 1.90625,
"learning_rate": 4.285378917421901e-05,
"loss": 2.027020263671875,
"mean_token_accuracy": 0.5696452796459198,
"num_tokens": 32422276.0,
"step": 4720
},
{
"entropy": 1.9269362688064575,
"epoch": 0.569742230787762,
"grad_norm": 1.8359375,
"learning_rate": 4.26567033689675e-05,
"loss": 1.9298131942749024,
"mean_token_accuracy": 0.5858283340930939,
"num_tokens": 32489865.0,
"step": 4730
},
{
"entropy": 1.9381520748138428,
"epoch": 0.5709467598169116,
"grad_norm": 1.9609375,
"learning_rate": 4.245973410416776e-05,
"loss": 1.9504453659057617,
"mean_token_accuracy": 0.5801201462745667,
"num_tokens": 32558311.0,
"step": 4740
},
{
"entropy": 1.957834541797638,
"epoch": 0.5721512888460611,
"grad_norm": 1.734375,
"learning_rate": 4.226288450578466e-05,
"loss": 1.9595251083374023,
"mean_token_accuracy": 0.5809877216815948,
"num_tokens": 32628771.0,
"step": 4750
},
{
"entropy": 2.0325303077697754,
"epoch": 0.5733558178752108,
"grad_norm": 2.15625,
"learning_rate": 4.206615769788388e-05,
"loss": 2.0349349975585938,
"mean_token_accuracy": 0.5715503364801406,
"num_tokens": 32696002.0,
"step": 4760
},
{
"entropy": 1.9599291920661925,
"epoch": 0.5745603469043604,
"grad_norm": 2.75,
"learning_rate": 4.18695568025824e-05,
"loss": 1.9857137680053711,
"mean_token_accuracy": 0.5762530505657196,
"num_tokens": 32765409.0,
"step": 4770
},
{
"entropy": 2.011521780490875,
"epoch": 0.57576487593351,
"grad_norm": 1.828125,
"learning_rate": 4.167308493999895e-05,
"loss": 2.0376073837280275,
"mean_token_accuracy": 0.5651960968971252,
"num_tokens": 32836052.0,
"step": 4780
},
{
"entropy": 1.9228283405303954,
"epoch": 0.5769694049626596,
"grad_norm": 2.15625,
"learning_rate": 4.1476745228204396e-05,
"loss": 1.912071418762207,
"mean_token_accuracy": 0.5910592377185822,
"num_tokens": 32905194.0,
"step": 4790
},
{
"entropy": 1.865368616580963,
"epoch": 0.5781739339918092,
"grad_norm": 2.4375,
"learning_rate": 4.12805407831724e-05,
"loss": 1.891912078857422,
"mean_token_accuracy": 0.5888952136039733,
"num_tokens": 32972498.0,
"step": 4800
},
{
"epoch": 0.5781739339918092,
"eval_entropy": 2.3830796480178833,
"eval_loss": 2.640810489654541,
"eval_mean_token_accuracy": 0.46129730343818665,
"eval_num_tokens": 32972498.0,
"eval_runtime": 0.4087,
"eval_samples_per_second": 39.153,
"eval_steps_per_second": 4.894,
"step": 4800
},
{
"entropy": 1.894623613357544,
"epoch": 0.5793784630209589,
"grad_norm": 1.640625,
"learning_rate": 4.1084474718729856e-05,
"loss": 1.8948657989501954,
"mean_token_accuracy": 0.590268349647522,
"num_tokens": 33045640.0,
"step": 4810
},
{
"entropy": 1.9580816626548767,
"epoch": 0.5805829920501084,
"grad_norm": 1.765625,
"learning_rate": 4.0888550146507565e-05,
"loss": 1.9551275253295899,
"mean_token_accuracy": 0.5806404232978821,
"num_tokens": 33113472.0,
"step": 4820
},
{
"entropy": 2.0462084293365477,
"epoch": 0.581787521079258,
"grad_norm": 2.1875,
"learning_rate": 4.069277017589074e-05,
"loss": 2.055453872680664,
"mean_token_accuracy": 0.5649996995925903,
"num_tokens": 33184222.0,
"step": 4830
},
{
"entropy": 1.9837548017501831,
"epoch": 0.5829920501084076,
"grad_norm": 1.796875,
"learning_rate": 4.0497137913969757e-05,
"loss": 1.9946985244750977,
"mean_token_accuracy": 0.5758336961269379,
"num_tokens": 33253556.0,
"step": 4840
},
{
"entropy": 1.9759496331214905,
"epoch": 0.5841965791375572,
"grad_norm": 3.015625,
"learning_rate": 4.030165646549079e-05,
"loss": 1.9906793594360352,
"mean_token_accuracy": 0.5723823666572571,
"num_tokens": 33322391.0,
"step": 4850
},
{
"entropy": 1.8935376286506653,
"epoch": 0.5854011081667068,
"grad_norm": 2.375,
"learning_rate": 4.010632893280659e-05,
"loss": 1.8841629028320312,
"mean_token_accuracy": 0.5903885960578918,
"num_tokens": 33390481.0,
"step": 4860
},
{
"entropy": 1.981140172481537,
"epoch": 0.5866056371958565,
"grad_norm": 1.984375,
"learning_rate": 3.991115841582718e-05,
"loss": 1.981198501586914,
"mean_token_accuracy": 0.5745481014251709,
"num_tokens": 33458697.0,
"step": 4870
},
{
"entropy": 1.9192126035690307,
"epoch": 0.587810166225006,
"grad_norm": 1.625,
"learning_rate": 3.971614801197068e-05,
"loss": 1.9621315002441406,
"mean_token_accuracy": 0.5829608976840973,
"num_tokens": 33528205.0,
"step": 4880
},
{
"entropy": 1.8959569573402404,
"epoch": 0.5890146952541556,
"grad_norm": 1.7890625,
"learning_rate": 3.95213008161142e-05,
"loss": 1.8824783325195313,
"mean_token_accuracy": 0.591416871547699,
"num_tokens": 33597857.0,
"step": 4890
},
{
"entropy": 1.9513839960098267,
"epoch": 0.5902192242833052,
"grad_norm": 2.015625,
"learning_rate": 3.9326619920544696e-05,
"loss": 1.9795387268066407,
"mean_token_accuracy": 0.5807128429412842,
"num_tokens": 33666832.0,
"step": 4900
},
{
"epoch": 0.5902192242833052,
"eval_entropy": 2.3947250843048096,
"eval_loss": 2.6315090656280518,
"eval_mean_token_accuracy": 0.46670788526535034,
"eval_num_tokens": 33666832.0,
"eval_runtime": 0.4129,
"eval_samples_per_second": 38.749,
"eval_steps_per_second": 4.844,
"step": 4900
},
{
"entropy": 1.9754149913787842,
"epoch": 0.5914237533124548,
"grad_norm": 1.765625,
"learning_rate": 3.9132108414909846e-05,
"loss": 1.9650136947631835,
"mean_token_accuracy": 0.5761143267154694,
"num_tokens": 33738439.0,
"step": 4910
},
{
"entropy": 1.9517736196517945,
"epoch": 0.5926282823416045,
"grad_norm": 2.484375,
"learning_rate": 3.893776938616908e-05,
"loss": 1.9693525314331055,
"mean_token_accuracy": 0.5792171418666839,
"num_tokens": 33808674.0,
"step": 4920
},
{
"entropy": 1.9107809901237487,
"epoch": 0.593832811370754,
"grad_norm": 2.0625,
"learning_rate": 3.874360591854456e-05,
"loss": 1.9263221740722656,
"mean_token_accuracy": 0.5864075660705567,
"num_tokens": 33876011.0,
"step": 4930
},
{
"entropy": 1.9395878314971924,
"epoch": 0.5950373403999036,
"grad_norm": 2.078125,
"learning_rate": 3.8549621093472225e-05,
"loss": 1.9504684448242187,
"mean_token_accuracy": 0.5788490891456604,
"num_tokens": 33943521.0,
"step": 4940
},
{
"entropy": 1.9085129261016847,
"epoch": 0.5962418694290532,
"grad_norm": 2.328125,
"learning_rate": 3.8355817989552925e-05,
"loss": 1.9079933166503906,
"mean_token_accuracy": 0.5859108805656433,
"num_tokens": 34011166.0,
"step": 4950
},
{
"entropy": 1.9190776228904725,
"epoch": 0.5974463984582028,
"grad_norm": 2.125,
"learning_rate": 3.816219968250354e-05,
"loss": 1.9361600875854492,
"mean_token_accuracy": 0.5844547927379609,
"num_tokens": 34081520.0,
"step": 4960
},
{
"entropy": 1.9144928336143494,
"epoch": 0.5986509274873525,
"grad_norm": 2.359375,
"learning_rate": 3.7968769245108116e-05,
"loss": 1.9301385879516602,
"mean_token_accuracy": 0.5838036835193634,
"num_tokens": 34150349.0,
"step": 4970
},
{
"entropy": 1.9991060137748717,
"epoch": 0.5998554565165021,
"grad_norm": 1.9296875,
"learning_rate": 3.777552974716919e-05,
"loss": 2.0122554779052733,
"mean_token_accuracy": 0.5668920397758483,
"num_tokens": 34221884.0,
"step": 4980
},
{
"entropy": 1.9388365507125855,
"epoch": 0.6010599855456517,
"grad_norm": 1.6484375,
"learning_rate": 3.7582484255459036e-05,
"loss": 1.943490219116211,
"mean_token_accuracy": 0.581670206785202,
"num_tokens": 34290048.0,
"step": 4990
},
{
"entropy": 1.9693029046058654,
"epoch": 0.6022645145748012,
"grad_norm": 4.5,
"learning_rate": 3.7389635833670956e-05,
"loss": 1.9767236709594727,
"mean_token_accuracy": 0.576853483915329,
"num_tokens": 34358309.0,
"step": 5000
},
{
"epoch": 0.6022645145748012,
"eval_entropy": 2.418100595474243,
"eval_loss": 2.630772113800049,
"eval_mean_token_accuracy": 0.4687269777059555,
"eval_num_tokens": 34358309.0,
"eval_runtime": 0.3895,
"eval_samples_per_second": 41.083,
"eval_steps_per_second": 5.135,
"step": 5000
},
{
"entropy": 1.9612276196479796,
"epoch": 0.6034690436039508,
"grad_norm": 1.7421875,
"learning_rate": 3.719698754237071e-05,
"loss": 1.9557125091552734,
"mean_token_accuracy": 0.5791131734848023,
"num_tokens": 34425037.0,
"step": 5010
},
{
"entropy": 1.9336675524711608,
"epoch": 0.6046735726331005,
"grad_norm": 2.453125,
"learning_rate": 3.700454243894792e-05,
"loss": 1.9650556564331054,
"mean_token_accuracy": 0.579778116941452,
"num_tokens": 34489640.0,
"step": 5020
},
{
"entropy": 1.9100453734397889,
"epoch": 0.6058781016622501,
"grad_norm": 1.8515625,
"learning_rate": 3.681230357756755e-05,
"loss": 1.9068069458007812,
"mean_token_accuracy": 0.58942751288414,
"num_tokens": 34558251.0,
"step": 5030
},
{
"entropy": 1.8939356923103332,
"epoch": 0.6070826306913997,
"grad_norm": 2.140625,
"learning_rate": 3.662027400912144e-05,
"loss": 1.8894643783569336,
"mean_token_accuracy": 0.5893211364746094,
"num_tokens": 34627206.0,
"step": 5040
},
{
"entropy": 1.8718860745429993,
"epoch": 0.6082871597205493,
"grad_norm": 1.96875,
"learning_rate": 3.642845678117989e-05,
"loss": 1.8915393829345704,
"mean_token_accuracy": 0.5894035339355469,
"num_tokens": 34698945.0,
"step": 5050
},
{
"entropy": 1.8748759269714355,
"epoch": 0.6094916887496988,
"grad_norm": 1.828125,
"learning_rate": 3.6236854937943265e-05,
"loss": 1.8840362548828125,
"mean_token_accuracy": 0.595303225517273,
"num_tokens": 34767145.0,
"step": 5060
},
{
"entropy": 1.9592718839645387,
"epoch": 0.6106962177788484,
"grad_norm": 1.90625,
"learning_rate": 3.604547152019373e-05,
"loss": 1.957082176208496,
"mean_token_accuracy": 0.5802127182483673,
"num_tokens": 34837448.0,
"step": 5070
},
{
"entropy": 1.9220492243766785,
"epoch": 0.6119007468079981,
"grad_norm": 2.109375,
"learning_rate": 3.5854309565246964e-05,
"loss": 1.932563591003418,
"mean_token_accuracy": 0.5790705144405365,
"num_tokens": 34908463.0,
"step": 5080
},
{
"entropy": 1.9812603712081909,
"epoch": 0.6131052758371477,
"grad_norm": 2.390625,
"learning_rate": 3.5663372106903945e-05,
"loss": 1.991549301147461,
"mean_token_accuracy": 0.5749566674232482,
"num_tokens": 34978001.0,
"step": 5090
},
{
"entropy": 1.8598801732063293,
"epoch": 0.6143098048662973,
"grad_norm": 1.6640625,
"learning_rate": 3.547266217540285e-05,
"loss": 1.8648416519165039,
"mean_token_accuracy": 0.5984355032444,
"num_tokens": 35044073.0,
"step": 5100
},
{
"epoch": 0.6143098048662973,
"eval_entropy": 2.3398125171661377,
"eval_loss": 2.62454891204834,
"eval_mean_token_accuracy": 0.46985819935798645,
"eval_num_tokens": 35044073.0,
"eval_runtime": 0.3855,
"eval_samples_per_second": 41.507,
"eval_steps_per_second": 5.188,
"step": 5100
},
{
"entropy": 1.8979187965393067,
"epoch": 0.6155143338954469,
"grad_norm": 1.875,
"learning_rate": 3.52821827973709e-05,
"loss": 1.929935836791992,
"mean_token_accuracy": 0.5857858061790466,
"num_tokens": 35113684.0,
"step": 5110
},
{
"entropy": 1.9798847556114196,
"epoch": 0.6167188629245964,
"grad_norm": 1.7578125,
"learning_rate": 3.509193699577638e-05,
"loss": 1.963231086730957,
"mean_token_accuracy": 0.5840341687202454,
"num_tokens": 35180721.0,
"step": 5120
},
{
"entropy": 1.8245866298675537,
"epoch": 0.6179233919537461,
"grad_norm": 1.9765625,
"learning_rate": 3.490192778988063e-05,
"loss": 1.8163463592529296,
"mean_token_accuracy": 0.6064060211181641,
"num_tokens": 35250848.0,
"step": 5130
},
{
"entropy": 1.9118698000907899,
"epoch": 0.6191279209828957,
"grad_norm": 1.75,
"learning_rate": 3.4712158195190145e-05,
"loss": 1.9413307189941407,
"mean_token_accuracy": 0.5859278023242951,
"num_tokens": 35321431.0,
"step": 5140
},
{
"entropy": 1.8520223259925843,
"epoch": 0.6203324500120453,
"grad_norm": 1.6171875,
"learning_rate": 3.452263122340873e-05,
"loss": 1.8534076690673829,
"mean_token_accuracy": 0.5987428843975067,
"num_tokens": 35393047.0,
"step": 5150
},
{
"entropy": 1.9185335278511046,
"epoch": 0.6215369790411949,
"grad_norm": 1.578125,
"learning_rate": 3.433334988238966e-05,
"loss": 1.9247661590576173,
"mean_token_accuracy": 0.5864275336265564,
"num_tokens": 35458684.0,
"step": 5160
},
{
"entropy": 1.930286693572998,
"epoch": 0.6227415080703445,
"grad_norm": 1.8125,
"learning_rate": 3.414431717608798e-05,
"loss": 1.9481643676757812,
"mean_token_accuracy": 0.5829527914524079,
"num_tokens": 35525629.0,
"step": 5170
},
{
"entropy": 1.9276029348373414,
"epoch": 0.6239460370994941,
"grad_norm": 2.765625,
"learning_rate": 3.395553610451284e-05,
"loss": 1.9277130126953126,
"mean_token_accuracy": 0.5824755191802978,
"num_tokens": 35595976.0,
"step": 5180
},
{
"entropy": 1.9220746278762817,
"epoch": 0.6251505661286437,
"grad_norm": 2.375,
"learning_rate": 3.376700966367985e-05,
"loss": 1.925217056274414,
"mean_token_accuracy": 0.5840397000312805,
"num_tokens": 35665532.0,
"step": 5190
},
{
"entropy": 1.8797456979751588,
"epoch": 0.6263550951577933,
"grad_norm": 1.8203125,
"learning_rate": 3.3578740845563555e-05,
"loss": 1.9007396697998047,
"mean_token_accuracy": 0.5888063788414002,
"num_tokens": 35737061.0,
"step": 5200
},
{
"epoch": 0.6263550951577933,
"eval_entropy": 2.3753281831741333,
"eval_loss": 2.6204867362976074,
"eval_mean_token_accuracy": 0.46662676334381104,
"eval_num_tokens": 35737061.0,
"eval_runtime": 0.4077,
"eval_samples_per_second": 39.244,
"eval_steps_per_second": 4.905,
"step": 5200
},
{
"entropy": 1.8981397032737732,
"epoch": 0.6275596241869429,
"grad_norm": 1.8671875,
"learning_rate": 3.339073263804994e-05,
"loss": 1.9124277114868165,
"mean_token_accuracy": 0.5864745378494263,
"num_tokens": 35804675.0,
"step": 5210
},
{
"entropy": 1.976684045791626,
"epoch": 0.6287641532160925,
"grad_norm": 1.7890625,
"learning_rate": 3.320298802488903e-05,
"loss": 1.9907527923583985,
"mean_token_accuracy": 0.5714164018630982,
"num_tokens": 35877161.0,
"step": 5220
},
{
"entropy": 1.8927918910980224,
"epoch": 0.6299686822452422,
"grad_norm": 1.7890625,
"learning_rate": 3.301550998564751e-05,
"loss": 1.8990289688110351,
"mean_token_accuracy": 0.5901903450489044,
"num_tokens": 35948270.0,
"step": 5230
},
{
"entropy": 1.9140401601791381,
"epoch": 0.6311732112743917,
"grad_norm": 1.8359375,
"learning_rate": 3.2828301495661456e-05,
"loss": 1.9242591857910156,
"mean_token_accuracy": 0.5829783380031586,
"num_tokens": 36017155.0,
"step": 5240
},
{
"entropy": 1.890995192527771,
"epoch": 0.6323777403035413,
"grad_norm": 2.5,
"learning_rate": 3.264136552598911e-05,
"loss": 1.8821306228637695,
"mean_token_accuracy": 0.5897811651229858,
"num_tokens": 36086123.0,
"step": 5250
},
{
"entropy": 1.8787309169769286,
"epoch": 0.6335822693326909,
"grad_norm": 2.59375,
"learning_rate": 3.245470504336374e-05,
"loss": 1.885854721069336,
"mean_token_accuracy": 0.5903471231460571,
"num_tokens": 36155392.0,
"step": 5260
},
{
"entropy": 1.8674795508384705,
"epoch": 0.6347867983618405,
"grad_norm": 2.453125,
"learning_rate": 3.2268323010146533e-05,
"loss": 1.8879039764404297,
"mean_token_accuracy": 0.5887195408344269,
"num_tokens": 36227815.0,
"step": 5270
},
{
"entropy": 1.9036988496780396,
"epoch": 0.6359913273909901,
"grad_norm": 2.234375,
"learning_rate": 3.2082222384279606e-05,
"loss": 1.9163215637207032,
"mean_token_accuracy": 0.5897344052791595,
"num_tokens": 36295653.0,
"step": 5280
},
{
"entropy": 1.8928561806678772,
"epoch": 0.6371958564201398,
"grad_norm": 1.7421875,
"learning_rate": 3.1896406119239056e-05,
"loss": 1.8859382629394532,
"mean_token_accuracy": 0.5898880422115326,
"num_tokens": 36365568.0,
"step": 5290
},
{
"entropy": 1.905668354034424,
"epoch": 0.6384003854492893,
"grad_norm": 1.59375,
"learning_rate": 3.171087716398806e-05,
"loss": 1.9199764251708984,
"mean_token_accuracy": 0.5883520424365998,
"num_tokens": 36435763.0,
"step": 5300
},
{
"epoch": 0.6384003854492893,
"eval_entropy": 2.3645377159118652,
"eval_loss": 2.616696834564209,
"eval_mean_token_accuracy": 0.46832360327243805,
"eval_num_tokens": 36435763.0,
"eval_runtime": 0.4223,
"eval_samples_per_second": 37.889,
"eval_steps_per_second": 4.736,
"step": 5300
},
{
"entropy": 1.907051146030426,
"epoch": 0.6396049144784389,
"grad_norm": 1.9375,
"learning_rate": 3.1525638462930115e-05,
"loss": 1.9220357894897462,
"mean_token_accuracy": 0.5886389434337616,
"num_tokens": 36502342.0,
"step": 5310
},
{
"entropy": 1.9605424404144287,
"epoch": 0.6408094435075885,
"grad_norm": 2.1875,
"learning_rate": 3.13406929558623e-05,
"loss": 1.9678442001342773,
"mean_token_accuracy": 0.578427666425705,
"num_tokens": 36573941.0,
"step": 5320
},
{
"entropy": 1.953044855594635,
"epoch": 0.6420139725367381,
"grad_norm": 2.09375,
"learning_rate": 3.115604357792861e-05,
"loss": 1.958037757873535,
"mean_token_accuracy": 0.5746124386787415,
"num_tokens": 36642724.0,
"step": 5330
},
{
"entropy": 1.8768929600715638,
"epoch": 0.6432185015658878,
"grad_norm": 1.75,
"learning_rate": 3.097169325957334e-05,
"loss": 1.8723608016967774,
"mean_token_accuracy": 0.5956127345561981,
"num_tokens": 36712778.0,
"step": 5340
},
{
"entropy": 1.841334068775177,
"epoch": 0.6444230305950374,
"grad_norm": 2.21875,
"learning_rate": 3.078764492649466e-05,
"loss": 1.8471950531005858,
"mean_token_accuracy": 0.5986435294151307,
"num_tokens": 36780809.0,
"step": 5350
},
{
"entropy": 1.9197816848754883,
"epoch": 0.6456275596241869,
"grad_norm": 1.9609375,
"learning_rate": 3.060390149959812e-05,
"loss": 1.9349895477294923,
"mean_token_accuracy": 0.5787651658058166,
"num_tokens": 36850234.0,
"step": 5360
},
{
"entropy": 1.929334855079651,
"epoch": 0.6468320886533365,
"grad_norm": 2.296875,
"learning_rate": 3.0420465894950308e-05,
"loss": 1.9481956481933593,
"mean_token_accuracy": 0.5793615520000458,
"num_tokens": 36919525.0,
"step": 5370
},
{
"entropy": 1.9005924344062806,
"epoch": 0.6480366176824861,
"grad_norm": 1.984375,
"learning_rate": 3.023734102373258e-05,
"loss": 1.9027469635009766,
"mean_token_accuracy": 0.5923004388809204,
"num_tokens": 36984363.0,
"step": 5380
},
{
"entropy": 1.9164150595664977,
"epoch": 0.6492411467116358,
"grad_norm": 2.546875,
"learning_rate": 3.0054529792194853e-05,
"loss": 1.9091907501220704,
"mean_token_accuracy": 0.5910762488842011,
"num_tokens": 37054526.0,
"step": 5390
},
{
"entropy": 1.9193613052368164,
"epoch": 0.6504456757407854,
"grad_norm": 2.4375,
"learning_rate": 2.9872035101609487e-05,
"loss": 1.9336360931396483,
"mean_token_accuracy": 0.580650019645691,
"num_tokens": 37121669.0,
"step": 5400
},
{
"epoch": 0.6504456757407854,
"eval_entropy": 2.350887656211853,
"eval_loss": 2.6190919876098633,
"eval_mean_token_accuracy": 0.4688080996274948,
"eval_num_tokens": 37121669.0,
"eval_runtime": 0.4531,
"eval_samples_per_second": 35.313,
"eval_steps_per_second": 4.414,
"step": 5400
},
{
"entropy": 1.977675235271454,
"epoch": 0.651650204769935,
"grad_norm": 1.890625,
"learning_rate": 2.968985984822522e-05,
"loss": 2.0003679275512694,
"mean_token_accuracy": 0.5737912714481354,
"num_tokens": 37186062.0,
"step": 5410
},
{
"entropy": 1.9420138478279114,
"epoch": 0.6528547337990845,
"grad_norm": 1.7578125,
"learning_rate": 2.9508006923221266e-05,
"loss": 1.9501548767089845,
"mean_token_accuracy": 0.5838521778583526,
"num_tokens": 37256065.0,
"step": 5420
},
{
"entropy": 1.9069531440734864,
"epoch": 0.6540592628282341,
"grad_norm": 1.734375,
"learning_rate": 2.9326479212661306e-05,
"loss": 1.9059297561645507,
"mean_token_accuracy": 0.5911825299263,
"num_tokens": 37324821.0,
"step": 5430
},
{
"entropy": 1.893869972229004,
"epoch": 0.6552637918573838,
"grad_norm": 1.7109375,
"learning_rate": 2.9145279597447828e-05,
"loss": 1.9049043655395508,
"mean_token_accuracy": 0.5932099878787994,
"num_tokens": 37390790.0,
"step": 5440
},
{
"entropy": 1.88713858127594,
"epoch": 0.6564683208865334,
"grad_norm": 1.78125,
"learning_rate": 2.896441095327632e-05,
"loss": 1.902921485900879,
"mean_token_accuracy": 0.5844146311283112,
"num_tokens": 37460500.0,
"step": 5450
},
{
"entropy": 1.8918554067611695,
"epoch": 0.657672849915683,
"grad_norm": 2.3125,
"learning_rate": 2.8783876150589683e-05,
"loss": 1.9041791915893556,
"mean_token_accuracy": 0.5867816925048828,
"num_tokens": 37528938.0,
"step": 5460
},
{
"entropy": 1.8805498123168944,
"epoch": 0.6588773789448326,
"grad_norm": 1.7734375,
"learning_rate": 2.860367805453258e-05,
"loss": 1.8810306549072267,
"mean_token_accuracy": 0.59105264544487,
"num_tokens": 37600188.0,
"step": 5470
},
{
"entropy": 1.9223363876342774,
"epoch": 0.6600819079739821,
"grad_norm": 1.734375,
"learning_rate": 2.8423819524906127e-05,
"loss": 1.9341358184814452,
"mean_token_accuracy": 0.5785215318202972,
"num_tokens": 37669493.0,
"step": 5480
},
{
"entropy": 1.9478025794029237,
"epoch": 0.6612864370031317,
"grad_norm": 2.078125,
"learning_rate": 2.8244303416122315e-05,
"loss": 1.9668249130249023,
"mean_token_accuracy": 0.5811723470687866,
"num_tokens": 37738632.0,
"step": 5490
},
{
"entropy": 1.9327033162117004,
"epoch": 0.6624909660322814,
"grad_norm": 2.046875,
"learning_rate": 2.8065132577158893e-05,
"loss": 1.9399887084960938,
"mean_token_accuracy": 0.5822966039180756,
"num_tokens": 37808296.0,
"step": 5500
},
{
"epoch": 0.6624909660322814,
"eval_entropy": 2.36837637424469,
"eval_loss": 2.610164165496826,
"eval_mean_token_accuracy": 0.4664645344018936,
"eval_num_tokens": 37808296.0,
"eval_runtime": 0.4178,
"eval_samples_per_second": 38.293,
"eval_steps_per_second": 4.787,
"step": 5500
},
{
"entropy": 1.9694913983345033,
"epoch": 0.663695495061431,
"grad_norm": 2.421875,
"learning_rate": 2.7886309851513988e-05,
"loss": 1.9779542922973632,
"mean_token_accuracy": 0.5814646899700164,
"num_tokens": 37876068.0,
"step": 5510
},
{
"entropy": 1.8879098653793336,
"epoch": 0.6649000240905806,
"grad_norm": 1.859375,
"learning_rate": 2.7707838077161164e-05,
"loss": 1.8773269653320312,
"mean_token_accuracy": 0.5935602962970734,
"num_tokens": 37946972.0,
"step": 5520
},
{
"entropy": 1.89464613199234,
"epoch": 0.6661045531197302,
"grad_norm": 1.671875,
"learning_rate": 2.7529720086504124e-05,
"loss": 1.9013668060302735,
"mean_token_accuracy": 0.5858488619327545,
"num_tokens": 38011397.0,
"step": 5530
},
{
"entropy": 1.9149715423583984,
"epoch": 0.6673090821488797,
"grad_norm": 1.75,
"learning_rate": 2.7351958706332047e-05,
"loss": 1.9140022277832032,
"mean_token_accuracy": 0.5807339906692505,
"num_tokens": 38080836.0,
"step": 5540
},
{
"entropy": 1.881326472759247,
"epoch": 0.6685136111780294,
"grad_norm": 1.71875,
"learning_rate": 2.7174556757774562e-05,
"loss": 1.8995676040649414,
"mean_token_accuracy": 0.587762427330017,
"num_tokens": 38151280.0,
"step": 5550
},
{
"entropy": 2.0221534729003907,
"epoch": 0.669718140207179,
"grad_norm": 2.0625,
"learning_rate": 2.6997517056256937e-05,
"loss": 2.0501468658447264,
"mean_token_accuracy": 0.5636834293603897,
"num_tokens": 38219291.0,
"step": 5560
},
{
"entropy": 1.9183872818946839,
"epoch": 0.6709226692363286,
"grad_norm": 1.859375,
"learning_rate": 2.682084241145556e-05,
"loss": 1.9107559204101563,
"mean_token_accuracy": 0.5877353131771088,
"num_tokens": 38286544.0,
"step": 5570
},
{
"entropy": 1.9487929224967957,
"epoch": 0.6721271982654782,
"grad_norm": 4.03125,
"learning_rate": 2.6644535627253157e-05,
"loss": 1.9566732406616212,
"mean_token_accuracy": 0.5794759750366211,
"num_tokens": 38357801.0,
"step": 5580
},
{
"entropy": 1.8050852179527284,
"epoch": 0.6733317272946278,
"grad_norm": 2.0625,
"learning_rate": 2.646859950169448e-05,
"loss": 1.7916004180908203,
"mean_token_accuracy": 0.60407754778862,
"num_tokens": 38429515.0,
"step": 5590
},
{
"entropy": 1.9309733152389525,
"epoch": 0.6745362563237775,
"grad_norm": 2.234375,
"learning_rate": 2.629303682694173e-05,
"loss": 1.9596979141235351,
"mean_token_accuracy": 0.5794607996940613,
"num_tokens": 38496132.0,
"step": 5600
},
{
"epoch": 0.6745362563237775,
"eval_entropy": 2.3521742820739746,
"eval_loss": 2.6118619441986084,
"eval_mean_token_accuracy": 0.4638798236846924,
"eval_num_tokens": 38496132.0,
"eval_runtime": 0.4978,
"eval_samples_per_second": 32.143,
"eval_steps_per_second": 4.018,
"step": 5600
},
{
"entropy": 1.9450265645980835,
"epoch": 0.675740785352927,
"grad_norm": 1.7421875,
"learning_rate": 2.611785038923042e-05,
"loss": 1.9604305267333983,
"mean_token_accuracy": 0.5766765594482421,
"num_tokens": 38565467.0,
"step": 5610
},
{
"entropy": 1.8920610785484313,
"epoch": 0.6769453143820766,
"grad_norm": 1.8203125,
"learning_rate": 2.594304296882492e-05,
"loss": 1.8906587600708007,
"mean_token_accuracy": 0.5891460299491882,
"num_tokens": 38636572.0,
"step": 5620
},
{
"entropy": 1.909081017971039,
"epoch": 0.6781498434112262,
"grad_norm": 2.4375,
"learning_rate": 2.5768617339974606e-05,
"loss": 1.9260461807250977,
"mean_token_accuracy": 0.5860518753528595,
"num_tokens": 38707209.0,
"step": 5630
},
{
"entropy": 1.9273210883140564,
"epoch": 0.6793543724403758,
"grad_norm": 1.765625,
"learning_rate": 2.5594576270869663e-05,
"loss": 1.9551538467407226,
"mean_token_accuracy": 0.5829238772392273,
"num_tokens": 38776194.0,
"step": 5640
},
{
"entropy": 1.9627973318099976,
"epoch": 0.6805589014695255,
"grad_norm": 1.8359375,
"learning_rate": 2.5420922523597156e-05,
"loss": 1.9638336181640625,
"mean_token_accuracy": 0.5761943578720092,
"num_tokens": 38847307.0,
"step": 5650
},
{
"entropy": 1.896808135509491,
"epoch": 0.681763430498675,
"grad_norm": 1.625,
"learning_rate": 2.5247658854097277e-05,
"loss": 1.898649024963379,
"mean_token_accuracy": 0.5869676113128662,
"num_tokens": 38916748.0,
"step": 5660
},
{
"entropy": 1.9466437339782714,
"epoch": 0.6829679595278246,
"grad_norm": 2.609375,
"learning_rate": 2.507478801211951e-05,
"loss": 1.9477453231811523,
"mean_token_accuracy": 0.5810700714588165,
"num_tokens": 38982369.0,
"step": 5670
},
{
"entropy": 1.925716495513916,
"epoch": 0.6841724885569742,
"grad_norm": 1.9375,
"learning_rate": 2.4902312741179108e-05,
"loss": 1.9344860076904298,
"mean_token_accuracy": 0.5786227762699128,
"num_tokens": 39052521.0,
"step": 5680
},
{
"entropy": 1.9241721272468566,
"epoch": 0.6853770175861238,
"grad_norm": 2.015625,
"learning_rate": 2.4730235778513394e-05,
"loss": 1.9304174423217773,
"mean_token_accuracy": 0.5819700300693512,
"num_tokens": 39116120.0,
"step": 5690
},
{
"entropy": 1.9030420541763307,
"epoch": 0.6865815466152734,
"grad_norm": 2.15625,
"learning_rate": 2.4558559855038537e-05,
"loss": 1.9098997116088867,
"mean_token_accuracy": 0.5879886984825134,
"num_tokens": 39185492.0,
"step": 5700
},
{
"epoch": 0.6865815466152734,
"eval_entropy": 2.36625599861145,
"eval_loss": 2.6117119789123535,
"eval_mean_token_accuracy": 0.46299195289611816,
"eval_num_tokens": 39185492.0,
"eval_runtime": 0.5422,
"eval_samples_per_second": 29.509,
"eval_steps_per_second": 3.689,
"step": 5700
},
{
"entropy": 1.9261456251144409,
"epoch": 0.6877860756444231,
"grad_norm": 1.9375,
"learning_rate": 2.438728769530593e-05,
"loss": 1.9247291564941407,
"mean_token_accuracy": 0.5803986310958862,
"num_tokens": 39249701.0,
"step": 5710
},
{
"entropy": 1.8991306185722352,
"epoch": 0.6889906046735726,
"grad_norm": 1.828125,
"learning_rate": 2.4216422017459234e-05,
"loss": 1.930657958984375,
"mean_token_accuracy": 0.58206005692482,
"num_tokens": 39319024.0,
"step": 5720
},
{
"entropy": 1.9198801875114442,
"epoch": 0.6901951337027222,
"grad_norm": 1.8671875,
"learning_rate": 2.4045965533191083e-05,
"loss": 1.9231132507324218,
"mean_token_accuracy": 0.5801682949066163,
"num_tokens": 39389627.0,
"step": 5730
},
{
"entropy": 1.8921023964881898,
"epoch": 0.6913996627318718,
"grad_norm": 1.921875,
"learning_rate": 2.3875920947700032e-05,
"loss": 1.886610221862793,
"mean_token_accuracy": 0.5911743700504303,
"num_tokens": 39459243.0,
"step": 5740
},
{
"entropy": 2.019665813446045,
"epoch": 0.6926041917610214,
"grad_norm": 1.734375,
"learning_rate": 2.3706290959647742e-05,
"loss": 2.031690216064453,
"mean_token_accuracy": 0.568154114484787,
"num_tokens": 39531010.0,
"step": 5750
},
{
"entropy": 1.8260622501373291,
"epoch": 0.6938087207901711,
"grad_norm": 2.421875,
"learning_rate": 2.3537078261116007e-05,
"loss": 1.8358779907226563,
"mean_token_accuracy": 0.6017170667648315,
"num_tokens": 39601384.0,
"step": 5760
},
{
"entropy": 1.82235267162323,
"epoch": 0.6950132498193207,
"grad_norm": 1.6328125,
"learning_rate": 2.336828553756418e-05,
"loss": 1.8454484939575195,
"mean_token_accuracy": 0.5994343221187591,
"num_tokens": 39675047.0,
"step": 5770
},
{
"entropy": 1.953169822692871,
"epoch": 0.6962177788484702,
"grad_norm": 2.46875,
"learning_rate": 2.3199915467786402e-05,
"loss": 1.9636747360229492,
"mean_token_accuracy": 0.5822606801986694,
"num_tokens": 39742957.0,
"step": 5780
},
{
"entropy": 1.9094646334648133,
"epoch": 0.6974223078776198,
"grad_norm": 1.7734375,
"learning_rate": 2.303197072386926e-05,
"loss": 1.924036407470703,
"mean_token_accuracy": 0.5854589581489563,
"num_tokens": 39814717.0,
"step": 5790
},
{
"entropy": 1.9667348742485047,
"epoch": 0.6986268369067694,
"grad_norm": 1.7109375,
"learning_rate": 2.286445397114914e-05,
"loss": 1.9706661224365234,
"mean_token_accuracy": 0.5856462627649307,
"num_tokens": 39878350.0,
"step": 5800
},
{
"epoch": 0.6986268369067694,
"eval_entropy": 2.358152151107788,
"eval_loss": 2.612285852432251,
"eval_mean_token_accuracy": 0.4662233889102936,
"eval_num_tokens": 39878350.0,
"eval_runtime": 0.47,
"eval_samples_per_second": 34.044,
"eval_steps_per_second": 4.255,
"step": 5800
},
{
"entropy": 1.862882673740387,
"epoch": 0.6998313659359191,
"grad_norm": 2.171875,
"learning_rate": 2.2697367868170204e-05,
"loss": 1.8640838623046876,
"mean_token_accuracy": 0.5986731469631195,
"num_tokens": 39948246.0,
"step": 5810
},
{
"entropy": 1.8730922937393188,
"epoch": 0.7010358949650687,
"grad_norm": 2.359375,
"learning_rate": 2.2530715066642034e-05,
"loss": 1.8938676834106445,
"mean_token_accuracy": 0.5908694744110108,
"num_tokens": 40016822.0,
"step": 5820
},
{
"entropy": 1.8263341665267945,
"epoch": 0.7022404239942183,
"grad_norm": 2.078125,
"learning_rate": 2.2364498211397523e-05,
"loss": 1.821687889099121,
"mean_token_accuracy": 0.600709855556488,
"num_tokens": 40085243.0,
"step": 5830
},
{
"entropy": 1.863125741481781,
"epoch": 0.7034449530233678,
"grad_norm": 1.921875,
"learning_rate": 2.2198719940351048e-05,
"loss": 1.8795415878295898,
"mean_token_accuracy": 0.5911260426044465,
"num_tokens": 40151875.0,
"step": 5840
},
{
"entropy": 1.90743248462677,
"epoch": 0.7046494820525174,
"grad_norm": 1.7265625,
"learning_rate": 2.203338288445642e-05,
"loss": 1.9162694931030273,
"mean_token_accuracy": 0.5857582867145539,
"num_tokens": 40222136.0,
"step": 5850
},
{
"entropy": 1.9515150785446167,
"epoch": 0.7058540110816671,
"grad_norm": 2.25,
"learning_rate": 2.1868489667665314e-05,
"loss": 1.9747983932495117,
"mean_token_accuracy": 0.5775373160839081,
"num_tokens": 40293874.0,
"step": 5860
},
{
"entropy": 1.9025539755821228,
"epoch": 0.7070585401108167,
"grad_norm": 1.7578125,
"learning_rate": 2.1704042906885457e-05,
"loss": 1.9145645141601562,
"mean_token_accuracy": 0.5868270993232727,
"num_tokens": 40360415.0,
"step": 5870
},
{
"entropy": 1.872169315814972,
"epoch": 0.7082630691399663,
"grad_norm": 2.640625,
"learning_rate": 2.154004521193925e-05,
"loss": 1.8736785888671874,
"mean_token_accuracy": 0.5887367010116578,
"num_tokens": 40430002.0,
"step": 5880
},
{
"entropy": 1.9155991435050965,
"epoch": 0.7094675981691159,
"grad_norm": 2.125,
"learning_rate": 2.1376499185522237e-05,
"loss": 1.9274738311767579,
"mean_token_accuracy": 0.5839588642120361,
"num_tokens": 40500557.0,
"step": 5890
},
{
"entropy": 1.9240437746047974,
"epoch": 0.7106721271982654,
"grad_norm": 1.7734375,
"learning_rate": 2.1213407423161812e-05,
"loss": 1.9209692001342773,
"mean_token_accuracy": 0.5875469863414764,
"num_tokens": 40566716.0,
"step": 5900
},
{
"epoch": 0.7106721271982654,
"eval_entropy": 2.3489553928375244,
"eval_loss": 2.6125404834747314,
"eval_mean_token_accuracy": 0.4673546105623245,
"eval_num_tokens": 40566716.0,
"eval_runtime": 0.5215,
"eval_samples_per_second": 30.678,
"eval_steps_per_second": 3.835,
"step": 5900
},
{
"entropy": 1.9260069489479066,
"epoch": 0.711876656227415,
"grad_norm": 1.71875,
"learning_rate": 2.1050772513176133e-05,
"loss": 1.922018051147461,
"mean_token_accuracy": 0.5860312581062317,
"num_tokens": 40633145.0,
"step": 5910
},
{
"entropy": 1.909291636943817,
"epoch": 0.7130811852565647,
"grad_norm": 1.5859375,
"learning_rate": 2.0888597036632874e-05,
"loss": 1.905712890625,
"mean_token_accuracy": 0.5870055973529815,
"num_tokens": 40700932.0,
"step": 5920
},
{
"entropy": 1.9476830124855042,
"epoch": 0.7142857142857143,
"grad_norm": 2.03125,
"learning_rate": 2.072688356730844e-05,
"loss": 1.970412826538086,
"mean_token_accuracy": 0.5819882094860077,
"num_tokens": 40771411.0,
"step": 5930
},
{
"entropy": 1.826759159564972,
"epoch": 0.7154902433148639,
"grad_norm": 1.875,
"learning_rate": 2.056563467164696e-05,
"loss": 1.8389589309692382,
"mean_token_accuracy": 0.6018831849098205,
"num_tokens": 40840010.0,
"step": 5940
},
{
"entropy": 1.8843806028366088,
"epoch": 0.7166947723440135,
"grad_norm": 1.8671875,
"learning_rate": 2.0404852908719698e-05,
"loss": 1.8859840393066407,
"mean_token_accuracy": 0.595301216840744,
"num_tokens": 40910205.0,
"step": 5950
},
{
"entropy": 1.912154495716095,
"epoch": 0.717899301373163,
"grad_norm": 2.3125,
"learning_rate": 2.0244540830184298e-05,
"loss": 1.9205114364624023,
"mean_token_accuracy": 0.5793304085731507,
"num_tokens": 40978311.0,
"step": 5960
},
{
"entropy": 1.8706116080284119,
"epoch": 0.7191038304023127,
"grad_norm": 1.8828125,
"learning_rate": 2.0084700980244454e-05,
"loss": 1.8852800369262694,
"mean_token_accuracy": 0.5874241411685943,
"num_tokens": 41048171.0,
"step": 5970
},
{
"entropy": 1.9051282048225402,
"epoch": 0.7203083594314623,
"grad_norm": 1.71875,
"learning_rate": 1.9925335895609365e-05,
"loss": 1.9160747528076172,
"mean_token_accuracy": 0.5877177476882934,
"num_tokens": 41116376.0,
"step": 5980
},
{
"entropy": 1.971784806251526,
"epoch": 0.7215128884606119,
"grad_norm": 1.96875,
"learning_rate": 1.976644810545357e-05,
"loss": 1.9614128112792968,
"mean_token_accuracy": 0.5810837090015412,
"num_tokens": 41186537.0,
"step": 5990
},
{
"entropy": 1.8875194787979126,
"epoch": 0.7227174174897615,
"grad_norm": 2.140625,
"learning_rate": 1.9608040131376842e-05,
"loss": 1.9109247207641602,
"mean_token_accuracy": 0.5865307509899139,
"num_tokens": 41258325.0,
"step": 6000
},
{
"epoch": 0.7227174174897615,
"eval_entropy": 2.366549849510193,
"eval_loss": 2.61251163482666,
"eval_mean_token_accuracy": 0.4671112596988678,
"eval_num_tokens": 41258325.0,
"eval_runtime": 0.5543,
"eval_samples_per_second": 28.867,
"eval_steps_per_second": 3.608,
"step": 6000
},
{
"entropy": 1.9798705458641053,
"epoch": 0.7239219465189111,
"grad_norm": 1.6328125,
"learning_rate": 1.9450114487364042e-05,
"loss": 1.9828369140625,
"mean_token_accuracy": 0.5758079171180726,
"num_tokens": 41326418.0,
"step": 6010
},
{
"entropy": 1.9611675500869752,
"epoch": 0.7251264755480608,
"grad_norm": 1.828125,
"learning_rate": 1.9292673679745382e-05,
"loss": 1.9798452377319335,
"mean_token_accuracy": 0.5820157170295716,
"num_tokens": 41392666.0,
"step": 6020
},
{
"entropy": 1.909050500392914,
"epoch": 0.7263310045772103,
"grad_norm": 1.984375,
"learning_rate": 1.9135720207156488e-05,
"loss": 1.9030824661254884,
"mean_token_accuracy": 0.5906743764877319,
"num_tokens": 41459774.0,
"step": 6030
},
{
"entropy": 1.9096762776374816,
"epoch": 0.7275355336063599,
"grad_norm": 1.8203125,
"learning_rate": 1.89792565604989e-05,
"loss": 1.93050537109375,
"mean_token_accuracy": 0.5873095750808716,
"num_tokens": 41531612.0,
"step": 6040
},
{
"entropy": 1.973576021194458,
"epoch": 0.7287400626355095,
"grad_norm": 1.796875,
"learning_rate": 1.8823285222900378e-05,
"loss": 1.9844182968139648,
"mean_token_accuracy": 0.5772220075130463,
"num_tokens": 41600679.0,
"step": 6050
},
{
"entropy": 1.8222813606262207,
"epoch": 0.7299445916646591,
"grad_norm": 1.9140625,
"learning_rate": 1.8667808669675686e-05,
"loss": 1.8241180419921874,
"mean_token_accuracy": 0.6043347537517547,
"num_tokens": 41669857.0,
"step": 6060
},
{
"entropy": 1.9602565884590148,
"epoch": 0.7311491206938088,
"grad_norm": 1.796875,
"learning_rate": 1.8512829368287106e-05,
"loss": 1.9717720031738282,
"mean_token_accuracy": 0.5753653764724731,
"num_tokens": 41740401.0,
"step": 6070
},
{
"entropy": 1.9181342720985413,
"epoch": 0.7323536497229584,
"grad_norm": 1.875,
"learning_rate": 1.8358349778305413e-05,
"loss": 1.9254276275634765,
"mean_token_accuracy": 0.5831866025924682,
"num_tokens": 41812058.0,
"step": 6080
},
{
"entropy": 1.8529590964317322,
"epoch": 0.7335581787521079,
"grad_norm": 1.8203125,
"learning_rate": 1.8204372351370837e-05,
"loss": 1.8680103302001954,
"mean_token_accuracy": 0.596212899684906,
"num_tokens": 41881963.0,
"step": 6090
},
{
"entropy": 2.052596926689148,
"epoch": 0.7347627077812575,
"grad_norm": 2.640625,
"learning_rate": 1.8050899531154047e-05,
"loss": 2.0838245391845702,
"mean_token_accuracy": 0.5623102843761444,
"num_tokens": 41952566.0,
"step": 6100
},
{
"epoch": 0.7347627077812575,
"eval_entropy": 2.362101435661316,
"eval_loss": 2.609923839569092,
"eval_mean_token_accuracy": 0.4649299383163452,
"eval_num_tokens": 41952566.0,
"eval_runtime": 0.4206,
"eval_samples_per_second": 38.041,
"eval_steps_per_second": 4.755,
"step": 6100
},
{
"entropy": 1.9244691729545593,
"epoch": 0.7359672368104071,
"grad_norm": 1.734375,
"learning_rate": 1.7897933753317524e-05,
"loss": 1.9250370025634767,
"mean_token_accuracy": 0.582695585489273,
"num_tokens": 42020999.0,
"step": 6110
},
{
"entropy": 1.9595140218734741,
"epoch": 0.7371717658395567,
"grad_norm": 3.6875,
"learning_rate": 1.7745477445476753e-05,
"loss": 1.956146240234375,
"mean_token_accuracy": 0.5800999224185943,
"num_tokens": 42089893.0,
"step": 6120
},
{
"entropy": 1.919549560546875,
"epoch": 0.7383762948687064,
"grad_norm": 2.09375,
"learning_rate": 1.759353302716184e-05,
"loss": 1.9333702087402345,
"mean_token_accuracy": 0.5832765638828278,
"num_tokens": 42157455.0,
"step": 6130
},
{
"entropy": 1.946086299419403,
"epoch": 0.739580823897856,
"grad_norm": 1.8671875,
"learning_rate": 1.744210290977896e-05,
"loss": 1.9349960327148437,
"mean_token_accuracy": 0.578400480747223,
"num_tokens": 42225162.0,
"step": 6140
},
{
"entropy": 1.873963975906372,
"epoch": 0.7407853529270055,
"grad_norm": 1.75,
"learning_rate": 1.7291189496572253e-05,
"loss": 1.884567642211914,
"mean_token_accuracy": 0.5981316328048706,
"num_tokens": 42294964.0,
"step": 6150
},
{
"entropy": 1.9695120215415955,
"epoch": 0.7419898819561551,
"grad_norm": 2.296875,
"learning_rate": 1.7140795182585534e-05,
"loss": 1.9876461029052734,
"mean_token_accuracy": 0.5724166512489319,
"num_tokens": 42362342.0,
"step": 6160
},
{
"entropy": 1.934032666683197,
"epoch": 0.7431944109853047,
"grad_norm": 2.4375,
"learning_rate": 1.699092235462436e-05,
"loss": 1.9466995239257812,
"mean_token_accuracy": 0.5826898992061615,
"num_tokens": 42432625.0,
"step": 6170
},
{
"entropy": 1.9191329836845399,
"epoch": 0.7443989400144544,
"grad_norm": 2.71875,
"learning_rate": 1.684157339121819e-05,
"loss": 1.9333236694335938,
"mean_token_accuracy": 0.5826072692871094,
"num_tokens": 42498732.0,
"step": 6180
},
{
"entropy": 1.9656689882278442,
"epoch": 0.745603469043604,
"grad_norm": 1.7578125,
"learning_rate": 1.66927506625825e-05,
"loss": 1.9759876251220703,
"mean_token_accuracy": 0.5751698732376098,
"num_tokens": 42568957.0,
"step": 6190
},
{
"entropy": 1.9358470439910889,
"epoch": 0.7468079980727536,
"grad_norm": 1.7734375,
"learning_rate": 1.6544456530581347e-05,
"loss": 1.9460548400878905,
"mean_token_accuracy": 0.5818601489067078,
"num_tokens": 42640191.0,
"step": 6200
},
{
"epoch": 0.7468079980727536,
"eval_entropy": 2.362728238105774,
"eval_loss": 2.608748435974121,
"eval_mean_token_accuracy": 0.4665456563234329,
"eval_num_tokens": 42640191.0,
"eval_runtime": 0.4335,
"eval_samples_per_second": 36.906,
"eval_steps_per_second": 4.613,
"step": 6200
},
{
"entropy": 1.9571660161018372,
"epoch": 0.7480125271019031,
"grad_norm": 1.8125,
"learning_rate": 1.6396693348689708e-05,
"loss": 1.965768814086914,
"mean_token_accuracy": 0.5769582033157349,
"num_tokens": 42709254.0,
"step": 6210
},
{
"entropy": 1.8060012340545655,
"epoch": 0.7492170561310527,
"grad_norm": 1.640625,
"learning_rate": 1.6249463461956282e-05,
"loss": 1.8094675064086914,
"mean_token_accuracy": 0.6042569816112519,
"num_tokens": 42779914.0,
"step": 6220
},
{
"entropy": 1.8862876176834107,
"epoch": 0.7504215851602024,
"grad_norm": 1.703125,
"learning_rate": 1.6102769206966134e-05,
"loss": 1.8914718627929688,
"mean_token_accuracy": 0.5938247859477996,
"num_tokens": 42848428.0,
"step": 6230
},
{
"entropy": 1.8931943416595458,
"epoch": 0.751626114189352,
"grad_norm": 1.8515625,
"learning_rate": 1.5956612911803763e-05,
"loss": 1.9008750915527344,
"mean_token_accuracy": 0.5888782799243927,
"num_tokens": 42918406.0,
"step": 6240
},
{
"entropy": 1.9467383861541747,
"epoch": 0.7528306432185016,
"grad_norm": 1.859375,
"learning_rate": 1.5810996896016013e-05,
"loss": 1.9549457550048828,
"mean_token_accuracy": 0.5805156767368317,
"num_tokens": 42988884.0,
"step": 6250
},
{
"entropy": 1.8323127388954163,
"epoch": 0.7540351722476512,
"grad_norm": 3.265625,
"learning_rate": 1.5665923470575322e-05,
"loss": 1.8414487838745117,
"mean_token_accuracy": 0.6019357740879059,
"num_tokens": 43057529.0,
"step": 6260
},
{
"entropy": 1.9331823110580444,
"epoch": 0.7552397012768007,
"grad_norm": 1.8125,
"learning_rate": 1.5521394937843103e-05,
"loss": 1.9314424514770507,
"mean_token_accuracy": 0.5830797135829926,
"num_tokens": 43130313.0,
"step": 6270
},
{
"entropy": 1.891178059577942,
"epoch": 0.7564442303059504,
"grad_norm": 1.9765625,
"learning_rate": 1.537741359153308e-05,
"loss": 1.888962173461914,
"mean_token_accuracy": 0.592195349931717,
"num_tokens": 43198254.0,
"step": 6280
},
{
"entropy": 1.8627419114112853,
"epoch": 0.7576487593351,
"grad_norm": 1.6015625,
"learning_rate": 1.5233981716675017e-05,
"loss": 1.8827104568481445,
"mean_token_accuracy": 0.5912132978439331,
"num_tokens": 43269101.0,
"step": 6290
},
{
"entropy": 1.9497613430023193,
"epoch": 0.7588532883642496,
"grad_norm": 2.53125,
"learning_rate": 1.5091101589578333e-05,
"loss": 1.9523366928100585,
"mean_token_accuracy": 0.5823286592960357,
"num_tokens": 43339492.0,
"step": 6300
},
{
"epoch": 0.7588532883642496,
"eval_entropy": 2.3515676259994507,
"eval_loss": 2.6067941188812256,
"eval_mean_token_accuracy": 0.46921147406101227,
"eval_num_tokens": 43339492.0,
"eval_runtime": 0.4579,
"eval_samples_per_second": 34.943,
"eval_steps_per_second": 4.368,
"step": 6300
},
{
"entropy": 1.742299199104309,
"epoch": 0.7600578173933992,
"grad_norm": 2.375,
"learning_rate": 1.4948775477796095e-05,
"loss": 1.7332212448120117,
"mean_token_accuracy": 0.6186984837055206,
"num_tokens": 43403475.0,
"step": 6310
},
{
"entropy": 1.9738173604011535,
"epoch": 0.7612623464225488,
"grad_norm": 1.984375,
"learning_rate": 1.48070056400889e-05,
"loss": 2.008579635620117,
"mean_token_accuracy": 0.5714421808719635,
"num_tokens": 43474221.0,
"step": 6320
},
{
"entropy": 1.8835377812385559,
"epoch": 0.7624668754516983,
"grad_norm": 2.0,
"learning_rate": 1.4665794326389175e-05,
"loss": 1.8993560791015625,
"mean_token_accuracy": 0.5936033010482789,
"num_tokens": 43538217.0,
"step": 6330
},
{
"entropy": 1.8982553243637086,
"epoch": 0.763671404480848,
"grad_norm": 2.015625,
"learning_rate": 1.4525143777765327e-05,
"loss": 1.9044149398803711,
"mean_token_accuracy": 0.5876859664916992,
"num_tokens": 43607826.0,
"step": 6340
},
{
"entropy": 2.0122554659843446,
"epoch": 0.7648759335099976,
"grad_norm": 6.0,
"learning_rate": 1.4385056226386296e-05,
"loss": 2.0187328338623045,
"mean_token_accuracy": 0.5686930924654007,
"num_tokens": 43675060.0,
"step": 6350
},
{
"entropy": 1.9191255450248719,
"epoch": 0.7660804625391472,
"grad_norm": 1.9609375,
"learning_rate": 1.4245533895486047e-05,
"loss": 1.9368209838867188,
"mean_token_accuracy": 0.5836399495601654,
"num_tokens": 43746519.0,
"step": 6360
},
{
"entropy": 1.9163129210472107,
"epoch": 0.7672849915682968,
"grad_norm": 3.046875,
"learning_rate": 1.410657899932829e-05,
"loss": 1.911086082458496,
"mean_token_accuracy": 0.5841832041740418,
"num_tokens": 43816456.0,
"step": 6370
},
{
"entropy": 1.9154442310333253,
"epoch": 0.7684895205974464,
"grad_norm": 2.203125,
"learning_rate": 1.3968193743171448e-05,
"loss": 1.928656005859375,
"mean_token_accuracy": 0.5853952527046203,
"num_tokens": 43886694.0,
"step": 6380
},
{
"entropy": 1.8705685019493103,
"epoch": 0.769694049626596,
"grad_norm": 2.59375,
"learning_rate": 1.3830380323233483e-05,
"loss": 1.873354148864746,
"mean_token_accuracy": 0.5968881070613861,
"num_tokens": 43954882.0,
"step": 6390
},
{
"entropy": 1.8823359847068786,
"epoch": 0.7708985786557456,
"grad_norm": 1.875,
"learning_rate": 1.3693140926657206e-05,
"loss": 1.8747835159301758,
"mean_token_accuracy": 0.5940642535686493,
"num_tokens": 44024348.0,
"step": 6400
},
{
"epoch": 0.7708985786557456,
"eval_entropy": 2.3486063480377197,
"eval_loss": 2.607100009918213,
"eval_mean_token_accuracy": 0.4709083139896393,
"eval_num_tokens": 44024348.0,
"eval_runtime": 0.4059,
"eval_samples_per_second": 39.42,
"eval_steps_per_second": 4.927,
"step": 6400
},
{
"entropy": 1.9085419058799744,
"epoch": 0.7721031076848952,
"grad_norm": 1.515625,
"learning_rate": 1.3556477731475436e-05,
"loss": 1.9057226181030273,
"mean_token_accuracy": 0.583581292629242,
"num_tokens": 44095192.0,
"step": 6410
},
{
"entropy": 1.9753173351287843,
"epoch": 0.7733076367140448,
"grad_norm": 2.921875,
"learning_rate": 1.3420392906576562e-05,
"loss": 2.002203178405762,
"mean_token_accuracy": 0.5682298004627228,
"num_tokens": 44166393.0,
"step": 6420
},
{
"entropy": 1.9596870183944701,
"epoch": 0.7745121657431944,
"grad_norm": 1.8203125,
"learning_rate": 1.3284888611669977e-05,
"loss": 1.9819063186645507,
"mean_token_accuracy": 0.577583122253418,
"num_tokens": 44236505.0,
"step": 6430
},
{
"entropy": 1.9110072374343872,
"epoch": 0.7757166947723441,
"grad_norm": 1.7265625,
"learning_rate": 1.314996699725194e-05,
"loss": 1.9248188018798829,
"mean_token_accuracy": 0.5864306092262268,
"num_tokens": 44302671.0,
"step": 6440
},
{
"entropy": 1.9159401893615722,
"epoch": 0.7769212238014936,
"grad_norm": 1.8671875,
"learning_rate": 1.3015630204571343e-05,
"loss": 1.936089324951172,
"mean_token_accuracy": 0.5859259188175201,
"num_tokens": 44371608.0,
"step": 6450
},
{
"entropy": 1.9239344954490663,
"epoch": 0.7781257528306432,
"grad_norm": 2.3125,
"learning_rate": 1.288188036559579e-05,
"loss": 1.9287485122680663,
"mean_token_accuracy": 0.5864728391170502,
"num_tokens": 44439064.0,
"step": 6460
},
{
"entropy": 1.843450939655304,
"epoch": 0.7793302818597928,
"grad_norm": 2.421875,
"learning_rate": 1.2748719602977755e-05,
"loss": 1.8487020492553712,
"mean_token_accuracy": 0.6032763719558716,
"num_tokens": 44509652.0,
"step": 6470
},
{
"entropy": 1.8990965366363526,
"epoch": 0.7805348108889424,
"grad_norm": 1.890625,
"learning_rate": 1.261615003002084e-05,
"loss": 1.8943010330200196,
"mean_token_accuracy": 0.586923462152481,
"num_tokens": 44581457.0,
"step": 6480
},
{
"entropy": 1.8844643354415893,
"epoch": 0.7817393399180921,
"grad_norm": 1.9765625,
"learning_rate": 1.248417375064635e-05,
"loss": 1.8843027114868165,
"mean_token_accuracy": 0.5940824329853058,
"num_tokens": 44647497.0,
"step": 6490
},
{
"entropy": 1.943538236618042,
"epoch": 0.7829438689472417,
"grad_norm": 1.9140625,
"learning_rate": 1.2352792859359746e-05,
"loss": 1.951897430419922,
"mean_token_accuracy": 0.5817996025085449,
"num_tokens": 44715951.0,
"step": 6500
},
{
"epoch": 0.7829438689472417,
"eval_entropy": 2.354966163635254,
"eval_loss": 2.6048130989074707,
"eval_mean_token_accuracy": 0.469454824924469,
"eval_num_tokens": 44715951.0,
"eval_runtime": 0.433,
"eval_samples_per_second": 36.955,
"eval_steps_per_second": 4.619,
"step": 6500
},
{
"entropy": 1.914371383190155,
"epoch": 0.7841483979763912,
"grad_norm": 1.7265625,
"learning_rate": 1.222200944121758e-05,
"loss": 1.9062223434448242,
"mean_token_accuracy": 0.5848917901515961,
"num_tokens": 44783665.0,
"step": 6510
},
{
"entropy": 1.9607685804367065,
"epoch": 0.7853529270055408,
"grad_norm": 1.6875,
"learning_rate": 1.209182557179423e-05,
"loss": 1.9632522583007812,
"mean_token_accuracy": 0.5831448435783386,
"num_tokens": 44848951.0,
"step": 6520
},
{
"entropy": 1.9036452770233154,
"epoch": 0.7865574560346904,
"grad_norm": 1.8125,
"learning_rate": 1.1962243317149113e-05,
"loss": 1.9184595108032227,
"mean_token_accuracy": 0.5862579584121704,
"num_tokens": 44918497.0,
"step": 6530
},
{
"entropy": 1.9451961398124695,
"epoch": 0.78776198506384,
"grad_norm": 2.296875,
"learning_rate": 1.1833264733793797e-05,
"loss": 1.9473270416259765,
"mean_token_accuracy": 0.5783241093158722,
"num_tokens": 44986596.0,
"step": 6540
},
{
"entropy": 1.8859020948410035,
"epoch": 0.7889665140929897,
"grad_norm": 2.015625,
"learning_rate": 1.1704891868659385e-05,
"loss": 1.8870750427246095,
"mean_token_accuracy": 0.5893312156200409,
"num_tokens": 45056784.0,
"step": 6550
},
{
"entropy": 1.891354513168335,
"epoch": 0.7901710431221393,
"grad_norm": 1.8046875,
"learning_rate": 1.1577126759064067e-05,
"loss": 1.889866828918457,
"mean_token_accuracy": 0.5927616477012634,
"num_tokens": 45126044.0,
"step": 6560
},
{
"entropy": 1.8748571395874023,
"epoch": 0.7913755721512888,
"grad_norm": 1.8203125,
"learning_rate": 1.1449971432680734e-05,
"loss": 1.902470016479492,
"mean_token_accuracy": 0.5893660247325897,
"num_tokens": 45197142.0,
"step": 6570
},
{
"entropy": 1.921328365802765,
"epoch": 0.7925801011804384,
"grad_norm": 2.390625,
"learning_rate": 1.1323427907504852e-05,
"loss": 1.9254425048828125,
"mean_token_accuracy": 0.5840743720531464,
"num_tokens": 45264525.0,
"step": 6580
},
{
"entropy": 1.8484970211982727,
"epoch": 0.793784630209588,
"grad_norm": 1.7578125,
"learning_rate": 1.119749819182237e-05,
"loss": 1.8502546310424806,
"mean_token_accuracy": 0.6023968100547791,
"num_tokens": 45330367.0,
"step": 6590
},
{
"entropy": 1.929791271686554,
"epoch": 0.7949891592387377,
"grad_norm": 2.171875,
"learning_rate": 1.1072184284177928e-05,
"loss": 1.9453422546386718,
"mean_token_accuracy": 0.5831649899482727,
"num_tokens": 45399268.0,
"step": 6600
},
{
"epoch": 0.7949891592387377,
"eval_entropy": 2.3468000888824463,
"eval_loss": 2.607370615005493,
"eval_mean_token_accuracy": 0.46759575605392456,
"eval_num_tokens": 45399268.0,
"eval_runtime": 0.5249,
"eval_samples_per_second": 30.482,
"eval_steps_per_second": 3.81,
"step": 6600
},
{
"entropy": 1.9908106565475463,
"epoch": 0.7961936882678873,
"grad_norm": 1.96875,
"learning_rate": 1.0947488173343045e-05,
"loss": 1.9914398193359375,
"mean_token_accuracy": 0.5752636313438415,
"num_tokens": 45464999.0,
"step": 6610
},
{
"entropy": 1.8240914940834045,
"epoch": 0.7973982172970369,
"grad_norm": 1.8046875,
"learning_rate": 1.0823411838284675e-05,
"loss": 1.8167228698730469,
"mean_token_accuracy": 0.6027800858020782,
"num_tokens": 45536036.0,
"step": 6620
},
{
"entropy": 1.8865158796310424,
"epoch": 0.7986027463261864,
"grad_norm": 2.0625,
"learning_rate": 1.0699957248133674e-05,
"loss": 1.8847312927246094,
"mean_token_accuracy": 0.5909352362155914,
"num_tokens": 45605062.0,
"step": 6630
},
{
"entropy": 1.9506030559539795,
"epoch": 0.799807275355336,
"grad_norm": 1.75,
"learning_rate": 1.0577126362153616e-05,
"loss": 1.9742616653442382,
"mean_token_accuracy": 0.5732921123504638,
"num_tokens": 45673092.0,
"step": 6640
},
{
"entropy": 1.8828906178474427,
"epoch": 0.8010118043844857,
"grad_norm": 1.65625,
"learning_rate": 1.0454921129709745e-05,
"loss": 1.8854589462280273,
"mean_token_accuracy": 0.5882959604263306,
"num_tokens": 45743944.0,
"step": 6650
},
{
"entropy": 1.901366901397705,
"epoch": 0.8022163334136353,
"grad_norm": 1.671875,
"learning_rate": 1.0333343490237907e-05,
"loss": 1.914388084411621,
"mean_token_accuracy": 0.5854258954524993,
"num_tokens": 45814879.0,
"step": 6660
},
{
"entropy": 1.9456279039382935,
"epoch": 0.8034208624427849,
"grad_norm": 2.46875,
"learning_rate": 1.0212395373213918e-05,
"loss": 1.9462764739990235,
"mean_token_accuracy": 0.5851451516151428,
"num_tokens": 45882249.0,
"step": 6670
},
{
"entropy": 1.9052425861358642,
"epoch": 0.8046253914719345,
"grad_norm": 1.859375,
"learning_rate": 1.0092078698122815e-05,
"loss": 1.9133377075195312,
"mean_token_accuracy": 0.5850100219249725,
"num_tokens": 45951186.0,
"step": 6680
},
{
"entropy": 1.8830320119857789,
"epoch": 0.805829920501084,
"grad_norm": 2.171875,
"learning_rate": 9.9723953744285e-06,
"loss": 1.8878063201904296,
"mean_token_accuracy": 0.5881814539432526,
"num_tokens": 46021706.0,
"step": 6690
},
{
"entropy": 1.9019694447517395,
"epoch": 0.8070344495302337,
"grad_norm": 1.8671875,
"learning_rate": 9.853347301543343e-06,
"loss": 1.9098031997680665,
"mean_token_accuracy": 0.5850791215896607,
"num_tokens": 46092513.0,
"step": 6700
},
{
"epoch": 0.8070344495302337,
"eval_entropy": 2.344744324684143,
"eval_loss": 2.6055827140808105,
"eval_mean_token_accuracy": 0.4685647487640381,
"eval_num_tokens": 46092513.0,
"eval_runtime": 0.4373,
"eval_samples_per_second": 36.586,
"eval_steps_per_second": 4.573,
"step": 6700
},
{
"entropy": 1.9491977095603943,
"epoch": 0.8082389785593833,
"grad_norm": 2.609375,
"learning_rate": 9.73493636879813e-06,
"loss": 1.9542781829833984,
"mean_token_accuracy": 0.5842437386512757,
"num_tokens": 46161184.0,
"step": 6710
},
{
"entropy": 1.8885266542434693,
"epoch": 0.8094435075885329,
"grad_norm": 1.8671875,
"learning_rate": 9.617164455411987e-06,
"loss": 1.879570770263672,
"mean_token_accuracy": 0.5920573532581329,
"num_tokens": 46232334.0,
"step": 6720
},
{
"entropy": 1.8895246863365174,
"epoch": 0.8106480366176825,
"grad_norm": 1.90625,
"learning_rate": 9.500033430462602e-06,
"loss": 1.8861270904541017,
"mean_token_accuracy": 0.5923787474632263,
"num_tokens": 46300591.0,
"step": 6730
},
{
"entropy": 1.8397042512893678,
"epoch": 0.8118525656468321,
"grad_norm": 2.234375,
"learning_rate": 9.383545152856605e-06,
"loss": 1.8577287673950196,
"mean_token_accuracy": 0.5989863038063049,
"num_tokens": 46371286.0,
"step": 6740
},
{
"entropy": 1.8958259224891663,
"epoch": 0.8130570946759816,
"grad_norm": 1.953125,
"learning_rate": 9.267701471299956e-06,
"loss": 1.9154817581176757,
"mean_token_accuracy": 0.5852463603019714,
"num_tokens": 46442961.0,
"step": 6750
},
{
"entropy": 1.9718209743499755,
"epoch": 0.8142616237051313,
"grad_norm": 2.046875,
"learning_rate": 9.152504224268742e-06,
"loss": 1.9979925155639648,
"mean_token_accuracy": 0.5743964821100235,
"num_tokens": 46510402.0,
"step": 6760
},
{
"entropy": 1.9272982358932496,
"epoch": 0.8154661527342809,
"grad_norm": 1.828125,
"learning_rate": 9.037955239979856e-06,
"loss": 1.9375040054321289,
"mean_token_accuracy": 0.5840645253658294,
"num_tokens": 46578401.0,
"step": 6770
},
{
"entropy": 1.969197118282318,
"epoch": 0.8166706817634305,
"grad_norm": 2.328125,
"learning_rate": 8.924056336362124e-06,
"loss": 1.9637868881225586,
"mean_token_accuracy": 0.5787004292011261,
"num_tokens": 46648189.0,
"step": 6780
},
{
"entropy": 1.9857244729995727,
"epoch": 0.8178752107925801,
"grad_norm": 2.0625,
"learning_rate": 8.810809321027325e-06,
"loss": 2.0006925582885744,
"mean_token_accuracy": 0.573920601606369,
"num_tokens": 46717650.0,
"step": 6790
},
{
"entropy": 1.8448691248893738,
"epoch": 0.8190797398217297,
"grad_norm": 1.5859375,
"learning_rate": 8.69821599124161e-06,
"loss": 1.8462907791137695,
"mean_token_accuracy": 0.5970932245254517,
"num_tokens": 46786658.0,
"step": 6800
},
{
"epoch": 0.8190797398217297,
"eval_entropy": 2.3488173484802246,
"eval_loss": 2.6056511402130127,
"eval_mean_token_accuracy": 0.4687269777059555,
"eval_num_tokens": 46786658.0,
"eval_runtime": 0.5738,
"eval_samples_per_second": 27.886,
"eval_steps_per_second": 3.486,
"step": 6800
},
{
"entropy": 1.8751570463180542,
"epoch": 0.8202842688508794,
"grad_norm": 1.953125,
"learning_rate": 8.586278133896908e-06,
"loss": 1.8820247650146484,
"mean_token_accuracy": 0.5926909029483796,
"num_tokens": 46852908.0,
"step": 6810
},
{
"entropy": 1.9577014803886414,
"epoch": 0.8214887978800289,
"grad_norm": 2.046875,
"learning_rate": 8.474997525482575e-06,
"loss": 1.9634927749633788,
"mean_token_accuracy": 0.5787667095661163,
"num_tokens": 46916551.0,
"step": 6820
},
{
"entropy": 1.9611721515655518,
"epoch": 0.8226933269091785,
"grad_norm": 2.828125,
"learning_rate": 8.364375932057278e-06,
"loss": 1.970509147644043,
"mean_token_accuracy": 0.5789299726486206,
"num_tokens": 46985403.0,
"step": 6830
},
{
"entropy": 1.9028493762016296,
"epoch": 0.8238978559383281,
"grad_norm": 10.0,
"learning_rate": 8.254415109220837e-06,
"loss": 1.899116325378418,
"mean_token_accuracy": 0.590817129611969,
"num_tokens": 47054944.0,
"step": 6840
},
{
"entropy": 1.9320483922958374,
"epoch": 0.8251023849674777,
"grad_norm": 1.6875,
"learning_rate": 8.145116802086489e-06,
"loss": 1.9514781951904296,
"mean_token_accuracy": 0.5878109574317932,
"num_tokens": 47123791.0,
"step": 6850
},
{
"entropy": 1.8979083180427552,
"epoch": 0.8263069139966274,
"grad_norm": 8.3125,
"learning_rate": 8.036482745253083e-06,
"loss": 1.888798141479492,
"mean_token_accuracy": 0.5931646287441253,
"num_tokens": 47190807.0,
"step": 6860
},
{
"entropy": 1.9247923970222474,
"epoch": 0.827511443025777,
"grad_norm": 1.8515625,
"learning_rate": 7.928514662777664e-06,
"loss": 1.9398954391479493,
"mean_token_accuracy": 0.582766991853714,
"num_tokens": 47254992.0,
"step": 6870
},
{
"entropy": 1.9788224577903748,
"epoch": 0.8287159720549265,
"grad_norm": 1.84375,
"learning_rate": 7.821214268147997e-06,
"loss": 1.982382583618164,
"mean_token_accuracy": 0.5736671209335327,
"num_tokens": 47324051.0,
"step": 6880
},
{
"entropy": 1.891750741004944,
"epoch": 0.8299205010840761,
"grad_norm": 1.640625,
"learning_rate": 7.714583264255471e-06,
"loss": 1.8812660217285155,
"mean_token_accuracy": 0.591460508108139,
"num_tokens": 47394995.0,
"step": 6890
},
{
"entropy": 1.9054150104522705,
"epoch": 0.8311250301132257,
"grad_norm": 2.671875,
"learning_rate": 7.6086233433680044e-06,
"loss": 1.9210380554199218,
"mean_token_accuracy": 0.587424921989441,
"num_tokens": 47463873.0,
"step": 6900
},
{
"epoch": 0.8311250301132257,
"eval_entropy": 2.3451857566833496,
"eval_loss": 2.604262351989746,
"eval_mean_token_accuracy": 0.4664645344018936,
"eval_num_tokens": 47463873.0,
"eval_runtime": 0.4522,
"eval_samples_per_second": 35.379,
"eval_steps_per_second": 4.422,
"step": 6900
},
{
"entropy": 1.9943472743034363,
"epoch": 0.8323295591423754,
"grad_norm": 1.7109375,
"learning_rate": 7.50333618710321e-06,
"loss": 2.015270805358887,
"mean_token_accuracy": 0.5674533128738404,
"num_tokens": 47535360.0,
"step": 6910
},
{
"entropy": 1.857893967628479,
"epoch": 0.833534088171525,
"grad_norm": 2.140625,
"learning_rate": 7.398723466401752e-06,
"loss": 1.855722999572754,
"mean_token_accuracy": 0.5944087326526641,
"num_tokens": 47604620.0,
"step": 6920
},
{
"entropy": 1.893347978591919,
"epoch": 0.8347386172006745,
"grad_norm": 2.40625,
"learning_rate": 7.294786841500739e-06,
"loss": 1.8694910049438476,
"mean_token_accuracy": 0.5950228750705719,
"num_tokens": 47672995.0,
"step": 6930
},
{
"entropy": 1.8532984614372254,
"epoch": 0.8359431462298241,
"grad_norm": 1.890625,
"learning_rate": 7.1915279619074685e-06,
"loss": 1.8615072250366211,
"mean_token_accuracy": 0.5978223085403442,
"num_tokens": 47739483.0,
"step": 6940
},
{
"entropy": 1.9042423844337464,
"epoch": 0.8371476752589737,
"grad_norm": 2.546875,
"learning_rate": 7.088948466373157e-06,
"loss": 1.9140111923217773,
"mean_token_accuracy": 0.5868822991847992,
"num_tokens": 47809508.0,
"step": 6950
},
{
"entropy": 1.915585207939148,
"epoch": 0.8383522042881233,
"grad_norm": 1.96875,
"learning_rate": 6.9870499828670335e-06,
"loss": 1.9215312957763673,
"mean_token_accuracy": 0.5845146715641022,
"num_tokens": 47876262.0,
"step": 6960
},
{
"entropy": 1.9611434459686279,
"epoch": 0.839556733317273,
"grad_norm": 1.9140625,
"learning_rate": 6.8858341285504e-06,
"loss": 1.9818181991577148,
"mean_token_accuracy": 0.5766906619071961,
"num_tokens": 47945325.0,
"step": 6970
},
{
"entropy": 1.9311550855636597,
"epoch": 0.8407612623464226,
"grad_norm": 1.703125,
"learning_rate": 6.785302509751057e-06,
"loss": 1.9468086242675782,
"mean_token_accuracy": 0.578011155128479,
"num_tokens": 48015306.0,
"step": 6980
},
{
"entropy": 1.9162616729736328,
"epoch": 0.8419657913755721,
"grad_norm": 2.8125,
"learning_rate": 6.685456721937738e-06,
"loss": 1.936783218383789,
"mean_token_accuracy": 0.5843357384204865,
"num_tokens": 48086370.0,
"step": 6990
},
{
"entropy": 1.9087663173675538,
"epoch": 0.8431703204047217,
"grad_norm": 6.125,
"learning_rate": 6.58629834969483e-06,
"loss": 1.9089078903198242,
"mean_token_accuracy": 0.5885790526866913,
"num_tokens": 48154780.0,
"step": 7000
},
{
"epoch": 0.8431703204047217,
"eval_entropy": 2.3457889556884766,
"eval_loss": 2.6046531200408936,
"eval_mean_token_accuracy": 0.46921147406101227,
"eval_num_tokens": 48154780.0,
"eval_runtime": 0.4302,
"eval_samples_per_second": 37.194,
"eval_steps_per_second": 4.649,
"step": 7000
},
{
"entropy": 1.961945605278015,
"epoch": 0.8443748494338713,
"grad_norm": 1.6796875,
"learning_rate": 6.4878289666972405e-06,
"loss": 1.9694751739501952,
"mean_token_accuracy": 0.5779289305210114,
"num_tokens": 48223465.0,
"step": 7010
},
{
"entropy": 1.9014871001243592,
"epoch": 0.845579378463021,
"grad_norm": 1.953125,
"learning_rate": 6.390050135685355e-06,
"loss": 1.9117996215820312,
"mean_token_accuracy": 0.5887616813182831,
"num_tokens": 48293428.0,
"step": 7020
},
{
"entropy": 1.8740954041481017,
"epoch": 0.8467839074921706,
"grad_norm": 2.328125,
"learning_rate": 6.2929634084403275e-06,
"loss": 1.8785751342773438,
"mean_token_accuracy": 0.5922869861125946,
"num_tokens": 48361643.0,
"step": 7030
},
{
"entropy": 1.8738709330558776,
"epoch": 0.8479884365213202,
"grad_norm": 3.15625,
"learning_rate": 6.196570325759354e-06,
"loss": 1.8708406448364259,
"mean_token_accuracy": 0.5880731344223022,
"num_tokens": 48431388.0,
"step": 7040
},
{
"entropy": 1.854604971408844,
"epoch": 0.8491929655504697,
"grad_norm": 1.984375,
"learning_rate": 6.100872417431325e-06,
"loss": 1.8594350814819336,
"mean_token_accuracy": 0.5963862299919128,
"num_tokens": 48500006.0,
"step": 7050
},
{
"entropy": 1.908062756061554,
"epoch": 0.8503974945796193,
"grad_norm": 1.921875,
"learning_rate": 6.0058712022124374e-06,
"loss": 1.9291059494018554,
"mean_token_accuracy": 0.5871904790401459,
"num_tokens": 48565307.0,
"step": 7060
},
{
"entropy": 1.8915085315704345,
"epoch": 0.851602023608769,
"grad_norm": 1.828125,
"learning_rate": 5.911568187802202e-06,
"loss": 1.9014862060546875,
"mean_token_accuracy": 0.594680666923523,
"num_tokens": 48634315.0,
"step": 7070
},
{
"entropy": 1.8878782033920287,
"epoch": 0.8528065526379186,
"grad_norm": 2.046875,
"learning_rate": 5.8179648708194255e-06,
"loss": 1.8941951751708985,
"mean_token_accuracy": 0.5915153980255127,
"num_tokens": 48706116.0,
"step": 7080
},
{
"entropy": 1.9810489177703858,
"epoch": 0.8540110816670682,
"grad_norm": 1.8046875,
"learning_rate": 5.725062736778486e-06,
"loss": 1.9749015808105468,
"mean_token_accuracy": 0.5741178423166275,
"num_tokens": 48773243.0,
"step": 7090
},
{
"entropy": 1.895472002029419,
"epoch": 0.8552156106962178,
"grad_norm": 1.8671875,
"learning_rate": 5.632863260065802e-06,
"loss": 1.9204191207885741,
"mean_token_accuracy": 0.5905978083610535,
"num_tokens": 48841002.0,
"step": 7100
},
{
"epoch": 0.8552156106962178,
"eval_entropy": 2.3433711528778076,
"eval_loss": 2.607316017150879,
"eval_mean_token_accuracy": 0.4687269777059555,
"eval_num_tokens": 48841002.0,
"eval_runtime": 0.4384,
"eval_samples_per_second": 36.495,
"eval_steps_per_second": 4.562,
"step": 7100
},
{
"entropy": 1.9661857843399049,
"epoch": 0.8564201397253673,
"grad_norm": 2.5,
"learning_rate": 5.541367903916367e-06,
"loss": 1.9712324142456055,
"mean_token_accuracy": 0.5750096321105957,
"num_tokens": 48911106.0,
"step": 7110
},
{
"entropy": 1.9432387232780457,
"epoch": 0.857624668754517,
"grad_norm": 2.234375,
"learning_rate": 5.4505781203905705e-06,
"loss": 1.9617923736572265,
"mean_token_accuracy": 0.5740188658237457,
"num_tokens": 48979091.0,
"step": 7120
},
{
"entropy": 1.8688494324684144,
"epoch": 0.8588291977836666,
"grad_norm": 1.7265625,
"learning_rate": 5.360495350351136e-06,
"loss": 1.8777385711669923,
"mean_token_accuracy": 0.5954580247402191,
"num_tokens": 49045460.0,
"step": 7130
},
{
"entropy": 1.9055879592895508,
"epoch": 0.8600337268128162,
"grad_norm": 2.515625,
"learning_rate": 5.271121023440262e-06,
"loss": 1.9095733642578125,
"mean_token_accuracy": 0.5880828917026519,
"num_tokens": 49113551.0,
"step": 7140
},
{
"entropy": 1.9325553178787231,
"epoch": 0.8612382558419658,
"grad_norm": 1.90625,
"learning_rate": 5.182456558056914e-06,
"loss": 1.9428960800170898,
"mean_token_accuracy": 0.5813543915748596,
"num_tokens": 49181943.0,
"step": 7150
},
{
"entropy": 1.920418655872345,
"epoch": 0.8624427848711154,
"grad_norm": 1.984375,
"learning_rate": 5.094503361334363e-06,
"loss": 1.9210506439208985,
"mean_token_accuracy": 0.5861852407455445,
"num_tokens": 49249356.0,
"step": 7160
},
{
"entropy": 2.001292312145233,
"epoch": 0.863647313900265,
"grad_norm": 2.71875,
"learning_rate": 5.007262829117793e-06,
"loss": 2.0109493255615236,
"mean_token_accuracy": 0.5729002892971039,
"num_tokens": 49320388.0,
"step": 7170
},
{
"entropy": 1.8460107445716858,
"epoch": 0.8648518429294146,
"grad_norm": 5.09375,
"learning_rate": 4.920736345942157e-06,
"loss": 1.8592243194580078,
"mean_token_accuracy": 0.5964674472808837,
"num_tokens": 49390411.0,
"step": 7180
},
{
"entropy": 1.8660251021385192,
"epoch": 0.8660563719585642,
"grad_norm": 2.234375,
"learning_rate": 4.834925285010283e-06,
"loss": 1.8623886108398438,
"mean_token_accuracy": 0.5940340518951416,
"num_tokens": 49457795.0,
"step": 7190
},
{
"entropy": 1.8367921233177185,
"epoch": 0.8672609009877138,
"grad_norm": 2.234375,
"learning_rate": 4.749831008170957e-06,
"loss": 1.8184185028076172,
"mean_token_accuracy": 0.6032820522785187,
"num_tokens": 49527308.0,
"step": 7200
},
{
"epoch": 0.8672609009877138,
"eval_entropy": 2.3435826301574707,
"eval_loss": 2.60599684715271,
"eval_mean_token_accuracy": 0.46767687797546387,
"eval_num_tokens": 49527308.0,
"eval_runtime": 0.4499,
"eval_samples_per_second": 35.564,
"eval_steps_per_second": 4.445,
"step": 7200
},
{
"entropy": 1.94066721200943,
"epoch": 0.8684654300168634,
"grad_norm": 1.8828125,
"learning_rate": 4.665454865897423e-06,
"loss": 1.9611551284790039,
"mean_token_accuracy": 0.5786782205104828,
"num_tokens": 49597360.0,
"step": 7210
},
{
"entropy": 1.9247072339057922,
"epoch": 0.869669959046013,
"grad_norm": 2.140625,
"learning_rate": 4.581798197265863e-06,
"loss": 1.9551952362060547,
"mean_token_accuracy": 0.5819454908370971,
"num_tokens": 49667178.0,
"step": 7220
},
{
"entropy": 1.8539777040481566,
"epoch": 0.8708744880751627,
"grad_norm": 1.6796875,
"learning_rate": 4.498862329934217e-06,
"loss": 1.860626792907715,
"mean_token_accuracy": 0.5921060740947723,
"num_tokens": 49738727.0,
"step": 7230
},
{
"entropy": 1.8949302673339843,
"epoch": 0.8720790171043122,
"grad_norm": 2.890625,
"learning_rate": 4.416648580121047e-06,
"loss": 1.9052955627441406,
"mean_token_accuracy": 0.5828841984272003,
"num_tokens": 49808826.0,
"step": 7240
},
{
"entropy": 1.9745012044906616,
"epoch": 0.8732835461334618,
"grad_norm": 1.7890625,
"learning_rate": 4.335158252584709e-06,
"loss": 1.9993856430053711,
"mean_token_accuracy": 0.5721430122852326,
"num_tokens": 49875582.0,
"step": 7250
},
{
"entropy": 1.9271727800369263,
"epoch": 0.8744880751626114,
"grad_norm": 1.984375,
"learning_rate": 4.254392640602589e-06,
"loss": 1.9436689376831056,
"mean_token_accuracy": 0.5862739264965058,
"num_tokens": 49944018.0,
"step": 7260
},
{
"entropy": 1.8917516708374023,
"epoch": 0.875692604191761,
"grad_norm": 3.046875,
"learning_rate": 4.174353025950645e-06,
"loss": 1.9089736938476562,
"mean_token_accuracy": 0.587155544757843,
"num_tokens": 50010065.0,
"step": 7270
},
{
"entropy": 1.8943871140480042,
"epoch": 0.8768971332209107,
"grad_norm": 1.796875,
"learning_rate": 4.095040678882989e-06,
"loss": 1.9053379058837892,
"mean_token_accuracy": 0.5885473728179932,
"num_tokens": 50079485.0,
"step": 7280
},
{
"entropy": 1.8807770133018493,
"epoch": 0.8781016622500603,
"grad_norm": 2.359375,
"learning_rate": 4.016456858111778e-06,
"loss": 1.8898618698120118,
"mean_token_accuracy": 0.592292708158493,
"num_tokens": 50149875.0,
"step": 7290
},
{
"entropy": 1.9077269315719605,
"epoch": 0.8793061912792098,
"grad_norm": 1.828125,
"learning_rate": 3.938602810787234e-06,
"loss": 1.909377098083496,
"mean_token_accuracy": 0.5847647190093994,
"num_tokens": 50215594.0,
"step": 7300
},
{
"epoch": 0.8793061912792098,
"eval_entropy": 2.346004843711853,
"eval_loss": 2.604802131652832,
"eval_mean_token_accuracy": 0.4670301526784897,
"eval_num_tokens": 50215594.0,
"eval_runtime": 0.4248,
"eval_samples_per_second": 37.664,
"eval_steps_per_second": 4.708,
"step": 7300
},
{
"entropy": 1.9532364130020141,
"epoch": 0.8805107203083594,
"grad_norm": 2.109375,
"learning_rate": 3.8614797724778326e-06,
"loss": 1.96121826171875,
"mean_token_accuracy": 0.5782608270645142,
"num_tokens": 50282908.0,
"step": 7310
},
{
"entropy": 1.890508770942688,
"epoch": 0.881715249337509,
"grad_norm": 2.765625,
"learning_rate": 3.785088967150713e-06,
"loss": 1.8945865631103516,
"mean_token_accuracy": 0.586310613155365,
"num_tokens": 50351625.0,
"step": 7320
},
{
"entropy": 1.9098580598831176,
"epoch": 0.8829197783666587,
"grad_norm": 1.6875,
"learning_rate": 3.7094316071522305e-06,
"loss": 1.9120573043823241,
"mean_token_accuracy": 0.5849235951900482,
"num_tokens": 50421389.0,
"step": 7330
},
{
"entropy": 1.8125726580619812,
"epoch": 0.8841243073958083,
"grad_norm": 1.890625,
"learning_rate": 3.6345088931887482e-06,
"loss": 1.8193851470947267,
"mean_token_accuracy": 0.6028900504112243,
"num_tokens": 50491754.0,
"step": 7340
},
{
"entropy": 1.9427113771438598,
"epoch": 0.8853288364249579,
"grad_norm": 1.859375,
"learning_rate": 3.5603220143075323e-06,
"loss": 1.9375303268432618,
"mean_token_accuracy": 0.5842464864253998,
"num_tokens": 50555309.0,
"step": 7350
},
{
"entropy": 1.943008029460907,
"epoch": 0.8865333654541074,
"grad_norm": 2.140625,
"learning_rate": 3.486872147877962e-06,
"loss": 1.9568832397460938,
"mean_token_accuracy": 0.579954844713211,
"num_tokens": 50623376.0,
"step": 7360
},
{
"entropy": 1.8467905282974244,
"epoch": 0.887737894483257,
"grad_norm": 1.5625,
"learning_rate": 3.414160459572746e-06,
"loss": 1.8543149948120117,
"mean_token_accuracy": 0.5997041761875153,
"num_tokens": 50693971.0,
"step": 7370
},
{
"entropy": 1.9237533926963806,
"epoch": 0.8889424235124066,
"grad_norm": 1.7421875,
"learning_rate": 3.3421881033494863e-06,
"loss": 1.9374340057373047,
"mean_token_accuracy": 0.5844144821166992,
"num_tokens": 50763595.0,
"step": 7380
},
{
"entropy": 1.996901023387909,
"epoch": 0.8901469525415563,
"grad_norm": 2.625,
"learning_rate": 3.270956221432375e-06,
"loss": 2.023693656921387,
"mean_token_accuracy": 0.5725166618824005,
"num_tokens": 50830108.0,
"step": 7390
},
{
"entropy": 1.9481045484542847,
"epoch": 0.8913514815707059,
"grad_norm": 1.78125,
"learning_rate": 3.200465944294001e-06,
"loss": 1.974094772338867,
"mean_token_accuracy": 0.577884703874588,
"num_tokens": 50900072.0,
"step": 7400
},
{
"epoch": 0.8913514815707059,
"eval_entropy": 2.3446000814437866,
"eval_loss": 2.6050777435302734,
"eval_mean_token_accuracy": 0.46913036704063416,
"eval_num_tokens": 50900072.0,
"eval_runtime": 0.4574,
"eval_samples_per_second": 34.977,
"eval_steps_per_second": 4.372,
"step": 7400
},
{
"entropy": 1.949066948890686,
"epoch": 0.8925560105998555,
"grad_norm": 5.21875,
"learning_rate": 3.1307183906374825e-06,
"loss": 1.9471038818359374,
"mean_token_accuracy": 0.5768292903900146,
"num_tokens": 50970210.0,
"step": 7410
},
{
"entropy": 1.92543066740036,
"epoch": 0.893760539629005,
"grad_norm": 2.359375,
"learning_rate": 3.0617146673786565e-06,
"loss": 1.9237152099609376,
"mean_token_accuracy": 0.5863450348377228,
"num_tokens": 51041271.0,
"step": 7420
},
{
"entropy": 1.9166660070419312,
"epoch": 0.8949650686581546,
"grad_norm": 1.7734375,
"learning_rate": 2.993455869628553e-06,
"loss": 1.9338464736938477,
"mean_token_accuracy": 0.5904297232627869,
"num_tokens": 51111709.0,
"step": 7430
},
{
"entropy": 1.922934341430664,
"epoch": 0.8961695976873043,
"grad_norm": 2.796875,
"learning_rate": 2.925943080675986e-06,
"loss": 1.9388483047485352,
"mean_token_accuracy": 0.5841511905193328,
"num_tokens": 51181794.0,
"step": 7440
},
{
"entropy": 1.8780438542366027,
"epoch": 0.8973741267164539,
"grad_norm": 5.03125,
"learning_rate": 2.859177371970384e-06,
"loss": 1.8922248840332032,
"mean_token_accuracy": 0.5913154661655426,
"num_tokens": 51251431.0,
"step": 7450
},
{
"entropy": 1.9308564901351928,
"epoch": 0.8985786557456035,
"grad_norm": 2.125,
"learning_rate": 2.7931598031047667e-06,
"loss": 1.9351591110229491,
"mean_token_accuracy": 0.5805219411849976,
"num_tokens": 51321131.0,
"step": 7460
},
{
"entropy": 1.8909155964851379,
"epoch": 0.899783184774753,
"grad_norm": 2.125,
"learning_rate": 2.7278914217989226e-06,
"loss": 1.9051069259643554,
"mean_token_accuracy": 0.5878103852272034,
"num_tokens": 51390078.0,
"step": 7470
},
{
"entropy": 1.8541254878044129,
"epoch": 0.9009877138039026,
"grad_norm": 1.65625,
"learning_rate": 2.663373263882829e-06,
"loss": 1.858312225341797,
"mean_token_accuracy": 0.5975033700466156,
"num_tokens": 51458203.0,
"step": 7480
},
{
"entropy": 1.964201307296753,
"epoch": 0.9021922428330523,
"grad_norm": 1.9140625,
"learning_rate": 2.5996063532801427e-06,
"loss": 1.9741592407226562,
"mean_token_accuracy": 0.5768255591392517,
"num_tokens": 51525421.0,
"step": 7490
},
{
"entropy": 1.9626407384872437,
"epoch": 0.9033967718622019,
"grad_norm": 1.7265625,
"learning_rate": 2.5365917019920194e-06,
"loss": 1.9685443878173827,
"mean_token_accuracy": 0.5776629745960236,
"num_tokens": 51593989.0,
"step": 7500
},
{
"epoch": 0.9033967718622019,
"eval_entropy": 2.345577836036682,
"eval_loss": 2.6050193309783936,
"eval_mean_token_accuracy": 0.4687269777059555,
"eval_num_tokens": 51593989.0,
"eval_runtime": 0.4921,
"eval_samples_per_second": 32.513,
"eval_steps_per_second": 4.064,
"step": 7500
},
{
"entropy": 1.925170862674713,
"epoch": 0.9046013008913515,
"grad_norm": 1.859375,
"learning_rate": 2.474330310080997e-06,
"loss": 1.941315269470215,
"mean_token_accuracy": 0.5777287960052491,
"num_tokens": 51666307.0,
"step": 7510
},
{
"entropy": 2.0162018299102784,
"epoch": 0.9058058299205011,
"grad_norm": 1.8359375,
"learning_rate": 2.4128231656551703e-06,
"loss": 2.0389646530151366,
"mean_token_accuracy": 0.5660183310508728,
"num_tokens": 51731751.0,
"step": 7520
},
{
"entropy": 1.893938374519348,
"epoch": 0.9070103589496507,
"grad_norm": 1.6015625,
"learning_rate": 2.3520712448524495e-06,
"loss": 1.905186080932617,
"mean_token_accuracy": 0.5870944142341614,
"num_tokens": 51803551.0,
"step": 7530
},
{
"entropy": 1.9854292273521423,
"epoch": 0.9082148879788002,
"grad_norm": 2.078125,
"learning_rate": 2.2920755118251535e-06,
"loss": 1.9940937042236329,
"mean_token_accuracy": 0.5745085537433624,
"num_tokens": 51871883.0,
"step": 7540
},
{
"entropy": 1.862353754043579,
"epoch": 0.9094194170079499,
"grad_norm": 1.7421875,
"learning_rate": 2.2328369187246235e-06,
"loss": 1.8595937728881835,
"mean_token_accuracy": 0.5929762482643127,
"num_tokens": 51938996.0,
"step": 7550
},
{
"entropy": 1.950656282901764,
"epoch": 0.9106239460370995,
"grad_norm": 4.71875,
"learning_rate": 2.1743564056861564e-06,
"loss": 1.9565111160278321,
"mean_token_accuracy": 0.5811770021915436,
"num_tokens": 52008995.0,
"step": 7560
},
{
"entropy": 1.9793375253677368,
"epoch": 0.9118284750662491,
"grad_norm": 2.84375,
"learning_rate": 2.1166349008141017e-06,
"loss": 1.9869726181030274,
"mean_token_accuracy": 0.5761931717395783,
"num_tokens": 52077383.0,
"step": 7570
},
{
"entropy": 1.9423883318901063,
"epoch": 0.9130330040953987,
"grad_norm": 1.8046875,
"learning_rate": 2.0596733201670715e-06,
"loss": 1.9672080993652343,
"mean_token_accuracy": 0.5782643377780914,
"num_tokens": 52145346.0,
"step": 7580
},
{
"entropy": 1.8426369071006774,
"epoch": 0.9142375331245483,
"grad_norm": 1.8203125,
"learning_rate": 2.003472567743475e-06,
"loss": 1.854864501953125,
"mean_token_accuracy": 0.5952829003334046,
"num_tokens": 52214518.0,
"step": 7590
},
{
"entropy": 1.9160435318946838,
"epoch": 0.9154420621536979,
"grad_norm": 1.7265625,
"learning_rate": 1.948033535467103e-06,
"loss": 1.919775390625,
"mean_token_accuracy": 0.5830646812915802,
"num_tokens": 52282796.0,
"step": 7600
},
{
"epoch": 0.9154420621536979,
"eval_entropy": 2.34560763835907,
"eval_loss": 2.60562801361084,
"eval_mean_token_accuracy": 0.46921147406101227,
"eval_num_tokens": 52282796.0,
"eval_runtime": 0.5914,
"eval_samples_per_second": 27.055,
"eval_steps_per_second": 3.382,
"step": 7600
},
{
"entropy": 1.9123671293258666,
"epoch": 0.9166465911828475,
"grad_norm": 1.6640625,
"learning_rate": 1.893357103173027e-06,
"loss": 1.940965461730957,
"mean_token_accuracy": 0.580702805519104,
"num_tokens": 52352611.0,
"step": 7610
},
{
"entropy": 1.9102510809898376,
"epoch": 0.9178511202119971,
"grad_norm": 1.90625,
"learning_rate": 1.8394441385936044e-06,
"loss": 1.9118707656860352,
"mean_token_accuracy": 0.5836511015892029,
"num_tokens": 52423407.0,
"step": 7620
},
{
"entropy": 1.8948777437210083,
"epoch": 0.9190556492411467,
"grad_norm": 1.7734375,
"learning_rate": 1.786295497344731e-06,
"loss": 1.9079843521118165,
"mean_token_accuracy": 0.583397525548935,
"num_tokens": 52492043.0,
"step": 7630
},
{
"entropy": 1.952082085609436,
"epoch": 0.9202601782702963,
"grad_norm": 1.9609375,
"learning_rate": 1.7339120229122263e-06,
"loss": 1.9593387603759767,
"mean_token_accuracy": 0.5791211247444152,
"num_tokens": 52558521.0,
"step": 7640
},
{
"entropy": 1.9148999094963073,
"epoch": 0.921464707299446,
"grad_norm": 1.5625,
"learning_rate": 1.6822945466384798e-06,
"loss": 1.9232852935791016,
"mean_token_accuracy": 0.5887005269527436,
"num_tokens": 52626715.0,
"step": 7650
},
{
"entropy": 1.8534759402275085,
"epoch": 0.9226692363285955,
"grad_norm": 1.71875,
"learning_rate": 1.6314438877092552e-06,
"loss": 1.8539718627929687,
"mean_token_accuracy": 0.5986401557922363,
"num_tokens": 52694904.0,
"step": 7660
},
{
"entropy": 1.9236411809921266,
"epoch": 0.9238737653577451,
"grad_norm": 2.046875,
"learning_rate": 1.581360853140673e-06,
"loss": 1.9403417587280274,
"mean_token_accuracy": 0.5860957503318787,
"num_tokens": 52764982.0,
"step": 7670
},
{
"entropy": 1.8666252613067627,
"epoch": 0.9250782943868947,
"grad_norm": 1.6640625,
"learning_rate": 1.5320462377664103e-06,
"loss": 1.861013412475586,
"mean_token_accuracy": 0.5939235925674439,
"num_tokens": 52834325.0,
"step": 7680
},
{
"entropy": 1.851730763912201,
"epoch": 0.9262828234160443,
"grad_norm": 1.8359375,
"learning_rate": 1.483500824225087e-06,
"loss": 1.8697208404541015,
"mean_token_accuracy": 0.594866144657135,
"num_tokens": 52901947.0,
"step": 7690
},
{
"entropy": 2.0280008792877195,
"epoch": 0.927487352445194,
"grad_norm": 2.140625,
"learning_rate": 1.43572538294785e-06,
"loss": 2.0267152786254883,
"mean_token_accuracy": 0.5662186741828918,
"num_tokens": 52972131.0,
"step": 7700
},
{
"epoch": 0.927487352445194,
"eval_entropy": 2.3444355726242065,
"eval_loss": 2.6051812171936035,
"eval_mean_token_accuracy": 0.46985819935798645,
"eval_num_tokens": 52972131.0,
"eval_runtime": 0.4263,
"eval_samples_per_second": 37.536,
"eval_steps_per_second": 4.692,
"step": 7700
},
{
"entropy": 1.9256292819976806,
"epoch": 0.9286918814743436,
"grad_norm": 2.953125,
"learning_rate": 1.3887206721461377e-06,
"loss": 1.9324100494384766,
"mean_token_accuracy": 0.5850838720798492,
"num_tokens": 53038106.0,
"step": 7710
},
{
"entropy": 1.9198288798332215,
"epoch": 0.9298964105034931,
"grad_norm": 1.6953125,
"learning_rate": 1.342487437799661e-06,
"loss": 1.930126953125,
"mean_token_accuracy": 0.5821510493755341,
"num_tokens": 53108130.0,
"step": 7720
},
{
"entropy": 2.0364713191986086,
"epoch": 0.9311009395326427,
"grad_norm": 4.0625,
"learning_rate": 1.297026413644531e-06,
"loss": 2.0539339065551756,
"mean_token_accuracy": 0.5626788675785065,
"num_tokens": 53178667.0,
"step": 7730
},
{
"entropy": 1.8647327065467834,
"epoch": 0.9323054685617923,
"grad_norm": 2.0,
"learning_rate": 1.2523383211616557e-06,
"loss": 1.8728973388671875,
"mean_token_accuracy": 0.5942056119441986,
"num_tokens": 53247763.0,
"step": 7740
},
{
"entropy": 1.8051078200340271,
"epoch": 0.9335099975909419,
"grad_norm": 2.5,
"learning_rate": 1.2084238695652728e-06,
"loss": 1.807421875,
"mean_token_accuracy": 0.6024011552333832,
"num_tokens": 53319178.0,
"step": 7750
},
{
"entropy": 1.8821719527244567,
"epoch": 0.9347145266200916,
"grad_norm": 2.78125,
"learning_rate": 1.1652837557916852e-06,
"loss": 1.8960037231445312,
"mean_token_accuracy": 0.5897580981254578,
"num_tokens": 53392119.0,
"step": 7760
},
{
"entropy": 1.8518799781799316,
"epoch": 0.9359190556492412,
"grad_norm": 1.75,
"learning_rate": 1.122918664488215e-06,
"loss": 1.8689956665039062,
"mean_token_accuracy": 0.5916654944419861,
"num_tokens": 53463254.0,
"step": 7770
},
{
"entropy": 1.904589629173279,
"epoch": 0.9371235846783907,
"grad_norm": 1.921875,
"learning_rate": 1.081329268002318e-06,
"loss": 1.9152328491210937,
"mean_token_accuracy": 0.5899509906768798,
"num_tokens": 53534268.0,
"step": 7780
},
{
"entropy": 1.9228339433670043,
"epoch": 0.9383281137075403,
"grad_norm": 1.8203125,
"learning_rate": 1.0405162263709522e-06,
"loss": 1.9269737243652343,
"mean_token_accuracy": 0.5832758903503418,
"num_tokens": 53604143.0,
"step": 7790
},
{
"entropy": 1.9582505464553832,
"epoch": 0.9395326427366899,
"grad_norm": 1.90625,
"learning_rate": 1.0004801873100488e-06,
"loss": 1.9452173233032226,
"mean_token_accuracy": 0.5848609387874604,
"num_tokens": 53672676.0,
"step": 7800
},
{
"epoch": 0.9395326427366899,
"eval_entropy": 2.345103621482849,
"eval_loss": 2.6041245460510254,
"eval_mean_token_accuracy": 0.4686458706855774,
"eval_num_tokens": 53672676.0,
"eval_runtime": 0.4458,
"eval_samples_per_second": 35.89,
"eval_steps_per_second": 4.486,
"step": 7800
},
{
"entropy": 1.8510336399078369,
"epoch": 0.9407371717658396,
"grad_norm": 2.75,
"learning_rate": 9.61221786204286e-07,
"loss": 1.8590290069580078,
"mean_token_accuracy": 0.5965212464332581,
"num_tokens": 53740242.0,
"step": 7810
},
{
"entropy": 1.9238565683364868,
"epoch": 0.9419417007949892,
"grad_norm": 1.828125,
"learning_rate": 9.227416460969584e-07,
"loss": 1.9245628356933593,
"mean_token_accuracy": 0.5861930787563324,
"num_tokens": 53811854.0,
"step": 7820
},
{
"entropy": 1.8533311009407043,
"epoch": 0.9431462298241388,
"grad_norm": 1.7421875,
"learning_rate": 8.850403776801186e-07,
"loss": 1.8664436340332031,
"mean_token_accuracy": 0.5968495309352875,
"num_tokens": 53881237.0,
"step": 7830
},
{
"entropy": 1.8412243723869324,
"epoch": 0.9443507588532883,
"grad_norm": 1.9140625,
"learning_rate": 8.481185792848955e-07,
"loss": 1.8526018142700196,
"mean_token_accuracy": 0.5925304174423218,
"num_tokens": 53950681.0,
"step": 7840
},
{
"entropy": 1.9087011337280273,
"epoch": 0.9455552878824379,
"grad_norm": 1.7265625,
"learning_rate": 8.119768368719471e-07,
"loss": 1.9235469818115234,
"mean_token_accuracy": 0.5897546947002411,
"num_tokens": 54017690.0,
"step": 7850
},
{
"entropy": 1.8836241483688354,
"epoch": 0.9467598169115876,
"grad_norm": 1.7109375,
"learning_rate": 7.766157240222338e-07,
"loss": 1.883704376220703,
"mean_token_accuracy": 0.5887205183506012,
"num_tokens": 54087341.0,
"step": 7860
},
{
"entropy": 1.7967774033546449,
"epoch": 0.9479643459407372,
"grad_norm": 2.109375,
"learning_rate": 7.420358019278429e-07,
"loss": 1.8183822631835938,
"mean_token_accuracy": 0.6018091261386871,
"num_tokens": 54157056.0,
"step": 7870
},
{
"entropy": 1.8912227272987365,
"epoch": 0.9491688749698868,
"grad_norm": 2.0625,
"learning_rate": 7.082376193831341e-07,
"loss": 1.9251108169555664,
"mean_token_accuracy": 0.584669029712677,
"num_tokens": 54228217.0,
"step": 7880
},
{
"entropy": 1.9450337767601014,
"epoch": 0.9503734039990364,
"grad_norm": 2.46875,
"learning_rate": 6.752217127760085e-07,
"loss": 1.9651473999023437,
"mean_token_accuracy": 0.5786853671073914,
"num_tokens": 54297830.0,
"step": 7890
},
{
"entropy": 1.9052072405815124,
"epoch": 0.9515779330281859,
"grad_norm": 1.8515625,
"learning_rate": 6.429886060793977e-07,
"loss": 1.9121397018432618,
"mean_token_accuracy": 0.5886496782302857,
"num_tokens": 54362575.0,
"step": 7900
},
{
"epoch": 0.9515779330281859,
"eval_entropy": 2.3454304933547974,
"eval_loss": 2.6043336391448975,
"eval_mean_token_accuracy": 0.4654955416917801,
"eval_num_tokens": 54362575.0,
"eval_runtime": 0.4115,
"eval_samples_per_second": 38.879,
"eval_steps_per_second": 4.86,
"step": 7900
},
{
"entropy": 1.8473743200302124,
"epoch": 0.9527824620573356,
"grad_norm": 1.8671875,
"learning_rate": 6.115388108429598e-07,
"loss": 1.859954261779785,
"mean_token_accuracy": 0.5982777655124665,
"num_tokens": 54431560.0,
"step": 7910
},
{
"entropy": 1.8762193322181702,
"epoch": 0.9539869910864852,
"grad_norm": 2.078125,
"learning_rate": 5.80872826184925e-07,
"loss": 1.877403450012207,
"mean_token_accuracy": 0.5930526316165924,
"num_tokens": 54500261.0,
"step": 7920
},
{
"entropy": 1.9170417070388794,
"epoch": 0.9551915201156348,
"grad_norm": 1.921875,
"learning_rate": 5.509911387842293e-07,
"loss": 1.9350042343139648,
"mean_token_accuracy": 0.5842573642730713,
"num_tokens": 54566244.0,
"step": 7930
},
{
"entropy": 1.803466534614563,
"epoch": 0.9563960491447844,
"grad_norm": 1.796875,
"learning_rate": 5.218942228727486e-07,
"loss": 1.8177553176879884,
"mean_token_accuracy": 0.6064643025398254,
"num_tokens": 54636921.0,
"step": 7940
},
{
"entropy": 1.9365626573562622,
"epoch": 0.957600578173934,
"grad_norm": 2.015625,
"learning_rate": 4.935825402277938e-07,
"loss": 1.9411260604858398,
"mean_token_accuracy": 0.5798054218292237,
"num_tokens": 54704310.0,
"step": 7950
},
{
"entropy": 1.9129403948783874,
"epoch": 0.9588051072030835,
"grad_norm": 1.90625,
"learning_rate": 4.660565401647554e-07,
"loss": 1.9295969009399414,
"mean_token_accuracy": 0.5854752659797668,
"num_tokens": 54771615.0,
"step": 7960
},
{
"entropy": 1.9536687135696411,
"epoch": 0.9600096362322332,
"grad_norm": 3.515625,
"learning_rate": 4.3931665953001466e-07,
"loss": 1.9652423858642578,
"mean_token_accuracy": 0.5795530259609223,
"num_tokens": 54839615.0,
"step": 7970
},
{
"entropy": 2.003860831260681,
"epoch": 0.9612141652613828,
"grad_norm": 2.921875,
"learning_rate": 4.133633226939659e-07,
"loss": 2.010598564147949,
"mean_token_accuracy": 0.5707609951496124,
"num_tokens": 54910017.0,
"step": 7980
},
{
"entropy": 1.8675318241119385,
"epoch": 0.9624186942905324,
"grad_norm": 1.5234375,
"learning_rate": 3.8819694154432763e-07,
"loss": 1.862639808654785,
"mean_token_accuracy": 0.5950047254562378,
"num_tokens": 54979715.0,
"step": 7990
},
{
"entropy": 1.8299493789672852,
"epoch": 0.963623223319682,
"grad_norm": 1.9296875,
"learning_rate": 3.63817915479564e-07,
"loss": 1.830225372314453,
"mean_token_accuracy": 0.6012236654758454,
"num_tokens": 55049807.0,
"step": 8000
},
{
"epoch": 0.963623223319682,
"eval_entropy": 2.344685196876526,
"eval_loss": 2.6045424938201904,
"eval_mean_token_accuracy": 0.46816137433052063,
"eval_num_tokens": 55049807.0,
"eval_runtime": 0.4229,
"eval_samples_per_second": 37.838,
"eval_steps_per_second": 4.73,
"step": 8000
},
{
"entropy": 1.9564312815666198,
"epoch": 0.9648277523488316,
"grad_norm": 2.703125,
"learning_rate": 3.402266314025626e-07,
"loss": 1.9624727249145508,
"mean_token_accuracy": 0.5809799790382385,
"num_tokens": 55115920.0,
"step": 8010
},
{
"entropy": 1.9024741530418396,
"epoch": 0.9660322813779813,
"grad_norm": 2.921875,
"learning_rate": 3.174234637145057e-07,
"loss": 1.9156721115112305,
"mean_token_accuracy": 0.5865259766578674,
"num_tokens": 55188204.0,
"step": 8020
},
{
"entropy": 1.8939731240272522,
"epoch": 0.9672368104071308,
"grad_norm": 1.5703125,
"learning_rate": 2.95408774308914e-07,
"loss": 1.8975866317749024,
"mean_token_accuracy": 0.5925525784492492,
"num_tokens": 55256110.0,
"step": 8030
},
{
"entropy": 1.9126658678054809,
"epoch": 0.9684413394362804,
"grad_norm": 4.03125,
"learning_rate": 2.7418291256590124e-07,
"loss": 1.9254207611083984,
"mean_token_accuracy": 0.5853155791759491,
"num_tokens": 55322680.0,
"step": 8040
},
{
"entropy": 2.01700644493103,
"epoch": 0.96964586846543,
"grad_norm": 3.203125,
"learning_rate": 2.537462153466452e-07,
"loss": 2.042200469970703,
"mean_token_accuracy": 0.5682180523872375,
"num_tokens": 55392990.0,
"step": 8050
},
{
"entropy": 1.8897229433059692,
"epoch": 0.9708503974945796,
"grad_norm": 2.125,
"learning_rate": 2.3409900698802556e-07,
"loss": 1.900827980041504,
"mean_token_accuracy": 0.5886753618717193,
"num_tokens": 55460771.0,
"step": 8060
},
{
"entropy": 1.889931321144104,
"epoch": 0.9720549265237293,
"grad_norm": 2.09375,
"learning_rate": 2.1524159929748323e-07,
"loss": 1.901046371459961,
"mean_token_accuracy": 0.583438491821289,
"num_tokens": 55528176.0,
"step": 8070
},
{
"entropy": 1.918817901611328,
"epoch": 0.9732594555528788,
"grad_norm": 1.9765625,
"learning_rate": 1.971742915480801e-07,
"loss": 1.9224884033203125,
"mean_token_accuracy": 0.579836505651474,
"num_tokens": 55596913.0,
"step": 8080
},
{
"entropy": 1.919544279575348,
"epoch": 0.9744639845820284,
"grad_norm": 1.6171875,
"learning_rate": 1.7989737047371946e-07,
"loss": 1.923672866821289,
"mean_token_accuracy": 0.5821768641471863,
"num_tokens": 55666122.0,
"step": 8090
},
{
"entropy": 1.8848103880882263,
"epoch": 0.975668513611178,
"grad_norm": 2.09375,
"learning_rate": 1.6341111026464407e-07,
"loss": 1.8861661911010743,
"mean_token_accuracy": 0.5957940876483917,
"num_tokens": 55734211.0,
"step": 8100
},
{
"epoch": 0.975668513611178,
"eval_entropy": 2.3450320959091187,
"eval_loss": 2.605994701385498,
"eval_mean_token_accuracy": 0.46759575605392456,
"eval_num_tokens": 55734211.0,
"eval_runtime": 0.4563,
"eval_samples_per_second": 35.063,
"eval_steps_per_second": 4.383,
"step": 8100
},
{
"entropy": 1.9739423155784608,
"epoch": 0.9768730426403276,
"grad_norm": 2.3125,
"learning_rate": 1.4771577256303404e-07,
"loss": 1.9796539306640626,
"mean_token_accuracy": 0.5786350190639495,
"num_tokens": 55802782.0,
"step": 8110
},
{
"entropy": 1.8973981261253356,
"epoch": 0.9780775716694773,
"grad_norm": 1.921875,
"learning_rate": 1.3281160645889356e-07,
"loss": 1.9044807434082032,
"mean_token_accuracy": 0.5940297067165374,
"num_tokens": 55869507.0,
"step": 8120
},
{
"entropy": 1.8639543056488037,
"epoch": 0.9792821006986269,
"grad_norm": 4.90625,
"learning_rate": 1.1869884848607072e-07,
"loss": 1.856203842163086,
"mean_token_accuracy": 0.5933269500732422,
"num_tokens": 55938519.0,
"step": 8130
},
{
"entropy": 1.9350976824760437,
"epoch": 0.9804866297277764,
"grad_norm": 1.6953125,
"learning_rate": 1.0537772261852707e-07,
"loss": 1.953768539428711,
"mean_token_accuracy": 0.5803532361984253,
"num_tokens": 56005239.0,
"step": 8140
},
{
"entropy": 1.8861400604248046,
"epoch": 0.981691158756926,
"grad_norm": 1.8828125,
"learning_rate": 9.284844026676287e-08,
"loss": 1.8933399200439454,
"mean_token_accuracy": 0.5899116814136505,
"num_tokens": 56071780.0,
"step": 8150
},
{
"entropy": 1.916269016265869,
"epoch": 0.9828956877860756,
"grad_norm": 5.6875,
"learning_rate": 8.11112002744696e-08,
"loss": 1.9295597076416016,
"mean_token_accuracy": 0.5852110207080841,
"num_tokens": 56141110.0,
"step": 8160
},
{
"entropy": 1.861630380153656,
"epoch": 0.9841002168152252,
"grad_norm": 2.0625,
"learning_rate": 7.016618891538262e-08,
"loss": 1.8723873138427733,
"mean_token_accuracy": 0.5910745859146118,
"num_tokens": 56208947.0,
"step": 8170
},
{
"entropy": 1.8281081676483155,
"epoch": 0.9853047458443749,
"grad_norm": 1.6484375,
"learning_rate": 6.001357989030564e-08,
"loss": 1.8359621047973633,
"mean_token_accuracy": 0.5977383434772492,
"num_tokens": 56280892.0,
"step": 8180
},
{
"entropy": 1.9129138946533204,
"epoch": 0.9865092748735245,
"grad_norm": 1.8671875,
"learning_rate": 5.065353432436859e-08,
"loss": 1.9148849487304687,
"mean_token_accuracy": 0.5854653120040894,
"num_tokens": 56352290.0,
"step": 8190
},
{
"entropy": 1.967962348461151,
"epoch": 0.987713803902674,
"grad_norm": 2.234375,
"learning_rate": 4.2086200764474004e-08,
"loss": 1.9775993347167968,
"mean_token_accuracy": 0.5787408590316773,
"num_tokens": 56421355.0,
"step": 8200
},
{
"epoch": 0.987713803902674,
"eval_entropy": 2.3451985120773315,
"eval_loss": 2.6038923263549805,
"eval_mean_token_accuracy": 0.4664645344018936,
"eval_num_tokens": 56421355.0,
"eval_runtime": 0.4588,
"eval_samples_per_second": 34.872,
"eval_steps_per_second": 4.359,
"step": 8200
},
{
"entropy": 1.8938051104545592,
"epoch": 0.9889183329318236,
"grad_norm": 1.9453125,
"learning_rate": 3.4311715176932327e-08,
"loss": 1.9075881958007812,
"mean_token_accuracy": 0.5848981618881226,
"num_tokens": 56493442.0,
"step": 8210
},
{
"entropy": 1.9249114871025086,
"epoch": 0.9901228619609732,
"grad_norm": 1.6484375,
"learning_rate": 2.7330200945296923e-08,
"loss": 1.9367279052734374,
"mean_token_accuracy": 0.578474622964859,
"num_tokens": 56566061.0,
"step": 8220
},
{
"entropy": 1.9657586932182312,
"epoch": 0.9913273909901229,
"grad_norm": 2.28125,
"learning_rate": 2.114176886841568e-08,
"loss": 1.9664051055908203,
"mean_token_accuracy": 0.5813057720661163,
"num_tokens": 56632140.0,
"step": 8230
},
{
"entropy": 2.01274037361145,
"epoch": 0.9925319200192725,
"grad_norm": 1.984375,
"learning_rate": 1.574651715867681e-08,
"loss": 2.029743957519531,
"mean_token_accuracy": 0.5682400166988373,
"num_tokens": 56701631.0,
"step": 8240
},
{
"entropy": 1.922510302066803,
"epoch": 0.9937364490484221,
"grad_norm": 1.7890625,
"learning_rate": 1.1144531440437921e-08,
"loss": 1.9430387496948243,
"mean_token_accuracy": 0.5863241016864776,
"num_tokens": 56771393.0,
"step": 8250
},
{
"entropy": 1.9053786993026733,
"epoch": 0.9949409780775716,
"grad_norm": 1.7421875,
"learning_rate": 7.33588474867708e-09,
"loss": 1.9177364349365233,
"mean_token_accuracy": 0.5853918194770813,
"num_tokens": 56839912.0,
"step": 8260
},
{
"entropy": 1.8363399386405945,
"epoch": 0.9961455071067212,
"grad_norm": 1.9453125,
"learning_rate": 4.320637527827076e-09,
"loss": 1.855323028564453,
"mean_token_accuracy": 0.5942590415477753,
"num_tokens": 56911627.0,
"step": 8270
},
{
"entropy": 1.8521572947502136,
"epoch": 0.9973500361358709,
"grad_norm": 1.84375,
"learning_rate": 2.098837630820638e-09,
"loss": 1.8712289810180665,
"mean_token_accuracy": 0.5923120260238648,
"num_tokens": 56982089.0,
"step": 8280
},
{
"entropy": 1.9446120381355285,
"epoch": 0.9985545651650205,
"grad_norm": 3.078125,
"learning_rate": 6.705203183243747e-10,
"loss": 1.9402387619018555,
"mean_token_accuracy": 0.5822612524032593,
"num_tokens": 57049724.0,
"step": 8290
},
{
"entropy": 1.8550827741622924,
"epoch": 0.9997590941941701,
"grad_norm": 1.8203125,
"learning_rate": 3.570825819476653e-11,
"loss": 1.8673707962036132,
"mean_token_accuracy": 0.5967185080051423,
"num_tokens": 57120947.0,
"step": 8300
},
{
"epoch": 0.9997590941941701,
"eval_entropy": 2.3444266319274902,
"eval_loss": 2.6046648025512695,
"eval_mean_token_accuracy": 0.4665456563234329,
"eval_num_tokens": 57120947.0,
"eval_runtime": 0.4096,
"eval_samples_per_second": 39.064,
"eval_steps_per_second": 4.883,
"step": 8300
},
{
"epoch": 1.0,
"eval_entropy": 2.3444266319274902,
"eval_loss": 2.6046648025512695,
"eval_mean_token_accuracy": 0.4665456563234329,
"eval_num_tokens": 57135597.0,
"eval_runtime": 0.4353,
"eval_samples_per_second": 36.755,
"eval_steps_per_second": 4.594,
"step": 8302
}
],
"logging_steps": 10,
"max_steps": 8302,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.583470790653158e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}